In [None]:
import os
import pandas as pd

# Define the data directory path
data_dir = '../data'

In [8]:
# Read CSV files with proper encoding handling
# Try different encodings if utf-8 fails
encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']

def read_csv_with_encoding(file_path):
    """Try to read CSV with different encodings"""
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding)
            print(f"‚úì Loaded {os.path.basename(file_path)} with {encoding} encoding")
            return df
        except UnicodeDecodeError:
            continue
    raise ValueError(f"Could not read {file_path} with any encoding")

# Read all CSV files
dataco_df = read_csv_with_encoding(os.path.join(data_dir, 'DataCoSupplyChainDataset.csv'))
dynamic_logistics_df = read_csv_with_encoding(os.path.join(data_dir, 'dynamic_supply_chain_logistics_dataset.csv'))
supply_chain_df = read_csv_with_encoding(os.path.join(data_dir, 'supply_chain_data.csv'))

# Read Excel file
retail_sales_df = pd.read_excel(os.path.join(data_dir, 'Retail-Supply-Chain-Sales-Dataset.xlsx'))
print(f"‚úì Loaded Retail-Supply-Chain-Sales-Dataset.xlsx")

print("\n" + "="*60)
print("All data loaded successfully!")
print("="*60)
print(f"\nDataCoSupplyChainDataset: {dataco_df.shape}")
print(f"Dynamic Supply Chain Logistics: {dynamic_logistics_df.shape}")
print(f"Supply Chain Data: {supply_chain_df.shape}")
print(f"Retail Sales: {retail_sales_df.shape}")

‚úì Loaded DataCoSupplyChainDataset.csv with latin-1 encoding
‚úì Loaded dynamic_supply_chain_logistics_dataset.csv with utf-8 encoding
‚úì Loaded supply_chain_data.csv with utf-8 encoding
‚úì Loaded Retail-Supply-Chain-Sales-Dataset.xlsx

All data loaded successfully!

DataCoSupplyChainDataset: (180519, 53)
Dynamic Supply Chain Logistics: (32065, 26)
Supply Chain Data: (100, 24)
Retail Sales: (9994, 23)


In [10]:
dataco_df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


In [12]:
dynamic_logistics_df.head()

Unnamed: 0,timestamp,vehicle_gps_latitude,vehicle_gps_longitude,fuel_consumption_rate,eta_variation_hours,traffic_congestion_level,warehouse_inventory_level,loading_unloading_time,handling_equipment_availability,order_fulfillment_status,...,iot_temperature,cargo_condition_status,route_risk_level,customs_clearance_time,driver_behavior_score,fatigue_monitoring_score,disruption_likelihood_score,delay_probability,risk_classification,delivery_time_deviation
0,2021-01-01 00:00:00,40.375568,-77.014318,5.136512,4.998009,5.927586,985.716862,4.951392,0.481294,0.761166,...,0.5744,0.777263,1.182116,0.502006,0.033843,0.978599,0.506152,0.885291,Moderate Risk,9.110682
1,2021-01-01 01:00:00,33.507818,-117.036902,5.101512,0.984929,1.591992,396.700206,1.030379,0.62078,0.196594,...,-9.753493,0.091839,9.611988,0.966774,0.201725,0.918586,0.980784,0.544178,High Risk,8.175281
2,2021-01-01 02:00:00,30.02064,-75.269224,5.090803,4.972665,8.787765,832.408935,4.220229,0.810933,0.152742,...,-6.491034,0.253529,6.570431,0.945627,0.264045,0.394215,0.998633,0.803322,High Risk,1.283594
3,2021-01-01 03:00:00,36.649223,-70.190529,8.219558,3.095064,0.045257,0.573283,0.530186,0.008525,0.811885,...,-0.151276,0.877576,0.548952,4.674035,0.362885,0.905444,0.99332,0.025977,High Risk,9.304897
4,2021-01-01 04:00:00,30.001279,-70.012195,5.000075,3.216077,8.004851,914.925067,3.62089,0.020083,0.053659,...,2.429448,0.262081,8.861443,3.445429,0.016957,0.258702,0.912433,0.991122,High Risk,7.752484


In [13]:
supply_chain_df.head()

Unnamed: 0,Product type,SKU,Price,Availability,Number of products sold,Revenue generated,Customer demographics,Stock levels,Lead times,Order quantities,...,Location,Lead time,Production volumes,Manufacturing lead time,Manufacturing costs,Inspection results,Defect rates,Transportation modes,Routes,Costs
0,haircare,SKU0,69.808006,55,802,8661.996792,Non-binary,58,7,96,...,Mumbai,29,215,29,46.279879,Pending,0.22641,Road,Route B,187.752075
1,skincare,SKU1,14.843523,95,736,7460.900065,Female,53,30,37,...,Mumbai,23,517,30,33.616769,Pending,4.854068,Road,Route B,503.065579
2,haircare,SKU2,11.319683,34,8,9577.749626,Unknown,1,10,88,...,Mumbai,12,971,27,30.688019,Pending,4.580593,Air,Route C,141.920282
3,skincare,SKU3,61.163343,68,83,7766.836426,Non-binary,23,13,59,...,Kolkata,24,937,18,35.624741,Fail,4.746649,Rail,Route A,254.776159
4,skincare,SKU4,4.805496,26,871,2686.505152,Non-binary,5,3,56,...,Delhi,5,414,3,92.065161,Fail,3.14558,Air,Route A,923.440632


In [14]:
retail_sales_df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Retail Sales People,Product ID,Category,Sub-Category,Product Name,Returned,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-08-11,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Cassandra Brandow,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,Not,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-08-11,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Cassandra Brandow,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Not,731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-12-06,2016-12-06,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,Anna Andreadi,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,Not,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-11-10,2015-11-10,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Cassandra Brandow,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,Not,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-11-10,2015-11-10,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Cassandra Brandow,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,Not,22.368,2,0.2,2.5164


# Dataset Analysis for Multi-Agent Supply Chain Intelligence Platform

## Overview of Available Datasets

Based on the loaded data, we have 4 datasets from Kaggle:
1. **DataCo Supply Chain Dataset** (180,519 rows √ó 53 columns)
2. **Dynamic Logistics Dataset** (32,065 rows √ó 26 columns)
3. **Supply Chain Data** (100 rows √ó 24 columns)
4. **Retail Supply Chain Sales Dataset** (9,994 rows √ó 23 columns)

In [15]:
# Detailed analysis of each dataset
print("="*80)
print("1. DATACO SUPPLY CHAIN DATASET")
print("="*80)
print(f"\nShape: {dataco_df.shape}")
print(f"\nColumns ({len(dataco_df.columns)}):")
print(dataco_df.columns.tolist())
print(f"\nData Types:")
print(dataco_df.dtypes)
print(f"\nSample Data:")
print(dataco_df.head(3))

1. DATACO SUPPLY CHAIN DATASET

Shape: (180519, 53)

Columns (53):
['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)', 'Benefit per order', 'Sales per customer', 'Delivery Status', 'Late_delivery_risk', 'Category Id', 'Category Name', 'Customer City', 'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Id', 'Customer Lname', 'Customer Password', 'Customer Segment', 'Customer State', 'Customer Street', 'Customer Zipcode', 'Department Id', 'Department Name', 'Latitude', 'Longitude', 'Market', 'Order City', 'Order Country', 'Order Customer Id', 'order date (DateOrders)', 'Order Id', 'Order Item Cardprod Id', 'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price', 'Order Item Profit Ratio', 'Order Item Quantity', 'Sales', 'Order Item Total', 'Order Profit Per Order', 'Order Region', 'Order State', 'Order Status', 'Order Zipcode', 'Product Card Id', 'Product Category Id', 'Product Description', 'Product Image', 'Product

## Agent Capability Assessment

Let's assess whether these datasets support the 5 agent types you want to build:
- **Data Analyst Agent**: Trends, outliers, anomalies, SQL queries, auto-charts
- **Supply Chain Risk Agent**: Supplier delays, stockout prediction, demand spikes, reorder quantities
- **Finance Insight Agent**: P&L, expense patterns, cashflow forecasting
- **Meeting/Report Agent**: Weekly reports, CxO summaries, recommended actions
- **Email/Workflow Agent**: Stakeholder alerts, follow-up tasks, meeting agendas

In [16]:
# Analyze key features for each agent type

print("üîç DATASET CAPABILITY ANALYSIS FOR MULTI-AGENT SYSTEM\n")
print("="*80)

# 1. DataCo Supply Chain Dataset Analysis
print("\nüì¶ Dataset 1: DataCo Supply Chain (Main Dataset)")
print("-"*80)
dataco_features = {
    'Sales & Orders': [col for col in dataco_df.columns if any(x in col.lower() for x in ['sales', 'order', 'customer', 'product'])],
    'Shipping & Logistics': [col for col in dataco_df.columns if any(x in col.lower() for x in ['ship', 'delivery', 'days'])],
    'Financial': [col for col in dataco_df.columns if any(x in col.lower() for x in ['benefit', 'price', 'payment'])],
    'Risk Indicators': [col for col in dataco_df.columns if any(x in col.lower() for x in ['late', 'risk', 'status'])]
}

for category, cols in dataco_features.items():
    print(f"\n  {category}: {len(cols)} columns")
    print(f"    {cols[:5]}")  # Show first 5

# Check for time series capability
print(f"\n  ‚è∞ Time-series columns:")
time_cols = [col for col in dataco_df.columns if 'date' in col.lower() or 'time' in col.lower()]
print(f"    {time_cols}")

# Check data coverage
print(f"\n  üìä Data Coverage:")
if 'order date (DateOrders)' in dataco_df.columns:
    dataco_df['order_date_parsed'] = pd.to_datetime(dataco_df['order date (DateOrders)'], errors='coerce')
    print(f"    Date Range: {dataco_df['order_date_parsed'].min()} to {dataco_df['order_date_parsed'].max()}")
    print(f"    Duration: {(dataco_df['order_date_parsed'].max() - dataco_df['order_date_parsed'].min()).days} days")

üîç DATASET CAPABILITY ANALYSIS FOR MULTI-AGENT SYSTEM


üì¶ Dataset 1: DataCo Supply Chain (Main Dataset)
--------------------------------------------------------------------------------

  Sales & Orders: 40 columns
    ['Benefit per order', 'Sales per customer', 'Customer City', 'Customer Country', 'Customer Email']

  Shipping & Logistics: 6 columns
    ['Days for shipping (real)', 'Days for shipment (scheduled)', 'Delivery Status', 'Late_delivery_risk', 'shipping date (DateOrders)']

  Financial: 3 columns
    ['Benefit per order', 'Order Item Product Price', 'Product Price']

  Risk Indicators: 4 columns
    ['Delivery Status', 'Late_delivery_risk', 'Order Status', 'Product Status']

  ‚è∞ Time-series columns:
    ['order date (DateOrders)', 'shipping date (DateOrders)']

  üìä Data Coverage:
    Date Range: 2015-01-01 00:00:00 to 2018-01-31 23:38:00
    Duration: 1126 days


In [17]:
# 2. Dynamic Logistics Dataset Analysis
print("\n\nüöö Dataset 2: Dynamic Logistics Dataset")
print("-"*80)
print(f"  Rows: {dynamic_logistics_df.shape[0]:,}")
print(f"  Columns: {dynamic_logistics_df.shape[1]}")

logistics_features = {
    'Vehicle & GPS': [col for col in dynamic_logistics_df.columns if any(x in col.lower() for x in ['vehicle', 'gps', 'fuel'])],
    'Warehouse & Inventory': [col for col in dynamic_logistics_df.columns if any(x in col.lower() for x in ['warehouse', 'inventory', 'loading'])],
    'Risk & Performance': [col for col in dynamic_logistics_df.columns if any(x in col.lower() for x in ['risk', 'delay', 'disruption'])],
    'Real-time Monitoring': [col for col in dynamic_logistics_df.columns if any(x in col.lower() for x in ['iot', 'temperature', 'fatigue', 'behavior'])]
}

for category, cols in logistics_features.items():
    print(f"\n  {category}: {len(cols)} columns")
    print(f"    {cols}")

# Time range
if 'timestamp' in dynamic_logistics_df.columns:
    dynamic_logistics_df['timestamp_parsed'] = pd.to_datetime(dynamic_logistics_df['timestamp'], errors='coerce')
    print(f"\n  ‚è∞ Time Range: {dynamic_logistics_df['timestamp_parsed'].min()} to {dynamic_logistics_df['timestamp_parsed'].max()}")
    print(f"  üìä Duration: {(dynamic_logistics_df['timestamp_parsed'].max() - dynamic_logistics_df['timestamp_parsed'].min()).days} days")



üöö Dataset 2: Dynamic Logistics Dataset
--------------------------------------------------------------------------------
  Rows: 32,065
  Columns: 26

  Vehicle & GPS: 3 columns
    ['vehicle_gps_latitude', 'vehicle_gps_longitude', 'fuel_consumption_rate']

  Warehouse & Inventory: 2 columns
    ['warehouse_inventory_level', 'loading_unloading_time']

  Risk & Performance: 4 columns
    ['route_risk_level', 'disruption_likelihood_score', 'delay_probability', 'risk_classification']

  Real-time Monitoring: 3 columns
    ['iot_temperature', 'driver_behavior_score', 'fatigue_monitoring_score']

  ‚è∞ Time Range: 2021-01-01 00:00:00 to 2024-08-29 00:00:00
  üìä Duration: 1336 days


In [18]:
# 3. Retail Supply Chain Sales Dataset
print("\n\nüõí Dataset 3: Retail Supply Chain Sales")
print("-"*80)
print(f"  Rows: {retail_sales_df.shape[0]:,}")
print(f"  Columns: {retail_sales_df.shape[1]}")
print(f"\n  Key Features:")
print(f"    - Orders, Customers, Products")
print(f"    - Sales, Profit, Discount")
print(f"    - Categories, Sub-categories")
print(f"    - Returns tracking")
print(f"    - Geographical data (Country, City, Region)")

if 'Order Date' in retail_sales_df.columns:
    retail_sales_df['Order Date'] = pd.to_datetime(retail_sales_df['Order Date'], errors='coerce')
    print(f"\n  ‚è∞ Date Range: {retail_sales_df['Order Date'].min()} to {retail_sales_df['Order Date'].max()}")
    print(f"  üìä Duration: {(retail_sales_df['Order Date'].max() - retail_sales_df['Order Date'].min()).days} days")
    
# Financial summary
if 'Sales' in retail_sales_df.columns and 'Profit' in retail_sales_df.columns:
    print(f"\n  üí∞ Financial Overview:")
    print(f"    Total Sales: ${retail_sales_df['Sales'].sum():,.2f}")
    print(f"    Total Profit: ${retail_sales_df['Profit'].sum():,.2f}")
    print(f"    Profit Margin: {(retail_sales_df['Profit'].sum() / retail_sales_df['Sales'].sum() * 100):.2f}%")



üõí Dataset 3: Retail Supply Chain Sales
--------------------------------------------------------------------------------
  Rows: 9,994
  Columns: 23

  Key Features:
    - Orders, Customers, Products
    - Sales, Profit, Discount
    - Categories, Sub-categories
    - Returns tracking
    - Geographical data (Country, City, Region)

  ‚è∞ Date Range: 2014-01-02 00:00:00 to 2017-12-30 00:00:00
  üìä Duration: 1458 days

  üí∞ Financial Overview:
    Total Sales: $2,297,200.86
    Total Profit: $286,397.02
    Profit Margin: 12.47%


In [19]:
# 4. Supply Chain Data (Smaller dataset)
print("\n\nüìä Dataset 4: Supply Chain Data (Small)")
print("-"*80)
print(f"  Rows: {supply_chain_df.shape[0]:,}")
print(f"  Columns: {supply_chain_df.shape[1]}")
print(f"\n  Key Features:")
print(f"    Columns: {supply_chain_df.columns.tolist()}")
print(f"\n  Note: This is a small dataset (100 rows) - good for testing/validation")



üìä Dataset 4: Supply Chain Data (Small)
--------------------------------------------------------------------------------
  Rows: 100
  Columns: 24

  Key Features:
    Columns: ['Product type', 'SKU', 'Price', 'Availability', 'Number of products sold', 'Revenue generated', 'Customer demographics', 'Stock levels', 'Lead times', 'Order quantities', 'Shipping times', 'Shipping carriers', 'Shipping costs', 'Supplier name', 'Location', 'Lead time', 'Production volumes', 'Manufacturing lead time', 'Manufacturing costs', 'Inspection results', 'Defect rates', 'Transportation modes', 'Routes', 'Costs']

  Note: This is a small dataset (100 rows) - good for testing/validation


## Assessment: Can We Build the Multi-Agent System?

In [20]:
print("\n" + "="*80)
print("‚úÖ FEASIBILITY ASSESSMENT FOR EACH AGENT")
print("="*80)

agents = {
    "1. Data Analyst Agent": {
        "Required Capabilities": [
            "Identify trends, outliers, anomalies",
            "Run SQL queries on demand",
            "Auto-generate charts"
        ],
        "Data Support": "‚úÖ EXCELLENT",
        "Details": [
            "‚úÖ 180K+ rows of transactional data (DataCo)",
            "‚úÖ Time-series data spanning multiple years",
            "‚úÖ Multiple metrics: sales, orders, shipping, customers",
            "‚úÖ Can detect anomalies in delivery times, sales patterns",
            "‚úÖ Rich categorical data for segmentation"
        ]
    },
    
    "2. Supply Chain Risk Agent": {
        "Required Capabilities": [
            "Detect supplier delays",
            "Predict stockouts",
            "Flag demand spikes",
            "Suggest reorder quantities"
        ],
        "Data Support": "‚úÖ VERY GOOD",
        "Details": [
            "‚úÖ Late delivery risk column (DataCo)",
            "‚úÖ Delivery status tracking",
            "‚úÖ Days for shipping (actual vs scheduled)",
            "‚úÖ Warehouse inventory levels (Dynamic Logistics)",
            "‚úÖ Stock levels and lead times (Supply Chain Data)",
            "‚ö†Ô∏è  Limited supplier-specific data",
            "‚ö†Ô∏è  Would benefit from more procurement data"
        ]
    },
    
    "3. Finance Insight Agent": {
        "Required Capabilities": [
            "P&L summarization",
            "Expense pattern analysis",
            "Cashflow forecasting"
        ],
        "Data Support": "‚úÖ GOOD",
        "Details": [
            "‚úÖ Sales, Revenue, Profit data (Retail Sales)",
            "‚úÖ Benefit per order (DataCo)",
            "‚úÖ Product pricing, costs",
            "‚úÖ Payment types (DEBIT, CASH, TRANSFER)",
            "‚ö†Ô∏è  Limited expense breakdown data",
            "‚ö†Ô∏è  No direct cashflow/AR/AP data",
            "üí° Can calculate margins, profitability trends"
        ]
    },
    
    "4. Meeting/Report Agent": {
        "Required Capabilities": [
            "Weekly supply chain reports",
            "CxO-level summaries",
            "Recommend 3 actions"
        ],
        "Data Support": "‚úÖ EXCELLENT",
        "Details": [
            "‚úÖ Can aggregate data from all agents",
            "‚úÖ Rich data for KPI dashboards",
            "‚úÖ Time-series for trend reporting",
            "‚úÖ Multiple business dimensions available"
        ]
    },
    
    "5. Email/Workflow Agent": {
        "Required Capabilities": [
            "Send stakeholder alerts",
            "Create follow-up tasks",
            "Generate meeting agendas"
        ],
        "Data Support": "‚úÖ EXCELLENT",
        "Details": [
            "‚úÖ Can trigger on risk events (late deliveries)",
            "‚úÖ Can trigger on inventory thresholds",
            "‚úÖ Can create alerts from other agents' insights",
            "‚úÖ Customer/order data for context"
        ]
    }
}

for agent_name, info in agents.items():
    print(f"\n{agent_name}")
    print("-" * 80)
    print(f"Status: {info['Data Support']}")
    print(f"\nRequired Capabilities:")
    for cap in info['Required Capabilities']:
        print(f"  ‚Ä¢ {cap}")
    print(f"\nData Support Details:")
    for detail in info['Details']:
        print(f"  {detail}")

print("\n" + "="*80)
print("üéØ OVERALL VERDICT")
print("="*80)
print("\n‚úÖ YES - You CAN build this multi-agent system with the available data!")
print("\nStrengths:")
print("  ‚Ä¢ Large transaction dataset (180K+ rows)")
print("  ‚Ä¢ Time-series data for trends and forecasting")
print("  ‚Ä¢ Logistics and delivery tracking")
print("  ‚Ä¢ Financial metrics (sales, profit, costs)")
print("  ‚Ä¢ Risk indicators (late delivery, stockouts)")
print("  ‚Ä¢ Real-time monitoring data (IoT, GPS)")
print("\nGaps & Workarounds:")
print("  ‚ö†Ô∏è  Limited supplier-specific data ‚Üí Focus on delivery performance")
print("  ‚ö†Ô∏è  No direct AR/AP data ‚Üí Use sales/profit as proxy for cashflow")
print("  ‚ö†Ô∏è  Limited procurement data ‚Üí Focus on inventory and reorder signals")
print("\nRecommendation:")
print("  üöÄ START BUILDING! The data supports 90% of your agent requirements.")
print("  üí° Use vector embeddings for the knowledge base")
print("  üí° Implement time-series models for forecasting")
print("  üí° Use LangGraph or CrewAI for agent orchestration")


‚úÖ FEASIBILITY ASSESSMENT FOR EACH AGENT

1. Data Analyst Agent
--------------------------------------------------------------------------------
Status: ‚úÖ EXCELLENT

Required Capabilities:
  ‚Ä¢ Identify trends, outliers, anomalies
  ‚Ä¢ Run SQL queries on demand
  ‚Ä¢ Auto-generate charts

Data Support Details:
  ‚úÖ 180K+ rows of transactional data (DataCo)
  ‚úÖ Time-series data spanning multiple years
  ‚úÖ Multiple metrics: sales, orders, shipping, customers
  ‚úÖ Can detect anomalies in delivery times, sales patterns
  ‚úÖ Rich categorical data for segmentation

2. Supply Chain Risk Agent
--------------------------------------------------------------------------------
Status: ‚úÖ VERY GOOD

Required Capabilities:
  ‚Ä¢ Detect supplier delays
  ‚Ä¢ Predict stockouts
  ‚Ä¢ Flag demand spikes
  ‚Ä¢ Suggest reorder quantities

Data Support Details:
  ‚úÖ Late delivery risk column (DataCo)
  ‚úÖ Delivery status tracking
  ‚úÖ Days for shipping (actual vs scheduled)
  ‚úÖ Warehouse

## Key Insights About Your Data

In [21]:
print("="*80)
print("üìã SUMMARY OF DATASETS")
print("="*80)

summary_data = {
    'Dataset': ['DataCo Supply Chain', 'Dynamic Logistics', 'Retail Sales', 'Supply Chain Data'],
    'Rows': [f"{dataco_df.shape[0]:,}", f"{dynamic_logistics_df.shape[0]:,}", 
             f"{retail_sales_df.shape[0]:,}", f"{supply_chain_df.shape[0]:,}"],
    'Columns': [dataco_df.shape[1], dynamic_logistics_df.shape[1], 
                retail_sales_df.shape[1], supply_chain_df.shape[1]],
    'Primary Use': [
        'Sales transactions, orders, shipping',
        'Real-time logistics, GPS, IoT monitoring',
        'Retail operations, profitability',
        'Manufacturing, procurement (small test set)'
    ],
    'Best For Agent': [
        'Data Analyst, Finance, Risk',
        'Risk, Real-time monitoring',
        'Finance, Data Analyst',
        'Testing/Validation'
    ]
}

summary_df = pd.DataFrame(summary_data)
print("\n")
display(summary_df)

print("\n" + "="*80)
print("üîë KEY DATA POINTS AVAILABLE")
print("="*80)
print("\nüìä For Data Analyst Agent:")
print("  ‚Ä¢ Order volumes, sales trends over time")
print("  ‚Ä¢ Customer segmentation (city, state, country)")
print("  ‚Ä¢ Product categories and performance")
print("  ‚Ä¢ Shipping modes and delivery patterns")

print("\n‚ö†Ô∏è  For Supply Chain Risk Agent:")
print("  ‚Ä¢ Late delivery risk scores")
print("  ‚Ä¢ Actual vs scheduled shipping days")
print("  ‚Ä¢ Warehouse inventory levels")
print("  ‚Ä¢ Disruption likelihood scores")
print("  ‚Ä¢ Route risk levels")
print("  ‚Ä¢ Stock levels and lead times")

print("\nüí∞ For Finance Insight Agent:")
print("  ‚Ä¢ Sales revenue by product/customer/region")
print("  ‚Ä¢ Profit and profit margins")
print("  ‚Ä¢ Discount patterns")
print("  ‚Ä¢ Benefit per order")
print("  ‚Ä¢ Product costs and pricing")

print("\nüìà For Reporting Agent:")
print("  ‚Ä¢ Time-series data for weekly/monthly reports")
print("  ‚Ä¢ KPIs: delivery performance, sales, profit")
print("  ‚Ä¢ Regional performance metrics")
print("  ‚Ä¢ Return rates")

print("\nüîî For Workflow/Alert Agent:")
print("  ‚Ä¢ Late delivery triggers")
print("  ‚Ä¢ Low inventory alerts")
print("  ‚Ä¢ High-risk shipment flags")
print("  ‚Ä¢ Demand spike detection")

üìã SUMMARY OF DATASETS




Unnamed: 0,Dataset,Rows,Columns,Primary Use,Best For Agent
0,DataCo Supply Chain,180519,54,"Sales transactions, orders, shipping","Data Analyst, Finance, Risk"
1,Dynamic Logistics,32065,27,"Real-time logistics, GPS, IoT monitoring","Risk, Real-time monitoring"
2,Retail Sales,9994,23,"Retail operations, profitability","Finance, Data Analyst"
3,Supply Chain Data,100,24,"Manufacturing, procurement (small test set)",Testing/Validation



üîë KEY DATA POINTS AVAILABLE

üìä For Data Analyst Agent:
  ‚Ä¢ Order volumes, sales trends over time
  ‚Ä¢ Customer segmentation (city, state, country)
  ‚Ä¢ Product categories and performance
  ‚Ä¢ Shipping modes and delivery patterns

‚ö†Ô∏è  For Supply Chain Risk Agent:
  ‚Ä¢ Late delivery risk scores
  ‚Ä¢ Actual vs scheduled shipping days
  ‚Ä¢ Warehouse inventory levels
  ‚Ä¢ Disruption likelihood scores
  ‚Ä¢ Route risk levels
  ‚Ä¢ Stock levels and lead times

üí∞ For Finance Insight Agent:
  ‚Ä¢ Sales revenue by product/customer/region
  ‚Ä¢ Profit and profit margins
  ‚Ä¢ Discount patterns
  ‚Ä¢ Benefit per order
  ‚Ä¢ Product costs and pricing

üìà For Reporting Agent:
  ‚Ä¢ Time-series data for weekly/monthly reports
  ‚Ä¢ KPIs: delivery performance, sales, profit
  ‚Ä¢ Regional performance metrics
  ‚Ä¢ Return rates

üîî For Workflow/Alert Agent:
  ‚Ä¢ Late delivery triggers
  ‚Ä¢ Low inventory alerts
  ‚Ä¢ High-risk shipment flags
  ‚Ä¢ Demand spike detection


## Next Steps: Building Your Multi-Agent System

Based on this analysis, here's what you should do next:

### 1. Data Preparation
- Clean and merge datasets where appropriate
- Create unified time-series structures
- Build vector embeddings for knowledge base (use ChromaDB or Pinecone)
- Set up SQL database for agent queries

### 2. Agent Framework Selection
Choose one of these:
- **LangGraph** (Recommended) - Great for complex agentic workflows
- **CrewAI** - Easy to get started, good for role-based agents
- **AutoGen** - Microsoft's framework, good for multi-agent collaboration
- **Custom with LangChain** - Maximum flexibility

### 3. Implementation Priority
1. Start with **Data Analyst Agent** (foundation for others)
2. Build **Supply Chain Risk Agent** (high value)
3. Add **Finance Insight Agent** 
4. Create **Meeting/Report Agent** (aggregates insights)
5. Finally, **Email/Workflow Agent** (orchestration layer)

### 4. Tech Stack Recommendation
- **LLM**: OpenAI GPT-4 or Claude (for agent reasoning)
- **Vector DB**: ChromaDB or Pinecone (for knowledge base)
- **Database**: PostgreSQL or DuckDB (for SQL queries)
- **Visualization**: Plotly or Matplotlib (auto-charts)
- **Orchestration**: LangGraph or CrewAI
- **Monitoring**: LangSmith or custom logging

# Opik Testing

In [22]:
from opik import track
import opik
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv(override=True)

# Initialize Opik
client = opik.Opik()

@track(project_name="omnisupply-data-analyst")
def test_tracking():
    """Test Opik tracking"""
    return {"status": "success", "message": "Opik is working!"}

if __name__ == "__main__":
    result = test_tracking()
    print(result)
    print("\n‚úÖ Check your Comet workspace for traces!")
    print("üîó https://www.comet.com/omnisupply/projects/omnisupply-data-analyst")

OPIK: Started logging traces to the "omnisupply-data-analyst" project at https://www.comet.com/opik/api/v1/session/redirect/projects/?trace_id=019ae064-68b5-7844-ab6d-2fafea977deb&path=aHR0cHM6Ly93d3cuY29tZXQuY29tL29waWsvYXBpLw==.


{'status': 'success', 'message': 'Opik is working!'}

‚úÖ Check your Comet workspace for traces!
üîó https://www.comet.com/omnisupply/projects/omnisupply-data-analyst
