In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.offline as pyo

In [2]:
try:
    # Load preprocessed datasets
    orders_df = pd.read_csv('../data/processed/orders_ingested.csv')
    inventory_df = pd.read_csv('../data/processed/inventory_ingested.csv')
    fulfillment_df = pd.read_csv('../data/processed/fulfillment_ingested.csv')
    procurement_df = pd.read_csv('../data/processed/procurement_features.csv')
    ml_features_df = pd.read_csv('../data/processed/ml_features.csv')
    
    # Convert date columns
    orders_df['Order_Date'] = pd.to_datetime(orders_df['Order_Date'])
    orders_df['Shipment_Date'] = pd.to_datetime(orders_df['Shipment_Date'])
    inventory_df['Year_Month_Date'] = pd.to_datetime(inventory_df['Year_Month_Date'])
    
    print(f"✅ Orders Data: {orders_df.shape}")
    print(f"✅ Inventory Data: {inventory_df.shape}")
    print(f"✅ Fulfillment Data: {fulfillment_df.shape}")
    print(f"✅ Procurement Features: {procurement_df.shape}")
    print(f"✅ ML Features: {ml_features_df.shape}")
    
except Exception as e:
    print(f"❌ Error loading data: {e}")
    print("Please run the preprocessing pipeline first!")


✅ Orders Data: (30871, 26)
✅ Inventory Data: (4200, 5)
✅ Fulfillment Data: (118, 2)
✅ Procurement Features: (118, 59)
✅ ML Features: (118, 23)


In [10]:
def display_dataset_info(df, name):
    """Display comprehensive dataset information"""
    print(f"\n{name} Dataset Summary:")
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"Date range: {df.select_dtypes(include=['datetime64']).min().min() if not df.select_dtypes(include=['datetime64']).empty else 'No dates'} to {df.select_dtypes(include=['datetime64']).max().max() if not df.select_dtypes(include=['datetime64']).empty else 'No dates'}")
    
    # Missing values
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print(f"Missing values: {missing[missing > 0].to_dict()}")
    else:
        print("Missing values: None")
    
    # Data types
    print(f"Data types: {df.dtypes.value_counts().to_dict()}")

display_dataset_info(orders_df, "Orders")
display_dataset_info(inventory_df, "Inventory")
display_dataset_info(procurement_df, "Procurement Features")


Orders Dataset Summary:
Shape: (30871, 29)
Memory usage: 24.40 MB
Date range: 2015-01-01 00:00:00 to 2017-12-31 00:00:00
Missing values: {'Discount %': 1749}
Data types: {dtype('int64'): 14, dtype('O'): 10, dtype('<M8[ns]'): 2, dtype('int32'): 2, dtype('float64'): 1}

Inventory Dataset Summary:
Shape: (4200, 5)
Memory usage: 0.49 MB
Date range: 2015-01-01 00:00:00 to 2017-12-01 00:00:00
Missing values: None
Data types: {dtype('int64'): 2, dtype('O'): 1, dtype('float64'): 1, dtype('<M8[ns]'): 1}

Procurement Features Dataset Summary:
Shape: (118, 61)
Memory usage: 0.08 MB
Date range: No dates to No dates
Missing values: None
Data types: {dtype('float64'): 51, dtype('int64'): 6, dtype('O'): 4}


In [5]:
total_products = procurement_df['Product Name'].nunique()
total_revenue = procurement_df['Gross Sales_sum'].sum()
total_profit = procurement_df['Profit_sum'].sum()
avg_profit_margin = (total_profit / total_revenue) * 100

reorder_required = len(procurement_df[procurement_df['Stock_Status'] == 'Reorder_Required'])
stockout_products = len(procurement_df[procurement_df['Stock_Status'] == 'Stockout'])
overstock_products = len(procurement_df[procurement_df['Stock_Status'] == 'Overstock'])

print(f"📦 Total Products: {total_products:,}")
print(f"💰 Total Revenue: ${total_revenue:,.2f}")
print(f"💸 Total Profit: ${total_profit:,.2f}")
print(f"📊 Average Profit Margin: {avg_profit_margin:.2f}%")
print(f"🚨 Products Requiring Reorder: {reorder_required} ({reorder_required/total_products*100:.1f}%)")
print(f"❌ Products in Stockout: {stockout_products} ({stockout_products/total_products*100:.1f}%)")
print(f"📈 Products Overstocked: {overstock_products} ({overstock_products/total_products*100:.1f}%)")

# ABC Analysis Summary
abc_distribution = procurement_df['ABC_Category'].value_counts()
print(f"\n🎯 ABC Classification:")
for category, count in abc_distribution.items():
    percentage = (count / total_products) * 100
    revenue_contribution = procurement_df[procurement_df['ABC_Category'] == category]['Gross Sales_sum'].sum()
    revenue_percentage = (revenue_contribution / total_revenue) * 100
    print(f"   Category {category}: {count} products ({percentage:.1f}%) - {revenue_percentage:.1f}% of revenue")

📦 Total Products: 118
💰 Total Revenue: $6,181,476.00
💸 Total Profit: $3,994,192.00
📊 Average Profit Margin: 64.62%
🚨 Products Requiring Reorder: 3 (2.5%)
❌ Products in Stockout: 96 (81.4%)
📈 Products Overstocked: 16 (13.6%)

🎯 ABC Classification:
   Category C: 56 products (47.5%) - 1.6% of revenue
   Category B: 39 products (33.1%) - 2.8% of revenue
   Category A: 23 products (19.5%) - 95.6% of revenue


In [11]:
monthly_orders = orders_df.groupby(orders_df['Order_Date'].dt.to_period('M')).agg({
    'Order Quantity': 'sum',
    'Gross Sales': 'sum',
    'Order ID': 'nunique'
}).reset_index()

monthly_orders['Order_Date'] = monthly_orders['Order_Date'].dt.to_timestamp()

print(f"📅 Data Period: {orders_df['Order_Date'].min().strftime('%Y-%m-%d')} to {orders_df['Order_Date'].max().strftime('%Y-%m-%d')}")
print(f"📈 Peak Order Month: {monthly_orders.loc[monthly_orders['Order Quantity'].idxmax(), 'Order_Date'].strftime('%Y-%m')} ({monthly_orders['Order Quantity'].max():,} units)")
print(f"💰 Peak Revenue Month: {monthly_orders.loc[monthly_orders['Gross Sales'].idxmax(), 'Order_Date'].strftime('%Y-%m')} (${monthly_orders['Gross Sales'].max():,.2f})")

# Seasonality analysis
orders_df['Month'] = orders_df['Order_Date'].dt.month
orders_df['Quarter'] = orders_df['Order_Date'].dt.quarter
orders_df['Day_of_Week'] = orders_df['Order_Date'].dt.day_name()

seasonal_patterns = orders_df.groupby('Month')['Order Quantity'].sum()
quarterly_patterns = orders_df.groupby('Quarter')['Order Quantity'].sum()
weekly_patterns = orders_df.groupby('Day_of_Week')['Order Quantity'].sum()

print(f"\n🌟 Seasonal Insights:")
print(f"   Highest demand month: {seasonal_patterns.idxmax()} ({seasonal_patterns.max():,} units)")
print(f"   Lowest demand month: {seasonal_patterns.idxmin()} ({seasonal_patterns.min():,} units)")
print(f"   Highest demand quarter: Q{quarterly_patterns.idxmax()} ({quarterly_patterns.max():,} units)")

📅 Data Period: 2015-01-01 to 2017-12-31
📈 Peak Order Month: 2016-10 (2,324 units)
💰 Peak Revenue Month: 2016-08 ($203,949.00)

🌟 Seasonal Insights:
   Highest demand month: 1 (6,215 units)
   Lowest demand month: 12 (4,171 units)
   Highest demand quarter: Q1 (18,153 units)


In [5]:

# Top performing products
top_products_revenue = procurement_df.nlargest(10, 'Gross Sales_sum')[['Product Name', 'Gross Sales_sum', 'Order Quantity_sum', 'ABC_Category']]
top_products_volume = procurement_df.nlargest(10, 'Order Quantity_sum')[['Product Name', 'Order Quantity_sum', 'Gross Sales_sum', 'ABC_Category']]

print("💰 Top 10 Products by Revenue:")
for idx, row in top_products_revenue.iterrows():
    print(f"   {row['Product Name'][:50]}... - ${row['Gross Sales_sum']:,.2f} (Category {row['ABC_Category']})")

print("\n📦 Top 10 Products by Volume:")
for idx, row in top_products_volume.iterrows():
    print(f"   {row['Product Name'][:50]}... - {row['Order Quantity_sum']:,} units (Category {row['ABC_Category']})")

# Product diversity analysis
dept_performance = orders_df.groupby('Product Department').agg({
    'Order Quantity': 'sum',
    'Gross Sales': 'sum',
    'Product Name': 'nunique'
}).round(2)

print(f"\n🏢 Department Performance:")
for dept, data in dept_performance.iterrows():
    print(f"   {dept}: {data['Product Name']} products, {data['Order Quantity']:,} units, ${data['Gross Sales']:,.2f}")


💰 Top 10 Products by Revenue:
   Field & Stream Sportsman 16 Gun Fire Safe... - $1,151,200.00 (Category A)
   Perfect Fitness Perfect Rip Deck... - $777,120.00 (Category A)
   Diamondback Women's Serene Classic Comfort Bi... - $728,100.00 (Category A)
   Nike Men's Free 5.0+ Running Shoe... - $645,500.00 (Category A)
   Nike Men's Dri-FIT Victory Golf Polo... - $544,300.00 (Category A)
   Pelican Sunstream 100 Kayak... - $536,200.00 (Category A)
   O'Brien Men's Neoprene Life Vest... - $508,200.00 (Category A)
   Nike Men's CJ Elite 2 TD Football Cleat... - $501,150.00 (Category A)
   Under Armour Girls' Toddler Spine Surge Runni... - $227,560.00 (Category A)
   Web Camera... - $86,332.00 (Category A)

📦 Top 10 Products by Volume:
   Perfect Fitness Perfect Rip Deck... - 12,952.0 units (Category A)
   Nike Men's Dri-FIT Victory Golf Polo... - 10,886.0 units (Category A)
   O'Brien Men's Neoprene Life Vest... - 10,164.0 units (Category A)
   Nike Men's Free 5.0+ Running Shoe... - 6,455.

In [12]:
current_inventory_stats = procurement_df[['Current_Inventory', 'Reorder_Point', 'Safety_Stock', 'EOQ']].describe()
print("📊 Inventory Statistics:")
print(current_inventory_stats.round(2))

# Stockout risk analysis
high_risk_products = procurement_df[
    (procurement_df['Stockout_Frequency'] > 0.2) |  # More than 20% stockout frequency
    (procurement_df['Current_Inventory'] <= procurement_df['Reorder_Point'])
].sort_values('Procurement_Priority_Score', ascending=False)

print(f"\n🚨 High Risk Products (Stockout Frequency > 20% or Below Reorder Point): {len(high_risk_products)}")
if len(high_risk_products) > 0:
    print("Top 5 High Risk Products:")
    for idx, row in high_risk_products.head().iterrows():
        print(f"   {row['Product Name'][:50]}... - Current: {row['Current_Inventory']}, Reorder: {row['Reorder_Point']:.1f}, Priority: {row['Procurement_Priority_Score']:.1f}")

# Inventory efficiency metrics
procurement_df['Inventory_Efficiency'] = procurement_df['Order Quantity_sum'] / (procurement_df['Current_Inventory'] + 1)  # Add 1 to avoid division by zero
procurement_df['Cost_Efficiency'] = procurement_df['Profit_sum'] / (procurement_df['Current_Unit_Cost'] * procurement_df['Current_Inventory'] + 1)

efficiency_stats = procurement_df[['Inventory_Efficiency', 'Cost_Efficiency']].describe()
print(f"\n⚡ Efficiency Metrics:")
print(efficiency_stats.round(2))

📊 Inventory Statistics:
       Current_Inventory  Reorder_Point  Safety_Stock      EOQ
count             118.00         118.00        118.00   118.00
mean                3.34           0.90          0.53   313.84
std                14.66           0.78          0.52   522.27
min                 0.00           0.00          0.00     0.00
25%                 0.00           0.17          0.00    88.08
50%                 0.00           0.83          0.59   183.56
75%                 0.00           1.57          1.01   290.40
max               133.00           2.36          1.46  3052.16

🚨 High Risk Products (Stockout Frequency > 20% or Below Reorder Point): 112
Top 5 High Risk Products:
   First aid kit... - Current: 18, Reorder: 0.1, Priority: 82.2
   Summer dresses... - Current: 13, Reorder: 0.2, Priority: 81.6
   Porcelain crafts... - Current: 0, Reorder: 0.2, Priority: 80.9
   Industrial consumer electronics... - Current: 0, Reorder: 0.2, Priority: 80.4
   Lawn mower... - Current: 4,

In [7]:
delivery_stats = procurement_df[['Warehouse_Fulfillment_Days', 'On_Time_Delivery_mean', 'Delivery_Reliability']].describe()
print("📦 Delivery Performance Statistics:")
print(delivery_stats.round(3))

# Best and worst performing products by delivery
best_delivery = procurement_df.nlargest(5, 'Delivery_Reliability')[['Product Name', 'Delivery_Reliability', 'Warehouse_Fulfillment_Days']]
worst_delivery = procurement_df.nsmallest(5, 'Delivery_Reliability')[['Product Name', 'Delivery_Reliability', 'Warehouse_Fulfillment_Days']]

print(f"\n✅ Best Delivery Performance:")
for idx, row in best_delivery.iterrows():
    print(f"   {row['Product Name'][:50]}... - Reliability: {row['Delivery_Reliability']:.3f}, Lead Time: {row['Warehouse_Fulfillment_Days']:.1f} days")

print(f"\n❌ Worst Delivery Performance:")
for idx, row in worst_delivery.iterrows():
    print(f"   {row['Product Name'][:50]}... - Reliability: {row['Delivery_Reliability']:.3f}, Lead Time: {row['Warehouse_Fulfillment_Days']:.1f} days")

📦 Delivery Performance Statistics:
       Warehouse_Fulfillment_Days  On_Time_Delivery_mean  Delivery_Reliability
count                     118.000                118.000               118.000
mean                        5.192                  0.560                 0.694
std                         2.578                  0.179                 0.189
min                         0.000                  0.000                 0.000
25%                         3.300                  0.500                 0.651
50%                         5.300                  0.563                 0.717
75%                         6.900                  0.660                 0.777
max                         9.900                  1.000                 1.000

✅ Best Delivery Performance:
   Garmin Forerunner 910XT GPS Watch... - Reliability: 1.000, Lead Time: 5.6 days
   GoPro HERO3+ Black Edition Camera... - Reliability: 1.000, Lead Time: 5.4 days
   GolfBuddy VT3 GPS Watch... - Reliability: 1.000, Lead Tim

In [8]:
demand_analysis = procurement_df[['Order Quantity_mean', 'Order Quantity_std', 'Demand_Variability', 'Demand_Growth_Mean']].describe()
print("📊 Demand Analysis:")
print(demand_analysis.round(3))

# High growth products
high_growth = procurement_df[procurement_df['Demand_Growth_Mean'] > 0.1].sort_values('Demand_Growth_Mean', ascending=False)
declining_demand = procurement_df[procurement_df['Demand_Growth_Mean'] < -0.1].sort_values('Demand_Growth_Mean', ascending=True)

print(f"\n📈 High Growth Products (>10% growth): {len(high_growth)}")
if len(high_growth) > 0:
    print("Top 5 Growing Products:")
    for idx, row in high_growth.head().iterrows():
        print(f"   {row['Product Name'][:50]}... - Growth: {row['Demand_Growth_Mean']:.3f}, Current Demand: {row['Order Quantity_mean']:.1f}")

print(f"\n📉 Declining Demand Products (<-10% growth): {len(declining_demand)}")
if len(declining_demand) > 0:
    print("Top 5 Declining Products:")
    for idx, row in declining_demand.head().iterrows():
        print(f"   {row['Product Name'][:50]}... - Growth: {row['Demand_Growth_Mean']:.3f}, Current Demand: {row['Order Quantity_mean']:.1f}")


📊 Demand Analysis:
       Order Quantity_mean  Order Quantity_std  Demand_Variability  \
count              118.000             118.000             118.000   
mean                 2.030               0.753               0.254   
std                  1.069               0.701               0.238   
min                  0.000               0.000               0.000   
25%                  1.000               0.000               0.000   
50%                  2.611               1.184               0.379   
75%                  3.000               1.420               0.479   
max                  3.667               1.586               0.570   

       Demand_Growth_Mean  
count             118.000  
mean                0.827  
std                 1.256  
min                -0.800  
25%                 0.220  
50%                 0.503  
75%                 0.853  
max                 8.833  

📈 High Growth Products (>10% growth): 93
Top 5 Growing Products:
   Lawn mower... - Growth: 8.833

In [14]:

total_products = procurement_df['Product Name'].nunique()
eoq_stats = procurement_df['EOQ'].describe()
print("📦 Economic Order Quantity (EOQ) Analysis:")
print(eoq_stats.round(2))

# Reorder point analysis
reorder_stats = procurement_df['Reorder_Point'].describe()
print(f"\n🔄 Reorder Point Analysis:")
print(reorder_stats.round(2))

# Priority score distribution
priority_stats = procurement_df['Procurement_Priority_Score'].describe()
print(f"\n⭐ Procurement Priority Score Analysis:")
print(priority_stats.round(2))

# Action recommendations summary
action_summary = procurement_df['Recommended_Action'].value_counts()
print(f"\n📋 Recommended Actions Summary:")
for action, count in action_summary.items():
    percentage = (count / total_products) * 100
    print(f"   {action}: {count} products ({percentage:.1f}%)")

📦 Economic Order Quantity (EOQ) Analysis:
count     118.00
mean      313.84
std       522.27
min         0.00
25%        88.08
50%       183.56
75%       290.40
max      3052.16
Name: EOQ, dtype: float64

🔄 Reorder Point Analysis:
count    118.00
mean       0.90
std        0.78
min        0.00
25%        0.17
50%        0.83
75%        1.57
max        2.36
Name: Reorder_Point, dtype: float64

⭐ Procurement Priority Score Analysis:
count    118.00
mean      52.22
std       14.65
min       17.97
25%       44.81
50%       50.56
75%       59.57
max       82.21
Name: Procurement_Priority_Score, dtype: float64

📋 Recommended Actions Summary:
   Urgent_Reorder: 96 products (81.4%)
   Reduce_Orders: 16 products (13.6%)
   Reorder_Soon: 3 products (2.5%)
   Increase_Stock: 2 products (1.7%)
   Monitor: 1 products (0.8%)


In [15]:
correlation_features = [
    'Order Quantity_mean', 'Gross Sales_sum', 'Profit_sum',
    'Current_Inventory', 'Stockout_Frequency', 'Demand_Variability',
    'Warehouse_Fulfillment_Days', 'Delivery_Reliability',
    'Reorder_Point', 'Safety_Stock', 'EOQ', 'Procurement_Priority_Score'
]

correlation_matrix = procurement_df[correlation_features].corr()

# Find strong correlations (>0.7 or <-0.7)
strong_correlations = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        corr_value = correlation_matrix.iloc[i, j]
        if abs(corr_value) > 0.7:
            strong_correlations.append({
                'feature1': correlation_matrix.columns[i],
                'feature2': correlation_matrix.columns[j],
                'correlation': corr_value
            })

print(f"🔗 Strong Correlations (|r| > 0.7): {len(strong_correlations)}")
for corr in sorted(strong_correlations, key=lambda x: abs(x['correlation']), reverse=True)[:5]:
    print(f"   {corr['feature1']} ↔ {corr['feature2']}: {corr['correlation']:.3f}")

🔗 Strong Correlations (|r| > 0.7): 10
   Reorder_Point ↔ Safety_Stock: 0.986
   Gross Sales_sum ↔ Profit_sum: 0.968
   Demand_Variability ↔ Safety_Stock: 0.940
   Order Quantity_mean ↔ Demand_Variability: 0.935
   Order Quantity_mean ↔ Safety_Stock: 0.911


In [16]:
def detect_outliers_iqr(df, column):
    """Detect outliers using IQR method"""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers

# Detect outliers in key metrics
outlier_columns = ['Order Quantity_sum', 'Gross Sales_sum', 'Current_Inventory', 'Procurement_Priority_Score']

print("🎯 Outlier Detection Results:")
for col in outlier_columns:
    if col in procurement_df.columns:
        outliers = detect_outliers_iqr(procurement_df, col)
        percentage = (len(outliers) / len(procurement_df)) * 100
        print(f"   {col}: {len(outliers)} outliers ({percentage:.1f}%)")
        
        if len(outliers) > 0 and len(outliers) <= 5:
            print(f"      Example outliers: {outliers['Product Name'].head(3).tolist()}")


🎯 Outlier Detection Results:
   Order Quantity_sum: 9 outliers (7.6%)
   Gross Sales_sum: 19 outliers (16.1%)
   Current_Inventory: 22 outliers (18.6%)
   Procurement_Priority_Score: 6 outliers (5.1%)


In [21]:
def assess_data_quality(df, name):
    """Comprehensive data quality assessment"""
    print(f"\n{name} Data Quality Report:")
    
    # Completeness
    total_cells = df.shape[0] * df.shape[1]
    missing_cells = df.isnull().sum().sum()
    completeness = ((total_cells - missing_cells) / total_cells) * 100
    print(f"   📊 Completeness: {completeness:.2f}%")
    
    # Uniqueness (for product names)
    if 'Product Name' in df.columns:
        uniqueness = (df['Product Name'].nunique() / len(df)) * 100
        print(f"   🆔 Product Uniqueness: {uniqueness:.2f}%")
    
    # Consistency (no negative values where inappropriate)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    negative_issues = 0
    for col in numeric_cols:
        if 'Quantity' in col or 'Sales' in col or 'Inventory' in col:
            negative_count = (df[col] < 0).sum()
            if negative_count > 0:
                negative_issues += negative_count
    
    consistency = ((len(df) * len(numeric_cols) - negative_issues) / (len(df) * len(numeric_cols))) * 100 if len(numeric_cols) > 0 else 100
    print(f"   ✅ Consistency: {consistency:.2f}%")
    
    # Overall quality score
    overall_quality = (completeness + uniqueness + consistency) / 3 if 'Product Name' in df.columns else (completeness + consistency) / 2
    print(f"   🏆 Overall Quality Score: {overall_quality:.2f}%")
    return overall_quality

assess_data_quality(procurement_df, "Procurement Features")
overall_quality = assess_data_quality(ml_features_df, "ML Features")


Procurement Features Data Quality Report:
   📊 Completeness: 100.00%
   🆔 Product Uniqueness: 100.00%
   ✅ Consistency: 100.00%
   🏆 Overall Quality Score: 100.00%

ML Features Data Quality Report:
   📊 Completeness: 100.00%
   🆔 Product Uniqueness: 100.00%
   ✅ Consistency: 87.56%
   🏆 Overall Quality Score: 95.85%


In [18]:
reasonable_reorder = procurement_df[
    (procurement_df['Reorder_Point'] > 0) & 
    (procurement_df['Reorder_Point'] <= procurement_df['Order Quantity_sum'])
]
print(f"   ✅ Reasonable Reorder Points: {len(reasonable_reorder)}/{len(procurement_df)} ({len(reasonable_reorder)/len(procurement_df)*100:.1f}%)")

# Check if EOQ values are reasonable
reasonable_eoq = procurement_df[
    (procurement_df['EOQ'] > 0) & 
    (procurement_df['EOQ'] <= procurement_df['Order Quantity_sum'] * 2)
]
print(f"   ✅ Reasonable EOQ Values: {len(reasonable_eoq)}/{len(procurement_df)} ({len(reasonable_eoq)/len(procurement_df)*100:.1f}%)")

# Check if safety stock is reasonable
reasonable_safety = procurement_df[
    (procurement_df['Safety_Stock'] >= 0) & 
    (procurement_df['Safety_Stock'] <= procurement_df['Order Quantity_mean'] * 2)
]
print(f"   ✅ Reasonable Safety Stock: {len(reasonable_safety)}/{len(procurement_df)} ({len(reasonable_safety)/len(procurement_df)*100:.1f}%)")

# Feature distribution analysis
print(f"\n📊 Feature Distribution Summary:")
ml_numeric_features = ml_features_df.select_dtypes(include=[np.number]).columns
skewed_features = []
for feature in ml_numeric_features:
    skewness = stats.skew(ml_features_df[feature].dropna())
    if abs(skewness) > 1:  # Highly skewed
        skewed_features.append((feature, skewness))

print(f"   📈 Highly Skewed Features (|skew| > 1): {len(skewed_features)}")
for feature, skew_val in sorted(skewed_features, key=lambda x: abs(x[1]), reverse=True)[:5]:
    print(f"      {feature}: {skew_val:.3f}")

   ✅ Reasonable Reorder Points: 113/118 (95.8%)
   ✅ Reasonable EOQ Values: 37/118 (31.4%)
   ✅ Reasonable Safety Stock: 118/118 (100.0%)

📊 Feature Distribution Summary:
   📈 Highly Skewed Features (|skew| > 1): 11
      Inventory_Days_Supply: 7.558
      Current_Inventory: 6.827
      EOQ: 3.749
      Order_Frequency: 3.482
      Demand_Growth_Mean: 3.456


In [22]:
reorder_required = len(procurement_df[procurement_df['Stock_Status'] == 'Reorder_Required'])
stockout_products = len(procurement_df[procurement_df['Stock_Status'] == 'Stockout'])

urgent_products = len(procurement_df[procurement_df['Stock_Status'] == 'Stockout'])
if urgent_products > 0:
    print(f"   🚨 URGENT: {urgent_products} products are in stockout - immediate reordering required")

# High opportunity products
high_opportunity = len(procurement_df[
    (procurement_df['Demand_Growth_Mean'] > 0.1) & 
    (procurement_df['ABC_Category'] == 'A')
])
if high_opportunity > 0:
    print(f"   📈 OPPORTUNITY: {high_opportunity} high-value products showing strong growth")

# Efficiency improvements
overstock_value = procurement_df[procurement_df['Stock_Status'] == 'Overstock']['Gross Sales_sum'].sum()
if overstock_value > 0:
    print(f"   💰 EFFICIENCY: ${overstock_value:,.2f} tied up in overstocked products")

# Supplier issues
poor_suppliers = len(procurement_df[procurement_df['Delivery_Reliability'] < 0.8])
if poor_suppliers > 0:
    print(f"   🚚 SUPPLIER: {poor_suppliers} products have poor delivery reliability (<80%)")

print(f"\n📊 RECOMMENDATIONS:")
print(f"   1. Implement automated reordering for {reorder_required + stockout_products} products")
print(f"   2. Review supplier contracts for {poor_suppliers} underperforming products")
print(f"   3. Optimize inventory levels to free up ${overstock_value:,.2f} in working capital")
print(f"   4. Focus growth strategies on {high_opportunity} high-potential products")

print(f"\n✅ DATA READINESS FOR ML:")
print(f"   • {len(ml_features_df)} products with {len(ml_features_df.columns)-1} features")
print(f"   • Data quality score: {overall_quality:.1f}%")
print(f"   • Feature engineering validation: {len(reasonable_reorder)/len(procurement_df)*100:.1f}% metrics validated")
print(f"   • Ready for demand forecasting, inventory optimization, and supplier selection models")


   🚨 URGENT: 96 products are in stockout - immediate reordering required
   📈 OPPORTUNITY: 14 high-value products showing strong growth
   💰 EFFICIENCY: $111,062.00 tied up in overstocked products
   🚚 SUPPLIER: 93 products have poor delivery reliability (<80%)

📊 RECOMMENDATIONS:
   1. Implement automated reordering for 99 products
   2. Review supplier contracts for 93 underperforming products
   3. Optimize inventory levels to free up $111,062.00 in working capital
   4. Focus growth strategies on 14 high-potential products

✅ DATA READINESS FOR ML:
   • 118 products with 22 features
   • Data quality score: 95.9%
   • Feature engineering validation: 95.8% metrics validated
   • Ready for demand forecasting, inventory optimization, and supplier selection models


In [23]:
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('ABC Category Distribution', 'Stock Status Overview', 
                   'Revenue by Product Category', 'Procurement Priority Distribution'),
    specs=[[{"type": "pie"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "histogram"}]]
)

# ABC Category Distribution (Pie Chart)
abc_counts = procurement_df['ABC_Category'].value_counts()
fig.add_trace(
    go.Pie(labels=abc_counts.index, values=abc_counts.values, name="ABC Categories"),
    row=1, col=1
)

# Stock Status Overview (Bar Chart)
stock_counts = procurement_df['Stock_Status'].value_counts()
colors = ['red' if status == 'Stockout' else 'orange' if status == 'Reorder_Required' 
          else 'blue' if status == 'Overstock' else 'green' for status in stock_counts.index]

fig.add_trace(
    go.Bar(x=stock_counts.index, y=stock_counts.values, name="Stock Status",
           marker_color=colors),
    row=1, col=2
)

# Revenue by Department (Bar Chart)
dept_revenue = orders_df.groupby('Product Department')['Gross Sales'].sum().sort_values(ascending=False).head(10)
fig.add_trace(
    go.Bar(x=dept_revenue.index, y=dept_revenue.values, name="Revenue by Department",
           marker_color='lightblue'),
    row=2, col=1
)

# Procurement Priority Distribution (Histogram)
fig.add_trace(
    go.Histogram(x=procurement_df['Procurement_Priority_Score'], name="Priority Score",
                nbinsx=30, marker_color='purple'),
    row=2, col=2
)

fig.update_layout(height=800, showlegend=False, title_text="Procurement Business Overview Dashboard")
fig.show()

In [24]:
monthly_data = orders_df.groupby(orders_df['Order_Date'].dt.to_period('M')).agg({
    'Order Quantity': 'sum',
    'Gross Sales': 'sum',
    'Order ID': 'nunique'
}).reset_index()
monthly_data['Order_Date'] = monthly_data['Order_Date'].dt.to_timestamp()

# Create temporal analysis dashboard
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Monthly Order Volume Trend', 'Monthly Revenue Trend',
                   'Seasonal Patterns (by Month)', 'Weekly Patterns'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}],
           [{"type": "bar"}, {"type": "bar"}]]
)

# Monthly Order Volume
fig.add_trace(
    go.Scatter(x=monthly_data['Order_Date'], y=monthly_data['Order Quantity'],
              mode='lines+markers', name='Order Volume', line=dict(color='blue', width=3)),
    row=1, col=1
)

# Monthly Revenue
fig.add_trace(
    go.Scatter(x=monthly_data['Order_Date'], y=monthly_data['Gross Sales'],
              mode='lines+markers', name='Revenue', line=dict(color='green', width=3)),
    row=1, col=2
)

# Seasonal patterns by month
orders_df['Month'] = orders_df['Order_Date'].dt.month
monthly_seasonal = orders_df.groupby('Month')['Order Quantity'].sum()
fig.add_trace(
    go.Bar(x=monthly_seasonal.index, y=monthly_seasonal.values, name='Monthly Volume',
           marker_color='orange'),
    row=2, col=1
)

# Weekly patterns
orders_df['Day_of_Week'] = orders_df['Order_Date'].dt.day_name()
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekly_patterns = orders_df.groupby('Day_of_Week')['Order Quantity'].sum().reindex(day_order)
fig.add_trace(
    go.Bar(x=weekly_patterns.index, y=weekly_patterns.values, name='Weekly Volume',
           marker_color='purple'),
    row=2, col=2
)

fig.update_layout(height=800, showlegend=False, title_text="Temporal Analysis Dashboard")
fig.show()

In [25]:
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Current Inventory vs Reorder Point', 'EOQ Distribution',
                   'Safety Stock Analysis', 'Inventory Days Supply'),
    specs=[[{"type": "scatter"}, {"type": "histogram"}],
           [{"type": "box"}, {"type": "scatter"}]]
)

# Current Inventory vs Reorder Point Scatter Plot
colors = ['red' if status == 'Stockout' else 'orange' if status == 'Reorder_Required' 
          else 'blue' if status == 'Overstock' else 'green' 
          for status in procurement_df['Stock_Status']]

fig.add_trace(
    go.Scatter(x=procurement_df['Reorder_Point'], y=procurement_df['Current_Inventory'],
              mode='markers', name='Inventory vs Reorder Point',
              marker=dict(color=colors, size=8, opacity=0.6),
              text=procurement_df['Product Name'].str[:30]),
    row=1, col=1
)

# Add diagonal line for reference (where current inventory = reorder point)
max_val = max(procurement_df['Reorder_Point'].max(), procurement_df['Current_Inventory'].max())
fig.add_trace(
    go.Scatter(x=[0, max_val], y=[0, max_val], mode='lines', 
              line=dict(dash='dash', color='red'), name='Reorder Line'),
    row=1, col=1
)

# EOQ Distribution
fig.add_trace(
    go.Histogram(x=procurement_df['EOQ'], name='EOQ Distribution',
                nbinsx=30, marker_color='lightgreen'),
    row=1, col=2
)

# Safety Stock by ABC Category
abc_categories = procurement_df['ABC_Category'].unique()
for category in abc_categories:
    category_data = procurement_df[procurement_df['ABC_Category'] == category]
    fig.add_trace(
        go.Box(y=category_data['Safety_Stock'], name=f'Category {category}'),
        row=2, col=1
    )

# Inventory Days Supply vs Demand
fig.add_trace(
    go.Scatter(x=procurement_df['Order Quantity_mean'], y=procurement_df['Inventory_Days_Supply'],
              mode='markers', name='Days Supply vs Demand',
              marker=dict(size=8, color=procurement_df['ABC_Category'].map({'A': 'red', 'B': 'blue', 'C': 'green'}),
                         opacity=0.6)),
    row=2, col=2
)

fig.update_layout(height=800, showlegend=True, title_text="Inventory Optimization Dashboard")
fig.show()

In [26]:
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Delivery Reliability Distribution', 'Lead Time vs Reliability',
                   'Fulfillment Time by ABC Category', 'On-Time Delivery Performance'),
    specs=[[{"type": "histogram"}, {"type": "scatter"}],
           [{"type": "box"}, {"type": "bar"}]]
)

# Delivery Reliability Distribution
fig.add_trace(
    go.Histogram(x=procurement_df['Delivery_Reliability'], name='Delivery Reliability',
                nbinsx=25, marker_color='skyblue'),
    row=1, col=1
)

# Lead Time vs Reliability Scatter
fig.add_trace(
    go.Scatter(x=procurement_df['Warehouse_Fulfillment_Days'], 
              y=procurement_df['Delivery_Reliability'],
              mode='markers', name='Lead Time vs Reliability',
              marker=dict(size=8, color=procurement_df['Procurement_Priority_Score'],
                         colorscale='Viridis', opacity=0.6, showscale=True)),
    row=1, col=2
)

# Fulfillment Time by ABC Category
for category in abc_categories:
    category_data = procurement_df[procurement_df['ABC_Category'] == category]
    fig.add_trace(
        go.Box(y=category_data['Warehouse_Fulfillment_Days'], name=f'Category {category}'),
        row=2, col=1
    )

# On-Time Delivery Performance (Top and Bottom performers)
delivery_sorted = procurement_df.sort_values('On_Time_Delivery_mean')
top_performers = delivery_sorted.tail(10)
bottom_performers = delivery_sorted.head(10)

fig.add_trace(
    go.Bar(x=top_performers['Product Name'].str[:20], y=top_performers['On_Time_Delivery_mean'],
           name='Top Performers', marker_color='green'),
    row=2, col=2
)

fig.add_trace(
    go.Bar(x=bottom_performers['Product Name'].str[:20], y=bottom_performers['On_Time_Delivery_mean'],
           name='Bottom Performers', marker_color='red'),
    row=2, col=2
)

fig.update_layout(height=800, showlegend=True, title_text="Supplier Performance Dashboard")
fig.show()


In [27]:
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Demand Growth Distribution', 'Demand Variability vs Growth',
                   'High Growth Products', 'Demand Patterns by Department'),
    specs=[[{"type": "histogram"}, {"type": "scatter"}],
           [{"type": "bar"}, {"type": "box"}]]
)

# Demand Growth Distribution
fig.add_trace(
    go.Histogram(x=procurement_df['Demand_Growth_Mean'], name='Demand Growth',
                nbinsx=30, marker_color='lightcoral'),
    row=1, col=1
)

# Demand Variability vs Growth
fig.add_trace(
    go.Scatter(x=procurement_df['Demand_Variability'], y=procurement_df['Demand_Growth_Mean'],
              mode='markers', name='Variability vs Growth',
              marker=dict(size=procurement_df['Order Quantity_sum']/1000, 
                         color=procurement_df['ABC_Category'].map({'A': 'red', 'B': 'blue', 'C': 'green'}),
                         opacity=0.6),
              text=procurement_df['Product Name'].str[:30]),
    row=1, col=2
)

# High Growth Products (Top 15)
high_growth = procurement_df.nlargest(15, 'Demand_Growth_Mean')
fig.add_trace(
    go.Bar(x=high_growth['Product Name'].str[:20], y=high_growth['Demand_Growth_Mean'],
           name='High Growth Products', marker_color='gold'),
    row=2, col=1
)

# Demand Patterns by Department
departments = orders_df['Product Department'].value_counts().head(8).index
for dept in departments:
    dept_data = orders_df[orders_df['Product Department'] == dept]
    monthly_dept = dept_data.groupby(dept_data['Order_Date'].dt.month)['Order Quantity'].sum()
    fig.add_trace(
        go.Scatter(x=monthly_dept.index, y=monthly_dept.values, 
                  mode='lines+markers', name=dept),
        row=2, col=2
    )

fig.update_layout(height=800, showlegend=True, title_text="Demand Forecasting Dashboard")
fig.show()

In [28]:
key_features = [
    'Order Quantity_mean', 'Gross Sales_sum', 'Current_Inventory',
    'Stockout_Frequency', 'Delivery_Reliability', 'Demand_Growth_Mean',
    'Reorder_Point', 'Safety_Stock', 'EOQ', 'Procurement_Priority_Score'
]

correlation_matrix = procurement_df[key_features].corr()

# Create correlation heatmap
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu',
    zmid=0,
    text=correlation_matrix.round(2).values,
    texttemplate="%{text}",
    textfont={"size": 10},
    hoverongaps=False
))

fig.update_layout(
    title='Correlation Matrix of Key Procurement Features',
    width=800,
    height=600
)
fig.show()


In [29]:
outlier_features = ['Order Quantity_sum', 'Gross Sales_sum', 'Current_Inventory', 'Procurement_Priority_Score']

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=tuple([f'{feature} Outliers' for feature in outlier_features])
)

positions = [(1,1), (1,2), (2,1), (2,2)]

for i, feature in enumerate(outlier_features):
    row, col = positions[i]
    fig.add_trace(
        go.Box(y=procurement_df[feature], name=feature, boxpoints='outliers'),
        row=row, col=col
    )

fig.update_layout(height=600, showlegend=False, title_text="Outlier Detection Dashboard")
fig.show()

In [30]:
action_counts = procurement_df['Recommended_Action'].value_counts()

# Create pie chart for recommended actions
fig = go.Figure(data=[go.Pie(
    labels=action_counts.index,
    values=action_counts.values,
    hole=.3,
    textinfo='label+percent',
    textposition='inside'
)])

fig.update_layout(title_text="Recommended Procurement Actions Distribution")
fig.show()

# Priority matrix: Stockout Risk vs Revenue Impact
fig = go.Figure()

# Create scatter plot with size based on priority score
fig.add_trace(go.Scatter(
    x=procurement_df['Stockout_Frequency'],
    y=procurement_df['Gross Sales_sum'],
    mode='markers',
    marker=dict(
        size=procurement_df['Procurement_Priority_Score'],
        color=procurement_df['ABC_Category'].map({'A': 'red', 'B': 'blue', 'C': 'green'}),
        opacity=0.6,
        sizemode='diameter',
        sizeref=2,
        line=dict(width=1, color='DarkSlateGrey')
    ),
    text=procurement_df['Product Name'].str[:30],
    hovertemplate='<b>%{text}</b><br>' +
                  'Stockout Risk: %{x:.2f}<br>' +
                  'Revenue: $%{y:,.0f}<br>' +
                  'Priority: %{marker.size:.1f}<br>' +
                  '<extra></extra>'
))

fig.update_layout(
    title='Procurement Priority Matrix: Stockout Risk vs Revenue Impact',
    xaxis_title='Stockout Frequency',
    yaxis_title='Total Revenue ($)',
    width=800,
    height=600
)
fig.show()

In [31]:
dept_analysis = orders_df.groupby('Product Department').agg({
    'Order Quantity': 'sum',
    'Gross Sales': 'sum',
    'Profit': 'sum',
    'Product Name': 'nunique'
}).round(2)

dept_analysis['Profit_Margin'] = (dept_analysis['Profit'] / dept_analysis['Gross Sales']) * 100
dept_analysis = dept_analysis.sort_values('Gross Sales', ascending=False).head(10)

# Create department analysis dashboard
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Revenue by Department', 'Profit Margin by Department',
                   'Product Count by Department', 'Volume by Department')
)

# Revenue by Department
fig.add_trace(
    go.Bar(x=dept_analysis.index, y=dept_analysis['Gross Sales'], 
           name='Revenue', marker_color='lightblue'),
    row=1, col=1
)

# Profit Margin by Department
fig.add_trace(
    go.Bar(x=dept_analysis.index, y=dept_analysis['Profit_Margin'], 
           name='Profit Margin %', marker_color='lightgreen'),
    row=1, col=2
)

# Product Count by Department
fig.add_trace(
    go.Bar(x=dept_analysis.index, y=dept_analysis['Product Name'], 
           name='Product Count', marker_color='orange'),
    row=2, col=1
)

# Volume by Department
fig.add_trace(
    go.Bar(x=dept_analysis.index, y=dept_analysis['Order Quantity'], 
           name='Order Volume', marker_color='purple'),
    row=2, col=2
)

fig.update_layout(height=800, showlegend=False, title_text="Department Performance Dashboard")
fig.update_xaxes(tickangle=45)
fig.show()

In [34]:
target_correlations = {}
targets = ['Stockout_Frequency', 'Demand_Growth_Mean', 'Procurement_Priority_Score']

for target in targets:
    correlations = procurement_df[key_features].corrwith(procurement_df[target]).abs().sort_values(ascending=False)
    target_correlations[target] = correlations

# Create feature importance heatmap
fig = go.Figure()

for i, target in enumerate(targets):
    fig.add_trace(go.Bar(
        x=target_correlations[target].index,
        y=target_correlations[target].values,
        name=target,
        opacity=0.7
    ))

fig.update_layout(
    title='Feature Importance for Key Prediction Targets',
    xaxis_title='Features',
    yaxis_title='Absolute Correlation',
    barmode='group',
    height=500
)
fig.update_xaxes(tickangle=45)
fig.show()

# Data completeness visualization
completeness_data = []
for col in procurement_df.columns:
    if procurement_df[col].dtype in ['float64', 'int64']:
        completeness = (1 - procurement_df[col].isnull().sum() / len(procurement_df)) * 100
        completeness_data.append({'Feature': col, 'Completeness': completeness})

completeness_df = pd.DataFrame(completeness_data).sort_values('Completeness')

fig = go.Figure(go.Bar(
    x=completeness_df['Completeness'],
    y=completeness_df['Feature'],
    orientation='h',
    marker_color='lightcoral'
))

fig.update_layout(
    title='Data Completeness by Feature',
    xaxis_title='Completeness (%)',
    height=max(400, len(completeness_df) * 20)
)
fig.show()