In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [5]:
try:
    # Load preprocessed data
    procurement_df = pd.read_csv('../data/processed/procurement_features.csv')
    orders_df = pd.read_csv('../data/processed/orders_ingested.csv')
    ml_features_df = pd.read_csv('../data/processed/ml_features.csv')
    
    # Convert dates
    orders_df['Order_Date'] = pd.to_datetime(orders_df['Order_Date'])
    orders_df['Shipment_Date'] = pd.to_datetime(orders_df['Shipment_Date'])
    
    print(f"✅ Procurement Features: {procurement_df.shape}")
    print(f"✅ Orders Data: {orders_df.shape}")
    print(f"✅ ML Features: {ml_features_df.shape}")
    
except Exception as e:
    print(f"❌ Error loading data: {e}")
    exit(1)

# Prepare modeling datasets
print("\n📋 Preparing modeling datasets...")

# Create time-based features for temporal models
orders_df['Year'] = orders_df['Order_Date'].dt.year
orders_df['Month'] = orders_df['Order_Date'].dt.month
orders_df['Quarter'] = orders_df['Order_Date'].dt.quarter
orders_df['Day_of_Year'] = orders_df['Order_Date'].dt.dayofyear
orders_df['Week_of_Year'] = orders_df['Order_Date'].dt.isocalendar().week



✅ Procurement Features: (118, 59)
✅ Orders Data: (30871, 26)
✅ ML Features: (118, 23)

📋 Preparing modeling datasets...


In [9]:
# Prepare supplier performance dataset
supplier_features = [
    'Order Quantity_mean', 'Order Quantity_std', 'Gross Sales_sum',
    'Current_Inventory', 'Stockout_Frequency', 'Demand_Variability',
    'Warehouse_Fulfillment_Days', 'Procurement_Priority_Score'
]

# Target: Delivery Reliability Score
supplier_data = procurement_df[supplier_features + ['Delivery_Reliability']].copy()
supplier_data = supplier_data.dropna()

print(f"📊 Supplier dataset shape: {supplier_data.shape}")

# Split features and target
X_supplier = supplier_data[supplier_features]
y_supplier_reliability = supplier_data['Delivery_Reliability']

# Create supplier classification target (Good/Average/Poor)
y_supplier_class = pd.cut(y_supplier_reliability, 
                         bins=[0, 0.7, 0.9, 1.0], 
                         labels=['Poor', 'Average', 'Good'])

# Train-test split
X_sup_train, X_sup_test, y_sup_rel_train, y_sup_rel_test = train_test_split(
    X_supplier, y_supplier_reliability, test_size=0.2, random_state=42
)

X_sup_cls_train, X_sup_cls_test, y_sup_cls_train, y_sup_cls_test = train_test_split(
    X_supplier, y_supplier_class, test_size=0.2, random_state=42
)

# Scale features
scaler_supplier = StandardScaler()
X_sup_train_scaled = scaler_supplier.fit_transform(X_sup_train)
X_sup_test_scaled = scaler_supplier.transform(X_sup_test)

print("🔧 Model Selection for Supplier Performance Prediction...")

# Test multiple regression models for reliability prediction
supplier_models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'SVR': SVR(kernel='rbf')
}

supplier_results = {}

for name, model in supplier_models.items():
    if name in ['Linear Regression', 'Ridge Regression', 'SVR']:
        model.fit(X_sup_train_scaled, y_sup_rel_train)
        y_pred = model.predict(X_sup_test_scaled)
    else:
        model.fit(X_sup_train, y_sup_rel_train)
        y_pred = model.predict(X_sup_test)
    
    mse = mean_squared_error(y_sup_rel_test, y_pred)
    mae = mean_absolute_error(y_sup_rel_test, y_pred)
    r2 = r2_score(y_sup_rel_test, y_pred)
    
    supplier_results[name] = {
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'Model': model
    }
    
    print(f"   {name}: R² = {r2:.4f}, MAE = {mae:.4f}")

# Select best supplier model
best_supplier_model_name = max(supplier_results.keys(), key=lambda x: supplier_results[x]['R2'])
best_supplier_model = supplier_results[best_supplier_model_name]['Model']

print(f"\n🏆 Best Supplier Performance Model: {best_supplier_model_name}")
print(f"   R² Score: {supplier_results[best_supplier_model_name]['R2']:.4f}")

# Train supplier classification model
print("\n🔍 Training Supplier Classification Model...")

supplier_classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42)
}

best_supplier_classifier = None
best_supplier_score = 0

for name, model in supplier_classifiers.items():
    if name in ['Logistic Regression', 'SVM']:
        model.fit(X_sup_train_scaled, y_sup_cls_train)
        score = model.score(X_sup_test_scaled, y_sup_cls_test)
    else:
        model.fit(X_sup_train, y_sup_cls_train)
        score = model.score(X_sup_test, y_sup_cls_test)
    
    print(f"   {name}: Accuracy = {score:.4f}")
    
    if score > best_supplier_score:
        best_supplier_score = score
        best_supplier_classifier = model

print(f"\n🏆 Best Supplier Classification Accuracy: {best_supplier_score:.4f}")


📊 Supplier dataset shape: (118, 9)
🔧 Model Selection for Supplier Performance Prediction...
   Random Forest: R² = 0.4775, MAE = 0.0771
   Gradient Boosting: R² = 0.2881, MAE = 0.0981
   XGBoost: R² = 0.3856, MAE = 0.0875
   Linear Regression: R² = 0.2096, MAE = 0.1178
   Ridge Regression: R² = 0.3251, MAE = 0.1063
   SVR: R² = 0.3116, MAE = 0.0929

🏆 Best Supplier Performance Model: Random Forest
   R² Score: 0.4775

🔍 Training Supplier Classification Model...


ValueError: Input contains NaN

In [8]:
print(f"📊 Supplier dataset shape: {supplier_data.shape}")

📊 Supplier dataset shape: (118, 9)
