In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load the raw logistics dataset.
# The location can be overridden with the SUPPLY_CHAIN_CSV environment variable
# so the notebook also runs on machines other than the one it was written on;
# when the variable is unset, the original hard-coded location is used.
import os

DATA_PATH = os.environ.get(
    "SUPPLY_CHAIN_CSV",
    "/Users/deepthiramesh/Documents/Extra Projects/Supply chain delay analysis/Dataset/dynamic_supply_chain_logistics_dataset_with_country.csv",
)
df = pd.read_csv(DATA_PATH)

In [5]:
# Step 3: Data Cleaning & Parsing with synthetic delivery dates
# Scheduled dates are synthetic: a fixed window of consecutive days is repeated
# as often as needed and truncated so that every row receives one date.
SYNTHETIC_START_DATE = '2011-01-01'
SYNTHETIC_WINDOW_DAYS = 1825  # number of consecutive days before the window repeats

num_rows = len(df)
base_dates = pd.date_range(start=SYNTHETIC_START_DATE, periods=SYNTHETIC_WINDOW_DAYS, freq='D')
# Enough whole repetitions of the window to cover every row; the slice trims the excess.
repeated_dates = np.tile(base_dates, int(np.ceil(num_rows / len(base_dates))))[:num_rows]

df['Scheduled Delivery Date'] = repeated_dates
# delivery_time_deviation is applied as a day offset; missing values count as 0 days.
df['Delivered to Client Date'] = df['Scheduled Delivery Date'] + pd.to_timedelta(df['delivery_time_deviation'].fillna(0), unit='D')
# .dt.days keeps only the whole-day component of the difference.
df['delay_days'] = (df['Delivered to Client Date'] - df['Scheduled Delivery Date']).dt.days
# Take an explicit copy of the filtered rows so that column assignments made in
# later cells write to an independent frame instead of a flagged slice
# (which would raise SettingWithCopyWarning).
df = df[df['delay_days'].notnull()].copy()

In [6]:
# Step 4: Delay Analysis
# Add a route label per row, plus the hour and weekday name taken from the
# delivery timestamp.
route_suffix = " → Destination"
delivered_at = df['Delivered to Client Date']

df['Route'] = df['supplier_country'] + route_suffix
df['Delivery Hour'] = delivered_at.dt.hour
df['Delivery Day'] = delivered_at.dt.day_name()

In [7]:
# Step 5: Delay Summary
# Mean, standard deviation and row count of delay_days for each Route,
# returned as a flat table with Route as a regular column.
delay_by_route = df.groupby('Route')['delay_days']
delay_summary = delay_by_route.agg(['mean', 'std', 'count']).reset_index()

# Mean delay_days for each supplier_id, ordered from largest to smallest.
carrier_delay = (
    df.groupby('supplier_id')['delay_days']
    .mean()
    .sort_values(ascending=False)
)

In [9]:
# Step 6: Modeling (Random Forest)
# Fit a random-forest regressor that predicts delay_days from the risk class,
# supplier, product and historical demand, then report R² and RMSE on a
# held-out 20% of the rows.
SEED = 42  # shared seed so the split and the forest are both reproducible

# Rows with a missing value in any modelling column are dropped; the explicit
# copy makes the in-place encoding below operate on an independent frame.
model_df = df[['delay_days', 'risk_classification', 'supplier_id', 'product_id', 'historical_demand']].dropna().copy()

# One encoder per categorical column, kept in `encoders` so every integer code
# can be mapped back to its original label. `le` is rebound on each pass and
# therefore ends up referring to the encoder of the last column.
encoders = {}
for col in ['risk_classification', 'supplier_id', 'product_id']:
    le = LabelEncoder()
    model_df[col] = le.fit_transform(model_df[col])
    encoders[col] = le

X = model_df.drop('delay_days', axis=1)
y = model_df['delay_days']
# Seed the split as well: without it the reported metrics change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

rf = RandomForestRegressor(n_estimators=30, random_state=SEED)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("R² Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R² Score: 0.8662386852346627
RMSE: 1.4885299086441581
