In [24]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [25]:
# 1-load_inspect.py
import pandas as pd
import numpy as np

# Location of the raw CSV export (absolute path on the author's machine).
# NOTE(review): consider a path relative to the notebook so others can re-run it.
path = r"D:\Amazon_Delivery_Time_Prediction\amazon_delivery.csv"

# Read the raw file and report its dimensions and column names.
df = pd.read_csv(path)
n_rows_cols = df.shape
column_names = df.columns.tolist()
print("shape:", n_rows_cols)
print("columns:", column_names)

# Preview the first 8 rows (display() comes from the notebook; in a script use print(df.head())).
display(df.head(8))

# Null count per column, largest first; show at most the top 20 columns.
missing_per_column = df.isnull().sum().sort_values(ascending=False)
print("\nMissing values per column:\n", missing_per_column.head(20))


shape: (43739, 16)
columns: ['Order_ID', 'Agent_Age', 'Agent_Rating', 'Store_Latitude', 'Store_Longitude', 'Drop_Latitude', 'Drop_Longitude', 'Order_Date', 'Order_Time', 'Pickup_Time', 'Weather', 'Traffic', 'Vehicle', 'Area', 'Delivery_Time', 'Category']


Unnamed: 0,Order_ID,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Date,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,ialx566343618,37,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,akqg208421122,34,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,njpu434582536,23,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,rjto796129700,38,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,zguw716275638,32,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys
5,fxuu788413734,22,4.8,17.431668,78.408321,17.461668,78.438321,2022-03-11,21:20:00,21:30:00,Cloudy,Jam,motorcycle,Urban,130,Toys
6,njmo150975311,33,4.7,23.369746,85.33982,23.479746,85.44982,2022-03-04,19:15:00,19:30:00,Fog,Jam,scooter,Metropolitian,200,Toys
7,jvjc772545076,35,4.6,12.352058,76.60665,12.482058,76.73665,2022-03-14,17:25:00,17:30:00,Cloudy,Medium,motorcycle,Metropolitian,160,Snacks



Missing values per column:
 Weather            91
Agent_Rating       54
Agent_Age           0
Order_ID            0
Store_Longitude     0
Drop_Latitude       0
Drop_Longitude      0
Store_Latitude      0
Order_Date          0
Order_Time          0
Pickup_Time         0
Traffic             0
Vehicle             0
Area                0
Delivery_Time       0
Category            0
dtype: int64


In [26]:
# 2-clean_feature_engineer.py
import pandas as pd
import numpy as np

# Re-read the raw CSV so `df` is rebound to a fresh, unmodified frame
# before the cleaning steps that follow.
df = pd.read_csv(filepath_or_buffer=r"D:\Amazon_Delivery_Time_Prediction\amazon_delivery.csv")

In [27]:
# 1) Strip surrounding whitespace from text columns (removes trailing spaces like 'High ').
# Missing cells are kept as NaN: calling astype(str) on the whole column would
# turn a real NaN into the literal text 'nan', which isnull()/dropna() no
# longer recognise as missing.
for c in df.select_dtypes(include=['object']).columns:
    stripped = df[c].astype(str).str.strip()
    # where(): keep the original (NaN) value where it is missing, use the stripped text elsewhere.
    df[c] = df[c].where(df[c].isna(), stripped)


In [28]:
# Coerce Delivery_Time to a numeric dtype (the values look like minutes);
# entries that cannot be parsed become NaN instead of raising.
delivery_time_numeric = pd.to_numeric(df['Delivery_Time'], errors='coerce')
df['Delivery_Time'] = delivery_time_numeric

In [29]:
# Combine the order date with each clock time and parse the text as a timestamp.
# errors='coerce' turns anything unparseable into NaT instead of raising.
order_date_text = df['Order_Date'].astype(str)
order_stamp_text = order_date_text + ' ' + df['Order_Time'].astype(str)
pickup_stamp_text = order_date_text + ' ' + df['Pickup_Time'].astype(str)
df['order_datetime'] = pd.to_datetime(order_stamp_text, errors='coerce')
df['pickup_datetime'] = pd.to_datetime(pickup_stamp_text, errors='coerce')

In [30]:
# A pickup stamped earlier than its order is treated as a next-day pickup:
# move those pickups forward by one day. Rows missing either timestamp are left alone.
both_present = df['pickup_datetime'].notnull() & df['order_datetime'].notnull()
mask = (df['pickup_datetime'] < df['order_datetime']) & both_present
df.loc[mask, 'pickup_datetime'] = df.loc[mask, 'pickup_datetime'] + pd.Timedelta(days=1)




In [31]:
# Make sure both timestamp columns are datetime-typed.
for stamp_col in ('order_datetime', 'pickup_datetime'):
    df[stamp_col] = pd.to_datetime(df[stamp_col])

# Roll pickups that precede their order onto the following day.
# Once shifted, a pickup is later than its order, so running this again changes nothing.
mask = df['pickup_datetime'] < df['order_datetime']
df.loc[mask, 'pickup_datetime'] = df.loc[mask, 'pickup_datetime'] + pd.Timedelta(days=1)

# Minutes elapsed between placing the order and the pickup.
elapsed = df['pickup_datetime'] - df['order_datetime']
df['pickup_delay_mins'] = elapsed.dt.total_seconds() / 60.0

In [32]:
# Haversine distance (km) between store and drop coordinates
def haversine_km(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points given in decimal degrees.

    Vectorized with NumPy: works element-wise on scalars, arrays and pandas Series.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : latitude / longitude of the second point(s), in degrees.
    radius_km : radius of the sphere the distance is measured on.
        Defaults to 6371, the Earth radius in km.

    Returns
    -------
    Distance(s) along the sphere, in the unit of ``radius_km``.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    # Floating-point rounding can push `a` a hair outside [0, 1] (e.g. for
    # near-antipodal points); arcsin/sqrt would then return NaN, so clamp first.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arcsin(np.sqrt(a))  # central angle in radians
    return radius_km * c

# Store -> drop-off haversine distance for every order, in km.
# NOTE(review): the summary printed after cleaning shows a maximum of ~6,885 km
# against a 75th percentile of ~13.7 km, so some coordinates look wrong
# (possibly sign errors) — confirm and clean before trusting this feature.
store_lat, store_lon = df['Store_Latitude'], df['Store_Longitude']
drop_lat, drop_lon = df['Drop_Latitude'], df['Drop_Longitude']
df['distance_km'] = haversine_km(store_lat, store_lon, drop_lat, drop_lon)

In [33]:
# time features
df['order_hour'] = df['order_datetime'].dt.hour
df['order_dayofweek'] = df['order_datetime'].dt.dayofweek

# Impute Agent_Rating with the column median (only a small minority is missing).
# The filled Series is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# operates on an intermediate Series and, under pandas Copy-on-Write, leaves
# `df` unchanged — a failure that is invisible here because warnings are suppressed.
# NOTE(review): the median is taken over the full dataset, i.e. before the
# train/test split; move the imputation into the model pipeline if that matters.
df['Agent_Rating'] = pd.to_numeric(df['Agent_Rating'], errors='coerce')
df['Agent_Rating'] = df['Agent_Rating'].fillna(df['Agent_Rating'].median())

In [34]:
# Drop the small number of rows missing key time / traffic / weather info
# (you can impute instead if needed).
# Text columns can carry a missing value as the literal text 'nan' / 'NaN'
# (e.g. after an astype(str) round-trip) or as an empty string; dropna() does
# not see those, so rows holding such placeholders are removed explicitly.
required_cols = ['order_datetime', 'Order_Time', 'Traffic', 'Weather']
text_cols = ['Order_Time', 'Traffic', 'Weather']
placeholder_rows = df[text_cols].isin(['nan', 'NaN', '']).any(axis=1)
# .copy() gives later cells an independent frame to add columns to.
df = df.loc[~placeholder_rows].dropna(subset=required_cols).copy()


In [35]:
# Final sanity check: frame size plus summary statistics of the key numeric columns
# (one row per column thanks to the transpose).
check_cols = ['Delivery_Time', 'distance_km', 'pickup_delay_mins', 'order_hour']
print("After cleaning, shape:", df.shape)
print(df[check_cols].describe().T)

After cleaning, shape: (43648, 22)
                     count        mean         std        min        25%  \
Delivery_Time      43648.0  124.914475   51.933163  10.000000  90.000000   
distance_km        43648.0   27.255432  303.815765   1.465067   4.663432   
pickup_delay_mins  43648.0    9.991294    4.086680   5.000000   5.000000   
order_hour         43648.0   17.425976    4.818494   0.000000  15.000000   

                          50%         75%          max  
Delivery_Time      125.000000  160.000000   270.000000  
distance_km          9.220419   13.682165  6884.726399  
pickup_delay_mins   10.000000   15.000000    15.000000  
order_hour          19.000000   21.000000    23.000000  


In [36]:
# 3-prepare_train.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pickle

# Model inputs, split by how they are preprocessed further down.
num_feats = ['distance_km', 'Agent_Age', 'Agent_Rating', 'pickup_delay_mins', 'order_hour', 'order_dayofweek']
cat_feats = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']

# Keep only rows that are complete in every feature and in the target.
model_cols = num_feats + cat_feats + ['Delivery_Time']
print("Original shape:", df.shape)
df_model = df.dropna(subset=model_cols).copy()
n_dropped = df.shape[0] - df_model.shape[0]
print(f"After dropping NaNs: {df_model.shape}")
print(f"Dropped {n_dropped} rows")

# Feature matrix and target (delivery time, in minutes).
X = df_model[num_feats + cat_feats]
y = df_model['Delivery_Time']

print(f"\nTarget variable (Delivery_Time) stats:")
print(y.describe())

# Hold out a shuffled 20% of the rows for testing; the fixed random_state
# makes the split repeatable.
split_options = dict(test_size=0.20, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"\nTrain set size: {n_train} ({(1-0.20)*100:.0f}%)")
print(f"Test set size: {n_test} ({0.20*100:.0f}%)")

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (dense output, unknown categories ignored at transform time).
# Columns not named in either list are dropped.
numeric_step = ('num', StandardScaler(), num_feats)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_feats)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='drop',
)

# Fit on the training rows only, then apply the fitted transform to the test rows.
print("\nFitting preprocessor on training data...")
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

for split_label, matrix in (("training", X_train_transformed), ("test", X_test_transformed)):
    print(f"Transformed {split_label} shape: {matrix.shape}")

# Column names of the transformed matrix: the numeric features first, then the
# one-hot columns, matching the order of the transformers in the preprocessor.
try:
    encoder = preprocessor.named_transformers_['cat']
    ohe_names = encoder.get_feature_names_out(cat_feats).tolist()
    feature_names = num_feats + ohe_names
    n_numeric = len(num_feats)
    print(f"\nTotal features after encoding: {len(feature_names)}")
    print(f"- Numeric features: {n_numeric}")
    print(f"- Categorical features (after OHE): {len(feature_names) - n_numeric}")
except Exception as e:
    # Purely informational step: report the problem and carry on.
    print(f"Could not get feature names: {e}")

# Save the fitted preprocessor and the transformed train/test splits.
# The training and tuning cells read these files back from disk, so a failed
# write must not be swallowed: those cells would then silently load stale
# artifacts left over from an earlier run. Report the problem, then re-raise.
try:
    with open('preprocessor.pkl', 'wb') as f:
        pickle.dump(preprocessor, f)
    print("\nPreprocessor saved to 'preprocessor.pkl'")
except Exception as e:
    print(f"Could not save preprocessor: {e}")
    raise

# Transformed feature matrices plus the raw target values as .npy arrays.
try:
    np.save('X_train.npy', X_train_transformed)
    np.save('X_test.npy', X_test_transformed)
    np.save('y_train.npy', y_train.values)
    np.save('y_test.npy', y_test.values)
    print("Train/test data saved to .npy files")
except Exception as e:
    print(f"Could not save data: {e}")
    raise

print("\n✅ Data preparation complete!")

Original shape: (43648, 22)
After dropping NaNs: (43648, 22)
Dropped 0 rows

Target variable (Delivery_Time) stats:
count    43648.000000
mean       124.914475
std         51.933163
min         10.000000
25%         90.000000
50%        125.000000
75%        160.000000
max        270.000000
Name: Delivery_Time, dtype: float64

Train set size: 34918 (80%)
Test set size: 8730 (20%)

Fitting preprocessor on training data...
Transformed training shape: (34918, 39)
Transformed test shape: (8730, 39)

Total features after encoding: 39
- Numeric features: 6
- Categorical features (after OHE): 33

Preprocessor saved to 'preprocessor.pkl'
Train/test data saved to .npy files

✅ Data preparation complete!


In [39]:
# 4-train_and_eval.py
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

# Load the transformed splits from the .npy files in the working directory.
# After this, y_train / y_test are plain NumPy arrays.
print("Loading preprocessed data...")
X_train_transformed, X_test_transformed, y_train, y_test = (
    np.load(filename)
    for filename in ('X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy')
)

print(f"Train shape: {X_train_transformed.shape}")
print(f"Test shape: {X_test_transformed.shape}")

def evaluate(model, X_test, y_test, model_name="Model"):
    """Score a fitted model on held-out data and print the metrics.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: feature matrix passed to ``model.predict``.
        y_test: true target values (delivery time in minutes).
        model_name: label printed above the metrics.

    Returns:
        Tuple ``(mae, rmse, r2, y_pred)``.
    """
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is derived by hand as the square root of the MSE.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    report_lines = (
        f"\n{model_name}:",
        f"  MAE:  {mae:.2f} mins",
        f"  RMSE: {rmse:.2f} mins",
        f"  R²:   {r2:.4f}",
    )
    for line in report_lines:
        print(line)

    return mae, rmse, r2, y_pred

# Per-model metrics and predictions, keyed by display name; filled in by the training cells.
results = dict()



Loading preprocessed data...
Train shape: (34918, 39)
Test shape: (8730, 39)


In [40]:

# 1. Linear Regression (default settings): fit on the training split, score on the test split.
print("\n[1/3] Training Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train_transformed, y_train)
lr_mae, lr_rmse, lr_r2, lr_pred = evaluate(
    lr_model, X_test_transformed, y_test, model_name="LinearRegression"
)
results['Linear Regression'] = dict(
    zip(('MAE', 'RMSE', 'R²', 'predictions'), (lr_mae, lr_rmse, lr_r2, lr_pred))
)


[1/3] Training Linear Regression...

LinearRegression:
  MAE:  26.40 mins
  RMSE: 33.48 mins
  R²:   0.5873


In [41]:
# 2. Random Forest: 100 trees capped at depth 15, fixed seed, all CPU cores.
print("\n[2/3] Training Random Forest...")
rf_settings = dict(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1, verbose=0)
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train_transformed, y_train)
rf_mae, rf_rmse, rf_r2, rf_pred = evaluate(
    rf_model, X_test_transformed, y_test, model_name="RandomForest"
)
results['Random Forest'] = dict(
    zip(('MAE', 'RMSE', 'R²', 'predictions'), (rf_mae, rf_rmse, rf_r2, rf_pred))
)


[2/3] Training Random Forest...

RandomForest:
  MAE:  17.04 mins
  RMSE: 22.14 mins
  R²:   0.8194


In [42]:

# 3. Gradient Boosting: 200 boosting stages of depth-6 trees, learning rate 0.1, fixed seed.
print("\n[3/3] Training Gradient Boosting...")
gb_settings = dict(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42, verbose=0)
gb_model = GradientBoostingRegressor(**gb_settings)
gb_model.fit(X_train_transformed, y_train)
gb_mae, gb_rmse, gb_r2, gb_pred = evaluate(
    gb_model, X_test_transformed, y_test, model_name="GradientBoosting"
)
results['Gradient Boosting'] = dict(
    zip(('MAE', 'RMSE', 'R²', 'predictions'), (gb_mae, gb_rmse, gb_r2, gb_pred))
)


[3/3] Training Gradient Boosting...

GradientBoosting:
  MAE:  17.38 mins
  RMSE: 22.35 mins
  R²:   0.8160


In [44]:

# Model comparison summary: one row per trained model, best (lowest) MAE first.
banner = "=" * 60
print("\n" + banner)
print("MODEL COMPARISON SUMMARY")
print(banner)

model_names = list(results)
comparison_df = pd.DataFrame({
    'Model': model_names,
    'MAE_mins': [results[name]['MAE'] for name in model_names],
    'RMSE_mins': [results[name]['RMSE'] for name in model_names],
    'R2_Score': [results[name]['R²'] for name in model_names],
}).sort_values('MAE_mins')

# Friendlier column headers for display.
comparison_df = comparison_df.rename(columns={
    'MAE_mins': 'MAE (mins)',
    'RMSE_mins': 'RMSE (mins)',
    'R2_Score': 'R² Score',
})
print("\n", comparison_df.to_string(index=False))



MODEL COMPARISON SUMMARY

             Model  MAE (mins)  RMSE (mins)  R² Score
    Random Forest   17.040492    22.144296  0.819417
Gradient Boosting   17.381916    22.353542  0.815988
Linear Regression   26.396671    33.476125  0.587311


In [46]:
# The comparison table is sorted by MAE, so its first row is the best model.
best_model_name = comparison_df['Model'].iloc[0]
print(f"\n🏆 Best Model: {best_model_name} (Lowest MAE)")

# Side-by-side predictions for the first 10 test rows.
banner = "=" * 60
print("\n" + banner)
print("SAMPLE PREDICTIONS (First 10 Test Cases)")
print(banner)

sample_df = pd.DataFrame({
    'Actual (mins)': y_test[:10],
    'LR Pred': lr_pred[:10],
    'RF Pred': rf_pred[:10],
    'GB Pred': gb_pred[:10]
})

# Absolute error of the two tree ensembles against the actual value.
for label in ('RF', 'GB'):
    sample_df[f'{label} Error'] = (sample_df['Actual (mins)'] - sample_df[f'{label} Pred']).abs()

print("\n", sample_df.round(2).to_string(index=True))


🏆 Best Model: Random Forest (Lowest MAE)

SAMPLE PREDICTIONS (First 10 Test Cases)

    Actual (mins)  LR Pred  RF Pred  GB Pred  RF Error  GB Error
0            170   174.99   166.51   173.03      3.49      3.03
1            180   148.95   187.53   188.46      7.53      8.46
2            100   132.64   129.76   128.88     29.76     28.88
3            170   157.05   172.58   173.34      2.58      3.34
4             75   112.91    76.61    76.54      1.61      1.54
5            115   110.24   108.31   101.31      6.69     13.69
6             80   123.77   103.36   102.18     23.36     22.18
7             50    53.27    65.75    68.16     15.75     18.16
8             80    94.66    98.75    93.78     18.75     13.78
9            100    97.06   121.82   112.03     21.82     12.03


In [47]:

# Pickle the three fitted models, then the best one again under its own file name.
banner = "=" * 60
print("\n" + banner)
print("SAVING MODELS")
print(banner)


def _save_pickle(obj, filename):
    """Pickle ``obj`` to ``filename`` in the working directory."""
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)


try:
    # Save all models
    _save_pickle(lr_model, 'lr_model.pkl')
    _save_pickle(rf_model, 'rf_model.pkl')
    _save_pickle(gb_model, 'gb_model.pkl')

    print("✅ Models saved successfully:")
    for saved_name in ('lr_model.pkl', 'rf_model.pkl', 'gb_model.pkl'):
        print(f"   - {saved_name}")

    # Map the winning display name to its estimator; any other name falls
    # back to the linear model.
    tree_models = {'Random Forest': rf_model, 'Gradient Boosting': gb_model}
    best_model = tree_models.get(best_model_name, lr_model)

    _save_pickle(best_model, 'best_model.pkl')
    print(f"   - best_model.pkl ({best_model_name})")

except Exception as e:
    # Best-effort save: report the failure instead of stopping the notebook.
    print(f"❌ Error saving models: {e}")


SAVING MODELS
✅ Models saved successfully:
   - lr_model.pkl
   - rf_model.pkl
   - gb_model.pkl
   - best_model.pkl (Random Forest)


In [48]:
# Feature importance (for tree-based models): ten largest Random Forest importances.
banner = "=" * 60
if hasattr(rf_model, 'feature_importances_'):
    print("\n" + banner)
    print("TOP 10 FEATURE IMPORTANCES (Random Forest)")
    print(banner)

    try:
        # The pickled preprocessor supplies the one-hot column names.
        with open('preprocessor.pkl', 'rb') as f:
            preprocessor = pickle.load(f)

        num_feats = ['distance_km', 'Agent_Age', 'Agent_Rating', 'pickup_delay_mins', 'order_hour', 'order_dayofweek']
        cat_feats = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']

        # Numeric names first, then the encoder's generated names.
        encoder = preprocessor.named_transformers_['cat']
        feature_names = num_feats + encoder.get_feature_names_out(cat_feats).tolist()

        all_importances = pd.DataFrame({
            'Feature': feature_names,
            'Importance': rf_model.feature_importances_
        })
        importance_df = all_importances.sort_values('Importance', ascending=False).head(10)

        print("\n", importance_df.to_string(index=False))

    except Exception as e:
        print(f"Could not display feature importances: {e}")

print("\n" + banner)
print("✅ TRAINING COMPLETE!")
print(banner)


TOP 10 FEATURE IMPORTANCES (Random Forest)

            Feature  Importance
  Category_Grocery    0.263181
      Agent_Rating    0.178205
       Traffic_Low    0.108092
       distance_km    0.106506
         Agent_Age    0.088069
     Weather_Sunny    0.061605
    Weather_Cloudy    0.045101
       Weather_Fog    0.044034
Vehicle_motorcycle    0.025630
    Traffic_Medium    0.011865

✅ TRAINING COMPLETE!


In [49]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import time

In [50]:

# Load the transformed splits from the .npy files in the working directory.
# Note: X_train / X_test now refer to the already-transformed NumPy matrices
# (the files were written from the preprocessor's output).
print("Loading preprocessed data...")
X_train, X_test, y_train, y_test = (
    np.load(filename)
    for filename in ('X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy')
)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

def evaluate_model(model, X_test, y_test):
    """Compute error metrics of a fitted model on held-out data.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: feature matrix passed to ``model.predict``.
        y_test: true target values.

    Returns:
        Dict with keys ``'MAE'``, ``'RMSE'`` and ``'R2'``.
    """
    y_pred = model.predict(X_test)
    scores = {}
    scores['MAE'] = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    scores['RMSE'] = np.sqrt(mean_squared_error(y_test, y_pred))
    scores['R2'] = r2_score(y_test, y_pred)
    return scores

Loading preprocessed data...
Train shape: (34918, 39)
Test shape: (8730, 39)


In [51]:
# RANDOM FOREST HYPERPARAMETER TUNING
# ============================================================
# Randomized search (20 sampled candidates, 3-fold CV) scored by negative MAE.
banner = "=" * 60
print("\n" + banner)
print("RANDOM FOREST HYPERPARAMETER TUNING")
print(banner)

# Candidate values the search samples from.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

print("\nParameter grid:")
for key in rf_param_grid:
    value = rf_param_grid[key]
    print(f"  {key}: {value}")

# NOTE(review): both the forest and the search request n_jobs=-1 — confirm
# this nested parallelism does not oversubscribe the machine.
rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)

print("\nRunning RandomizedSearchCV (20 iterations)...")
start_time = time.time()

rf_search_options = dict(
    n_iter=20,
    cv=3,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rf_random = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_param_grid,
    **rf_search_options,
)

rf_random.fit(X_train, y_train)
rf_time = time.time() - start_time

print(f"\nTuning completed in {rf_time:.2f} seconds")
print(f"Best parameters: {rf_random.best_params_}")
# best_score_ is a negated MAE, so flip the sign for display.
print(f"Best CV MAE: {-rf_random.best_score_:.2f} mins")

# Score the refit best estimator on the held-out test split.
rf_metrics = evaluate_model(rf_random.best_estimator_, X_test, y_test)
print(f"\nTest Set Performance:")
print(f"  MAE:  {rf_metrics['MAE']:.2f} mins")
print(f"  RMSE: {rf_metrics['RMSE']:.2f} mins")
print(f"  R²:   {rf_metrics['R2']:.4f}")



RANDOM FOREST HYPERPARAMETER TUNING

Parameter grid:
  n_estimators: [100, 200, 300]
  max_depth: [10, 15, 20, None]
  min_samples_split: [2, 5, 10]
  min_samples_leaf: [1, 2, 4]
  max_features: ['sqrt', 'log2']

Running RandomizedSearchCV (20 iterations)...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

Tuning completed in 28.51 seconds
Best parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}
Best CV MAE: 17.86 mins

Test Set Performance:
  MAE:  17.71 mins
  RMSE: 22.72 mins
  R²:   0.8098


In [52]:
# GRADIENT BOOSTING HYPERPARAMETER TUNING
# ============================================================
# Randomized search (20 sampled candidates, 3-fold CV) scored by negative MAE.
banner = "=" * 60
print("\n" + banner)
print("GRADIENT BOOSTING HYPERPARAMETER TUNING")
print(banner)

# Candidate values the search samples from.
gb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=[0.8, 0.9, 1.0],
)

print("\nParameter grid:")
for key in gb_param_grid:
    value = gb_param_grid[key]
    print(f"  {key}: {value}")

gb_base = GradientBoostingRegressor(random_state=42)

print("\nRunning RandomizedSearchCV (20 iterations)...")
start_time = time.time()

gb_search_options = dict(
    n_iter=20,
    cv=3,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
gb_random = RandomizedSearchCV(
    estimator=gb_base,
    param_distributions=gb_param_grid,
    **gb_search_options,
)

gb_random.fit(X_train, y_train)
gb_time = time.time() - start_time

print(f"\nTuning completed in {gb_time:.2f} seconds")
print(f"Best parameters: {gb_random.best_params_}")
# best_score_ is a negated MAE, so flip the sign for display.
print(f"Best CV MAE: {-gb_random.best_score_:.2f} mins")

# Score the refit best estimator on the held-out test split.
gb_metrics = evaluate_model(gb_random.best_estimator_, X_test, y_test)
print(f"\nTest Set Performance:")
print(f"  MAE:  {gb_metrics['MAE']:.2f} mins")
print(f"  RMSE: {gb_metrics['RMSE']:.2f} mins")
print(f"  R²:   {gb_metrics['R2']:.4f}")


GRADIENT BOOSTING HYPERPARAMETER TUNING

Parameter grid:
  n_estimators: [100, 200, 300]
  learning_rate: [0.01, 0.05, 0.1, 0.2]
  max_depth: [3, 5, 7]
  min_samples_split: [2, 5, 10]
  min_samples_leaf: [1, 2, 4]
  subsample: [0.8, 0.9, 1.0]

Running RandomizedSearchCV (20 iterations)...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

Tuning completed in 173.44 seconds
Best parameters: {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 7, 'learning_rate': 0.1}
Best CV MAE: 17.30 mins

Test Set Performance:
  MAE:  17.19 mins
  RMSE: 22.16 mins
  R²:   0.8191


In [53]:
# MODEL COMPARISON
# ============================================================
# Test-set metrics and search duration of the two tuned models, side by side.
banner = "=" * 60
print("\n" + banner)
print("TUNED MODELS COMPARISON")
print(banner)

# (display name, test metrics, seconds spent in the search) — Random Forest first.
tuned_runs = [
    ('Random Forest (Tuned)', rf_metrics, rf_time),
    ('Gradient Boosting (Tuned)', gb_metrics, gb_time),
]
comparison = pd.DataFrame({
    'Model': [label for label, _, _ in tuned_runs],
    'MAE (mins)': [scores['MAE'] for _, scores, _ in tuned_runs],
    'RMSE (mins)': [scores['RMSE'] for _, scores, _ in tuned_runs],
    'R² Score': [scores['R2'] for _, scores, _ in tuned_runs],
    'Training Time (s)': [seconds for _, _, seconds in tuned_runs],
})

print("\n", comparison.to_string(index=False))

# Select best model
# Rank the candidates by their cross-validated MAE, which is computed on the
# training folds only. Picking the winner by its test-set MAE would use the
# test data for model selection and make the reported test metrics optimistic.
# best_score_ is a negated MAE (scoring='neg_mean_absolute_error'), hence the sign flip.
# Row 0 of `comparison` is the Random Forest, row 1 the Gradient Boosting model.
cv_mae = pd.Series(
    [-rf_random.best_score_, -gb_random.best_score_],
    index=comparison.index,
)
best_idx = cv_mae.idxmin()
best_model_name = comparison.loc[best_idx, 'Model']
best_model = rf_random.best_estimator_ if best_idx == 0 else gb_random.best_estimator_

print(f"\n🏆 Best Model: {best_model_name}")



TUNED MODELS COMPARISON

                     Model  MAE (mins)  RMSE (mins)  R² Score  Training Time (s)
    Random Forest (Tuned)   17.712655    22.724018  0.809838          28.510487
Gradient Boosting (Tuned)   17.191519    22.161749  0.819132         173.444541

🏆 Best Model: Gradient Boosting (Tuned)


In [54]:
# SAVE TUNED MODELS
# ============================================================
# Pickle both tuned estimators, the selected best model and a metadata dict.
banner = "=" * 60
print("\n" + banner)
print("SAVING TUNED MODELS")
print(banner)

try:
    # Both tuned estimators, each under its own file name.
    with open('rf_tuned_model.pkl', 'wb') as rf_file:
        pickle.dump(rf_random.best_estimator_, rf_file)

    with open('gb_tuned_model.pkl', 'wb') as gb_file:
        pickle.dump(gb_random.best_estimator_, gb_file)

    # The selected best model.
    with open('final_model.pkl', 'wb') as final_file:
        pickle.dump(best_model, final_file)

    # Search results worth keeping next to the models: chosen parameters,
    # test metrics and how long each search took.
    metadata = dict(
        best_model=best_model_name,
        rf_params=rf_random.best_params_,
        gb_params=gb_random.best_params_,
        rf_metrics=rf_metrics,
        gb_metrics=gb_metrics,
        rf_training_time=rf_time,
        gb_training_time=gb_time,
    )

    with open('model_metadata.pkl', 'wb') as meta_file:
        pickle.dump(metadata, meta_file)

    print("✅ Models saved successfully:")
    for saved_name in ('rf_tuned_model.pkl', 'gb_tuned_model.pkl', 'final_model.pkl', 'model_metadata.pkl'):
        print(f"   - {saved_name}")

except Exception as e:
    # Best-effort save: report the failure instead of stopping the notebook.
    print(f"❌ Error saving models: {e}")


SAVING TUNED MODELS
✅ Models saved successfully:
   - rf_tuned_model.pkl
   - gb_tuned_model.pkl
   - final_model.pkl
   - model_metadata.pkl


In [55]:
# FEATURE IMPORTANCE
# ============================================================
# Fifteen largest feature importances of the selected best model.
banner = "=" * 60
print("\n" + banner)
print("TOP 15 FEATURE IMPORTANCES (Best Model)")
print(banner)

try:
    # The pickled preprocessor supplies the one-hot column names.
    with open('preprocessor.pkl', 'rb') as f:
        preprocessor = pickle.load(f)

    num_feats = ['distance_km', 'Agent_Age', 'Agent_Rating', 'pickup_delay_mins', 'order_hour', 'order_dayofweek']
    cat_feats = ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']

    # Numeric names first, then the encoder's generated names.
    encoder = preprocessor.named_transformers_['cat']
    feature_names = num_feats + encoder.get_feature_names_out(cat_feats).tolist()

    all_importances = pd.DataFrame({
        'Feature': feature_names,
        'Importance': best_model.feature_importances_
    })
    importance_df = all_importances.sort_values('Importance', ascending=False).head(15)

    print("\n", importance_df.to_string(index=False))

    # NOTE(review): only the 15 rows kept above are written to the CSV —
    # confirm that the truncated file is what is wanted.
    importance_df.to_csv('feature_importance.csv', index=False)
    print("\n✅ Feature importance saved to 'feature_importance.csv'")

except Exception as e:
    print(f"Could not display feature importances: {e}")

print("\n" + banner)
print("✅ HYPERPARAMETER TUNING COMPLETE!")
print(banner)


TOP 15 FEATURE IMPORTANCES (Best Model)

            Feature  Importance
  Category_Grocery    0.278530
      Agent_Rating    0.180999
       distance_km    0.108847
       Traffic_Low    0.105791
         Agent_Age    0.084338
     Weather_Sunny    0.061871
    Weather_Cloudy    0.046590
       Weather_Fog    0.038858
Vehicle_motorcycle    0.025819
       Traffic_Jam    0.024220
    Traffic_Medium    0.013744
Area_Metropolitian    0.006433
   Area_Semi-Urban    0.005179
        order_hour    0.002508
Weather_Sandstorms    0.002289

✅ Feature importance saved to 'feature_importance.csv'

✅ HYPERPARAMETER TUNING COMPLETE!
