In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -----------------------------
# 1️⃣ Generate Synthetic Dataset
# -----------------------------
num_samples = 1000
np.random.seed(42)  # fixed seed so the synthetic orders are reproducible

# The draws below consume the global NumPy random stream one after another,
# so their order determines the generated values; keep it unchanged.
distance_draws = np.random.uniform(low=1, high=15, size=num_samples)
time_draws = np.random.choice(["Morning", "Afternoon", "Evening", "Late Night"], size=num_samples)
demand_draws = np.random.choice(["Low", "Medium", "High"], size=num_samples)
weather_draws = np.random.choice(["Clear", "Rainy", "Snowy"], size=num_samples)
traffic_draws = np.random.randint(low=1, high=4, size=num_samples)  # 1: Low, 2: Medium, 3: High

# One row per synthetic order, one column per feature.
df_ml = pd.DataFrame(
    {
        "Distance_km": distance_draws,
        "Time_of_Day": time_draws,
        "Order_Demand": demand_draws,
        "Weather_Condition": weather_draws,
        "Traffic_Congestion": traffic_draws,
    }
)











In [None]:
# -----------------------------
# 2️⃣ Base Delivery Fee Calculation
# -----------------------------
def calculate_base_price(distance_km):
    """Return the tiered base delivery fee for a trip of `distance_km` kilometres.

    Pricing tiers:
      * up to 2 km  -> flat fee of 3.0
      * 2-5 km      -> flat fee plus 0.75 per km beyond 2
      * 5-10 km     -> fee at 5 km plus 1.25 per km beyond 5
      * over 10 km  -> fee at 10 km plus 1.75 per km beyond 10
    """
    flat_fee = 3.0
    if distance_km <= 2:
        return flat_fee
    if distance_km <= 5:
        return flat_fee + 0.75 * (distance_km - 2)

    # Fee already accumulated once the 2-5 km tier is fully used.
    fee_at_5_km = flat_fee + (0.75 * 3)
    if distance_km <= 10:
        return fee_at_5_km + 1.25 * (distance_km - 5)

    # Fee already accumulated once the 5-10 km tier is fully used.
    fee_at_10_km = fee_at_5_km + (1.25 * 5)
    return fee_at_10_km + 1.75 * (distance_km - 10)

# Tiered base fee for every order, derived from its distance alone.
base_fees = df_ml["Distance_km"].apply(calculate_base_price)
df_ml["Base_Delivery_Fee"] = base_fees

# Adding price variations: one uniform surcharge per order, drawn from [0.5, 4.0)
price_variation = np.random.uniform(low=0.5, high=4.0, size=num_samples)
df_ml["Final_Price"] = df_ml["Base_Delivery_Fee"] + price_variation

In [None]:
# -----------------------------
# 3️⃣ Data Preprocessing
# -----------------------------
categorical_cols = ["Time_of_Day", "Order_Demand", "Weather_Condition"]

X = df_ml[["Distance_km", "Time_of_Day", "Order_Demand", "Weather_Condition", "Traffic_Congestion"]]
y = df_ml["Final_Price"]

# Train-test split comes FIRST so that the encoder and scaler below are fitted
# on training rows only; fitting them on the full dataset would leak test-set
# statistics into the preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encoding (fitted on the training split only)
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoder.fit(X_train_raw[categorical_cols])
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)


def _encode_features(raw_features):
    """Return the numeric columns of `raw_features` followed by its one-hot columns.

    Uses the already-fitted `encoder`; the result has a fresh 0..n-1 index so the
    numeric and encoded parts line up row by row.
    """
    encoded_part = pd.DataFrame(
        encoder.transform(raw_features[categorical_cols]),
        columns=encoded_feature_names,
    )
    numeric_part = raw_features.drop(columns=categorical_cols).reset_index(drop=True)
    return pd.concat([numeric_part, encoded_part], axis=1)


# Combine numeric and categorical features
X_train_final = _encode_features(X_train_raw)
X_test_final = _encode_features(X_test_raw)
X_final = _encode_features(X)  # full encoded frame; later cells read its column names

# Feature Scaling (statistics learned from the training split, then applied everywhere)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_final)
X_test = scaler.transform(X_test_final)
X_scaled = scaler.transform(X_final)

In [None]:
# -----------------------------
# 4️⃣ Hyperparameter Tuning with GridSearchCV
# -----------------------------
# 3 x 3 x 3 x 3 = 81 candidate settings, each scored with 5-fold cross-validation
# on the training split using (negated) mean absolute error.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# GridSearchCV refits by default (refit=True): after the search it has already
# trained a clone of `rf` (same random_state) with `best_params` on all of
# X_train. Reuse that estimator instead of fitting an identical forest again.
best_model = grid_search.best_estimator_

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
# -----------------------------
# 5️⃣ Model Evaluation
# -----------------------------
# Score the tuned forest on the held-out test split.
y_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)  # RMSE is in the same units as the delivery price
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

metrics_line = f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}"
print("Optimized Model Performance:\n" + metrics_line)


Optimized Model Performance:
MAE: 0.89, RMSE: 1.05, R²: 0.96


In [None]:
# -----------------------------
# 6️⃣ User Input Prediction Function
# -----------------------------
def _match_category(raw_value, valid_values, field_name):
    """Return the entry of `valid_values` equal to `raw_value`, ignoring case and outer whitespace.

    Raises:
        ValueError: if `raw_value` matches none of `valid_values`.
    """
    lookup = {str(value).lower(): value for value in valid_values}
    key = str(raw_value).strip().lower()
    if key not in lookup:
        options = "/".join(str(value) for value in valid_values)
        raise ValueError(f"Invalid {field_name} {raw_value!r}; expected one of: {options}")
    return lookup[key]


def predict_delivery_fee(distance_km=None, time_of_day=None, order_demand=None,
                         weather_condition=None, traffic_congestion=None):
    """Predict and print the final delivery price for one order.

    Every argument is optional. Any argument left as None is requested
    interactively with `input()`, so calling the function with no arguments
    prompts for all five values.

    Relies on the notebook-level `encoder`, `encoded_feature_names`, `scaler`
    and `best_model` objects created by the preprocessing and tuning cells.

    Args:
        distance_km: Trip distance in kilometres (anything `float()` accepts).
        time_of_day: Morning/Afternoon/Evening/Late Night (case-insensitive).
        order_demand: Low/Medium/High (case-insensitive).
        weather_condition: Clear/Rainy/Snowy (case-insensitive).
        traffic_congestion: 1 (Low), 2 (Medium) or 3 (High); anything `int()` accepts.

    Raises:
        ValueError: if a numeric value cannot be parsed or a categorical value
            is not one of the categories the encoder was fitted on.
    """
    # `categories_` holds the fitted categories per column, in the column order
    # the encoder was fitted with: Time_of_Day, Order_Demand, Weather_Condition.
    time_options, demand_options, weather_options = encoder.categories_

    if distance_km is None:
        distance_km = input("Enter Distance (km): ")
    distance_km = float(distance_km)

    if time_of_day is None:
        time_of_day = input("Enter Time of Day (Morning/Afternoon/Evening/Late Night): ")
    time_of_day = _match_category(time_of_day, time_options, "Time of Day")

    if order_demand is None:
        order_demand = input("Enter Order Demand (Low/Medium/High): ")
    order_demand = _match_category(order_demand, demand_options, "Order Demand")

    if weather_condition is None:
        weather_condition = input("Enter Weather Condition (Clear/Rainy/Snowy): ")
    weather_condition = _match_category(weather_condition, weather_options, "Weather Condition")

    if traffic_congestion is None:
        traffic_congestion = input("Enter Traffic Congestion Level (1: Low, 2: Medium, 3: High): ")
    traffic_congestion = int(traffic_congestion)

    # Single-row frame with the same raw column layout used for training.
    user_data = pd.DataFrame({
        "Distance_km": [distance_km],
        "Time_of_Day": [time_of_day],
        "Order_Demand": [order_demand],
        "Weather_Condition": [weather_condition],
        "Traffic_Congestion": [traffic_congestion]
    })

    categorical_cols = ["Time_of_Day", "Order_Demand", "Weather_Condition"]

    # One-hot encode input with the already-fitted encoder
    user_encoded = encoder.transform(user_data[categorical_cols])
    user_encoded_df = pd.DataFrame(user_encoded, columns=encoded_feature_names)

    # Numeric columns first, then one-hot columns: the order the scaler and model were fitted on.
    user_numeric = user_data.drop(columns=categorical_cols).reset_index(drop=True)
    user_final = pd.concat([user_numeric, user_encoded_df], axis=1)

    user_final_scaled = scaler.transform(user_final)

    predicted_price = best_model.predict(user_final_scaled)[0]

    print(f"\nPredicted Final Delivery Price: ${predicted_price:.2f}")

predict_delivery_fee()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -----------------------------
# 🎯 1. Feature Importance Visualization
# -----------------------------
# Importances are reported in the model's input-column order, which is the
# column order of X_final.
feature_names = X_final.columns
feature_importance = best_model.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importance, y=feature_names, palette="viridis", ax=ax)
ax.set_xlabel("Feature Importance Score")
ax.set_ylabel("Features")
ax.set_title("Feature Importance in Delivery Fee Prediction")
plt.show()






In [None]:
# -----------------------------
# 📊 2. Predicted vs. Actual Plot
# -----------------------------
y_pred = best_model.predict(X_test)

# End points of the diagonal on which prediction == actual.
lowest_actual = min(y_test)
highest_actual = max(y_test)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax)
ax.plot(
    [lowest_actual, highest_actual],
    [lowest_actual, highest_actual],
    color="red",
    linestyle="--",
)  # Ideal line
ax.set_xlabel("Actual Delivery Fee")
ax.set_ylabel("Predicted Delivery Fee")
ax.set_title("Predicted vs. Actual Delivery Fee")
plt.show()

In [None]:
# -----------------------------
# 🔍 3. Residuals Plot
# -----------------------------
# Positive residual = the model under-predicted the actual fee.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, bins=30, kde=True, color="blue", ax=ax)
ax.axvline(0, color="red", linestyle="--")  # zero-error reference
ax.set_xlabel("Prediction Error (Residuals)")
ax.set_ylabel("Frequency")
ax.set_title("Residuals Distribution")
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve of the tuned model on the training split (5-fold CV).
train_sizes, train_scores, test_scores = learning_curve(
    best_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_squared_error",
)

# The scorer returns negated MSE; flip the sign and average along axis 1
# to get one MSE value per training-set size.
train_mean = -train_scores.mean(axis=1)
test_mean = -test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(8, 6))
for curve_values, curve_label in ((train_mean, "Training Error"), (test_mean, "Validation Error")):
    ax.plot(train_sizes, curve_values, label=curve_label, marker="o")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("MSE")
ax.set_title("Learning Curve")
ax.legend()
plt.show()