<a href="https://colab.research.google.com/github/Aarupadaiyar/Surge-Price-Prediction--Research-Paper/blob/main/Surge_price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightgbm xgboost






In [None]:

import requests, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, classification_report,
    r2_score, mean_absolute_error, mean_squared_error
)
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor


def get_weather(lat, lon, city, timeout=30):
    """Download the hourly Open-Meteo forecast for one location.

    Args:
        lat: Latitude passed to the API as ``latitude``.
        lon: Longitude passed to the API as ``longitude``.
        city: Label written into the ``city`` column of every returned row.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without it a stalled connection would block the script forever.

    Returns:
        A DataFrame built from the ``hourly`` object of the JSON response
        (temperature, relative humidity and rain were requested), with an
        extra constant ``city`` column.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        KeyError: If the JSON body has no ``hourly`` entry.
    """
    url = "https://api.open-meteo.com/v1/forecast"
    params = {
        "latitude": lat,
        "longitude": lon,
        "hourly": "temperature_2m,relativehumidity_2m,rain",
        "timezone": "Asia/Kolkata"
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail here with the HTTP status instead of later with an opaque KeyError
    # when an error payload lacks the "hourly" section.
    response.raise_for_status()
    data = response.json()
    df = pd.DataFrame(data["hourly"])
    df["city"] = city
    return df


# City name -> (latitude, longitude) used for the forecast requests.
cities = dict(
    Bangalore=(12.9716, 77.5946),
    Chennai=(13.0827, 80.2707),
    Mumbai=(19.0760, 72.8777),
    Delhi=(28.7041, 77.1025),
    Hyderabad=(17.3850, 78.4867),
    Pune=(18.5204, 73.8567),
    Kolkata=(22.5726, 88.3639),
)

# One forecast frame per city, stacked into a single table with a fresh index.
weather_frames = [get_weather(*coords, name) for name, coords in cities.items()]
weather_df = pd.concat(weather_frames, ignore_index=True)

# Convert the "time" column to datetimes, then derive hour and month from it.
weather_df["time"] = pd.to_datetime(weather_df["time"])
weather_df = weather_df.assign(
    hour=weather_df["time"].dt.hour,
    month=weather_df["time"].dt.month,
)

def to_season(m):
    """Return the season label for month number ``m``.

    Months 12, 1, 2 map to "Winter", 3-5 to "Summer" and 6-9 to "Monsoon".
    Every other value (including 10 and 11) yields "Post-Monsoon".
    """
    season_by_month = {
        12: "Winter", 1: "Winter", 2: "Winter",
        3: "Summer", 4: "Summer", 5: "Summer",
        6: "Monsoon", 7: "Monsoon", 8: "Monsoon", 9: "Monsoon",
    }
    return season_by_month.get(m, "Post-Monsoon")

# Tag each forecast row with the season of its month.
weather_df["season"] = weather_df["month"].map(to_season)


# Fixed congestion score per city (one row per city).
traffic_df = pd.DataFrame(
    [
        ("Bangalore", 51),
        ("Delhi", 43),
        ("Mumbai", 47),
        ("Chennai", 32),
        ("Hyderabad", 38),
        ("Pune", 33),
        ("Kolkata", 28),
    ],
    columns=["city", "congestion_level"],
)

# Left join keeps every weather row and attaches its city's congestion score.
merged_df = pd.merge(weather_df, traffic_df, on="city", how="left")


# Fare card: one row per (city, vehicle_type) pair, 7 cities x 5 vehicle types.
# Within each city the rows are ordered Bike, Auto, Mini, Sedan, SUV, and the
# numeric tables below are laid out one city per line in the same city order.
fare_df = pd.DataFrame({
    "city": [
        name
        for name in ("Bangalore", "Delhi", "Mumbai", "Chennai",
                     "Hyderabad", "Pune", "Kolkata")
        for _ in range(5)
    ],

    "vehicle_type": ["Bike", "Auto", "Mini", "Sedan", "SUV"] * 7,

    "base_fare_per_km": [
        7, 11, 15, 18, 25,    # Bangalore
        6, 10, 14, 17, 23,    # Delhi
        7, 12, 17, 20, 28,    # Mumbai
        6, 10, 14, 17, 24,    # Chennai
        6, 10, 14, 17, 23,    # Hyderabad
        7, 11, 16, 19, 26,    # Pune
        6, 10, 14, 17, 23,    # Kolkata
    ],

    "minimum_fare": [
        25, 35, 50, 65, 90,   # Bangalore
        20, 30, 45, 60, 85,   # Delhi
        25, 35, 55, 70, 95,   # Mumbai
        20, 30, 45, 60, 85,   # Chennai
        20, 30, 45, 60, 85,   # Hyderabad
        25, 35, 50, 65, 90,   # Pune
        20, 30, 45, 60, 85,   # Kolkata
    ],

    # The booking fee depends only on the vehicle type.
    "booking_fee": [10, 20, 30, 35, 40] * 7,

    # 1.5 for the first six cities, 1.25 for the last one (Kolkata).
    "night_multiplier": [1.5] * 30 + [1.25] * 5,
})

# Seed NumPy's global RNG so the synthetic vehicle types below, and every
# later np.random draw in this script, are the same on every run. This matches
# the random_state=42 already used for the train/test splits.
np.random.seed(42)

# Random vehicle assignment: one uniformly drawn vehicle type per row.
merged_df["vehicle_type"] = np.random.choice(["Bike","Auto","Mini","Sedan","SUV"], len(merged_df))

# Merge fare: attach the fare-card columns for each (city, vehicle_type) pair.
merged_df = merged_df.merge(fare_df, on=["city","vehicle_type"], how="left")

def generate_distance(n):
    """Draw ``n`` synthetic trip distances as a list of floats.

    Each draw first picks a band with one ``np.random.rand()`` call, then
    samples uniformly inside it: 2-8 with probability 0.7, 8-15 with
    probability 0.2, and 15-35 otherwise.
    """
    distances = []
    for _ in range(n):
        pick = np.random.rand()
        if pick >= 0.9:
            low, high = 15, 35
        elif pick >= 0.7:
            low, high = 8, 15
        else:
            low, high = 2, 8
        distances.append(np.random.uniform(low, high))
    return distances

# One synthetic trip distance per row, rounded to two decimals.
merged_df["distance_km"] = np.round(generate_distance(len(merged_df)), 2)

# Expand dataset 10x: each row is repeated ten times back-to-back.
# NOTE(review): the repeats are exact copies (same distance, vehicle and
# weather), so identical rows can end up on both sides of a later random split.
expanded_df = merged_df.iloc[np.repeat(np.arange(len(merged_df)), 10)].reset_index(drop=True)


def generate_demand(row):
    """Return the synthetic demand level for one ride row.

    Reads ``row["hour"]`` and ``row["rain"]``. Hours 7-10 and 17-21
    (inclusive) start at 4, all other hours at 2. Any rain above zero adds 1,
    and the result is capped at 5.
    """
    hour, rain = row["hour"], row["rain"]
    morning_peak = 7 <= hour <= 10
    evening_peak = 17 <= hour <= 21
    level = 4 if (morning_peak or evening_peak) else 2
    level += int(rain > 0)
    return min(level, 5)

# Score demand row by row; generate_demand reads each row's "hour" and "rain".
expanded_df["demand_level"] = expanded_df.apply(generate_demand, axis=1)


def generate_surge(row):
    """Return the surge multiplier for one ride row, rounded to two decimals.

    The base comes from ``row["demand_level"]``: 1.0 up to level 3, 1.15 at
    level 4, and 1.3 otherwise. ``row["rain"]`` above 1 and
    ``row["congestion_level"]`` above 40 each add 0.05. The result is clamped
    to the range 1.0-2.0.
    """
    demand = row["demand_level"]
    surge = 1.0 if demand <= 3 else (1.15 if demand == 4 else 1.3)

    # Apply the rain bump first, then the traffic bump.
    for triggered in (row["rain"] > 1, row["congestion_level"] > 40):
        if triggered:
            surge += 0.05

    clamped = max(surge, 1.0)
    clamped = min(clamped, 2.0)
    return round(clamped, 2)

# Surge multiplier for every ride, computed row by row from demand, rain and
# congestion.
expanded_df["surge_multiplier"] = expanded_df.apply(generate_surge, axis=1)


# Binary classification target: 1 when the ride is actually surged
# (multiplier above 1.0), otherwise 0.
# The flag used to be zeroed on a random 30% of rows, which made it independent
# of every feature and of surge_multiplier, so no classifier could do better
# than predict the majority class.
expanded_df["surge_flag"] = (expanded_df["surge_multiplier"] > 1.0).astype(int)
# Kept so the name still exists: the index labels of the non-surge rows.
no_surge_idx = expanded_df.index[expanded_df["surge_flag"] == 0]


def compute_total_fare(r):
    """Return the total fare for one ride row, rounded to two decimals.

    The distance charge (``base_fare_per_km`` x ``distance_km``) is floored at
    ``minimum_fare``, the ``booking_fee`` is added, and the sum is scaled by
    ``night_multiplier`` and then by ``surge_multiplier``.

    NOTE(review): the night multiplier is applied to every row regardless of
    the hour — confirm that this is intended.
    """
    distance_charge = r["base_fare_per_km"] * r["distance_km"]
    subtotal = max(distance_charge, r["minimum_fare"]) + r["booking_fee"]
    return round(subtotal * r["night_multiplier"] * r["surge_multiplier"], 2)

# Price every ride row by row.
expanded_df["total_fare"] = expanded_df.apply(compute_total_fare, axis=1)


# Columns kept for modelling, in their final order; all others are dropped.
cols = [
    # context
    "city", "month", "season", "hour",
    # weather
    "temperature_2m", "relativehumidity_2m", "rain",
    # ride
    "congestion_level", "vehicle_type", "distance_km",
    # synthetic targets
    "demand_level", "surge_multiplier", "surge_flag",
    # fare card
    "base_fare_per_km", "minimum_fare", "booking_fee", "night_multiplier",
    "total_fare",
]

expanded_df = expanded_df.loc[:, cols]


# One encoder per categorical column, kept so the integer codes can be mapped
# back to their labels later.
le_city, le_vehicle, le_season = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace each string column in place with its integer codes.
expanded_df["city"] = le_city.fit_transform(expanded_df["city"])
expanded_df["vehicle_type"] = le_vehicle.fit_transform(expanded_df["vehicle_type"])
expanded_df["season"] = le_season.fit_transform(expanded_df["season"])


# Classification task: predict surge_flag from ride, time and weather columns.
Xc = expanded_df.loc[:, [
    "distance_km", "hour", "month",
    "temperature_2m", "relativehumidity_2m", "rain",
    "congestion_level", "vehicle_type", "season", "demand_level",
]]
yc = expanded_df.loc[:, "surge_flag"]

# Regression task: predict surge_multiplier from ride, fare-card and weather
# columns.
Xr = expanded_df.loc[:, [
    "distance_km", "base_fare_per_km", "minimum_fare",
    "booking_fee", "temperature_2m", "relativehumidity_2m",
    "rain", "demand_level", "congestion_level",
    "vehicle_type", "night_multiplier",
]]
yr = expanded_df.loc[:, "surge_multiplier"]


# Classification: a single stratified 80/20 split so that both partitions keep
# the class ratio of yc.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(Xc, yc))
Xc_train, yc_train = Xc.iloc[train_idx], yc.iloc[train_idx]
Xc_test, yc_test = Xc.iloc[test_idx], yc.iloc[test_idx]


# Regression: a plain shuffled 80/20 split.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, random_state=42, test_size=0.2
)


# Classifiers compared on the surge_flag task. random_state is pinned on every
# model (matching the seed used for the splits) so repeated runs print the same
# metrics; an unseeded random forest gives different results on each run.
models_cls = {
    "LightGBM": LGBMClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
}

print("\n======== CLASSIFICATION RESULTS ========\n")
for name, model in models_cls.items():
    # Fit on the training partition and report on the held-out test partition.
    model.fit(Xc_train, yc_train)
    preds = model.predict(Xc_test)
    print(f"\n{name}")
    print("Accuracy:", accuracy_score(yc_test, preds))
    print(classification_report(yc_test, preds))
    print("-"*60)


# Regressors compared on the surge_multiplier task. random_state is pinned on
# every model so the metrics and the feature-importance table below are
# reproducible; unseeded tree ensembles give different results on each run.
models_reg = {
    "Extra Trees": ExtraTreesRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42)
}

print("\n======== REGRESSION RESULTS ========\n")
for name, model in models_reg.items():
    # Fit on the training partition and report on the held-out test partition.
    model.fit(Xr_train, yr_train)
    preds = model.predict(Xr_test)
    print(f"\n{name}")
    print("R²:", r2_score(yr_test, preds))
    print("MAE:", mean_absolute_error(yr_test, preds))
    print("RMSE:", np.sqrt(mean_squared_error(yr_test, preds)))
    print("-"*60)

# Feature importances of the fitted Extra Trees model, largest first. The
# index keeps each feature's original position in Xr.columns.
fi = pd.DataFrame({
    "feature": Xr.columns,
    "importance": models_reg["Extra Trees"].feature_importances_
}).sort_values(by="importance", ascending=False)

# Bare expression: shown as the cell output when run in a notebook.
fi




[LightGBM] [Info] Number of positive: 6586, number of negative: 2822
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 589
[LightGBM] [Info] Number of data points in the train set: 9408, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.700043 -> initscore=0.847500
[LightGBM] [Info] Start training from score 0.847500

LightGBM
Accuracy: 0.6875
              precision    recall  f1-score   support

           0       0.36      0.05      0.09       706
           1       0.70      0.96      0.81      1646

    accuracy                           0.69      2352
   macro avg       0.53      0.51      0.45      2352
weighted avg       0.60      0.69      0.60      2352

------------------------------------------------------------

XGBoost
Accuracy: 0.65433673469

Unnamed: 0,feature,importance
7,demand_level,0.8959319
8,congestion_level,0.1016827
10,night_multiplier,0.001287114
4,temperature_2m,0.0007474708
5,relativehumidity_2m,0.000350102
0,distance_km,5.060276e-07
1,base_fare_per_km,2.47589e-07
2,minimum_fare,1.928919e-12
9,vehicle_type,9.714245e-13
3,booking_fee,8.937141e-13
