In [39]:
import pandas as pd
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [40]:

# Read the trip records from the Parquet file using the pyarrow engine.
# (CSV alternative: df = pd.read_csv("yellow_tripdata_2025-01.csv"))
df = pd.read_parquet(
    "yellow_tripdata_2025-01.parquet",
    engine="pyarrow",
)

# Preview the first five rows.
print(df.head(n=5))


   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2025-01-01 00:18:38   2025-01-01 00:26:59              1.0   
1         1  2025-01-01 00:32:40   2025-01-01 00:35:13              1.0   
2         1  2025-01-01 00:44:04   2025-01-01 00:46:01              1.0   
3         2  2025-01-01 00:14:27   2025-01-01 00:20:01              3.0   
4         2  2025-01-01 00:21:34   2025-01-01 00:25:06              3.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           1.60         1.0                  N           229           237   
1           0.50         1.0                  N           236           237   
2           0.60         1.0                  N           141           141   
3           0.52         1.0                  N           244           244   
4           0.66         1.0                  N           244           116   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [41]:
# Add datetime-typed copies of the raw pickup and drop-off timestamp columns.
for parsed_col, raw_col in (
    ("pickup_datetime", "tpep_pickup_datetime"),
    ("dropoff_datetime", "tpep_dropoff_datetime"),
):
    df[parsed_col] = pd.to_datetime(df[raw_col])


In [42]:
# Disabled filter: uncomment the next line to drop trips whose distance is not positive.
# df = df[df["trip_distance"] > 0]



In [43]:
# Feature Engineering
# Derive the calendar features from the parsed `pickup_datetime` /
# `dropoff_datetime` columns instead of the raw `tpep_*` ones. The parsed
# columns come from pd.to_datetime, so the `.dt` accessor also works when the
# data was loaded from CSV (where the raw timestamps arrive as strings).
df['hour'] = df['pickup_datetime'].dt.hour
df['day'] = df['pickup_datetime'].dt.day
df['month'] = df['pickup_datetime'].dt.month
# Trip length in minutes (total seconds / 60).
df['trip_duration'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60


In [44]:
# Impute missing values in numeric columns with that column's median.
# Non-numeric columns have no entry in the medians Series and are left as-is.
numeric_medians = df.median(numeric_only=True)
df.fillna(value=numeric_medians, inplace=True)

In [45]:
# Surge classification: label is 1 when the fare is above the dataset median, else 0.
# NOTE(review): 'fare_amount' is used both as an input feature and as the source
# of the label, so the classifier can recover the label directly from its input
# (target leakage). Removing it changes the feature count expected by the scaler
# and the Streamlit form, so it has to be done across those cells together.
fare_median = df['fare_amount'].median()
surge_feature_cols = ['fare_amount', 'trip_distance', 'hour', 'day', 'month']
X = df[surge_feature_cols]
is_above_median = df['fare_amount'] > fare_median
y = is_above_median.astype(int)



In [46]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [47]:
# Oversample with SMOTE so both classes are equally represented.
# Only the training split is resampled; the test split is not touched here.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X=X_train, y=y_train)

In [48]:
# Standardize features: learn mean/variance on the resampled training data only,
# then apply the same transformation to both splits.
# NOTE(review): this cell overwrites X_train_sm and X_test with their scaled
# versions, so running it a second time re-fits the scaler on already-scaled
# data and scales X_test twice. Re-run from the train/test split cell instead.
scaler = StandardScaler().fit(X_train_sm)
X_train_sm = scaler.transform(X_train_sm)
X_test = scaler.transform(X_test)

In [49]:
# Train the two surge classifiers on the resampled, standardized training data.

# Random forest: 200 trees, depth capped at 10, using all CPU cores.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train_sm, y_train_sm)

# Gradient-boosted trees with the histogram tree method; device="cuda" requests GPU execution.
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    random_state=42,
    tree_method='hist',
    device="cuda",
)
xgb_model.fit(X_train_sm, y_train_sm)

In [50]:
# Logistic-regression surge classifier. It predicts the same 0/1 label as the
# models above (not a continuous multiplier) and is fitted on the original,
# unscaled and un-resampled training split.
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Store the predicted label for every row of the full feature frame.
surge_labels = logistic_model.predict(X)
df["predicted_surge"] = surge_labels

In [51]:
# Price Prediction Model: regress the fare amount on distance, hour and month.
price_feature_cols = ['trip_distance', 'hour', 'month']
X_price = df[price_feature_cols]
y_price = df['fare_amount']

# 80/20 split with a fixed seed.
X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(
    X_price,
    y_price,
    test_size=0.2,
    random_state=42,
)

# Random-forest regressor: 100 trees, depth capped at 7, using all CPU cores.
price_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=7,
    random_state=42,
    n_jobs=-1,
)
price_model.fit(X_train_price, y_train_price)

In [52]:
# Demand Prediction Model
# Demand is the number of trips that started in the same clock hour as each row.
# Grouping on the raw pickup timestamp would count trips sharing the exact same
# second, which is almost always a handful at most and carries no information
# about how busy a period is. Flooring to the hour gives one bucket per
# calendar hour (the year and date are kept, so different days are not merged).
pickup_hour_bucket = df['tpep_pickup_datetime'].dt.floor('h')
df['demand'] = df.groupby(pickup_hour_bucket)['fare_amount'].transform('count')

X_demand = df[['hour', 'day', 'month', 'trip_distance']]
y_demand = df['demand']
# 80/20 split with a fixed seed.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_demand, y_demand, test_size=0.2, random_state=42)

# Gradient-boosted regressor; device="cuda" requests GPU execution.
demand_model = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42, tree_method="hist", device="cuda")
demand_model.fit(X_train_d, y_train_d)


In [53]:
# Streamlit UI
# Two pages are selectable from the sidebar: the prediction form and a
# visualization page.
st.set_page_config(page_title="Ride Prediction App", layout="wide")
st.sidebar.title("Navigation")
page = st.sidebar.radio("Select Page", ["Surge & Price Prediction", "Data Visualization"])

if page == "Surge & Price Prediction":
    st.title("\U0001F697 Surge Prediction & Ride Price Estimator")

    # Inputs shared by both predictions.
    fare_amount = st.number_input("Fare Amount", min_value=0.0)
    trip_distance = st.number_input("Trip Distance (miles)", min_value=0.0)
    hour = st.slider("Hour of the day", 0, 23, 12)
    day = st.slider("Day of the month", 1, 31, 15)
    month = st.slider("Month", 1, 12, 6)

    if st.button("Predict Surge"):
        # Build a one-row frame with the same column names and order the scaler
        # was fitted with. This avoids scikit-learn's feature-name mismatch
        # warning and makes the mapping of inputs to features explicit.
        input_data = pd.DataFrame(
            [[fare_amount, trip_distance, hour, day, month]],
            columns=['fare_amount', 'trip_distance', 'hour', 'day', 'month'],
        )
        input_data = scaler.transform(input_data)
        # Probability of the positive (surge) class, thresholded at 0.5.
        y_probs = rf_model.predict_proba(input_data)[:, 1]
        prediction = (y_probs >= 0.5).astype(int)
        surge = "Yes" if prediction[0] == 1 else "No"
        st.write(f"Surge Pricing? {surge}")

    if st.button("Predict Price"):
        # Same reasoning as above: the price model was fitted on a DataFrame
        # with these column names, so pass them along at prediction time.
        user_input_price = pd.DataFrame(
            [[trip_distance, hour, month]],
            columns=['trip_distance', 'hour', 'month'],
        )
        predicted_price = price_model.predict(user_input_price)
        st.success(f"Estimated Ride Price: ${predicted_price[0]:.2f} \U0001F4B2")

elif page == "Data Visualization":
    # This page is offered in the sidebar but has no content yet; say so
    # explicitly instead of rendering a blank page.
    st.title("Data Visualization")
    st.info("No visualizations have been added to this page yet.")




In [54]:
import joblib

# Write the fitted scaler and the three fitted models to .pkl files in the
# current working directory. The XGBoost surge classifier and the logistic
# model are not saved here.
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=rf_model, filename="surge_rf.pkl")
joblib.dump(value=price_model, filename="price_rf.pkl")
# Left as the final bare expression so the cell displays the written file name.
joblib.dump(value=demand_model, filename="demand_xgb.pkl")


['demand_xgb.pkl']

In [55]:
import joblib

# Sanity check: confirm that every saved artifact can be read back from disk.
model_files = [
    "scaler.pkl",
    "surge_rf.pkl",
    "price_rf.pkl",
    "demand_xgb.pkl",
]

for file in model_files:
    try:
        model = joblib.load(file)
    except Exception as e:
        # Report the failure and continue with the remaining files.
        print(f"❌ Error loading {file}: {e}")
    else:
        print(f"✅ {file} loaded successfully.")


✅ scaler.pkl loaded successfully.
✅ surge_rf.pkl loaded successfully.
✅ price_rf.pkl loaded successfully.
✅ demand_xgb.pkl loaded successfully.
