<h1 align="center" style="color:blue">NYC Trips Volume Prediction </h1>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [217]:
# Load the trip summary CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("trip_summary_from_df.csv")
# Show the column names available in the loaded table.
df.columns

Index(['pickup_day', 'pickup_hour', 'PU_Borough', 'PU_Zone', 'DO_Borough',
       'DO_Zone', 'avg_trip_miles', 'total_trips', 'avg_trip_time',
       'total_driver_pay', 'total_tips', 'total_airport_fee',
       'sum_total_congestion', 'avg_base_passenger_fare'],
      dtype='object')

In [219]:
# Row and column counts of the table as loaded, before any cleaning.
df.shape

(3049276, 14)

In [221]:
# Preview the first rows to sanity-check the loaded values.
df.head()

Unnamed: 0,pickup_day,pickup_hour,PU_Borough,PU_Zone,DO_Borough,DO_Zone,avg_trip_miles,total_trips,avg_trip_time,total_driver_pay,total_tips,total_airport_fee,sum_total_congestion,avg_base_passenger_fare
0,Saturday,0,Brooklyn,Williamsburg (North Side),Brooklyn,Williamsburg (South Side),1.011874,246,6.595461,1825.72,98.73,6.0,0.0,12.287765
1,Saturday,0,Brooklyn,Williamsburg (South Side),Manhattan,Central Park,8.18375,4,27.470833,139.49,8.63,0.0,12.75,53.27
2,Saturday,0,Brooklyn,Manhattan Beach,Brooklyn,Bensonhurst East,3.290933,15,10.745556,181.34,5.0,0.0,0.0,16.145334
3,Saturday,0,Brooklyn,Bath Beach,Queens,Elmhurst/Maspeth,14.78,1,25.933332,36.58,0.0,0.0,0.0,42.01
4,Saturday,0,Queens,Rosedale,Queens,Rosedale,1.106354,48,5.619097,281.33,7.0,0.0,0.0,9.020625


<h2 align="center" style="color:blue">Data Cleaning</h2>

### Remove correlated columns

In [223]:
# Drop the pay / tip / fee totals that this section treats as correlated columns.
correlated_cols = [
    'total_driver_pay',
    'total_tips',
    'total_airport_fee',
    'sum_total_congestion',
]
df = df.drop(correlated_cols, axis=1)

### Handling Nulls

In [226]:
# Discard every row in which any pickup/drop-off borough or zone is missing.
location_cols = [
    f'{side}_{level}'
    for side in ('PU', 'DO')
    for level in ('Borough', 'Zone')
]

df = df.dropna(axis=0, how='any', subset=location_cols)

df.shape

(3009756, 10)

In [227]:
# Per-column null counts after dropping rows with missing location fields.
df.isnull().sum()

pickup_day                 0
pickup_hour                0
PU_Borough                 0
PU_Zone                    0
DO_Borough                 0
DO_Zone                    0
avg_trip_miles             0
total_trips                0
avg_trip_time              0
avg_base_passenger_fare    0
dtype: int64

In [26]:
# Number of fully duplicated rows in the current frame.
df.duplicated().sum()

0

<h2 align="center" style="color:blue"> Pickup_day (cyclical encoding) </h2>

In [230]:
# List the distinct day labels so the mapping defined next can cover each of them.
df['pickup_day'].unique()

array(['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday'], dtype=object)

In [232]:
# Encode weekdays as integers, Monday = 0 through Sunday = 6.
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_map = {day_name: position for position, day_name in enumerate(ordered_days)}
df['pickup_day'] = df['pickup_day'].map(day_map)

In [234]:
# Cast the mapped day codes to integer dtype.
# NOTE(review): Series.map leaves NaN for labels missing from day_map, and astype(int)
# should raise on NaN, so this cast presumably doubles as a completeness check — confirm.
df['pickup_day'] = df['pickup_day'].astype(int)

In [236]:
# Place the day code on a circle with period 7 so that day 6 and day 0 end up adjacent.
day_angle = 2 * np.pi * df['pickup_day'] / 7
df['pickup_day_sin'] = np.sin(day_angle)
df['pickup_day_cos'] = np.cos(day_angle)

<h2 align="center" style="color:blue"> Pickup_hour (cyclical encoding) </h2>

In [238]:
# Make sure the hour column is numeric before the trigonometric encoding that follows.
# errors='coerce' turns unparsable values into NaN instead of raising.
df['pickup_hour'] = pd.to_numeric(df['pickup_hour'], errors='coerce')

In [240]:
# Place the hour on a circle with period 24 so that hour 23 and hour 0 end up adjacent.
hour_angle = 2 * np.pi * df['pickup_hour'] / 24
df['pickup_hour_sin'] = np.sin(hour_angle)
df['pickup_hour_cos'] = np.cos(hour_angle)

<h2 align="center" style="color:blue"> is_weekend, avg_speed, fare_per_mile </h2>

In [243]:
# Weekend flag: day codes 5 and 6 are Saturday and Sunday under the Monday = 0 coding.
# A vectorized comparison replaces the per-row Python lambda; the result is the same
# 0/1 int64 column, computed without a Python-level call for every row.
df['is_weekend'] = (df['pickup_day'] >= 5).astype('int64')

In [244]:
# Ratio features. A zero denominator is masked to NaN first so the division yields NaN
# rather than inf.
# NOTE(review): avg_trip_time looks like minutes judging by the sample rows, which would
# make avg_speed miles per minute — confirm the unit.
nonzero_time = df['avg_trip_time'].mask(df['avg_trip_time'] == 0)
nonzero_miles = df['avg_trip_miles'].mask(df['avg_trip_miles'] == 0)

df['avg_speed'] = df['avg_trip_miles'].div(nonzero_time)
df['fare_per_mile'] = df['avg_base_passenger_fare'].div(nonzero_miles)

<h2 align="center" style="color:blue"> trip_length_category (ordinal encoding) </h2>

In [246]:
# Right-closed mileage bins: short [0, 2], medium (2, 5], long (5, inf).
# include_lowest=True keeps an average of exactly 0 miles in 'short'; without it
# pd.cut excludes the left edge of the first bin and such rows would become NaN.
bins = [0, 2, 5, np.inf]
labels = ['short', 'medium', 'long']
df['trip_length_cat'] = pd.cut(df['avg_trip_miles'], bins=bins, labels=labels, include_lowest=True)

In [247]:
# Inspect which trip-length categories actually occur in the data.
df['trip_length_cat'].unique()

['short', 'long', 'medium']
Categories (3, object): ['short' < 'medium' < 'long']

In [252]:
from sklearn.preprocessing import OrdinalEncoder

# Fix the category order explicitly so that short -> 0, medium -> 1, long -> 2.
length_order = ['short', 'medium', 'long']
encoder = OrdinalEncoder(categories=[length_order])

length_codes = encoder.fit_transform(df[['trip_length_cat']])
df['trip_length_cat_encoded'] = length_codes

In [253]:
# Confirm the encoded column only contains the expected ordinal codes.
df['trip_length_cat_encoded'].unique()

array([0., 2., 1.])

<h2 align="center" style="color:blue"> PU_Borough, DO_Borough (one-hot encoding) </h2>

In [257]:
# Total trips per pickup borough, largest first.
borough_trips = (
    df.groupby('PU_Borough')['total_trips']
    .sum()
    .sort_values(ascending=False)
)

# Turn the Series into a two-column table for printing.
borough_trips_table = borough_trips.reset_index(name='Total_Trips')

print(borough_trips_table)

      PU_Borough  Total_Trips
0      Manhattan      7051812
1       Brooklyn      5353011
2         Queens      3984179
3          Bronx      2525637
4  Staten Island       303269


In [259]:
# Total trips per drop-off borough, largest first.
borough_trips = (
    df.groupby('DO_Borough')['total_trips']
    .sum()
    .sort_values(ascending=False)
)

# Turn the Series into a two-column table for printing.
borough_trips_table = borough_trips.reset_index(name='Total_Trips')

print(borough_trips_table)

      DO_Borough  Total_Trips
0      Manhattan      6739795
1       Brooklyn      5384748
2         Queens      4124466
3          Bronx      2543123
4  Staten Island       299494
5            EWR       126282


In [260]:
# One-hot encode both borough columns; every level is kept (no reference level is dropped).
df = pd.get_dummies(
    df,
    columns=['PU_Borough', 'DO_Borough'],
    prefix={'PU_Borough': 'PU', 'DO_Borough': 'DO'},
    drop_first=False,
)

<h2 align="center" style="color:blue"> PU_Zone, DO_Zone (frequency encoding) </h2>

In [263]:
# Replace each high-cardinality zone name with the number of rows in which it appears.
# The counts are derived from the zone columns alone, not from total_trips.
# NOTE(review): the counts are taken over the whole frame, i.e. before the train/test
# split performed later in this notebook, so test rows contribute to them.

# Pickup zone frequency encoding
pu_zone_counts = df['PU_Zone'].value_counts()
freq_map_pu = pu_zone_counts.to_dict()
df['PU_Zone_freq'] = df['PU_Zone'].map(pu_zone_counts)

# Dropoff zone frequency encoding
do_zone_counts = df['DO_Zone'].value_counts()
freq_map_do = do_zone_counts.to_dict()
df['DO_Zone_freq'] = df['DO_Zone'].map(do_zone_counts)

<h2 align="center" style="color:blue"> Feature Selection </h2>

In [266]:
# Review every column present before choosing which ones go to the model.
df.columns

Index(['pickup_day', 'pickup_hour', 'PU_Zone', 'DO_Zone', 'avg_trip_miles',
       'total_trips', 'avg_trip_time', 'avg_base_passenger_fare',
       'pickup_day_sin', 'pickup_day_cos', 'pickup_hour_sin',
       'pickup_hour_cos', 'is_weekend', 'avg_speed', 'fare_per_mile',
       'trip_length_cat', 'trip_length_cat_encoded', 'PU_Bronx', 'PU_Brooklyn',
       'PU_Manhattan', 'PU_Queens', 'PU_Staten Island', 'DO_Bronx',
       'DO_Brooklyn', 'DO_EWR', 'DO_Manhattan', 'DO_Queens',
       'DO_Staten Island', 'PU_Zone_freq', 'DO_Zone_freq'],
      dtype='object')

In [268]:
# Remove the raw columns for which an engineered counterpart was added in earlier cells
# (cyclical day/hour, zone frequencies, speed and fare ratios, encoded length category).
superseded_cols = [
    'pickup_day', 'pickup_hour',
    'PU_Zone', 'DO_Zone',
    'avg_trip_miles', 'avg_trip_time',
    'trip_length_cat',
]
df = df.drop(superseded_cols, axis=1)

In [270]:
# Shape after feature selection.
df.shape

(3009756, 23)

<h2 align="center" style="color:blue"> Optimize Datatypes of features </h2>

In [272]:
# Inspect the current dtypes before downcasting them.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3009756 entries, 0 to 3049275
Data columns (total 23 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   total_trips              int64  
 1   avg_base_passenger_fare  float64
 2   pickup_day_sin           float64
 3   pickup_day_cos           float64
 4   pickup_hour_sin          float64
 5   pickup_hour_cos          float64
 6   is_weekend               int64  
 7   avg_speed                float64
 8   fare_per_mile            float64
 9   trip_length_cat_encoded  float64
 10  PU_Bronx                 bool   
 11  PU_Brooklyn              bool   
 12  PU_Manhattan             bool   
 13  PU_Queens                bool   
 14  PU_Staten Island         bool   
 15  DO_Bronx                 bool   
 16  DO_Brooklyn              bool   
 17  DO_EWR                   bool   
 18  DO_Manhattan             bool   
 19  DO_Queens                bool   
 20  DO_Staten Island         bool   
 21  PU_Zone_freq 

In [274]:
# Optimize numeric types
int_cols = df.select_dtypes(include=['int64']).columns
float_cols = df.select_dtypes(include=['float64']).columns
obj_cols = df.select_dtypes(include=['object']).columns

# Downcast integers and floats
df[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='integer')
df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')

# Convert object columns to category if they are categorical
for col in obj_cols:
    if df[col].nunique() / len(df) < 0.5:  # heuristic: if less than 50% unique
        df[col] = df[col].astype('category')

# Check memory usage after optimization
print(df.info(memory_usage='deep'))

<class 'pandas.core.frame.DataFrame'>
Index: 3009756 entries, 0 to 3049275
Data columns (total 23 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   total_trips              int16  
 1   avg_base_passenger_fare  float32
 2   pickup_day_sin           float32
 3   pickup_day_cos           float32
 4   pickup_hour_sin          float32
 5   pickup_hour_cos          float32
 6   is_weekend               int8   
 7   avg_speed                float32
 8   fare_per_mile            float32
 9   trip_length_cat_encoded  float32
 10  PU_Bronx                 bool   
 11  PU_Brooklyn              bool   
 12  PU_Manhattan             bool   
 13  PU_Queens                bool   
 14  PU_Staten Island         bool   
 15  DO_Bronx                 bool   
 16  DO_Brooklyn              bool   
 17  DO_EWR                   bool   
 18  DO_Manhattan             bool   
 19  DO_Queens                bool   
 20  DO_Staten Island         bool   
 21  PU_Zone_freq 

<h2 align="center" style="color:blue"> Train Test split </h2>

In [276]:
from sklearn.model_selection import train_test_split

# The trip count is the target; every remaining column is a predictor.
y = df['total_trips']
X = df.drop('total_trips', axis=1)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

<h2 align="center" style="color:blue"> Scaling </h2>

In [278]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h2 align="center" style="color:blue"> Train with Linear Models (Linear Regression, Ridge, and Lasso) </h2>

In [281]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score


# Initialize the three linear baselines; all are fitted on the standardized features.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(alpha=10.0, random_state=42),
    'Lasso': Lasso(alpha=1, random_state=42, max_iter=5000)
}

# Train each model on the training split and report its test-set metrics.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # RMSE is taken as the square root of the MSE. The `squared=False` shortcut is
    # deprecated in recent scikit-learn releases, and np.sqrt matches the way the
    # overfitting check later in this notebook computes the same metric.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f'{name} -> RMSE: {rmse:.2f}, R2: {r2:.2f}')

LinearRegression -> RMSE: 12.08, R2: 0.24
Ridge -> RMSE: 12.08, R2: 0.24
Lasso -> RMSE: 12.45, R2: 0.19


<h2 align="center" style="color:blue"> HistGradientBoostingRegressor </h2>

In [284]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the gradient-boosting model with a fixed seed.
hgb = HistGradientBoostingRegressor(
    max_iter=500,
    learning_rate=0.1,
    max_depth=10,
    min_samples_leaf=20,
    random_state=42
)

# Fit on the unscaled training features (the scaled copies are only used by the linear models).
hgb.fit(X_train, y_train)

# Predict on the held-out split
y_pred = hgb.predict(X_test)

# Evaluate. RMSE is the square root of the MSE; the `squared=False` shortcut is
# deprecated in recent scikit-learn releases, and np.sqrt matches the overfitting
# check that follows in this notebook.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'HGB -> RMSE: {rmse:.2f}, R2: {r2:.2f}')

HGB -> RMSE: 5.79, R2: 0.83


<h2 align="center" style="color:blue"> Check Overfitting </h2>

In [296]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


def _rmse_and_r2(y_true, y_hat):
    """Return (RMSE, R²) for one set of predictions."""
    return np.sqrt(mean_squared_error(y_true, y_hat)), r2_score(y_true, y_hat)


# Score the fitted model on the data it was trained on ...
y_train_pred = hgb.predict(X_train)
train_rmse, train_r2 = _rmse_and_r2(y_train, y_train_pred)

# ... and on the held-out split; a large gap between the two would indicate overfitting.
y_test_pred = hgb.predict(X_test)
test_rmse, test_r2 = _rmse_and_r2(y_test, y_test_pred)

print(f"Train RMSE: {train_rmse:.2f}, R²: {train_r2:.2f}")
print(f"Test RMSE:  {test_rmse:.2f}, R²: {test_r2:.2f}")

Train RMSE: 5.58, R²: 0.83
Test RMSE:  5.79, R²: 0.83


<h2 align="center" style="color:blue"> Conclusion </h2>

Baseline linear models performed poorly: Linear Regression and Ridge both reached RMSE ≈ 12.08 and R² ≈ 0.24, and Lasso was slightly worse (RMSE ≈ 12.45, R² ≈ 0.19), 
indicating they could not capture the complex, non-linear relationships in the data.

After testing advanced models, the Histogram Gradient Boosting Regressor (HGB) achieved RMSE ≈ 5.79 and R² ≈ 0.83 on the test set, 
a major improvement in accuracy and explanatory power. The model was checked for overfitting: train and test metrics are close (train RMSE ≈ 5.58 vs. test RMSE ≈ 5.79, R² ≈ 0.83 on both), so it generalizes well.