This notebook explores the performance of linear regression, random forest regression, and gradient boosting on the weather-augmented bus dataset.

### Section 1 - Data transformation & splitting

In [1]:
import pandas as pd

data = pd.read_csv('processed_data.csv', low_memory=False)

# Display the first few rows of the DataFrame
data

Unnamed: 0,RecordedAtTime,PublishedLineName,DirectionRef,VehicleRef,DestinationLat,DestinationLong,NextStopPointName,OriginLat,OriginLong,VehicleLat,...,late,adherence,day_of_week,Temp,Pressure,Humidity,WindSpeed,Clouds,Visibility,WeatherConditions
0,00:03:24,B31,0,NYCT_4611,40.608433,-73.957100,GERRITSEN AV/GERRITSEN BEACH,40.587101,-73.918503,40.587024,...,False,4,Thursday,20.4,1014.2,76.80,20.8,5.1,15.6,Clear
1,00:03:23,Bx1,1,NYCT_5685,40.809654,-73.928360,RIVERDALE AV/W 231 ST,40.881187,-73.909340,40.881224,...,False,15,Thursday,20.4,1014.2,76.80,20.8,5.1,15.6,Clear
2,00:03:30,Bx39,0,NYCT_4718,40.903309,-73.849922,WHITE PLAINS RD/LAFAYETTE AV,40.807869,-73.852715,40.822127,...,True,-3,Thursday,20.4,1014.2,76.80,20.8,5.1,15.6,Clear
3,00:03:49,Q44-SBS,1,NYCT_5999,40.704933,-73.793320,MAIN ST/UNION TP,40.842560,-73.878334,40.717817,...,True,-4,Wednesday,20.4,1014.2,76.80,20.8,5.1,15.6,Clear
4,00:03:24,X10,1,NYCT_2660,40.633698,-74.129776,E 57 ST/LEXINGTON AV,40.760429,-73.967674,40.761108,...,True,-12,Wednesday,20.4,1014.2,76.80,20.8,5.1,15.6,Clear
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330398,23:53:32,Bx19,0,NYCT_5943,40.861202,-73.880676,E 149 ST/COURTLANDT AV,40.826702,-73.955475,40.816609,...,True,-2,Friday,26.8,1015.1,68.92,16.1,62.7,16.0,Partially cloudy
330399,23:53:19,B46,1,NYCT_4573,40.609142,-73.921440,MALCOLM X BL/MADISON ST,40.709393,-73.959518,40.686797,...,True,-7,Friday,26.8,1015.1,68.92,16.1,62.7,16.0,Partially cloudy
330400,23:53:31,B15,0,NYCT_6826,40.646908,-73.779564,DUMONT AV/FOUNTAIN AV,40.699722,-73.941513,40.669310,...,True,-41,Friday,26.8,1015.1,68.92,16.1,62.7,16.0,Partially cloudy
330401,23:53:31,M100,1,NYCT_4401,40.802273,-73.931145,DYCKMAN ST/NAGLE AV,40.871967,-73.913040,40.861808,...,True,-5,Friday,26.8,1015.1,68.92,16.1,62.7,16.0,Partially cloudy


In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Convert 'RecordedAtTime' to datetime
data['RecordedAtTime'] = pd.to_datetime(data['RecordedAtTime'])
# Extracting time features
data['hour'] = data['RecordedAtTime'].dt.hour

# Label encoding for categorical variables
label_cols = ['PublishedLineName', 'VehicleRef', 'NextStopPointName', 'day_of_week', 'WeatherConditions']
label_encoder = {col: LabelEncoder() for col in label_cols}

for col in label_cols:
    data[col] = label_encoder[col].fit_transform(data[col])

# Normalizing numerical features
numerical_cols = ['DestinationLat', 'DestinationLong', 'OriginLat', 'OriginLong', 'VehicleLat', 'VehicleLong', 
                  'Temp', 'Pressure', 'Humidity', 'WindSpeed', 'Clouds', 'Visibility', 'hour']
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Target Variable
target = data['adherence']

# Selecting features for the model
features = data[numerical_cols + label_cols]

# Splitting the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train.shape, X_val.shape, X_test.shape

((231282, 18), (49560, 18), (49561, 18))

In [3]:
X_train

Unnamed: 0,DestinationLat,DestinationLong,OriginLat,OriginLong,VehicleLat,VehicleLong,Temp,Pressure,Humidity,WindSpeed,Clouds,Visibility,hour,PublishedLineName,VehicleRef,NextStopPointName,day_of_week,WeatherConditions
202931,-1.156305,-0.940820,-1.549742,-2.298154,-1.592448,-2.446114,1.216957,-1.657820,0.178320,3.219172,0.282368,0.345944,0.541542,193,1171,7140,1,2
110515,-0.893441,0.303058,-1.026363,0.427229,-1.063791,0.454211,0.058409,1.058696,0.116639,-0.712646,-0.912615,0.345944,-1.949696,23,1680,7237,3,0
219572,-0.918860,-1.377476,-1.869201,-1.854824,-1.946894,-1.932637,-0.065721,0.009744,0.251800,-1.019255,-0.670276,0.345944,-1.237914,191,3251,6151,6,0
150097,-1.358875,-2.421245,-0.894411,-1.423960,-1.054641,-2.323992,0.803190,0.520772,-0.676627,-0.099426,-0.146600,0.345944,0.541542,177,3269,8611,6,2
235871,-0.389875,-0.540139,-0.739743,-0.525291,-0.579061,-0.840081,0.989385,0.413187,-0.898141,-0.135498,-0.670276,0.345944,-0.170240,38,651,3184,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,-2.234858,-2.550695,-1.308543,-1.467999,-2.256352,-2.564374,0.099786,1.246970,0.290954,-1.109435,-0.934899,0.345944,-1.059968,192,4233,5092,1,3
259178,-0.206355,1.271197,-0.254035,1.975145,-0.202595,1.308239,1.134203,-1.711613,-1.121263,0.802367,-0.734342,0.345944,0.185651,170,4006,269,2,0
131932,-0.098346,-0.709402,-1.513236,-2.668891,-1.572767,-2.783796,0.203227,0.251810,0.374088,-0.838897,-0.102032,0.345944,-1.237914,208,423,8268,5,2
146867,-1.307102,0.113711,-0.176429,-0.293918,-1.077197,0.026291,0.492864,0.709046,0.053886,0.549864,1.079023,0.345944,-0.526131,28,1128,8088,6,2


### Section 2 - Linear Regression

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

1. Simple linear regression was attempted to establish a baseline

In [18]:
from sklearn.linear_model import LinearRegression

# Initialize and fit the Linear Regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Predict on the validation set
y_pred_val = linear_reg.predict(X_val)

# Calculate MSE
mse_linear_reg = mean_squared_error(y_val, y_pred_val)
print(f'Linear Regression MSE: {mse_linear_reg}')

Linear Regression MSE: 112.60482316677242


2. Attempted ridge regression with different hyperparameters to see if that would improve performance

In [22]:
from sklearn.linear_model import Ridge

# Define the parameter grid
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Initialize the grid search model
grid_search = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best parameters: ", grid_search.best_params_)

Best parameters:  {'alpha': 100}


In [23]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=100)
ridge_reg.fit(X_train, y_train)

# Predict on the validation set
y_pred_val = ridge_reg.predict(X_val)

# Calculate MSE
mse_ridge_reg = mean_squared_error(y_val, y_pred_val)
print(f'Ridge Regression MSE: {mse_ridge_reg}')

Ridge Regression MSE: 112.60420312716633


3. Final result on test set

In [24]:
# Predict on the test set
y_pred = ridge_reg.predict(X_test)

# Calculate MSE
mse_ridge_reg = mean_squared_error(y_test, y_pred)
print(f'Ridge Regression MSE: {mse_ridge_reg}')

Ridge Regression MSE: 103.45408738055589


### Section 3 - Random Forests

1. Attempted a simple random forest to establish a baseline

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and fit the Random Forest model
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the validation set
y_pred_val = random_forest.predict(X_val)

# Calculate MSE
mse_random_forest = mean_squared_error(y_val, y_pred_val)
print(f'Random Forest MSE: {mse_random_forest}')


Random Forest MSE: 72.06656453793137


2. Fine-tuned hyperparameters with cross-validation grid search

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'min_samples_split': [2, 5, 10]
}

# Initialize the grid search model
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring="neg_mean_squared_error")

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best parameters: ", grid_search.best_params_,"\nBest score: ", grid_search.best_score_)

Best parameters:  {'min_samples_split': 5, 'n_estimators': 150} 
Best score:  -73.86739303129852


In [15]:
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [150, 200, 250],
    'min_samples_split': [3, 5, 8]
}

# Initialize the grid search model
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring="neg_mean_squared_error")

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best parameters: ", grid_search.best_params_,"\nBest score: ", grid_search.best_score_)

Best parameters:  {'min_samples_split': 3, 'n_estimators': 250} 
Best score:  -73.60519359367296


3. Final result on test set

In [13]:
# Initialize and fit the Random Forest model
random_forest = RandomForestRegressor(n_estimators=250, min_samples_split=3, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate MSE
mse_random_forest = mean_squared_error(y_test, y_pred)
print(f'Random Forest MSE: {mse_random_forest}')

Random Forest MSE: 67.71941617115336


### Section 4 - Gradient Boosting

The model attempted here showed the best performance on an analagous task with trains (https://github.com/amalrkrishna/subway_time_prediction/tree/master). However, since its performance on this task was significantly worse than that of random forests, further fine-tuning attempts were abandoned.

In [28]:
import xgboost

# Initialize and fit the GBM model
xgb_model = xgboost.XGBRegressor(n_estimators=50, learning_rate=0.05, gamma=0, subsample=1,colsample_bytree=1, max_depth=6)  
xgb_results = xgb_model.fit(X_train,y_train, verbose=True)
 
# Predict on the validation set
y_pred_val = xgb_results.predict(X_val)

# Calculate MSE
mse_xgb = mean_squared_error(y_val, y_pred_val)
print(f'GBM MSE: {mse_xgb}')

GBM MSE: 97.45109471773452


In [32]:
# Predict on the test set
y_pred = xgb_results.predict(X_test)

# Calculate MSE
mse_xgb = mean_squared_error(y_test, y_pred)
print(f'XGB MSE: {mse_xgb}')

XGB MSE: 89.73788002990814
