### W23P1 STAT 857 - Modeling

In [None]:
pip install xgboost lightgbm catboost

In [2]:
## Importing libraries
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor 
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
## Display up to 100 columns when a data-frame is rendered
pd.options.display.max_columns = 100

In [3]:
## Reading the data: the training set, the test set and the sample submission file (in that order)
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/W23P1_train_final.csv',
                     'Data/W23P1_test_final.csv',
                     'Data/W23P1_sample_submission.csv')
)

In [5]:
## Defining input and target variables

## One shared list of input columns, so the train and test frames are always
## built from exactly the same features in exactly the same order
FEATURE_COLS = ['passenger_count', 'distance', 'duration', 'pickup_day', 'pickup_hour', 'Friday', 'Monday', 'Saturday',
                'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'weekend', 'rush_hour', 'overnight', 'pickup_LGA', 'dropoff_LGA',
                'pickup_JFK', 'dropoff_JFK', 'pickup_EWR', 'dropoff_EWR', 'airport', 'change_borough', 'haversine']

X_train = train[FEATURE_COLS]
Y_train = train['fare_amount']

X_test = test[FEATURE_COLS]

### Baseline Models:

In [8]:
## Random Forest Model:
## random_state is fixed so that the fitted forest, and therefore the submission file,
## is the same every time the notebook is re-run
rf_md = RandomForestRegressor(max_depth = 3, n_estimators = 500, random_state = 543).fit(X_train, Y_train)

## Predicting on the test set and saving the submission
sub['fare_amount'] = rf_md.predict(X_test)

sub.to_csv('Submissions/rf_baseline.csv', index = False)

In [9]:
## XGBoost Model:
## Baseline hyper-parameters collected in one dictionary and unpacked into the regressor
xgb_baseline_params = {
    'tree_method': 'hist',
    'colsample_bytree': 0.7,
    'gamma': 0.8,
    'learning_rate': 0.01,
    'max_depth': 7,
    'min_child_weight': 10,
    'n_estimators': 1000,
    'subsample': 0.7,
}
XGB_md = XGBRegressor(**xgb_baseline_params)
XGB_md.fit(X_train, Y_train)

## Predicting on the test set and saving the submission
sub['fare_amount'] = XGB_md.predict(X_test)
sub.to_csv('Submissions/xgb_baseline.csv', index = False)

In [10]:
## LightGBM Model:
## LightGBM only applies bagging_fraction when bagging_freq is non-zero (its default is 0),
## so bagging_freq = 1 is passed to make the 70% row sampling actually take effect
lgb_md = LGBMRegressor(n_estimators = 1000, max_depth = 7, learning_rate = 0.01, num_leaves = 20, lambda_l1 = 3, lambda_l2 = 3,
                       bagging_fraction = 0.7, bagging_freq = 1, feature_fraction = 0.7).fit(X_train, Y_train)

## Predicting on the test set and saving the submission
sub['fare_amount'] = lgb_md.predict(X_test)

sub.to_csv('Submissions/lgbm_baseline.csv', index = False)



In [12]:
## CatBoost Model:
## Baseline hyper-parameters collected in one dictionary and unpacked into the regressor
cat_baseline_params = {
    'loss_function': 'RMSE',
    'iterations': 1000,
    'learning_rate': 0.01,
    'depth': 7,
    'random_strength': 0.5,
    'bagging_temperature': 0.7,
    'border_count': 30,
    'l2_leaf_reg': 5,
    'verbose': False,
}
cat_md = CatBoostRegressor(**cat_baseline_params)
cat_md.fit(X_train, Y_train)

## Predicting on the test set and saving the submission
sub['fare_amount'] = cat_md.predict(X_test)
sub.to_csv('Submissions/cat_baseline.csv', index = False)

### Second Round of Models: with optimized parameters

In [None]:
## Random Forest Model:
## random_state is fixed so that the fitted forest, its test predictions and the
## submission file are the same every time the notebook is re-run
rf_md = RandomForestRegressor(max_depth = 300, n_estimators = 12, min_samples_split = 5,
                              min_samples_leaf = 6, random_state = 543).fit(X_train, Y_train)

## Test-set predictions are kept in their own variable because the ensemble cell reuses them
rf_preds = rf_md.predict(X_test)

sub['fare_amount'] = rf_preds

sub.to_csv('Submissions/rf_rd2.csv', index = False)

In [None]:
## XGBoost Model:
## Second-round hyper-parameters collected in one dictionary and unpacked into the regressor
xgb_rd2_params = {
    'tree_method': 'hist',
    'n_estimators': 500,
    'learning_rate': 0.02,
    'max_depth': 5,
    'gamma': 0.2,
    'min_child_weight': 10,
    'subsample': 0.94,
    'colsample_bytree': 0.92,
    'verbosity': 0,
}
XGB_md = XGBRegressor(**xgb_rd2_params)
XGB_md.fit(X_train, Y_train)

## Test-set predictions are kept in their own variable because the ensemble cell reuses them
XGB_preds = XGB_md.predict(X_test)
sub['fare_amount'] = XGB_preds
sub.to_csv('Submissions/xgb_rd2.csv', index = False)

In [None]:
## LightGBM Model:
## Second-round hyper-parameters collected in one dictionary and unpacked into the regressor
## NOTE(review): LightGBM documents that row subsampling needs a non-zero subsample_freq /
## bagging_freq; none is set here, so subsample = 0.73 may have no effect -- TODO confirm intent
lgbm_rd2_params = {
    'boosting_type': 'dart',
    'n_estimators': 600,
    'learning_rate': 0.18,
    'num_leaves': 8,
    'max_depth': 8,
    'subsample': 0.73,
    'colsample_bytree': 0.86,
    'random_state': 543,
    'reg_alpha': 0.021,
    'reg_lambda': 0.027,
    'objective': 'rmse',
    'verbosity': -1,
}
lgbm_md = LGBMRegressor(**lgbm_rd2_params)
lgbm_md.fit(X_train, Y_train)

## Test-set predictions are kept in their own variable because the ensemble cell reuses them
lgbm_preds = lgbm_md.predict(X_test)
sub['fare_amount'] = lgbm_preds
sub.to_csv('Submissions/lgbm_rd2.csv', index = False)

In [None]:
## Ensemble Model:
## The predictions of the random forest, XGBoost and LightGBM models fitted in the cells above
## become the three input columns ('rf', 'xgb', 'lgbm') of a random-forest meta-model.

## Constructing the training data
## NOTE(review): the base models were fitted on X_train, so these are in-sample predictions;
## out-of-fold predictions may be a safer training signal for the meta-model -- TODO confirm
rf_preds_train = rf_md.predict(X_train)
XGB_preds_train = XGB_md.predict(X_train)
lgbm_preds_train = lgbm_md.predict(X_train)

X_train_ensemble = pd.DataFrame({'rf': rf_preds_train, 'xgb': XGB_preds_train, 'lgbm': lgbm_preds_train})
X_test_ensemble = pd.DataFrame({'rf': rf_preds, 'xgb': XGB_preds, 'lgbm': lgbm_preds})

## Building the model
## random_state is fixed so the meta-model and its submission file are reproducible across runs
ensemble_md = RandomForestRegressor(max_depth = 3, n_estimators = 500, random_state = 543).fit(X_train_ensemble, Y_train)

## The meta-model was trained on the three stacked prediction columns, so it has to be scored on
## X_test_ensemble; the raw X_test frame has a different set of columns
sub['fare_amount'] = ensemble_md.predict(X_test_ensemble)

sub.to_csv('Submissions/ensemble_rd2.csv', index = False)