### W23P1 STAT 857 - Modeling

In [None]:
pip install xgboost lightgbm catboost

In [1]:
## Importing libraries
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor 
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

## Showing up to 100 columns whenever a data-frame is displayed
pd.options.display.max_columns = 100

In [None]:
## Loading the training set, the testing set and the sample submission (in that order)
train, test, sub = (pd.read_csv(csv_path) for csv_path in ('Data/W23P1_train_final.csv', 
                                                            'Data/W23P1_test_final.csv', 
                                                            'Data/W23P1_sample_submission.csv'))

### Baseline Models:

In [None]:
## Target: the fare amount; inputs: every other column of the training set
Y_train = train['fare_amount']
X_train = train.drop('fare_amount', axis = 1)

## The testing set is used as-is as the prediction input
X_test = test

In [None]:
## Random Forest Model: 500 trees limited to a depth of 3
rf_md = RandomForestRegressor(n_estimators = 500, max_depth = 3)
rf_md.fit(X_train, Y_train)

## Predicting on the testing set and writing the submission file
sub['fare_amount'] = rf_md.predict(X_test)
sub.to_csv('Submissions/rf_baseline.csv', index = False)

In [None]:
## XGBoost Model: histogram tree method, 1000 boosting rounds
XGB_md = XGBRegressor(tree_method = 'hist', n_estimators = 1000, learning_rate = 0.01, max_depth = 7, 
                      min_child_weight = 10, gamma = 0.8, subsample = 0.7, colsample_bytree = 0.7)
XGB_md.fit(X_train, Y_train)

## Predicting on the testing set and writing the submission file
sub['fare_amount'] = XGB_md.predict(X_test)
sub.to_csv('Submissions/xgb_baseline.csv', index = False)

In [None]:
## LightGBM Model: 1000 boosting rounds with L1 and L2 penalties of 3
## NOTE(review): bagging_fraction normally only takes effect together with a non-zero bagging_freq --
## confirm whether row subsampling was intended here
lgb_md = LGBMRegressor(n_estimators = 1000, learning_rate = 0.01, max_depth = 7, num_leaves = 20, 
                       lambda_l1 = 3, lambda_l2 = 3, bagging_fraction = 0.7, feature_fraction = 0.7)
lgb_md.fit(X_train, Y_train)

## Predicting on the testing set and writing the submission file
sub['fare_amount'] = lgb_md.predict(X_test)
sub.to_csv('Submissions/lgbm_baseline.csv', index = False)

In [None]:
## CatBoost Model: RMSE loss, 1000 iterations, training log silenced
cat_md = CatBoostRegressor(loss_function = 'RMSE', iterations = 1000, learning_rate = 0.01, depth = 7, 
                           l2_leaf_reg = 5, border_count = 30, random_strength = 0.5, 
                           bagging_temperature = 0.7, verbose = False)
cat_md.fit(X_train, Y_train)

## Predicting on the testing set and writing the submission file
sub['fare_amount'] = cat_md.predict(X_test)
sub.to_csv('Submissions/cat_baseline.csv', index = False)

### Second Round of Models: with optimized parameters

In [None]:
## Random Forest Model (second round settings):
rf_md = RandomForestRegressor(n_estimators = 12, max_depth = 300, min_samples_leaf = 6, 
                              min_samples_split = 5)
rf_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
rf_preds = rf_md.predict(X_test)

## Writing the submission file
sub['fare_amount'] = rf_preds
sub.to_csv('Submissions/rf_rd2.csv', index = False)

In [None]:
## XGBoost Model (second round settings):
XGB_md = XGBRegressor(tree_method = 'hist', n_estimators = 500, learning_rate = 0.02, max_depth = 5, 
                      min_child_weight = 10, gamma = 0.2, subsample = 0.94, colsample_bytree = 0.92, 
                      verbosity = 0)
XGB_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
XGB_preds = XGB_md.predict(X_test)

## Writing the submission file
sub['fare_amount'] = XGB_preds
sub.to_csv('Submissions/xgb_rd2.csv', index = False)

In [None]:
## LightGBM Model (second round settings, DART boosting):
lgbm_md = LGBMRegressor(boosting_type = 'dart', objective = 'rmse', n_estimators = 600, learning_rate = 0.18, 
                        num_leaves = 8, max_depth = 8, subsample = 0.73, colsample_bytree = 0.86, 
                        reg_alpha = 0.021, reg_lambda = 0.027, random_state = 543, verbosity = -1)
lgbm_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
lgbm_preds = lgbm_md.predict(X_test)

## Writing the submission file
sub['fare_amount'] = lgbm_preds
sub.to_csv('Submissions/lgbm_rd2.csv', index = False)

In [None]:
## Ensemble Model: a random forest fitted on the predictions of the three models

## Predictions of each model on the training data
## NOTE(review): these are predictions on the same rows the three models were fitted on;
## out-of-fold predictions may be a safer input for the ensemble -- confirm
rf_preds_train = rf_md.predict(X_train)
XGB_preds_train = XGB_md.predict(X_train)
lgbm_preds_train = lgbm_md.predict(X_train)

## One column per model, for the training and for the testing data
X_train_ensemble = pd.DataFrame(dict(rf = rf_preds_train, xgb = XGB_preds_train, lgbm = lgbm_preds_train))
X_test_ensemble = pd.DataFrame(dict(rf = rf_preds, xgb = XGB_preds, lgbm = lgbm_preds))

## Building the model
ensemble_md = RandomForestRegressor(n_estimators = 500, max_depth = 3)
ensemble_md.fit(X_train_ensemble, Y_train)

## Predicting on the testing set and writing the submission file
sub['fare_amount'] = ensemble_md.predict(X_test_ensemble)
sub.to_csv('Submissions/ensemble_rd2.csv', index = False)

### Third Round of Models:

In [None]:
## Columns used as inputs in this round
variables = ['distance', 'haversine', 'duration', 'passenger_count', 
             'pickup_day', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'weekend', 
             'pickup_hour', 'rush_hour', 'overnight', 
             'pickup_LGA', 'dropoff_LGA', 'pickup_JFK', 'dropoff_JFK', 'pickup_EWR', 'dropoff_EWR', 'airport', 
             'change_borough', 
             'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

## Defining input and target variables
Y_train = train['fare_amount']
X_train = train.loc[:, variables]
X_test = test.loc[:, variables]

In [None]:
## Random Forest Model:
rf_md = RandomForestRegressor(n_estimators = 12, max_depth = 300, min_samples_leaf = 6, 
                              min_samples_split = 5)
rf_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
rf_preds = rf_md.predict(X_test)
sub['fare_amount'] = rf_preds

## Writing this submission file is switched off
#sub.to_csv('Submissions/rf_rd3.csv', index = False)

In [None]:
## XGBoost Model:
XGB_md = XGBRegressor(tree_method = 'hist', n_estimators = 500, learning_rate = 0.02, max_depth = 5, 
                      min_child_weight = 10, gamma = 0.2, subsample = 0.94, colsample_bytree = 0.92, 
                      verbosity = 0)
XGB_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
XGB_preds = XGB_md.predict(X_test)
sub['fare_amount'] = XGB_preds

## Writing this submission file is switched off
#sub.to_csv('Submissions/xgb_rd3.csv', index = False)

In [None]:
## LightGBM Model (DART boosting):
lgbm_md = LGBMRegressor(boosting_type = 'dart', objective = 'rmse', n_estimators = 600, learning_rate = 0.18, 
                        num_leaves = 8, max_depth = 8, subsample = 0.73, colsample_bytree = 0.86, 
                        reg_alpha = 0.021, reg_lambda = 0.027, random_state = 543, verbosity = -1)
lgbm_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
lgbm_preds = lgbm_md.predict(X_test)
sub['fare_amount'] = lgbm_preds

## Writing this submission file is switched off
#sub.to_csv('Submissions/lgbm_rd3.csv', index = False)

In [None]:
## Ensemble Model: a random forest fitted on the predictions of the three models

## Predictions of each model on the training data
## NOTE(review): these are predictions on the same rows the three models were fitted on;
## out-of-fold predictions may be a safer input for the ensemble -- confirm
rf_preds_train = rf_md.predict(X_train)
XGB_preds_train = XGB_md.predict(X_train)
lgbm_preds_train = lgbm_md.predict(X_train)

## One column per model, for the training and for the testing data
X_train_ensemble = pd.DataFrame(dict(rf = rf_preds_train, xgb = XGB_preds_train, lgbm = lgbm_preds_train))
X_test_ensemble = pd.DataFrame(dict(rf = rf_preds, xgb = XGB_preds, lgbm = lgbm_preds))

## Building the model
ensemble_md = RandomForestRegressor(n_estimators = 500, max_depth = 3)
ensemble_md.fit(X_train_ensemble, Y_train)

## Predicting on the testing set and writing the submission file
sub['fare_amount'] = ensemble_md.predict(X_test_ensemble)
sub.to_csv('Submissions/ensemble_rd3.csv', index = False)

In [None]:
## Pairing testing rows with training rows that share the same pickup and dropoff coordinates
ins = pd.merge(test, train, how = 'inner', 
               on = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'])
ins.head()

In [None]:
## Equal-weight average of the random forest, XGBoost and LightGBM test predictions
sub['fare_amount'] = (rf_preds + XGB_preds + lgbm_preds) / 3
print(np.mean(sub['fare_amount']))

## Row positions of the submission whose prediction is replaced, and the fare written to each of them
## NOTE(review): presumably derived from the train/test coordinate matches -- confirm the source of these values
ind = [631,  1044,  8718,  9614,  9784, 11654, 12032, 12810, 13254, 13374, 16042, 17660, 18328, 18930, 20616, 22499, 22829, 23702, 
       25597, 25958, 26925, 27347, 27552, 31726, 33099, 33668]
fare = [6.5,  6.5,  6.66666667, 6.5, 6.5, 11.4, 14.25, 14.25, 6.5, 5.5, 51.625, 51.625, 9.5, 5.5, 11.4, 18.75, 13.5, 6.5, 18.75, 14.25, 
        6.66666667, 7.75, 6.5, 14.25, 6.5, 14.25]

## Each position needs exactly one replacement fare
assert len(ind) == len(fare)

## Writing through one positional indexer on the data-frame itself; the chained form
## sub['fare_amount'].iloc[ind] = fare assigns into an intermediate object and may leave sub unchanged
sub.iloc[ind, sub.columns.get_loc('fare_amount')] = fare

## Mean after the replacement, to compare against the mean printed above
print(np.mean(sub['fare_amount']))
sub.to_csv('Submissions/ensemble_average4.csv', index = False)

## Alternative weighting kept for reference (not executed)
# sub['fare_amount'] = (rf_preds + 3*XGB_preds + 2*lgbm_preds) / 6

# sub.to_csv('Submissions/ensemble_average3.csv', index = False)

### Fourth Round: Optimized HPs and top-10 variables

In [None]:
## The ten columns used as inputs in this round
variables = ['distance', 'haversine', 'dropoff_longitude', 'duration', 'pickup_longitude', 
             'dropoff_EWR', 'EWR', 'dropoff_airport', 'pickup_airport', 'dropoff_JFK']

## Defining input and target variables
Y_train = train['fare_amount']
X_train = train.loc[:, variables]
X_test = test.loc[:, variables]

In [None]:
## Random Forest Model:
rf_md = RandomForestRegressor(n_estimators = 10, max_depth = 100, min_samples_leaf = 8, 
                              min_samples_split = 8)
rf_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
rf_preds = rf_md.predict(X_test)

## Writing the submission file
sub['fare_amount'] = rf_preds
sub.to_csv('Submissions/rf_rd4.csv', index = False)

In [None]:
## XGBoost Model:
XGB_md = XGBRegressor(tree_method = 'hist', n_estimators = 500, learning_rate = 0.01, max_depth = 10, 
                      min_child_weight = 15, gamma = 0.13, subsample = 0.71, colsample_bytree = 0.7, 
                      verbosity = 0)
XGB_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
XGB_preds = XGB_md.predict(X_test)

## Writing the submission file
sub['fare_amount'] = XGB_preds
sub.to_csv('Submissions/xgb_rd4.csv', index = False)

In [None]:
## LightGBM Model (DART boosting):
lgbm_md = LGBMRegressor(boosting_type = 'dart', objective = 'rmse', n_estimators = 500, learning_rate = 0.1, 
                        num_leaves = 29, max_depth = 3, subsample = 0.93, colsample_bytree = 0.73, 
                        reg_alpha = 0.065, reg_lambda = 0.047, random_state = 433, verbosity = -1)
lgbm_md.fit(X_train, Y_train)

## The test predictions get a name of their own so they can be reused after this cell
lgbm_preds = lgbm_md.predict(X_test)

## Writing the submission file
sub['fare_amount'] = lgbm_preds
sub.to_csv('Submissions/lgbm_rd4.csv', index = False)

In [None]:
## Ensemble Model: a random forest fitted on the predictions of the three models

## Predictions of each model on the training data
## NOTE(review): these are predictions on the same rows the three models were fitted on;
## out-of-fold predictions may be a safer input for the ensemble -- confirm
rf_preds_train = rf_md.predict(X_train)
XGB_preds_train = XGB_md.predict(X_train)
lgbm_preds_train = lgbm_md.predict(X_train)

## One column per model, for the training and for the testing data
X_train_ensemble = pd.DataFrame(dict(rf = rf_preds_train, xgb = XGB_preds_train, lgbm = lgbm_preds_train))
X_test_ensemble = pd.DataFrame(dict(rf = rf_preds, xgb = XGB_preds, lgbm = lgbm_preds))

## Building the model
ensemble_md = RandomForestRegressor(n_estimators = 1000, max_depth = 5)
ensemble_md.fit(X_train_ensemble, Y_train)

## Predicting on the testing set and writing the submission file
sub['fare_amount'] = ensemble_md.predict(X_test_ensemble)
sub.to_csv('Submissions/ensemble_rd4.csv', index = False)

### Fifth Round of Models: Optimized hyper-parameters, new features, full vs. reduced feature sets

In [None]:
## Defining input and target variables (every column except the target is an input)
Y_train_XGB = train['fare_amount']
X_train_XGB = train.drop('fare_amount', axis = 1)

X_test = test

## XGBoost Model:
XGB_md = XGBRegressor(tree_method = 'hist', n_estimators = 300, learning_rate = 0.02, max_depth = 5, 
                      min_child_weight = 6, gamma = 0.2, subsample = 0.99, colsample_bytree = 0.74, 
                      verbosity = 0)
XGB_md.fit(X_train_XGB, Y_train_XGB)

## Predicting on the testing set and writing the submission file
XGB_preds = XGB_md.predict(X_test)
sub['fare_amount'] = XGB_preds
sub.to_csv('Submissions/xgb_rd3a.csv', index = False)

In [None]:
## Input columns left out of this model (removed from both the training and the testing data)
dropped_features = ['Tuesday', 'Wednesday', 'dropoff_LGA', 'pickup_JFK', 'pickup_EWR', 'dropoff_EWR', 
                    'pickup_bronx', 'pickup_brooklyn', 'pickup_staten_island', 'dropoff_bronx', 
                    'dropoff_brooklyn', 'dropoff_queens', 'dropoff_staten_island']

## Defining input and target variables
Y_train_XGB = train['fare_amount']
X_train_XGB = train.drop(columns = ['fare_amount'] + dropped_features)

X_test = test.drop(columns = dropped_features)

## XGBoost Model:
XGB_md = XGBRegressor(tree_method = 'hist', n_estimators = 300, learning_rate = 0.02, max_depth = 5, 
                      min_child_weight = 6, gamma = 0.2, subsample = 0.99, colsample_bytree = 0.74, 
                      verbosity = 0)
XGB_md.fit(X_train_XGB, Y_train_XGB)

## Predicting on the testing set and writing the submission file
XGB_preds = XGB_md.predict(X_test)
sub['fare_amount'] = XGB_preds
sub.to_csv('Submissions/xgb_rd3b.csv', index = False)

In [None]:
## Defining input and target variables (every column except the target is an input)
Y_train_lgbm = train['fare_amount']
X_train_lgbm = train.drop('fare_amount', axis = 1)

X_test = test

## LightGBM Model (DART boosting):
lgbm_md = LGBMRegressor(boosting_type = 'dart', objective = 'rmse', n_estimators = 1200, learning_rate = 0.06, 
                        num_leaves = 9, max_depth = 3, subsample = 0.82, colsample_bytree = 0.9, 
                        reg_alpha = 0.042, reg_lambda = 0.066, random_state = 660, verbosity = -1)
lgbm_md.fit(X_train_lgbm, Y_train_lgbm)

## Predicting on the testing set and writing the submission file
lgbm_preds = lgbm_md.predict(X_test)
sub['fare_amount'] = lgbm_preds
sub.to_csv('Submissions/lgbm_rd3a.csv', index = False)

In [None]:
## Input columns left out of this model (removed from both the training and the testing data)
dropped_features = ['Tuesday', 'Wednesday', 'dropoff_LGA', 'pickup_JFK', 'pickup_EWR', 'dropoff_EWR', 
                    'pickup_bronx', 'pickup_brooklyn', 'pickup_staten_island', 'dropoff_bronx', 
                    'dropoff_brooklyn', 'dropoff_queens', 'dropoff_staten_island']

## Defining input and target variables
Y_train_lgbm = train['fare_amount']
X_train_lgbm = train.drop(columns = ['fare_amount'] + dropped_features)

X_test = test.drop(columns = dropped_features)

## LightGBM Model (DART boosting):
lgbm_md = LGBMRegressor(boosting_type = 'dart', objective = 'rmse', n_estimators = 1200, learning_rate = 0.06, 
                        num_leaves = 9, max_depth = 3, subsample = 0.82, colsample_bytree = 0.9, 
                        reg_alpha = 0.042, reg_lambda = 0.066, random_state = 660, verbosity = -1)
lgbm_md.fit(X_train_lgbm, Y_train_lgbm)

## Predicting on the testing set and writing the submission file
lgbm_preds = lgbm_md.predict(X_test)
sub['fare_amount'] = lgbm_preds
sub.to_csv('Submissions/lgbm_rd3b.csv', index = False)

## Using K-Fold Cross-Validation

In [None]:
## Reading the data
## NOTE(review): unlike the sample submission, the two data files are read from the working
## directory rather than from Data/ -- confirm the paths
train, test = (pd.read_csv(csv_path) for csv_path in ('W23P1_training.csv', 'W23P1_testing.csv'))
sub = pd.read_csv('Data/W23P1_sample_submission.csv')

## Defining the input and target variables (uid and fare_amount are removed from the inputs)
Y_train = train['fare_amount']
X_train = train.drop(['uid', 'fare_amount'], axis = 1)

X_test = test.drop(['uid', 'fare_amount'], axis = 1)

In [None]:
## 10-Fold CV with RandomForest
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

## Lists storing the RMSE of every fold on its training part and on its validation part
train_error = list(); validation_error = list()

for train_index, test_index in kf.split(X_train):
    
    ## Defining the training and validation data
    X_training = X_train.iloc[train_index]; Y_training = Y_train.iloc[train_index]
    X_validation = X_train.iloc[test_index]; Y_validation = Y_train.iloc[test_index]
    
    rf_md = RandomForestRegressor(max_depth = 3, n_estimators = 100).fit(X_training, Y_training)
    
    training_preds = rf_md.predict(X_training)
    validation_preds = rf_md.predict(X_validation)
    
    ## RMSE as the square root of the MSE: the `squared = False` argument of mean_squared_error
    ## is deprecated and no longer accepted by recent scikit-learn releases
    training_rmse = np.sqrt(mean_squared_error(Y_training, training_preds))
    validation_rmse = np.sqrt(mean_squared_error(Y_validation, validation_preds))
    
    train_error.append(training_rmse)
    validation_error.append(validation_rmse)

In [None]:
## Average training error (first line) and average validation error (second line) over the folds
print(np.mean(train_error), np.mean(validation_error), sep = '\n')

In [None]:
## 10-Fold CV with XGBoost
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)

## Lists storing, for every fold, the training RMSE, the validation RMSE and the test predictions
train_error = list(); validation_error = list(); preds = list()

for train_index, test_index in kf.split(X_train):
    
    ## Defining the training and validation data
    X_training = X_train.iloc[train_index]; Y_training = Y_train.iloc[train_index]
    X_validation = X_train.iloc[test_index]; Y_validation = Y_train.iloc[test_index]
    
    xgb_md = XGBRegressor(tree_method = 'hist', learning_rate = 0.1, subsample = 0.9).fit(X_training, Y_training)
    
    training_preds = xgb_md.predict(X_training)
    validation_preds = xgb_md.predict(X_validation)
    testing_preds = xgb_md.predict(X_test)
    
    ## RMSE as the square root of the MSE: the `squared = False` argument of mean_squared_error
    ## is deprecated and no longer accepted by recent scikit-learn releases
    training_rmse = np.sqrt(mean_squared_error(Y_training, training_preds))
    validation_rmse = np.sqrt(mean_squared_error(Y_validation, validation_preds))
    
    train_error.append(training_rmse)
    validation_error.append(validation_rmse)
    preds.append(testing_preds)

## Average training and validation RMSE over the folds
print(np.mean(train_error))
print(np.mean(validation_error))

In [None]:
## Averaging, for every test observation, the predictions of the fold models.
## np.array(preds) holds one row per fold and one column per test observation, so the average is
## taken over axis 0; reshaping that array to (35000, 10) would put predictions of different
## observations in the same row instead of transposing it
sub['fare_amount'] = np.mean(np.array(preds), axis = 0)
sub.to_csv('Submissions/xgb_kfold.csv', index = False)

## Final Shot:

In [16]:
## Reading the data
train, test, sub = (pd.read_csv(csv_path) for csv_path in ('Data/final_shot_train.csv', 
                                                            'Data/final_shot_test.csv', 
                                                            'Data/W23P1_sample_submission.csv'))

## Defining inputs and target
Y = train['fare_amount']
X = train.loc[:, ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine', 
                  'same_lat', 'same_coord_rounded', 
                  'pickup_LGA', 'dropoff_LGA', 'LGA', 'pickup_JFK', 'dropoff_JFK', 'JFK', 
                  'pickup_EWR', 'dropoff_EWR', 'EWR', 'pickup_airport', 'dropoff_airport', 'airport', 
                  'change_borough', 
                  'pickup_bronx', 'pickup_brooklyn', 'pickup_manhattan', 'pickup_other', 'pickup_queens', 
                  'pickup_staten_island', 
                  'dropoff_bronx', 'dropoff_brooklyn', 'dropoff_manhattan', 'dropoff_other', 'dropoff_queens', 
                  'dropoff_staten_island', 
                  'time_estimate', 'distance', 'duration']]

## Holding out 30% of the rows as a validation set (fixed seed)
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, random_state = 365, test_size = 0.3)

In [25]:
## Random Forest
rf_md = RandomForestRegressor(max_depth = 7, n_estimators = 300, min_samples_leaf = 100, random_state = 365).fit(X_train, Y_train)

## Training RMSE, then validation RMSE. The square root of the MSE is taken explicitly because the
## `squared = False` argument of mean_squared_error is deprecated and no longer accepted by recent scikit-learn releases
rf_train_preds = rf_md.predict(X_train)
print(np.sqrt(mean_squared_error(Y_train, rf_train_preds)))

rf_val_preds = rf_md.predict(X_validation)
print(np.sqrt(mean_squared_error(Y_validation, rf_val_preds)))

2.984813327895632
2.5179201755757727


max_depth = 7, n_estimators = 300, min_samples_leaf = 250, random_state = 365 --> train RMSE 3.12, validation RMSE 2.65

max_depth = 7, n_estimators = 300, min_samples_leaf = 200, random_state = 365 --> train RMSE 3.11, validation RMSE 2.63

max_depth = 7, n_estimators = 300, min_samples_leaf = 100, random_state = 365 --> train RMSE 2.98, validation RMSE 2.51

In [41]:
## XGBoost
xgb_md = XGBRegressor(tree_method = 'hist', n_estimators = 700, learning_rate = 0.05, max_depth = 8, gamma = 50, 
                      min_child_weight = 100, subsample = 1, colsample_bytree = 1, seed = 365).fit(X_train, Y_train)

## Training RMSE, then validation RMSE. The square root of the MSE is taken explicitly because the
## `squared = False` argument of mean_squared_error is deprecated and no longer accepted by recent scikit-learn releases
xgb_train_preds = xgb_md.predict(X_train)
print(np.sqrt(mean_squared_error(Y_train, xgb_train_preds)))

xgb_val_preds = xgb_md.predict(X_validation)
print(np.sqrt(mean_squared_error(Y_validation, xgb_val_preds)))

2.700086224274026
2.3675831409458543


n_estimators = 800, learning_rate = 0.09, max_depth = 8, gamma = 300, min_child_weight = 400 --> train RMSE 3.01, validation RMSE 2.53

In [45]:
## LGBM:
lgbm_md = LGBMRegressor(n_estimators = 1000, learning_rate = 0.05, min_data_in_leaf = 100, max_depth = 6, 
                        subsample = 1, colsample_bytree = 1, random_state = 365, reg_alpha = 1, reg_lambda = 1, 
                        objective = 'rmse', verbosity = -1).fit(X_train, Y_train)

## Training RMSE, then validation RMSE. The square root of the MSE is taken explicitly because the
## `squared = False` argument of mean_squared_error is deprecated and no longer accepted by recent scikit-learn releases
lgbm_train_preds = lgbm_md.predict(X_train)
print(np.sqrt(mean_squared_error(Y_train, lgbm_train_preds)))

lgbm_val_preds = lgbm_md.predict(X_validation)
print(np.sqrt(mean_squared_error(Y_validation, lgbm_val_preds)))

2.4234469282543722
2.352782055386873


n_estimators = 1000, learning_rate = 0.01, min_data_in_leaf = 400, max_depth = 5, 
                        subsample = 1, colsample_bytree = 1, random_state = 365, reg_alpha = 20, reg_lambda = 20, 
                        objective = 'rmse', verbosity = -1 --> train RMSE 2.85, validation RMSE 2.88

In [49]:
## Input columns, listed once and applied to both the training and the testing data
final_features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'haversine', 'same_lat', 
                  'same_coord_rounded', 'pickup_LGA', 'dropoff_LGA', 'LGA', 'pickup_JFK', 'dropoff_JFK', 'JFK', 'pickup_EWR', 
                  'dropoff_EWR', 'EWR', 'pickup_airport', 'dropoff_airport', 'airport', 'change_borough', 'pickup_bronx', 
                  'pickup_brooklyn', 'pickup_manhattan', 'pickup_other', 'pickup_queens', 'pickup_staten_island', 'dropoff_bronx', 
                  'dropoff_brooklyn', 'dropoff_manhattan', 'dropoff_other', 'dropoff_queens', 'dropoff_staten_island', 'time_estimate', 
                  'distance', 'duration']

## Defining inputs and target
X = train[final_features]
Y = train['fare_amount']

X_test = test[final_features]

## Building last chance models, fitted on all the rows of X
rf_md = RandomForestRegressor(n_estimators = 300, max_depth = 7, min_samples_leaf = 100, random_state = 365)
rf_md.fit(X, Y)

xgb_md = XGBRegressor(tree_method = 'hist', n_estimators = 700, learning_rate = 0.05, max_depth = 8, 
                      min_child_weight = 100, gamma = 50, subsample = 1, colsample_bytree = 1, seed = 365)
xgb_md.fit(X, Y)

lgbm_md = LGBMRegressor(n_estimators = 1000, learning_rate = 0.05, min_data_in_leaf = 100, max_depth = 6, 
                        subsample = 1, colsample_bytree = 1, random_state = 365, reg_alpha = 1, reg_lambda = 1, 
                        objective = 'rmse')
lgbm_md.fit(X, Y)

## Predicting on the testing set; any prediction below 2.5 is raised to 2.5
rf_md_preds = np.maximum(rf_md.predict(X_test), 2.5)
xgb_md_preds = np.maximum(xgb_md.predict(X_test), 2.5)
lgbm_md_preds = np.maximum(lgbm_md.predict(X_test), 2.5)



In [7]:
## Constructing ensemble predictions: equal-weight average of the three models
model_total = rf_md_preds + xgb_md_preds + lgbm_md_preds
ensemble_final1 = model_total / 3

## Getting predictions in a csv file 
sub['fare_amount'] = ensemble_final1
sub.to_csv('Submissions/ensemble_final1.csv', index = False)

In [50]:
## Loading two earlier submission files
best, best2 = (pd.read_csv(csv_path) for csv_path in ('Submissions/ensemble_average_adjusted.csv', 
                                                      'Submissions/ensemble_average2.csv'))

In [51]:
## Constructing ensemble predictions: equal-weight average of the two earlier submissions and the three models
previous_total = best['fare_amount'] + best2['fare_amount']
ensemble_final2 = (previous_total + rf_md_preds + xgb_md_preds + lgbm_md_preds) / 5

## Getting predictions in a csv file 
sub['fare_amount'] = ensemble_final2
sub.to_csv('Submissions/ensemble_final2.csv', index = False)

In [53]:
## Fares of the two earlier submissions as arrays, with values below 2.5 raised to 2.5
best_preds = np.maximum(best['fare_amount'].to_numpy(), 2.5)
best_preds2 = np.maximum(best2['fare_amount'].to_numpy(), 2.5)

## Weighted average: best counts twice, best2 once
ensemble_final3 = (2*best_preds + best_preds2) / 3

## Getting predictions in a csv file 
sub['fare_amount'] = ensemble_final3
sub.to_csv('Submissions/ensemble_final3.csv', index = False)

In [60]:
## Displaying the values of best_preds that are above 80
best_preds[best_preds > 80]

array([ 85.90644838, 104.38365368, 105.98494003,  81.79215973,
        85.20422355,  85.40628672,  83.26933383,  87.57049199,
        85.96743854,  86.53929781,  97.64057864,  88.07223788,
        93.90235397])

In [62]:
## Displaying the values of ensemble_final3 that are above 80
ensemble_final3[ensemble_final3 > 80]

array([ 85.73042133, 105.79975235, 107.36563559,  82.5407642 ,
        84.82433852,  85.20584115,  83.24782316,  87.68651055,
        85.64669273,  86.62645379,  98.42243436,  89.07372489,
        93.77715118])

In [63]:
## Same weighted average as before: best counts twice, best2 once
ensemble_final4 = (2*best_preds + best_preds2) / 3

## Every blended fare above 100 is set to 120
ensemble_final4[ensemble_final4 > 100] = 120

## Getting predictions in a csv file 
sub['fare_amount'] = ensemble_final4
sub.to_csv('Submissions/ensemble_final4.csv', index = False)