### Station based Train/Test/Split

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# machine learning
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import plot_importance, plot_tree
from math import sqrt
from sklearn.model_selection import train_test_split, KFold, cross_val_score ,RandomizedSearchCV, GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import RFE, RFECV
from catboost import CatBoostRegressor
import shap

import warnings
warnings.filterwarnings('ignore')


### Load and view data

In [2]:
OD = pd.read_excel("../Updated Variable list github/gabe/Modeling/Outputs/all_riders_vars_dataframe.xlsx").drop('Unnamed: 0', axis=1)

* Split the dataset into train, test, and val set

In [3]:
pd.reset_option('all')
# pd.set_option('display.max_rows', None)

In [4]:
OD = OD.groupby('pairs').first().reset_index()

In [5]:
OD.rename({'proportionhouses_O': 'proportion_of_households_O', 
           'proportionhouses_D': 'proportion_of_households_D'}, axis=1, inplace=True)

In [6]:
OD.head()

Unnamed: 0,pairs,passengers,ID_D,PRIMARY_NAME_FY23_D,walkshed_filename_D,WMATA_filename_D,track_miles_names_D,auto_filename_D,ID_O,PRIMARY_NAME_FY23_O,...,prox3norm_O,conn6norm_O,parks2norm_O,peds4norm_O,safenorm_O,traffic5norm_O,aesttot3norm_O,paf2norm_O,sop7norm_O,urban_dummy_O
0,MSTN_0010MSTN_002,1418,MSTN_002,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,Archives-Navy Memorial,Archives,ARCHIVES-NAVY MEMORIAL-PENN QUARTER,MSTN_001,Anacostia,...,10.285137,75.577863,3.443532,31.693741,77.080763,54.616166,50.629299,0.34611,47.471791,0
1,MSTN_0010MSTN_003,156,MSTN_003,Benning Road,BENNING ROAD,Benning Road,Benning Road,BENNING ROAD,MSTN_001,Anacostia,...,10.285137,75.577863,3.443532,31.693741,77.080763,54.616166,50.629299,0.34611,47.471791,0
2,MSTN_0010MSTN_004,425,MSTN_004,Brookland-CUA,BROOKLAND-CUA,Brookland,Brookland-CUA,BROOKLAND-CUA,MSTN_001,Anacostia,...,10.285137,75.577863,3.443532,31.693741,77.080763,54.616166,50.629299,0.34611,47.471791,0
3,MSTN_0010MSTN_005,146,MSTN_005,Capitol South,CAPITOL SOUTH,Capitol South,Capitol South,CAPITOL SOUTH,MSTN_001,Anacostia,...,10.285137,75.577863,3.443532,31.693741,77.080763,54.616166,50.629299,0.34611,47.471791,0
4,MSTN_0010MSTN_006,124,MSTN_006,Cleveland Park,CLEVELAND PARK,Cleveland Park,Cleveland Park,CLEVELAND PARK,MSTN_001,Anacostia,...,10.285137,75.577863,3.443532,31.693741,77.080763,54.616166,50.629299,0.34611,47.471791,0


In [7]:
OD.shape

(8187, 148)

In [8]:
OD.drop(['pairs', 'ID_D', 'PRIMARY_NAME_FY23_D', 'walkshed_filename_D', 'WMATA_filename_D', 
           'track_miles_names_D', 'auto_filename_D', 'ID_O', 'PRIMARY_NAME_FY23_O', 'walkshed_filename_O', 
             'WMATA_filename_O', 'track_miles_names_O', 'auto_filename_O', 'O', 'D', 'Name_1_O_D', 'Name_1_D_D', 'Name_1_O_O',
             'MSTN_ID_D', 'STATION_ID_D', 'MSTN_D', 'MSTN_ID_O', 'STATION_ID_O', 'MSTN_O', 
              'Total Households_O', 'Total Households_D', 'COMP_MILE', 'track_miles', 'SD_FARE', 'TRAVEL_TIME', 'pub_admin_jobs_O', 
            'pub_admin_jobs_D', 'bus_tt_per_mile', 'am_parking_user', 'pm_parking_user', 'off_parking_user', 'peak_fare_per_mile2',
            'off_peak_fare_per_mile', 'google_driving_miles', 'am_new_auto_tt_per_mile2', 'pm_new_auto_tt_per_mile2', 'off_new_auto_tt_per_mile2',
             'OFF_PEAK_FARE', 'PEAK_FARE', 'HH_1_car_O', 'HH_1_car_D', 'HH_more1_car_O', 'HH_more1_car_D', 'Name_1_D_O'
             
          ], axis=1, inplace=True)  # include back '', '', and 'O_MSTN_ID', 'D_MSTN_ID'

OD.shape

(8187, 99)

### Preprocessing

In [9]:
# X = OD.drop('passengers', axis=1)
# y = OD['passengers']

In [10]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.reset_option('all')

In [11]:
# X.isna().any().sum()

In [12]:
# y.isna().any().sum()

In [13]:
X = OD.fillna(0.000001)

### Remove highly collinear variables

* xx of these variables were dropped and due to very high multicollinearity and risk of endogeneity problem

In [14]:
# Xcorr = X.corr()
# plt.figure(figsize=(35, 30))
# sns.heatmap(Xcorr, cmap='coolwarm')

In [15]:
X1 = X.copy()

X1 = X1.drop(['All_Jobs_O', 'All_Jobs_D', 'ctpp_jobs_O', 'ctpp_jobs_D', 'urban_dummy_O', 'urban_dummy_D', 'terminal_dummy_2023_O', 
              'terminal_dummy_2023_D', 'AM_AVG_TRAINS_O',  'AM_AVG_TRAINS_D', 'PM_AVG_TRAINS_O', 'PM_AVG_TRAINS_D', 'am_new_auto_tt2', 
              'pm_new_auto_tt2', 'off_new_auto_tt2','bus_transit_minutes', 'aesttot3norm_D', 'aesttot3norm_O', 'conn6norm_D', 'conn6norm_O',
             'dens2norm_D', 'dens2norm_O', 'form3norm_D', 'form3norm_O', 'paf2norm_D', 'paf2norm_O', 'parks2norm_D', 'parks2norm_O', 
              'peds4norm_D', 'peds4norm_O', 'prox3norm_D', 'prox3norm_O', 'safenorm_D', 'safenorm_O', 
             'traffic5norm_D', 'traffic5norm_O', 'trip_time_mins','bikelane_D', 'bikelane_O'], axis=1)  # I took out  'sop7norm_D', 'sop7norm_O', 


In [16]:
# sorted(X1.columns.to_list())
X1.shape

(8187, 60)

### Modelling

In [17]:
pd.reset_option('all')

### Splitting train / val / test split

#### train, val, test

In [18]:
rfecv_28 = ['bus_tt', 'sop7norm_D', 'sop7norm_O',  'bike_traveltime', 'google_driving_minutes',
           'metro_transit_minutes', 'bus_competativeness_index',
           'ALLPERIODS_AVG_TRAINS_D', 'proportion_of_households_D',
           'PARKING_CAPACITY_D', 'distance_to_core_D',
           'Total_Nine_to_Five_workers_D', 'Proportion_education_jobs_D',
           'bike_cap_D', 'hotelcount_D', 'restaurantcount_D', 'intercityhub_D',
           'HH_0_car_D', 'ALLPERIODS_AVG_TRAINS_O', 'proportion_of_households_O',
           'PARKING_CAPACITY_O', 'distance_to_core_O',
           'Proportion_night_weekend_jobs_O', 'Total_Nine_to_Five_workers_O',
           'Median_household_income_O', 'bike_cap_O', 'restaurantcount_O',
           'intercityhub_O', 'prop_str_dens_O', 'HH_0_car_O', 'O_MSTN_ID', 
            'passengers',] # add O_MSTN_ID for the splitting sake. take out </--Name_1_O_O and D_D done--/> 

In [19]:
X1 = X1[rfecv_28]

75%, 15%, 10% <br />
65, 14, 12 stations selected at random and shuffled. try this and try selected at random

##### Station specific train_test split

In [20]:
X1['O_ID'] = X1['O_MSTN_ID'].str.split('_').str[1].astype('int')

In [21]:
X1.tail()

Unnamed: 0,bus_tt,sop7norm_D,sop7norm_O,bike_traveltime,google_driving_minutes,metro_transit_minutes,bus_competativeness_index,ALLPERIODS_AVG_TRAINS_D,proportion_of_households_D,PARKING_CAPACITY_D,...,Total_Nine_to_Five_workers_O,Median_household_income_O,bike_cap_O,restaurantcount_O,intercityhub_O,prop_str_dens_O,HH_0_car_O,O_MSTN_ID,passengers,O_ID
8182,1e-06,45.440273,42.531423,1e-06,45.48,68,0.925548,5.386364,3.467023,633.0,...,300.750951,145980.0,53.0,9.0,1e-06,233.715791,0.0,MSTN_091,67,91
8183,122.0,45.514203,42.531423,1e-06,8.47,13,0.646123,2.886364,1.34139,1e-06,...,300.750951,145980.0,53.0,9.0,1e-06,233.715791,0.0,MSTN_091,841,91
8184,85.0,40.516541,42.531423,1e-06,8.57,11,0.590763,2.8,0.645197,1e-06,...,300.750951,145980.0,53.0,9.0,1e-06,233.715791,0.0,MSTN_091,2721,91
8185,108.0,41.561406,42.531423,1e-06,11.05,9,0.404494,2.755556,1.111354,1e-06,...,300.750951,145980.0,53.0,9.0,1e-06,233.715791,0.0,MSTN_091,843,91
8186,61.0,44.169351,42.531423,1e-06,7.17,7,0.465735,2.755556,1.415941,1e-06,...,300.750951,145980.0,53.0,9.0,1e-06,233.715791,0.0,MSTN_091,895,91


In [22]:
train = X1[X1['O_ID']<66].reset_index(drop=True)

In [23]:
# train['passengers'].std()
# train['passengers'].max() - train['passengers'].min()
# train.passengers.describe()

In [24]:
X_train = train.drop(['passengers', 'O_MSTN_ID', 'O_ID'], axis=1)
y_train = train['passengers']

In [25]:
val = X1[(X1['O_ID']>65) & (X1['O_ID']<80)].reset_index(drop=True)
X_val = val.drop(['passengers', 'O_MSTN_ID', 'O_ID'], axis=1)
y_val = val['passengers']

In [26]:
# X_val

In [27]:
test = X1[(X1['O_ID']>79)].reset_index(drop=True)
X_test = test.drop(['passengers', 'O_MSTN_ID', 'O_ID'], axis=1)
y_test = test['passengers']

In [28]:
X_train.shape, y_train.shape,X_val.shape, y_val.shape,X_test.shape, y_test.shape,

((5847, 30), (5847,), (1260, 30), (1260,), (1080, 30), (1080,))

In [29]:
# X_train.columns

##### modeling

In [30]:
regr = RandomForestRegressor(random_state=10)

In [31]:
regr.fit(X_train, y_train)

##### Predict on Train

In [32]:
train_pred = regr.predict(X_train)

In [33]:
train_rmse = sqrt(mean_squared_error(y_train, train_pred))
train_rmse

278.99208782448954

In [34]:
np.round(r2_score(y_train, train_pred), 3)

0.96

##### Predict on Validation set

In [35]:
y_pred = regr.predict(X_val)

In [36]:
rmse = sqrt(mean_squared_error(y_val, y_pred))
rmse

702.7483569623862

In [37]:
np.round(r2_score(y_val, y_pred), 3)

0.605

##### Predict on hold out Test set

In [38]:
test_pred = regr.predict(X_test)

In [39]:
rmse = sqrt(mean_squared_error(y_test, test_pred))

In [40]:
rmse

498.7349272554881

In [41]:
np.round(r2_score(y_test, test_pred), 3)

0.307

##### Splitting train, val, test stations with Random selection

In [42]:
df = X1.copy()
seed = 26
unique_stations = df['O_ID'].unique()

np.random.seed(seed)
shuffled_stations = np.random.permutation(unique_stations)

In [43]:
train_stations = shuffled_stations[:65]
val_stations = shuffled_stations[65:79]
test_stations = shuffled_stations[79:]

In [44]:
print('train_stations_shuffled: \n', train_stations)
print('  ')
print('val_stations_shuffled: \n', val_stations)
print('  ')
print('test_stations_shuffled: \n', test_stations)

train_stations_shuffled: 
 [19 79 44  6 50 82 34  5 68 21 37 32 60  9 77 58 74  3 47  8 55 35 27 65
 67 73 76 12 59 26 38  4  2 10 42 14 25 64 11 23 90 40 71 75 39 28 86 15
  1 41 89 88 85 80 45 81 29 36 52 30 62 20 87 51 61]
  
val_stations_shuffled: 
 [83 17 70 13 72 57 48 24 43 16 31 22 53 91]
  
test_stations_shuffled: 
 [56 18 46 69 33 78 84 66 49  7 63 54]


In [45]:
rtrain = df[df['O_ID'].isin(train_stations)].sample(frac=1)
rX_train = rtrain.drop(['passengers', 'O_MSTN_ID', 'O_ID'], axis=1)
ry_train = rtrain['passengers']

In [46]:
rVal = df[df['O_ID'].isin(val_stations)].sample(frac=1)
rX_val = rVal.drop(['passengers', 'O_MSTN_ID', 'O_ID'], axis=1)
ry_val = rVal['passengers']

In [47]:
rTest = df[df['O_ID'].isin(test_stations)].sample(frac=1)
rX_test = rTest.drop(['passengers', 'O_MSTN_ID', 'O_ID'], axis=1)
ry_test = rTest['passengers']

In [48]:
# rtrain
# train_station_names = list(set(rtrain['Name_1_O_O'])) # uncomment after you've included the 'Name_1_O_O column in the df
# val_station_names = list(set(rVal['Name_1_O_O']))   # uncomment after you've included the 'Name_1_O_O column in the df
# test_station_names = list(set(rTest['Name_1_O_O'])) # uncomment after you've included the 'Name_1_O_O column in the df
# test_station_names

In [49]:
rX_train.shape, ry_train.shape, rX_val.shape, ry_val.shape, rX_test.shape, ry_test.shape,

((5847, 30), (5847,), (1260, 30), (1260,), (1080, 30), (1080,))

In [50]:
# tlist = ['STADIUM-ARMORY', "PRINCE GEORGE'S PLAZA", 'VIRGINIA SQUARE-GMU', 'GREENBELT', 'COLUMBIA HEIGHTS', 'CAPITOL HEIGHTS',
#          'BRADDOCK ROAD', 'FRIENDSHIP HEIGHTS', 'CLARENDON', 'WHEATON', 'VIENNA/FAIRFAX-GMU', 'NEW CARROLLTON']


# for i in tlist:
#     print(i)

#### modeling

In [232]:
rregr = RandomForestRegressor(random_state=10, max_depth=10, oob_score=True)

In [233]:
rregr.fit(rX_train, ry_train)

In [234]:
rregr.oob_score_

0.6639362320863204

##### Predict on Train

In [235]:
rtrain_pred = rregr.predict(rX_train)

In [236]:
rtrain_rmse = sqrt(mean_squared_error(ry_train, rtrain_pred))
rtrain_rmse

365.88509442915597

In [237]:
np.round(r2_score(ry_train, rtrain_pred), 3)

0.916

##### Predict on Validation set

In [238]:
ry_pred = rregr.predict(rX_val)

In [239]:
rrmse = sqrt(mean_squared_error(ry_val, ry_pred))
rrmse

966.5442197659164

In [240]:
np.round(r2_score(ry_val, ry_pred), 3)

0.605

##### Predict on hold out Test set

In [241]:
rtest_pred = rregr.predict(rX_test)

In [242]:
rrmse = sqrt(mean_squared_error(ry_test, rtest_pred))

In [243]:
rrmse

616.3938517807253

In [244]:
np.round(r2_score(ry_test, rtest_pred), 3)

0.655

##### kfold

In [163]:
kfold = KFold(n_splits=5, shuffle=True, random_state=5)
krreg = RandomForestRegressor(random_state=10)

scores = cross_val_score(krreg, rX_train, ry_train, cv=kfold, scoring='r2')

print(f"R-squared: {scores.mean():.2f}")
print('scores :', scores )

R-squared: 0.66
scores : [0.60129491 0.69638463 0.68217143 0.66107318 0.65342264]


In [164]:
krreg.fit(rX_train, ry_train)

In [166]:
kfold_val = krreg.predict(rX_val)

In [167]:
np.round(r2_score(ry_val, kfold_val), 3)

0.62

#### GridSearch

In [70]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],           # Number of trees in the forest
    'max_depth': [10, 20, 30, None],          # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],          # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],            # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'], # Number of features to consider at every split
    'bootstrap': [True, False]                # Method of selecting samples for training each tree
}

In [71]:
regr = RandomForestRegressor(random_state=10)
rgrid_search = GridSearchCV(estimator=regr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='r2')

In [72]:
rgrid_search.fit(rX_train, ry_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; tota

[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   5.0s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   4.9s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   4.8s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   4.9s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   4.8s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   6.1s
[CV] END bootstrap=False, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   6.3s
[CV] END bootstrap=False, m

In [60]:
print("Best parameters:", rgrid_search.best_params_)
print("Best R2 score:", rgrid_search.best_score_)

Best parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best R2 score: 0.507592555201988
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   1.5s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total tim

##### Predict on train

In [73]:
rtrain_pred = rgrid_search.predict(rX_train)

In [74]:
grrmse = sqrt(mean_squared_error(ry_train, rtrain_pred))
grrmse

0.00383217872127351

In [75]:
np.round(r2_score(ry_train, rtrain_pred), 3)

1.0

##### Predict on validation

In [79]:
gry_pred = rgrid_search.predict(rX_val)

In [80]:
grrmse = sqrt(mean_squared_error(ry_val, gry_pred))
grrmse

981.3441627465327

In [81]:
np.round(r2_score(ry_val, gry_pred), 3)

0.593

##### Predict on hold-out test

In [82]:
rtest_pred = rgrid_search.predict(rX_test)

In [83]:
grrmse = sqrt(mean_squared_error(ry_test, rtest_pred))
grrmse

615.3630688857495

In [84]:
np.round(r2_score(ry_test, rtest_pred), 3)

0.657

#### sklearn train_test_split

In [70]:
y1t =  X1['passengers']
X1t = X1.drop(['passengers', 'O_MSTN_ID', 'O_ID'], axis=1)

In [71]:
X_train_temp, X_test, y_train_temp, y_test = train_test_split(X1t, y1t, shuffle=True, test_size=0.10, random_state = 10)
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, shuffle=True, test_size=0.15, random_state=10)

In [72]:
X_train.shape, X_val.shape, X_test.shape

((6262, 30), (1106, 30), (819, 30))

In [73]:
regr = RandomForestRegressor(random_state=10)
regr.fit(X_train, y_train)

In [74]:
# X_train_temp.shape, X_test.shape, X_val.shape
# X_val

###### Predict on Train

In [75]:
train_pred = regr.predict(X_train)
rmse = sqrt(mean_squared_error(y_train, train_pred))
rmse

260.4131639352177

In [76]:
np.round(r2_score(y_train, train_pred), 3)

0.96

###### Predict on Validation

In [77]:
y_pred = regr.predict(X_val)

In [78]:
rmse = sqrt(mean_squared_error(y_val, y_pred))
rmse

677.0307116352163

In [79]:
np.round(r2_score(y_val, y_pred), 3)

0.718

###### Predict on Test

In [80]:
test_pred = regr.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, test_pred))
rmse

620.4506387185065

In [81]:
np.round(r2_score(y_test, test_pred), 3)

0.733

###### Predict or random val and test

In [53]:
sklearn_y_pred = regr.predict(rX_val)
sklearn_rmse = sqrt(mean_squared_error(ry_val, sklearn_y_pred))
sklearn_rmse

505.2497293141912

In [54]:
np.round(r2_score(ry_val, sklearn_y_pred), 3)

0.892

#### XGBoost

In [138]:
xgbparams_grid = {
    'colsample_bytree': [0.3, 0.7],
    'learning_rate': [0.01, 0.1],
    'max_depth': [5, 10],
    'alpha': [1, 10],
    'n_estimators': [100, 200]
}

xg_reg = xgb.XGBRegressor(objective='reg:squarederror')

xg_grid_search = GridSearchCV(estimator=xg_reg, param_grid=xgbparams_grid, scoring='r2', cv=5, verbose=1, n_jobs=-1)

In [139]:
xg_grid_search.fit(rX_train, ry_train)  # Fit GridSearchCV

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [140]:
print("Best parameters:", xg_grid_search.best_params_)
print("Best R2 score:", xg_grid_search.best_score_)

Best parameters: {'alpha': 10, 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best R2 score: 0.48257393564077306


In [141]:
# Train the model with the best parameters
xg_reg_optimized = xgb.XGBRegressor(**xg_grid_search.best_params_, objective='reg:squarederror')
xg_reg_optimized.fit(rX_train, ry_train)

In [142]:
predictions = xg_reg_optimized.predict(rX_val)

In [143]:
# Evaluate the predictions using R² score
r2 = r2_score(ry_val, predictions)
print("Test R2 score:", r2)

Test R2 score: 0.5975953923490563


In [144]:
kfold = KFold(n_splits=5, shuffle=False, random_state=10)

In [145]:
scores = cross_val_score(xg_grid_search, rX_train, ry_train, cv=kfold, scoring='r2')

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [146]:
print(f"R-squared: {scores.mean():.2f}")

R-squared: 0.75


In [150]:
rmse = cross_val_score(xg_grid_search, rX_train, ry_train, cv=kfold, scoring='neg_mean_squared_error')

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [151]:
np.mean(np.sqrt(-rmse))

623.4791594608245

In [156]:
xtestpred = xg_reg_optimized.predict(rX_test)

In [158]:
r2_score(ry_test, xtestpred)

0.7011219034497408

In [None]:
# rmse = sqrt(mean_squared_error(ry_test, xtestpred))
# rmse