In [2]:
# Mount Google Drive into the Colab filesystem; the data and model folders
# used by the rest of the notebook live under this mount point.
from google.colab import drive

# force_remount=True remounts even when a mount is already present.
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns

import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
import joblib

# XGBoost regressor (scikit-learn style estimator API)
from xgboost import XGBRegressor

import holidays
# Holiday calendar object for the United States, built by the `holidays` package.
# NOTE(review): no other cell visible here references `us_holidays` — confirm it is still needed.
us_holidays = holidays.country_holidays('US')

# Raise pandas' display limits so wide/long frames are shown without truncation.
# set_option accepts several (pattern, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

# Project folders on the mounted Drive: input data and persisted models/predictions.
directory = '/content/drive/MyDrive/Thesis'
data_dir = f"{directory}/Data"
models_dir = f"{directory}/models"

In [4]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [5]:
# Load the model-input table and parse the hourly timestamp column once.
df = pd.read_csv(f'{data_dir}/final_returns_model_input.csv')
df['ended_at_hourly'] = pd.to_datetime(df['ended_at_hourly'])

# Chronological hold-out: rows strictly before 2023-01-01 are used for training,
# rows from that instant onwards for testing. The column is already datetime
# at this point, so it is compared directly.
df_train = df.loc[df['ended_at_hourly'] < pd.Timestamp("2023-01-01 00:00:00")]
df_test = df.loc[df['ended_at_hourly'] >= pd.Timestamp("2023-01-01 00:00:00")]

# The full frame is not needed once both splits exist.
del df

In [6]:
# Load the previously saved scaler object for the target column; train_model
# uses it to map model predictions back to the original scale.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load .sav files that come from a trusted source.
target_scaler = joblib.load(f'{models_dir}/target_returns_scaler.sav')

In [7]:
def _print_metrics(title, actual, predicted):
  """Print MSE, RMSE, MAE and MAPE for one slice of the test predictions.

  Args:
    title: Heading printed above the metric lines.
    actual: Observed target values (original scale).
    predicted: Predicted values on the same scale, aligned with `actual`.
  """
  print(title)
  if len(actual) == 0:
    # The sklearn metric functions raise on empty input; an empty slice
    # (e.g. a test set without any zero rows) is reported instead of crashing.
    print("no rows")
    return

  mse_score = mean_squared_error(actual, predicted)
  rmse_score = np.sqrt(mse_score)
  mae_score = mean_absolute_error(actual, predicted)
  # Both sides are shifted by +1 so rows whose actual value is 0 do not
  # blow up the percentage error.
  mape_score = mean_absolute_percentage_error(actual + 1, predicted + 1)
  print("MSE:", mse_score)
  print("RMSE:", rmse_score)
  print("MAE:", mae_score)
  print("MAPE:", mape_score)


def train_model(df_train, df_test, FEATURES, model='xgb'):
  """Fit a regressor on the training frame and evaluate it on the test frame.

  The regressor is trained on the `bike_return_target` column. Its test
  predictions are passed through the module-level `target_scaler`'s
  `inverse_transform` and compared with the `bike_return` column. Metrics are
  printed for the whole test set and separately for the rows whose actual
  value is non-zero and zero.

  Args:
    df_train: DataFrame holding the FEATURES columns and `bike_return_target`.
    df_test: DataFrame holding the FEATURES columns and `bike_return`.
    FEATURES: List of column names used as model inputs.
    model: 'xgb' for XGBRegressor (default) or 'rf' for
      RandomForestRegressor(random_state=42).

  Returns:
    Tuple `(reg, preds_out)`: the fitted regressor and a copy of the test
    features with `actual_bike_return` and `pred` columns added.

  Raises:
    ValueError: If `model` is neither 'xgb' nor 'rf'.
  """
  TARGET = 'bike_return'
  TARGET_TRAIN = 'bike_return_target'

  # Reject unknown model names up front; previously `reg` was simply left
  # unbound and the failure surfaced later as an UnboundLocalError.
  if model == 'rf':
    reg = RandomForestRegressor(random_state=42)
  elif model == 'xgb':
    reg = XGBRegressor()
  else:
    raise ValueError(f"Unknown model {model!r}; expected 'xgb' or 'rf'")

  X_train = df_train[FEATURES]
  y_train = df_train[TARGET_TRAIN]
  X_test = df_test[FEATURES]
  y_test = df_test[TARGET]

  reg.fit(X_train, y_train)

  # Hand the scaler one column of shape (n_samples, 1) rather than a single
  # row of n_samples values, then flatten back to one prediction per test row.
  scaled_pred = np.asarray(reg.predict(X_test)).reshape(-1, 1)
  y_pred = np.asarray(target_scaler.inverse_transform(scaled_pred)).ravel()

  preds_out = X_test.copy()
  preds_out['actual_bike_return'] = y_test
  preds_out['pred'] = y_pred

  non_zero = preds_out.query('actual_bike_return != 0')
  zeros = preds_out.query('actual_bike_return == 0')

  _print_metrics("overall", y_test, y_pred)
  print()
  _print_metrics("Non-zero", non_zero['actual_bike_return'], non_zero['pred'])
  print()
  _print_metrics("Zeros", zeros['actual_bike_return'], zeros['pred'])

  return reg, preds_out

# Normal Model

## XGBoost

In [11]:
# Input columns for the XGBoost run, in the order they are handed to the model.
# Disabled for this run: 'coco', 'flag_added', 'total_demand', 'demand_degrees'.
FEATURES = [
    'end_station_cluster',
    'ended_at_year', 'ended_at_month', 'ended_at_day', 'ended_at_hour',
    'ended_at_week', 'ended_at_quarter', 'ended_at_dayofweek',
    'temp', 'dwpt', 'rhum', 'prcp', 'wdir', 'wspd', 'pres',
    "bike_return_lag_1_h", "bike_return_lag_2_h", "bike_return_lag_24_h",
    'is_holiday',
]

# Keep the test timestamps aside; they are not part of FEATURES and would
# otherwise be missing from the predictions frame.
hours = df_test['ended_at_hourly']

model, model_preds = train_model(
    df_train=df_train,
    df_test=df_test,
    FEATURES=FEATURES,
    model='xgb',
)

# NOTE(review): the values come from 'ended_at_hourly' but are stored under the
# name 'started_at_hourly' — confirm the column name is what downstream code expects.
model_preds['started_at_hourly'] = hours

overall
MSE: 3.0219137544549715
RMSE: 1.7383652534651546
MAE: 0.8864674708243583
MAPE: 0.3648560955433944

Non-zero
MSE: 7.071469792240275
RMSE: 2.659223531830349
MAE: 1.699138978746337
MAPE: 0.36235538743406287

Zeros
MSE: 0.4306889155733284
RMSE: 0.6562689354017364
MAE: 0.3664562455122379
MAPE: 0.3664562455122379


In [10]:
# Scratch cell: bare numeric literals kept as a record; only the last one is
# echoed as the cell output.
# NOTE(review): these look like MSE, RMSE, MAE and MAPE from an earlier XGBoost
# run (the second value is the square root of the first) — confirm, or move
# them into a markdown cell.
#11-01
3.0241088825032505
1.7389965159548912
0.8838541036353352
0.36134692270631164

0.36134692270631164

In [12]:
# Persist the XGBoost run: the test-set predictions as CSV and the fitted
# model through joblib, both under the models folder with a shared suffix.
suf = "_normal_xgb_01_11"
filename = f'{models_dir}/returns_model{suf}.sav'

model_preds.to_csv(path_or_buf=f"{models_dir}/test_returns_predictions{suf}.csv")
joblib.dump(model, filename)

# Release both names once they have been written to disk.
del model_preds, model

## Random Forest

In [11]:
# Input columns for the Random Forest run, in the order they are handed to the
# model. Unlike the XGBoost run above, 'coco' is included here.
# Disabled for this run: 'flag_added', 'total_demand', 'demand_degrees'.
FEATURES = ['end_station_cluster', 'ended_at_year',
            'ended_at_month', 'ended_at_day',
            'ended_at_hour',
            'ended_at_week',
            'ended_at_quarter',
            'ended_at_dayofweek',
            'temp',
            'dwpt',
            'rhum',
            'prcp',
            'wdir',
            'wspd',
            'pres',
            'coco',
            "bike_return_lag_1_h",
            "bike_return_lag_2_h",
            "bike_return_lag_24_h",
            'is_holiday',
          ]

model, model_preds = train_model(df_train, df_test, FEATURES, model='rf')

# Attach the test timestamps exactly as the XGBoost cell does, so both saved
# prediction files carry the same 'started_at_hourly' column (assignment is
# index-aligned with df_test).
model_preds['started_at_hourly'] = df_test['ended_at_hourly']

overall
MSE: 3.276007555102104
RMSE: 1.809974462555233
MAE: 0.9394940804666416
MAPE: 0.4085958870166032

Non-zero
MSE: 7.4624820501658835
RMSE: 2.7317543905274286
MAE: 1.7374170720009694
MAPE: 0.376833259427738

Zeros
MSE: 0.5971715054875008
RMSE: 0.7727687270377217
MAE: 0.428920117324434
MAPE: 0.428920117324434


In [12]:
# Scratch cell: bare numeric literals kept as a record; only the last one is
# echoed as the cell output.
# NOTE(review): each group of three looks like MSE, RMSE (square root of the
# first value) and presumably MAPE from earlier Random Forest runs — confirm,
# or move them into a markdown cell.
5.636537048223255
2.374139222586421
0.33886059404561014

#11-01
3.566440816395711
1.8885022680409231
0.3805276715588524

0.3805276715588524

In [13]:
# Persist the Random Forest run: the test-set predictions as CSV and the
# fitted model through joblib, both under the models folder with a shared suffix.
suf = "_normal_rf_01_11"
filename = f'{models_dir}/returns_model{suf}.sav'

model_preds.to_csv(path_or_buf=f"{models_dir}/test_returns_predictions{suf}.csv")
joblib.dump(model, filename)

# Release both names once they have been written to disk.
del model_preds, model

In [14]:
# Scratch cell: bare numeric literals grouped under single-letter labels; only
# the last literal is echoed as the cell output.
# NOTE(review): each group of three looks like MSE, RMSE (square root of the
# first value) and presumably MAPE for a different experiment variant; the
# meaning of the labels e-k is not recorded in the visible cells — confirm and
# document, or move the table into a markdown cell.
#e
3.189635202926419
1.7859549834546276
0.37270374298234266

#f
3.1785156823909997
1.7828392194449278
0.374173850198251

#g
3.1979816352695654
1.788290142921323
0.3706682703327132

#h
3.201650782753264
1.789315730315157
0.37396526641032307

#i
3.581524793531
1.8924916891577093
0.3959399998524082

#k
3.063089630068344
1.7501684576258205
0.35523741143766596

0.35523741143766596