In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_percentage_error
import joblib

import holidays
us_holidays = holidays.country_holidays('US')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

directory = '/content/drive/MyDrive/Thesis'
data_dir = directory + "/Data"
models_dir = directory + "/models"

In [3]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [4]:
demand = pd.read_csv(f"{data_dir}/pre_processed_data.csv")

#tss = TimeSeriesSplit(n_splits=5, test_size=24*60, gap=24)
df = demand.sort_values(by=['started_at_year', 'started_at_month',
                            'started_at_day', 'started_at_hour'])
df = df.drop(columns=['Unnamed: 0', 'duration_sec']).reset_index(drop=True)
df['started_at_daily'] = pd.to_datetime(df['started_at_daily'])
df['started_at_hourly'] = pd.to_datetime(df['started_at_hourly'])

df = df[df['clusters']==1]
df = df[df['started_at_hourly'] >= pd.to_datetime("2023-01-01 00:00:00")].drop_duplicates(
    ['started_at_hourly', 'start_station_cluster', 'demand']).reset_index(drop=True)

In [5]:
earliest_trips = df.groupby('start_station_cluster').nth(0)
stations_operational_since_early = earliest_trips[(earliest_trips['started_at_month'] < 4)]['start_station_cluster']
stations_early_enough = df['start_station_cluster'].isin(stations_operational_since_early)
print((df[~stations_early_enough].shape[0]/df.shape[0])*100)
df = df[stations_early_enough].reset_index(drop=True)

1.9881767405363227


In [6]:
CLUSTER_COLUMN = 'start_station_cluster'
DATETIME = "started_at_hourly"
DATE = "started_at_daily"

In [7]:
def fill_three_day_window(day, df, cluster):
    # Define the three-day window (previous day, current day, and next day)
    start_date = day - pd.Timedelta(days=1)
    end_date = day + pd.Timedelta(days=1)

    days_to_fill = []
    for i in range(3):
        candidate_day = start_date + pd.Timedelta(days=i)
        if candidate_day not in processed_dates:
            days_to_fill.append(candidate_day)
    #print(days_to_fill)
    # If all days overlap, skip filling this window
    if not days_to_fill:
        return None
    days_to_fill[-1] = days_to_fill[-1] - pd.Timedelta(hours=1)
    # Create a full hourly range only for the non-overlapping days
    full_range = pd.date_range(start=min(days_to_fill), end=max(days_to_fill), freq='h')


    # Reindex the DataFrame to this range and interpolate/forward-fill
    df_window = df.reindex(full_range)
    df_window['demand'] = df_window['demand'].fillna(0)
    #df_window['flag_added'] = df_window['flag_added'].fillna(True)
    df_window['start_station_cluster'] = cluster  # or use fillna()
    df_window['clusters'] = 1  # or use fillna()

    # Add these dates to the processed set
    new_days = [start_date + pd.Timedelta(days=i) for i in range(3)]
    new_days[-1] = new_days[-1] - pd.Timedelta(hours=1)

    processed_dates.update()

    return df_window

In [8]:
def add_lags(df, target, identifier):

  df_res = pd.DataFrame()
  print(target)
  for ii in df[identifier].unique():
      df_current = df[df[identifier]==ii].copy()
      df_current.index = df_current['started_at_hourly']
      df_current.index = pd.to_datetime(df_current.index)
      target_map = df_current[target].to_dict()
      df_current[f"{target}_lag_1_h"] = (df_current.index - pd.Timedelta('1 hours')).map(target_map)
      df_current[f"{target}_lag_2_h"] = (df_current.index - pd.Timedelta('2 hours')).map(target_map)
      df_current[f"{target}_lag_24_h"] = (df_current.index - pd.Timedelta('24 hours')).map(target_map)
      if target == "demand":
          df_current[f"{target}_lag_1_h"] = df_current[f"{target}_lag_1_h"].fillna(0)
          df_current[f"{target}_lag_2_h"] = df_current[f"{target}_lag_2_h"].fillna(0)
          df_current[f"{target}_lag_24_h"] = df_current[f"{target}_lag_24_h"].fillna(0)
      else:
          df_current[f"{target}_lag_1_h"] = df_current[f"{target}_lag_1_h"].interpolate().fillna(0)
          df_current[f"{target}_lag_2_h"] = df_current[f"{target}_lag_2_h"].interpolate().fillna(0)
          df_current[f"{target}_lag_24_h"] = df_current[f"{target}_lag_24_h"].interpolate().fillna(0)
      df_res = pd.concat([df_res, df_current])
  return df_res

In [9]:
df_concatenated = pd.DataFrame()
for i in df[CLUSTER_COLUMN].unique():

  df_current = df[df[CLUSTER_COLUMN]==i].copy()
  #df_current['flag_added'] = False
  df_current[DATETIME] = pd.to_datetime(df_current[DATETIME])
  df_current.set_index(DATETIME, inplace=True)
  unique_dates = df_current.index.normalize().unique()

  # Create an empty DataFrame to hold the result
  df_filled = pd.DataFrame()

  # Track days that have already been processed to avoid overlap
  processed_dates = set()
  #print(i)
  # Function to generate a three-day range and fill missing hours
  # Process each unique date
  for date in unique_dates:
      df_current_date = df_current[df_current[DATE] == date].copy()
      filled_window = fill_three_day_window(date, df_current_date, i)
      #print(filled_window.iloc[:, 0:7])
      #print()
      if filled_window is not None:
          df_filled = pd.concat([df_filled, filled_window])

  # Drop duplicates (if any) and sort the DataFrame
  df_filled = df_filled[~df_filled.index.duplicated(keep='first')].sort_index()
  df_concatenated = pd.concat([df_concatenated, df_filled])

del df_filled


In [10]:
def create_datetime_features(input_df, column_name):

    input_df[column_name+'_hour'] = input_df[column_name].dt.hour
    input_df[column_name+'_quarter'] = input_df[column_name].dt.quarter
    input_df[column_name+'_month'] = input_df[column_name].dt.month
    input_df[column_name+'_year'] = input_df[column_name].dt.year
    input_df[column_name+'_week'] = input_df[column_name].dt.isocalendar().week
    input_df[column_name+'_day'] = input_df[column_name].dt.day
    input_df[column_name+'_dayofweek'] = input_df[column_name].dt.dayofweek

    return input_df

In [11]:
def add_datetime(df, column_name="started_at"):
    conversion_dict_hourly = dict(year= df[f'{column_name}_year'],
                                  month=df[f'{column_name}_month'],
                                  day=  df[f'{column_name}_day'],
                                  hour= df[f'{column_name}_hour']
                                 )
    conversion_dict_daily = dict(year= df[f'{column_name}_year'],
                           month=df[f'{column_name}_month'],
                           day=  df[f'{column_name}_day']
                                 )
    df[f'{column_name}_hourly'] = pd.to_datetime(conversion_dict_hourly)

    df[f'{column_name}_daily'] = pd.to_datetime(conversion_dict_daily)

    return df

In [12]:
df_concatenated = df_concatenated.reset_index().rename(columns={
    "index": "started_at_hourly"
})

In [13]:
df_concatenated = add_datetime(create_datetime_features(
    df_concatenated.rename(columns={"started_at_hourly": "started_at"}).copy(), "started_at"
))
df_concatenated = df_concatenated.drop(columns=['started_at'])

In [14]:
df_concatenated['is_holiday'] =  df_concatenated['started_at_hourly'].apply(lambda x: x in us_holidays)

In [15]:
weather_data = pd.read_csv(f"{data_dir}/weather_data.csv")
weather_data['started_at_hourly'] = pd.to_datetime(weather_data['started_at_hourly'])

number_of_clusters = df['start_station_cluster'].nunique()

weather_data_time = weather_data.reset_index().rename(columns={
    "time": "started_at_hourly"
})
df_concatenated = df_concatenated.merge(weather_data_time[['temp', 'dwpt', "rhum", "prcp", "wdir", "wspd", "pres", "coco", "centroid", 'started_at_hourly']].rename(columns={
    "centroid":"clusters"
}), on=['clusters', "started_at_hourly"], how='left')

In [16]:
df_concatenated = df_concatenated.sort_values("started_at_hourly")
df_concatenated = add_lags(df_concatenated, "demand", identifier=CLUSTER_COLUMN)
df_concatenated = add_lags(df_concatenated, "temp", identifier=CLUSTER_COLUMN)
df_concatenated = add_lags(df_concatenated, "prcp", identifier=CLUSTER_COLUMN)
df_concatenated = add_lags(df_concatenated, "rhum", identifier=CLUSTER_COLUMN)
df_concatenated = add_lags(df_concatenated, "wspd", identifier=CLUSTER_COLUMN)

demand
temp
prcp
rhum
wspd


In [17]:
fold = 0
preds = pd.DataFrame()
rmse = []
mape = []
mse = []
le = preprocessing.LabelEncoder()
scaler = MinMaxScaler(feature_range=(1, 2))
SCALER_FEATURES = ["started_at_year", "started_at_month",
                   "started_at_day", "started_at_hour",
                   "started_at_week", "started_at_quarter",
                   "started_at_dayofweek",
                   "temp",
                   "dwpt",
                   "rhum",
                   "prcp",
                   "wdir",
                   "wspd",
                   "pres",
                   "coco",
                   "demand_lag_1_h",
                   "demand_lag_2_h",
                   "demand_lag_24_h",
                   "temp_lag_1_h",
                   "temp_lag_2_h",
                   "temp_lag_24_h",
                   "prcp_lag_1_h",
                   "prcp_lag_2_h",
                   "prcp_lag_24_h",
                   "rhum_lag_1_h",
                   "rhum_lag_2_h",
                   "rhum_lag_24_h",
                   "wspd_lag_1_h",
                   "wspd_lag_2_h",
                   "wspd_lag_24_h"
                  ]
target_scaler = MinMaxScaler(feature_range=(1, 2))
df_concatenated[SCALER_FEATURES] = scaler.fit_transform(df_concatenated[SCALER_FEATURES])
df_concatenated[['demand_target']] = target_scaler.fit_transform(df_concatenated[['demand']])
df_concatenated['start_station_cluster'] = le.fit_transform(df_concatenated['start_station_cluster'])
df_concatenated['is_holiday'] = df_concatenated['is_holiday'].astype(int)
#df_concatenated['flag_added'] = df_concatenated['flag_added'].astype(int)

filename = f'{models_dir}/target_scaler.sav'
joblib.dump(target_scaler, filename)

filename = f'{models_dir}/scaler.sav'
joblib.dump(scaler, filename)

['/content/drive/MyDrive/Thesis/models/scaler.sav']

In [18]:
df_concatenated = df_concatenated.query('~((started_at_hour>=2) & (started_at_hour<=5))').copy().reset_index(drop=True)

In [19]:
timestamps = np.load(f'{data_dir}/demand_graph_timestamps.pkl.npz')
stamps = [timestamps[f'arr_{i}'] for i in range(8758, len(timestamps))]
gnn_embedding = torch.load(f'{data_dir}/gnn_cnn_2024-11-02 14:35:41.074099_output_embedding.pt')

  gnn_embedding = torch.load(f'{data_dir}/gnn_cnn_2024-11-02 14:35:41.074099_output_embedding.pt')


In [20]:
datetimes = [datetime(int(arr[0][0]), int(arr[0][1]), int(arr[0][2]), int(arr[0][3])) for arr in stamps]
total_demand = [int(sum(arr[:,4])) for arr in stamps]
demand_degrees = [int(sum(arr[:,4]>0)) for arr in stamps]
mean_embeddings = [emb.mean(dim=0) for emb in gnn_embedding]
variance_embeddings = [emb.var(dim=0) for emb in gnn_embedding]

In [21]:
df_embedding = pd.DataFrame({
    'datetime': datetimes,
    'total_demand': total_demand,
    'demand_degrees': demand_degrees,
    **{f"dim_mean_{i}": [emb[i].item() for emb in mean_embeddings] for i in range(mean_embeddings[0].size(0))},
    **{f"dim_var_{i}": [emb[i].item() for emb in variance_embeddings] for i in range(variance_embeddings[0].size(0))}
})

In [22]:
df_embedding['datetime'] = df_embedding['datetime'] + timedelta(hours=1)


In [23]:
df_concatenated = df_concatenated.merge(df_embedding.rename(
    columns={"datetime": "started_at_hourly"}), on=['started_at_hourly'], how="left")
df_concatenated = df_concatenated.fillna(0)

In [24]:
del demand
del datetimes
del total_demand
del gnn_embedding
del demand_degrees
del mean_embeddings
del variance_embeddings

In [25]:
df_test = df_concatenated[pd.to_datetime(
    df_concatenated['started_at_hourly']) >= pd.to_datetime("2024-01-01 00:00:00")]
df_train = df_concatenated[pd.to_datetime(
    df_concatenated['started_at_hourly']) < pd.to_datetime("2024-01-01 00:00:00")]

In [26]:
del df
del df_concatenated

In [27]:
#del df
#del demand

In [28]:
tss = TimeSeriesSplit(n_splits=5, test_size=24*10*number_of_clusters, gap=24)
#df_train = df_train.sort_index()
#
#fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)
#
#fold = 0
#for train_idx, val_idx in tss.split(df_train):
#    train = df_train.iloc[train_idx]
#    test = df_train.iloc[val_idx]
#    train['demand'].plot(ax=axs[fold],
#                          label='Training Set',
#                          title=f'Data Train/Test Split Fold {fold}')
#    test['demand'].plot(ax=axs[fold],
#                         label='Test Set')
#    axs[fold].axvline(test.index.min(), color='black', ls='--')
#    fold += 1
#plt.show()

In [29]:
def train_model(df_train, df_test, FEATURES):
  TARGET = 'demand'
  TARGET_TRAIN = 'demand_target'
  X_train = df_train[FEATURES]
  y_train = df_train[TARGET_TRAIN]

  X_test = df_test[FEATURES]
  y_test_training = target_scaler.transform(df_test[[TARGET]])
  y_test = df_test[TARGET]

  reg = RandomForestRegressor(random_state=42)
  reg.fit(X_train, y_train)

  y_pred = target_scaler.inverse_transform([reg.predict(X_test)])
  preds_out = X_test.copy()
  preds_out['actual_demand'] = y_test
  preds_out['pred'] = y_pred[0]

  mse_score = mean_squared_error(y_test, y_pred[0])
  rmse_score = np.sqrt(mean_squared_error(y_test, y_pred[0]))


  mape_score = mean_absolute_percentage_error(y_test+1, y_pred[0]+1)
  print(mse_score)
  print(rmse_score)
  print(mape_score)

  return reg, preds_out

# Normal Model

In [None]:
FEATURES = ['start_station_cluster', 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            #'flag_added',
            'is_holiday',
            'total_demand',
            'demand_degrees'
            ]

model, model_preds = train_model(df_train, df_test, FEATURES)

3.5628780991430298
1.887558767070056
0.38054293838781955


In [None]:
5.636537048223255
2.374139222586421
0.33886059404561014

#11-01
3.5628780991430298
1.887558767070056
0.38054293838781955

0.33886059404561014

In [None]:
suf = "_normal_rf_11_01"

model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

#filename = f'label_encoder{suf}.sav'
#joblib.dump(le, filename)
#del reg

# Weather Model

In [None]:
FEATURES = ['start_station_cluster', 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            'is_holiday',
            #'flag_added',
            'temp',
            'dwpt',
            'rhum',
            'prcp',
            'wdir',
            'wspd',
            'pres',
            'coco',
            'total_demand',
            'demand_degrees'
]

model, model_preds = train_model(df_train, df_test, FEATURES)

3.334053373770126
1.8259390388975547
0.37095322916411994


In [None]:
4.830006668125328
2.1977276146341085
0.34375553207388226

#11-01
3.334053373770126
1.8259390388975547
0.37095322916411994

0.34375553207388226

In [None]:
suf = "_weather_rf_11_01"

model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

# Weather and Lag


In [None]:
FEATURES = ['start_station_cluster', 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            'is_holiday',
            #'flag_added',
            'temp',
            'dwpt',
            'rhum',
            'prcp',
            'wdir',
            'wspd',
            'pres',
            'coco',
            "demand_lag_1_h",
            "demand_lag_2_h",
            "demand_lag_24_h",
            'total_demand',
            'demand_degrees'
]

model, model_preds = train_model(df_train, df_test, FEATURES)

3.6181847108329603
1.902152651821867
0.41428162614828307


In [None]:

#11-01
3.6181847108329603
1.902152651821867
0.41428162614828307

In [None]:
suf = "_weather_lag_rf_11_01"

model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

# Weather Lag and Lag

In [None]:
FEATURES = ['start_station_cluster', 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            'is_holiday',
            #'flag_added',
            'temp',
            'dwpt',
            'rhum',
            'prcp',
            'wdir',
            'wspd',
            'pres',
            'coco',
            "demand_lag_1_h",
            "demand_lag_2_h",
            "demand_lag_24_h",
            "temp_lag_1_h",
            "temp_lag_2_h",
            "temp_lag_24_h",
            "prcp_lag_1_h",
            "prcp_lag_2_h",
            "prcp_lag_24_h",
            "rhum_lag_1_h",
            "rhum_lag_2_h",
            "rhum_lag_24_h",
            "wspd_lag_1_h",
            "wspd_lag_2_h",
            "wspd_lag_24_h",
            'total_demand',
            'demand_degrees'
]

model, model_preds = train_model(df_train, df_test, FEATURES)

3.606011516779565
1.8989501090812166
0.41077526832563427


In [None]:
3.6314578551418113
1.905638437674317
0.3965058032591828

#11-01
3.606011516779565
1.8989501090812166
0.41077526832563427

0.3965058032591828

In [None]:
suf = "_weather_lag_and_lag_rf_11_01"

model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

# GNN Mean Embedding

In [None]:
FEATURES = ['start_station_cluster', 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            'is_holiday',
            #'flag_added',
            'temp',
            'dwpt',
            'rhum',
            'prcp',
            'wdir',
            'wspd',
            'pres',
            'coco',
            "demand_lag_1_h",
            "demand_lag_2_h",
            "demand_lag_24_h",
            "temp_lag_1_h",
            "temp_lag_2_h",
            "temp_lag_24_h",
            "prcp_lag_1_h",
            "prcp_lag_2_h",
            "prcp_lag_24_h",
            "rhum_lag_1_h",
            "rhum_lag_2_h",
            "rhum_lag_24_h",
            "wspd_lag_1_h",
            "wspd_lag_2_h",
            "wspd_lag_24_h",
            'total_demand',
            'demand_degrees'
]
FEATURES.extend([f"dim_mean_{i}" for i in range(50)])

model, model_preds = train_model(df_train, df_test, FEATURES)

In [None]:
3.582259568051494
1.8926858080652198
0.40263418701670023

11_01
3.5981417775135864
1.896876848272862
0.408581436366315

In [None]:
suf = "_weather_lag_mean_embedding_rf_11_01"

model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

# GNN Variance Embedding

In [None]:
FEATURES = ['start_station_cluster', 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            'is_holiday',
            #'flag_added',
            'temp',
            'dwpt',
            'rhum',
            'prcp',
            'wdir',
            'wspd',
            'pres',
            'coco',
            "demand_lag_1_h",
            "demand_lag_2_h",
            "demand_lag_24_h",
            "temp_lag_1_h",
            "temp_lag_2_h",
            "temp_lag_24_h",
            "prcp_lag_1_h",
            "prcp_lag_2_h",
            "prcp_lag_24_h",
            "rhum_lag_1_h",
            "rhum_lag_2_h",
            "rhum_lag_24_h",
            "wspd_lag_1_h",
            "wspd_lag_2_h",
            "wspd_lag_24_h",
            'total_demand',
            'demand_degrees'
]
FEATURES.extend([f"dim_var_{i}" for i in range(50)])

model, model_preds = train_model(df_train, df_test, FEATURES)

In [None]:
11_01
3.5986369799336346
1.8970073747704923
0.40994330252827255

In [None]:
suf = "_weather_lag_var_embedding_rf_11_01"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

# GNN Mean-Variance Embedding

In [30]:
FEATURES = ['start_station_cluster', 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            'is_holiday',
            #'flag_added',
            'temp',
            'dwpt',
            'rhum',
            'prcp',
            'wdir',
            'wspd',
            'pres',
            'coco',
            "demand_lag_1_h",
            "demand_lag_2_h",
            "demand_lag_24_h",
            #"temp_lag_1_h",
            #"temp_lag_2_h",
            #"temp_lag_24_h",
            #"prcp_lag_1_h",
            #"prcp_lag_2_h",
            #"prcp_lag_24_h",
            #"rhum_lag_1_h",
            #"rhum_lag_2_h",
            #"rhum_lag_24_h",
            #"wspd_lag_1_h",
            #"wspd_lag_2_h",
            #"wspd_lag_24_h",
            'total_demand',
            'demand_degrees'
]
FEATURES.extend([f"dim_var_{i}" for i in range(50)])
FEATURES.extend([f"dim_mean_{i}" for i in range(50)])

model, model_preds  = train_model(df_train, df_test, FEATURES)

3.601167851840035
1.897674327127823
0.40947944791286134


In [None]:
11_01
3.601167851840035
1.897674327127823
0.40947944791286134

In [31]:
suf = "_weather_lag_mean_var_embedding_rf_11_01"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

In [None]:
#
#for train_idx, val_idx in tss.split(df):
#
#    train = df.iloc[train_idx]
#    test = pd.concat([df.iloc[val_idx], df_test])
#
#
#    FEATURES = ['start_station_cluster', 'started_at_year',
#                'started_at_month', 'started_at_day',
#                'started_at_hour',
#                'started_at_week',
#                'started_at_quarter',
#                'started_at_dayofweek',
#                'is_holiday']
#    TARGET = 'demand'
#    TARGET_TRAIN = 'demand_target'
#
#    X_train = train[FEATURES]
#    y_train = train[TARGET_TRAIN]
#
#    X_test = test[FEATURES]
#    y_test = test[TARGET]
#
#    reg = RandomForestRegressor(random_state=42)
#    reg.fit(X_train, y_train)
#    y_pred = reg.predict(X_test)
#    preds_out = X_test.copy()
#    preds_out['actual_demand'] = y_test
#    preds_out['pred'] = y_pred
#    preds = pd.concat([preds, preds_out])
#    mse_score = mean_squared_error(y_test, y_pred)
#    rmse_score = np.sqrt(mean_squared_error(y_test, y_pred))
#    mape_score = mean_absolute_percentage_error(y_test, y_pred)
#    mse.append(mse_score)
#    rmse.append(rmse_score)
#    mape.append(mape_score)
#    print(mse)
#    print(rmse_score)
#    print(mape_score)
#    print(preds.head())
#    print(preds.tail())
#
#print(f"Mean MSE: {np.mean(mse)}")
#print(f"Mean RMSE: {np.mean(rmse)}")
#print(f"Mean MAPE: {np.mean(mape)}")
#del df
#del X_train
#del y_train
#del X_test

[0.00011016991702653535]
0.010496185832317154
0.004621687000503637
         start_station_cluster  started_at_year  started_at_month  started_at_day  started_at_hour  started_at_week  started_at_quarter  started_at_dayofweek  is_holiday  actual_demand      pred
1880375                    106              2.0          1.090909        1.566667         1.391304         1.117647                 1.0                   2.0           0       1.014124  1.000537
1880376                    107              2.0          1.090909        1.566667         1.391304         1.117647                 1.0                   2.0           0       1.000000  1.002373
1880377                    107              2.0          1.090909        1.566667         1.391304         1.117647                 1.0                   2.0           0       1.000000  1.002373
1880378                    107              2.0          1.090909        1.566667         1.391304         1.117647                 1.0                  

In [None]:
preds_out['pred']

1.2287570621468924

In [None]:
target_scaler.inverse_transform([preds_out['pred']])



array([[40.87, 40.87, 40.87, ...,  3.34,  3.34,  1.62]])

In [None]:
target_scaler.inverse_transform([preds_out['actual_demand']])


array([[17., 17., 17., ...,  2.,  2.,  1.]])

In [None]:
preds_out

Unnamed: 0,start_station_cluster,started_at_year,started_at_month,started_at_day,started_at_hour,started_at_week,started_at_quarter,started_at_dayofweek,is_holiday,actual_demand,pred
2041655,68,2.0,1.181818,1.733333,1.521739,1.215686,1.0,1.833333,0,1.045198,1.112627
2041656,68,2.0,1.181818,1.733333,1.521739,1.215686,1.0,1.833333,0,1.045198,1.112627
2041657,68,2.0,1.181818,1.733333,1.521739,1.215686,1.0,1.833333,0,1.045198,1.112627
2041658,68,2.0,1.181818,1.733333,1.521739,1.215686,1.0,1.833333,0,1.045198,1.112627
2041659,68,2.0,1.181818,1.733333,1.521739,1.215686,1.0,1.833333,0,1.045198,1.112627
...,...,...,...,...,...,...,...,...,...,...,...
2081970,152,2.0,1.181818,2.000000,2.000000,1.235294,1.0,2.000000,0,1.000000,1.002232
2081971,152,2.0,1.181818,2.000000,2.000000,1.235294,1.0,2.000000,0,1.000000,1.002232
2081972,156,2.0,1.181818,2.000000,2.000000,1.235294,1.0,2.000000,0,1.002825,1.006610
2081973,156,2.0,1.181818,2.000000,2.000000,1.235294,1.0,2.000000,0,1.002825,1.006610


In [None]:
@scope.define
def to_int(x):
    """
    Custom Hyperopt scope function to cast a value to an integer.

    :param x: Value to be cast to an integer.
    :type x: Any
    :return: Integer representation of the input value.
    :rtype: int
    """
    return int(x)

def tuning(X_train, y_train, X_test, y_test):
  def objective(params):
      print(params)
      reg = RandomForestRegressor(**params)
      reg.fit(X_train, y_train)
      y_pred = reg.predict(X_test)
      mse = mean_squared_error(y_test, y_pred)
      RMSE = math.sqrt(mse)
      return {"loss": RMSE, "status": STATUS_OK}

  space = {
      'n_estimators': scope.to_int(hp.quniform("n_estimators", 50, 500, 10)),
      'max_depth': scope.to_int(hp.quniform("max_depth", 3, 200, 3)),
      'min_samples_split': scope.to_int(hp.quniform("min_samples_split", 2, 40, 3)),
      'min_samples_leaf': scope.to_int(hp.quniform("min_samples_leaf", 2, 20, 3)),
      'max_features': hp.choice('max_features', ['sqrt', 'log2']),
      'bootstrap': hp.choice('bootstrap', [True, False])
  }

  trials = Trials()
  best = fmin(
      fn=objective,
      space=space,
      algo=tpe.suggest,
      max_evals=100,
      trials=trials,
  )

  best_params = {
      "n_estimators": int(best["n_estimators"]),
      "max_depth": int(best["max_depth"]),
      "min_samples_split": best["min_samples_split"],
      "min_samples_leaf": int(best["min_samples_leaf"]),
      "max_features": best["max_features"],
      "bootstrap": best["bootstrap"]
  }
  model = RandomForestRegressor(**best_params)
  model.fit(X_train, y_train)

  return model

In [None]:
#FEATURES = ['start_station_cluster', 'started_at_year',
#            'started_at_month', 'started_at_day',
#            'started_at_hour',
#            'started_at_week',
#            'started_at_quarter',
#            'started_at_dayofweek',
#            'is_holiday']
#TARGET = 'demand'
#TARGET_TRAIN = 'demand_target'
#X_train = df_train[FEATURES]
#y_train = df_train[TARGET_TRAIN]
#
#X_test = df_test[FEATURES]
#y_test_training = target_scaler.transform(df_test[[TARGET]])
#y_test = df_test[TARGET]
#
#reg = tuning(X_train, y_train, X_test, y_test_training)
#
#y_pred = target_scaler.inverse_transform([reg.predict(X_test)])
#preds_out = X_test.copy()
#preds_out['actual_demand'] = y_test
#preds_out['pred'] = y_pred[0]
#preds = pd.concat([preds, preds_out])
#
#mse_score = mean_squared_error(y_test, y_pred[0])
#rmse_score = np.sqrt(mean_squared_error(y_test, y_pred[0]))
#mape_score = mean_absolute_percentage_error(y_test, y_pred[0])

#mse_score = mean_squared_error(y_test, y_pred[0])
#rmse_score = np.sqrt(mean_squared_error(y_test, y_pred[0]))
#mape_score = mean_absolute_percentage_error(y_test, y_pred[0])
#print(mse_score)
#print(rmse_score)
#print(mape_score)


#suf = "_rf_10_13"
#
#preds.to_csv(f"test_predictions{suf}.csv")
#del preds
#
#filename = f'demand_model{suf}.sav'
#joblib.dump(reg, filename)
#del reg
#
#filename = f'target_scaler{suf}.sav'
#joblib.dump(target_scaler, filename)
#del target_scaler
#
#filename = f'scaler{suf}.sav'
#joblib.dump(scaler, filename)
#del scaler
#
#filename = f'label_encoder{suf}.sav'
#joblib.dump(le, filename)
#del reg

array([      0,       1,       2, ..., 2041628, 2041629, 2041630])