In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns

import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import joblib

import holidays
us_holidays = holidays.country_holidays('US')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

directory = '/content/drive/MyDrive/Thesis'
data_dir = directory + "/Data"
models_dir = directory + "/models"

In [None]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
!unzip /content/drive/MyDrive/Thesis/Data/gnn_cnn_2024-11-02_14_35_41.074099_output_embedding.zip

Archive:  /content/drive/MyDrive/Thesis/Data/gnn_cnn_2024-11-02_14_35_41.074099_output_embedding.zip
  inflating: gnn_cnn_2024-11-02 14_35_41.074099_output_embedding.pt  


In [None]:
# Load the pre-processed hourly demand table.
demand = pd.read_csv(f"{data_dir}/pre_processed_data.csv")

# Order rows chronologically, drop helper columns and parse the timestamp columns.
chronological_keys = ['started_at_year', 'started_at_month',
                      'started_at_day', 'started_at_hour']
df = (
    demand
    .sort_values(by=chronological_keys)
    .drop(columns=['Unnamed: 0', 'duration_sec'])
    .reset_index(drop=True)
)
for timestamp_column in ('started_at_daily', 'started_at_hourly'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

# Keep rows with clusters == 1 from 2023-01-01 onwards, then drop rows that
# repeat the same (hour, station cluster, demand) combination.
in_scope = (df['clusters'] == 1) & (
    df['started_at_hourly'] >= pd.to_datetime("2023-01-01 00:00:00")
)
df = (
    df[in_scope]
    .drop_duplicates(['started_at_hourly', 'start_station_cluster', 'demand'])
    .reset_index(drop=True)
)

In [None]:
df['start_station_cluster'].max()

278

In [None]:
# First row of every station cluster (df is time-ordered by the loading cell).
earliest_trips = df.groupby('start_station_cluster').nth(0)

# Station clusters whose first row falls in months 1-3.
# NOTE(review): only the month is tested, not the year -- if the filtered data
# spans more than one calendar year this would also accept stations that first
# appear in Jan-Mar of a later year; confirm.
first_seen_before_april = earliest_trips['started_at_month'] < 4
stations_operational_since_early = earliest_trips[first_seen_before_april]['start_station_cluster']
stations_early_enough = df['start_station_cluster'].isin(stations_operational_since_early)

# Report the percentage of rows that are about to be discarded, then discard them.
rows_removed = df[~stations_early_enough].shape[0]
print((rows_removed/df.shape[0])*100)
df = df[stations_early_enough].reset_index(drop=True)

1.9881767405363227


In [None]:
CLUSTER_COLUMN = 'start_station_cluster'
DATETIME = "started_at_hourly"
DATE = "started_at_daily"

# Filling Gaps

In [None]:
def fill_three_day_window(day, df, cluster, processed=None):
    """Return a gap-free hourly frame around ``day`` for one station cluster.

    The candidate days are the day before ``day``, ``day`` itself and the day
    after. Candidates already present in the tracking set are skipped. The last
    remaining boundary is moved back by one hour, so with no candidate skipped
    the emitted range runs from 00:00 of the previous day to 23:00 of ``day``.

    Parameters
    ----------
    day : pandas.Timestamp
        Midnight timestamp of the calendar day being processed.
    df : pandas.DataFrame
        Rows of a single cluster, indexed by their hourly timestamp. The index
        must be unique because the frame is reindexed.
    cluster : Any
        Value written to ``start_station_cluster`` on every emitted row.
    processed : set, optional
        Set of day timestamps that were already emitted. It is updated in
        place. When omitted, the notebook-level ``processed_dates`` set is
        used, which keeps existing three-argument calls working.

    Returns
    -------
    pandas.DataFrame or None
        The reindexed frame with missing ``demand`` set to 0, or ``None`` when
        every candidate day was already processed.
    """
    # Existing callers rely on the module-level tracking set.
    if processed is None:
        processed = processed_dates

    start_date = day - pd.Timedelta(days=1)
    candidate_days = [start_date + pd.Timedelta(days=offset) for offset in range(3)]
    days_to_fill = [candidate for candidate in candidate_days if candidate not in processed]

    # Nothing new to emit for this window.
    if not days_to_fill:
        return None

    # Move the final boundary back one hour so the range ends at 23:00 of the
    # preceding day instead of including the next day's midnight.
    days_to_fill[-1] = days_to_fill[-1] - pd.Timedelta(hours=1)
    full_range = pd.date_range(start=min(days_to_fill), end=max(days_to_fill), freq='h')

    # Hours without a source row appear as NaN after reindexing; demand for
    # those hours is defined as 0 and the identifiers are written on all rows.
    df_window = df.reindex(full_range)
    df_window['demand'] = df_window['demand'].fillna(0)
    df_window['start_station_cluster'] = cluster
    df_window['clusters'] = 1

    # Record the calendar days actually covered. The original called
    # ``processed_dates.update()`` with no argument, which left the set empty
    # and made the overlap check above ineffective.
    processed.update(full_range.normalize().unique())

    return df_window

In [None]:
def add_lags(df, target, identifier):
    """Append 1 h, 2 h and 24 h lag columns of ``target`` for every group.

    For each distinct value of ``identifier`` the rows are indexed by their
    ``started_at_hourly`` timestamp and the value of ``target`` observed
    exactly 1, 2 and 24 hours earlier *within the same group* is looked up.
    Timestamps without an earlier observation yield a missing lag, which is
    filled as follows:

    * ``target == "demand"``: missing lags become 0.
    * any other target: missing lags are linearly interpolated first and the
      remaining gaps (e.g. at the start of a group) become 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``started_at_hourly``, ``target`` and ``identifier``.
    target : str
        Column to lag; new columns are named ``{target}_lag_{n}_h``.
    identifier : str
        Column that separates the independent series.

    Returns
    -------
    pandas.DataFrame
        All groups stacked in order of first appearance, indexed by
        ``started_at_hourly``. An empty frame is returned for empty input.
    """
    print(target)  # progress trace: one line per lagged variable

    lag_hours = (1, 2, 24)
    lagged_groups = []
    for group_id in df[identifier].unique():
        df_current = df[df[identifier] == group_id].copy()
        df_current.index = pd.to_datetime(df_current['started_at_hourly'])

        # timestamp -> value lookup for this group only
        target_map = df_current[target].to_dict()
        for hours in lag_hours:
            column = f"{target}_lag_{hours}_h"
            df_current[column] = (df_current.index - pd.Timedelta(hours=hours)).map(target_map)
            if target != "demand":
                df_current[column] = df_current[column].interpolate()
            df_current[column] = df_current[column].fillna(0)

        lagged_groups.append(df_current)

    # Concatenate once instead of growing a frame inside the loop (quadratic).
    if not lagged_groups:
        return pd.DataFrame()
    return pd.concat(lagged_groups)

In [None]:
# Build a gap-free hourly demand table, one station cluster at a time.
# Frames are collected in lists and concatenated once per level; growing a
# DataFrame with pd.concat inside the loops copied all previous rows on every
# iteration.
cluster_frames = []
for i in df[CLUSTER_COLUMN].unique():

  df_current = df[df[CLUSTER_COLUMN]==i].copy()
  df_current[DATETIME] = pd.to_datetime(df_current[DATETIME])
  df_current = df_current.set_index(DATETIME)
  unique_dates = df_current.index.normalize().unique()

  # fill_three_day_window reads this module-level set to skip days it has
  # already emitted, so it has to be reset for every cluster.
  processed_dates = set()

  # Fill the missing hours around every date on which the cluster has data.
  cluster_windows = []
  for date in unique_dates:
      df_current_date = df_current[df_current[DATE] == date].copy()
      filled_window = fill_three_day_window(date, df_current_date, i)
      if filled_window is not None:
          cluster_windows.append(filled_window)

  df_filled = pd.concat(cluster_windows) if cluster_windows else pd.DataFrame()

  # Overlapping windows can emit the same hour twice: keep the first
  # occurrence and restore chronological order.
  df_filled = df_filled[~df_filled.index.duplicated(keep='first')].sort_index()
  cluster_frames.append(df_filled)

df_concatenated = pd.concat(cluster_frames) if cluster_frames else pd.DataFrame()

del df_filled
del cluster_frames


# Datetime

In [None]:
def create_datetime_features(input_df, column_name):
    """Add calendar-part columns derived from a datetime column.

    The columns ``{column_name}_hour``, ``_quarter``, ``_month``, ``_year``,
    ``_week`` (ISO week), ``_day`` and ``_dayofweek`` are written onto
    ``input_df`` in that order. The frame is modified in place and also
    returned.
    """
    stamps = input_df[column_name].dt
    calendar_parts = (
        ('hour', stamps.hour),
        ('quarter', stamps.quarter),
        ('month', stamps.month),
        ('year', stamps.year),
        ('week', stamps.isocalendar().week),
        ('day', stamps.day),
        ('dayofweek', stamps.dayofweek),
    )
    for suffix, values in calendar_parts:
        input_df[f"{column_name}_{suffix}"] = values

    return input_df

In [None]:
def add_datetime(df, column_name="started_at"):
    """Rebuild hourly and daily timestamps from their calendar-part columns.

    Reads ``{column_name}_year``, ``_month``, ``_day`` and ``_hour`` and writes
    ``{column_name}_hourly`` (year/month/day/hour) followed by
    ``{column_name}_daily`` (year/month/day). The frame is modified in place
    and also returned.
    """
    hourly_parts = {
        unit: df[f'{column_name}_{unit}'] for unit in ('year', 'month', 'day', 'hour')
    }
    daily_parts = {unit: hourly_parts[unit] for unit in ('year', 'month', 'day')}

    df[f'{column_name}_hourly'] = pd.to_datetime(hourly_parts)
    df[f'{column_name}_daily'] = pd.to_datetime(daily_parts)

    return df

In [None]:
# Turn the hourly index produced by the gap-filling step back into a regular
# column; reset_index() names an unnamed index "index", hence the rename.
df_concatenated = df_concatenated.reset_index().rename(columns={
    "index": "started_at_hourly"
})

In [None]:
# Recompute every calendar column from the hourly timestamp so that rows added
# by the gap-filling step get values too. The timestamp is temporarily renamed
# to "started_at" so the derived columns are called "started_at_hour" etc.;
# add_datetime then rebuilds "started_at_hourly"/"started_at_daily" from them.
df_concatenated = add_datetime(create_datetime_features(
    df_concatenated.rename(columns={"started_at_hourly": "started_at"}).copy(), "started_at"
))
# The temporary column is no longer needed once the parts have been derived.
df_concatenated = df_concatenated.drop(columns=['started_at'])

In [None]:
# Flag every row whose timestamp is contained in the US holiday calendar built
# in the setup cell.
# NOTE(review): membership is evaluated row by row on full hourly timestamps;
# presumably the calendar compares on the calendar date -- confirm against the
# `holidays` package documentation.
df_concatenated['is_holiday'] =  df_concatenated['started_at_hourly'].apply(lambda x: x in us_holidays)

# Weather Data

In [None]:
# NOTE(review): the same file is read again by the first statement of the next
# cell, so this read is redundant.
weather_data = pd.read_csv(f"{data_dir}/weather_data.csv")

In [None]:
# Load the hourly weather table and parse its timestamp column.
weather_data = pd.read_csv(f"{data_dir}/weather_data.csv")
weather_data['started_at_hourly'] = pd.to_datetime(weather_data['started_at_hourly'])

# Missing weather condition codes are replaced by the median code.
median_code = weather_data["coco"].median()

# NOTE(review): astype(int) truncates, so a fractional median (x.5) would be
# rounded down -- confirm this is acceptable for a categorical code.
weather_data["coco"] = weather_data["coco"].fillna(median_code).astype(int)



# Number of distinct station clusters left after filtering (not used in this cell).
number_of_clusters = df['start_station_cluster'].nunique()

# NOTE(review): 'started_at_hourly' already exists in weather_data, so the
# rename of "time" presumably has no effect (it would create a duplicate column
# name if a "time" column were present) -- confirm. reset_index() only adds an
# "index" column that is not selected below.
weather_data_time = weather_data.reset_index().rename(columns={
    "time": "started_at_hourly"
})
# Left-join the weather readings onto the demand rows by hour and by cluster id
# (the weather table calls that id "centroid").
df_concatenated = df_concatenated.merge(weather_data_time[['temp', 'dwpt', "rhum", "prcp", "wdir", "wspd", "pres", "coco", "centroid", 'started_at_hourly']].rename(columns={
    "centroid":"clusters"
}), on=['clusters', "started_at_hourly"], how='left')

# Lags

In [None]:
# Add 1 h / 2 h / 24 h lag columns for demand and for each lagged weather
# variable, computed independently per station cluster.
df_concatenated = df_concatenated.sort_values("started_at_hourly")
for lag_target in ("demand", "temp", "prcp", "rhum", "wspd", "coco"):
    df_concatenated = add_lags(df_concatenated, lag_target, identifier=CLUSTER_COLUMN)

demand
temp
prcp
rhum
wspd
coco


In [None]:
# Drop the rows whose hour of day is 02:00-05:00 (inclusive) and renumber the index.
overnight_rows = df_concatenated['started_at_hour'].between(2, 5)
df_concatenated = df_concatenated[~overnight_rows].copy().reset_index(drop=True)

# GNN Analysis

In [None]:
# Per-snapshot timestamp arrays stored as arr_0, arr_1, ... in the archive.
timestamps = np.load(f'{data_dir}/demand_graph_timestamps.pkl.npz')

# NOTE(review): 8757 is presumably the index of the first snapshot covered by
# the embedding file, and the final array is deliberately left out -- confirm.
first_snapshot = 8757
last_snapshot_exclusive = len(timestamps) - 1
stamps = [timestamps[f'arr_{idx}'] for idx in range(first_snapshot, last_snapshot_exclusive)]

# Embeddings extracted from the zip archive in the setup cell.
gnn_embedding = torch.load('gnn_cnn_2024-11-02 14_35_41.074099_output_embedding.pt')

In [None]:
# Station-cluster ids assigned to the rows of every per-timestamp embedding
# matrix in the clustering and cosine-similarity cells below.
# NOTE(review): the list is hard-coded; its length and order are assumed to
# match the row order of each tensor in gnn_embedding -- confirm.
station_clusters = [
    74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,
    87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
    100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
    113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
    126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
    139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
    152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
    165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177,
    178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
    191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
    205, 206, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219,
    220, 222, 223, 224, 225, 226, 227, 228, 230, 231, 232, 233, 234,
    235, 237, 238, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
    250, 251, 252, 253, 255, 257, 258, 261, 264, 265, 266, 274, 275,
    278
]





In [None]:
len(gnn_embedding)

10942

In [None]:
# Timestamp of each snapshot: the first four fields of its first row are read
# as year, month, day and hour.
datetimes = [datetime(*(int(field) for field in arr[0][:4])) for arr in stamps]

# Column 4 of each snapshot is summed (total) and counted where positive.
# NOTE(review): presumably this column holds the demand of each entry -- confirm.
snapshot_demand_columns = [arr[:, 4] for arr in stamps]
total_demand = [int(sum(column)) for column in snapshot_demand_columns]
demand_degrees = [int(sum(column > 0)) for column in snapshot_demand_columns]

# Per-snapshot mean and variance of the embedding over its first axis.
mean_embeddings = [emb.mean(dim=0) for emb in gnn_embedding]
variance_embeddings = [emb.var(dim=0) for emb in gnn_embedding]

## Mean and Variance

In [None]:
# One column per embedding dimension, first for the means and then for the variances.
mean_dimension_columns = {
    f"dim_mean_{i}": [emb[i].item() for emb in mean_embeddings]
    for i in range(mean_embeddings[0].size(0))
}
variance_dimension_columns = {
    f"dim_var_{i}": [emb[i].item() for emb in variance_embeddings]
    for i in range(variance_embeddings[0].size(0))
}

# One row per snapshot: timestamp, network-wide demand summaries and the
# mean/variance embedding columns.
df_embedding = pd.DataFrame({
    'datetime': datetimes,
    'total_demand_1h': total_demand,
    'demand_degrees_1h': demand_degrees,
    **mean_dimension_columns,
    **variance_dimension_columns,
})

In [None]:
# Move every snapshot one hour forward before it is joined on the hourly
# timestamp further down. Re-running this cell shifts the column again.
one_hour = timedelta(hours=1)
df_embedding['datetime'] = df_embedding['datetime'] + one_hour

## Clustering

In [None]:
# Cluster the station embeddings of every snapshot into four groups.
# The per-snapshot frames are collected in a list and concatenated once;
# concatenating inside the loop re-copied all earlier rows on every iteration.
# NOTE: KMeans is fitted independently per snapshot, so a label id from one
# timestamp is not tied to the same id at another timestamp.
gnn_cluster_frames = []
for ii in range(len(gnn_embedding)):
  kmeans = KMeans(n_clusters=4, random_state=42)
  labels = kmeans.fit_predict(gnn_embedding[ii].detach().numpy())
  gnn_cluster_frames.append(pd.DataFrame({
      "datetime": datetimes[ii],
      "gnn_label_cluster": labels,
      "start_station_cluster": station_clusters

  }))

df_gnn_clusters = pd.concat(gnn_cluster_frames) if gnn_cluster_frames else pd.DataFrame()
del gnn_cluster_frames

  return fit_method(estimator, *args, **kwargs)


In [None]:
import plotly.express as px
def get_cluster_screenshots(time='2023-01-2 9:00:00'):
  """Map the GNN k-means label of every station cluster at one timestamp.

  Reads the station centroid coordinates from ``standard_stations.csv``,
  joins them onto the rows of ``df_gnn_clusters`` whose ``datetime`` equals
  ``time`` and returns a Plotly mapbox scatter coloured by label.

  Must be called while ``df_gnn_clusters`` still has its ``datetime`` column;
  a later cell of this notebook renames that column.
  """
  coordinate_columns = [
    'cluster_stations',
    'station_latitude_centroid_stations',
    'station_longitude_centroid_stations'
  ]
  # One coordinate row per station cluster.
  station_coordinates = pd.read_csv(f"{data_dir}/standard_stations.csv")
  labels_at_time = df_gnn_clusters[df_gnn_clusters['datetime'] == time]
  station_coordinates = station_coordinates.drop_duplicates(subset=['cluster_stations'])

  labelled_stations = labels_at_time.merge(
    station_coordinates[coordinate_columns],
    left_on=['start_station_cluster'],
    right_on=['cluster_stations'],
  )

  # String labels make Plotly use a discrete colour per cluster.
  labelled_stations['gnn_label_cluster'] = labelled_stations['gnn_label_cluster'].astype(str)

  return px.scatter_mapbox(
    labelled_stations,
    lat='station_latitude_centroid_stations',
    lon='station_longitude_centroid_stations',
    color="gnn_label_cluster",
    mapbox_style="open-street-map",
    zoom=10.8,
  ).update_traces(marker={"size": 10})

In [None]:
get_cluster_screenshots(time='2023-01-2 9:00:00')

In [None]:
get_cluster_screenshots(time='2023-01-2 10:00:00')

In [None]:
get_cluster_screenshots(time='2023-01-2 11:00:00')

In [None]:

get_cluster_screenshots(time='2023-01-2 12:00:00')

In [None]:
# Attach the observed demand to every (timestamp, station cluster) label row.
# dropna() then discards label rows that have no matching row in
# df_concatenated. Re-running this cell is not idempotent: the "datetime"
# column no longer exists after the first run.
df_gnn_clusters = df_gnn_clusters.rename(
    columns={"datetime": "started_at_hourly"}
).merge(df_concatenated[['demand', 'started_at_hourly', 'start_station_cluster']], on=['started_at_hourly', 'start_station_cluster'],
how="left").dropna()

# Bring the label of each station cluster and hour into the modelling table.
# The rename below has no effect here because "datetime" was already renamed
# by the statement above.
df_concatenated = df_concatenated.merge(df_gnn_clusters.rename(
    columns={"datetime": "started_at_hourly"})[[
        'started_at_hourly',
        'start_station_cluster',
        'gnn_label_cluster']],
    on=['started_at_hourly', 'start_station_cluster'],
how="left")

# Mean and total demand of every label group per hour.
cluster_aggregate = df_gnn_clusters.groupby(["gnn_label_cluster", "started_at_hourly"]).agg({
    "demand": ["mean", "sum"]
})
# Drop the outer "demand" level so the columns are just "mean" and "sum".
cluster_aggregate.columns = cluster_aggregate.columns.droplevel()

cluster_aggregate = cluster_aggregate.reset_index().rename(columns={
    "mean": "mean_gnn_cluster_demand_1h",
    "sum": "total_gnn_cluster_demand_1h"
})

# Shift the aggregates one hour forward before they are joined in the next cell.
# NOTE(review): KMeans is refit independently for each timestamp in the
# clustering cell, so a label id may not denote the same group from one hour to
# the next; after this shift the aggregate of label k at hour t is matched with
# rows labelled k at hour t+1 -- confirm this pairing is intended.
cluster_aggregate['started_at_hourly'] = cluster_aggregate['started_at_hourly'] + timedelta(hours=1)

In [None]:
# Join the (already shifted) per-label demand aggregates onto the modelling table.
df_concatenated = df_concatenated.merge(
    cluster_aggregate,
    on=['gnn_label_cluster', 'started_at_hourly'],
    how="left",
)

# Rows without a matching aggregate get the value 1 for both features.
for aggregate_column in ('mean_gnn_cluster_demand_1h', 'total_gnn_cluster_demand_1h'):
    df_concatenated[aggregate_column] = df_concatenated[aggregate_column].fillna(1)

## Cosine Similarity

In [None]:
# Pairwise cosine similarity between the station embeddings of every snapshot.
# One similarity column per row of the embedding matrix.
similarity_cols = [f"cosine_sim_{col}" for col in range(gnn_embedding[0].detach().numpy().shape[0])]

# Collect the per-snapshot frames and concatenate once; concatenating inside
# the loop re-copied every earlier (wide) frame on each iteration.
cosine_frames = []
for ii in range(len(gnn_embedding)):

  similarity = cosine_similarity(gnn_embedding[ii].detach().numpy())
  df_current = pd.DataFrame({
      "datetime": datetimes[ii],
      "start_station_cluster": station_clusters
  })

  similarity_df = pd.DataFrame(similarity, columns=similarity_cols)
  cosine_frames.append(pd.concat([df_current, similarity_df], axis=1))

df_cosine = pd.concat(cosine_frames) if cosine_frames else pd.DataFrame()
del cosine_frames

In [None]:
# Release large intermediates that are not needed by the remaining cells.
del (
    demand,
    datetimes,
    total_demand,
    gnn_embedding,
    demand_degrees,
    mean_embeddings,
    variance_embeddings,
    df_gnn_clusters,
    cluster_aggregate,
)

In [None]:
# Shift the similarity rows one hour forward, rename the timestamp column to
# the name used by the modelling table and persist the result as CSV.
cosine_time_shift = timedelta(hours=1)
df_cosine['datetime'] = df_cosine['datetime'] + cosine_time_shift

df_cosine = df_cosine.rename(columns={"datetime": "started_at_hourly"})

df_cosine.to_csv(f'{data_dir}/cosine_similarity.csv')

In [None]:
del df_cosine

In [None]:
# Join the per-hour embedding summary features onto every row of that hour.
embedding_features = df_embedding.rename(columns={"datetime": "started_at_hourly"})
df_concatenated = df_concatenated.merge(embedding_features, on=['started_at_hourly'], how="left")

# Every remaining missing value in the table becomes 0.
# NOTE(review): this also turns a missing gnn_label_cluster into 0, which is
# indistinguishable from the real k-means label 0 -- confirm this is intended.
df_concatenated = df_concatenated.fillna(0)

In [None]:
df_concatenated#.drop_duplicates(subset=['start_station_cluster', 'started_at_hourly'])

Unnamed: 0,start_station_cluster,started_at_year,started_at_month,started_at_day,started_at_hour,demand,started_at_week,started_at_quarter,started_at_dayofweek,started_at_daily,clusters,started_at_hourly,is_holiday,temp,dwpt,rhum,prcp,wdir,wspd,pres,coco,demand_lag_1_h,demand_lag_2_h,demand_lag_24_h,temp_lag_1_h,temp_lag_2_h,temp_lag_24_h,prcp_lag_1_h,prcp_lag_2_h,prcp_lag_24_h,rhum_lag_1_h,rhum_lag_2_h,rhum_lag_24_h,wspd_lag_1_h,wspd_lag_2_h,wspd_lag_24_h,coco_lag_1_h,coco_lag_2_h,coco_lag_24_h,gnn_label_cluster,mean_gnn_cluster_demand_1h,total_gnn_cluster_demand_1h,total_demand_1h,demand_degrees_1h,dim_mean_0,dim_mean_1,dim_mean_2,dim_mean_3,dim_mean_4,dim_mean_5,dim_mean_6,dim_mean_7,dim_mean_8,dim_mean_9,dim_mean_10,dim_mean_11,dim_mean_12,dim_mean_13,dim_mean_14,dim_mean_15,dim_mean_16,dim_mean_17,dim_mean_18,dim_mean_19,dim_mean_20,dim_mean_21,dim_mean_22,dim_mean_23,dim_mean_24,dim_mean_25,dim_mean_26,dim_mean_27,dim_mean_28,dim_mean_29,dim_mean_30,dim_mean_31,dim_mean_32,dim_mean_33,dim_mean_34,dim_mean_35,dim_mean_36,dim_mean_37,dim_mean_38,dim_mean_39,dim_mean_40,dim_mean_41,dim_mean_42,dim_mean_43,dim_mean_44,dim_mean_45,dim_mean_46,dim_mean_47,dim_mean_48,dim_mean_49,dim_var_0,dim_var_1,dim_var_2,dim_var_3,dim_var_4,dim_var_5,dim_var_6,dim_var_7,dim_var_8,dim_var_9,dim_var_10,dim_var_11,dim_var_12,dim_var_13,dim_var_14,dim_var_15,dim_var_16,dim_var_17,dim_var_18,dim_var_19,dim_var_20,dim_var_21,dim_var_22,dim_var_23,dim_var_24,dim_var_25,dim_var_26,dim_var_27,dim_var_28,dim_var_29,dim_var_30,dim_var_31,dim_var_32,dim_var_33,dim_var_34,dim_var_35,dim_var_36,dim_var_37,dim_var_38,dim_var_39,dim_var_40,dim_var_41,dim_var_42,dim_var_43,dim_var_44,dim_var_45,dim_var_46,dim_var_47,dim_var_48,dim_var_49
0,74,2022,12,31,0,0.0,52,4,5,2022-12-31,1,2022-12-31 00:00:00,False,16.1,14.5,90.0,0.0,220.0,31.7,1012.7,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000e+00
1,74,2022,12,31,1,0.0,52,4,5,2022-12-31,1,2022-12-31 01:00:00,False,15.6,14.5,93.0,0.3,240.0,40.7,1012.6,7,0.0,0.0,0.0,16.1,0.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,31.7,0.0,0.0,7.0,0.0,0.0,0.0,1.000000,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000e+00
2,74,2022,12,31,6,0.0,52,4,5,2022-12-31,1,2022-12-31 06:00:00,False,15.6,14.5,93.0,1.5,8.0,29.5,1012.9,9,0.0,0.0,0.0,15.6,15.6,0.0,0.8,0.5,0.0,93.0,90.0,0.0,33.5,33.5,0.0,8.0,8.0,0.0,0.0,1.000000,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000e+00
3,74,2022,12,31,7,0.0,52,4,5,2022-12-31,1,2022-12-31 07:00:00,False,15.0,14.4,96.0,2.5,230.0,18.4,1012.6,9,0.0,0.0,0.0,15.6,15.6,0.0,1.5,0.8,0.0,93.0,93.0,0.0,29.5,33.5,0.0,9.0,8.0,0.0,0.0,1.000000,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000e+00
4,74,2022,12,31,8,0.0,52,4,5,2022-12-31,1,2022-12-31 08:00:00,False,15.0,14.4,96.0,1.3,230.0,11.2,1012.2,9,0.0,0.0,0.0,15.0,15.6,0.0,2.5,1.5,0.0,96.0,93.0,0.0,18.4,29.5,0.0,9.0,9.0,0.0,0.0,1.000000,1.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422575,210,2023,6,16,19,0.0,24,2,4,2023-06-16,1,2023-06-16 19:00:00,False,21.1,14.0,64.0,0.0,40.0,13.0,1015.1,2,0.0,0.0,3.0,19.4,19.4,19.4,0.0,0.0,0.0,66.0,66.0,66.0,13.0,13.0,16.6,2.0,2.0,2.0,1.0,0.723684,55.0,544.0,104.0,-0.008475,0.061464,0.000079,0.000046,0.030358,-0.027691,-0.000828,0.112863,-0.089404,0.027710,-0.097900,-0.076161,0.024385,0.000283,-0.034884,0.000543,0.000022,0.024765,0.000900,-0.023377,-0.085218,-0.000607,0.009694,0.032163,0.000235,0.000447,0.000040,-0.000947,0.077103,0.031426,0.095915,-0.092456,0.000503,-0.000780,-0.001152,0.031540,0.000113,0.004603,-0.013007,0.001485,0.000390,0.087084,-0.000708,-0.027225,0.085981,-0.000301,0.000168,0.000809,0.001305,0.000048,0.000310,0.010909,2.029046e-07,3.574158e-07,0.003640,0.002950,0.000008,0.029104,0.019929,0.003070,0.023473,0.015390,0.002352,9.064717e-07,0.004420,2.288713e-06,5.561700e-08,0.002144,0.000005,0.002578,0.018730,3.455794e-06,0.000506,0.003809,3.928731e-07,3.491909e-07,1.151035e-08,1.462791e-06,0.016054,0.003719,0.021029,0.020729,3.757491e-06,0.000007,0.000008,0.003129,4.411302e-07,0.000144,0.000829,0.000017,1.593487e-06,0.020061,0.000006,0.002576,0.019587,7.937777e-07,3.848000e-07,0.000009,0.000013,9.104175e-07
1422576,210,2023,6,16,20,0.0,24,2,4,2023-06-16,1,2023-06-16 20:00:00,False,22.2,13.8,59.0,0.0,40.0,13.0,1014.9,2,0.0,0.0,6.0,21.1,19.4,22.2,0.0,0.0,0.0,64.0,66.0,55.0,13.0,13.0,20.5,2.0,2.0,2.0,1.0,0.365854,30.0,386.0,81.0,-0.006896,0.051452,0.000067,0.000186,0.024508,-0.022419,-0.000311,0.096665,-0.075832,0.022358,-0.083219,-0.064297,0.019693,0.000260,-0.028413,0.000304,0.000034,0.020319,0.000691,-0.017119,-0.072043,-0.000304,0.007550,0.026176,0.000294,0.000342,0.000026,-0.000724,0.064859,0.025473,0.082256,-0.078736,0.000128,-0.000309,-0.000963,0.026270,0.000117,0.003564,-0.010195,0.001143,0.000379,0.073384,-0.000570,-0.022364,0.072437,-0.000238,0.000278,0.000248,0.000641,-0.000156,0.000315,0.010826,1.977856e-07,3.100157e-07,0.003601,0.002920,0.000007,0.028942,0.019765,0.003041,0.023292,0.015282,0.002330,1.106908e-06,0.004371,2.069962e-06,8.501071e-09,0.002133,0.000006,0.002408,0.018566,3.270000e-06,0.000503,0.003770,5.272299e-07,2.676686e-07,4.752780e-09,1.097399e-06,0.015900,0.003676,0.020954,0.020598,3.368261e-06,0.000006,0.000009,0.003128,3.878822e-07,0.000147,0.000820,0.000018,2.014768e-06,0.019862,0.000006,0.002564,0.019388,8.128901e-07,5.884589e-07,0.000008,0.000012,8.642760e-07
1422577,210,2023,6,16,21,0.0,24,2,4,2023-06-16,1,2023-06-16 21:00:00,False,22.2,13.8,59.0,0.0,310.0,25.9,1014.9,2,0.0,0.0,2.0,22.2,21.1,22.8,0.0,0.0,0.0,59.0,64.0,57.0,13.0,13.0,20.5,2.0,2.0,2.0,1.0,0.392157,40.0,233.0,77.0,0.000638,0.007958,0.000254,0.000312,-0.000539,0.000153,0.000641,0.025494,-0.017070,-0.000668,-0.019429,-0.012599,-0.000481,-0.000233,-0.000816,-0.000266,0.000049,0.000970,-0.000358,0.002773,-0.015120,0.000382,-0.001853,0.000533,-0.000052,0.000154,0.000005,-0.000349,0.012198,0.000173,0.021616,-0.018704,-0.000519,0.000565,0.000396,0.002777,-0.000153,-0.001582,0.001762,-0.000697,-0.000292,0.014548,0.000545,-0.001144,0.014314,0.000182,-0.000098,-0.000755,-0.000600,-0.000436,0.000067,0.002426,3.772223e-08,1.027883e-07,0.000812,0.000656,0.000002,0.006463,0.004430,0.000684,0.005220,0.003419,0.000522,1.564356e-07,0.000985,5.017705e-07,6.735480e-09,0.000475,0.000001,0.000599,0.004169,8.300125e-07,0.000112,0.000848,6.114564e-08,8.556490e-08,2.625442e-09,3.617496e-07,0.003575,0.000829,0.004655,0.004604,9.771402e-07,0.000002,0.000002,0.000689,7.191660e-08,0.000031,0.000184,0.000004,2.742030e-07,0.004473,0.000001,0.000570,0.004367,1.220427e-07,5.908818e-08,0.000002,0.000004,2.636452e-07
1422578,210,2023,6,16,22,0.0,24,2,4,2023-06-16,1,2023-06-16 22:00:00,False,21.1,13.3,61.0,0.0,310.0,29.5,1014.7,2,0.0,0.0,2.0,22.2,22.2,22.2,0.0,0.0,0.0,59.0,59.0,53.0,25.9,13.0,18.4,2.0,2.0,2.0,2.0,7.555556,68.0,179.0,58.0,0.001050,0.005116,0.000278,0.000290,-0.002182,0.001606,0.000657,0.020790,-0.013217,-0.002174,-0.015217,-0.009224,-0.001776,-0.000235,0.000982,-0.000265,0.000059,-0.000244,-0.000373,0.004553,-0.011362,0.000376,-0.002434,-0.001127,-0.000059,0.000127,0.000005,-0.000293,0.008726,-0.001474,0.017662,-0.014761,-0.000520,0.000584,0.000405,0.001335,-0.000163,-0.001861,0.002529,-0.000785,-0.000292,0.010648,0.000579,0.000184,0.010450,0.000176,-0.000100,-0.000762,-0.000598,-0.000415,0.000067,0.002375,5.733103e-08,1.424185e-07,0.000795,0.000644,0.000002,0.006306,0.004331,0.000671,0.005097,0.003346,0.000513,1.852373e-07,0.000966,5.864285e-07,6.043458e-09,0.000468,0.000001,0.000548,0.004072,9.946856e-07,0.000110,0.000832,8.316022e-08,6.624020e-08,4.200183e-09,2.783110e-07,0.003494,0.000813,0.004551,0.004499,1.157589e-06,0.000002,0.000002,0.000681,7.690019e-08,0.000031,0.000181,0.000004,3.317247e-07,0.004368,0.000001,0.000562,0.004262,1.497390e-07,9.103316e-08,0.000003,0.000004,3.490897e-07


# Scaling

In [None]:
#df_concatenated.to_csv(f'{data_dir}/final_model_input_unscaled.csv')

In [None]:
# Encoder for the station-cluster ids; fitted further down in this cell.
le = preprocessing.LabelEncoder()

# NOTE(review): this dict is not populated anywhere in this cell (the only
# assignment to it below is commented out).
scalers = {}

# Column groups for the scalers. Lag columns follow the naming scheme
# "{variable}_lag_{hours}_h" with hours in (1, 2, 24).
_LAG_HOURS = (1, 2, 24)


def _lag_columns(variables):
    """Return the lag column names of ``variables``, grouped by variable."""
    return [f"{variable}_lag_{hours}_h" for variable in variables for hours in _LAG_HOURS]


_WEATHER_BASE_FEATURES = ["temp", "dwpt", "rhum", "prcp", "wdir", "wspd", "pres", "coco"]

_GNN_DEMAND_FEATURES = [
    "mean_gnn_cluster_demand_1h",
    "total_gnn_cluster_demand_1h",
    "total_demand_1h",
    "demand_degrees_1h",
]

# Calendar parts derived from the start timestamp.
SCALER_DATETIME_FEATURES = [
    f"started_at_{part}"
    for part in ("year", "month", "day", "hour", "week", "quarter", "dayofweek")
]

# Demand lags plus the demand summaries derived from the GNN snapshots.
SCALER_DEMAND_FEATURES = _lag_columns(["demand"]) + _GNN_DEMAND_FEATURES

# Current weather readings plus the lags of the lagged weather variables.
SCALER_WEATHER_FEATURES = _WEATHER_BASE_FEATURES + _lag_columns(
    ["temp", "prcp", "rhum", "wspd", "coco"]
)

# Combined list: weather (without the coco lags), demand and datetime features.
SCALER_FEATURES = (
    _WEATHER_BASE_FEATURES
    + _lag_columns(["temp", "prcp", "rhum", "wspd"])
    + SCALER_DEMAND_FEATURES
    + SCALER_DATETIME_FEATURES
)

# One MinMaxScaler per feature family; each maps its columns onto [1, 2].
# NOTE(review): the scalers are fitted on the complete table; if a train/test
# split happens later, test-period statistics influence the scaling -- confirm.
target_scaler = MinMaxScaler(feature_range=(1, 2))
demand_features_scaler = MinMaxScaler(feature_range=(1, 2))
weather_scaler = MinMaxScaler(feature_range=(1, 2))
datetime_features_scaler = MinMaxScaler(feature_range=(1, 2))

# Fit each scaler on its column group and overwrite the columns in place.
for family_scaler, family_columns in (
    (demand_features_scaler, SCALER_DEMAND_FEATURES),
    (weather_scaler, SCALER_WEATHER_FEATURES),
    (datetime_features_scaler, SCALER_DATETIME_FEATURES),
):
    df_concatenated[family_columns] = family_scaler.fit_transform(df_concatenated[family_columns])

# The scaled target goes into its own column; the raw 'demand' column is kept.
df_concatenated[['demand_target']] = target_scaler.fit_transform(df_concatenated[['demand']])

# Station-cluster ids become consecutive integer codes; the holiday flag becomes 0/1.
df_concatenated['start_station_cluster'] = le.fit_transform(df_concatenated['start_station_cluster'])
df_concatenated['is_holiday'] = df_concatenated['is_holiday'].astype(int)

# Persist every fitted scaler under models_dir so the same transformation can
# be re-applied (or inverted, for the target) outside this notebook.
filename = f'{models_dir}/target_scaler.sav'
joblib.dump(target_scaler, filename)

filename = f'{models_dir}/datetime_features_scaler.sav'
joblib.dump(datetime_features_scaler, filename)

filename = f'{models_dir}/demand_features_scaler.sav'
joblib.dump(demand_features_scaler, filename)

# Last expression of the cell: its return value (the written path) is what the
# cell displays.
filename = f'{models_dir}/weather_scaler.sav'
joblib.dump(weather_scaler, filename)

#filename = f'{models_dir}/start_station_cluster_scaler.sav'
#joblib.dump(start_station_cluster_scaler, filename)

['/content/drive/MyDrive/Thesis/models/weather_scaler.sav']

In [None]:

filename = f'{models_dir}/start_station_encoder.sav'
joblib.dump(le, filename)

['/content/drive/MyDrive/Thesis/models/start_station_encoder.sav']

In [None]:
#df_concatenated.to_csv(f'{data_dir}/final_model_input_partial_scale_4.csv')

In [None]:
df_concatenated['demand'].value_counts()

Unnamed: 0_level_0,count
demand,Unnamed: 1_level_1
0.0,782837
1.0,229889
2.0,120433
3.0,73065
4.0,48252
5.0,34032
6.0,24821
7.0,18673
8.0,14570
9.0,11262
