In [4]:
# Mount Google Drive at /content/drive so the notebook can read and write the
# thesis data stored there; force_remount=True is passed on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import pickle
import holidays

# Widen pandas' display limits (rows, columns, console width) for inspection cells.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Every file this notebook reads or writes sits under <directory>/Data.
directory = '/content/drive/MyDrive/Thesis'
data_dir = f"{directory}/Data"

# Station-id encoder, stored as a pickle next to the data.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open(f'{data_dir}/station_encoder.pkl', 'rb') as encoder_file:
    le = pickle.load(encoder_file)

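Note: the station encoder is loaded with pickle; see the scikit-learn notes on the security and maintainability limitations of pickled models: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations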


In [11]:
# Load the ride-level table, then discard the 'Unnamed: 0' column
# (presumably a stale index written by an earlier to_csv — confirm).
df = pd.read_csv(f"{data_dir}/df_final_dataframe.csv")
df = df.drop(columns=['Unnamed: 0'])

In [7]:
# Station reference table; its station_id column is replaced by the output of
# the encoder `le` loaded from station_encoder.pkl.
standard_stations = pd.read_csv(f"{data_dir}/standard_stations.csv")
standard_stations = standard_stations.assign(
    station_id=le.transform(standard_stations['station_id'])
)

In [None]:
# Centroid-to-centroid distance table. The distance-matrix cell further down
# reads its 'start_station_cluster', 'end_station_cluster' and 'distance' columns.
distance = pd.read_csv(f'{data_dir}/distance_matrix_centroids.csv')

In [None]:
#df = df[(df['start_station_cluster'] == 1) & (df['end_station_cluster'] == 1)].reset_index(drop=True)

In [None]:
# Encode both trip endpoints with the same encoder applied to standard_stations.
# NOTE(review): the ids are overwritten in place, so re-running this cell feeds
# already-encoded values back into `le` — presumably unsafe; confirm.
for endpoint_col in ('start_station_id', 'end_station_id'):
    df[endpoint_col] = le.transform(df[endpoint_col])

# cluster_stations values of the stations whose 'clusters' label equals 1.
wanted_clusters_stations = standard_stations.loc[
    standard_stations['clusters'] == 1, 'cluster_stations'
]

# Keep only trips that both start and end inside the wanted clusters.
starts_in_wanted = df['start_station_cluster'].isin(wanted_clusters_stations)
ends_in_wanted = df['end_station_cluster'].isin(wanted_clusters_stations)
df = df[starts_in_wanted & ends_in_wanted].reset_index(drop=True)

#distance['start_station_id'] = le.transform(distance['start_station_id'])

#distance['end_station_id'] = le.transform(distance['end_station_id'])


In [None]:
#distance

In [None]:
# Assemble an hour-resolution timestamp from the separate year/month/day/hour
# columns of the chosen timestamp prefix.
column_name = 'started_at'
conversion_dict_hourly = {
    part: df[f'{column_name}_{part}']
    for part in ('year', 'month', 'day', 'hour')
}
df[f'{column_name}_hourly'] = pd.to_datetime(conversion_dict_hourly)

In [None]:
df['end_station_cluster'].max()#.info()

In [None]:
df

In [None]:
# Attach, for each trip endpoint, the station-cluster label and the cluster
# centroid coordinates from standard_stations (left join on the encoded id).
#
# Fix: the latitude source column used to be renamed to "*_centroid_lng" and
# the longitude column to "*_centroid_lat". The output column names are
# unchanged, but each now holds the coordinate its name says.
centroid_source_cols = [
    'station_latitude_centroid_stations',
    'station_longitude_centroid_stations',
    'cluster_stations',
    'station_id',
]

# The same merge is needed for both endpoints, so build it once per prefix.
for endpoint in ('start', 'end'):
    endpoint_centroids = standard_stations[centroid_source_cols].rename(
        columns={
            'station_id': f"{endpoint}_station_id",
            'cluster_stations': f"{endpoint}_station_cluster_centroid",
            'station_latitude_centroid_stations': f"{endpoint}_station_centroid_lat",
            'station_longitude_centroid_stations': f"{endpoint}_station_centroid_lng",
        }
    )
    # how='left' keeps every trip; ids missing from standard_stations get NaN.
    df = df.merge(endpoint_centroids, on=[f"{endpoint}_station_id"], how='left')

In [None]:
df.info()

In [None]:
# Collapse individual rides into one row per (time slot, origin cluster,
# destination cluster): `demand` counts rides, `duration_sec` is their mean.
group_keys = [
    'started_at_hour',
    'started_at_quarter',
    'started_at_month',
    'started_at_year',
    'started_at_day',
    'start_station_cluster',
    'end_station_cluster',
    'started_at_dayofweek',
    'start_station_cluster_centroid',
    'end_station_cluster_centroid',
    'start_station_centroid_lng',
    'start_station_centroid_lat',
    'end_station_centroid_lng',
    'end_station_centroid_lat',
]

df_grouped = (
    df.groupby(group_keys)
      .agg(
          demand=('ride_id', 'count'),
          duration_sec=('duration_sec', 'mean'),
      )
      .reset_index()
)

In [None]:
#df_grouped.to_csv(f"{data_dir}/df_final_dataframe_grouped.csv")

In [None]:
# Sorted, de-duplicated array of every cluster label seen as a trip origin or
# destination; a label's position in it is its row/column index in the matrices.
origin_clusters = df_grouped['start_station_cluster'].unique()
destination_clusters = df_grouped['end_station_cluster'].unique()
all_clusters = np.union1d(origin_clusters, destination_clusters)

In [None]:
df_grouped['start_station_cluster']

In [None]:
all_clusters

In [None]:
#df_grouped[['start_station_cluster_centroid', 'end_station_cluster_centroid', 'demand']].value_counts()

In [None]:
del df

In [None]:
# Dense cluster-to-cluster distance matrix, indexed by position in all_clusters.
# Pairs absent from `distance` stay 0; rows whose cluster label is not in
# all_clusters select nothing and are ignored.
n_clusters = len(all_clusters)
distance_matrix = np.zeros((n_clusters, n_clusters))

for _, row in distance.iterrows():
    start_pos = np.flatnonzero(all_clusters == row['start_station_cluster'])
    end_pos = np.flatnonzero(all_clusters == row['end_station_cluster'])
    distance_matrix[start_pos, end_pos] = row['distance']

In [None]:
np.zeros((len(all_clusters), len(all_clusters))).shape

In [None]:
# Persist the distance matrix. The star-unpacking iterates the 2-D array along
# its first axis, so every row is passed to np.savez as a separate positional array.
# NOTE(review): if one 2-D array per file was intended, pass distance_matrix
# without the star — confirm against whatever loads this file.
np.savez(f'{data_dir}/distance_matrix', *distance_matrix)

In [None]:
# One origin-destination demand matrix per time slot, plus per-cluster features
# [year, month, day, hour, total outgoing demand] for that slot.
demand_graphs = []
features = []

slot_keys = [
    'started_at_year',
    'started_at_month',
    'started_at_day',
    'started_at_hour',
    'started_at_quarter',
]

for (year, month, day, hour, quarter), group in df_grouped.groupby(slot_keys):

    graph_matrix = np.zeros((len(all_clusters), len(all_clusters)))

    for _, row in group.iterrows():
        # Position of each centroid label in all_clusters; raises IndexError
        # when a label is missing from it.
        origin_pos = np.flatnonzero(all_clusters == row['start_station_cluster_centroid'])[0]
        destination_pos = np.flatnonzero(all_clusters == row['end_station_cluster_centroid'])[0]
        graph_matrix[origin_pos, destination_pos] = row['demand']

    # Same timestamp fields for every cluster; the last entry is the row sum.
    slot_features = [
        [year, month, day, hour, graph_matrix[node].sum()]
        for node in range(len(graph_matrix))
    ]
    features.append(slot_features)
    demand_graphs.append(graph_matrix)

In [None]:
# Persist the per-slot demand matrices and their feature lists; each list
# element is passed to np.savez as a separate positional array.
# NOTE(review): despite the '.pkl' in the names these are not pickle files;
# np.savez is documented to append '.npz' when the name does not already end
# with it, so the files on disk are presumably '*.pkl.npz' — confirm.
np.savez(f'{data_dir}/demand_graphs.pkl', *demand_graphs)
np.savez(f'{data_dir}/demand_graph_timestamps.pkl', *features)

In [None]:
del demand_graphs

In [None]:

# One origin-destination mean-duration matrix per time slot, plus per-cluster
# features [year, month, day, hour, outgoing demand, mean outgoing duration].
duration_graphs = []
features = []

slot_keys = [
    'started_at_year',
    'started_at_month',
    'started_at_day',
    'started_at_hour',
    'started_at_quarter',
]
n_clusters = len(all_clusters)

for (year, month, day, hour, quarter), group in df_grouped.groupby(slot_keys):

    duration_matrix = np.zeros((n_clusters, n_clusters))
    # Demand for THIS slot. The features used to read `graph_matrix`, a variable
    # left over from the demand cell that holds only the last slot's matrix, so
    # every slot got the same (wrong) demand totals.
    demand_matrix = np.zeros((n_clusters, n_clusters))

    for _, row in group.iterrows():
        pickup_idx = np.where(all_clusters == row['start_station_cluster_centroid'])[0][0]
        dropoff_idx = np.where(all_clusters == row['end_station_cluster_centroid'])[0][0]

        duration_matrix[pickup_idx, dropoff_idx] = row['duration_sec']
        demand_matrix[pickup_idx, dropoff_idx] = row['demand']

    # NOTE(review): the duration mean runs over the whole row, including
    # destinations with no rides (stored as 0) — confirm that is intended.
    features_list = [
        [year, month, day, hour, demand_matrix[i].sum(), duration_matrix[i].mean()]
        for i in range(n_clusters)
    ]
    features.append(features_list)
    duration_graphs.append(duration_matrix)

In [None]:
# Persist the per-slot duration matrices and their feature lists; each list
# element is passed to np.savez as a separate positional array.
# NOTE(review): despite the '.pkl' in the names these are not pickle files;
# np.savez is documented to append '.npz' when the name does not already end
# with it, so the files on disk are presumably '*.pkl.npz' — confirm.
np.savez(f'{data_dir}/duration_graphs.pkl', *duration_graphs)
np.savez(f'{data_dir}/duration_graph_timestamps.pkl', *features)

In [None]:
row['duration_sec']

In [None]:
# Spot-check the total demand of slot 15500.
# `demand_graphs` was deleted in an earlier cell to free memory, so indexing it
# here raised NameError on a top-to-bottom run. Read the matrix back from the
# archive written above instead: np.savez names positional arrays 'arr_<i>' and
# appends '.npz' to a file name that does not already end with it.
with np.load(f'{data_dir}/demand_graphs.pkl.npz') as saved_demand_graphs:
    slot_15500_total_demand = saved_demand_graphs['arr_15500'].sum()
slot_15500_total_demand

In [None]:
for i in duration_graphs[15500]:
  print(i)

In [None]:
duration_graphs

In [None]:
features[15500]#.sum()