In this file, I am going to create the dataset for the task.<br>
Before that, first, I need to calculate all the time-based features for the graphs.

In [2]:
import pandas as pd
import os
import numpy as np
import networkx as nx
import geopandas as gpd
import shapely
from datetime import datetime
import re
from tqdm import tqdm
import pickle

# Loading the data

In [46]:
# Load the raw ridership CSV, then show its shape and first rows as a sanity check.
ridership_data = pd.read_csv('data/l_ridership.csv')
print(ridership_data.shape)
ridership_data.head()

(42358, 7)


Unnamed: 0,station_id,stationame,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,monthtotal
0,40900,Howard,01/01/2001,6233.9,3814.5,2408.6,164447
1,41190,Jarvis,01/01/2001,1489.1,1054.0,718.0,40567
2,40100,Morse,01/01/2001,4412.5,3064.5,2087.8,119772
3,41300,Loyola,01/01/2001,4664.5,3156.0,1952.8,125008
4,40760,Granville,01/01/2001,3109.8,2126.0,1453.8,84189


In [47]:
# Cut-off date: only months on or after 1 Jan 2020 are kept further down.
targ_date = datetime.strptime('01/01/2020', r'%m/%d/%Y').date()

# Parse every 'month_beginning' string (MM/DD/YYYY) into a datetime.date so the
# column can be compared against targ_date.
ridership_data['month_beginning'] = [
    datetime.strptime(raw_date, r'%m/%d/%Y').date()
    for raw_date in ridership_data['month_beginning']
]
ridership_data.head()

Unnamed: 0,station_id,stationame,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,monthtotal
0,40900,Howard,2001-01-01,6233.9,3814.5,2408.6,164447
1,41190,Jarvis,2001-01-01,1489.1,1054.0,718.0,40567
2,40100,Morse,2001-01-01,4412.5,3064.5,2087.8,119772
3,41300,Loyola,2001-01-01,4664.5,3156.0,1952.8,125008
4,40760,Granville,2001-01-01,3109.8,2126.0,1453.8,84189


Converting the month total column into float

In [48]:
def convert_2_float(s):
    """Convert a formatted numeric value (e.g. ``"164,447"``) to a float.

    For strings, every character other than a digit or ``.`` is stripped
    before parsing, so thousands separators and whitespace are ignored
    (a leading minus sign is stripped as well).

    Parameters
    ----------
    s : str, number or None
        Raw cell value. Values that are already numeric are accepted too, so
        the function still works when the column was parsed as a number.

    Returns
    -------
    float
        The parsed value; ``0.0`` when the value is missing (``None``/NaN) or
        when the string contains no digits at all.
    """
    if s is None:
        return 0.0
    if not isinstance(s, str):
        # Already numeric: re.sub would raise TypeError on a non-string.
        value = float(s)
        # NaN is the only value that differs from itself; treat it as missing.
        return 0.0 if value != value else value
    new_val = re.sub(r"[^0-9\.]+", "", s)
    if new_val == '':
        return 0.0
    return float(new_val)

# Turn every 'monthtotal' entry into a float via convert_2_float.
ridership_data['monthtotal'] = ridership_data['monthtotal'].apply(convert_2_float)
ridership_data.head()

Unnamed: 0,station_id,stationame,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,monthtotal
0,40900,Howard,2001-01-01,6233.9,3814.5,2408.6,164447.0
1,41190,Jarvis,2001-01-01,1489.1,1054.0,718.0,40567.0
2,40100,Morse,2001-01-01,4412.5,3064.5,2087.8,119772.0
3,41300,Loyola,2001-01-01,4664.5,3156.0,1952.8,125008.0
4,40760,Granville,2001-01-01,3109.8,2126.0,1453.8,84189.0


In [49]:
# Keep only the rows dated on or after the cut-off date.
on_or_after_cutoff = ridership_data['month_beginning'] >= targ_date
ridership_filtered = ridership_data.loc[on_or_after_cutoff]
print(ridership_filtered.shape)
ridership_filtered.head()

(9885, 7)


Unnamed: 0,station_id,stationame,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,monthtotal
32473,40900,Howard,2020-01-01,4801.2,2852.0,2304.4,128557.0
32474,41190,Jarvis,2020-01-01,1359.3,1027.5,811.4,38071.0
32475,40100,Morse,2020-01-01,4119.4,2628.0,2091.8,111598.0
32476,41300,Loyola,2020-01-01,4869.3,3517.0,2590.0,134143.0
32477,40760,Granville,2020-01-01,3533.1,1619.8,1392.4,91169.0


Recording times when a station was first added.

In [50]:
# Earliest record of every station within the filtered period.
# groupby().first() would return the first non-null value of each column in
# file order, which is only the earliest month if the CSV happens to be sorted
# and has no gaps. Sort by month explicitly and keep each station's first whole
# row instead, so name, month and total always come from the same record.
first_date = (
    ridership_filtered
    .sort_values('month_beginning', kind='stable')
    .drop_duplicates(subset='station_id', keep='first')
    .set_index('station_id')
    .sort_index()
    [['stationame', 'month_beginning', 'monthtotal']]
)
first_date

Unnamed: 0_level_0,stationame,month_beginning,monthtotal
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
40010,Austin-Forest Park,2020-01-01,40945.0
40020,Harlem-Lake,2020-01-01,86701.0
40030,Pulaski-Lake,2020-01-01,32776.0
40040,Quincy/Wells,2020-01-01,188175.0
40050,Davis,2020-01-01,88595.0
...,...,...,...
41670,Conservatory,2020-01-01,23791.0
41680,Oakton-Skokie,2020-01-01,21068.0
41690,Cermak-McCormick Place,2020-01-01,38838.0
41700,Washington/Wabash,2020-01-01,238477.0


In [51]:
# Stations whose first record is later than 1 Jan 2020, i.e. opened after the cut-off.
opened_after_cutoff = first_date['month_beginning'] > targ_date
new_stations = first_date.loc[opened_after_cutoff]
print(new_stations.shape)
new_stations.head()

(1, 3)


Unnamed: 0_level_0,stationame,month_beginning,monthtotal
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
41710,Damen-Lake,2024-07-01,0.0


Only Damen-Lake was newly added after January 2020; its first record is July 2024. On closer inspection, however, it had zero riders in that month: ridership only starts the following month, August 2024, so August 1, 2024 is its correct start date.

In [53]:
# All rows whose monthly total is exactly zero.
has_no_riders = ridership_filtered['monthtotal'].eq(0.0)
zero_rows = ridership_filtered.loc[has_no_riders]
print(zero_rows.shape)
zero_rows.head()

(101, 7)


Unnamed: 0,station_id,stationame,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,monthtotal
34911,40340,Berwyn,2021-06-01,0,0,0,0.0
34913,40770,Lawrence,2021-06-01,0,0,0,0.0
35054,40340,Berwyn,2021-07-01,0,0,0,0.0
35056,40770,Lawrence,2021-07-01,0,0,0,0.0
35197,40340,Berwyn,2021-08-01,0,0,0,0.0


There are 101 such rows. Next, check how many zero-ridership months each station has. If a station has more than one, the station was out of service, and it needs to be removed from the graph for the corresponding dates.

In [56]:
# Number of distinct zero-ridership months for each station name.
zero_counts = zero_rows.groupby('stationame')['month_beginning'].nunique()
zero_counts

stationame
Berwyn        49
Damen-Lake     1
Lawrence      49
Racine         2
Name: month_beginning, dtype: int64

In [61]:
# Earliest zero-ridership month for each station name.
zero_rows.groupby('stationame')['month_beginning'].min()

stationame
Berwyn        2021-06-01
Damen-Lake    2024-07-01
Lawrence      2021-06-01
Racine        2023-08-01
Name: month_beginning, dtype: object

In [64]:
# Latest zero-ridership month for each station name.
zero_rows.groupby('stationame')['month_beginning'].max()

stationame
Berwyn        2025-06-01
Damen-Lake    2024-07-01
Lawrence      2025-06-01
Racine        2023-09-01
Name: month_beginning, dtype: object

We can ignore Damen-Lake, but the other three stations need to be considered.
- Racine: this station was closed during August and September 2023.
- Berwyn: closed from June 2021 through June 2025 (zero ridership in all 49 months, June 2025 included).
- Lawrence: same as Berwyn.

Removing these rows from the counts

In [70]:
# Drop every station-month whose total is zero (station closed / not yet open).
has_riders = ridership_filtered['monthtotal'].ne(0.0)
ridership_filtered = ridership_filtered.loc[has_riders]
print(ridership_filtered.shape)
ridership_filtered.head()

(9784, 7)


Unnamed: 0,station_id,stationame,month_beginning,avg_weekday_rides,avg_saturday_rides,avg_sunday-holiday_rides,monthtotal
32473,40900,Howard,2020-01-01,4801.2,2852.0,2304.4,128557.0
32474,41190,Jarvis,2020-01-01,1359.3,1027.5,811.4,38071.0
32475,40100,Morse,2020-01-01,4119.4,2628.0,2091.8,111598.0
32476,41300,Loyola,2020-01-01,4869.3,3517.0,2590.0,134143.0
32477,40760,Granville,2020-01-01,3533.1,1619.8,1392.4,91169.0


In [73]:
# Write the filtered ridership table (zero-total rows removed) to disk;
# index=False leaves the DataFrame index out of the file.
ridership_filtered.to_csv('data/l_ridership_filtered.csv', index=False)

# Creating adjacency matrices for each of these dates.

There will be the following adjacency matrices:
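1. From June 2021: Berwyn and Lawrence removed
2. From August 2023: Racine removed
3. From October 2023: Racine added back (September 2023 still has zero ridership; this period is labelled `sep23` in the code below)
4. From August 2024: Damen-Lake added
5. From July 2025: Berwyn and Lawrence added back (June 2025 still has zero ridership; this period is labelled `june25` in the code below)
6. Before June 2021: Everything present except Damen-Lake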

In [3]:
# Load the latest adjacency matrix; index_col=0 uses the first CSV column
# (station names) as the row index so rows and columns share the same labels.
adj_df = pd.read_csv('data/graph/cta_adj_mat_final.csv', index_col=0)
print(adj_df.shape)
adj_df.head()

(144, 144)


Unnamed: 0,Cumberland,Oak Park-Lake,Austin-Lake,Ashland-Lake,Clinton-Lake,Grand/Milwaukee,Chicago/Milwaukee,Damen/Milwaukee,California/Milwaukee,Belmont-O'Hare,...,Southport,Adams/Wabash,Jackson/State,Rosemont,Harlem-Lake,Division/Milwaukee,95th/Dan Ryan,63rd-Dan Ryan,Garfield-Dan Ryan,Damen-Lake
Cumberland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Oak Park-Lake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Austin-Lake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ashland-Lake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Clinton-Lake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The above matrix is the latest one.<br>
1. Creating adj_df for 'from June 2021'

In [147]:
# Work on a copy so the latest matrix (adj_df) stays untouched.
adj_df_from_june21 = adj_df.copy()
adj_df_from_june21.shape

(144, 144)

In [148]:
# Berwyn is removed: link its two neighbours (Argyle and Bryn Mawr) directly,
# in both directions so the matrix stays symmetric.
for src, dst in (('Argyle', 'Bryn Mawr'), ('Bryn Mawr', 'Argyle')):
    adj_df_from_june21.loc[src, dst] = 1.0

# Then drop Berwyn's own row and column.
adj_df_from_june21 = adj_df_from_june21.drop(index='Berwyn', columns='Berwyn')

In [149]:
# Lawrence is removed: link its two neighbours (Argyle and Wilson) directly,
# in both directions so the matrix stays symmetric.
for src, dst in (('Argyle', 'Wilson'), ('Wilson', 'Argyle')):
    adj_df_from_june21.loc[src, dst] = 1.0

# Then drop Lawrence's own row and column.
adj_df_from_june21 = adj_df_from_june21.drop(index='Lawrence', columns='Lawrence')

In [150]:
# Damen-Lake is removed as well: link California-Lake and Ashland-Lake
# directly, in both directions.
for src, dst in (('California-Lake', 'Ashland-Lake'), ('Ashland-Lake', 'California-Lake')):
    adj_df_from_june21.loc[src, dst] = 1.0

# Then drop Damen-Lake's own row and column.
adj_df_from_june21 = adj_df_from_june21.drop(index='Damen-Lake', columns='Damen-Lake')

In [151]:
# Save the 'from June 2021' adjacency matrix to disk (station names are kept as the index column).
adj_df_from_june21.to_csv('data/graph/adj_mat_june21.csv')

2. Creating adj_df for August 2023

In [152]:
# Start from a copy of the 'from June 2021' matrix, so Berwyn, Lawrence and
# Damen-Lake are already absent.
adj_df_from_aug23 = adj_df_from_june21.copy()
adj_df_from_aug23.shape

(141, 141)

In [153]:
# Racine is removed: link its two neighbours (Illinois Medical District and
# UIC-Halsted) directly, in both directions.
for src, dst in (
    ('Illinois Medical District', 'UIC-Halsted'),
    ('UIC-Halsted', 'Illinois Medical District'),
):
    adj_df_from_aug23.loc[src, dst] = 1.0

# Then drop Racine's own row and column.
adj_df_from_aug23 = adj_df_from_aug23.drop(index='Racine', columns='Racine')

In [154]:
# Save the 'from August 2023' adjacency matrix (Racine removed) to disk.
adj_df_from_aug23.to_csv('data/graph/adj_mat_aug23.csv')

3. The adjacency matrix after September 2023 is the same as the one from June 2021.
4. Creating adj_df for August 24.

In [155]:
# Start from a copy of the 'from June 2021' matrix (Berwyn, Lawrence and
# Damen-Lake absent, Racine present).
adj_df_from_aug24 = adj_df_from_june21.copy()
adj_df_from_aug24.shape

(141, 141)

In [156]:
# Add Damen-Lake as an isolated node first: an all-zero column, then an
# all-zero row (the row is one entry longer because the column already exists).
n_existing_stations = adj_df_from_aug24.shape[0]
adj_df_from_aug24['Damen-Lake'] = np.zeros(n_existing_stations)
adj_df_from_aug24.loc['Damen-Lake'] = np.zeros(adj_df_from_aug24.shape[1])

# Connect Damen-Lake to both of its neighbours, in both directions.
for neighbour in ('Ashland-Lake', 'California-Lake'):
    adj_df_from_aug24.loc[neighbour, 'Damen-Lake'] = 1.0
    adj_df_from_aug24.loc['Damen-Lake', neighbour] = 1.0

# The direct California-Lake <-> Ashland-Lake link no longer exists.
adj_df_from_aug24.loc['California-Lake', 'Ashland-Lake'] = 0.0
adj_df_from_aug24.loc['Ashland-Lake', 'California-Lake'] = 0.0

In [157]:
# Save the 'from August 2024' adjacency matrix (Damen-Lake added) to disk.
adj_df_from_aug24.to_csv('data/graph/adj_mat_aug24.csv')

5. We already have adj_df for after June 2025
6. For before june 2021

In [4]:
# Start from a copy of the latest matrix so adj_df itself stays untouched.
adj_df_before_june21 = adj_df.copy()
adj_df_before_june21.shape

(144, 144)

In [5]:
# Damen-Lake is removed: link California-Lake and Ashland-Lake directly,
# in both directions.
for src, dst in (('California-Lake', 'Ashland-Lake'), ('Ashland-Lake', 'California-Lake')):
    adj_df_before_june21.loc[src, dst] = 1.0

# Then drop Damen-Lake's own row and column.
adj_df_before_june21 = adj_df_before_june21.drop(index='Damen-Lake', columns='Damen-Lake')

In [19]:
# Sanity check: one station fewer than the latest matrix after dropping Damen-Lake.
adj_df_before_june21.shape

(143, 143)

In [8]:
# Save the 'before June 2021' adjacency matrix (everything except Damen-Lake) to disk.
adj_df_before_june21.to_csv('data/graph/adj_mat_before_june21.csv')

# Creating the Structural features for graph for each adjacency matrix again as it is time-based.

We have 5 different graphs. We need to create spatial features again for each of those. The reason why we have only 5 for all the months is that across many months, the graph stays the same.

In [9]:
# Reload every saved adjacency matrix from disk; the first CSV column holds the
# station names and becomes the row index.
(
    adj_df,
    adj_df_from_june21,
    adj_df_before_june21,
    adj_df_from_aug23,
    adj_df_from_aug24,
) = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/graph/cta_adj_mat_final.csv',
        'data/graph/adj_mat_june21.csv',
        'data/graph/adj_mat_before_june21.csv',
        'data/graph/adj_mat_aug23.csv',
        'data/graph/adj_mat_aug24.csv',
    )
)

In [20]:
# Sanity check on the matrix that was just reloaded from disk.
adj_df_before_june21.shape

(143, 143)

In [10]:
# One undirected graph per period. The Sep 23 graph is built from the
# 'from June 2021' matrix, and the June 25 graph from the latest matrix.
(
    G_before_june21,
    G_from_june21,
    G_from_aug23,
    G_from_sep23,
    G_from_aug24,
    G_from_june25,
) = map(
    nx.from_pandas_adjacency,
    (
        adj_df_before_june21,
        adj_df_from_june21,
        adj_df_from_aug23,
        adj_df_from_june21,
        adj_df_from_aug24,
        adj_df,
    ),
)

# Node count of every graph, in the same order as above.
tuple(
    graph.number_of_nodes()
    for graph in (
        G_before_june21,
        G_from_june21,
        G_from_aug23,
        G_from_sep23,
        G_from_aug24,
        G_from_june25,
    )
)

(143, 141, 140, 141, 142, 144)

### Calculating the structural features, appending them and saving
#### 1. For June 21

In [160]:
# Structural measures for the graph valid from June 2021.
# degree centrality
degree_cent_measure_june21 = nx.degree_centrality(G_from_june21)
# closeness centrality
closeness_cent_measure_june21 = nx.closeness_centrality(G_from_june21)
# betweenness centrality (computed here; not part of the feature list assembled later)
betweenness_cent_measure_june21 = nx.betweenness_centrality(G_from_june21)
# eigenvector centrality
eigenvector_cent_measure_june21 = nx.eigenvector_centrality(G_from_june21, max_iter=500)
# katz centrality -- same alpha/max_iter as the Aug 24, June 25 and
# before-June 21 cells, so the feature is on the same scale in every period
# (this cell previously used alpha=0.2)
katz_cent_measure_june21 = nx.katz_centrality(G_from_june21, alpha=0.1, max_iter=1000)
# degree
degree_dict_june21 = dict(G_from_june21.degree)
# is node an interchange: 1 when more than two neighbours, else 0
is_interchange_june21 = {n: int(d > 2) for n, d in degree_dict_june21.items()}
# is the node a terminal station: 1 when exactly one neighbour, else 0
is_terminal_june21 = {n: int(d == 1) for n, d in degree_dict_june21.items()}

# setting howard to 1 (manual override of the degree-based rule)
is_terminal_june21["Howard"] = 1

In [161]:
# Number of lines serving each station; every node starts at 1.
station_lines_june21 = dict.fromkeys(G_from_june21.nodes(), 1)

In [162]:
# Stations served by more than one line -> number of lines stopping there.
# NOTE(review): keys that are not nodes of the graph are skipped silently;
# confirm every key matches the adjacency-matrix spelling (the ridership table
# spells one station 'Quincy/Wells').
multiple = {
    'Ashland-Lake':2, 'Morgan':2, 'Clinton-Lake':2,
    'Clark/Lake':6, 'State/Lake':5, 'Washington/Wabash':5, 'Adams/Wabash':5, 'Library':4, 'LaSalle/Van Buren':4,
    'Quincy':4, 'Washington/Wells':4,
    'Roosevelt/Wabash':2,
    'Merchandise Mart':2, 'Sedgwick':2, 'Chicago/Franklin':2, 'Armitage':2, 'Diversey':2, 'Wellington':2,
    'Fullerton':3, 'Belmont-North Main':3, 'Wilson':2, 'Howard':3
}
# Overwrite the default count of 1 for every listed station present in the graph.
for stn, n_lines in multiple.items():
    if stn in station_lines_june21:
        station_lines_june21[stn] = n_lines

Concatenating all of these to create a matrix

In [163]:
# Per-station feature list, in this fixed order:
# [degree centrality, closeness centrality, eigenvector centrality,
#  katz centrality, degree, interchange flag, terminal flag, line count]
spatial_features_jun21 = {}
for stn in tqdm(G_from_june21.nodes()):
    spatial_features_jun21[stn] = [
        degree_cent_measure_june21[stn],
        closeness_cent_measure_june21[stn],
        eigenvector_cent_measure_june21[stn],
        katz_cent_measure_june21[stn],
        degree_dict_june21[stn],
        is_interchange_june21[stn],
        is_terminal_june21[stn],
        station_lines_june21[stn],
    ]

100%|██████████| 141/141 [00:00<00:00, 309794.06it/s]


In [164]:
# Pickle the {station name: feature list} dict for the June 2021 period.
with open('data/graph/spatial/spatial_jun21.pkl', 'wb') as f:
    pickle.dump(spatial_features_jun21, f)

#### 2. For Aug 23

In [167]:
# Structural features for the graph valid from August 2023 (Racine removed).
# No visible cell defines `station_lines_aug23` or `spatial_features_aug23`,
# so on a fresh kernel this section fails with NameError. The steps are rebuilt
# here with the same recipe the other periods use.
# NOTE(review): the katz parameters of the original Aug 23 run are not visible;
# alpha=0.1 / max_iter=1000 matches the Aug 24, June 25 and before-June 21
# cells -- confirm.
degree_cent_measure_aug23 = nx.degree_centrality(G_from_aug23)
closeness_cent_measure_aug23 = nx.closeness_centrality(G_from_aug23)
# betweenness is computed for parity with the other periods; it is not part of
# the feature list built below
betweenness_cent_measure_aug23 = nx.betweenness_centrality(G_from_aug23)
eigenvector_cent_measure_aug23 = nx.eigenvector_centrality(G_from_aug23, max_iter=500)
katz_cent_measure_aug23 = nx.katz_centrality(G_from_aug23, alpha=0.1, max_iter=1000)
degree_dict_aug23 = dict(G_from_aug23.degree)
# interchange: more than two neighbours; terminal: exactly one neighbour
is_interchange_aug23 = {n: int(d > 2) for n, d in degree_dict_aug23.items()}
is_terminal_aug23 = {n: int(d == 1) for n, d in degree_dict_aug23.items()}
# setting howard to 1 (manual override, as in the other periods)
is_terminal_aug23["Howard"] = 1

# number of lines passing; every node starts at 1
station_lines_aug23 = dict.fromkeys(G_from_aug23.nodes(), 1)

# Stations served by more than one line -> number of lines stopping there.
# NOTE(review): keys that are not nodes of the graph are skipped silently;
# confirm every key matches the adjacency-matrix spelling.
multiple = {
    'Ashland-Lake':2, 'Morgan':2, 'Clinton-Lake':2,
    'Clark/Lake':6, 'State/Lake':5, 'Washington/Wabash':5, 'Adams/Wabash':5, 'Library':4, 'LaSalle/Van Buren':4,
    'Quincy':4, 'Washington/Wells':4,
    'Roosevelt/Wabash':2,
    'Merchandise Mart':2, 'Sedgwick':2, 'Chicago/Franklin':2, 'Armitage':2, 'Diversey':2, 'Wellington':2,
    'Fullerton':3, 'Belmont-North Main':3, 'Wilson':2, 'Howard':3
}
for stn, n_lines in multiple.items():
    if stn in station_lines_aug23:
        station_lines_aug23[stn] = n_lines

# Per-station feature list, same order as in the other periods:
# [degree centrality, closeness centrality, eigenvector centrality,
#  katz centrality, degree, interchange flag, terminal flag, line count]
spatial_features_aug23 = {}
for stn in tqdm(G_from_aug23.nodes()):
    spatial_features_aug23[stn] = [
        degree_cent_measure_aug23[stn],
        closeness_cent_measure_aug23[stn],
        eigenvector_cent_measure_aug23[stn],
        katz_cent_measure_aug23[stn],
        degree_dict_aug23[stn],
        is_interchange_aug23[stn],
        is_terminal_aug23[stn],
        station_lines_aug23[stn],
    ]

100%|██████████| 140/140 [00:00<00:00, 239479.02it/s]


In [169]:
# Pickle the {station name: feature list} dict for the August 2023 period.
with open('data/graph/spatial/spatial_aug23.pkl', 'wb') as f:
    pickle.dump(spatial_features_aug23, f)

#### 3. For Sep 23

In [170]:
# Structural measures for the graph labelled Sep 23 (Racine back in service).
# degree centrality
degree_cent_measure_sep23 = nx.degree_centrality(G_from_sep23)
# closeness centrality
closeness_cent_measure_sep23 = nx.closeness_centrality(G_from_sep23)
# betweenness centrality (computed here; not part of the feature list assembled later)
betweenness_cent_measure_sep23 = nx.betweenness_centrality(G_from_sep23)
# eigenvector centrality
eigenvector_cent_measure_sep23 = nx.eigenvector_centrality(G_from_sep23, max_iter=500)
# katz centrality -- same alpha/max_iter as the Aug 24, June 25 and
# before-June 21 cells, so the feature is on the same scale in every period
# (this cell previously used alpha=0.2)
katz_cent_measure_sep23 = nx.katz_centrality(G_from_sep23, alpha=0.1, max_iter=1000)
# degree
degree_dict_sep23 = dict(G_from_sep23.degree)
# is node an interchange: 1 when more than two neighbours, else 0
is_interchange_sep23 = {n: int(d > 2) for n, d in degree_dict_sep23.items()}
# is the node a terminal station: 1 when exactly one neighbour, else 0
is_terminal_sep23 = {n: int(d == 1) for n, d in degree_dict_sep23.items()}

# setting howard to 1 (manual override of the degree-based rule)
is_terminal_sep23["Howard"] = 1

In [171]:
# Number of lines serving each station; every node starts at 1.
station_lines_sep23 = dict.fromkeys(G_from_sep23.nodes(), 1)

In [172]:
# Stations served by more than one line -> number of lines stopping there.
# NOTE(review): keys that are not nodes of the graph are skipped silently;
# confirm every key matches the adjacency-matrix spelling.
multiple = {
    'Ashland-Lake':2, 'Morgan':2, 'Clinton-Lake':2,
    'Clark/Lake':6, 'State/Lake':5, 'Washington/Wabash':5, 'Adams/Wabash':5, 'Library':4, 'LaSalle/Van Buren':4,
    'Quincy':4, 'Washington/Wells':4,
    'Roosevelt/Wabash':2,
    'Merchandise Mart':2, 'Sedgwick':2, 'Chicago/Franklin':2, 'Armitage':2, 'Diversey':2, 'Wellington':2,
    'Fullerton':3, 'Belmont-North Main':3, 'Wilson':2, 'Howard':3
}
# Overwrite the default count of 1 for every listed station present in the graph.
for stn, n_lines in multiple.items():
    if stn in station_lines_sep23:
        station_lines_sep23[stn] = n_lines

In [173]:
# Per-station feature list, in this fixed order:
# [degree centrality, closeness centrality, eigenvector centrality,
#  katz centrality, degree, interchange flag, terminal flag, line count]
spatial_features_sep23 = {}
for stn in tqdm(G_from_sep23.nodes()):
    spatial_features_sep23[stn] = [
        degree_cent_measure_sep23[stn],
        closeness_cent_measure_sep23[stn],
        eigenvector_cent_measure_sep23[stn],
        katz_cent_measure_sep23[stn],
        degree_dict_sep23[stn],
        is_interchange_sep23[stn],
        is_terminal_sep23[stn],
        station_lines_sep23[stn],
    ]

100%|██████████| 141/141 [00:00<00:00, 304060.08it/s]


In [174]:
# Pickle the {station name: feature list} dict for the period labelled Sep 23.
with open('data/graph/spatial/spatial_sep23.pkl', 'wb') as f:
    pickle.dump(spatial_features_sep23, f)

#### 4. For Aug 24

In [175]:
# Structural measures for the graph valid from August 2024 (Damen-Lake added).
# degree centrality
degree_cent_measure_aug24 = nx.degree_centrality(G_from_aug24)
# closeness centrality
closeness_cent_measure_aug24 = nx.closeness_centrality(G_from_aug24)
# betweenness centrality (computed here; not part of the feature list assembled later)
betweenness_cent_measure_aug24 = nx.betweenness_centrality(G_from_aug24)
# eigenvector centrality
eigenvector_cent_measure_aug24 = nx.eigenvector_centrality(G_from_aug24, max_iter=500)
# katz centrality
katz_cent_measure_aug24 = nx.katz_centrality(G_from_aug24, alpha=0.1, max_iter=1000)
# degree
degree_dict_aug24 = dict(G_from_aug24.degree)
# is node an interchange: 1 when more than two neighbours, else 0
is_interchange_aug24 = {n: int(d > 2) for n, d in degree_dict_aug24.items()}
# is the node a terminal station: 1 when exactly one neighbour, else 0
is_terminal_aug24 = {n: int(d == 1) for n, d in degree_dict_aug24.items()}

# setting howard to 1 (manual override of the degree-based rule)
is_terminal_aug24["Howard"] = 1

In [176]:
# Number of lines serving each station; every node starts at 1.
station_lines_aug24 = dict.fromkeys(G_from_aug24.nodes(), 1)

In [177]:
# Stations served by more than one line -> number of lines stopping there.
# NOTE(review): keys that are not nodes of the graph are skipped silently;
# confirm every key matches the adjacency-matrix spelling.
multiple = {
    'Ashland-Lake':2, 'Morgan':2, 'Clinton-Lake':2,
    'Clark/Lake':6, 'State/Lake':5, 'Washington/Wabash':5, 'Adams/Wabash':5, 'Library':4, 'LaSalle/Van Buren':4,
    'Quincy':4, 'Washington/Wells':4,
    'Roosevelt/Wabash':2,
    'Merchandise Mart':2, 'Sedgwick':2, 'Chicago/Franklin':2, 'Armitage':2, 'Diversey':2, 'Wellington':2,
    'Fullerton':3, 'Belmont-North Main':3, 'Wilson':2, 'Howard':3
}
# Overwrite the default count of 1 for every listed station present in the graph.
for stn, n_lines in multiple.items():
    if stn in station_lines_aug24:
        station_lines_aug24[stn] = n_lines

In [178]:
# Per-station feature list, in this fixed order:
# [degree centrality, closeness centrality, eigenvector centrality,
#  katz centrality, degree, interchange flag, terminal flag, line count]
spatial_features_aug24 = {}
for stn in tqdm(G_from_aug24.nodes()):
    spatial_features_aug24[stn] = [
        degree_cent_measure_aug24[stn],
        closeness_cent_measure_aug24[stn],
        eigenvector_cent_measure_aug24[stn],
        katz_cent_measure_aug24[stn],
        degree_dict_aug24[stn],
        is_interchange_aug24[stn],
        is_terminal_aug24[stn],
        station_lines_aug24[stn],
    ]

100%|██████████| 142/142 [00:00<00:00, 322115.29it/s]


In [179]:
# Pickle the {station name: feature list} dict for the August 2024 period.
with open('data/graph/spatial/spatial_aug24.pkl', 'wb') as f:
    pickle.dump(spatial_features_aug24, f)

#### 5. For June 25

In [180]:
# Structural measures for the graph labelled June 25 (latest adjacency matrix).
# degree centrality
degree_cent_measure_june25 = nx.degree_centrality(G_from_june25)
# closeness centrality
closeness_cent_measure_june25 = nx.closeness_centrality(G_from_june25)
# betweenness centrality (computed here; not part of the feature list assembled later)
betweenness_cent_measure_june25 = nx.betweenness_centrality(G_from_june25)
# eigenvector centrality
eigenvector_cent_measure_june25 = nx.eigenvector_centrality(G_from_june25, max_iter=500)
# katz centrality
katz_cent_measure_june25 = nx.katz_centrality(G_from_june25, alpha=0.1, max_iter=1000)
# degree
degree_dict_june25 = dict(G_from_june25.degree)
# is node an interchange: 1 when more than two neighbours, else 0
is_interchange_june25 = {n: int(d > 2) for n, d in degree_dict_june25.items()}
# is the node a terminal station: 1 when exactly one neighbour, else 0
is_terminal_june25 = {n: int(d == 1) for n, d in degree_dict_june25.items()}

# setting howard to 1 (manual override of the degree-based rule)
is_terminal_june25["Howard"] = 1

In [181]:
# Number of lines serving each station; every node starts at 1.
station_lines_june25 = dict.fromkeys(G_from_june25.nodes(), 1)

In [182]:
# Stations served by more than one line -> number of lines stopping there.
# NOTE(review): keys that are not nodes of the graph are skipped silently;
# confirm every key matches the adjacency-matrix spelling.
multiple = {
    'Ashland-Lake':2, 'Morgan':2, 'Clinton-Lake':2,
    'Clark/Lake':6, 'State/Lake':5, 'Washington/Wabash':5, 'Adams/Wabash':5, 'Library':4, 'LaSalle/Van Buren':4,
    'Quincy':4, 'Washington/Wells':4,
    'Roosevelt/Wabash':2,
    'Merchandise Mart':2, 'Sedgwick':2, 'Chicago/Franklin':2, 'Armitage':2, 'Diversey':2, 'Wellington':2,
    'Fullerton':3, 'Belmont-North Main':3, 'Wilson':2, 'Howard':3
}
# Overwrite the default count of 1 for every listed station present in the graph.
for stn, n_lines in multiple.items():
    if stn in station_lines_june25:
        station_lines_june25[stn] = n_lines

In [183]:
# Per-station feature list, in this fixed order:
# [degree centrality, closeness centrality, eigenvector centrality,
#  katz centrality, degree, interchange flag, terminal flag, line count]
spatial_features_june25 = {}
for stn in tqdm(G_from_june25.nodes()):
    spatial_features_june25[stn] = [
        degree_cent_measure_june25[stn],
        closeness_cent_measure_june25[stn],
        eigenvector_cent_measure_june25[stn],
        katz_cent_measure_june25[stn],
        degree_dict_june25[stn],
        is_interchange_june25[stn],
        is_terminal_june25[stn],
        station_lines_june25[stn],
    ]

100%|██████████| 144/144 [00:00<00:00, 432340.57it/s]


In [184]:
# Pickle the {station name: feature list} dict for the period labelled June 25.
with open('data/graph/spatial/spatial_june25.pkl', 'wb') as f:
    pickle.dump(spatial_features_june25, f)

#### 6. Before June 21

In [11]:
# Structural measures for the graph valid before June 2021.
# degree centrality
degree_cent_measure_before_june21 = nx.degree_centrality(G_before_june21)
# closeness centrality
closeness_cent_measure_before_june21 = nx.closeness_centrality(G_before_june21)
# betweenness centrality (computed here; not part of the feature list assembled later)
betweenness_cent_measure_before_june21 = nx.betweenness_centrality(G_before_june21)
# eigenvector centrality
eigenvector_cent_measure_before_june21 = nx.eigenvector_centrality(G_before_june21, max_iter=500)
# katz centrality
katz_cent_measure_before_june21 = nx.katz_centrality(G_before_june21, alpha=0.1, max_iter=1000)
# degree
degree_dict_before_june21 = dict(G_before_june21.degree)
# is node an interchange: 1 when more than two neighbours, else 0
is_interchange_before_june21 = {n: int(d > 2) for n, d in degree_dict_before_june21.items()}
# is the node a terminal station: 1 when exactly one neighbour, else 0
is_terminal_before_june21 = {n: int(d == 1) for n, d in degree_dict_before_june21.items()}

# setting howard to 1 (manual override of the degree-based rule)
is_terminal_before_june21["Howard"] = 1

In [14]:
# Number of lines serving each station; every node starts at 1.
station_lines_before_june21 = dict.fromkeys(G_before_june21.nodes(), 1)

In [15]:
# Stations served by more than one line -> number of lines stopping there.
# NOTE(review): keys that are not nodes of the graph are skipped silently;
# confirm every key matches the adjacency-matrix spelling.
multiple = {
    'Ashland-Lake':2, 'Morgan':2, 'Clinton-Lake':2,
    'Clark/Lake':6, 'State/Lake':5, 'Washington/Wabash':5, 'Adams/Wabash':5, 'Library':4, 'LaSalle/Van Buren':4,
    'Quincy':4, 'Washington/Wells':4,
    'Roosevelt/Wabash':2,
    'Merchandise Mart':2, 'Sedgwick':2, 'Chicago/Franklin':2, 'Armitage':2, 'Diversey':2, 'Wellington':2,
    'Fullerton':3, 'Belmont-North Main':3, 'Wilson':2, 'Howard':3
}
# Overwrite the default count of 1 for every listed station present in the graph.
for stn, n_lines in multiple.items():
    if stn in station_lines_before_june21:
        station_lines_before_june21[stn] = n_lines

In [17]:
# Per-station feature list, in this fixed order:
# [degree centrality, closeness centrality, eigenvector centrality,
#  katz centrality, degree, interchange flag, terminal flag, line count]
spatial_features_before_june21 = {}
for stn in tqdm(G_before_june21.nodes()):
    spatial_features_before_june21[stn] = [
        degree_cent_measure_before_june21[stn],
        closeness_cent_measure_before_june21[stn],
        eigenvector_cent_measure_before_june21[stn],
        katz_cent_measure_before_june21[stn],
        degree_dict_before_june21[stn],
        is_interchange_before_june21[stn],
        is_terminal_before_june21[stn],
        station_lines_before_june21[stn],
    ]

100%|██████████| 143/143 [00:00<00:00, 315013.38it/s]


In [18]:
# Pickle the {station name: feature list} dict for the period before June 2021.
with open('data/graph/spatial/spatial_before_june21.pkl', 'wb') as f:
    pickle.dump(spatial_features_before_june21, f)