In [1]:
import pandas as pd
import networkx as nx
import os

In [2]:
# Load the cleaned trip records into `data`. The monthly-graph cell below reads
# this variable and uses its 'start_time', 'stop_time', 'start_station_id' and
# 'end_station_id' columns. The relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('../data/Trips_2018_clean.csv')

RUN THIS IF YOU WANT TO CREATE A GRAPH FOR EVERY MONTH OF 2018

The following code generates an undirected graph for every month of 2018. Stations are the nodes. We assume that a trip from station A to station B counts the same as a trip from station B to station A; therefore, if there is at least one trip between them in either direction, there is an edge between them in the graph. The edge's weight is the total number of trips between A and B, in both directions combined.

In [3]:
# Build one undirected, weighted station graph per month of 2018 and save each
# as a GEXF file. Nodes are station ids; an edge A-B exists when at least one
# trip connects the two stations in either direction, and its 'weight' is the
# number of such trips.

# Part 1: PREP - Ensure datetime and filter for 2018
# NOTE: these two assignments convert the columns of `data` in place.
data['start_time'] = pd.to_datetime(data['start_time'])
data['stop_time'] = pd.to_datetime(data['stop_time'])
df_2018 = data[data['start_time'].dt.year == 2018].copy()

# Part 2: Directory for graphs
save_dir = 'monthly_graphs'
os.makedirs(save_dir, exist_ok=True)

# Part 3: Build and save graphs by month
# The month of every trip is loop-invariant, so extract it once.
trip_month = df_2018['start_time'].dt.month
graphs_by_month = {}
for month in range(1, 13):
    monthly_df = df_2018[trip_month == month]
    if monthly_df.empty:
        print(f"No data for month {month}, skipping.")
        continue

    # A trip with a missing endpoint cannot define an edge, so drop it instead
    # of letting NaN become a station node.
    station_pairs = monthly_df[['start_station_id', 'end_station_id']].dropna()

    # Canonicalize edges (A,B) where A <= B.
    # Working on plain column lists avoids DataFrame.apply(axis=1), which
    # builds a Series object for every single trip. The swap condition
    # (end < start) is the same one sorted() applies to a two-element tuple.
    start_ids = station_pairs['start_station_id'].tolist()
    end_ids = station_pairs['end_station_id'].tolist()
    edges_df = station_pairs.assign(
        edge=[(e, s) if e < s else (s, e) for s, e in zip(start_ids, end_ids)]
    )

    # Edge weights: count all A<->B trips
    edge_weights = edges_df['edge'].value_counts().reset_index()
    edge_weights.columns = ['edge', 'weight']

    # Build graph: bulk-insert (a, b, weight) triples instead of iterrows().
    G = nx.Graph()
    G.add_weighted_edges_from(
        (a, b, weight)
        for (a, b), weight in zip(edge_weights['edge'], edge_weights['weight'])
    )

    # Save Graph as GEXF
    gexf_path = os.path.join(save_dir, f'graph_2018_month{month:02d}.gexf')
    nx.write_gexf(G, gexf_path)
    print(f"Saved: {gexf_path}")
    graphs_by_month[month] = G


Saved: monthly_graphs\graph_2018_month01.gexf
Saved: monthly_graphs\graph_2018_month02.gexf
Saved: monthly_graphs\graph_2018_month03.gexf
Saved: monthly_graphs\graph_2018_month04.gexf
Saved: monthly_graphs\graph_2018_month05.gexf
Saved: monthly_graphs\graph_2018_month06.gexf
Saved: monthly_graphs\graph_2018_month07.gexf
Saved: monthly_graphs\graph_2018_month08.gexf
Saved: monthly_graphs\graph_2018_month09.gexf
Saved: monthly_graphs\graph_2018_month10.gexf
Saved: monthly_graphs\graph_2018_month11.gexf
Saved: monthly_graphs\graph_2018_month12.gexf


RUN THE FOLLOWING CODE IF YOU WANT TO CREATE A GRAPH FOR THE WHOLE 2018

Same methodology to create the graph, but this time for the whole year. 

In [4]:
# Stand-alone cell: build one undirected, weighted station graph per year listed
# in `years` and save each as a GEXF file. Nodes are station ids; an edge A-B
# exists when at least one trip connects the two stations in either direction,
# and its 'weight' is the number of such trips.
import pandas as pd
import networkx as nx
import os

# NOTE(review): this path differs from the '../data/Trips_2018_clean.csv' read
# near the top of the notebook - confirm which location is the intended one.
data = pd.read_csv('dataV/Trips_2018_clean.csv')

# Part 1: PREP - Ensure datetime and filter for 2018
data['start_time'] = pd.to_datetime(data['start_time'])
data['stop_time'] = pd.to_datetime(data['stop_time'])
trip_year = data['start_time'].dt.year
# Kept under its original name for any later cell that reads it.
df_2018 = data[trip_year == 2018].copy()

# Part 2: Directory for graph
save_dir = '2018_graph'
os.makedirs(save_dir, exist_ok=True)

years = [2018]
# Part 3: Build and save graphs by year
graphs_by_year = {}
for year in years:
    # Select the trips of *this* year. Previously every iteration used the 2018
    # frame, so any other year added to `years` would have been saved under the
    # wrong label with 2018 data.
    year_pairs = data.loc[trip_year == year, ['start_station_id', 'end_station_id']]
    if year_pairs.empty:
        print(f"No data for year {year}, skipping.")
        continue

    # A trip with a missing endpoint cannot define an edge, so drop it instead
    # of letting NaN become a station node.
    station_pairs = year_pairs.dropna()

    # Canonicalize edges (A,B) where A <= B.
    # Working on plain column lists avoids DataFrame.apply(axis=1), which
    # builds a Series object for every single trip. The swap condition
    # (end < start) is the same one sorted() applies to a two-element tuple.
    start_ids = station_pairs['start_station_id'].tolist()
    end_ids = station_pairs['end_station_id'].tolist()
    edges_df = station_pairs.assign(
        edge=[(e, s) if e < s else (s, e) for s, e in zip(start_ids, end_ids)]
    )

    # Edge weights: count all A<->B trips
    edge_weights = edges_df['edge'].value_counts().reset_index()
    edge_weights.columns = ['edge', 'weight']

    # Build graph: bulk-insert (a, b, weight) triples instead of iterrows().
    G = nx.Graph()
    G.add_weighted_edges_from(
        (a, b, weight)
        for (a, b), weight in zip(edge_weights['edge'], edge_weights['weight'])
    )

    # Save Graph as GEXF
    gexf_path = os.path.join(save_dir, f'graph_{year}.gexf')
    nx.write_gexf(G, gexf_path)
    print(f"Saved: {gexf_path}")
    graphs_by_year[year] = G

Saved: 2018_graph\graph_2018.gexf
