In [1]:
import pandas as pd 
import networkx as nx
import numpy as np
from functools import reduce
import datetime
import tqdm
import ast
import json
import sys
import time
import pickle
import re

In [2]:
timetable_intercity = pd.read_csv("./data/export-intercites-gtfs-last/timetable_intercity.csv")
timetable_ter = pd.read_csv("./data/ter/timetable_ter.csv")

In [3]:
timetable = pd.concat([timetable_intercity, timetable_ter], sort=True)

In [4]:
timetable["after"] = timetable["arrival_time"].apply(
    lambda x: datetime.datetime.strptime(ast.literal_eval(x)[0], "%H:%M:%S"))
timetable["before"] = timetable["arrival_time"].apply(
    lambda x: datetime.datetime.strptime(ast.literal_eval(x)[-1], "%H:%M:%S"))

In [5]:
def build_graph_within_timeframe(start_time, timeframe, timetable) : 
    stop_2_id = {}
    start_time = datetime.datetime.strptime(start_time, "%H:%M:%S")

    timetable = timetable[timetable["before"] >= start_time]
    timetable = timetable[timetable["after" ] <= start_time + datetime.timedelta(0,timeframe*60)]
    G = nx.Graph()

    frame_indices = []
    start = time.time()
    for i in timetable.index : 
        try :           
            arrival_times = ast.literal_eval(timetable["arrival_time"][i])
            times = [datetime.datetime.strptime(arrival_times[j], "%H:%M:%S") for j in range(len(arrival_times))]
            filtered_times = [time_ for time_ in times if (time_ - start_time).seconds <= (timeframe * 60)]

            indices = np.array([times.index(time) for time in filtered_times])

            if len(indices) == 0: continue

            frame_indices.append(i)

            departure_times = list(np.array(ast.literal_eval(timetable["departure_time"][i]))[indices])
            arrival_times = list(np.array(ast.literal_eval(timetable["arrival_time"][i]))[indices])

            stop_names = list(np.array(ast.literal_eval(timetable["stop_name"][i]))[indices])
        except ValueError : 
            #print(timetable["stop_name"][i])
            continue

        longitudes = list(np.array(ast.literal_eval(timetable["stop_lon"][i]))[indices])
        latitudes = list(np.array(ast.literal_eval(timetable["stop_lat"][i]))[indices])
        prices = [0.] + ast.literal_eval(timetable["prices"][i])

        prices = list(np.array(prices)[indices])
        for k in range(len(stop_names)-1) :  
            curr_stop = stop_names[k]
            next_stop = stop_names[k+1]

            if curr_stop not in stop_2_id.keys() : 
                stop_2_id[curr_stop] = len(stop_2_id.keys())
            if next_stop not in stop_2_id.keys(): 
                stop_2_id[next_stop] = len(stop_2_id.keys())

            G.add_edge(stop_2_id[curr_stop], stop_2_id[next_stop], weight=prices[k+1],
                       departure_time=departure_times[k], arrival_time=arrival_times[k+1], 
                       lon_lat=(longitudes[k+1], latitudes[k+1]))
    paths = nx.shortest_path(G)
    print("Execution time {:.2f} seconds".format(time.time() - start))
    return G, paths, stop_2_id

In [6]:
graph, paths, stop_2_id = build_graph_within_timeframe("07:30:00", 8 * 60, timetable)

Execution time 28.06 seconds


In [7]:
graph.edges[(0, 1)]

{'weight': 49.0,
 'departure_time': '08:21:00',
 'arrival_time': '11:06:00',
 'lon_lat': (6.17427169, 48.68978225)}

In [8]:
paths[1][882]

[1, 447, 902, 877, 882]

In [9]:
graph.edges[(1, 0)]

{'weight': 49.0,
 'departure_time': '08:21:00',
 'arrival_time': '11:06:00',
 'lon_lat': (6.17427169, 48.68978225)}

In [10]:
# graph.edges[(0, 882)]

In [11]:
stops_name_by_id = {v: k for k, v in stop_2_id.items()}

In [12]:
print(stops_name_by_id[1])
print(stops_name_by_id[0])
print(stops_name_by_id[882])

Nancy-Ville
Paris-Est
Les Laumes-Alésia


In [13]:
stops_name_by_id_df = pd.DataFrame.from_dict(stops_name_by_id, orient='index', columns=['name'])

In [14]:
with open('./data/_routing_stops_name_by_id.json', 'w') as outfile:
    json.dump(stops_name_by_id, outfile, sort_keys=True)

with open('./data/_routing_stops_id_by_name.json', 'w') as outfile:
    json.dump(stop_2_id, outfile, sort_keys=True)

In [15]:
with open('./data/_historic_cities.json') as infile:
    HISTORIC_CITIES = json.load(infile)
with open('./data/_art_history_cities.json') as infile:
    ART_HISTORY_CITIES = json.load(infile)

l = list(HISTORIC_CITIES.keys())
l.extend([item for item in l for l in ART_HISTORY_CITIES.values()])
l = set(l)
# l - set(stops_name_by_id_df[stops_name_by_id_df.name.isin(l)].name.values)
len(set(stops_name_by_id_df[stops_name_by_id_df.name.isin(l)].name.values)), len(l)

(140, 174)

In [16]:
stops_name_by_id_df[stops_name_by_id_df.name.str.contains('Villeneuve')]

Unnamed: 0,name
469,Villeneuve-d'Aveyron
1252,Villeneuve-Loubet-Plage
1339,Villeneuve-sur-Yonne
1343,Villeneuve-la-Guyard
2068,Villeneuve-la-Comtesse
2231,Villeneuve-sur-Allier


In [17]:
DATA_DIR = './data/'
STATIONS = pd.read_csv(DATA_DIR + '_stations.csv').dropna()
with open(DATA_DIR + '_historic_cities.json') as infile:
    HISTORIC_CITIES = json.load(infile)
with open(DATA_DIR + '_art_history_cities.json') as infile:
    ART_HISTORY_CITIES = json.load(infile)
SPECIAL_CITIES = list(HISTORIC_CITIES.keys())
SPECIAL_CITIES.extend([item for item in SPECIAL_CITIES for SPECIAL_CITIES in ART_HISTORY_CITIES.values()])
with open(DATA_DIR + '_routing_stops_name_by_id.json') as infile:
    STOPS_NAME_BY_ID = json.load(infile)
SPECIAL_STOPS_DF = pd.DataFrame.from_dict(STOPS_NAME_BY_ID, orient='index', columns=['name'])
SPECIAL_STOPS_DF = SPECIAL_STOPS_DF[SPECIAL_STOPS_DF.name.isin(SPECIAL_CITIES)]

special_ids = [int(x) for x in SPECIAL_STOPS_DF.index.values]

# Clean the paths by only keeping paths to important destinations.
# This brings down the size of the file from ~300MB to ~15MB.
for source_id in range(len(paths)):
    to_remove = []
    for dest_id in paths[source_id]:
        if dest_id == source_id:
            continue
        if dest_id in special_ids:
            continue
        to_remove.append(dest_id)
    for id_ in to_remove:
        del paths[source_id][id_]

In [18]:
with open('./data/_routing_paths.pkl', 'wb') as f:
    pickle.dump(paths, f, pickle.HIGHEST_PROTOCOL)

In [19]:
with open('./data/_routing_graph.pkl', 'wb') as f:
    pickle.dump(graph, f, pickle.HIGHEST_PROTOCOL)