In [131]:
import pandas as pd
import numpy as np
import math
from IPython.display import display
from datetime import datetime
import re
import swifter

In [132]:
# Root directory of the input datasets (a relative path); the loading cell below
# builds every CSV/GTFS path from it.
DATA_DIR = "data"

In [133]:
# Intercités GTFS export: stop times and stops (stops without a name are dropped).
times = pd.read_csv(DATA_DIR + "/export-intercites-gtfs-last/stop_times.txt")
stops = pd.read_csv(DATA_DIR + "/export-intercites-gtfs-last/stops.txt")
stops = stops[pd.notnull(stops["stop_name"])]
# TER data: same layout, same filtering of unnamed stops.
stops_ter = pd.read_csv(DATA_DIR + "/ter/stops.txt")
stops_ter = stops_ter[pd.notnull(stops_ter["stop_name"])]
stop_times_ter = pd.read_csv(DATA_DIR + "/ter/stop_times.txt")
# Prices: Intercités fares per origin/destination pair, and the TER price scale
# restricted to the '2 nde' class, sorted by 'Km' so the first matching row can
# be picked when looking up a distance.
prices_intercity = pd.read_csv(DATA_DIR + '/_intercity_prices.csv')
prices_ter = pd.read_csv(DATA_DIR + '/bareme-de-prix-ter.csv', sep=';')
prices_ter = prices_ter[prices_ter.Classe == '2 nde'][['Km', 'Plein tarif']]
prices_ter = prices_ter.sort_values(by='Km')
# Reference list of stations; read through DATA_DIR like every other input so
# that relocating the data directory only requires changing one constant.
stations = pd.read_csv(DATA_DIR + '/_stations.csv')

In [162]:
def fix_times(time):
    """Wrap a time whose hour field is 24 or more back into the 00-23 range.

    The stop-time files can contain hours past midnight (e.g. ``25:10:00``),
    which ``datetime.strptime(..., "%H:%M:%S")`` in ``compute_duration`` cannot
    parse.

    Parameters
    ----------
    time : str
        Time formatted as ``H:MM:SS`` / ``HH:MM:SS``.

    Returns
    -------
    str
        ``time`` unchanged when its hour is below 24; otherwise the same time
        with the hour taken modulo 24 and zero-padded to two digits, so that
        wrapped values look like every other value of the column
        (``"25:10:00"`` -> ``"01:10:00"``).

    Raises
    ------
    ValueError
        If the hour field is not an integer.
    """
    # Split once instead of re-splitting the string for every use.
    hours_text, _, remainder = time.partition(":")
    hours = int(hours_text)
    if hours < 24:
        return time
    # Modulo (rather than a single "- 24") also covers hours of 48 and above.
    return "{:02d}:{}".format(hours % 24, remainder)

def compute_duration(arrival_list, departure_list):
    """Return the travel time, in minutes, of each leg of a trip.

    Leg ``k`` runs from the departure at stop ``k`` to the arrival at stop
    ``k + 1``; both lists hold ``HH:MM:SS`` strings. Only the ``seconds``
    component of the time difference is used, so a leg that crosses midnight
    still yields a positive duration.
    """
    def parse(stamp):
        return datetime.strptime(stamp, "%H:%M:%S")

    return [
        (parse(arrival_list[k + 1]) - parse(departure_list[k])).seconds / 60
        for k in range(len(arrival_list) - 1)
    ]

def get_price(start, stop):
    """Look up the Intercités fare between two stations.

    ``prices_intercity`` is searched in both directions (``start`` -> ``stop``
    and ``stop`` -> ``start``); the first matching row of each direction is
    considered and the cheaper of the two is returned. When neither direction
    is listed, the fallback value 144 is returned.
    """
    origin = prices_intercity.Origin
    destination = prices_intercity.Destination
    found = []
    for mask in ((origin == start) & (destination == stop),
                 (destination == start) & (origin == stop)):
        rows = prices_intercity[mask]
        if not rows.empty:
            found.append(rows.iloc[0].Price)
    return min(found) if found else 144

def compute_price(dests_list):
    """Return the fare of every consecutive pair of stops in ``dests_list``.

    The result has one entry per leg (``len(dests_list) - 1`` entries), each
    obtained from ``get_price``.
    """
    return [
        get_price(dests_list[k], dests_list[k + 1])
        for k in range(len(dests_list) - 1)
    ]

In [135]:
# Wrap hours >= 24 in the time columns, first for Intercités and then for TER.
for stop_time_frame in (times, stop_times_ter):
    for time_column in ("arrival_time", "departure_time"):
        stop_time_frame[time_column] = stop_time_frame[time_column].apply(fix_times)

In [136]:
# Column indexes of the Intercités stop-times and stops frames. The merge cell
# further down selects columns from them by position (cols_1[0], cols_2[1], ...).
cols_1 = times.columns
cols_2 = stops.columns 

In [154]:
# Transformations to apply to station names before comparing them: accents are
# stripped, parentheses are removed and hyphens become spaces.
# Keys are literal characters, meant for ``str.replace``.
NORMALIZATIONS = {'ç': 'c',
                  'é': 'e',
                  'è': 'e',
                  'ë': 'e',
                  'ô': 'o',
                  'â': 'a',
                  'î': 'i',
                  'ê': 'e',
                  'û': 'u',
                  '-': ' ',
                  '(': '',
                  ')': ''}
# Same transformations keyed by compiled pattern. ``re.escape`` takes care of the
# characters that are special in a regex (the parentheses), so the literal table
# above no longer has to be written with escaped keys and patched afterwards.
NORMALIZATIONS_REGEX = {re.compile(re.escape(k)): v for k, v in NORMALIZATIONS.items()}
# Runs of spaces, collapsed to a single space during normalization.
WHITESPACE_REGEX = re.compile(' +')
# Trailing parenthesised group with its leading spaces. Raw string: "\(" is an
# invalid escape sequence in a normal string literal and triggers a warning.
PARAN_REGEX = re.compile(r' *\([^)]*\)$')
# Manual overrides for stop names that are spelled too differently from the station
# list, or that are ambiguous (some had to be looked up by hand to find out which
# station is meant). Keys are stop names as they appear in the input, after
# stripping surrounding whitespace; values are the station names to search for.
# The lookup in _from_name_get_matches is an exact, case-sensitive match, which is
# why some names appear in several spellings.
REPLACEMENTS = {'AEROPORT CDG 2 TGV ROISSY': 'Roissy-Aéroport-Charles-de-Gaulle 2-TGV (TGV)',
                'AEROPORT CDG2 TGV ROISSY': 'Roissy-Aéroport-Charles-de-Gaulle 2-TGV (TGV)',
                'Bâle': 'Bâle-St-Jean',
                'MASSY PALAISEAU': 'Massy-Palaiseau-Grande-Ceinture',
                'LILLE EUROPE-147322': 'Lille-Europe',
                'RUFFEC CHARENTE': 'Ruffec',
                'MOUTIERS SALINS BRIDES': 'Moûtiers-Salins-Brides-les-Bains',
                'MANTES LA J. TGV': 'Mantes-la-Jolie',
                'CLUSES  74': 'Cluses',
                'STRASBOURG': 'Strasbourg-Ville',
                'SAUMUR': 'Saumur-Rive-Droite',
                'PARIS MONTPARNAS VAUGIRARD BLS': 'Paris-Montparnasse',
                'PARIS MONTPARNASSE 3 VAUGIRARD': 'Paris-Montparnasse',
                'PLOUARET TREGOR': 'Plouaret',
                'plouaret tregor': 'Plouaret',
                'Saint Malo': 'St-Malo',
                'CONFLANS FDO TGV': 'Conflans-Fin-d\'Oise',
                'MULHOUSE VILLE': 'Mulhouse-Ville',
                'MULHOUSE': 'Mulhouse-Ville',
                'SAINT MICHEL VALLOIRE': 'St-Michel-Valloire',
                'JUVISY TGV': 'Juvisy',
                'Facture': 'Facture-Biganos',
                'Saint Maixent': 'St-Maixent (Deux-Sèvres)',
                'St maixent': 'St-Maixent (Deux-Sèvres)',
                'Vendôme Gare TGV': 'Vendôme',
                'saint nazaire': 'St-Nazaire',
                'Saint Brieuc': 'St-Brieuc',
                'PARIS MONTPARNASSE 1 ET 2': 'Paris-Montparnasse',
                'Dol de Bretagne': 'Dol',
                'DOL DE BRETAGNE': 'Dol',
                'LYON SAINT EXUPERY TGV': 'Lyon-St-Exupéry-TGV',
                'MONTPELLIER': 'Montpellier (CNM)',
                'AVIGNON SUD': 'Avignon-Centre',
                'Angers': 'Angers-St-Laud',
                'VILLENEUVE LES AVIGNONS': 'Villeneuve-lès-Avignon'}
# Common suffixes that are often missing from stop names; they are appended one at
# a time to get a match when the plain name finds nothing.
SUFFIXES = ['-tgv', '-voyageurs']
# Cache: raw stop name -> resolved station name (None when resolution failed).
# Filled by fix_names so that each distinct name is resolved only once.
MEM = {}
# Raw stop names that fix_names could not resolve to a station.
UNKNOWNS = set()

def _from_name_get_matches(search, pattern):
    """Return the rows of ``stations`` whose normalized name equals ``pattern``.

    ``search`` is the already-normalized station-name Series, aligned with
    ``stations``. ``pattern`` is normalized here: stripped, replaced through
    ``REPLACEMENTS`` when listed, lower-cased, run through ``NORMALIZATIONS``,
    and finally whitespace-collapsed and stripped again.
    """
    key = pattern.strip()
    normalized = REPLACEMENTS.get(key, key).lower()
    for char, substitute in NORMALIZATIONS.items():
        normalized = normalized.replace(char, substitute)
    normalized = WHITESPACE_REGEX.sub(' ', normalized).strip()
    return stations[search == normalized]

def _from_name_complex(name, try_suffixes=False):
    """Match ``name`` against the normalized station names.

    The lower-cased ``stations.Name`` column is passed through
    ``Series.replace`` once per entry of ``NORMALIZATIONS_REGEX`` (presumably
    applied by pandas as regex substitutions - confirm for the installed
    version) and then compared by ``_from_name_get_matches``.

    Returns
    -------
    DataFrame or None
        Without ``try_suffixes``: the matching rows (possibly empty).
        With ``try_suffixes``: the first non-empty match obtained by appending
        each entry of ``SUFFIXES`` to ``name``, or ``None`` when no suffix
        matches.
    """
    normalized_names = stations.Name.str.lower()
    for regex, substitute in NORMALIZATIONS_REGEX.items():
        normalized_names = normalized_names.replace(regex, substitute)

    if not try_suffixes:
        return _from_name_get_matches(normalized_names, name)

    for suffix in SUFFIXES:
        candidates = _from_name_get_matches(normalized_names, name + suffix)
        if not candidates.empty:
            return candidates
    # No suffix produced a match; callers must handle None on this path.
    return None

# Fragments removed from the end of a stop name as a last resort before giving
# up on it; each one is compiled anchored to the end of the string.
TRIMMINGS = [
    re.compile(fragment + '$')
    for fragment in ('-Centre', '-Carrefour', '-Mairie', 'Aéroport', '-Ctre', '-Gare-?.*', r' *\([^)]*\)')
]
def _from_name(name, rec=False):
    """Resolve a stop name to the ``Name`` of a row of ``stations``.

    A list is resolved element by element; anything that is neither a list nor
    a string yields ``None``. For a string, after dropping ``'Gare de '``, the
    attempts are, in order:

    1. direct match;
    2. match with one of the common suffixes appended;
    3. if the name ends with ``-`` or ``.``, retry without that character;
    4. strip every ``TRIMMINGS`` pattern and retry once (``rec=True`` marks
       that retry, so trimming is not attempted again).

    Returns the matched station name, or ``None`` when every attempt fails.
    """
    if isinstance(name, list):
        return [_from_name(item) for item in name]
    if not isinstance(name, str):
        return None
    name = name.replace('Gare de ', '')

    exact = _from_name_complex(name)
    if not exact.empty:
        return exact.Name.iloc[0]

    # The suffix search returns None (not an empty frame) when nothing matches.
    suffixed = _from_name_complex(name, try_suffixes=True)
    if suffixed is not None and not suffixed.empty:
        return suffixed.Name.iloc[0]

    if name.endswith(('-', '.')):
        return _from_name(name[:-1])
    if rec:
        return None

    for trimming in TRIMMINGS:
        name = trimming.sub('', name)
    return _from_name(name, rec=True)

def compute_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, using the haversine formula.

    Coordinates are given in degrees. The result is expressed in the unit of
    the sphere radius used below (6373.0); the TER pricing code compares it
    with the ``Km`` column of the price scale.
    """
    earth_radius = 6373.0
    phi1, lam1, phi2, lam2 = (math.radians(angle) for angle in (lat1, lon1, lat2, lon2))
    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2
    a = math.sin(half_dphi) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(half_dlam) ** 2
    central_angle = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return earth_radius * central_angle

def get_nearest_name(lat, lon):
    """Return the ``Name`` of the station closest to the point (``lat``, ``lon``).

    Every row of ``stations`` is compared using ``compute_distance``; on a tie
    the first station encountered wins. Returns ``None`` when no station is
    closer than the initial bound of 999999999.
    """
    closest_name = None
    closest_dist = 999999999
    for candidate, cand_lat, cand_lon in zip(stations.Name, stations.Latitude, stations.Longitude):
        dist = compute_distance(lat, lon, cand_lat, cand_lon)
        if dist < closest_dist:
            closest_dist, closest_name = dist, candidate
    return closest_name

def fix_names(names, stop_lats, stop_lons):
    """Resolve every stop name of a trip to its reference station name.

    Parameters
    ----------
    names : iterable of str
        Raw stop names of one trip.
    stop_lats, stop_lons : sequence
        Coordinates of the stops. Currently unused: the nearest-station
        fallback (``get_nearest_name``) is disabled, so unresolved stops are
        reported instead of being guessed from their position. The parameters
        are kept so existing callers keep working.

    Returns
    -------
    list
        One entry per input name: the resolved station name, or ``None`` (or
        another falsy value returned by ``_from_name``) when the stop is
        unknown.

    Side effects
    ------------
    Results are memoized in ``MEM``; names that resolve to a falsy value are
    added to ``UNKNOWNS``.
    """
    result = []
    for name in names:
        if name in MEM:
            fixed_name = MEM[name]
        else:
            # Resolution is expensive, so failures (None) are cached as well.
            fixed_name = MEM[name] = _from_name(name)
        if not fixed_name:
            UNKNOWNS.add(name)
        result.append(fixed_name)
    return result

In [166]:
# Attach the stop description to every Intercités stop time. Merging on the two
# stop_id Series produces a "key_0" column, renamed back to "stop_id"; the
# columns kept are then selected by position from the original frames.
table = pd.merge(times, stops, how="left", left_on=times.stop_id, right_on=stops.stop_id).rename(columns={"key_0": "stop_id"})[[cols_1[0], cols_1[1], cols_1[2], cols_2[0], cols_2[1], cols_2[3], cols_2[4]]]
# One row per trip, every other column becoming the list of its values.
# NOTE(review): this assumes the stop times of a trip are already in travel
# order in the input file - confirm, or sort before grouping.
timetable = table.groupby("trip_id").agg(list).reset_index()

timetable["durations"] = timetable[["arrival_time", "departure_time"]].apply(
    lambda x: compute_duration(x["arrival_time"], x["departure_time"]), axis=1)

# axis=1 is passed explicitly: the lambda indexes its argument by column label,
# which only works when it receives one row per call. DataFrame.apply defaults
# to axis=0 (one column per call), and the "prices" computation already passes
# axis=1 for the same reason.
timetable["stop_name"] = timetable[["stop_name", "stop_lat", "stop_lon"]].swifter.progress_bar(True).apply(
    lambda x: fix_names(x["stop_name"], x["stop_lat"], x["stop_lon"]), axis=1)
len(UNKNOWNS)

Pandas Apply: 100%|██████████| 8108/8108 [00:00<00:00, 15292.92it/s]


1201

In [167]:
# Fare of every leg of every Intercités trip, looked up from the resolved names.
timetable["prices"] = (
    timetable[["stop_name"]]
    .swifter.progress_bar(True)
    .apply(lambda trip: compute_price(trip["stop_name"]), axis=1)
)

Pandas Apply: 100%|██████████| 8108/8108 [02:03<00:00, 65.42it/s] 


In [168]:
def remove_unknown_stops(row):
    """Remove, in place, the stops of one trip whose name could not be resolved.

    ``row`` exposes, as attributes, the per-stop lists ``arrival_time``,
    ``departure_time``, ``stop_name``, ``stop_lat``, ``stop_lon`` (and
    optionally ``stop_id``) plus the per-leg lists ``durations`` and
    ``prices`` (one entry per pair of consecutive stops).

    For every stop whose name is falsy:

    * its entry is deleted from every per-stop list;
    * the leg arriving at it is deleted and its duration and price are added
      to the leg leaving it, so the surviving leg spans the gap;
    * if it is the first stop, the leg leaving it is simply dropped; if it is
      the last stop, the leg arriving at it is simply dropped.

    Returns ``row`` (the same object). When ``arrival_time`` and ``stop_name``
    have different lengths, a diagnostic is printed and ``row`` is returned
    untouched.
    """
    if len(row.arrival_time) != len(row.stop_name):
        print('This should not happen')
        print(row)
        return row

    to_remove = [i for i, stop in enumerate(row.stop_name) if not stop]

    per_stop_lists = [row.arrival_time, row.departure_time, row.stop_name, row.stop_lat, row.stop_lon]
    # The Intercités timetable also carries a stop_id list; keep it aligned with
    # the other per-stop lists when it is present and still has one entry per stop.
    stop_ids = getattr(row, "stop_id", None)
    if isinstance(stop_ids, list) and len(stop_ids) == len(row.stop_name):
        per_stop_lists.append(stop_ids)

    # Highest index first, so earlier deletions do not shift the pending ones.
    for i in sorted(to_remove, reverse=True):
        for values in per_stop_lists:
            del values[i]

        if i == 0:
            # First stop: there is no arriving leg, only the leaving one to drop.
            if row.durations:
                del row.durations[0]
            if row.prices:
                del row.prices[0]
            continue

        # Take out the leg arriving at the removed stop...
        arriving_duration = row.durations.pop(i - 1)
        arriving_price = row.prices.pop(i - 1)
        # ...and fold it into the leg that left the removed stop, which now sits
        # at index i - 1. That leg does not exist when the removed stop was last.
        if i - 1 < len(row.durations):
            row.durations[i - 1] += arriving_duration
            row.prices[i - 1] += arriving_price
    return row

# Strip unresolved stops from every trip. This relies on remove_unknown_stops
# editing the per-trip lists in place; the frame returned by apply is discarded.
timetable.apply(remove_unknown_stops, axis=1)
# Discard trips left with at most one stop.
timetable.drop(timetable.index[timetable.stop_name.str.len() <= 1], inplace=True)

In [169]:
# Preview the first three rows of the Intercités timetable.
timetable.head(3)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_name,stop_lat,stop_lon,durations,prices
0,OCESN001001F0100459990,"[08:21:00, 11:06:00, 11:26:00, 11:52:00, 12:17...","[08:21:00, 11:09:00, 11:29:00, 12:00:00, 12:20...","[StopPoint:OCECorail Intercité-87113001, StopP...","[Paris-Est, Nancy-Ville, Lunéville, Sarrebourg...","[48.87656977, 48.68978225, 48.58799369, 48.737...","[2.35915061, 6.17427169, 6.49703457, 7.0527805...","[165.0, 17.0, 23.0, 17.0]","[49.0, 7.4, 9.6, 5.9]"
1,OCESN001001F0100659999,"[07:06:00, 10:26:00, 10:50:00, 11:14:00, 11:31...","[07:06:00, 10:32:00, 10:52:00, 11:16:00, 11:33...","[StopPoint:OCECorail Intercité-87113001, StopP...","[Paris-Est, Nancy-Ville, Lunéville, Sarrebourg...","[48.87656977, 48.68978225, 48.58799369, 48.737...","[2.35915061, 6.17427169, 6.49703457, 7.0527805...","[200.0, 18.0, 22.0, 15.0]","[49.0, 7.4, 9.6, 5.9]"
2,OCESN001001F0100759984,"[08:21:00, 11:06:00, 11:26:00, 11:52:00, 12:17...","[08:21:00, 11:09:00, 11:29:00, 12:00:00, 12:20...","[StopPoint:OCECorail Intercité-87113001, StopP...","[Paris-Est, Nancy-Ville, Lunéville, Sarrebourg...","[48.87656977, 48.68978225, 48.58799369, 48.737...","[2.35915061, 6.17427169, 6.49703457, 7.0527805...","[165.0, 17.0, 23.0, 17.0]","[49.0, 7.4, 9.6, 5.9]"


In [170]:
# Export the Intercités timetable next to its source files. The path is built
# from DATA_DIR, like the input paths, instead of a hard-coded "./data".
timetable.to_csv(DATA_DIR + "/export-intercites-gtfs-last/timetable_intercity.csv")

In [155]:
# Attach the stop description to every TER stop time and keep only the columns
# needed downstream.
merged_ter = pd.merge(stop_times_ter,stops_ter,how='left', left_on=stop_times_ter.stop_id, right_on=stops_ter.stop_id) \
            [['trip_id', 'arrival_time', 'departure_time', 'stop_name', 'stop_lat', 'stop_lon']]

# One row per trip, every other column becoming the list of its values.
# NOTE(review): this assumes the stop times of a trip are already in travel
# order in the input file - confirm, or sort before grouping.
timetable_ter = merged_ter.groupby("trip_id").agg(list).reset_index()

timetable_ter["durations"] = timetable_ter[["arrival_time","departure_time"]].apply(
    lambda x: compute_duration(x["arrival_time"], x["departure_time"]), axis=1)

# axis=1 is passed explicitly: the lambda indexes its argument by column label,
# which only works when it receives one row per call (DataFrame.apply defaults
# to axis=0, i.e. one column per call).
timetable_ter["stop_name"] = timetable_ter[["stop_name", "stop_lat", "stop_lon"]].swifter.progress_bar(True).apply(
    lambda x: fix_names(x["stop_name"], x["stop_lat"], x["stop_lon"]), axis=1)

Pandas Apply: 100%|██████████| 31984/31984 [04:58<00:00, 107.17it/s]


In [160]:
def get_price_from_dist(dist):
    """Return the 'Plein tarif' of the first ``prices_ter`` row with ``Km > dist``.

    ``prices_ter`` is sorted by ``Km`` where it is loaded, so that row is the
    smallest bracket strictly above ``dist``. When no bracket is above
    ``dist``, the row with ``Km == 1300`` is used instead.
    """
    bracket = prices_ter[prices_ter.Km > dist]
    if bracket.empty:
        bracket = prices_ter[prices_ter.Km == 1300]
    return bracket.iloc[0]['Plein tarif']

def compute_price_ter(lat_list, lon_list):
    """Return the TER fare of every leg of a trip, priced by distance.

    Each leg joins stop ``k`` to stop ``k + 1``; its distance comes from
    ``compute_distance`` and its price from ``get_price_from_dist``. The
    result has ``len(lat_list) - 1`` entries.
    """
    return [
        get_price_from_dist(
            compute_distance(lat_list[k], lon_list[k], lat_list[k + 1], lon_list[k + 1])
        )
        for k in range(len(lat_list) - 1)
    ]

# Distance-based fare of every leg of every TER trip.
timetable_ter["prices"] = (
    timetable_ter[["stop_lat", "stop_lon"]]
    .apply(lambda trip: compute_price_ter(trip["stop_lat"], trip["stop_lon"]), axis=1)
)

In [163]:
# Strip unresolved stops from every TER trip. This relies on remove_unknown_stops
# editing the per-trip lists in place; the frame returned by apply is discarded.
timetable_ter.apply(remove_unknown_stops, axis=1)
# Discard trips left with at most one stop.
timetable_ter.drop(timetable_ter.index[timetable_ter.stop_name.str.len() <= 1], inplace=True)

In [164]:
# Preview the first three rows of the TER timetable.
timetable_ter.head(3)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_name,stop_lat,stop_lon,durations,prices
223,OCESN002100R0100260086,"[06:25:00, 06:33:00, 06:40:00, 06:48:00]","[06:25:00, 06:34:00, 06:41:00, 06:48:00]","[Provins, Champbenoist-Poigny, Ste-Colombe-Sep...","[48.55569426, 48.54535739999999, 48.53017483, ...","[3.30284529, 3.28705609, 3.25721747, 3.24968453]","[8.0, 6.0, 7.0]","[1.2, 1.4, 1.2]"
224,OCESN002100R0100360088,"[06:30:00, 06:33:00, 06:40:00, 06:48:00]","[06:30:00, 06:34:00, 06:41:00, 06:48:00]","[Provins, Champbenoist-Poigny, Ste-Colombe-Sep...","[48.55569426, 48.54535739999999, 48.53017483, ...","[3.30284529, 3.28705609, 3.25721747, 3.24968453]","[3.0, 6.0, 7.0]","[1.2, 1.4, 1.2]"
225,OCESN002101R0100160094,"[06:44:00, 06:51:00, 06:57:00, 07:02:00]","[06:44:00, 06:52:00, 06:58:00, 07:02:00]","[Longueville, Ste-Colombe-Septveilles, Champbe...","[48.51351115, 48.53017483, 48.54535739999999, ...","[3.24968453, 3.25721747, 3.28705609, 3.30284529]","[7.0, 5.0, 4.0]","[1.2, 1.4, 1.2]"


In [165]:
# Export the TER timetable next to its source files. The path is built from
# DATA_DIR, like the input paths, instead of a hard-coded "./data".
timetable_ter.to_csv(DATA_DIR + "/ter/timetable_ter.csv")

In [152]:
# Snapshot of the TER timetable.
# NOTE(review): DataFrame.copy() does not recursively copy Python objects stored
# in cells, so the per-trip lists are presumably shared with timetable_ter -
# confirm before mutating either frame in place.
timetable_ter_copy = timetable_ter.copy()