In [1]:
import re

import numpy as np
import pandas as pd

In [5]:
from IPython.display import display

In [2]:
# Relative directory prefix that is prepended to every CSV file name used in this notebook.
DATA_DIR = "data/"

In [6]:
# Load the station list, keeping only its `Name` column, and preview it.
stations = pd.read_csv(DATA_DIR + '_stations.csv').loc[:, ['Name']]
display(stations.head(n=3))
# Load the TGV fare table (the file is semicolon-separated) and preview it.
tgv_prices = pd.read_csv(DATA_DIR + 'tarifs-tgv-par-od.csv', sep=';')
display(tgv_prices.head(n=3))

Unnamed: 0,Name
0,La Villeneuve-au-Chène
1,St-Léger-Moussey
2,Romilly-sur-Seine


Unnamed: 0,OD,Prix d'appel 2nde,Plein Tarif Loisir 2nde,1ère classe,Commentaires
0,DAX-DOUAI,30.0,144.0,174.0,
1,IRUN-LILLE FLANDRES,30.0,144.0,174.0,
2,BAYONNE-DOUAI,30.0,144.0,174.0,


In [7]:
# Character-level rewrites that bring station names to a common, accent-free form.
# The keys are used as regular expressions when normalizing the station list,
# which is why the parentheses are backslash-escaped.
NORMALIZATIONS = {
    'ç': 'c',
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ô': 'o',
    'â': 'a',
    'î': 'i',
    'ê': 'e',
    '\\(': '',
    '\\)': '',
    'û': 'u',
    '-': ' ',
}

# Manual overrides for labels that are spelled very differently from the station
# list, or that are ambiguous (working out which station is meant required
# looking them up by hand). Keys are compared exactly, including case and spacing.
REPLACEMENTS = {
    'AEROPORT CDG 2 TGV ROISSY': 'Roissy-Aéroport-Charles-de-Gaulle 2-TGV (TGV)',
    'AEROPORT CDG2 TGV ROISSY': 'Roissy-Aéroport-Charles-de-Gaulle 2-TGV (TGV)',
    'Bâle': 'Bâle-St-Jean',
    'MASSY PALAISEAU': 'Massy-Palaiseau-Grande-Ceinture',
    'LILLE EUROPE-147322': 'Lille-Europe',
    'RUFFEC CHARENTE': 'Ruffec',
    'MOUTIERS SALINS BRIDES': 'Moûtiers-Salins-Brides-les-Bains',
    'MANTES LA J. TGV': 'Mantes-la-Jolie',
    'CLUSES  74': 'Cluses',
    'STRASBOURG': 'Strasbourg-Ville',
    'SAUMUR': 'Saumur-Rive-Droite',
    'PARIS MONTPARNAS VAUGIRARD BLS': 'Paris-Montparnasse',
    'PARIS MONTPARNASSE 3 VAUGIRARD': 'Paris-Montparnasse',
    'PLOUARET TREGOR': 'Plouaret',
    'plouaret tregor': 'Plouaret',
    'Saint Malo': 'St-Malo',
    'CONFLANS FDO TGV': "Conflans-Fin-d'Oise",
    'MULHOUSE VILLE': 'Mulhouse-Ville',
    'MULHOUSE': 'Mulhouse-Ville',
    'SAINT MICHEL VALLOIRE': 'St-Michel-Valloire',
    'JUVISY TGV': 'Juvisy',
    'Facture': 'Facture-Biganos',
    'Saint Maixent': 'St-Maixent (Deux-Sèvres)',
    'St maixent': 'St-Maixent (Deux-Sèvres)',
    'Vendôme Gare TGV': 'Vendôme',
    'saint nazaire': 'St-Nazaire',
    'Saint Brieuc': 'St-Brieuc',
    'PARIS MONTPARNASSE 1 ET 2': 'Paris-Montparnasse',
    'Dol de Bretagne': 'Dol',
    'DOL DE BRETAGNE': 'Dol',
    'LYON SAINT EXUPERY TGV': 'Lyon-St-Exupéry-TGV',
    'MONTPELLIER': 'Montpellier (CNM)',
    'AVIGNON SUD': 'Avignon-Centre',
    'Angers': 'Angers-St-Laud',
    'VILLENEUVE LES AVIGNONS': 'Villeneuve-lès-Avignon',
}

# Suffixes that are frequently missing from the labels; they are appended as a
# last resort when the bare label does not match any station.
SUFFIXES = [
    '-tgv',
    '-voyageurs',
]

def _from_od_get_matches(search, pattern):
    """Return the rows of `stations` whose entry in `search` equals the normalized `pattern`.

    `pattern` is trimmed, replaced by its manual override when it is an exact key
    of `REPLACEMENTS`, lower-cased, passed through every `NORMALIZATIONS` rewrite,
    and finally has runs of spaces collapsed before the comparison.

    Args:
        search: Values compared element-wise with the normalized pattern; the
            result is used as a boolean mask on `stations`.
            NOTE(review): presumably the normalized station names, aligned with
            `stations` -- confirm against callers.
        pattern: Raw station label.

    Returns:
        The subset of `stations` selected by the mask (empty when nothing matches).
    """
    label = pattern.strip()
    label = REPLACEMENTS.get(label, label).lower()
    for regex_key, replacement in NORMALIZATIONS.items():
        # The keys are written as regexes; dropping a leading backslash yields the
        # literal text that plain `str.replace` needs.
        literal = regex_key[1:] if regex_key.startswith('\\') else regex_key
        label = label.replace(literal, replacement)
    # The rewrites above can leave consecutive or trailing spaces behind.
    label = re.sub(' +', ' ', label).strip()
    mask = search == label
    return stations[mask]

def _normalized_station_names():
    """Return the lower-cased, normalized `stations.Name` series.

    The result depends only on `stations` and `NORMALIZATIONS`, so it is computed
    once and reused for as long as the module-level `stations` name is bound to
    the same object. Rebinding `stations` (e.g. reloading the CSV) triggers a
    recomputation; mutating the existing frame in place does not.
    """
    cache = _normalized_station_names.__dict__
    if 'names' not in cache or cache['frame'] is not stations:
        names = stations.Name.str.lower()
        for pat, repl in NORMALIZATIONS.items():
            # Keys are regular expressions here (hence the escaped parentheses).
            names = names.replace(pat, repl, regex=True)
        cache['frame'] = stations
        cache['names'] = names
    return cache['names']


def _from_od_complex(od, lim, index, try_suffixes=False):
    """Match one side of an origin-destination label against the station list.

    The label is cut at its `lim`-th hyphen (or at its last hyphen when it has
    fewer than `lim`): everything before the cut is the origin candidate and
    everything after it is the destination candidate.

    Args:
        od: The 'ORIGIN-DESTINATION' label.
        lim: Maximum number of hyphen splits, i.e. which hyphen separates the
            two station names.
        index: 0 to match the origin candidate, 1 to match the destination.
        try_suffixes: When False, match the candidate as is. When True, match
            the candidate with each entry of `SUFFIXES` appended, in order, and
            stop at the first hit.

    Returns:
        The matching rows of `stations`. The frame is empty when nothing
        matches, including when every suffix fails (this path used to return
        None implicitly).
    """
    parts = od.split('-', lim)
    candidates = ['-'.join(parts[:-1]), parts[-1]]
    pattern = candidates[index]
    search = _normalized_station_names()
    if not try_suffixes:
        return _from_od_get_matches(search, pattern)
    for suffix in SUFFIXES:
        match = _from_od_get_matches(search, pattern + suffix)
        if not match.empty:
            return match
    # No suffix produced a hit: return an empty frame so that every code path
    # yields the same type.
    return stations.iloc[0:0]

def _from_od(od, dest):
    splits = od.split('-')
    index = 1 if dest else 0
    for lim in reversed(range(1, len(splits))):
        match = _from_od_complex(od, lim, index)
        if not match.empty:
            return match.Name.iloc[0]
        else:
            match = _from_od_complex(od, lim, index, try_suffixes=True)
            if match is not None and not match.empty:
                return match.Name.iloc[0]
    return None

def origin_from_od(od):
    """Return the station name matched for the origin side of `od`, or None if none is found."""
    return _from_od(od, False)

def dest_from_od(od):
    """Return the station name matched for the destination side of `od`, or None if none is found."""
    return _from_od(od, True)

In [None]:
# Resolve both ends of every OD label to a station name (None when unmatched),
# then report the number of fare rows.
tgv_prices = tgv_prices.assign(
    Origin=tgv_prices.OD.apply(origin_from_od),
    Destination=tgv_prices.OD.apply(dest_from_od),
)
print(len(tgv_prices))

In [None]:
# Keep only the rows where origin, destination and the full second-class fare
# are all present, and expose the fare under the shorter name `Price`.
tgv_prices_final = (
    tgv_prices[['Origin', 'Destination', 'Plein Tarif Loisir 2nde']]
    .dropna()
    .rename(columns={'Plein Tarif Loisir 2nde': 'Price'})
)
print(len(tgv_prices_final))

In [None]:
# Preview the first five rows of the cleaned fare table.
tgv_prices_final.head(n=5)

In [None]:
# Write the cleaned fare table next to the input files, without the index column.
tgv_prices_final.to_csv(path_or_buf=DATA_DIR + '_tgv_prices.csv', index=False)