In [2]:
import re

import pandas as pd

In [3]:
# Directory holding the input CSV files and receiving the generated output.
# Paths are built by plain string concatenation (DATA_DIR + filename), so the
# trailing slash is required.
DATA_DIR = 'data/'

In [12]:
# Load intercity prices data.
# We're reading and merging 3 different intercity prices databases. They are
# read the same way; only the file name and the name of the price column differ.
_PRICE_FILES = [
    ('tarifs-intercites-100-eco.csv', 'Plein Tarif 2nde'),
    ('tarifs-intercites-de-jour.csv', 'Plein Tarif 2nde'),
    ('tarifs-intercites-sans-reservation-obligatoire.csv', 'Tarif normal 2nde'),
]


def _load_intercity_prices(filename, price_column):
    """Read one prices file and return it with uniform column names.

    Args:
        filename: name of a ';'-delimited CSV file located in DATA_DIR.
        price_column: name of the column of that file to use as the price.

    Returns:
        A DataFrame with exactly the columns 'Origin', 'Destination', 'Price'.

    Raises:
        KeyError: if 'Origine', 'Destination' or `price_column` is missing.
    """
    prices = pd.read_csv(DATA_DIR + filename, delimiter=';')
    prices = prices[['Origine', 'Destination', price_column]]
    prices.columns = ['Origin', 'Destination', 'Price']
    return prices


# A single concat in file order yields the same rows (and the same per-file
# index labels) as concatenating the files one after the other.
intercity_prices = pd.concat(
    [_load_intercity_prices(filename, column) for filename, column in _PRICE_FILES])

# Keep the cheapest plein-tarif options (the rest are too unreliable/not flexible).
# Sorting by price first makes keep='first' retain the cheapest row of each pair.
intercity_prices = intercity_prices.sort_values(by='Price')
intercity_prices = intercity_prices.drop_duplicates(subset=['Origin', 'Destination'], keep='first')
intercity_prices.head(10)

Unnamed: 0,Origin,Destination,Price
200,LYON PART DIEU,LYON PERRACHE,1.0
0,LES AUBRAIS,ORLÉANS,1.2
1,ST PIERRE DES CORPS,TOURS,1.4
100,CLERMONT FERRAND,LA PARDIEU,1.4
462,COLLIOURE,PORT VENDRES VILLE,1.5
201,LONGUEAU,AMIENS,1.8
101,BEAUMONT LE ROGER,SERQUIGNY,2.0
2,BRASSAC LES MINES,ARVANT,2.0
102,CONDAT LE LARDIN,TERRASSON,2.0
3,FOURCHAMBAULT,NEVERS,2.0


The different databases use different naming conventions. Additionally, there are a lot of inconsistencies in the naming between them. We fix these issues by performing the following steps:

1. String normalization (i.e. to lowercase / remove accents).
2. Manual replacements where necessary: for each name that could not be matched automatically, we searched the web for the best-known corresponding station (which is not necessarily the closest one, nor one with an unambiguous name).

In [13]:
# Read the stations table, keeping only the column with the station names.
stations = pd.read_csv(DATA_DIR + '_stations.csv').loc[:, ['Name']]
stations.head(3)

Unnamed: 0,Name
0,Brest
1,Le Rody
2,Kerhuon


In [16]:
# Substitutions applied to lowercased station names before comparing them.
# Keys are regular expressions (hence the escaped parentheses); values are
# the replacement text. Accents are dropped, parentheses removed and hyphens
# turned into spaces.
NORMALIZATIONS = {
    'ç': 'c',
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ô': 'o',
    'â': 'a',
    'î': 'i',
    'ê': 'e',
    r'\(': '',
    r'\)': '',
    'û': 'u',
    '-': ' ',
}
# Manual overrides for station names that normalization alone cannot match.
# Keys are raw names as passed to the lookup (after stripping surrounding
# whitespace); values are the station names to look for instead. The lookup is
# an exact, case-sensitive dictionary lookup performed BEFORE lowercasing and
# normalization, so keys must reproduce the raw spelling (including any
# repeated spaces). Values go through the same normalization as any other name.
REPLACEMENTS = {
    'LES AUBRAIS': 'Les Aubrais-Orléans',
    'SAUMUR': 'Saumur-Rive-Droite',
    'STRASBOURG': 'Strasbourg-Ville',
    'AVIGNON': 'Avignon-Terminal TER',
    'CAUSSADE TARN ET GARONNE': 'Caussade',
    'MONTPELLIER': 'Montpellier-St-Roch',
    'BRASSAC LES MINES': 'Brassac-les-Mines-Ste-Florine',
    'RANG-DU FLIERS': 'Rang-du-Fliers-Verton',
    'AULNOYE': 'Aulnoye-Aymeries',
    'LONGUEVILLE (IDF)': 'Longueville',
    'VIERZON': 'Vierzon-Ville',
    'ROMILLY': 'Romilly-sur-Seine',
    'NEMOURS': 'Nemours-St-Pierre',
    'TULLE (+ REBROUSSEMENT)': 'Tulle',
    'LA PARDIEU': 'Clermont-La Pardieu',
    'VERNON': 'Vernon-Giverny',
    'LA BASTIDE ST LAURENT': 'La Bastide-St-Laurent-les-Bains',
    'PARIS MONTPARNASSE VAUGIRARD': 'Paris-Montparnasse',
    'ROUEN': 'Rouen-Rive-Droite',
    'ST MARIENS': 'St-Mariens-St-Yzan',
    'SAINT AMAND MONTROND': 'St-Amand-Montrond-Orval',
    'PARIS SAINT LAZARE': 'Paris-St-Lazare',
    'TERRASSON': 'Terrasson-Lavilledieu',
    'BEUIL': 'Bueil',
    'BORDEAUX': 'Bordeaux-St-Jean',
    'BOULOGNE': 'Boulogne-Ville',
    'BOULOGNE SUR MER': 'Boulogne-Tintelleries',
    'EVREUX': 'Évreux-Embranchement',
    'GRAND COMBES-LA PISE': 'Grand\'Combe-La Pise',
    'LA ROCHELLE': 'La Rochelle-Ville',
    'LIMOGES': 'Limoges-Bénédictins',
    'LYON PERRACHE': 'Lyon-Perrache-Voyageurs',
    'MARSEILLE': 'Marseille-St-Charles',
    'MONTLUÇON': 'Montluçon-Ville',
    'MONTRÉJEAU': 'Montréjeau-Gourdan-Polignan',
    'PARIS GARE DE BERCY': 'Paris-Bercy',
    'PARIS MONTPARNASSE 3 VAUGIRARD': 'Paris-Montparnasse',
    'RIOM': 'Riom-Châtel-Guyon',
    'SAINT FLORENT SUR CHER': 'St-Florent-sur-Cher',
    'SAINT PIERRE DES CORPS': 'St-Pierre-des-Corps',
    'ST CHÉLY': 'St-Chély-d\'Apcher',
    'ST FLOUR': 'St-Flour-Chaudes-Aigues',
    'ST JEAN DE LUZ': 'St-Jean-de-Luz-Ciboure',
    'TOULOUSE': 'Toulouse-Matabiau',
    'VALLON EN SULLY': 'Vallon',
    'VENDEUVRE': 'Vendeuvre-Jort',
    'NOGENT': 'Nogent-sur-Marne',
    'ANGERS': 'Angers-St-Laud',
    'LYON PERRACHE  VILLE': 'Lyon-Perrache-Voyageurs',
    'PARIS AUSTERLITZ  VILLE': 'Paris-Austerlitz',
    'PARIS EST  SUR SEINE': 'Paris-Est',
    'PARIS NORD  AYMERIES': 'Paris-Nord',
    'VENDEUVRE SUR BARSE': 'Vendeuvre (Aube)',
    'ARGELES': 'Argelès-sur-Mer',
}

def _from_od_get_matches(search, pattern):
    """Return the name in `stations` that matches a raw origin/destination name.

    The raw name is stripped, swapped for its manual override from REPLACEMENTS
    if it has one, lowercased, run through NORMALIZATIONS, and finally has its
    runs of spaces collapsed. The result is compared for equality with `search`.

    Args:
        search: Series of already-normalized station names, aligned with the
            rows of `stations` (it is used as a boolean mask on `stations`).
        pattern: raw station name to resolve.

    Returns:
        The `Name` of the first row of `stations` whose entry in `search`
        equals the normalized pattern.

    Raises:
        IndexError: if no station matches; the message names the raw and the
            normalized pattern to make fixing REPLACEMENTS straightforward.
    """
    raw_name = pattern.strip()
    # Overrides are keyed by the raw (stripped, not yet lowercased) name.
    normalized = REPLACEMENTS.get(raw_name, raw_name).lower()
    # NORMALIZATIONS keys are regular expressions; applying them with re.sub
    # gives exactly the semantics used to normalize the stations side
    # (Series.replace(..., regex=True)), instead of hand-unescaping the keys.
    for pat, repl in NORMALIZATIONS.items():
        normalized = re.sub(pat, repl, normalized)
    # Removing parentheses/hyphens can leave repeated or dangling spaces.
    normalized = re.sub(' +', ' ', normalized).strip()
    matches = stations.loc[search == normalized, 'Name']
    if matches.empty:
        raise IndexError(
            'No station matches {!r} (normalized to {!r})'.format(raw_name, normalized))
    return matches.iloc[0]

def get_fixed_name(name):
    """Resolve a raw station name to the matching name found in `stations`.

    Builds a lowercased copy of the station names, applies every
    NORMALIZATIONS substitution to it (as regular expressions), and delegates
    the actual lookup of `name` to `_from_od_get_matches`.

    Args:
        name: raw station name to resolve.

    Returns:
        Whatever `_from_od_get_matches` returns for `name`.
    """
    normalized_names = stations['Name'].str.lower()
    for regex, substitute in NORMALIZATIONS.items():
        normalized_names = normalized_names.replace(regex, substitute, regex=True)
    return _from_od_get_matches(normalized_names, name)

# Replace raw origin/destination names by the matching station names.
# get_fixed_name re-normalizes the whole stations table on every call, so each
# distinct raw name is resolved once and the result is mapped back onto the rows.
origin_names = {
    raw: get_fixed_name(raw) for raw in intercity_prices.Origin.unique()}
destination_names = {
    raw: get_fixed_name(raw) for raw in intercity_prices.Destination.unique()}
intercity_prices = intercity_prices.assign(
    Origin=intercity_prices.Origin.map(origin_names),
    Destination=intercity_prices.Destination.map(destination_names))

# Different raw spellings can resolve to the same station (for instance
# 'LYON PERRACHE' and 'LYON PERRACHE  VILLE'), so pairs that were distinct
# before the renaming may now be duplicated. Keep only the cheapest price per
# pair; the stable sort preserves the existing order among equal prices.
intercity_prices = intercity_prices.sort_values(by='Price', kind='mergesort')
intercity_prices = intercity_prices.drop_duplicates(subset=['Origin', 'Destination'], keep='first')
intercity_prices.head(3)

Unnamed: 0,Origin,Destination,Price
200,Lyon-Part-Dieu,Lyon-Perrache-Voyageurs,1.0
0,Les Aubrais-Orléans,Orléans,1.2
1,St-Pierre-des-Corps,Tours,1.4


In [17]:
# Write the cleaned price table to DATA_DIR as CSV, leaving out the index column.
intercity_prices.to_csv(path_or_buf=DATA_DIR + '_intercity_prices.csv', index=False)