In [1]:
import re

import pandas as pd
import networkx as nx

In [2]:
# Directory holding the CSV files this notebook reads and writes. Paths are
# built by plain string concatenation, so the trailing slash is required.
DATA_DIR = 'data/'

In [3]:
# Load the semicolon-separated fare table, keep only the two route endpoints
# and the 'Plein Tarif 2nde' fare column, and give them English names.
intercity_prices = (
    pd.read_csv(DATA_DIR + 'tarifs-intercites-100-eco.csv', sep=';')
    .loc[:, ['Origine', 'Destination', 'Plein Tarif 2nde']]
    .rename(columns={'Origine': 'Origin', 'Plein Tarif 2nde': 'Price'})
)
intercity_prices.head(3)

Unnamed: 0,Origin,Destination,Price
0,ANGERS ST LAUD,AMBOISE,34.0
1,LA BAULE ESCOUBLAC,AMBOISE,68.0
2,ST NAZAIRE,AMBOISE,65.0


In [5]:
# Reference list of stations; only the 'Name' column is used below.
stations = pd.read_csv(DATA_DIR + '_stations.csv').loc[:, ['Name']]
stations.head(3)

Unnamed: 0,Name
0,La Villeneuve-au-Chène
1,St-Léger-Moussey
2,Romilly-sur-Seine


In [8]:
# Character-level rewrites applied to station names before comparison.
# Keys are regular expressions (which is why the parentheses are escaped);
# values are the replacement text. Entries are applied in insertion order.
NORMALIZATIONS = {
    'ç': 'c',
    'é': 'e',
    'è': 'e',
    'ë': 'e',
    'ô': 'o',
    'â': 'a',
    'î': 'i',
    'ê': 'e',
    '\\(': '',
    '\\)': '',
    'û': 'u',
    '-': ' ',
}

# Whole-name overrides: a stripped label from the price table (key) is swapped
# for the given station name (value) before any normalization happens.
REPLACEMENTS = {
    'LES AUBRAIS': 'Les Aubrais-Orléans',
    'SAUMUR': 'Saumur-Rive-Droite',
    'STRASBOURG': 'Strasbourg-Ville',
}

def _from_od_get_matches(search, pattern):
    """Return the station name that corresponds to a raw price-table label.

    The label is stripped, swapped for its ``REPLACEMENTS`` override when one
    exists, lower-cased, rewritten with ``NORMALIZATIONS``, and finally has
    runs of spaces collapsed. The result is compared for equality against
    ``search``.

    Args:
        search: Series of normalized station names, used as a boolean mask
            over the module-level ``stations`` frame.
        pattern: Raw origin/destination label (a string).

    Returns:
        The ``Name`` value of the first ``stations`` row whose entry in
        ``search`` equals the normalized label.

    Raises:
        IndexError: If no station matches. The message carries both the raw
            and the normalized label so the offending row can be identified.
    """
    label = pattern.strip()
    label = REPLACEMENTS.get(label, label).lower()
    for pat, repl in NORMALIZATIONS.items():
        # NORMALIZATIONS keys are regular expressions; drop the escaping
        # backslash so the key can be used with the literal str.replace.
        literal = pat[1:] if pat.startswith('\\') else pat
        label = label.replace(literal, repl)
    # Replacing '-' or parentheses can leave doubled or edge spaces.
    label = re.sub(' +', ' ', label).strip()

    matches = stations[search == label].Name
    if matches.empty:
        # Same exception type as the bare .iloc[0] failure, but it now says
        # which label could not be resolved.
        raise IndexError(
            'no station matches {!r} (normalized to {!r})'.format(pattern, label))
    return matches.iloc[0]

def get_fixed_name(name):
    """Map a raw price-table station label to its name in ``stations``.

    Builds a normalized copy of ``stations.Name`` (lower-cased, rewritten with
    the ``NORMALIZATIONS`` regexes, runs of spaces collapsed, edges trimmed)
    and delegates the lookup to ``_from_od_get_matches``.

    Args:
        name: Raw origin/destination label (a string).

    Returns:
        Whatever ``_from_od_get_matches`` returns for this label.
    """
    search = stations.Name.str.lower()
    for pat, repl in NORMALIZATIONS.items():
        search = search.replace(pat, repl, regex=True)
    # The label side is whitespace-collapsed and stripped before comparison.
    # Do the same here, otherwise a station whose normalized form contains
    # doubled or edge spaces (e.g. 'a - b' -> 'a   b') can never match.
    search = search.replace(' +', ' ', regex=True).str.strip()
    return _from_od_get_matches(search, name)

# Swap the raw labels at both ends of each route for the matching station
# names. Both columns are computed from the frame as it is before assignment.
intercity_prices = intercity_prices.assign(
    Origin=intercity_prices['Origin'].apply(get_fixed_name),
    Destination=intercity_prices['Destination'].apply(get_fixed_name),
)
intercity_prices.head(3)

Unnamed: 0,Origin,Destination,Price
0,Angers-St-Laud,Amboise,34.0
1,La Baule-Escoublac,Amboise,68.0
2,St-Nazaire,Amboise,65.0


In [9]:
# Save the cleaned fare table next to the inputs, without the row index.
intercity_prices.to_csv(path_or_buf=DATA_DIR + '_intercity_prices.csv', index=False)