In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import re

## Cleaning the data for the lines and their stations

In [6]:
# Load the raw link-level table; the path is relative to the notebook's
# working directory. Per the preview below, each row carries a Line, a Dir
# code and a From Station / To Station pair.
df = pd.read_csv('../data/NBT20FRI-new.csv')

In [18]:
# Preview the raw table (the rendered output elides the middle rows).
df

Unnamed: 0,Line,Dir,From Station,To Station
0,Bakerloo,NB,Elephant & Castle LU,Lambeth North
1,Bakerloo,NB,Lambeth North,Waterloo LU
2,Bakerloo,NB,Waterloo LU,Embankment
3,Bakerloo,NB,Embankment,Charing Cross LU
4,Bakerloo,NB,Charing Cross LU,Piccadilly Circus
...,...,...,...,...
1154,Victoria,SB,Pimlico,Vauxhall LU
1155,Victoria,SB,Vauxhall LU,Stockwell
1156,Victoria,SB,Stockwell,Brixton LU
1157,Waterloo & City,EB,Waterloo LU,Bank


In [10]:
def clean_tube_data(df):
    """Return the unique (line, station) pairs found in a link-level table.

    Every station that appears in either the 'From Station' or the
    'To Station' column is reported once per line, with operator suffixes
    and a trailing bracketed qualifier removed from its name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Line', 'From Station' and 'To Station'.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'Line' and 'Station Name', de-duplicated, sorted by line
        and then station name, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    suffix_re = re.compile(r'\s+(LU|LO|NR|TfL|EL)$')
    bracket_re = re.compile(r'\s+\([^)]+\)$')

    def clean_station_name(name):
        # Missing cells (NaN/None) are passed through untouched so they can
        # be dropped below instead of making re.sub raise TypeError.
        if not isinstance(name, str):
            return name
        # Remove suffixes like LU, LO, NR, TfL, EL
        name = suffix_re.sub('', name)
        # Remove suffixes in brackets
        name = bracket_re.sub('', name)
        return name.strip()

    # Stack both endpoints of every link. Rows are deliberately NOT
    # pre-filtered on ('Line', 'From Station'): doing so throws away the
    # 'To Station' of each discarded row, which loses any station that only
    # ever appears as a destination (e.g. on a branch or one-way section).
    # The final drop_duplicates already removes the repeats.
    stations = pd.concat(
        [
            df[['Line', column]].rename(columns={column: 'Station Name'})
            for column in ('From Station', 'To Station')
        ],
        ignore_index=True,
    )
    stations['Station Name'] = stations['Station Name'].map(clean_station_name)

    # Remove missing names and duplicates, then sort
    stations = (
        stations
        .dropna(subset=['Station Name'])
        .drop_duplicates()
        .sort_values(['Line', 'Station Name'])
    )

    return stations.reset_index(drop=True)

In [17]:
# Derive the per-line station list from the raw link table and display it.
clean_stations = clean_tube_data(df)
clean_stations

Unnamed: 0,Line,Station Name
0,Bakerloo,Baker Street
1,Bakerloo,Charing Cross
2,Bakerloo,Edgware Road
3,Bakerloo,Elephant & Castle
4,Bakerloo,Embankment
...,...,...
590,Victoria,Victoria
591,Victoria,Walthamstow Central
592,Victoria,Warren Street
593,Waterloo & City,Bank


In [19]:
# Write the station list to disk as comma-separated UTF-8 text, with a header
# row and without the DataFrame index column.
clean_stations.to_csv(
    '../data/station-with-no-coordinates.csv',
    sep=',',
    encoding='utf-8',
    index=False,
    header=True,
)