# Predicting train delays and detecting unusual service patterns



In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, date, timedelta
import os

import matplotlib.pyplot as plt
import seaborn as sns
from modules.data_skew import numeric_col_distributions

## Reading in and cleaning service data

In [None]:
# Rolling-stock class numbers listed against the company that runs them.
train_companies = {
    'Great Western Railway': [800, 802, 387, 175, 165, 166, 57, 150, 158],
    'Elizabeth Line': [345],
    'Cross Country': [220, 221],
    'South Western Railway': [455, 444, 450, 458, 701, 159]
}

# Flatten the mapping into a lookup table with one (company, lead_class) row
# per class number, in the same order as the dict above. `lead_class` is held
# as float, presumably to match the dtype of `service_data['lead_class']` for
# the later merge — confirm against the CSV.
train_companies_df = pd.DataFrame(
    [
        (operator, float(class_number))
        for operator, class_numbers in train_companies.items()
        for class_number in class_numbers
    ],
    columns=['company', 'lead_class'],
)

In [None]:

# Location of the raw service export. The default is the original author's
# local path; set the TRAIN_SERVICE_DATA_CSV environment variable to point the
# notebook at a copy stored elsewhere instead of editing this cell.
service_data_path = os.environ.get(
    'TRAIN_SERVICE_DATA_CSV',
    r"C:\Users\fcpen\Documents\GitHub\Train_delays_and_services\data\RDG_2024-2025_ALL.csv",
)
service_data = pd.read_csv(service_data_path)

# Only interested in passenger train services: keep rows whose transport type
# is 'train' and whose lead class is not 66. Rows with a missing `lead_class`
# are kept, because NaN != 66 evaluates to True.
is_passenger_train = (service_data['transport_type'] == 'train') & (service_data['lead_class'] != 66)
service_data = service_data[is_passenger_train].drop(columns=['transport_type', 'this_tiploc', 'this_crs'])

# Keywords that mark a non-passenger origin or destination. They are matched
# case-insensitively as substrings of the description. The dots in the
# abbreviations F.L.T, T.C and M.C.T are escaped so they match literal full
# stops; unescaped, `.` matches any character (e.g. `T.C` would also match the
# "toc" in "Stockport").
non_passenger_pattern = r'Siding|Sdgs|Sidings|Loop|Yard|Depot|Quarry|Freight|Freightliners|Reception|Recep|Receptions|Railhead|Jn|Terminal|Terminl|Refinery|Staff|Tml|Recp|Yd|F\.L\.T|Fuelling|Gbrf|T\.C|Works|Tarmac|Trsmd|Docks|Fh|Dock|Fhh|M\.C\.T|Sdg|Cargo|Waste'
touches_non_passenger = (
    service_data['origin_description'].str.contains(non_passenger_pattern, case=False, na=False, regex=True)
    | service_data['destination_description'].str.contains(non_passenger_pattern, case=False, na=False, regex=True)
)
service_data = service_data[~touches_non_passenger]

# Flag cancelled schedules and attach the operating company by lead class.
# `.assign` builds a new frame rather than writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning. The left merge keeps
# every service; classes absent from `train_companies_df` get a null company.
service_data = (
    service_data
    .assign(was_cancelled=lambda frame: frame['stp_indicator'] == 'CAN')
    .merge(train_companies_df, on='lead_class', how='left')
)


In [None]:
# Preview the first rows of the filtered service data (head() defaults to 5 rows).
service_data.head()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
service_data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
service_data.describe()

In [None]:
# (rows, columns) of the services formed of more than 12 vehicles.
service_data.loc[service_data['num_vehicles'].gt(12)].shape

In [None]:
# Missing values per column, with the most incomplete columns first.
service_data.isna().sum().sort_values(ascending=False)

Missing values in the datetime columns are most likely a consequence of the nature of the service itself, so I will leave them as they are. For the delay columns (in minutes), I will replace nulls with zeroes. Because so few services have a missing platform number, I will drop those rows. For the train company, number of carriages and train class, I will use domain knowledge of which company runs services between a service's origin and destination stations.

In [None]:
# Remove every service that lacks either the planned or the actual platform.
# Done in place so `service_data` stays the same object for the cells below.
service_data.dropna(axis=0, how='any', subset=['platform', 'platform_actual'], inplace=True)

In [None]:
# Delay columns (minutes) in which a missing value is treated as zero delay.
selected_cols = ['actual_arr_delay_mins', 'actual_dep_delay_mins', 'actual_pass_delay_mins']

# Fill only the listed columns; nulls in every other column are left untouched.
service_data = service_data.fillna(value=dict.fromkeys(selected_cols, 0))

In [None]:
def train_company(origin: str, destination: str):
    """
    Infer the operating company of a service from its two endpoints.

    Both names are stripped of surrounding whitespace and compared exactly
    (case-sensitive). The rules are tried in order and the first match wins:

    1. Either endpoint is Abbey Wood, London Liverpool Street or Shenfield
       -> 'Elizabeth Line'.
    2. The destination is London Paddington, or the origin is London
       Paddington and the destination is not Reading
       -> 'Great Western Railway'.
    3. Either endpoint is Birmingham New Street, Manchester Piccadilly,
       Bournemouth, York or Banbury -> 'Cross Country'.
    4. Either endpoint is London Victoria -> 'South Western Railway'.

    Returns the company name as a string, or `None` when an argument is not a
    string (e.g. NaN) or when no rule applies.
    """
    # Nulls and other non-string values cannot be matched against station names.
    if not (isinstance(origin, str) and isinstance(destination, str)):
        return None

    start, end = origin.strip(), destination.strip()
    endpoints = {start, end}

    # Rule 1: Elizabeth Line stations at either end.
    if endpoints & {'Abbey Wood', 'London Liverpool Street', 'Shenfield'}:
        return 'Elizabeth Line'

    # Rule 2: Paddington services; Paddington -> Reading is deliberately left
    # to the later rules and otherwise stays undetermined.
    paddington = 'London Paddington'
    if end == paddington or (start == paddington and end != 'Reading'):
        return 'Great Western Railway'

    # Rule 3: Cross Country stations at either end.
    if endpoints & {'Birmingham New Street', 'Manchester Piccadilly', 'Bournemouth', 'York', 'Banbury'}:
        return 'Cross Country'

    # Rule 4: London Victoria at either end.
    if 'London Victoria' in endpoints:
        return 'South Western Railway'

    # Unable to determine
    return None

In [None]:
# Apply `train_company` to rows with missing `company` and show results
# Wherever the merge on `lead_class` left `company` empty, infer the operator
# from the service's origin and destination, then report how many were resolved.
mask = service_data['company'].isnull()
before = mask.sum()

route_cols = ['origin_description', 'destination_description']
service_data.loc[mask, 'company'] = service_data.loc[mask, route_cols].apply(
    lambda route: train_company(route['origin_description'], route['destination_description']),
    axis=1,
)

after = service_data['company'].isnull().sum()
print(f'Company missing before: {before}, after: {after}')

# Show remaining ambiguous rows for manual review
service_data.loc[service_data['company'].isnull()].head(25)

In [None]:
# Number of services per company; rows with a null company are not counted.
service_data['company'].value_counts()

In [None]:
# Filling in null companies with the most common company - Great Western Railway
# Any service still without a company is attributed to Great Western Railway,
# which the author identifies as the most common company in the data.
service_data['company'] = service_data['company'].fillna('Great Western Railway')

In [None]:
# Re-check the missing values per column now that rows were dropped and nulls filled.
service_data.isna().sum().sort_values(ascending=False)

In [None]:
# (rows, columns) of the services with a negative recorded departure delay.
service_data.loc[service_data['actual_dep_delay_mins'].lt(0)].shape

In [None]:
# Numeric columns to inspect: the three delay measures (minutes) and the
# number of vehicles in the train.
numerical_cols = ['actual_arr_delay_mins', 'actual_dep_delay_mins', 'actual_pass_delay_mins', 'num_vehicles']

# Project helper imported from modules.data_skew. Presumably it plots the
# distribution (histogram) of each listed column — TODO confirm against the
# helper's source, which is not part of this notebook.
numeric_col_distributions(service_data, numerical_cols)

As expected, the delay columns are heavily skewed, because the majority of trains at Reading are not delayed. The number of carriages is much less skewed: most trains run in standard configurations, and train length is limited by how long a station platform can be. The histogram shows that very few trains are longer than 12 carriages, as very few platforms can accommodate trains of that length.

In [None]:
# services_per_day = service_data.groupby('run_date')['schedule_uid'].agg('count')

# plt.figure(figsize=(14, 10))
# services_per_day.plot(kind='line')
# plt.title('Services per day', fontsize=13, fontweight='bold')
# plt.ylabel('Number of services')
# plt.show()