# Predicting train delays and detecting unusual service patterns



In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, date, timedelta
import os

import matplotlib.pyplot as plt
import seaborn as sns
from modules.data_skew import numeric_col_distributions

In [None]:
# Show full cell contents in DataFrame output instead of truncating long strings.
pd.options.display.max_colwidth = None

## Reading in and cleaning service data

In [None]:
# Operating company -> lead vehicle classes attributed to that company.
train_companies = {
    'Great Western Railway': [800, 802, 387, 175, 165, 166, 57, 150, 158],
    'Elizabeth Line': [345],
    'Cross Country': [220, 221],
    'South Western Railway': [455, 444, 450, 458, 701, 159]
}

# Flatten the mapping into a long lookup table: one (company, lead_class) row
# per class, in dict order. `lead_class` is stored as float64 so it can be
# used directly as the join key against the service data.
train_companies_df = pd.DataFrame(
    [
        (operator, float(class_number))
        for operator, class_numbers in train_companies.items()
        for class_number in class_numbers
    ],
    columns=['company', 'lead_class'],
)

In [None]:

# Location of the raw service export. Defaults to the repository's data/
# folder (relative to the working directory) so the notebook is not tied to
# one machine; set the TRAIN_SERVICES_CSV environment variable to override.
# NOTE(review): the default assumes the notebook runs from the repo root - confirm.
service_data_path = os.environ.get(
    'TRAIN_SERVICES_CSV',
    os.path.join('data', 'RDG_2024-2025_ALL.csv'),
)
service_data = pd.read_csv(service_data_path)

# Keywords that mark an origin/destination as a non-passenger location.
# The alternation is wrapped in word boundaries so a keyword only matches as a
# whole word: matched as bare substrings (case-insensitively), 'Staff' also
# hits "Stafford", 'Yd' hits "Croydon" and 'Yard' hits "Dockyard".
# The dots in F.L.T are escaped (and optional) so they no longer act as
# "any character" wildcards.
# NOTE(review): 'Terminal' still matches as a whole word (e.g. airport
# terminal stations) - confirm that is intended.
non_passenger_pattern = (
    r'\b(?:Siding|Sdgs|Sidings|Loop|Yard|Depot|Quarry|Freight|Freightliners'
    r'|Reception|Recep|Receptions|Railhead|Jn|Terminal|Terminl|Refinery|Staff'
    r'|Tml|Recp|Yd|F\.?L\.?T)\b'
)

# Missing descriptions count as "not matching" (na=False), so they are kept.
origin_is_non_passenger = service_data['origin_description'].str.contains(
    non_passenger_pattern, case=False, na=False, regex=True
)
destination_is_non_passenger = service_data['destination_description'].str.contains(
    non_passenger_pattern, case=False, na=False, regex=True
)

# Only interested in passenger train services: keep trains, drop lead class 66,
# and drop anything starting or ending at a non-passenger location.
is_passenger_service = (
    (service_data['transport_type'] == 'train')
    & (service_data['lead_class'] != 66)
    & ~origin_is_non_passenger
    & ~destination_is_non_passenger
)

service_data = (
    service_data.loc[is_passenger_service]
    # .assign works on a new frame, avoiding SettingWithCopyWarning from
    # writing a column onto a filtered slice.
    .assign(was_cancelled=lambda df: df['stp_indicator'] == 'CAN')
    # Left join keeps every service; classes without a known company get NaN.
    # validate= guards against a duplicated lead_class in the lookup silently
    # multiplying service rows.
    .merge(train_companies_df, on='lead_class', how='left', validate='many_to_one')
)


In [None]:
# Preview the first rows of the service data.
service_data.head()

In [None]:
# Column names, dtypes and non-null counts - a quick check for missing values.
service_data.info()

In [None]:
# Summary statistics for the service data columns.
service_data.describe()

In [None]:
# (rows, columns) of the subset of services formed of more than 12 vehicles;
# the row count shows how rare such formations are.
service_data.loc[service_data['num_vehicles'].gt(12)].shape

In [None]:
# Column names, dtypes and non-null counts.
# NOTE(review): this repeats the earlier service_data.info() call; no cell in
# between visibly modifies service_data, so one of the two may be redundant.
service_data.info()

In [None]:
# Numeric columns to inspect: the three delay measures (arrival, departure,
# pass) plus the number of vehicles in the train.
numerical_cols = [
    *(f'actual_{event}_delay_mins' for event in ('arr', 'dep', 'pass')),
    'num_vehicles',
]

# Project helper from modules.data_skew; called with the frame and the column list.
numeric_col_distributions(service_data, numerical_cols)

As expected, the train delay columns are very skewed, as the majority of trains at Reading aren't delayed. The number of carriages is not very skewed: most trains run in standard configurations, and there's only so long a station platform can be. The histogram shows that very few trains are longer than 12 carriages, since very few platforms can accommodate them.

In [None]:
# NOTE(review): disabled exploratory cell - daily service counts drawn as a
# line chart. As written the cell does nothing when run; either re-enable it
# or remove it before sharing the notebook.
# services_per_day = service_data.groupby('run_date')['schedule_uid'].agg('count')

# plt.figure(figsize=(14, 10))
# services_per_day.plot(kind='line')
# plt.title('Services per day', fontsize=13, fontweight='bold')
# plt.ylabel('Number of services')
# plt.show()