# Train Delay Data v2

Use this file to explore and pre-process the data

### Import library's

In [64]:
import os
import pandas as pd, numpy as np, copy
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set the option to display all columns, without the "..." in the middle
pd.set_option('display.max_columns', None)


### Location of files

In [65]:
train_data_dir = 'data/delay data' # Directory where the csv data is stored

In [66]:
def find_csv_filenames(path_to_dir, suffix=".csv"):
    filenames = os.listdir(path_to_dir)
    return [filename for filename in filenames if filename.endswith(suffix)]

def concatenate_csv_files(directory):
    frames = []
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                df = pd.read_csv(os.path.join(subdir, file))
                frames.append(df)
    return pd.concat(frames)

# Replace 'your_directory' with the directory you want to search
train_dataset = concatenate_csv_files(train_data_dir)

  df = pd.read_csv(os.path.join(subdir, file))


# Encoding Values

- Below code will convert the string data data types with time stamp values into pandas timestamp value then convert to number of seconds from midnight
- The station unique values are encoded using label encoding
- the Bool values are encoded using binary encoding

- NaN values are transformed into `-1'

In [67]:
columns_for_binary_encoding = []
columns_for_one_hot_encoding = []
columns_for_label_encoding = []
columns_for_target_encoding = []

encoding_dict = {}

for column in train_dataset.columns:
    if train_dataset[column].dtype == 'object':
        if len(train_dataset[column].unique()) == 2:
            columns_for_binary_encoding.append(column)
            encoding_dict[column] = 'Binary Encoding'

        elif len(train_dataset[column].unique()) > 2 and len(train_dataset[column].unique()) < 10:
            columns_for_one_hot_encoding.append(column)
            encoding_dict[column] = 'One Hot Encoding'

        elif len(train_dataset[column].unique()) > 11 and len(train_dataset[column].unique()) < 50:
            columns_for_label_encoding.append(column)
            encoding_dict[column] = 'Label Encoding'

        elif len(train_dataset[column].unique()) > 50:
            columns_for_target_encoding.append(column)
            encoding_dict[column] = 'Target Encoding'

print('Columns for Binary Encoding:', columns_for_binary_encoding)
print('Columns for One Hot Encoding:', columns_for_one_hot_encoding)
print('Columns for Label Encoding:', columns_for_label_encoding)
print('Columns for Target Encoding:', columns_for_target_encoding)
# print('Columns for for y:', labels)
print('\n' + '_' * 20 + '\n')

unique_counts = pd.DataFrame.from_records(
    [(col, train_dataset[col].dtype, len(train_dataset[col].unique()), encoding_dict.get(col, 'No Encoding')) for col in train_dataset.columns],
    columns=['Column_Name', 'Data_Type', 'Num_Unique_Values', 'Encoding']
)

unique_counts

Columns for Binary Encoding: ['arr_atRemoved', 'pass_atRemoved', 'dep_atRemoved']
Columns for One Hot Encoding: ['dep_wet']
Columns for Label Encoding: ['tpl']
Columns for Target Encoding: ['pta', 'ptd', 'wta', 'wtp', 'wtd', 'arr_et', 'arr_wet', 'pass_et', 'dep_et', 'arr_at', 'pass_at', 'dep_at']

____________________



Unnamed: 0,Column_Name,Data_Type,Num_Unique_Values,Encoding
0,rid,int64,55552,No Encoding
1,tpl,object,47,Label Encoding
2,pta,object,1152,Target Encoding
3,ptd,object,1131,Target Encoding
4,wta,object,2156,Target Encoding
5,wtp,object,2297,Target Encoding
6,wtd,object,2110,Target Encoding
7,arr_et,object,1024,Target Encoding
8,arr_wet,object,786,Target Encoding
9,arr_atRemoved,object,2,Binary Encoding


## Column headers

| Code      | Description                   | Notes                         | Importance    |
| ----      | -----------                   | -----                         | ----------    |
| rid       | Train RTTI Train Identifier   | Unique code for train travel  |               |
| tpl       | TIPLOC (Timing point locations) | Unique station code         |               |
| pta       | Planned Time of Arrival       | 24hr Time value               |               |
| ptd       | Planned Time of Departure     | 24hr Time value               |               |
| wta       | Working (staff) Time of Arrival| 24hr Time value- with seconds|               |
| wtp       | Working Time of Passing       | 24hr Time value               |               |
| wtd       | Working Time of Departure     | 24hr Time value- with seconds |               |
| arr_et    | Estimated Arrival Time        | 24hr Time value               |               |
| arr_wet   | Working Estimated Time        | 24hr Time value               |               |
| arr_atRemoved | true if actual replaced by estimated | True / False       |               |
| pass_et   | Estimated Passing Time        | 24hr Time value               |               |    
| pass_wet  | Working Estimated Time        | ** 24hr Time value?           |               |             |
| pass_atRemoved | true if actual replaced by estimated | True / False      |               | 
| dep_et    | Estimated Departure           | 24hr Time value               |               |           |
| dep_wet    | Working Estimated Time       | ** 24hr Time value?           |               |
| dep_atRemoved | true if actual replaced by estimated | True / False       |               |
| arr_at    | Recorded Actual Time of Arrival | 24hr Time value             |               |
| pass_at   | Actual Passing Time           | 24hr Time value               |               |
| dep_at    | Actual Departure Time         | 24hr Time value               |               |
| cr_code   | Cancellation Reason Code      | Float value                   |               |
| lr_code   | Late Running Reason           | Float Value                   |               | 

In [75]:
train_dataset['elapsed_time_from_london'] = np.where(train_dataset['tpl'] == 'LIVST', 0, np.nan)

In [76]:
train_dataset.head(50)

Unnamed: 0,rid,tpl,pta,ptd,wta,wtp,wtd,arr_et,arr_wet,arr_atRemoved,pass_et,pass_wet,pass_atRemoved,dep_et,dep_wet,dep_atRemoved,arr_at,pass_at,dep_at,cr_code,lr_code,arr_at_Norwich,new_column,elapsed_time_from_london
0,202009016712165,LIVST,,07:00,,,07:00,,,,,,,,,False,,,06:59,,,,0.0,0.0
1,202009016712165,BTHNLGR,,,,07:03,,,,,,,False,,,,,07:03,,,,,,
2,202009016712165,BOWJ,,,,07:05,,,,,,,False,,,,,07:04,,,,,,
3,202009016712165,MRYLAND,,,,07:06:30,,,,,,,False,,,,,07:06,,,,,,
4,202009016712165,STFD,,,,07:06,,,,,07:07,,False,,,,,,,,,,,
5,202009016712165,FRSTGTJ,,,,07:07:30,,,,,,,False,,,,,07:07,,,,,,
6,202009016712165,ILFORD,,,,07:09,,,,,,,False,,,,,07:07,,,,,,
7,202009016712165,MANRPK,,,,07:08,,,,,07:07,,False,,,,,,,,,,,
8,202009016712165,SVNKNGS,,,,07:09:30,,,,,,,False,,,,,07:08,,,,,,
9,202009016712165,GODMAYS,,,,07:10,,,,,,,False,,,,,07:09,,,,,,


In [72]:
train_dataset[['tpl','arr_at','pass_at','dep_at']].head(40)

Unnamed: 0,tpl,arr_at,pass_at,dep_at
0,LIVST,,,06:59
1,BTHNLGR,,07:03,
2,BOWJ,,07:04,
3,MRYLAND,,07:06,
4,STFD,,,
5,FRSTGTJ,,07:07,
6,ILFORD,,07:07,
7,MANRPK,,,
8,SVNKNGS,,07:08,
9,GODMAYS,,07:09,


## Creating a new column / feature for the arrival time at Norwich

In [69]:
train_dataset['arr_at_Norwich'] = np.where(train_dataset['tpl'] == 'NRCH', train_dataset['arr_at'], np.nan)

train_dataset.loc[train_dataset['tpl'] == 'NRCH', 'arr_at'] = np.nan

train_dataset

Unnamed: 0,rid,tpl,pta,ptd,wta,wtp,wtd,arr_et,arr_wet,arr_atRemoved,pass_et,pass_wet,pass_atRemoved,dep_et,dep_wet,dep_atRemoved,arr_at,pass_at,dep_at,cr_code,lr_code,arr_at_Norwich
0,202009016712165,LIVST,,07:00,,,07:00,,,,,,,,,False,,,06:59,,,
1,202009016712165,BTHNLGR,,,,07:03,,,,,,,False,,,,,07:03,,,,
2,202009016712165,BOWJ,,,,07:05,,,,,,,False,,,,,07:04,,,,
3,202009016712165,MRYLAND,,,,07:06:30,,,,,,,False,,,,,07:06,,,,
4,202009016712165,STFD,,,,07:06,,,,,07:07,,False,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27013,202204308009724,DISS,01:03,01:05,01:03,,01:05,,,False,,,,,,False,01:02,,01:04,,,
27014,202204308009724,TROWSEJ,,,,01:19:30,,,,,,,False,,,,,01:18,,,,
27015,202204308009724,TRWSSBJ,,,,01:20,,,,,01:18,,False,,,,,,,,,
27016,202204308009724,NRCHTPJ,,,,01:20:30,,,,,,,False,,,,,01:19,,,,


## Dimension Reduction

## Converting string time vales to timestamp

In [54]:
def convert_string_to_seconds(str):
    date_time_value = pd.to_datetime(str, format='%H:%M')
    total_seconds = date_time_value.hour * 3600 + date_time_value.minute * 60 + date_time_value.second
    return total_seconds

def convert_seconds_to_string(seconds):
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    return "{:02d}:{:02d}:{:02d}".format(int(hours), int(minutes), int(seconds))

In [55]:
time= "12:34"

print(f'Orignal Value: {time}')
print(f'Converted Value: {convert_string_to_seconds(time)}')
print(f'Backwards Converted Value: {convert_seconds_to_string(convert_string_to_seconds(time))}')

Orignal Value: 12:34
Converted Value: 45240
Backwards Converted Value: 12:34:00


In [56]:
def convert_to_seconds(df, col, time_format):
    df[col] = pd.to_datetime(df[col], errors='coerce', format=time_format)
    seconds_since_midnight = df[col].dt.hour * 3600 + df[col].dt.minute * 60 + df[col].dt.second
    return seconds_since_midnight.fillna(-1)

# Define time columns
time_columns = train_dataset.columns.drop(['lr_code', 'cr_code', 'dep_atRemoved', 'pass_atRemoved', 'arr_atRemoved','tpl','rid','wta','wtd'])
time_columns_with_seconds = train_dataset[['wta','wtd']]

# Convert time strings to time objects for each column
for col in time_columns:
    train_dataset[col + '_seconds_since_midnight'] = convert_to_seconds(train_dataset, col, '%H:%M')
    train_dataset.drop(col, axis=1, inplace=True)

for col in time_columns_with_seconds:
    train_dataset[col + '_seconds_since_midnight'] = convert_to_seconds(train_dataset, col, '%H:%M:%S')
    train_dataset.drop(col, axis=1, inplace=True)

In [57]:
import datetime

# Function to convert seconds since midnight to time value
def seconds_to_time(seconds):
    # Calculate hours, minutes, and seconds
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    
    # Create a timedelta object representing the time duration
    time_delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
    
    # Use midnight as a reference point and add the time duration to it
    midnight = datetime.datetime.strptime('00:00:00', '%H:%M:%S').time()
    time_value = (datetime.datetime.combine(datetime.date.today(), midnight) + time_delta).time()
    
    return time_value

# Example usage
seconds_since_midnight = 23850.0  # Example value
time_value = seconds_to_time(seconds_since_midnight)
print("Time value:", time_value)


Time value: 06:37:30


## Encoding the tpl

In [58]:
from sklearn.preprocessing import LabelEncoder

# create the LabelEncoder object
le = LabelEncoder()

# fit the encoder
le.fit(train_dataset['tpl'])

# create a DataFrame with the original and encoded values
encoding_table = pd.DataFrame({
    'Original Value': le.classes_,
    'Encoded Value': range(len(le.classes_))
})

print(encoding_table)

train_dataset['tpl'] = le.fit_transform(train_dataset['tpl'])

list_encoded_stations = train_dataset['tpl']

train_dataset.head(33)

   Original Value  Encoded Value
0            BOWJ              0
1         BROXBRN              1
2         BRTWOOD              2
3         BTHNLGR              3
4         CHDWLHT              4
5         CHESHNT              5
6         CHLMSFD              6
7         CLCHSTR              7
8            DISS              8
9          FRSTGT              9
10        FRSTGTJ             10
11        GIDEAPK             11
12        GIDEPKJ             12
13        GODMAYS             13
14        HAGHLYJ             14
15        HAKNYNM             15
16        HFLPEVL             16
17        HRLDWOD             17
18        ILFELEJ             18
19         ILFORD             19
20        INGTSTL             20
21        INGTSTN             21
22        IPSWEPJ             22
23        IPSWESJ             23
24        IPSWHJN             24
25        IPSWICH             25
26        KELVEDN             26
27          LIVST             27
28        MANNGTR             28
29        

Unnamed: 0,rid,tpl,arr_atRemoved,pass_atRemoved,dep_atRemoved,cr_code,lr_code,pta_seconds_since_midnight,ptd_seconds_since_midnight,wtp_seconds_since_midnight,arr_et_seconds_since_midnight,arr_wet_seconds_since_midnight,pass_et_seconds_since_midnight,pass_wet_seconds_since_midnight,dep_et_seconds_since_midnight,dep_wet_seconds_since_midnight,arr_at_seconds_since_midnight,pass_at_seconds_since_midnight,dep_at_seconds_since_midnight,arr_at_Norwich_seconds_since_midnight,wta_seconds_since_midnight,wtd_seconds_since_midnight
0,202009016712165,27,,,False,,,-1.0,25200.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25140.0,-1.0,-1.0,-1.0
1,202009016712165,3,,False,,,,-1.0,-1.0,25380.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25380.0,-1.0,-1.0,-1.0,-1.0
2,202009016712165,0,,False,,,,-1.0,-1.0,25500.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25440.0,-1.0,-1.0,-1.0,-1.0
3,202009016712165,31,,False,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25560.0,-1.0,-1.0,-1.0,-1.0
4,202009016712165,38,,False,,,,-1.0,-1.0,25560.0,-1.0,-1.0,25620.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,202009016712165,10,,False,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25620.0,-1.0,-1.0,-1.0,-1.0
6,202009016712165,19,,False,,,,-1.0,-1.0,25740.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25620.0,-1.0,-1.0,-1.0,-1.0
7,202009016712165,29,,False,,,,-1.0,-1.0,25680.0,-1.0,-1.0,25620.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
8,202009016712165,41,,False,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25680.0,-1.0,-1.0,-1.0,-1.0
9,202009016712165,13,,False,,,,-1.0,-1.0,25800.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25740.0,-1.0,-1.0,-1.0,-1.0


#### Code Values

## Encoding the True / False values
True = 1

False = 0

NaN = -1

In [59]:
for col in columns_for_binary_encoding:
    # Map True to 1, False to 0, and NaN to a specific value (e.g., -1)
    train_dataset[col] = train_dataset[col].fillna(-1).astype(float)

train_dataset[['lr_code', 'cr_code']] = train_dataset[['lr_code', 'cr_code']].fillna(-1).astype(float)


train_dataset.sample(5)


Unnamed: 0,rid,tpl,arr_atRemoved,pass_atRemoved,dep_atRemoved,cr_code,lr_code,pta_seconds_since_midnight,ptd_seconds_since_midnight,wtp_seconds_since_midnight,arr_et_seconds_since_midnight,arr_wet_seconds_since_midnight,pass_et_seconds_since_midnight,pass_wet_seconds_since_midnight,dep_et_seconds_since_midnight,dep_wet_seconds_since_midnight,arr_at_seconds_since_midnight,pass_at_seconds_since_midnight,dep_at_seconds_since_midnight,arr_at_Norwich_seconds_since_midnight,wta_seconds_since_midnight,wtd_seconds_since_midnight
8700,202002107641923,22,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,42420.0,-1.0,-1.0,42300.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6762,202110117141587,31,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,34680.0,-1.0,-1.0,-1.0,-1.0
482,201801027621053,0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,36300.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,36300.0,-1.0,-1.0,-1.0,-1.0
18708,201903257629015,30,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,49380.0,-1.0,-1.0,-1.0,-1.0
17963,201707187101354,40,0.0,-1.0,0.0,-1.0,-1.0,62340.0,62340.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,62460.0,-1.0,62580.0,-1.0,62310.0,62370.0


In [60]:
unique_counts = pd.DataFrame.from_records(
    [(col, train_dataset[col].dtype, len(train_dataset[col].unique()), encoding_dict.get(col, 'No Encoding')) for col in train_dataset.columns],
    columns=['Column_Name', 'Data_Type', 'Num_Unique_Values', 'Encoding']
)

unique_counts

Unnamed: 0,Column_Name,Data_Type,Num_Unique_Values,Encoding
0,rid,int64,55552,No Encoding
1,tpl,int64,47,Label Encoding
2,arr_atRemoved,float64,2,Binary Encoding
3,pass_atRemoved,float64,2,Binary Encoding
4,dep_atRemoved,float64,2,Binary Encoding
5,cr_code,float64,112,No Encoding
6,lr_code,float64,181,No Encoding
7,pta_seconds_since_midnight,float64,1152,No Encoding
8,ptd_seconds_since_midnight,float64,1131,No Encoding
9,wtp_seconds_since_midnight,float64,1167,No Encoding


# Splitting the dataset

For the RNN to work it accepts data in steps. I am using the journey id 'rid' as the value of each journey $step$. Below shows there is an uneven number of step values,  the minority step values will be dropped.

In [14]:
# Group by 'rid', calculate the shape of each group, and count the occurrences of each shape
shape_counts = train_dataset.groupby('rid').apply(lambda x: x.shape).value_counts()

# Sort the Series by the first element of the shape tuple
sorted_shape_counts = shape_counts.sort_index(key=lambda x: x.map(lambda y: y[0]))

# Print the sorted shape counts
for shape, count in sorted_shape_counts.items():
    print(f"Shape: {shape}, Count: {count}")

Shape: (1, 22), Count: 1
Shape: (2, 22), Count: 20
Shape: (3, 22), Count: 61
Shape: (4, 22), Count: 36
Shape: (5, 22), Count: 10
Shape: (6, 22), Count: 504
Shape: (7, 22), Count: 357
Shape: (8, 22), Count: 121
Shape: (9, 22), Count: 428
Shape: (10, 22), Count: 40
Shape: (11, 22), Count: 17
Shape: (12, 22), Count: 34
Shape: (13, 22), Count: 39
Shape: (14, 22), Count: 179
Shape: (15, 22), Count: 93
Shape: (16, 22), Count: 137
Shape: (17, 22), Count: 8
Shape: (18, 22), Count: 15
Shape: (19, 22), Count: 9
Shape: (20, 22), Count: 7
Shape: (21, 22), Count: 11
Shape: (22, 22), Count: 22
Shape: (23, 22), Count: 41
Shape: (24, 22), Count: 49
Shape: (25, 22), Count: 289
Shape: (26, 22), Count: 123
Shape: (27, 22), Count: 12
Shape: (28, 22), Count: 131
Shape: (29, 22), Count: 199
Shape: (30, 22), Count: 373
Shape: (31, 22), Count: 9576
Shape: (32, 22), Count: 42459
Shape: (33, 22), Count: 59
Shape: (34, 22), Count: 4
Shape: (35, 22), Count: 88


  shape_counts = train_dataset.groupby('rid').apply(lambda x: x.shape).value_counts()


In [15]:
# Group by 'rid' and filter groups with shape greater than or equal to (32, 21)
filtered_test_data = train_dataset.groupby('rid').filter(lambda x: x.shape == (32, 22))


filtered_test_data

Unnamed: 0,rid,tpl,arr_atRemoved,pass_atRemoved,dep_atRemoved,cr_code,lr_code,pta_seconds_since_midnight,ptd_seconds_since_midnight,wtp_seconds_since_midnight,arr_et_seconds_since_midnight,arr_wet_seconds_since_midnight,pass_et_seconds_since_midnight,pass_wet_seconds_since_midnight,dep_et_seconds_since_midnight,dep_wet_seconds_since_midnight,arr_at_seconds_since_midnight,pass_at_seconds_since_midnight,dep_at_seconds_since_midnight,arr_at_Norwich_seconds_since_midnight,wta_seconds_since_midnight,wtd_seconds_since_midnight
0,202009016712165,27,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,25200.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25140.0,-1.0,-1.0,-1.0
1,202009016712165,3,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,25380.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25380.0,-1.0,-1.0,-1.0,-1.0
2,202009016712165,0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,25500.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25440.0,-1.0,-1.0,-1.0,-1.0
3,202009016712165,31,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25560.0,-1.0,-1.0,-1.0,-1.0
4,202009016712165,38,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,25560.0,-1.0,-1.0,25620.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27013,202204308009724,8,0.0,-1.0,0.0,-1.0,-1.0,3780.0,3900.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3720.0,-1.0,3840.0,-1.0,-1.0,-1.0
27014,202204308009724,43,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4680.0,-1.0,-1.0,-1.0,-1.0
27015,202204308009724,44,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,4800.0,-1.0,-1.0,4680.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
27016,202204308009724,34,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,4740.0,-1.0,-1.0,-1.0,-1.0


In [16]:
filtered_test_data.groupby('rid').apply(lambda x: x.shape).value_counts()

  filtered_test_data.groupby('rid').apply(lambda x: x.shape).value_counts()


(32, 22)    42459
Name: count, dtype: int64

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Define the proportion of data to allocate to the validation set (e.g., 20%)
validation_proportion = 0.2

# Identify unique journeys based on the 'rid' column
unique_journeys = filtered_test_data['rid'].unique()

# Calculate the number of unique journeys to allocate to the validation set
num_validation_journeys = int(len(unique_journeys) * validation_proportion)

# Select a subset of unique journeys for validation
validation_journeys = unique_journeys[-num_validation_journeys:]

# Split the data into train and validation sets based on the selected unique journeys
train_df = filtered_test_data[~filtered_test_data['rid'].isin(validation_journeys)]
validation_df = filtered_test_data[filtered_test_data['rid'].isin(validation_journeys)]

# Drop the 'rid' column from both dataframes
train_df = train_df.drop(columns=           ['rid','arr_atRemoved', 'pass_atRemoved', 'dep_atRemoved', 'pass_wet_seconds_since_midnight','wta', 'wtp'])
validation_df = validation_df.drop(columns= ['rid','arr_atRemoved', 'pass_atRemoved', 'dep_atRemoved', 'pass_wet_seconds_since_midnight','wta', 'wtp'])

labels= ['arr_at_Norwich_seconds_since_midnight']

# Split the train data into X and y
X_train = train_df.drop(columns=labels)
y_train = train_df[labels]

# Split the validation data into X and y
X_val = validation_df.drop(columns=labels)
y_val = validation_df[labels]

_X_val = validation_df

# Developing The Model

## Basic Regressor

In [21]:
from sklearn.neural_network import MLPRegressor
from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score

mlp = MLPRegressor(hidden_layer_sizes=150,solver='sgd', max_iter=50, activation='logistic',random_state=0, learning_rate_init=0.001,verbose = 1, momentum=0.9, tol=0.001, early_stopping=True)
mlp.fit(X_train, y_train)
# Make predictions
mlp_pred = mlp.predict(X_val)

y_val['MLP_Norwich_arr_seconds_since_midnight_prediction'] = mlp_pred.flatten()

y_val_filtered = y_val.loc[y_val['arr_at_Norwich_seconds_since_midnight'] != -1].copy()

# Calculate the score of the predictions
mlp_mse_score = mean_squared_error(y_val['arr_at_Norwich_seconds_since_midnight'], y_val['MLP_Norwich_arr_seconds_since_midnight_prediction'])

# Calculate RMSE
mlp_rmse_score = sqrt(mlp_mse_score)

print('Unfiltered Scores: ')
print(f"Prediction MSE score: {mlp_mse_score}")
print(f"Prediction RMSE score: {mlp_rmse_score}")
print(f"Prediction RMSE score in seconds: {convert_seconds_to_string(mlp_rmse_score)}")

print('-' * 30)

# Calculate the score of the predictions
mlp_mse_score_filtered = mean_squared_error(y_val_filtered['arr_at_Norwich_seconds_since_midnight'], y_val_filtered['MLP_Norwich_arr_seconds_since_midnight_prediction'])

rmse_score_filtered = sqrt(mlp_mse_score_filtered)

print('Unfiltered Scores: ')
print(f"Prediction MSE score: {mlp_mse_score_filtered}")
print(f"Prediction RMSE score: {rmse_score_filtered}")
print(f"Prediction RMSE score in seconds: {convert_seconds_to_string(rmse_score_filtered)}")


# y_val['MLP_Norwich_arr_prediction'] = y_val['MLP_Norwich_arr_seconds_since_midnight_prediction'].apply(convert_seconds_to_string)
# y_val_filtered['MLP_Norwich_arr_prediction'] = y_val_filtered['MLP_Norwich_arr_seconds_since_midnight_prediction'].apply(convert_seconds_to_string)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 41693509.09362442
Validation score: 0.864669
Iteration 2, loss = 45015184.10523089
Validation score: 0.870451
Iteration 3, loss = 44858573.10123549
Validation score: 0.872124
Iteration 4, loss = 44773258.35006578
Validation score: 0.873369
Iteration 5, loss = 44724342.72258563
Validation score: 0.874446
Iteration 6, loss = 44677747.45107383
Validation score: 0.875045
Iteration 7, loss = 44631383.21121157
Validation score: 0.876098
Iteration 8, loss = 44590649.29448048
Validation score: 0.877064
Iteration 9, loss = 44555305.65424439
Validation score: 0.877670
Iteration 10, loss = 44522708.35210019
Validation score: 0.878238
Iteration 11, loss = 44491885.52554170
Validation score: 0.878622
Iteration 12, loss = 44464926.73269501
Validation score: 0.878287
Iteration 13, loss = 44438257.13461301
Validation score: 0.879722
Iteration 14, loss = 44414701.32241297
Validation score: 0.879557
Iteration 15, loss = 44395372.92052523
Validation score: 0.880825
Iteration 16, loss 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_val['MLP_Norwich_arr_seconds_since_midnight_prediction'] = mlp_pred.flatten()


Unfiltered Scores: 


Prediction MSE score: 10114046.912236331
Prediction RMSE score: 3180.25893792256
Prediction RMSE score in seconds: 00:53:00
------------------------------
Unfiltered Scores: 


Prediction MSE score: 291973006.3288507
Prediction RMSE score: 17087.217629820563
Prediction RMSE score in seconds: 04:44:47


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_val['MLP_Norwich_arr_prediction'] = y_val['MLP_Norwich_arr_seconds_since_midnight_prediction'].apply(convert_seconds_to_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_val_filtered['MLP_Norwich_arr_prediction'] = y_val_filtered['MLP_Norwich_arr_seconds_since_midnight_prediction'].apply(convert_seconds_to_string)


##  Long Short-Term Memory (LSTM) network

Reset the validation and training data

In [28]:
# Re-split the train data into X and y
X_train = train_df.drop(columns=labels)
y_train = train_df[labels]

# Re-split the validation data into X and y
X_val = validation_df.drop(columns=labels)
y_val = validation_df[labels]

In [29]:
num_stations = len(train_df['lr_code'].unique())
num_features = train_df.drop(columns=['arr_at_seconds_since_midnight']).shape[1]
num_samples = train_df.shape[0]

print("Shape of array before reshaping:", train_df.drop(columns=['arr_et_seconds_since_midnight']).values.shape)
print("num_stations:", num_stations)
print("num_features:", num_features)
print("num_samples:", num_samples)


num_val_stations = len(validation_df['lr_code'].unique())
num_val_features = validation_df.drop(columns=['arr_at_seconds_since_midnight']).shape[1]
num_val_samples = validation_df.shape[0]

print("\n\nShape of array before reshaping:", validation_df.drop(columns=['arr_et_seconds_since_midnight']).values.shape)
print("num_stations:", num_val_stations)
print("num_features:", num_val_features)
print("num_samples:", num_val_samples)

Shape of array before reshaping: (1086976, 20)
num_stations: 159
num_features: 20
num_samples: 1086976


Shape of array before reshaping: (271712, 20)
num_stations: 108
num_features: 20
num_samples: 271712


In [30]:
# Reshape the data
X_train_3d = X_train.values.reshape((-1, 32, X_train.shape[1]))
y_train_3d = y_train.values.reshape((-1, 32, 1))

X_val_3d = X_val.values.reshape((-1, 32, X_val.shape[1]))
y_val_3d = y_val.values.reshape((-1, 32, 1))

In [31]:
print("Shape of X_val_3d:", X_val_3d.shape)
print("Shape of y_val_3d:", y_val_3d.shape)

print("Type of X_train_3d:", type(X_train_3d))
print("Type of y_train_3d:", type(y_train_3d))
print("Type of X_val_3d:", type(X_val_3d))
print("Type of y_val_3d:", type(y_val_3d))

Shape of X_val_3d: (8491, 32, 20)
Shape of y_val_3d: (8491, 32, 1)
Type of X_train_3d: <class 'numpy.ndarray'>
Type of y_train_3d: <class 'numpy.ndarray'>
Type of X_val_3d: <class 'numpy.ndarray'>
Type of y_val_3d: <class 'numpy.ndarray'>


In [32]:

from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense
from keras.callbacks import EarlyStopping, TensorBoard
from keras.metrics import MeanSquaredLogarithmicError, MeanAbsolutePercentageError
import tensorflow as tf
import datetime

# Define RMSE
def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(32, X_train.shape[1])))
model.add(TimeDistributed(Dense(1)))
model.compile(optimizer='adam', loss='mse', metrics=['mae', rmse, MeanSquaredLogarithmicError(), MeanAbsolutePercentageError()])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Define TensorBoard
time_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + time_stamp
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

model.summary()

# Train the model
model.fit(X_train_3d, y_train_3d, epochs=100, verbose=1, validation_data=(X_val_3d, y_val_3d), callbacks=[early_stopping, tensorboard_callback])

model.save(f'RNN Model_{time_stamp}.keras')

2024-04-30 18:40:24.106071: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(**kwargs)


Epoch 1/100
[1m1062/1062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 26ms/step - loss: 73334936.0000 - mae: 3939.7644 - mean_absolute_percentage_error: 32641422.0000 - mean_squared_logarithmic_error: 31.7891 - rmse: 7431.0239 - val_loss: 4466709.0000 - val_mae: 810.3860 - val_mean_absolute_percentage_error: 67584.1016 - val_mean_squared_logarithmic_error: 16.4414 - val_rmse: 1998.5557
Epoch 2/100
[1m1062/1062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 23ms/step - loss: 4820280.5000 - mae: 545.9996 - mean_absolute_percentage_error: 62564080.0000 - mean_squared_logarithmic_error: 13.9766 - rmse: 1773.2737 - val_loss: 850025.3750 - val_mae: 237.4935 - val_mean_absolute_percentage_error: 18671.6406 - val_mean_squared_logarithmic_error: 9.4256 - val_rmse: 714.4414
Epoch 3/100
[1m1062/1062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - loss: 700927.8125 - mae: 182.3744 - mean_absolute_percentage_error: 24745458.0000 - mean_squared_logarithmic_err

In [None]:
# Use the below to load the model instead of training again..

# from tensorflow.keras.models import load_model
# from tensorflow.keras import backend as K

# # Define the custom RMSE function
# def rmse(y_true, y_pred):
#     return K.sqrt(K.mean(K.square(y_pred - y_true)))

# # Load the model
# model = load_model('RNN Model.keras', custom_objects={'rmse': rmse})

In [36]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Make predictions
lstm_pred = model.predict(X_val_3d)

y_val['LSTM_Norwich_arr_seconds_since_midnight_prediction'] = lstm_pred.flatten()

y_val_filtered = y_val.loc[y_val['arr_at_Norwich_seconds_since_midnight'] != -1].copy()

# Calculate the score of the predictions
mse_score = mean_squared_error(y_val[labels], y_val['LSTM_Norwich_arr_seconds_since_midnight_prediction'])

# Calculate RMSE
rmse_score = sqrt(mse_score)

print(f"Prediction MSE score: {mse_score}")
print(f"Prediction RMSE score: {rmse_score}")
print(f"Prediction RMSE score in seconds: {convert_seconds_to_string(rmse_score)}")

print('-' * 30)

# Calculate the score of the predictions
LSTM_mse_score_filtered = mean_squared_error(y_val_filtered['arr_at_Norwich_seconds_since_midnight'], y_val_filtered['LSTM_Norwich_arr_seconds_since_midnight_prediction'])

rmse_score_filtered = sqrt(mlp_mse_score_filtered)

print('Filtered Scores: ')
print(f"prediction MSE score: {mlp_mse_score_filtered}")
print(f"Prediction RMSE score: {rmse_score_filtered}")
print(f"Prediction RMSE score in seconds: {convert_seconds_to_string(rmse_score_filtered)}")


y_val['LSTM_Norwich_arr_prediction'] = y_val['LSTM_Norwich_arr_seconds_since_midnight_prediction'].apply(convert_seconds_to_string)
y_val_filtered['LSTM_Norwich_arr_prediction'] = y_val_filtered['LSTM_Norwich_arr_seconds_since_midnight_prediction'].apply(convert_seconds_to_string)

[1m 31/266[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 5ms/step

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
Prediction MSE score: 645586.2465235378
Prediction RMSE score: 803.4838184578068
Prediction RMSE score in seconds: 00:13:23
------------------------------
Filtered Scores: 
prediction MSE score: 291973006.3288507
Prediction RMSE score: 17087.217629820563
Prediction RMSE score in seconds: 04:44:47


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_val['LSTM_Norwich_arr_seconds_since_midnight_prediction'] = lstm_pred.flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_val['LSTM_Norwich_arr_prediction'] = y_val['LSTM_Norwich_arr_seconds_since_midnight_prediction'].apply(convert_seconds_to_string)


In [40]:
y_val_filtered['arr_at_Norwich'] = y_val_filtered['arr_at_Norwich_seconds_since_midnight'].apply(convert_seconds_to_string)

y_val_filtered

Unnamed: 0,arr_at_Norwich_seconds_since_midnight,LSTM_Norwich_arr_seconds_since_midnight_prediction,LSTM_Norwich_arr_prediction,arr_at_Norwich
9141,71340.0,71811.460938,19:56:51,19:49:00
9173,73560.0,72627.929688,20:10:27,20:26:00
9205,74940.0,75613.695312,21:00:13,20:49:00
9237,77220.0,76200.929688,21:10:00,21:27:00
9269,78900.0,79166.492188,21:59:26,21:55:00
...,...,...,...,...
26858,76800.0,74834.914062,20:47:14,21:20:00
26921,80280.0,79130.648438,21:58:50,22:18:00
26953,84420.0,81905.757812,22:45:05,23:27:00
26985,1200.0,83.042725,00:01:23,00:20:00


### Tuning Parameters

In [None]:
import keras_tuner
from keras_tuner import HyperModel
from keras import layers
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense, Masking
from keras.metrics import MeanSquaredLogarithmicError, MeanAbsolutePercentageError
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
...

class RNNHyperModel(HyperModel):
    def __init__(self, num_units_min, num_units_max, num_units_step, 
                 num_layers_min, num_layers_max, 
                 activation, recurrent_activation, 
                 dropout_min, dropout_max, dropout_step, dropout_activate):
                #  optimizer, loss):
        
        self.num_units_min = num_units_min
        self.num_units_max = num_units_max
        self.num_units_step = num_units_step
        self.num_layers_min = num_layers_min
        self.num_layers_max = num_layers_max
        self.activation = activation
        self.recurrent_activation = recurrent_activation
        self.dropout_min = dropout_min
        self.dropout_max = dropout_max
        self.dropout_step = dropout_step
        self.dropout_activate = dropout_activate
        # self.optimizer = optimizer
        # self.loss = loss

    def build(self, hp):
        input_shape = (32, X_train.shape[1])
        model = Sequential()
        
        # model.add(Masking(mask_value=-1, input_shape=input_shape))
        
        _units = hp.Int('number_of_units', min_value=self.num_units_min, max_value=self.num_units_max,step=self.num_units_step, default = 30)
        _activation = hp.Choice('activation', values=self.activation)
        _recurrent_activation = hp.Choice('recurrent_activation', values=self.recurrent_activation)
        # _optimizer= hp.Choice('optimizer', values= self.optimizer)
        # _loss = hp.Choice('loss_function', values= self.loss)
        
        for i in range(1, hp.Int('number_of_layers', self.num_layers_min, self.num_layers_max, default= 1)):
            model.add(LSTM(units=_units, activation =_activation, recurrent_activation=_recurrent_activation, return_sequences=True, input_shape=input_shape))
            if hp.Choice('dropout', values=self.dropout_activate):
                model.add(layers.Dropout(rate= hp.Float('dropout_rate', self.dropout_min, self.dropout_max, step= self.dropout_step)))
                
        model.add(TimeDistributed(Dense(1)))
        # model.compile(optimizer= _optimizer, loss= _loss, metrics=['mae', 'mse', MeanSquaredLogarithmicError(), MeanAbsolutePercentageError()])
        model.compile(optimizer='adam', loss='mse', metrics=['mae', rmse, MeanSquaredLogarithmicError(), MeanAbsolutePercentageError()])
        return model    
    
HyperModel= RNNHyperModel(
    num_units_min= 10,
    num_units_max=500,
    num_units_step=10,
    num_layers_min=1,
    num_layers_max=10,
    # activation=['relu', 'tanh', 'sigmoid'],
    activation=['sigmoid'],
    # recurrent_activation=['hard_sigmoid', 'sigmoid', 'tanh', 'relu'],
    recurrent_activation=['sigmoid'],    
    dropout_min=0.01,
    dropout_max=0.3,
    dropout_step=0.01,
    dropout_activate=[True, False],
    # optimizer=['Adam', 'SGD', 'RMSprop'], 
    # loss=['mean_squared_error']  
)

tuner = keras_tuner.Hyperband(
    HyperModel,
    objective='mean_squared_error',
    max_epochs=5,
    factor=3,
    directory='tuning/log',
    project_name='RNN Tuning Model'   
)

# Create a TensorBoard callback
tensorboard_callback = TensorBoard(log_dir='tuning/log/RNN Tuning Model/Tensorboard logs')

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [None]:
tuner.search_space_summary()

# Run the hyperparameter search
tuner.search(X_train_3d, y_train_3d, epochs=5, validation_data=(X_val_3d, y_val_3d), callbacks=[tensorboard_callback, early_stopping_callback])
# tuner.search(X_train, y_train, epochs=2, validation_data=(X_val, y_val),callbacks=[keras.callbacks.TensorBoard("tuning/log")], verbose=1)

# Get the top 2 models.
models = tuner.get_best_models(num_models=2)
best_model = models[0]
best_model.summary()

## MLP

In [41]:
# Re-split the train data into X and y
X_train = train_df.drop(columns=labels)
y_train = train_df[labels]

# Re-split the validation data into X and y
X_val = validation_df.drop(columns=labels)
y_val = validation_df[labels]

In [42]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# Define the model
model = Sequential()
model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error', 'mean_squared_error'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [43]:
# Define the early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Fit the model (assuming you have training and validation data defined)
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, callbacks=[early_stopping])

model.save('MLP Model.keras')

# Evaluate the model
loss = model.evaluate(X_val, y_val)
print('Test loss:', loss)

Epoch 1/100
[1m33968/33968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 2ms/step - loss: 2071783.1250 - mean_absolute_error: 145.4897 - mean_squared_error: 2071783.1250 - val_loss: 364866.2188 - val_mean_absolute_error: 25.1426 - val_mean_squared_error: 364866.2188
Epoch 2/100
[1m33968/33968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 2ms/step - loss: 541727.9375 - mean_absolute_error: 34.5338 - mean_squared_error: 541727.9375 - val_loss: 721235.4375 - val_mean_absolute_error: 23.7053 - val_mean_squared_error: 721235.4375
Epoch 3/100
[1m33968/33968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2ms/step - loss: 482387.1250 - mean_absolute_error: 28.8076 - mean_squared_error: 482387.1250 - val_loss: 1462184.7500 - val_mean_absolute_error: 187.0311 - val_mean_squared_error: 1462184.7500
Epoch 4/100
[1m33968/33968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 2ms/step - loss: 493860.9062 - mean_absolute_error: 29.7494 - mean_squared_error: 493860.

In [45]:
# Assuming y_test is a pandas Series or DataFrame
df = pd.DataFrame(y_val)

# Add a new column with the model predictions
df['predictions'] = model.predict(X_val)


[1m  56/8491[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m15s[0m 2ms/step 

[1m8491/8491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step


In [47]:
# Calculate the score of the predictions
mlp_mse_score = mean_squared_error(df['arr_at_Norwich_seconds_since_midnight'], df['predictions'])

# Calculate RMSE
mlp_rmse_score = sqrt(mlp_mse_score)

print('Unfiltered Scores: ')
print(f"Prediction MSE score: {mlp_mse_score}")
print(f"Prediction RMSE score: {mlp_rmse_score}")
print(f"Prediction RMSE score in seconds: {convert_seconds_to_string(mlp_rmse_score)}")


Unfiltered Scores: 
Prediction MSE score: 384354.5168843572
Prediction RMSE score: 619.9633189829518
Prediction RMSE score in seconds: 00:10:19


## Creating dummy DF with a fake scenario

In [None]:
example__single_journey = filtered_test_data[filtered_test_data['rid'] == unique_rid[25]]
example__single_journey = example__single_journey.reset_index(drop=True)

example__single_journey

In [None]:
print(
    convert_seconds_to_string(
        (example__single_journey.loc[22, 'ptd_seconds_since_midnight'] 
        -
        example__single_journey.loc[0, 'ptd_seconds_since_midnight']) 
        
        - (convert_string_to_seconds('12:00'))
        
        )
)

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(example__single_journey):
    # Drop column: 'rid'
    example__single_journey = example__single_journey.drop(columns=['rid'])
    cols_to_blank = example__single_journey.columns
    cols_to_blank = cols_to_blank[1:]
    example__single_journey[cols_to_blank] = -1 
    return example__single_journey

example__single_journey_clean = clean_data(example__single_journey.copy())
example__single_journey_clean = example__single_journey_clean.reset_index(drop=True)
# example__single_journey_clean

In [None]:
example__single_journey_clean.at[22, 'arr_atRemoved'] = 0
example__single_journey_clean.at[22, 'pass_atRemoved'] = 0
example__single_journey_clean.at[22, 'dep_atRemoved'] = 0
example__single_journey_clean.at[22, 'pta_seconds_since_midnight'] = convert_string_to_seconds('12:00')
example__single_journey_clean.at[22, 'ptd_seconds_since_midnight'] = convert_string_to_seconds('12:09')
example__single_journey_clean.at[22, 'arr_et_seconds_since_midnight'] = convert_string_to_seconds('12:00')
example__single_journey_clean.at[22, 'dep_et_seconds_since_midnight'] = convert_string_to_seconds('12:06')
example__single_journey_clean.at[22, 'arr_at_seconds_since_midnight'] = convert_string_to_seconds('12:01')
example__single_journey_clean.at[22, 'dep_at_seconds_since_midnight'] = convert_string_to_seconds('12:15')

example__single_journey_clean.at[0, 'arr_atRemoved'] = 0
example__single_journey_clean.at[0, 'pass_atRemoved'] = 0
example__single_journey_clean.at[0, 'dep_atRemoved'] = 0
example__single_journey_clean.at[0, 'pta_seconds_since_midnight'] = convert_string_to_seconds('11:17')
example__single_journey_clean.at[0, 'ptd_seconds_since_midnight'] = convert_string_to_seconds('11:20')
example__single_journey_clean.at[0, 'arr_et_seconds_since_midnight'] = convert_string_to_seconds('11:17')
example__single_journey_clean.at[0, 'dep_et_seconds_since_midnight'] = convert_string_to_seconds('11:19')
example__single_journey_clean.at[0, 'arr_at_seconds_since_midnight'] = convert_string_to_seconds('11:18')
example__single_journey_clean.at[0, 'dep_at_seconds_since_midnight'] = convert_string_to_seconds('11:27')






example__single_journey_clean_1 = clean_data(example__single_journey_clean.copy())
example__single_journey_clean_1 = example__single_journey_clean_1.drop(columns=['arr_at_seconds_since_midnight'], axis=1)

example__single_journey_clean_1

In [None]:
def convert_string_to_seconds(str):
    date_time_value = pd.to_datetime(str, format='%H:%M')
    total_seconds = date_time_value.hour * 3600 + date_time_value.minute * 60 + date_time_value.second
    return total_seconds

def convert_seconds_to_string(seconds):
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    return "{:02d}:{:02d}:{:02d}".format(int(hours), int(minutes), int(seconds))

In [None]:
# Reshape the input data to have an extra third dimension
reshaped_input = np.expand_dims(example__single_journey_clean_1, axis=1)



predictions = model.predict(reshaped_input)
predictions = np.where((predictions > -100) & (predictions < 100), -1, predictions)


# Now the input data should have shape (32, 1, 19), which matches the model's expected input shape
example__single_journey_clean_1['predictions_seconds_since_midnight'] = predictions.flatten()

example__single_journey_clean_1['predictions'] = example__single_journey_clean_1['predictions_seconds_since_midnight'].apply(convert_seconds_to_string)


In [None]:
example__single_journey_clean_1