In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [2]:
data = pd.read_csv('/kaggle/input/processed-data/processed_data.csv')

pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,RecordedAtTime,PublishedLineName,DirectionRef,VehicleRef,DestinationLat,DestinationLong,NextStopPointName,OriginLat,OriginLong,VehicleLat,VehicleLong,scheduled,late,adherence,day_of_week,Temp,Pressure,Humidity,WindSpeed,Clouds,Visibility,WeatherConditions
0,2017-06-01 00:03:24,B31,0,NYCT_4611,40.608433,-73.9571,GERRITSEN AV/GERRITSEN BEACH,40.587101,-73.918503,40.587024,-73.918623,2017-06-01 00:08:00,False,4,Thursday,20.4,1014.2,76.8,20.8,5.1,15.6,Clear
1,2017-06-01 00:03:23,Bx1,1,NYCT_5685,40.809654,-73.92836,RIVERDALE AV/W 231 ST,40.881187,-73.90934,40.881224,-73.90939,2017-06-01 00:19:00,False,15,Thursday,20.4,1014.2,76.8,20.8,5.1,15.6,Clear
2,2017-06-01 00:03:30,Bx39,0,NYCT_4718,40.903309,-73.849922,WHITE PLAINS RD/LAFAYETTE AV,40.807869,-73.852715,40.822127,-73.858291,2017-06-01 00:00:10,True,-3,Thursday,20.4,1014.2,76.8,20.8,5.1,15.6,Clear
3,2017-06-01 00:03:49,Q44-SBS,1,NYCT_5999,40.704933,-73.79332,MAIN ST/UNION TP,40.84256,-73.878334,40.717817,-73.817285,2017-05-31 23:59:49,True,-4,Wednesday,20.4,1014.2,76.8,20.8,5.1,15.6,Clear
4,2017-06-01 00:03:24,X10,1,NYCT_2660,40.633698,-74.129776,E 57 ST/LEXINGTON AV,40.760429,-73.967674,40.761108,-73.969562,2017-05-31 23:51:05,True,-12,Wednesday,20.4,1014.2,76.8,20.8,5.1,15.6,Clear


In [3]:
# Convert 'RecordedAtTime' to datetime
data['RecordedAtTime'] = pd.to_datetime(data['RecordedAtTime'])
data['scheduled'] = pd.to_datetime(data['scheduled'])
# Extracting time features
data['hour'] = data['RecordedAtTime'].dt.hour
data['weekday'] = data['RecordedAtTime'].dt.dayofweek

# Label encoding for categorical variables
label_cols = ['PublishedLineName', 'VehicleRef', 'NextStopPointName', 'day_of_week', 'WeatherConditions']
label_encoder = {col: LabelEncoder() for col in label_cols}

for col in label_cols:
    data[col] = label_encoder[col].fit_transform(data[col])

# Normalizing numerical features
numerical_cols = ['DestinationLat', 'DestinationLong', 'OriginLat', 'OriginLong', 'VehicleLat', 'VehicleLong', 
                  'Temp', 'Pressure', 'Humidity', 'WindSpeed', 'Clouds', 'Visibility', 'hour', 'weekday']
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Target Variable
target = data['adherence']

# Selecting features for the model
features = data[numerical_cols + label_cols]

# Splitting the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train.shape, X_val.shape, X_test.shape

((231282, 19), (49560, 19), (49561, 19))

In [4]:
# Function to create sequences with the windowing approach
def create_hourly_sequences(grouped_data, sequence_length):
    X, y = [], []
    for name, group in grouped_data:
        # Ensure group has enough data to create a sequence
        if len(group) >= sequence_length:
            # Create sequences within this group (hour)
            input_data = group[numerical_cols + label_cols].values
            output_data = group['adherence'].values
            for i in range(len(input_data) - sequence_length):
                seq_X = input_data[i: i + sequence_length]
                seq_y = output_data[i + sequence_length]
                X.append(seq_X)
                y.append(seq_y)
    return np.array(X), np.array(y)

In [5]:
# To work with hourly data, we need to group the data by hour
# Let's first add an 'hour' column to the dataset, representing each record's hour of the day
data['hour_of_day'] = data['RecordedAtTime'].dt.hour

# Group the data by hour
grouped_data = data.groupby('hour_of_day')

# Calculate the average number of records per hour
avg_records_per_hour = grouped_data.size().mean()
sequence_length = int(round(avg_records_per_hour))

# Print the average number of records per hour and the chosen sequence length
avg_records_per_hour, sequence_length

(13766.791666666666, 13767)

In [6]:
sequence_length = 60

# Applying the function to create sequences
X, y = create_hourly_sequences(grouped_data, sequence_length)

# Splitting the data again into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train.shape, X_val.shape, X_test.shape

((230274, 60, 19), (49344, 60, 19), (49345, 60, 19))

In [7]:
try:
    # Automatically detect the TPU
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Device:", tpu.master())  # Display the TPU address
    # Initialize the TPU system
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None
    strategy = tf.distribute.get_strategy()  # Default strategy that works on CPU and single GPU
    print("Running on CPU or single GPU")

print("Replicas:", strategy.num_replicas_in_sync)

Running on CPU or single GPU
Replicas: 1


In [8]:
n_features = 22

# Building the LSTM Model
with strategy.scope():  # Ensures model is built within the TPU context if available
    model = Sequential()
    model.add(LSTM(units=50, activation='relu', input_shape=(sequence_length, n_features)))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mean_squared_error')

# Model Summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 50)                14600     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 14651 (57.23 KB)
Trainable params: 14651 (57.23 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
def data_generator(data, batch_size, sequence_length):
    selected_cols = [col for col in data.columns if col not in ['adherence', 'RecordedAtTime', 'scheduled']]
    while True:
        batch_start_points = np.random.choice(len(data) - sequence_length, batch_size, replace=False)
        
        # Initialize X_batch and y_batch
        X_batch = np.zeros((batch_size, sequence_length, len(selected_cols)))  # Adjust number of features
        y_batch = np.zeros(batch_size)

        for i in range(batch_size):
            start_point = batch_start_points[i].item()
            # Ensure the selection excludes non-numeric columns
            X_batch[i] = data[selected_cols].iloc[start_point:start_point + sequence_length].values
            y_batch[i] = data['adherence'].iloc[start_point + sequence_length]

        yield X_batch, y_batch

In [None]:
batch_size = 32
steps_per_epoch = len(data) // batch_size

train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_generator = data_generator(train_data, batch_size, sequence_length)
val_generator = data_generator(val_data, batch_size, sequence_length)

model.fit(train_generator, 
          steps_per_epoch=steps_per_epoch, 
          epochs=10,
          validation_data=val_generator, 
          validation_steps=50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
  950/10325 [=>............................] - ETA: 1:43:09 - loss: 125.7992