In [1]:
import pandas as pd

path = '/Users/blaise/Documents/ML/Machine-Learning-and-Big-Data-Analytics/data/CTA_-_Ridership_-_Daily_Boarding_Totals_20251110.csv'

df = pd.read_csv(path, parse_dates=['service_date'])

df.columns = ['date', 'day_type','bus', 'rail', 'total']
df = df.sort_values(by='date').set_index('date')

df = df.drop('total', axis=1)
df = df.drop_duplicates()

import numpy as np

df['bus'] = df['bus'].str.replace(',','')
df['rail'] = df['rail'].str.replace(',','')

df['bus'] = df['bus'].astype(np.int64)
df['rail'] = df['rail'].astype(np.int64)

In [2]:
df_mulvar = (df[['bus','rail']]/1e6).copy()
df_mulvar['next_day_type'] = df['day_type'].shift(-1)
mulvar_train = df_mulvar.iloc[:-1,:]["2016-01":"2018-12"]
mulvar_valid = df_mulvar.iloc[:-1,:]["2019-01":"2019-05"]
mulvar_test =  df_mulvar.iloc[:-1,:]["2019-06":]

In [3]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
onehotencoder.fit(mulvar_train[['next_day_type']])
mulvar_train_2 = onehotencoder.transform(mulvar_train[['next_day_type']])

In [4]:
mulvar_train_2 = pd.DataFrame(
    mulvar_train_2,
    columns=list(onehotencoder.get_feature_names_out()),
    index = mulvar_train.index
)

In [5]:
mulvar_train.loc[:,'next_day_type_A'] = mulvar_train_2['next_day_type_A']
mulvar_train.loc[:,'next_day_type_U'] = mulvar_train_2['next_day_type_U']
mulvar_train.loc[:,'next_day_type_W'] = mulvar_train_2['next_day_type_W']
mulvar_train = mulvar_train.reindex(columns=['bus','rail','next_day_type_A','next_day_type_U','next_day_type_W'])

In [6]:
mulvar_valid_2 = onehotencoder.transform(mulvar_valid[['next_day_type']])
mulvar_valid_2 = pd.DataFrame(
    mulvar_valid_2,
    columns=list(onehotencoder.get_feature_names_out()),
    index = mulvar_valid.index
)
mulvar_valid.loc[:,'next_day_type_A'] = mulvar_valid_2['next_day_type_A']
mulvar_valid.loc[:,'next_day_type_U'] = mulvar_valid_2['next_day_type_U']
mulvar_valid.loc[:,'next_day_type_W'] = mulvar_valid_2['next_day_type_W']

mulvar_valid = mulvar_valid.reindex(columns=['bus','rail','next_day_type_A','next_day_type_U','next_day_type_W'])

In [7]:
mulvar_test_2 = onehotencoder.transform(mulvar_test[['next_day_type']])
mulvar_test_2 = pd.DataFrame(
    mulvar_test_2,
    columns=list(onehotencoder.get_feature_names_out()),
    index = mulvar_test.index
)
mulvar_test.loc[:,'next_day_type_A'] = mulvar_test_2['next_day_type_A']
mulvar_test.loc[:,'next_day_type_U'] = mulvar_test_2['next_day_type_U']
mulvar_test.loc[:,'next_day_type_W'] = mulvar_test_2['next_day_type_W']

mulvar_test = mulvar_test.reindex(columns=['bus','rail','next_day_type_A','next_day_type_U','next_day_type_W'])

In [8]:
seq_length = 56
chunk_size = seq_length+1
mv_rail_train_chunks = [mulvar_train.iloc[i:i+chunk_size].values for i in range(mulvar_train.shape[0]-chunk_size+1)]
mv_rail_valid_chunks = [mulvar_valid.iloc[i:i+chunk_size].values for i in range(mulvar_valid.shape[0]-chunk_size+1)]
mv_rail_test_chunks = [mulvar_test.iloc[i:i+chunk_size].values for i in range(mulvar_test.shape[0]-chunk_size+1)]

In [9]:
import torch

In [10]:
from torch.utils.data import Dataset,DataLoader

In [11]:
class TimeSeriesDataset2(Dataset):
    def __init__(self, chunks, seq_length):
        super().__init__()
        self.chunks = chunks
        self.seq_length = seq_length
    def __len__(self):
        return len(self.chunks)
    def __getitem__(self, index):
        chunk = self.chunks[index,:,:]
        data, label = chunk[:seq_length,:], chunk[seq_length,1].to(torch.float32).item() # rail value is in column 1
        return data.to(torch.float32),label

In [12]:
torch.manual_seed(1)

<torch._C.Generator at 0x1114f13f0>

In [13]:
train_mulvar_ds = TimeSeriesDataset2(torch.tensor(np.array(mv_rail_train_chunks),dtype=torch.float16),seq_length=56)
valid_mulvar_ds = TimeSeriesDataset2(torch.tensor(np.array(mv_rail_valid_chunks),dtype=torch.float16), seq_length=56)
test_mulvar_ds = TimeSeriesDataset2(torch.tensor(np.array(mv_rail_test_chunks),dtype=torch.float16), seq_length=56)

In [14]:
train_mulvar_dl = DataLoader(train_mulvar_ds, batch_size=64, shuffle=True)
valid_mulvar_dl = DataLoader(valid_mulvar_ds, batch_size=64)
test_mulvar_dl = DataLoader(test_mulvar_ds, batch_size=64)

In [15]:
torch.manual_seed(1)

<torch._C.Generator at 0x1114f13f0>

In [16]:
import torch.nn as nn

In [None]:
device = torch.device("mps")

In [18]:
class SimpleRNN2(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, 16)
        self.linear2 = nn.Linear(16,8)
        self.linear3 = nn.Linear(8,1)
        self.relu = nn.ReLU()
    def forward(self, x):
        out, hn = self.rnn(x)
        mid = self.relu(self.linear(hn[-1,:,:]))
        mid2 = self.relu(self.linear2(mid))
        return self.relu(self.linear3(mid2))

In [19]:
simple_model2 = SimpleRNN2(5, 32)
simple_model2 = simple_model2.to(device)

In [20]:
from utils.early_stopping import EarlyStopping

In [21]:
optimizer = torch.optim.SGD(simple_model2.parameters(), lr=0.02,momentum=0.95,nesterov=True)
criterion = nn.HuberLoss()
early_stopper = EarlyStopping(patience=50, checkpoint_path='simple_model2.pt', restore_best_weights=True, verbose=True)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',patience=5)

In [22]:
n_epochs = 500

train_loss = [0]*n_epochs
val_loss = [0]*n_epochs

for epoch in range(n_epochs):
    simple_model2.train()
    for x_batch,y_batch in train_mulvar_dl:
        # move the data to GPU
        x_batch = x_batch.to(device)
        y_batch = y_batch.float().to(device)
        # compute the forward pass
         # zero the gradients
        optimizer.zero_grad()
        out = simple_model2(x_batch)
        # compute the loss
        loss = criterion(out.squeeze(1), y_batch)
        # Compute the gradients
        loss.backward()
        # Backpropagation
        optimizer.step()
       
        # training loss computation
        train_loss[epoch] += loss.item()*y_batch.size(0)
    train_loss[epoch] /= len(train_mulvar_dl.dataset)

    # Validation step
    simple_model2.eval()
    with torch.no_grad():
        for x_batch, y_batch in valid_mulvar_dl:
            # move the data to GPU
            x_batch = x_batch.to(device)
            y_batch = y_batch.float().to(device)
            # forward pass
            out = simple_model2(x_batch)
            loss = criterion(out.squeeze(1), y_batch)
            val_loss[epoch]+= loss.item()*y_batch.size(0)
    val_loss[epoch]/=len(valid_mulvar_dl.dataset)
    scheduler.step(val_loss[epoch])

    print(f'Epoch: {epoch+1}| Train loss: {train_loss[epoch]:.4f}| Val loss: {val_loss[epoch]:.4f}')

    # early stopping
    early_stopper(val_loss[epoch], simple_model2, optimizer, epoch)
    if early_stopper.should_stop:
        print("Stopping at epoch ",epoch)
        break   


Epoch: 1| Train loss: 0.2180| Val loss: 0.2004
Metric improved to 0.2004. Checkpoint saved at epoch 0
Epoch: 2| Train loss: 0.2180| Val loss: 0.2004
No improvement for 1 epoch(s)
Epoch: 3| Train loss: 0.2180| Val loss: 0.2004
No improvement for 2 epoch(s)
Epoch: 4| Train loss: 0.2180| Val loss: 0.2004
No improvement for 3 epoch(s)
Epoch: 5| Train loss: 0.2180| Val loss: 0.2004
No improvement for 4 epoch(s)
Epoch: 6| Train loss: 0.2180| Val loss: 0.2004
No improvement for 5 epoch(s)
Epoch: 7| Train loss: 0.2180| Val loss: 0.2004
No improvement for 6 epoch(s)
Epoch: 8| Train loss: 0.2180| Val loss: 0.2004
No improvement for 7 epoch(s)
Epoch: 9| Train loss: 0.2180| Val loss: 0.2004
No improvement for 8 epoch(s)
Epoch: 10| Train loss: 0.2180| Val loss: 0.2004
No improvement for 9 epoch(s)
Epoch: 11| Train loss: 0.2180| Val loss: 0.2004
No improvement for 10 epoch(s)
Epoch: 12| Train loss: 0.2180| Val loss: 0.2004
No improvement for 11 epoch(s)
Epoch: 13| Train loss: 0.2180| Val loss: 0.200

In [23]:
l1_loss = nn.L1Loss(reduction='sum')

loss_ = []

with torch.no_grad():
    for x_test_batch, y_test_batch in test_mulvar_dl:
        x_test_batch = x_test_batch.to(device=device,dtype=torch.float32)
        y_test_batch = y_test_batch.to(device=device,dtype=torch.float32)
        out = simple_model2(x_test_batch)
        loss = l1_loss(out.squeeze(1), y_test_batch)
        loss_.append(loss.item())

In [24]:
sum(loss_)/len(test_mulvar_dl.dataset)*1e6

308483.0398456794