In [1]:
import torch
import torch.nn as nn
from utils.early_stopping import EarlyStopping
import pandas as pd
import numpy as np

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
from torch.utils.data import Dataset,DataLoader

In [2]:
# Load the CTA daily ridership export and normalise it into a date-indexed frame
# with integer `bus` / `rail` counts and the categorical `day_type`.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
path = '/Users/blaise/Documents/ML/Machine-Learning-and-Big-Data-Analytics/data/CTA_-_Ridership_-_Daily_Boarding_Totals_20251110.csv'
df = pd.read_csv(path, parse_dates=['service_date'])

# Shorter column names; chronological order with the date as index.
df.columns = ['date', 'day_type','bus', 'rail', 'total']
df = df.sort_values(by='date').set_index('date')

# `total` is dropped and not used further in this cell.
df = df.drop('total', axis=1)
# drop_duplicates compares column values only (the date index is not considered).
df = df.drop_duplicates()

# Ridership counts may arrive as strings with thousands separators ("297,192").
# Casting to str first keeps this working when the export is already numeric,
# where the `.str` accessor alone would raise.
for ridership_col in ('bus', 'rail'):
    df[ridership_col] = (
        df[ridership_col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(np.int64)
    )

In [4]:
# Multivariate frame: bus and rail ridership scaled to millions, plus the
# day type of the *following* day as a known-in-advance feature.
df_mulvar = (df[['bus','rail']]/1e6).copy()
df_mulvar['next_day_type'] = df['day_type'].shift(-1)

# `.iloc[:-1]` drops the final row, whose next_day_type is NaN after shift(-1).
# Each split is materialised with `.copy()` so that later column assignments on
# the split operate on an independent frame instead of a slice of df_mulvar
# (avoids SettingWithCopyWarning / ambiguous view-vs-copy writes).
mulvar_train = df_mulvar.iloc[:-1,:]["2016-01":"2018-12"].copy()
mulvar_valid = df_mulvar.iloc[:-1,:]["2019-01":"2019-05"].copy()
mulvar_test =  df_mulvar.iloc[:-1,:]["2019-06":].copy()

In [6]:
# One-hot encoder for the next-day type, fitted on the training split only.
# Categories not seen during fitting are encoded as all zeros
# (handle_unknown='ignore'); output is a dense array.
onehotencoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
mulvar_train_2 = onehotencoder.fit_transform(mulvar_train[['next_day_type']])

In [7]:
# Wrap the encoded array in a DataFrame that shares the training index and is
# labelled with the encoder's generated feature names.
encoded_feature_names = list(onehotencoder.get_feature_names_out())
mulvar_train_2 = pd.DataFrame(
    data=mulvar_train_2,
    index=mulvar_train.index,
    columns=encoded_feature_names,
)

In [8]:
# Copy each one-hot column onto the training frame, then keep only the model
# inputs in a fixed order (this drops the raw `next_day_type` column).
for encoded_col in ('next_day_type_A', 'next_day_type_U', 'next_day_type_W'):
    mulvar_train.loc[:, encoded_col] = mulvar_train_2[encoded_col]

mulvar_train = mulvar_train.reindex(
    columns=['bus', 'rail', 'next_day_type_A', 'next_day_type_U', 'next_day_type_W']
)

In [9]:
# Encode the validation split with the already-fitted encoder (no refit) and
# wrap the result in a DataFrame aligned on the validation index.
mulvar_valid_2 = pd.DataFrame(
    data=onehotencoder.transform(mulvar_valid[['next_day_type']]),
    index=mulvar_valid.index,
    columns=list(onehotencoder.get_feature_names_out()),
)

# Attach the one-hot columns, then keep only the model inputs in a fixed order.
for encoded_col in ('next_day_type_A', 'next_day_type_U', 'next_day_type_W'):
    mulvar_valid.loc[:, encoded_col] = mulvar_valid_2[encoded_col]

mulvar_valid = mulvar_valid.reindex(
    columns=['bus', 'rail', 'next_day_type_A', 'next_day_type_U', 'next_day_type_W']
)

In [10]:
# Encode the test split with the already-fitted encoder (no refit) and wrap the
# result in a DataFrame aligned on the test index.
mulvar_test_2 = pd.DataFrame(
    data=onehotencoder.transform(mulvar_test[['next_day_type']]),
    index=mulvar_test.index,
    columns=list(onehotencoder.get_feature_names_out()),
)

# Attach the one-hot columns, then keep only the model inputs in a fixed order.
for encoded_col in ('next_day_type_A', 'next_day_type_U', 'next_day_type_W'):
    mulvar_test.loc[:, encoded_col] = mulvar_test_2[encoded_col]

mulvar_test = mulvar_test.reindex(
    columns=['bus', 'rail', 'next_day_type_A', 'next_day_type_U', 'next_day_type_W']
)

In [11]:
# Sliding windows over each split: every chunk holds `seq_length` consecutive
# rows plus one extra row (the row the target is later read from).
seq_length = 56
chunk_size = seq_length + 1

# A split with n rows yields n - chunk_size + 1 == n - seq_length windows.
mv_rail_train_chunks = [
    mulvar_train.iloc[start:start + chunk_size].values
    for start in range(len(mulvar_train) - seq_length)
]
mv_rail_valid_chunks = [
    mulvar_valid.iloc[start:start + chunk_size].values
    for start in range(len(mulvar_valid) - seq_length)
]
mv_rail_test_chunks = [
    mulvar_test.iloc[start:start + chunk_size].values
    for start in range(len(mulvar_test) - seq_length)
]

In [13]:
class TimeSeriesDataset2(Dataset):
    """Dataset of fixed-length windows with a scalar next-step target.

    Each item of ``chunks`` is a 2-D tensor with at least ``seq_length + 1``
    rows. The first ``seq_length`` rows are the model input; the target is the
    value in column 1 of the row at position ``seq_length``.

    Args:
        chunks: Tensor (or sequence of 2-D tensors) indexable by sample.
        seq_length: Number of leading rows of each chunk used as input.
    """

    def __init__(self, chunks, seq_length):
        self.chunks = chunks
        self.seq_length = seq_length

    def __len__(self):
        """Return the number of chunks."""
        return len(self.chunks)

    def __getitem__(self, index):
        """Return ``(window, label)`` for chunk ``index``.

        ``window`` is a float32 tensor holding the first ``seq_length`` rows;
        ``label`` is a Python float.
        """
        chunk = self.chunks[index]
        # Use the instance's own window length rather than a same-named global,
        # so the constructor argument is honoured and the class works standalone.
        window = chunk[:self.seq_length, :]
        target = chunk[self.seq_length, 1]  # rail value is in column 1
        return window.to(torch.float32), target.to(torch.float32).item()

In [14]:
# Fix PyTorch's global random seed (returns the Generator shown as the cell output).
torch.manual_seed(1)

<torch._C.Generator at 0x10c5425f0>

In [15]:
# Stack each list of window arrays into one float32 tensor and wrap it in a
# dataset. The window length is passed via the shared `seq_length` variable
# (the same one used to cut the chunks) instead of repeating the literal, so
# the two cannot drift apart.
train_mulvar_ds = TimeSeriesDataset2(
    torch.tensor(np.array(mv_rail_train_chunks), dtype=torch.float32),
    seq_length=seq_length,
)
valid_mulvar_ds = TimeSeriesDataset2(
    torch.tensor(np.array(mv_rail_valid_chunks), dtype=torch.float32),
    seq_length=seq_length,
)
test_mulvar_ds = TimeSeriesDataset2(
    torch.tensor(np.array(mv_rail_test_chunks), dtype=torch.float32),
    seq_length=seq_length,
)

In [18]:
# Batches of 64 windows. Only the training loader shuffles; validation and
# test keep the dataset order.
train_mulvar_dl = DataLoader(dataset=train_mulvar_ds, batch_size=64, shuffle=True)
valid_mulvar_dl = DataLoader(dataset=valid_mulvar_ds, batch_size=64, shuffle=False)
test_mulvar_dl = DataLoader(dataset=test_mulvar_ds, batch_size=64, shuffle=False)

In [19]:
# Reset PyTorch's global random seed again -- presumably so the model
# initialisation that follows is repeatable regardless of earlier RNG use.
torch.manual_seed(1)

<torch._C.Generator at 0x10c5425f0>

In [20]:
# Pick the compute device: Apple-silicon MPS when available (the original
# behaviour), otherwise CUDA, otherwise CPU. Hard-coding "mps" makes every
# later `.to(device)` fail on machines without an MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [112]:
class SimpleRNN2(nn.Module):
    """Recurrent regressor: an ``nn.RNN`` encoder followed by a small MLP head.

    The final hidden state of the last RNN layer is projected to 16 units,
    passed through ReLU, and mapped to a single output value per sample.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the RNN hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Attribute names and creation order are kept as-is: they determine the
        # state_dict keys and the order in which parameters are initialised.
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=16)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=16, out_features=1)

    def forward(self, x):
        """Map a batch-first input sequence to one prediction per sample."""
        # Only the final hidden state is used; per-step outputs are discarded.
        _, final_hidden = self.rnn(x)
        last_layer_state = final_hidden[-1]
        head_features = self.relu(self.linear(last_layer_state))
        return self.linear2(head_features)

In [113]:
# 5 input features per time step (bus, rail and the three one-hot day-type
# columns), 32 hidden units; the model is moved to the selected device.
simple_model2 = SimpleRNN2(input_size=5, hidden_size=32).to(device)

In [114]:
# Nesterov-momentum SGD on all model parameters.
optimizer = torch.optim.SGD(
    simple_model2.parameters(),
    lr=0.06,
    momentum=0.95,
    nesterov=True,
)
# Huber loss is used for both training and validation.
criterion = nn.HuberLoss()
# Project-local early stopping helper, configured with a patience of 50 and a
# checkpoint file.
early_stopper = EarlyStopping(
    patience=50,
    checkpoint_path='simple_modelx.pt',
    restore_best_weights=True,
    verbose=True,
)
# Multiply the learning rate by 0.8 after 5 epochs without improvement of the
# monitored (minimised) metric.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.8,
    patience=5,
)

In [115]:
n_epochs = 500

# Per-epoch, sample-weighted mean losses. Entries for epochs that are never
# reached (because of early stopping) keep their initial value 0.
train_loss = [0]*n_epochs
val_loss = [0]*n_epochs

for epoch in range(n_epochs):
    # --- training pass ---
    # Explicit train mode: a no-op for the current layers, but keeps the loop
    # correct if dropout / normalisation layers are ever added to the model.
    simple_model2.train()
    for x_batch, y_batch in train_mulvar_dl:
        x_batch, y_batch = x_batch.float().to(device), y_batch.float().to(device)
        optimizer.zero_grad()
        out = simple_model2(x_batch)
        # Model output is (batch, 1); squeeze to (batch,) to match the targets.
        loss = criterion(out.squeeze(1), y_batch)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size so the epoch value is a
        # true per-sample mean even when the last batch is smaller.
        train_loss[epoch] += loss.item()*y_batch.size(0)
    train_loss[epoch] /= len(train_mulvar_dl.dataset)

    # --- validation pass (no gradients) ---
    simple_model2.eval()
    with torch.no_grad():
        for x_batch_val, y_batch_val in valid_mulvar_dl:
            x_batch_val, y_batch_val = x_batch_val.float().to(device), y_batch_val.float().to(device)
            val_out = simple_model2(x_batch_val)
            vloss = criterion(val_out.squeeze(1), y_batch_val)
            val_loss[epoch] += vloss.item()*y_batch_val.size(0)
        val_loss[epoch] /= len(valid_mulvar_dl.dataset)
        # The LR scheduler is driven by the epoch's validation loss.
        scheduler.step(val_loss[epoch])

        print(f'Epoch: {epoch+1}| Train loss: {train_loss[epoch]:.4f}| Val loss: {val_loss[epoch]:.4f}')

        # early stopping
        # The helper receives the validation loss, model, optimizer and epoch
        # index and exposes `should_stop`; its checkpoint/restore behaviour is
        # defined in the project-local EarlyStopping class.
        early_stopper(val_loss[epoch], simple_model2, optimizer, epoch)
        if early_stopper.should_stop:
            print("Stopping at epoch ",epoch)
            break

Epoch: 1| Train loss: 0.0455| Val loss: 0.0156
Metric improved to 0.0156. Checkpoint saved at epoch 0
Epoch: 2| Train loss: 0.0164| Val loss: 0.0135
Metric improved to 0.0135. Checkpoint saved at epoch 1
Epoch: 3| Train loss: 0.0085| Val loss: 0.0039
Metric improved to 0.0039. Checkpoint saved at epoch 2
Epoch: 4| Train loss: 0.0032| Val loss: 0.0018
Metric improved to 0.0018. Checkpoint saved at epoch 3
Epoch: 5| Train loss: 0.0025| Val loss: 0.0014
Metric improved to 0.0014. Checkpoint saved at epoch 4
Epoch: 6| Train loss: 0.0022| Val loss: 0.0014
No improvement for 1 epoch(s)
Epoch: 7| Train loss: 0.0021| Val loss: 0.0014
No improvement for 2 epoch(s)
Epoch: 8| Train loss: 0.0019| Val loss: 0.0010
Metric improved to 0.0010. Checkpoint saved at epoch 7
Epoch: 9| Train loss: 0.0018| Val loss: 0.0011
No improvement for 1 epoch(s)
Epoch: 10| Train loss: 0.0018| Val loss: 0.0018
No improvement for 2 epoch(s)
Epoch: 11| Train loss: 0.0018| Val loss: 0.0009
Metric improved to 0.0009. Chec

In [116]:
# Summed (not averaged) absolute error per batch; dividing the grand total by
# the number of samples afterwards gives the mean absolute error.
l1_loss = nn.L1Loss(reduction='sum')

# One summed absolute error per test batch.
loss_ = []

with torch.no_grad():
    for test_inputs, test_targets in test_mulvar_dl:
        test_inputs = test_inputs.to(device=device, dtype=torch.float32)
        test_targets = test_targets.to(device=device, dtype=torch.float32)
        # Model output is (batch, 1); squeeze to (batch,) to match the targets.
        test_preds = simple_model2(test_inputs).squeeze(1)
        loss_.append(l1_loss(test_preds, test_targets).item())

In [117]:
# Mean absolute error over the test set: total of the per-batch summed errors
# divided by the number of test samples, then multiplied by 1e6 to undo the
# /1e6 scaling applied to the ridership columns when df_mulvar was built.
sum(loss_)/len(test_mulvar_dl.dataset)*1e6

123206.21612096614