In [7]:
import numpy as np
import pandas as pd
import plotly.express as px

In [8]:
data = np.load('../data/metr_la_new.npz')

In [9]:
data['edges']

array([[  0,  37],
       [  0,  54],
       [  0, 116],
       ...,
       [206, 155],
       [206, 159],
       [206, 163]], dtype=int32)

In [10]:
dataset = data['targets']
dataset_size = len(dataset)

In [11]:
for i in range(0, len(dataset)):
    for j in range(0, len(dataset[i])):
        if np.isnan(dataset[i][j]):
            dataset[i][j] = dataset[max(i - 1, 0)][j]

In [12]:
import torch 
import random
from torch.utils.data import TensorDataset, DataLoader


In [13]:
device = "cpu" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [14]:
train_size = int(0.6 * dataset_size)
test_size =  int(0.2 * dataset_size)
vertice = 0
dataset_for_vertice = dataset[:, vertice]

In [15]:
coef = 20
pred_cnt = 12

In [16]:
samples_train = [i for i in random.sample(range(coef, int(dataset_size * 0.7)), train_size)]

X_train = [dataset_for_vertice[i - coef: i] for i in samples_train]
X_train = torch.tensor(X_train).to(device)

y_train = [dataset_for_vertice[i:i + pred_cnt] for i in samples_train]
y_train = torch.tensor(y_train).to(device)

samples_test = [i for i in random.sample(range(int(dataset_size * 0.7), dataset_size - pred_cnt), test_size)]
X_test = [dataset_for_vertice[i - coef: i] for i in samples_test]
X_test = torch.tensor(X_test).to(device)

y_test = [dataset_for_vertice[i:i + pred_cnt] for i in samples_test]
y_test = torch.tensor(y_test).to(device)

  X_train = torch.tensor(X_train).to(device)


In [17]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print(y_train)
print(y_test)


torch.Size([20563, 20]) torch.Size([20563, 12])
torch.Size([6854, 20]) torch.Size([6854, 12])
tensor([[66.1111, 64.7500, 69.5000,  ..., 68.8750, 68.6667, 67.3750],
        [65.3333, 68.3333, 68.3333,  ..., 66.8750, 66.1111, 66.5556],
        [67.4444, 65.1250, 64.7500,  ..., 64.3750, 65.1111, 65.5000],
        ...,
        [65.2500, 65.2500, 63.5556,  ..., 65.5714, 66.2222, 64.8750],
        [63.3333, 57.8750, 62.6667,  ..., 60.7500, 62.0000, 65.0000],
        [64.0000, 64.4444, 60.7500,  ..., 63.2500, 68.1250, 67.5556]])
tensor([[66.0000, 66.0000, 66.0000,  ..., 66.0000, 66.0000, 66.0000],
        [64.1250, 63.1111, 63.2500,  ..., 14.5000, 16.1250, 18.2500],
        [62.8889, 59.3750, 65.5000,  ..., 66.7500, 66.1250, 63.8889],
        ...,
        [68.0000, 67.1111, 68.0000,  ..., 68.1250, 68.1429, 66.5556],
        [65.1429, 66.1111, 66.3750,  ..., 64.7500, 58.8750, 62.4444],
        [63.8889, 65.0000, 64.8750,  ..., 62.0000, 63.7500, 66.8889]])


In [18]:
train_data_set = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_data_set,
                        shuffle = True,
                        batch_size = 8)

test_data_set = TensorDataset(X_test, y_test)
test_dataloader = DataLoader(test_data_set,
                        shuffle = True,
                        batch_size = 1)

In [19]:
from torch import nn

model_1 = nn.Linear(coef, 1)
model_1.to(device)

model_1 = torch.compile(model_1)


In [20]:
optimizer = torch.optim.Adam(params = model_1.parameters(), lr = 1e-7, weight_decay=1e-5)
loss_fn = nn.L1Loss()

In [21]:
from tqdm import tqdm

In [22]:
torch.set_float32_matmul_precision('high')

In [23]:
def make_a_guess(model, X):
    ans = []
    updated_X = X
    for k in range(pred_cnt):
        ans.append(model(updated_X).to(device))
        updated_X = updated_X[:, :-1]
        updated_X = torch.cat([updated_X, ans[-1]], dim = 1)
    ans = torch.stack(ans, dim = 0).to(device)
    ans = torch.permute(ans, (1, 0, 2)).squeeze(dim = 2)
    return ans

In [24]:
epochs = 100


losses = []

for i in tqdm(range(epochs)):
    for x, y in train_dataloader:
        y_pred = make_a_guess(model = model_1, X=x)
        loss = loss_fn(y_pred, y).to(device)
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


  0%|          | 0/100 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [25]:
px.line(y = np.array(losses))

In [21]:
np.array(losses).mean()

69.5237351027837

In [51]:
def handle_autoregression(model_1):
    model_1.eval()
    test_losses_using_model = []
    with torch.inference_mode():
        for x, y in test_dataloader:
            y_pred = make_a_guess(model = model_1, X=x)
            loss = loss_fn(y_pred, y).to(device)
            test_losses_using_model.append(loss.item())
    return test_losses_using_model

mean_loss_using_model = np.array(handle_autoregression(model_1)).mean()
mean_loss_using_model

65.79896633737039

In [27]:
type(dataset_for_vertice)

numpy.ndarray

In [28]:
average_val = torch.tensor(dataset_for_vertice.mean())
test_losses_using_average = []

for x, y in test_dataloader:
    y_pred = torch.stack([average_val] * pred_cnt, dim = 0).unsqueeze(dim = 0).to(device)
    loss = loss_fn(y_pred, y).to(device)
    test_losses_using_average.append(loss.item())


mean_loss_using_average = np.array(test_losses_using_average).mean()
mean_loss_using_average

7.069966372040538

In [29]:
test_losses_using_last = []

for x, y in test_dataloader:
    last_val = x[:, -1]
    y_pred = torch.stack([last_val] * pred_cnt, dim = 0).unsqueeze(dim = 0).squeeze(dim = 2).to(device)
    loss = loss_fn(y_pred, y).to(device)
    test_losses_using_last.append(loss.item())


mean_loss_using_last = np.array(test_losses_using_last).mean()
mean_loss_using_last

3.202477211234284

In [30]:
from random import randint

In [27]:
test_losses_using_week_before = []
week_length = 12 * 24 * 7

for i in range(test_size):
    x = randint(week_length, dataset_for_vertice.shape[0] - 1)
    y = torch.tensor(dataset_for_vertice[x]).to(device)
    y_pred = torch.tensor(dataset_for_vertice[x - week_length]).to(device)
    loss = loss_fn(y_pred, y).to(device)
    test_losses_using_week_before.append(loss.item())

mean_loss_using_week_before = np.array(test_losses_using_week_before).mean()
mean_loss_using_week_before

4.602694672400032

In [28]:
from pandas import DataFrame as df

In [29]:
print("Average error for next hour guesses")
print(f"mean_loss_using_average: {mean_loss_using_average}")
print(f"mean_loss_using_last: {mean_loss_using_last}")
print(f"mean_loss_using_week_before :{mean_loss_using_week_before}")
print(f"mean_loss_using_model: {mean_loss_using_model}")


Average error for next hour guesses
mean_loss_using_average: 7.001247019917077
mean_loss_using_last: 3.129279187463848
mean_loss_using_week_before :4.602694672400032
mean_loss_using_model: 56.34433264491646


In [30]:
comparison = df({"mean_loss_using_average": [mean_loss_using_average.item()],
                 "mean_loss_using_last": [mean_loss_using_last.item()],
                 "mean_loss_using_week_before": [mean_loss_using_week_before.item()],
                 "mean_loss_using_model": [mean_loss_using_model.item()]})
comparison.T

Unnamed: 0,0
mean_loss_using_average,7.001247
mean_loss_using_last,3.129279
mean_loss_using_week_before,4.602695
mean_loss_using_model,56.344333


In [32]:
! pip install catboost

Collecting catboost
  Using cached catboost-1.2.7-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp312-cp312-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3


In [33]:
from catboost import CatBoostRegressor

In [70]:
cat = CatBoostRegressor(iterations =)

In [71]:
X_train = pd.DataFrame(np.array(X_train))
y_train = pd.DataFrame(np.array(y_train)[:, 0])

cat.fit(X_train, y_train,)

Learning rate set to 0.5
0:	learn: 7.0334490	total: 7.23ms	remaining: 0us


<catboost.core.CatBoostRegressor at 0x70172beed5b0>

In [73]:
def handle_autoregression_for_catboost(time_line: list, 
                                       ethalon:list,
                                       epochs:int,
                                       model_cat):
    answer = []
    time_line = time_line.tolist()
    for i in range(0, epochs):
        next = model_cat.predict(time_line)
        answer.append(next)
        time_line.append(next)
        time_line[1:]
    return (abs(np.array(answer) - np.array(ethalon))**2).mean()

In [74]:
result = 0

for x, y in zip(np.array(X_test), np.array(y_test)):
    result += handle_autoregression_for_catboost(x, 
                                                 y, 
                                                 epochs = 12, 
                                                 model_cat = cat)

print(result / len(X_test))

81.6302172653747
