In [73]:
import pandas as pd
import numpy as np
import torch
from torch import nn

from sklearn.model_selection import train_test_split

In [46]:
train_df = pd.read_csv('train.csv')
train_df

Unnamed: 0,Id,node_start,node_finish,day_time,distance,speed,route_distance_km,delta_time
0,-6374252502568484586,10096,2517,10,2.232533,34.0,2.156,436.0
1,-6374252502568484586,10103,11280,10,17.414917,25.0,2.156,436.0
2,-6374252502568484586,10104,13554,10,17.186843,27.0,2.156,436.0
3,-6374252502568484586,10117,18329,10,20.216909,37.0,2.156,436.0
4,-6374252502568484586,18329,13193,10,202.754917,35.0,2.156,436.0
...,...,...,...,...,...,...,...,...
401306,-8229597404562288405,7076,17753,11,4.847930,23.0,3.065,767.0
401307,-8229597404562288405,7083,7076,11,131.325685,26.0,3.065,767.0
401308,-8229597404562288405,7084,7083,11,44.026544,38.0,3.065,767.0
401309,-8229597404562288405,7077,7084,11,0.111226,34.0,3.065,767.0


In [67]:
min_ = train_df[['distance', 'speed', 'route_distance_km']].min()
max_ = train_df[['distance', 'speed', 'route_distance_km']].max()
train_df[['distance', 'speed', 'route_distance_km']] = (train_df[['distance', 'speed', 'route_distance_km']] - min_) / (max_ - min_)
train_df

Unnamed: 0,Id,node_start,node_finish,day_time,distance,speed,route_distance_km,delta_time
0,-6374252502568484586,10096,2517,10,0.001569,0.305263,0.135139,436.0
1,-6374252502568484586,10103,11280,10,0.012615,0.210526,0.135139,436.0
2,-6374252502568484586,10104,13554,10,0.012449,0.231579,0.135139,436.0
3,-6374252502568484586,10117,18329,10,0.014654,0.336842,0.135139,436.0
4,-6374252502568484586,18329,13193,10,0.147463,0.315789,0.135139,436.0
...,...,...,...,...,...,...,...,...
401306,-8229597404562288405,7076,17753,11,0.003472,0.189474,0.192115,767.0
401307,-8229597404562288405,7083,7076,11,0.095493,0.221053,0.192115,767.0
401308,-8229597404562288405,7084,7083,11,0.031977,0.347368,0.192115,767.0
401309,-8229597404562288405,7077,7084,11,0.000025,0.305263,0.192115,767.0


In [80]:
min_, max_

(distance             0.0
 speed                0.0
 route_distance_km    0.0
 dtype: float64,
 distance             1.0
 speed                1.0
 route_distance_km    1.0
 dtype: float64)

In [68]:
train_df.fillna(0, inplace=True).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401311 entries, 0 to 401310
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Id                 401311 non-null  int64  
 1   node_start         401311 non-null  int64  
 2   node_finish        401311 non-null  int64  
 3   day_time           401311 non-null  int64  
 4   distance           401311 non-null  float64
 5   speed              398043 non-null  float64
 6   route_distance_km  401311 non-null  float64
 7   delta_time         401311 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 24.5 MB


In [48]:
node_emb_size = train_df['node_finish'].max() + 1
node_emb_dim = 50

hour_emb_size = train_df['day_time'].max() + 1
hour_emb_dim = 50

In [49]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.node_emb = nn.Sequential(
            nn.Embedding(node_emb_size, node_emb_dim),
            nn.Linear(node_emb_dim, 128),
            nn.ReLU(),
        )

        self.hour_emb = nn.Sequential(
            nn.Embedding(hour_emb_size, hour_emb_dim),
            nn.Linear(hour_emb_dim, 128),
            nn.ReLU(),
        )

        self.fc1 = nn.Sequential(
            nn.Linear(3, 64),
            nn.ReLU(),

            nn.Linear(64, 128),
            nn.ReLU(),
        )

        self.fc2 = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),

            nn.Linear(64, 1),
        )

    def forward(self, x):
        x1 = self.node_emb(x[:, :2]).sum(axis=2)

        x2 = self.hour_emb(x[:, 2:3])
        x3 = self.fc1(x[:, 3:])
        return self.fc2(x1+x2+x3)

In [50]:
groups = train_df.groupby('Id')
groups.size()

Id
-9220688251826578095     83
-9218029690111550526    110
-9203204462098983368    156
-9183832189489697758    129
-9182793427449706037     78
                       ... 
 9210081165830935160    161
 9210110364218727646     99
 9210371713303339999     95
 9211624789772099086     45
 9222286614781517942     49
Length: 5000, dtype: int64

In [51]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [65]:
def train(epochs, model, optim, loss):
    for epoch in range(epochs):
        print(f'Epoch {epoch}'.center(50, '='))
        for i, group in enumerate(groups):
            id_ = group[0]
            X = group[1].to_numpy(dtype='float32')[2:]
            y = train_df[train_df.Id == id_]['delta_time']

            X = torch.tensor(X, device==device)
            y = torch.tensor(y, device=device)

            pred = model(X)
            l = loss(pred, y)

            optim.zero_grad()
            l.backward()
            optim.step()

            if i % 100 == 0:
                print(f'Batch: {i * 100}\n'
                      f'Loss: {l.item()}')


In [59]:
net = Model().to(device)
opt = torch.optim.Adam(net.parameters(), lr=2e-4)

In [66]:
train(5, net, opt, nn.MSELoss())



TypeError: tensor() takes 1 positional argument but 2 were given

In [70]:
len(list(groups))

5000

In [72]:
groups['node_start'].count().max()

272

In [77]:
groups = train_df.groupby('Id')
train, test = train_test_split(list(groups), test_size=0.1)

In [78]:
len(test)

500

In [79]:
groups[groups['speed'].isna()] = groups['speed'].mean()

AttributeError: 'SeriesGroupBy' object has no attribute 'isna'