In [1]:
# Requirements:
!pip install rtdl

Collecting rtdl
  Downloading rtdl-0.0.3-py3-none-any.whl (19 kB)
Installing collected packages: rtdl
Successfully installed rtdl-0.0.3


In [2]:
import numpy as np 
import pandas as pd 
import pandas_profiling as pdp

import rtdl

import sklearn.model_selection
import sklearn.preprocessing

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

import pytorch_lightning as pl

if torch.cuda.is_available():  
    print('Wohooo, GPU found!!')
    dev = "cuda:0" 
else:  
    dev = "cpu"  
    
device = torch.device(dev)

# setting random seeds for reproducibility 
torch.manual_seed(42)
np.random.seed(42)
import random
random.seed(0)

Wohooo, GPU found!!


In [3]:
### Reading in data

train = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2021/test.csv')

In [4]:
train.head

<bound method NDFrame.head of                 date_time  deg_C  relative_humidity  absolute_humidity  \
0     2010-03-10 18:00:00   13.1               46.0             0.7578   
1     2010-03-10 19:00:00   13.2               45.3             0.7255   
2     2010-03-10 20:00:00   12.6               56.2             0.7502   
3     2010-03-10 21:00:00   11.0               62.4             0.7867   
4     2010-03-10 22:00:00   11.9               59.0             0.7888   
...                   ...    ...                ...                ...   
7106  2010-12-31 20:00:00    9.2               32.0             0.3871   
7107  2010-12-31 21:00:00    9.1               33.2             0.3766   
7108  2010-12-31 22:00:00    9.6               34.6             0.4310   
7109  2010-12-31 23:00:00    8.0               40.7             0.4085   
7110  2011-01-01 00:00:00    8.0               41.3             0.4375   

      sensor_1  sensor_2  sensor_3  sensor_4  sensor_5  \
0       1387.2    1087.

In [5]:
#pdp.ProfileReport(train)

In [6]:
########## Making custom dataset class ########## 

class TabularDataset(Dataset):
   
    def __init__(self, df, target):
        self.features = df[['deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']]
        
        self.targets = df[target]
        
        self.preprocess_features = sklearn.preprocessing.MinMaxScaler().fit(self.features)
        self.target_mean = float(self.targets.mean())
        self.target_std = float(self.targets.std())

   
    def __len__(self):
        return len(self.targets)

    def __getitem__(self, item):
        features = self.features.iloc[[item]].values
        target = self.targets.iloc[[item]].values
        

        return torch.tensor(self.preprocess_features.transform(features), device=device, dtype = torch.float), torch.tensor((target-self.target_mean)/self.target_std, device=device, dtype = torch.float)

In [7]:
########## Making pl module ##########

class FTTransformerRegressor(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.FTTransformer = rtdl.FTTransformer.make_default(
            n_num_features= 8,
            cat_cardinalities=None,
            last_layer_query_idx=[-1],  # it makes the model faster and does NOT affect its output
            d_out=1,
            )
        
        self.FTTransformer
        # create metrics
        self.loss_func = nn.MSELoss()

    def forward(self, features):
        # produce embeddings
        preds = self.FTTransformer(features, None)
        return preds.squeeze(0)

    def configure_optimizers(self):        
        params = list(self.FTTransformer.parameters())
        optimizer = torch.optim.AdamW(params, lr=0.0001, weight_decay = 1e-05)

        return optimizer
    
    def training_step(self, train_batch, batch_idx):
        features, targets = train_batch
        
        preds = self.forward(features.squeeze(1))        
        loss = torch.sqrt(self.loss_func(preds, targets))
        self.log('train_loss', loss, on_epoch=True, prog_bar=True, sync_dist=True)
        
        return loss
        
    def validation_step(self, val_batch, batch_idx):
        features, targets = val_batch
        
        preds = self.forward(features.squeeze(1))
        loss = torch.sqrt(self.loss_func(preds, targets))        
        self.log('val_loss', loss, on_epoch=True, prog_bar=True, sync_dist=True)

        


# Working with target carbon monoxide

In [8]:
dataset_cm = TabularDataset(train, 'target_carbon_monoxide')

train_cm, val_cm = torch.utils.data.random_split(dataset_cm, [5689, 1422])

model_cm = FTTransformerRegressor().to(device).float()



train_loader_cm = torch.utils.data.DataLoader(train_cm, batch_size = 32, shuffle = True)

val_loader_cm = torch.utils.data.DataLoader(val_cm,  batch_size = 32, shuffle = False)


# instantiating the trainer
trainer_cm = pl.Trainer(gpus=1,
                     max_epochs=50)


trainer_cm.fit(model_cm, train_loader_cm, val_loader_cm)

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

# Working with target benzene

In [9]:
dataset_b = TabularDataset(train, 'target_benzene')

train_b, val_b = torch.utils.data.random_split(dataset_b, [5689, 1422])

model_b = FTTransformerRegressor().to(device).float()



train_loader_b = torch.utils.data.DataLoader(train_b, batch_size = 32, shuffle = True)

val_loader_b = torch.utils.data.DataLoader(val_b,  batch_size = 32, shuffle = False)


# instantiating the trainer
trainer_b = pl.Trainer(gpus=1,
                     max_epochs=50)


trainer_b.fit(model_b, train_loader_b, val_loader_b)

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

# Working with target nitrogen oxides

In [10]:
dataset_no = TabularDataset(train, 'target_nitrogen_oxides')

train_no, val_no = torch.utils.data.random_split(dataset_no, [5689, 1422])

model_no = FTTransformerRegressor().to(device).float()



train_loader_no = torch.utils.data.DataLoader(train_no, batch_size = 32, shuffle = True)

val_loader_no = torch.utils.data.DataLoader(val_no,  batch_size = 32, shuffle = False)


# instantiating the trainer
trainer_no = pl.Trainer(gpus=1,
                     max_epochs=50)


trainer_no.fit(model_no, train_loader_no, val_loader_no)

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

# Inference

In [11]:
model_cm.eval().cuda()
model_b.eval().cuda()
model_no.eval().cuda()

test_features = test[['deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']]

# preporcess data

test_features = torch.tensor(sklearn.preprocessing.MinMaxScaler().fit_transform(test_features), dtype = torch.float).to(device)


cm_mean = dataset_cm.target_mean
cm_std = dataset_cm.target_std

b_mean = dataset_b.target_mean
b_std = dataset_b.target_std

no_mean = dataset_no.target_mean
no_std = dataset_no.target_std


submission_df = pd.DataFrame(columns = ['date_time', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'])
                                        
        

In [12]:
for i in range(len(test_features)):
    submission_df.loc[len(submission_df.index)] = [test['date_time'].iloc[i], (model_cm(test_features[i]).detach().cpu().numpy()[0]*cm_std + cm_mean), (model_b(test_features[i]).detach().cpu().numpy()[0]*b_std + b_mean), (model_no(test_features[i]).detach().cpu().numpy()[0]*no_std + no_mean)]




In [13]:
submission_df


submission_df.to_csv('submission.csv', index = False)