In [26]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import RobustScaler
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import itertools
import pprint
import random
import time
import os

from sklearn.model_selection import GroupKFold

from transformers import get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

import gc
import os
import torch
import random
import numpy as np

In [27]:
# Load the train and test CSV files into DataFrames
train = pd.read_csv("./vpp_data/train.csv")
test = pd.read_csv('./vpp_data/test.csv')

In [28]:
def create_features(df):
    """
    Build the per-time-step feature set for one ventilator frame.

    Args:
        df (pandas dataframe): Frame holding at least `breath_id`, `time_step`,
            `u_in`, `u_out`, `R` and `C`. The columns created before the NaN
            fill (products, running totals and lag columns) are also written
            onto this frame in place.

    Returns:
        pandas dataframe: New frame with every engineered column; the string
        columns `R`, `C` and `R__C` are one-hot encoded by `pd.get_dummies`.
    """
    signals = ('u_in', 'u_out')

    # Step 1: element-wise products and per-breath running totals
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']
    df['area'] = (df['time_step'] * df['u_in']).groupby(df['breath_id']).cumsum()
    df['time_step_cumsum'] = df.groupby('breath_id')['time_step'].cumsum()
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    print("Step-1...Completed")

    # Step 2: past ("lag") and future ("lag_back") values inside each breath;
    # positions shifted outside the breath become NaN and are filled with 0
    for step in range(1, 5):
        for suffix, offset in (('lag', step), ('lag_back', -step)):
            for signal in signals:
                df[f'{signal}_{suffix}{step}'] = df.groupby('breath_id')[signal].shift(offset)
    df = df.fillna(0)
    print("Step-2...Completed")

    # Step 3: per-breath max / mean of u_in and the distance of each row to them
    u_in_by_breath = df.groupby('breath_id')['u_in']
    breath_max = u_in_by_breath.transform('max')
    breath_mean = u_in_by_breath.transform('mean')
    df['breath_id__u_in__max'] = breath_max
    df['breath_id__u_in__mean'] = breath_mean
    df['breath_id__u_in__diffmax'] = breath_max - df['u_in']
    df['breath_id__u_in__diffmean'] = breath_mean - df['u_in']
    print("Step-3...Completed")

    # Step 4: difference between the current value and each lagged value
    for step in range(1, 5):
        for signal in signals:
            df[f'{signal}_diff{step}'] = df[signal] - df[f'{signal}_lag{step}']
    print("Step-4...Completed")

    # Step 5: running row count / running mean, plus lags taken on the whole
    # frame and zeroed wherever the shifted row belongs to another breath
    df['one'] = 1
    df['count'] = df.groupby('breath_id')['one'].cumsum()
    df['u_in_cummean'] = df['u_in_cumsum'] / df['count']

    offsets = ((1, ''), (2, '2'))
    for shift_by, tag in offsets:
        df[f'breath_id_lag{tag}'] = df['breath_id'].shift(shift_by).fillna(0)
    for _, tag in offsets:
        df[f'breath_id_lag{tag}same'] = np.where(df[f'breath_id_lag{tag}'] == df['breath_id'], 1, 0)
    for shift_by, tag in offsets:
        df[f'breath_id__u_in_lag{tag}'] = df['u_in'].shift(shift_by).fillna(0) * df[f'breath_id_lag{tag}same']
    print("Step-5...Completed")

    # Step 6: time delta and 15-row rolling statistics of u_in inside each breath
    df['time_step_diff'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    window_stats = {"15_in_sum": "sum",
                    "15_in_min": "min",
                    "15_in_max": "max",
                    "15_in_mean": "mean"}
    rolled = (
        df.groupby('breath_id')['u_in']
        .rolling(window=15, min_periods=1)
        .agg(window_stats)
        # drop the breath_id index level so rows align with df's own index
        .reset_index(level=0, drop=True)
    )
    df[list(window_stats)] = rolled
    print("Step-6...Completed")

    # Step 7: difference between the current value and the next one / two values
    for step in (1, 2):
        for signal in signals:
            df[f'{signal}_lagback_diff{step}'] = df[signal] - df[f'{signal}_lag_back{step}']
    print("Step-7...Completed")

    # Step 8: R, C and their combination become strings so get_dummies encodes them
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    df = pd.get_dummies(df)
    print("Step-8...Completed")

    return df


# Build the engineered feature frames for both splits
print("Train data...\n")
train_df = create_features(train)

print("\nTest data...\n")
test_df = create_features(test)

# Drop the raw frames and trigger a garbage collection to release their memory
del train
del test
gc.collect()

Train data...

Step-1...Completed
Step-2...Completed
Step-3...Completed
Step-4...Completed
Step-5...Completed
Step-6...Completed
Step-7...Completed
Step-8...Completed

Test data...

Step-1...Completed
Step-2...Completed
Step-3...Completed
Step-4...Completed
Step-5...Completed
Step-6...Completed
Step-7...Completed
Step-8...Completed


0

In [29]:
#pd.show_versions()

In [30]:
def create_features(dataframe, list_of_features = ['u_in']):
    """
    Alternative, lighter feature builder (lags, time deltas, areas, per-breath stats).

    NOTE: this definition reuses the name `create_features`, so once this cell
    runs it replaces the earlier function of the same name in the notebook.

    Args:
        dataframe (pandas dataframe): Frame with `breath_id`, `time_step`, `u_in`
            and every column named in `list_of_features`. The row-wise columns
            (cumsums, lags, time diffs, areas) are written onto it in place.
        list_of_features (list of str, optional): Columns for which per-breath
            max / min / mean / median / range are added. Defaults to ['u_in'].
            The default list is only read, never modified.

    Returns:
        pandas dataframe: Result of left-merging the per-breath statistics onto
        the frame (a new frame with a fresh integer index, as produced by merge).
    """
    # Running total of u_in inside each breath
    dataframe['u_in_cumsum'] = dataframe.groupby('breath_id')['u_in'].cumsum()

    # Previous ("fwrd") and next ("back") u_in values, 0 where the shift leaves the breath
    for lag in range(1, 5):
        u_in_by_breath = dataframe.groupby('breath_id')['u_in']
        dataframe[f'u_in_lag_fwrd{lag}'] = u_in_by_breath.shift(lag).fillna(0)
        dataframe[f'u_in_lag_back{lag}'] = u_in_by_breath.shift(-lag).fillna(0)

    # Time elapsed since the row 1..5 steps earlier in the same breath
    dataframe['time_diff'] = dataframe.groupby('breath_id')['time_step'].diff(1).fillna(0)
    for periods in range(2, 6):
        dataframe[f'time_diff_{periods}'] = (
            dataframe.groupby('breath_id')['time_step'].diff(periods).fillna(0)
        )

    # u_in area: time_step-weighted u_in and its running total
    dataframe['area'] = dataframe['time_step'] * dataframe['u_in']
    dataframe['area_cumsum'] = dataframe.groupby('breath_id')['area'].cumsum()
    # Rectangle method: u_in weighted by the step duration instead of the absolute time
    dataframe['auc_u_in'] = dataframe['time_diff'] * dataframe['u_in']
    dataframe['auc_u_in_cumsum'] = dataframe.groupby('breath_id')['auc_u_in'].cumsum()

    for feature in list_of_features:
        # Named aggregation yields the final column names directly, so no rename
        # step is needed and existing columns called 'max', 'min', ... cannot clash.
        # String aggregators replace the builtin / NumPy callables, which recent
        # pandas versions deprecate as `agg` arguments.
        breath_stats = dataframe.groupby('breath_id')[feature].agg(**{
            f'{feature}_max': 'max',
            f'{feature}_min': 'min',
            f'{feature}_mean': 'mean',
            f'{feature}_median': 'median',
        })

        dataframe = dataframe.merge(
            breath_stats,
            how='left',
            on='breath_id'
        )

        # Vectorised form of the former per-row max(0, x): negatives are clamped
        # to 0 and missing values map to 0, as they did with the Python max().
        dataframe[f'{feature}_range'] = (
            (dataframe[f'{feature}_max'] - dataframe[f'{feature}_min'])
            .clip(lower=0)
            .fillna(0)
        )

    return dataframe

#train_df = create_features(train_df)
#test_df = create_features(test_df)
#train_df.head()

In [31]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
# RobustScaler is used because its scaling statistics are robust to outliers

RS = RobustScaler()

# Scale every column except the identifiers, time_step, the target and u_out
# (u_out is left out because it is a boolean flag)
columns_to_scale = [elm for elm in train_df.columns if elm not in ['id', 'breath_id', 'time_step', 'pressure', 'u_out']]
# The scaler is fitted on the training frame only and reused as-is on the test frame
train_df[columns_to_scale] = RS.fit_transform(train_df[columns_to_scale])

# 'pressure' is excluded from columns_to_scale, so this placeholder column is not
# read by the transform below; it is dropped again right after
test_df['pressure'] = 0
test_df[columns_to_scale] = RS.transform(test_df[columns_to_scale])

test_df = test_df.drop('pressure', axis = 1)

In [32]:
import torch
from torch.utils.data import Dataset
import torch.nn as nn

# Dataset

In [33]:
train_df.columns

# ['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure',
#        'u_in_cumsum', 'u_in_lag_fwrd1', 'u_in_lag_back1', 'u_in_lag_fwrd2',
#        'u_in_lag_back2', 'u_in_lag_fwrd3', 'u_in_lag_back3', 'u_in_lag_fwrd4',
#        'u_in_lag_back4', 'time_diff', 'time_diff_2', 'time_diff_3',
#        'time_diff_4', 'time_diff_5', 'area', 'u_in_max', 'u_in_min',
#        'u_in_mean', 'u_in_median', 'u_in_range', 'time_step_max',
#        'time_step_min', 'time_step_mean', 'time_step_median',
#        'time_step_range', 'RC', 'R/C', 'C/R', 'log_RC', 'log_R/C', 'log_C/R',
#        'exp_R/C', 'exp_C/R']

Index(['id', 'breath_id', 'time_step', 'u_in', 'u_out', 'pressure', 'cross',
       'cross2', 'area', 'time_step_cumsum', 'u_in_cumsum', 'u_in_lag1',
       'u_out_lag1', 'u_in_lag_back1', 'u_out_lag_back1', 'u_in_lag2',
       'u_out_lag2', 'u_in_lag_back2', 'u_out_lag_back2', 'u_in_lag3',
       'u_out_lag3', 'u_in_lag_back3', 'u_out_lag_back3', 'u_in_lag4',
       'u_out_lag4', 'u_in_lag_back4', 'u_out_lag_back4',
       'breath_id__u_in__max', 'breath_id__u_in__mean',
       'breath_id__u_in__diffmax', 'breath_id__u_in__diffmean', 'u_in_diff1',
       'u_out_diff1', 'u_in_diff2', 'u_out_diff2', 'u_in_diff3', 'u_out_diff3',
       'u_in_diff4', 'u_out_diff4', 'one', 'count', 'u_in_cummean',
       'breath_id_lag', 'breath_id_lag2', 'breath_id_lagsame',
       'breath_id_lag2same', 'breath_id__u_in_lag', 'breath_id__u_in_lag2',
       'time_step_diff', '15_in_sum', '15_in_min', '15_in_max', '15_in_mean',
       'u_in_lagback_diff1', 'u_out_lagback_diff1', 'u_in_lagback_diff2',
      

In [34]:
class VPPDataLoader(Dataset):
    """
    Map-style dataset that yields one breath (one `breath_id` group) per item.

    Despite its name this is a `torch.utils.data.Dataset`, not a DataLoader.

    Attributes:
        df_grouped (pandas dataframe): One row per breath; every other column
            holds the list of that breath's values.
        inputs (np array): Features stacked as [breaths x steps x len(FEATURES)].
        pressures (np array): Targets, [breaths x steps].
        u_outs (np array): `u_out` flags, [breaths x steps].

    Building the arrays assumes every breath has the same number of rows.
    """

    # Model input columns, in the exact order they are stacked on the last axis.
    # The order defines the meaning of each input channel: do not reorder.
    FEATURES = (
        'u_in', 'u_out',
        'cross', 'cross2',
        'area', 'time_step_cumsum', 'u_in_cumsum',
        'u_in_lag1', 'u_out_lag1', 'u_in_lag_back1', 'u_out_lag_back1',
        'u_in_lag2', 'u_out_lag2', 'u_in_lag_back2', 'u_out_lag_back2',
        'u_in_lag3', 'u_out_lag3', 'u_in_lag_back3', 'u_out_lag_back3',
        'u_in_lag4', 'u_out_lag4', 'u_in_lag_back4', 'u_out_lag_back4',
        'breath_id__u_in__max', 'breath_id__u_in__mean',
        'breath_id__u_in__diffmax', 'breath_id__u_in__diffmean',
        'u_in_diff1', 'u_out_diff1', 'u_in_diff2', 'u_out_diff2',
        'u_in_diff3', 'u_out_diff3', 'u_in_diff4', 'u_out_diff4',
        'count', 'u_in_cummean',
        'breath_id_lag', 'breath_id_lag2', 'breath_id_lagsame', 'breath_id_lag2same',
        'breath_id__u_in_lag', 'breath_id__u_in_lag2', 'time_step_diff',
        '15_in_sum', '15_in_min', '15_in_max', '15_in_mean',
        'u_in_lagback_diff1', 'u_out_lagback_diff1',
        'u_in_lagback_diff2', 'u_out_lagback_diff2',
        'R_20', 'R_5', 'R_50',
        'C_10', 'C_20', 'C_50',
        'R__C_20__10', 'R__C_20__20', 'R__C_20__50',
        'R__C_50__10', 'R__C_50__20', 'R__C_50__50',
        'R__C_5__10', 'R__C_5__20', 'R__C_5__50',
    )

    def __init__(self, dataframe):
        """
        Args:
            dataframe (pandas dataframe): Row-per-time-step frame containing
                `breath_id` and every column listed in `FEATURES`. `pressure`
                is optional; when absent an all-zero target is used. The frame
                itself is left unmodified.
        """
        # aggregate data and store features as list
        self.df_grouped = dataframe.groupby('breath_id').agg(list).reset_index()

        if "pressure" not in self.df_grouped.columns:
            # Zero placeholder target, added to the grouped copy only so the
            # caller's frame does not silently gain a column
            self.df_grouped['pressure'] = self.df_grouped['u_out'].map(
                lambda seq: [0] * len(seq)
            )

        self._preprocess()

    def _column_as_array(self, name):
        """Turn one list-valued column of `df_grouped` into a [breaths x steps] array."""
        return np.array(self.df_grouped[name].values.tolist())

    def _preprocess(self):
        """Materialise targets, u_out flags and the stacked model inputs as arrays."""
        self.pressures = self._column_as_array('pressure')

        columns = {name: self._column_as_array(name) for name in self.FEATURES}
        self.u_outs = columns['u_out']

        # Stack on a new last axis -> [breaths x steps x features]
        self.inputs = np.stack([columns[name] for name in self.FEATURES], axis=-1)

    def __len__(self):
        return self.df_grouped.shape[0]

    def __getitem__(self, index):
        # Read from the precomputed arrays rather than doing a pandas lookup per item
        return {
            "input": torch.tensor(self.inputs[index], dtype=torch.float),
            "u_out": torch.tensor(self.u_outs[index], dtype=torch.int8),
            "p": torch.tensor(self.pressures[index], dtype=torch.float),
        }

In [35]:
# Swish wrapped as an nn.Module so the activation can be dropped straight
# into model definitions such as nn.Sequential.

# NOTE: PyTorch ships this activation as nn.SiLU(); prefer it in new code.

class Swish(nn.Module):
    '''
    Element-wise Swish / SiLU activation: every value is multiplied by its
    own sigmoid, i.e. Swish(x) = x * sigmoid(x).

    Shape:
        - Input: (N, *), with * standing for any number of extra dimensions
        - Output: (N, *), identical to the input shape
    '''

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate


# Shared, parameter-free instance reused by the model definitions
act_function = Swish()

# Model

In [36]:
class LSTM_archi(nn.Module):
    """
    Two dense layers followed by a 4-layer bidirectional LSTM and a linear head.

    Args:
        input_dim (int): Number of expected features per time step.
        lstm_dim (int): Hidden size of each LSTM direction.
        dense_dim (int): Width of the dense projection fed to the LSTM
            (the first dense layer uses 2 * (dense_dim // 3) units).
        logit_dim (int): Width of the intermediate layer of the head.
        num_classes (int, optional): Outputs per time step. Defaults to 1.
    """
    def __init__(
        self, 
        # nb of expected features
        input_dim,
        lstm_dim,
        dense_dim,
        logit_dim,
        num_classes=1,
    ):
        
        super().__init__()
        self.classic_layer = nn.Sequential(
            nn.Linear(in_features = input_dim, out_features = 2*(dense_dim // 3)),
            nn.ReLU(),
        )
        
        self.upscale_layer = nn.Sequential(
            nn.Linear(in_features =  2*(dense_dim // 3), out_features = dense_dim),
            nn.ReLU(),
        )
        
        self.LSTM_layer = nn.LSTM(
            input_size = dense_dim,
            hidden_size = lstm_dim,
            bidirectional = True,
            num_layers = 4,
            # then the input and output tensors are provided as (batch, seq, feature) instead of (seq, batch, feature).
            batch_first = True,
        )
        
        self.output_layer = nn.Sequential(
            # A bidirectional LSTM emits 2 * hidden_size features per step, so the
            # head must be sized from lstm_dim (it was sized from dense_dim, which
            # only worked when both dimensions happened to be equal).
            nn.Linear(in_features = lstm_dim * 2, out_features = logit_dim),
            nn.Linear(in_features = logit_dim, out_features = num_classes)
        )
        
        # Dropout module (p=0.1); defined but not applied in forward()
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x):
        """Map a (batch, seq, input_dim) tensor to (batch, seq, num_classes)."""
        x = self.classic_layer(x)
        x = self.upscale_layer(x)
        x, _ = self.LSTM_layer(x)
        pred = self.output_layer(x)
        return pred

In [37]:
class LSTM_archi_3(nn.Module):
    """
    Dense projection followed by three stacked bidirectional LSTMs of shrinking
    width and a linear head.

    `LSTM_layer_y` and `uplayer` are still constructed (so parameter names and
    initialisation order stay unchanged) but are not used by `forward`.
    TODO(review): decide how this second branch is meant to be merged, or drop it.

    Args:
        input_dim (int): Number of expected features per time step.
        lstm_dim (int): Base LSTM width; layers use lstm_dim // 2, // 4 and // 8
            units per direction, so it is expected to be a multiple of 8.
        dense_dim (int): Width of the dense projection fed to the first LSTM.
        logit_dim (int): Width of the intermediate layer of the head.
        num_classes (int, optional): Outputs per time step. Defaults to 1.
    """
    def __init__(
        self, 
        # nb of expected features
        input_dim,
        lstm_dim,
        dense_dim,
        logit_dim,
        num_classes=1,
    ):
        
        super().__init__()
        
        self.classic_layer = nn.Sequential(
            nn.Linear(in_features = input_dim, out_features = dense_dim),
            act_function,
        )
        
        self.LSTM_layer_1 = nn.LSTM(
            input_size = dense_dim,
            hidden_size = lstm_dim // 2,
            bidirectional = True,
            num_layers = 1,
            batch_first = True,
        )
        
        self.LSTM_layer_2 = nn.LSTM(
            input_size = lstm_dim,
            hidden_size = lstm_dim // 4,
            bidirectional = True,
            num_layers = 1,
            batch_first = True,
        )
        
        self.LSTM_layer_3 = nn.LSTM(
            input_size = lstm_dim // 2,
            hidden_size = lstm_dim // 8,
            bidirectional = True,
            num_layers = 1,
            batch_first = True,
        )
        
        # Not used by forward(); see class docstring
        self.LSTM_layer_y = nn.LSTM(
            input_size = lstm_dim // 2,
            hidden_size = lstm_dim // 4,
            bidirectional = True,
            num_layers = 3,
            batch_first = True,
        )
        
        # Not used by forward(); see class docstring
        self.uplayer = nn.Sequential(
            nn.Linear(in_features = input_dim, out_features = dense_dim),
            act_function,
        )
        
        self.output_layer = nn.Sequential(
            nn.Linear(in_features = lstm_dim // 4, out_features = logit_dim),
            nn.Linear(in_features = logit_dim, out_features = num_classes)
        )
        
        # Dropout module (p=0.1); defined but not applied in forward()
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x):
        """Map a (batch, seq, input_dim) tensor to (batch, seq, num_classes)."""
        x = self.classic_layer(x)

        x, _ = self.LSTM_layer_1(x)
        x, _ = self.LSTM_layer_2(x)
        # Feed the stacked output (not the raw input) to the last LSTM: the raw
        # input has input_dim features while this layer expects lstm_dim // 2.
        x, _ = self.LSTM_layer_3(x)

        # The former `LSTM_layer_y` pass was removed from the forward path: its
        # result was discarded and its input width did not match dense_dim.
        pred = self.output_layer(x)
        return pred

In [38]:
class LSTM_archi_2(nn.Module):
    """
    Dense projection, three stacked single-layer bidirectional LSTMs whose
    per-direction width halves at each stage, then a two-layer linear head.

    Args:
        input_dim (int): Number of expected features per time step.
        lstm_dim (int): Base LSTM width (stages use lstm_dim // 2, // 4, // 8).
        dense_dim (int): Width of the dense projection fed to the first LSTM.
        logit_dim (int): Width of the intermediate layer of the head.
        num_classes (int, optional): Outputs per time step. Defaults to 1.
    """

    def __init__(self, input_dim, lstm_dim, dense_dim, logit_dim, num_classes=1):
        super().__init__()

        def bidirectional_lstm(in_size, hidden):
            # batch_first: tensors are (batch, seq, feature)
            return nn.LSTM(in_size, hidden, num_layers=1, batch_first=True, bidirectional=True)

        self.classic_layer = nn.Sequential(nn.Linear(input_dim, dense_dim), act_function)

        self.LSTM_layer_1 = bidirectional_lstm(dense_dim, lstm_dim // 2)
        self.LSTM_layer_2 = bidirectional_lstm(lstm_dim, lstm_dim // 4)
        self.LSTM_layer_3 = bidirectional_lstm(lstm_dim // 2, lstm_dim // 8)

        self.output_layer = nn.Sequential(
            nn.Linear(lstm_dim // 4, logit_dim),
            nn.Linear(logit_dim, num_classes),
        )

        # Dropout module (p=0.1); kept as an attribute, not applied in forward()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        """Map a (batch, seq, input_dim) tensor to (batch, seq, num_classes)."""
        hidden = self.classic_layer(x)
        for recurrent in (self.LSTM_layer_1, self.LSTM_layer_2, self.LSTM_layer_3):
            hidden, _ = recurrent(hidden)
        return self.output_layer(hidden)

In [39]:
class GRU_archi(nn.Module):
    """
    GRU counterpart of the stacked recurrent model: dense projection, three
    single-layer bidirectional GRUs of shrinking width, two-layer linear head.

    Args:
        input_dim (int): Number of expected features per time step.
        lstm_dim (int): Base recurrent width (stages use lstm_dim // 2, // 4, // 8).
        dense_dim (int): Width of the dense projection fed to the first GRU.
        logit_dim (int): Width of the intermediate layer of the head.
        num_classes (int, optional): Outputs per time step. Defaults to 1.
    """

    def __init__(self, input_dim, lstm_dim, dense_dim, logit_dim, num_classes=1):
        super().__init__()

        # Options shared by the three recurrent stages
        gru_options = dict(num_layers=1, bidirectional=True, batch_first=True)

        self.classic_layer = nn.Sequential(
            nn.Linear(input_dim, dense_dim),
            act_function,
        )

        self.GRU_layer_1 = nn.GRU(input_size=dense_dim, hidden_size=lstm_dim // 2, **gru_options)
        self.GRU_layer_2 = nn.GRU(input_size=lstm_dim, hidden_size=lstm_dim // 4, **gru_options)
        self.GRU_layer_3 = nn.GRU(input_size=lstm_dim // 2, hidden_size=lstm_dim // 8, **gru_options)

        head_in = lstm_dim // 4
        self.output_layer = nn.Sequential(
            nn.Linear(head_in, logit_dim),
            nn.Linear(logit_dim, num_classes),
        )

        # Dropout module (p=0.1); kept as an attribute, not applied in forward()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        """Map a (batch, seq, input_dim) tensor to (batch, seq, num_classes)."""
        features = self.classic_layer(x)
        stage_1, _ = self.GRU_layer_1(features)
        stage_2, _ = self.GRU_layer_2(stage_1)
        stage_3, _ = self.GRU_layer_3(stage_2)
        return self.output_layer(stage_3)

# Training

## utils

In [40]:
def seed_everything(seed):
    """
    Make runs repeatable by seeding every random number generator in use.

    Seeds Python's `random`, NumPy and PyTorch (CPU and CUDA), exports
    PYTHONHASHSEED, and switches cuDNN to deterministic, non-benchmark mode.

    Args:
        seed (int): Value used for every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    
    
def worker_init_fn(worker_id):
    """
    Handles PyTorch x Numpy seeding issues.

    Re-seeds NumPy's global generator with the first word of its current
    state plus the worker id, so each worker id gets a different seed.

    Args:
        worker_id (int): Id of the worker.
    """
    base_seed = int(np.random.get_state()[1][0])
    # np.random.seed only accepts values in [0, 2**32 - 1]; wrap the sum so a
    # large state word plus the worker id can never fall outside that range.
    np.random.seed((base_seed + worker_id) % 2**32)
    

def save_model_weights(model, filename, verbose=1, cp_folder=""):
    """
    Write a model's state dict to `cp_folder/filename`.

    Args:
        model (torch model): Model whose weights are saved.
        filename (str): Name of the checkpoint file.
        verbose (int, optional): Print the destination when truthy. Defaults to 1.
        cp_folder (str, optional): Destination folder. Defaults to "".
    """
    destination = os.path.join(cp_folder, filename)

    if verbose:
        print(f"\n -> Saving weights to {destination}\n")

    torch.save(model.state_dict(), destination)

## metric

The competition will be scored as the mean absolute error between the predicted and actual pressures during the inspiratory phase of each breath. The expiratory phase is not scored.

In [41]:
def compute_metric(df, preds):
    """
    Mean absolute error restricted to the inspiratory phase (u_out == 0).

    Args:
        df (pandas dataframe): One row per breath; `pressure` and `u_out` hold
            per-breath lists of values.
        preds (np array): Predictions with the same layout as the pressures.

    Returns:
        float: Sum of absolute errors over inspiratory steps divided by the
        number of inspiratory steps.
    """
    targets = np.array(df['pressure'].values.tolist())

    # 1 on inspiratory steps, 0 on expiratory ones
    inspiratory = 1 - np.array(df['u_out'].values.tolist())

    masked_error = inspiratory * np.abs(targets - preds)
    return masked_error.sum() / inspiratory.sum()


# Custom loss
class VentilatorLoss(nn.Module):
    """
    Directly optimizes the competition metric: mean absolute error over the
    steps where u_out == 0, computed separately for each sequence.

    Implemented through `forward` (not `__call__`) so the standard
    `nn.Module` call machinery, including hooks, keeps working.
    """
    def forward(self, preds, y, u_out):
        """
        Args:
            preds (torch tensor): Predictions, reduced over the last dimension.
            y (torch tensor): Targets, same shape as `preds`.
            u_out (torch tensor): 0/1 flags, same shape; steps with 1 are ignored.

        Returns:
            torch tensor: Masked MAE per sequence (last dimension reduced).
        """
        mask = 1 - u_out
        mae = mask * (y - preds).abs()
        mae = mae.sum(-1) / mask.sum(-1)

        return mae

## fit

In [42]:
import gc
import time
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup

In [43]:
def fit(
    model,
    train_dataset,
    val_dataset,
    loss_name="L1Loss",
    optimizer="Adam",
    epochs=56,
    batch_size=32,
    val_bs=32,
    warmup_prop=0.1,
    lr=1e-3,
    num_classes=1,
    verbose=1,
    first_epoch_eval=0,
    device="cuda"
):
    """
    Train `model` and evaluate it on the validation set after every epoch.

    Batches are dicts with the keys 'input', 'p' (target) and 'u_out'.
    The loss is always `VentilatorLoss`.

    Args:
        model (torch model): Model to train; expected to already be on `device`.
        train_dataset (torch dataset): Training dataset.
        val_dataset (torch dataset): Validation dataset; must expose `df_grouped`,
            which is passed to `compute_metric`.
        loss_name (str, optional): Accepted but not used. Defaults to "L1Loss".
        optimizer (str, optional): Name of a `torch.optim` class. Defaults to "Adam".
        epochs (int, optional): Number of epochs. Defaults to 56.
        batch_size (int, optional): Training batch size. Defaults to 32.
        val_bs (int, optional): Validation batch size. Defaults to 32.
        warmup_prop (float, optional): Share of all steps used for warmup. Defaults to 0.1.
        lr (float, optional): Peak learning rate. Defaults to 1e-3.
        num_classes (int, optional): Accepted but not used. Defaults to 1.
        verbose (int, optional): Log every `verbose` epochs; 0 disables logging.
            Defaults to 1.
        first_epoch_eval (int, optional): First epoch from which validation
            figures are printed. Defaults to 0.
        device (str, optional): Device for torch. Defaults to "cuda".

    Returns:
        np array: Validation predictions of the last epoch, or None when
        `epochs` is 0.
    """
    # Optimizer
    optimizer = getattr(torch.optim, optimizer)(model.parameters(), lr=lr)
    
    # Data loaders
    train_loader = DataLoader(
        train_dataset,
        # how many samples per batch to load
        batch_size=batch_size,
        # to have the data reshuffled at every epoch
        shuffle=True,
        # drop the last incomplete batch
        drop_last=True,
        num_workers=4,
        # the data loader will copy Tensors into CUDA pinned memory before returning them.
        pin_memory=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=val_bs,
        # no shuffling: validation always runs on the same batches
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )

    # Loss
    loss_fct = VentilatorLoss()

    # Scheduler
    num_warmup_steps = int(warmup_prop * epochs * len(train_loader))
    num_training_steps = int(epochs * len(train_loader))
    # Create a schedule with a varying learning rate
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps, 
        num_training_steps
    )

    # Pre-bind the names used after the loops so the cleanup and the return
    # below stay valid even if a loop body never runs (epochs == 0, empty loader)
    preds = None
    loss = data = pred = None

    for epoch in range(epochs):
        model.train()
        # Sets the gradients of all optimized Tensors to zero.
        model.zero_grad()
        start_time = time.time()

        avg_loss = 0

        for data in train_loader:
            pred = model(data['input'].to(device)).squeeze(-1)

            loss = loss_fct(
                pred,
                data['p'].to(device),
                data['u_out'].to(device),
            ).mean()
            # Computes the gradient of current tensor w.r.t. (with respect to) graph leaves.
            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            optimizer.step()
            scheduler.step()

            # Reset gradients by dropping them instead of zero-filling
            for param in model.parameters():
                param.grad = None

        model.eval()
        mae, avg_val_loss = 0, 0
        preds = []

        # Context-manager that disable gradient calculation. To use when dealing with validation datasets
        with torch.no_grad():
            for data in val_loader:
                pred = model(data['input'].to(device)).squeeze(-1)

                loss = loss_fct(
                    pred.detach(), 
                    data['p'].to(device),
                    data['u_out'].to(device),
                ).mean()
                avg_val_loss += loss.item() / len(val_loader)

                preds.append(pred.detach().cpu().numpy())
        
        preds = np.concatenate(preds, 0)
        mae = compute_metric(val_dataset.df_grouped, preds)

        elapsed_time = time.time() - start_time
        # `verbose and ...` guards the modulo: verbose=0 means "no logging"
        # instead of raising ZeroDivisionError
        if verbose and (epoch + 1) % verbose == 0:
            elapsed_time = elapsed_time * verbose
            current_lr = scheduler.get_last_lr()[0]
            print(
                f"Epoch {epoch + 1:02d}/{epochs:02d} \t lr={current_lr:.1e}\t t={elapsed_time:.0f}s \t"
                f"loss={avg_loss:.3f}",
                end="\t",
            )

            if (epoch + 1 >= first_epoch_eval) or (epoch + 1 == epochs):
                print(f"val_loss={avg_val_loss:.3f}\tmae={mae:.3f}")
            else:
                print("")

    # Drop the references to loaders and last-batch tensors before freeing cached GPU memory
    del (val_loader, train_loader, loss, data, pred)
    gc.collect()
    torch.cuda.empty_cache()

    return preds

## predict

In [44]:
def predict(
    model,
    dataset,
    batch_size = 64,
    device = "cuda",
    num_workers = 4,
):
    """
    Runs inference over a dataset and collects the model outputs.

    The model is switched to eval mode. It is NOT moved to `device` here: only the
    input batches are, so the caller must already have placed the model there.

    Args:
        model (torch model): Model to predict with.
        dataset (torch Dataset): Dataset to predict on. Its collated batches are
            indexed with the key 'input' to get the model input.
        batch_size (int, optional): Batch size. Defaults to 64.
        device (str, optional): Device the input batches are sent to. Defaults to "cuda".
        num_workers (int, optional): Number of DataLoader worker processes.
            Defaults to 4. Use 0 to load the data in the main process.

    Returns:
        numpy array: Model outputs of all batches concatenated along axis 0, in
        dataset order. The last dimension is squeezed out when it has size 1.

    Raises:
        ValueError: If the loader yields no batch (np.concatenate gets an empty list).
    """

    model.eval()

    loader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = False,  # keep the dataset order so outputs line up with the samples
        num_workers = num_workers,
    )

    preds = []
    # Inference only: gradient tracking is disabled for the whole loop.
    with torch.no_grad():
        for data in loader:
            pred = model(data['input'].to(device)).squeeze(-1)
            # Bring each batch back to host memory so nothing accumulates on `device`.
            preds.append(pred.detach().cpu().numpy())

    return np.concatenate(preds, axis = 0)

# Train

In [45]:
def train(config, df_train, df_val, df_test, fold):
    """
    Trains one fold and predicts on the test data with the trained model.

    Args:
        config (Config): Parameters.
        df_train (pandas dataframe): Training data of the fold.
        df_val (pandas dataframe): Validation data of the fold.
        df_test (pandas dataframe): Test data.
        fold (int): Fold index, only used to name the saved weights file.

    Returns:
        tuple: (pred_val, pred_test), i.e. what `fit` returned for the validation data
        and what `predict` returned for the test data.
    """

    # Seed before anything else is built
    seed_everything(config.seed)

    # Model architecture, sized from the config and placed on the configured device
    arch_kwargs = dict(
        input_dim = config.input_dim,
        lstm_dim = config.lstm_dim,
        dense_dim = config.dense_dim,
        logit_dim = config.logit_dim,
        num_classes = config.num_classes,
    )
    net = LSTM_archi_2(**arch_kwargs).to(config.device)
    net.zero_grad()

    # One dataset wrapper per frame, built in train / val / test order
    ds_train, ds_val, ds_test = (
        VPPDataLoader(frame) for frame in (df_train, df_val, df_test)
    )

    print(f"    -> {len(ds_train)} training breathes")
    print(f"    -> {len(ds_val)} validation breathes")

    # Training settings forwarded to fit
    fit_options = dict(
        loss_name = config.loss,
        optimizer = config.optimizer,
        epochs = config.epochs,
        batch_size = config.batch_size,
        val_bs = config.val_bs,
        lr = config.lr,
        warmup_prop = config.warmup_prop,
        verbose = config.verbose,
        first_epoch_eval = config.first_epoch_eval,
        device = config.device,
    )
    pred_val = fit(net, ds_train, ds_val, **fit_options)

    # Test inference reuses the validation batch size
    pred_test = predict(net, ds_test, batch_size = config.val_bs, device = config.device)

    if config.save_weights:
        weights_name = f"{config.selected_model}_{fold}.pt"
        save_model_weights(net, weights_name, cp_folder = "")

    # Drop the references held here, then release cached memory before the next fold
    del net, ds_train, ds_val, ds_test
    gc.collect()
    torch.cuda.empty_cache()

    return pred_val, pred_test

## kfold

In [46]:
def k_fold(config, df, df_test):
    """
    Performs a k-fold cross validation grouped by breath_id.

    Rows sharing a breath_id always end up in the same fold. Only the folds listed in
    config.selected_folds are trained.

    Args:
        config (Config): Parameters. k and selected_folds are read here, the whole
            object is forwarded to train.
        df (pandas dataframe): Training data. Must contain a "breath_id" column.
        df_test (pandas dataframe): Test data, forwarded unchanged to train for each fold.

    Returns:
        tuple: (pred_oof, pred_test). pred_oof is a 1-D array of length len(df) with the
        out-of-fold predictions (rows of folds that were not run stay at 0). pred_test is
        the element-wise mean of the flattened test predictions of the folds that were run.

    Raises:
        ValueError: If config.selected_folds selects none of the k folds.
    """

    # Without at least one fold there is nothing to average: fail early instead of
    # returning NaN test predictions.
    if not any(i in config.selected_folds for i in range(config.k)):
        raise ValueError(
            f"config.selected_folds={config.selected_folds} selects none of the {config.k} folds"
        )

    pred_oof = np.zeros(len(df))
    # Marks the rows that actually received an out-of-fold prediction
    has_pred = np.zeros(len(df), dtype = bool)
    preds_test = []

    gkf = GroupKFold(n_splits = config.k)
    # GroupKFold does not use y, so only X and the groups are passed
    splits = list(gkf.split(X = df, groups = df["breath_id"]))

    for i, (train_idx, val_idx) in enumerate(splits):
        if i not in config.selected_folds:
            continue

        print(f"\n-------------   Fold {i + 1} / {config.k}  -------------\n")

        df_train = df.iloc[train_idx].copy().reset_index(drop = True)
        df_val = df.iloc[val_idx].copy().reset_index(drop = True)

        pred_val, pred_test = train(config, df_train, df_val, df_test, i)

        pred_oof[val_idx] = pred_val.flatten()
        has_pred[val_idx] = True
        preds_test.append(pred_test.flatten())

    if has_pred.all():
        cv_mae = compute_metric(df, pred_oof)
    else:
        # Only some folds were run: score the validated rows only, otherwise the zero
        # placeholders of the skipped folds would be counted as predictions.
        # NOTE(review): assumes compute_metric accepts a row subset of df with aligned
        # predictions - confirm against its definition.
        scored = np.flatnonzero(has_pred)
        cv_mae = compute_metric(df.iloc[scored].reset_index(drop = True), pred_oof[scored])

    print(f'\n -> CV MAE : {cv_mae:.3f}')

    return pred_oof, np.mean(preds_test, 0)

# Main

In [47]:
class Config:
    """
    Settings consumed by k_fold and train.
    """
    # --- General ---
    seed = 42
    verbose = 1  # epoch logs are printed every `verbose` epochs
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    save_weights = True  # write the model weights of each fold to disk

    # --- k-fold ---
    k = 7
    selected_folds = list(range(k))  # 0-based indices of the folds to run (here: all of them)

    # --- Model ---
    selected_model = 'LSTM'  # prefix of the saved weights file name
    input_dim = 67

    dense_dim = 512
    lstm_dim = 512
    logit_dim = 512
    num_classes = 1

    # --- Training ---
    loss = "L1Loss"  # not used
    optimizer = "Adam"
    batch_size = 256
    epochs = 100

    lr = 1e-3
    warmup_prop = 0

    val_bs = 256  # batch size for validation and test inference
    first_epoch_eval = 0  # validation metrics are printed from this epoch on

In [48]:
# Cross-validate on the training frame: out-of-fold predictions plus fold-averaged test predictions
pred_oof, pred_test = k_fold(config = Config, df = train_df, df_test = test_df)


-------------   Fold 1 / 7  -------------

    -> 64671 training breathes
    -> 10779 validation breathes
Epoch 01/100 	 lr=9.9e-04	 t=22s 	loss=3.375	val_loss=1.465	mae=1.475
Epoch 02/100 	 lr=9.8e-04	 t=22s 	loss=1.288	val_loss=1.048	mae=1.051
Epoch 03/100 	 lr=9.7e-04	 t=22s 	loss=1.069	val_loss=0.891	mae=0.894
Epoch 04/100 	 lr=9.6e-04	 t=22s 	loss=0.851	val_loss=1.372	mae=1.373
Epoch 05/100 	 lr=9.5e-04	 t=22s 	loss=0.853	val_loss=0.749	mae=0.750
Epoch 06/100 	 lr=9.4e-04	 t=22s 	loss=0.748	val_loss=0.776	mae=0.777
Epoch 07/100 	 lr=9.3e-04	 t=22s 	loss=0.692	val_loss=0.715	mae=0.717
Epoch 08/100 	 lr=9.2e-04	 t=22s 	loss=0.675	val_loss=0.830	mae=0.832
Epoch 09/100 	 lr=9.1e-04	 t=22s 	loss=0.644	val_loss=0.666	mae=0.670
Epoch 10/100 	 lr=9.0e-04	 t=22s 	loss=0.609	val_loss=0.595	mae=0.595
Epoch 11/100 	 lr=8.9e-04	 t=22s 	loss=0.610	val_loss=0.623	mae=0.625
Epoch 12/100 	 lr=8.8e-04	 t=22s 	loss=0.566	val_loss=0.664	mae=0.666
Epoch 13/100 	 lr=8.7e-04	 t=22s 	loss=0.555	val_los

In [49]:
# Put the test predictions into the sample submission's pressure column and save the file
sub = pd.read_csv('./vpp_data/sample_submission.csv').assign(pressure = pred_test)
sub.to_csv('./vpp_data/submission.csv', index = False)