# PyTorch Solution: Rolling Window
Here we introduce a pytorch solution which takes an window of past and future Acc readings to predict the outcomes. The different segments of the notebook can be modified and improved as per your liking to better the whole pipeline. As a way, this works as a good starter baseline!

**Please leave an upvote if you found this notebook helpful!**

In [None]:
import os
import gc
import random
import time

import json
from tqdm import tqdm
import glob
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, average_precision_score

import warnings
warnings.filterwarnings(action='ignore')

In [6]:
class Config:
    train_dir1 = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog"
    train_dir2 = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog"

    batch_size = 2048
    window_size = 256
    window_future = 32
    window_past = window_size - window_future
    
    model_dropout = 0.3
    model_hidden = 768
    model_nblocks = 2
    
    lr = 0.001
    num_epochs = 8
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    feature_list = ['AccV', 'AccML', 'AccAP']
    label_list = ['StartHesitation', 'Turn', 'Walking']
    
    
cfg = Config()

In [4]:
cfg.device

'cuda'

# Dataset

We use a window comprised of past and future time Acc readings to form our dataset for a particular time instance. In case some portion of the window data is not available, we pad them with zeros.

In [5]:
class FOGDataset(Dataset):
    def __init__(self, fpaths, scale=9.806, test=False):
        super(FOGDataset, self).__init__()
        tm = time.time()
        self.test = test
        self.fpaths = fpaths
        self.f_ids = [os.path.basename(f)[:-4] for f in self.fpaths]
        self.curr_df_idx = 0
        self.curr_row_idx = 0
        self.dfs = [np.array(pd.read_csv(f)) for f in fpaths]
        self.end_indices = []
        self.scale = scale
        
        self.length = 0
        for df in self.dfs:
            self.length += df.shape[0]
            self.end_indices.append(self.length)
            
        print(f"Dataset initialized in {time.time() - tm} secs!")
        
    def pad(self, df, time_start):
        if df.shape[0] == cfg.window_size:
            return df
        
        npad = cfg.window_size - df.shape[0]
        padzeros = np.zeros((npad, 3))
        if time_start <= 0:
            df = np.concatenate((padzeros, df), axis=0)
        else:
            df = np.concatenate((df, padzeros), axis=0)
        return df
            
    def __getitem__(self, index):
        for i,e in enumerate(self.end_indices):
            if index >= e:
                continue
            df_idx = i
            break
            
        curr_df = self.dfs[i]
        row_idx = curr_df.shape[0] - (self.end_indices[i] - index)
        _id = self.f_ids[i] + "_" + str(row_idx)
        
        x = self.pad(curr_df[row_idx-cfg.window_past:row_idx+cfg.window_future, 1:4], row_idx-cfg.window_past )
        x = torch.tensor(x)/self.scale
        
        if self.test == True:
            return _id, x
        
        y = curr_df[row_idx, -3:].astype('float')
        y = torch.tensor(y)
        
        return x, y
    
    def __len__(self):
        return self.length

# Stratified Group K Fold

It's mentioned in the data that the subjects are different in the train and test set and even different between the public/private splits of the test data. So we need to use Stratified Group K Fold. But since the positive instances in the sequences are very scarce, we need to pick up the best fold which will give us the best balance of the positive/negative instances. For this notebook, we use only the tdcsfog dataset.

In [9]:
# Analysis of positive instances in each fold of our CV folds

n1 = []
n2 = []
n3 = []

# Here I am using the metadata file available during training. Since the code will run again during submission, if 
# I used the usual file from the competition folder, it would have been updated with the test files too.
metadata = pd.read_csv("/kaggle/input/copy-train-metadata/tdcsfog_metadata.csv")

for f in tqdm(metadata['Id']):
    fpath = f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/{f}.csv"
    df = pd.read_csv(fpath)
    
    n1.append(np.sum(df['StartHesitation']))
    n2.append(np.sum(df['Turn']))
    n3.append(np.sum(df['Walking']))
    
print(f"32 files have positive values in all 3 classes")

metadata['n1'] = n1
metadata['n2'] = n2
metadata['n3'] = n3

sgkf = StratifiedGroupKFold(n_splits=5, random_state=42, shuffle=True)
for i, (train_index, valid_index) in enumerate(sgkf.split(X=metadata['Id'], y=[1]*len(metadata), groups=metadata['Subject'])):
    print(f"Fold = {i}")
    train_ids = metadata.loc[train_index, 'Id']
    valid_ids = metadata.loc[valid_index, 'Id']
    
    print(f"Length of Train = {len(train_ids)}, Length of Valid = {len(valid_ids)}")
    n1_sum = metadata.loc[train_index, 'n1'].sum()
    n2_sum = metadata.loc[train_index, 'n2'].sum()
    n3_sum = metadata.loc[train_index, 'n3'].sum()
    print(f"Train classes: {n1_sum:,}, {n2_sum:,}, {n3_sum:,}")
    
    n1_sum = metadata.loc[valid_index, 'n1'].sum()
    n2_sum = metadata.loc[valid_index, 'n2'].sum()
    n3_sum = metadata.loc[valid_index, 'n3'].sum()
    print(f"Valid classes: {n1_sum:,}, {n2_sum:,}, {n3_sum:,}")
    
# # FOLD 2 is the most well balanced

100%|██████████| 833/833 [00:23<00:00, 36.17it/s]

32 files have positive values in all 3 classes
Fold = 0
Length of Train = 672, Length of Valid = 161
Train classes: 287,832, 1,462,652, 175,633
Valid classes: 16,958, 216,130, 32,205
Fold = 1
Length of Train = 613, Length of Valid = 220
Train classes: 51,748, 909,505, 65,242
Valid classes: 253,042, 769,277, 142,596
Fold = 2
Length of Train = 703, Length of Valid = 130
Train classes: 271,881, 1,332,746, 183,673
Valid classes: 32,909, 346,036, 24,165
Fold = 3
Length of Train = 649, Length of Valid = 184
Train classes: 303,710, 1,517,147, 205,196
Valid classes: 1,080, 161,635, 2,642
Fold = 4
Length of Train = 695, Length of Valid = 138
Train classes: 303,989, 1,493,078, 201,608
Valid classes: 801, 185,704, 6,230





In [10]:
# The actual train-test split (based on Fold 2)

metadata = pd.read_csv("/kaggle/input/copy-train-metadata/tdcsfog_metadata.csv")
sgkf = StratifiedGroupKFold(n_splits=5, random_state=42, shuffle=True)
for i, (train_index, valid_index) in enumerate(sgkf.split(X=metadata['Id'], y=[1]*len(metadata), groups=metadata['Subject'])):
    if i != 2:
        continue
    print(f"Fold = {i}")
    train_ids = metadata.loc[train_index, 'Id']
    valid_ids = metadata.loc[valid_index, 'Id']
    print(f"Length of Train = {len(train_ids)}, Length of Valid = {len(valid_ids)}")
    
    if i == 2:
        break
        
train_fpaths = [f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/{_id}.csv" for _id in train_ids]
valid_fpaths = [f"/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/{_id}.csv" for _id in valid_ids]

Fold = 2
Length of Train = 703, Length of Valid = 130


# Model

In [11]:
def _block(in_features, out_features, drop_rate):
    return nn.Sequential(
        nn.Linear(in_features, out_features),
        nn.BatchNorm1d(out_features),
        nn.ReLU(),
        nn.Dropout(drop_rate)
    )

class FOGModel(nn.Module):
    def __init__(self, p=cfg.model_dropout, dim=cfg.model_hidden, nblocks=cfg.model_nblocks):
        super(FOGModel, self).__init__()
        self.dropout = nn.Dropout(p)
        self.in_layer = nn.Linear(cfg.window_size*3, dim)
        self.blocks = nn.Sequential(*[_block(dim, dim, p) for _ in range(nblocks)])
        self.out_layer = nn.Linear(dim, 3)
        
    def forward(self, x):
        x = x.view(-1, cfg.window_size*3)
        x = self.in_layer(x)
        for block in self.blocks:
            x = block(x)
        x = self.out_layer(x)
        return x

In [12]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Training

In [13]:
def train_one_epoch(model, loader, optimizer, criterion):
    loss_sum = 0.
    
    model.train()
    for x,y in tqdm(loader):
        x = x.to(cfg.device).float()
        y = y.to(cfg.device).float()
        
        y_pred = model(x)
        loss = criterion(y_pred, y)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        loss_sum += loss.item()
    
    print(f"Train Loss: {(loss_sum/len(loader)):.04f}")
    

def validation_one_epoch(model, loader, criterion):
    loss_sum = 0.
    y_true_epoch = []
    y_pred_epoch = []
    
    model.eval()
    for x,y in tqdm(loader):
        x = x.to(cfg.device).float()
        y = y.to(cfg.device).float()
        
        with torch.no_grad():
            y_pred = model(x)
            loss = criterion(y_pred, y)
        
        loss_sum += loss.item()
        y_true_epoch.append(y.cpu().numpy())
        y_pred_epoch.append(y_pred.cpu().numpy())
        
    y_true_epoch = np.concatenate(y_true_epoch, axis=0)
    y_pred_epoch = np.concatenate(y_pred_epoch, axis=0)
    
    scores = [average_precision_score(y_true_epoch[:,i], np.round(y_pred_epoch[:,i],3)) for i in range(3)]
    mean_score = np.mean(scores)
    print(f"Validation Loss: {(loss_sum/len(loader)):.04f}, Validation Score: {mean_score:.03f}, ClassWise: {scores[0]:.03f},{scores[1]:.03f},{scores[2]:.03f}")
    
    return mean_score
        
def train():
    model = FOGModel().to(cfg.device)
    print(f"Number of parameters in model - {count_parameters(model):,}")
    
    train_dataset = FOGDataset(train_fpaths)
    valid_dataset = FOGDataset(valid_fpaths)
    print(f"lengths of datasets: train - {len(train_dataset)}, valid - {len(valid_dataset)}")
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, num_workers=4, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size, num_workers=4)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
    criterion = torch.nn.BCEWithLogitsLoss().to(cfg.device)
    
    max_score = 0.0
    
    print("="*50)
    for epoch in range(cfg.num_epochs):
        print(f"Epoch: {epoch}")
        train_one_epoch(model, train_loader, optimizer, criterion)
        score = validation_one_epoch(model, valid_loader, criterion)
        
        if score > max_score:
            max_score = score
            torch.save(model.state_dict(), "best_model_state.h5")
            print("Saving Model ...")
        
        print("="*50)
        
    return model

In [14]:
model = train()

Number of parameters in model - 1,777,155
Dataset initialized in 5.4231555461883545 secs!
Dataset initialized in 0.9551897048950195 secs!
lengths of datasets: train - 5963939, valid - 1098733
Epoch: 0


100%|██████████| 2913/2913 [05:41<00:00,  8.54it/s]


Train Loss: 0.1609


100%|██████████| 537/537 [00:43<00:00, 12.37it/s]


Validation Loss: 0.2122, Validation Score: 0.300, ClassWise: 0.052,0.708,0.141
Saving Model ...


# Submission

In [16]:
model = FOGModel().cuda()
model.load_state_dict(torch.load("/kaggle/working/best_model_state.h5"))
model.eval()

test_defog_paths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/*.csv")
test_tdcsfog_paths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/*.csv")

test_dataset = FOGDataset(test_defog_paths, test=True)
test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, num_workers=0)

ids = []
preds = []

for _id, x in tqdm(test_loader):
    x = x.to(cfg.device).float()
    with torch.no_grad():
        y_pred = torch.clip(model(x)*0.02+1, 0.0, 1.0)
    
    ids.extend(_id)
    preds.extend(list(np.nan_to_num(y_pred.cpu().numpy())))
    

    
test_dataset = FOGDataset(test_tdcsfog_paths, test=True)
test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, num_workers=0)
    
for _id, x in tqdm(test_loader):
    x = x.to(cfg.device).float()
    with torch.no_grad():
        y_pred = torch.clip(model(x)*0.02+1, 0.0, 1.0)
    
    ids.extend(_id)
    preds.extend(list(np.nan_to_num(y_pred.cpu().numpy())))

Dataset initialized in 0.39499998092651367 secs!


100%|██████████| 138/138 [00:05<00:00, 23.23it/s]


Dataset initialized in 0.01528620719909668 secs!


100%|██████████| 3/3 [00:00<00:00, 28.72it/s]


In [18]:
sample_submission = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/sample_submission.csv")
sample_submission.shape

(286370, 4)

In [19]:
preds = np.array(preds)
submission = pd.DataFrame({'Id': ids, 'StartHesitation': np.round(preds[:,0],3), \
                           'Turn': np.round(preds[:,1],3), 'Walking': np.round(preds[:,2],3)})

submission = pd.merge(sample_submission[['Id']], submission, how='left', on='Id').fillna(0.0)
submission.to_csv("submission.csv", index=False)

In [20]:
print(submission.shape)
submission.head()

(286370, 4)


Unnamed: 0,Id,StartHesitation,Turn,Walking
0,003f117e14_0,0.751,0.693,0.71
1,003f117e14_1,0.751,0.693,0.71
2,003f117e14_2,0.751,0.693,0.71
3,003f117e14_3,0.751,0.693,0.71
4,003f117e14_4,0.751,0.693,0.71
