### Imports, installs and downloads

In [1]:
!pip install wandb -Uq
!pip install torchmetrics

import torch
import torchmetrics
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

import os
import numpy as np
import pandas as pd
import wandb
from tqdm.notebook import tqdm

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, classification_report, f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE 

!gdown 10uKko-btA83zgMaUcsz9Cf503S0mpmGh -O /content/train_dataset_train.csv
!gdown 1dN6OKDuYg70V7l9qrjL7CCxQN4W8ROyh -O /content/test_dataset_test.csv

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('DEVICE:', device)

[K     |████████████████████████████████| 1.9 MB 9.1 MB/s 
[K     |████████████████████████████████| 182 kB 53.4 MB/s 
[K     |████████████████████████████████| 162 kB 36.4 MB/s 
[K     |████████████████████████████████| 63 kB 1.4 MB/s 
[K     |████████████████████████████████| 162 kB 26.0 MB/s 
[K     |████████████████████████████████| 158 kB 43.7 MB/s 
[K     |████████████████████████████████| 157 kB 12.1 MB/s 
[K     |████████████████████████████████| 157 kB 13.0 MB/s 
[K     |████████████████████████████████| 157 kB 45.2 MB/s 
[K     |████████████████████████████████| 157 kB 49.3 MB/s 
[K     |████████████████████████████████| 157 kB 49.5 MB/s 
[K     |████████████████████████████████| 157 kB 19.9 MB/s 
[K     |████████████████████████████████| 157 kB 46.8 MB/s 
[K     |████████████████████████████████| 156 kB 45.6 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-whe

### dataset splitting

In [2]:
# Preprocessors. Every stochastic step below is seeded with 44 so the cell is reproducible.
scaler = preprocessing.StandardScaler()
rus = RandomUnderSampler(sampling_strategy={0: 24000,
                                            1: 24000,
                                            3: 24000,
                                            4: 24000,
                                            5: 24000},
                         random_state=44)
# Alternative over-sampler; it is defined here but not applied in this cell.
adasyn = ADASYN(sampling_strategy='not majority')

# Column names assigned to the scaled feature matrices.
FEATURE_COLUMNS = ['Easting', 'Northing', 'Height', 'Reflectance']

# Raw class ids -> contiguous indices 0..5 (the network has 6 outputs).
# `predicter` applies the inverse mapping before writing the submission.
CLASS_TO_INDEX = {0: 0, 1: 1, 3: 2, 4: 3, 5: 4, 64: 5}


def encode_labels(labels):
    """Map raw class ids in a NumPy array to contiguous indices via CLASS_TO_INDEX.

    Masks are computed against the untouched input, so the result does not
    depend on the order of the mapping. Values missing from the mapping are
    left unchanged.
    """
    encoded = labels.copy()
    for raw_class, class_index in CLASS_TO_INDEX.items():
        encoded[labels == raw_class] = class_index
    return encoded


df = pd.read_csv('/content/train_dataset_train.csv')

df_predict = pd.read_csv('/content/test_dataset_test.csv')

# get features and targets
X = df.drop(["Class", "id"], axis=1)
y = df[["Class"]]

X_predict = df_predict.drop(["id"], axis=1)
# NOTE: `id` shadows the builtin; the name is kept because a later cell reads it.
id = df_predict["id"]

# train <-> test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=44)

# Scale data. The scaler is fitted on the training split ONLY and then reused,
# so the test and prediction sets are standardized with the same mean/std the
# model is trained on. Re-fitting per split would leak held-out statistics and
# shift the feature distributions relative to training.
# Plain arrays are passed so scaling stays positional, independent of CSV column names.
X_train = pd.DataFrame(scaler.fit_transform(X_train.to_numpy()), columns=FEATURE_COLUMNS)
X_test = pd.DataFrame(scaler.transform(X_test.to_numpy()), columns=FEATURE_COLUMNS)

X_predict = pd.DataFrame(scaler.transform(X_predict.to_numpy()), columns=FEATURE_COLUMNS)

# resample data
print('before resampling:', '\n', y_train.value_counts())
X_train, y_train = rus.fit_resample(X_train, y_train)
print('after under sampling:', '\n', y_train.value_counts())
# `sampling_strategy` must be passed by keyword on recent imbalanced-learn
# releases; `random_state` makes the synthetic samples reproducible.
X_train, y_train = SMOTE(sampling_strategy={64: 8000}, random_state=44).fit_resample(X_train, y_train)
print('after over sampling:', '\n', y_train.value_counts())

# to numpy
X_train = X_train.to_numpy()
y_train = y_train.to_numpy().reshape(-1)
X_test = X_test.to_numpy()
y_test = y_test.to_numpy().reshape(-1)

X_predict = X_predict.to_numpy()
id = id.to_numpy().reshape(-1)

# train <-> valid split
# NOTE(review): the validation split is carved out after resampling, so it
# follows the resampled class balance rather than the original one.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=44)
print('train:', y_train.shape, '; valid:', y_valid.shape, '; test:', y_test.shape)

# reset Class id
y_train = encode_labels(y_train)
y_valid = encode_labels(y_valid)
y_test = encode_labels(y_test)

before resampling: 
 Class
0        2457843
3        1159329
4          93106
5          50306
1          35332
64          3968
dtype: int64
after under sampling: 
 Class
0        24000
1        24000
3        24000
4        24000
5        24000
64        3968
dtype: int64
after over sampling: 
 Class
0        24000
1        24000
3        24000
4        24000
5        24000
64        8000
dtype: int64
train: (115200,) ; valid: (12800,) ; test: (422210,)




### to tensor

In [3]:
class RailsDataset(Dataset):
    """Map-style dataset pairing a float feature tensor with a long target tensor.

    The raw inputs are kept on ``X`` / ``y``; their tensor copies live on
    ``features`` / ``targets`` and are what indexing returns.
    """

    def __init__(self, X, y):
        # Keep references to the inputs exactly as they were passed in.
        self.X, self.y = X, y

        # torch.Tensor yields the default float dtype, LongTensor yields int64.
        self.features = torch.Tensor(X)
        self.targets = torch.LongTensor(y)

    def __len__(self):
        # Number of samples == size of the first feature dimension.
        return len(self.features)

    def __getitem__(self, index):
        sample = self.features[index]
        target = self.targets[index]
        return (sample, target)

In [4]:
# Wrap every labelled split in a RailsDataset (features, class indices).
train_dataset, valid_dataset, test_dataset = (
    RailsDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

# The prediction set has no labels: the row ids travel in the target slot
# so they can be matched back to the predictions.
predict_dataset = RailsDataset(X_predict, id)

### functions to train and test

In [24]:
def _evaluate_epoch(model, loader, loss_function):
    """Return the sample-weighted mean loss over `loader` without updating the model.

    The model is put in eval mode and the pass runs under `torch.no_grad()`,
    so no gradients are computed and no optimizer step is taken.
    """
    model.eval()
    loss_sum = 0.0
    samples_seen = 0
    with torch.no_grad():
        for x_batch, y_batch in loader:
            output = model(x_batch.to(device))
            loss = loss_function(output, y_batch.to(device))
            loss_sum += loss.item() * len(x_batch)
            samples_seen += len(x_batch)
    return loss_sum / samples_seen


def trainer(model, train_loader, valid_loader, loss_function, optimizer, scheduler, config):
    """
    Run the full training loop for `config.count_of_epoch` epochs.

    Each epoch trains on `train_loader` via `train_epoch`, measures the loss on
    `valid_loader`, steps the scheduler, logs to wandb and saves a checkpoint
    whenever the validation loss improves.

    model - neural network to train
    train_loader - iterable of (features, targets) training batches
    valid_loader - iterable of (features, targets) validation batches
    loss_function - loss applied to (model output, targets)
    optimizer - optimizer that updates the model parameters
    scheduler - learning-rate scheduler; stepped with the epoch's training loss
    config - object exposing `count_of_epoch`
    """
    min_valid_loss = np.inf

    # Model weights are saved in this folder.
    os.makedirs('/content/model_weights', exist_ok=True)

    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    wandb.watch(model, loss_function, log="all", log_freq=10)

    for e in range(config.count_of_epoch):
        # train
        epoch_loss = train_epoch(train_generator=train_loader,
                                 model=model,
                                 loss_function=loss_function,
                                 optimizer=optimizer)

        # valid: evaluation only. Routing this through train_epoch would
        # back-propagate and step the optimizer on the validation data.
        valid_loss = _evaluate_epoch(model, valid_loader, loss_function)

        scheduler.step(epoch_loss)

        # log things; report the best validation loss including this epoch,
        # so the first epoch does not log `inf`.
        trainer_log(epoch_loss, valid_loss, e, optimizer.param_groups[0]['lr'],
                    min(min_valid_loss, valid_loss))

        # saving models
        if min_valid_loss > valid_loss:
            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
            min_valid_loss = valid_loss
            torch.save(model.state_dict(), f'/content/model_weights/saved_model_{e}.pth')
            wandb.log_artifact(f'/content/model_weights/saved_model_{e}.pth',
                               name=f'saved_model_{e}', type='model')
        print()

def train_epoch(train_generator, model, loss_function, optimizer):
    """
    Iterate over every batch of the batch generator and train on each one.

    train_generator - iterable yielding (features, targets) batches
    model - neural network model
    loss_function - loss function
    optimizer - optimizer

    Returns the mean loss over the epoch, weighted by batch size.
    """
    weighted_loss_sum = 0
    samples_seen = 0
    for features, labels in train_generator:
        loss_value = train_on_batch(model, features, labels, optimizer, loss_function)
        n_samples = len(features)

        # Weight by batch size so a shorter final batch does not skew the mean.
        weighted_loss_sum += loss_value * n_samples
        samples_seen += n_samples

    return weighted_loss_sum / samples_seen

def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """
    Perform one optimization step on a single batch.

    model - neural network model
    x_batch - features
    y_batch - targets (class labels)
    optimizer - optimizer
    loss_function - loss function

    Returns the batch loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    inputs = x_batch.to(device)
    labels = y_batch.to(device)

    batch_loss = loss_function(model(inputs), labels)
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

def tester(model, test_loader):
    """Evaluate `model` on `test_loader` and report recall metrics.

    Predictions are the argmax over the last dimension of the model output.
    Macro recall is logged to wandb under "test_recall"; macro recall,
    per-class recall and the full classification report are printed.

    model - neural network to evaluate
    test_loader - iterable yielding (features, targets) batches
    """
    pred = []
    real = []
    model.eval()
    # One no_grad context for the whole pass instead of re-entering it per batch.
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            output = model(x_batch.to(device))
            pred.extend(torch.argmax(output, dim=-1).tolist())
            real.extend(y_batch.tolist())

    # Computed once and reused for both the wandb log and the printout.
    macro_recall = recall_score(real, pred, average='macro')

    wandb.log({"test_recall": macro_recall})
    print('Recall:', macro_recall)
    print('Recall weights:', recall_score(real, pred, average=None))
    print(classification_report(real, pred, zero_division = 0))

def predicter(model, predict_loader):
    """Predict a class for every row of `predict_loader` and write 'prediction.csv'.

    The loader yields (features, ids) batches: the second element carries the
    row ids, not labels. Model outputs are reduced with argmax, mapped from
    contiguous indices back to the original class ids, and saved with the
    columns `id` and `Class`.
    """
    class_indices = []
    ids = []
    model.eval()
    for features, id_batch in predict_loader:
        features = features.to(device)
        with torch.no_grad():
            logits = model(features)

        class_indices.extend(logits.argmax(dim=-1).cpu().numpy().tolist())
        ids.extend(id_batch.cpu().numpy().tolist())

    pred = np.array(class_indices)
    # Undo the label compaction. Highest index first, so a freshly written
    # value is never picked up by a later replacement.
    for class_index, raw_class in ((5, 64), (4, 5), (3, 4), (2, 3)):
        pred = np.where(pred == class_index, raw_class, pred)

    id_column = pd.Series(ids, name='id')
    class_column = pd.Series(pred, name='Class')
    pd.concat([id_column, class_column], axis=1).to_csv('prediction.csv', sep=',', index=False)

def trainer_log(train_loss, valid_loss, epoch, lr, min_val_loss):
    """Send one epoch's metrics to wandb and print the train/valid losses.

    train_loss - training loss for the epoch
    valid_loss - validation loss for the epoch
    epoch - epoch number (printed zero-padded to three digits)
    lr - current learning rate
    min_val_loss - value logged as 'min_validation_loss'
    """
    metrics = {
        'train_loss': train_loss,
        'valid_loss': valid_loss,
        'epoch': epoch,
        'learning_rate': lr,
        'min_validation_loss': min_val_loss,
    }
    wandb.log(metrics)

    print(f'train loss on {str(epoch).zfill(3)} epoch: {train_loss:.6f} with lr: {lr:.10f}')
    print(f'valid loss on {str(epoch).zfill(3)} epoch: {valid_loss:.6f}')

def make_loader(dataset, batch_size, shuffle=False):
    """Build a DataLoader over `dataset` and wrap it in a tqdm progress bar.

    dataset - dataset to batch
    batch_size - number of samples per batch
    shuffle - reshuffle the data every epoch; defaults to False, which keeps
              the previous fixed-order behavior (pass True for training data)

    Returns the tqdm-wrapped loader (iterating it yields the loader's batches).
    """
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=shuffle,
                        pin_memory=True, num_workers=2)
    return tqdm(loader)

### Pipeline

In [23]:
def pipeline(hyperparameters, saved_model=None, to_train=True, to_test=True, to_predict=False):
    """Run the selected stages inside a single wandb run and return the model.

    hyperparameters - dict used as the wandb config; must contain 'project'
    saved_model - forwarded to `build_model`; None builds a freshly initialized model
    to_train - run `trainer`
    to_test - run `tester`
    to_predict - run `predicter`
    """
    with wandb.init(project=hyperparameters['project'], config=hyperparameters) as run:
        config = wandb.config

        # build the model
        model = build_model(run, config, saved_model)

        # make the data and optimization
        (train_loader, valid_loader, test_loader, predict_loader,
         criterion, optimizer, scheduler) = make(model, config)

        print('config:', '\n', config, '\n', model, '\n', 'running on device:', device, '\n')

        # Each stage is optional and runs in a fixed order: train, test, predict.
        if to_train:
            trainer(model, train_loader, valid_loader, criterion, optimizer, scheduler, config)
        if to_test:
            tester(model, test_loader)
        if to_predict:
            predicter(model, predict_loader)

    return model

def build_model(run, config, saved_model=None):
    """Build the MLP classifier and either load saved weights or initialize it.

    run - wandb run, used to fetch the weights artifact
    config - object exposing `dropout` (and `project` when loading weights)
    saved_model - None, or a sequence whose first two items are the checkpoint
                  suffix and the artifact version

    Returns the model moved to `device`.
    """
    # Layer widths from the 4 input features down to the 6 class outputs.
    layer_sizes = (4, 256, 128, 64, 32, 16, 6)
    p = config.dropout

    # Each stage is Linear -> Dropout -> BatchNorm1d, followed by ReLU on all
    # but the last stage. The module order is what fixes the state_dict keys.
    last_stage = len(layer_sizes) - 2
    layers = []
    for stage, (n_in, n_out) in enumerate(zip(layer_sizes, layer_sizes[1:])):
        layers.extend([nn.Linear(n_in, n_out), nn.Dropout(p), nn.BatchNorm1d(n_out)])
        if stage < last_stage:
            layers.append(nn.ReLU())
    model = nn.Sequential(*layers)

    if saved_model is None:
        # Fresh model: Kaiming-normal initialization for every linear layer.
        def init_weights(m):
            if isinstance(m, nn.Linear):
                torch.nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
        model.apply(init_weights)
    else:
        # Restore the weights from the wandb model artifact.
        artifact = run.use_artifact(f'abletobetable/{config.project}/saved_model_{saved_model[0]}:{saved_model[1]}', type='model')
        artifact_dir = artifact.download() + f'/saved_model_{saved_model[0]}.pth'
        model.load_state_dict(torch.load(artifact_dir, map_location=torch.device(device)))

    return model.to(device)

def Recall(prediction, real, eps=1e-8):
    """Compute a torchmetrics recall between argmax predictions and `real`.

    prediction - per-class scores; the predicted class is the argmax over dim 1
    real - target class indices
    eps - accepted but not used in the body

    NOTE(review): the metric is built with the positional arguments
    (6, 0.5, 'macro') -- presumably num_classes, threshold and average;
    confirm against the installed torchmetrics version.
    """
    predicted_classes = prediction.argmax(dim=1)
    metric = torchmetrics.Recall(6, 0.5, 'macro').to(device)
    return metric(predicted_classes, real)

def make_criterion():
    """Return a criterion computing ``1 - Recall(exp(pred), real)``.

    NOTE(review): `Recall` takes an argmax over the scores, so the returned
    value likely carries no gradient -- confirm before using it as a training loss.
    """
    def recall_criterion(pred, real):
        recall = Recall(torch.exp(pred), real)
        return torch.tensor(1) - recall

    return recall_criterion

def make(model, config):
    """Create the data loaders, loss, optimizer and LR scheduler for one run.

    model - network whose parameters the optimizer updates
    config - object exposing `batch_size`, `lr`, `patience` and `step_gamma`

    Returns (train_loader, valid_loader, test_loader, predict_loader,
    criterion, optimizer, scheduler). When the module-level `train_dataset`
    is None, only `test_loader` is built and the other loaders are None.
    """
    batch_size = config.batch_size

    if train_dataset is None:
        # test-only mode
        train_loader = valid_loader = predict_loader = None
        test_loader = make_loader(test_dataset, batch_size=batch_size)
    else:
        # Loaders are created in the order train, valid, test, predict.
        train_loader, valid_loader, test_loader, predict_loader = (
            make_loader(dataset, batch_size=batch_size)
            for dataset in (train_dataset, valid_dataset, test_dataset, predict_dataset)
        )

    # Cross-entropy is the active loss; make_criterion() offers a recall-based alternative.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    # Plateau-based decay is the active schedule (StepLR is imported as an alternative).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        patience=config.patience,
        factor=config.step_gamma,
        min_lr=1e-8,
    )

    return train_loader, valid_loader, test_loader, predict_loader, criterion, optimizer, scheduler

# Running

In [None]:
# Authenticate only. `pipeline` opens its own run with the right project and
# config, so starting a bare run here would just leave an empty extra run behind.
wandb.login()

In [None]:
# Hyperparameters for the training run; the whole dict is sent to wandb as the run config.
# NOTE(review): 'critirion', 'optimizer' and 'scheduler' are descriptive labels only --
# `make` builds nn.CrossEntropyLoss / Adam / ReduceLROnPlateau regardless of these values.
config = {
    'count_of_epoch': 10000,
    'batch_size': 128,
    'lr': 1e-2,
    'dropout': 0.001,
    'critirion': 'Recall',
    'optimizer': 'Adam',
    'scheduler': 'ReducePlateau',
    'step_size': 250,
    'step_gamma': 0.1,
    'patience': 25,
    'project': 'rails',
    'name_of_model': 'mlp',
}

# Train from scratch, then evaluate on the held-out test split.
model = pipeline(
    config,
    saved_model=None,
    to_train=True,
    to_test=True,
    to_predict=False,
)

# predict

In [None]:
# Hyperparameters for the inference run. Training is skipped below, so only
# the batch size, dropout and project name influence this run.
config = {
    'count_of_epoch': 1,
    'batch_size': 256,
    'lr': 1e-2,
    'dropout': 0.0,
    'critirion': 'Recall',
    'optimizer': 'Adam',
    'scheduler': 'ReducePlateau',
    'step_size': 250,
    'step_gamma': 0.1,
    'patience': 25,
    'project': 'rails',
    'name_of_model': 'mlp',
}

# [checkpoint suffix, artifact version] of the weights to load.
saved_model = ['1404', 'v0']

# Load the saved weights, evaluate on the test split and write prediction.csv.
model = pipeline(
    config,
    saved_model=saved_model,
    to_train=False,
    to_test=True,
    to_predict=True,
)