### Neural network as a classifier

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset

import torchvision.transforms as transforms
import torchvision.datasets as datasets

from sklearn import metrics
from sklearn import decomposition
from sklearn import manifold
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import numpy as np

import copy
import random
import time

In [27]:
import pandas as pd
import joblib

In [28]:
# Load the train and test tables from CSV files under `data/`
# (paths are relative to the notebook's working directory).
train_df = pd.read_csv('data/clean_train.csv')
test_df = pd.read_csv('data/clean_test.csv')

In [29]:
from sklearn.impute import SimpleImputer

In [30]:
class PandasSimpleImputer(SimpleImputer):
    """`SimpleImputer` subclass whose `transform` returns a labelled DataFrame.

    `fit` records the column labels of its input and `transform` wraps the
    parent's result in a `pandas.DataFrame` carrying those labels.

    Notes
    -----
    * `fit` reads `X.columns`, so it must be given a DataFrame-like object.
    * No `index` is passed to the DataFrame constructor in `transform`, so
      the row index of the input is not explicitly carried over.
    """

    def fit(self, X, y=None):
        """Store the column labels of `X`, then fit the parent imputer."""
        # Captured before delegating so `transform` can relabel its output.
        self.columns = X.columns
        return super().fit(X, y)

    def transform(self, X):
        """Impute `X` with the parent class and relabel the result."""
        imputed = super().transform(X)
        return pd.DataFrame(imputed, columns=self.columns)

In [31]:
from sklearn.base import BaseEstimator, TransformerMixin

In [32]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
# Load a previously fitted feature extractor from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files you
# trust. Presumably the pickled object references classes defined above
# (e.g. PandasSimpleImputer), so those cells must run first — TODO confirm.
feature_extractor = joblib.load('feature_extractor.joblib')

In [34]:
from sklearn.model_selection import train_test_split
target = 'target'

In [35]:
# Hold out 5% of the training table for validation. The fixed random_state
# makes the split reproducible; features are every column except `target`.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df.drop(target, axis=1), 
    train_df[target], 
    test_size=0.05, 
    random_state=42
)

In [None]:
# Apply the already-loaded extractor (transform only, no refit here) to every
# split and densify the result with `.toarray()`.
# presumably the extractor returns a scipy sparse matrix — TODO confirm.
train_values = feature_extractor.transform(X_train).toarray()
valid_values = feature_extractor.transform(X_valid).toarray()
test_values = feature_extractor.transform(test_df).toarray()

In [None]:
class CustomPandasDatasets(Dataset):
    """Map-style dataset pairing a 2-D feature array with a pandas label Series.

    Parameters
    ----------
    values : array-like with a ``shape`` attribute
        Feature matrix; row ``i`` holds the features of sample ``i``.
    points : pandas.Series
        Class labels, accessed positionally via ``.iloc``.

    Raises
    ------
    ValueError
        If ``values`` and ``points`` do not contain the same number of rows.
    """

    def __init__(self, values, points):
        # A silent length mismatch would surface much later as an IndexError
        # (or as mis-paired samples), so fail fast here instead.
        if values.shape[0] != len(points):
            raise ValueError(
                f'values has {values.shape[0]} rows but points has '
                f'{len(points)} labels'
            )
        self.values = values
        self.points = points

    def __len__(self):
        """Number of samples (rows of ``values``)."""
        return self.values.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the sample at position ``idx``.

        ``features`` is a float32 tensor and ``label`` a 0-d int64 tensor, the
        dtypes expected by ``nn.Linear`` and ``nn.CrossEntropyLoss``.
        """
        # torch.as_tensor with an explicit dtype replaces the legacy
        # torch.Tensor(...) constructor, which treats integer arguments as sizes.
        features = torch.as_tensor(self.values[idx], dtype=torch.float32)
        # Positional lookup: the Series index may be shuffled by the split.
        label = torch.tensor(self.points.iloc[idx], dtype=torch.long)
        return features, label

In [252]:
BATCH_SIZE = 64

# Wrap each split in a dataset. The test split is given a constant -1 label
# per row so that it can go through the same (features, label) interface.
train_data = CustomPandasDatasets(values=train_values, points=y_train)
valid_data = CustomPandasDatasets(values=valid_values, points=y_valid)
test_data = CustomPandasDatasets(
    values=test_values,
    points=pd.Series([-1] * test_df.shape[0]),
)

# Only the training loader shuffles; validation and test keep row order.
train_iterator = data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_iterator = data.DataLoader(valid_data, batch_size=BATCH_SIZE)
test_iterator = data.DataLoader(test_data, batch_size=BATCH_SIZE)

In [345]:
class MLP(nn.Module):
    """Single-hidden-layer perceptron used as a multi-class classifier.

    Layout: Flatten -> Dropout -> Linear -> ReLU -> Dropout -> Linear.
    The forward pass returns raw logits (no softmax), which is what
    ``nn.CrossEntropyLoss`` expects.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample (after flattening).
    output_dim : int
        Number of output logits (classes).
    hidden_dim : int, default 100
        Width of the hidden layer.
    input_dropout : float, default 0.3
        Probability of zeroing an input feature during training.
    hidden_dropout : float, default 0.8
        Probability of zeroing a hidden activation during training.
        NOTE(review): 0.8 is unusually aggressive — worth tuning.

    Notes
    -----
    The module is created on PyTorch's default device; callers move it with
    ``model.to(device)``. The constructor no longer reads a notebook-level
    ``device`` global, so the class works regardless of cell execution order.
    The layer order inside ``classifier`` is unchanged, so state-dict keys
    (``classifier.2.*``, ``classifier.5.*``) stay compatible with existing
    checkpoints.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=100,
                 input_dropout=0.3, hidden_dropout=0.8):
        super().__init__()

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(input_dropout),
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(hidden_dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return the class logits for a batch ``x``."""
        return self.classifier(x)

In [346]:
# Initial default device; the training cell further down reassigns `device`
# to CUDA when it is available.
device = 'cpu'

In [347]:
# The input width is taken from the extracted feature matrix.
INPUT_DIM = train_values.shape[1]
# Number of output logits.
# presumably the number of distinct classes in `target` — TODO confirm.
OUTPUT_DIM = 9

model = MLP(INPUT_DIM, OUTPUT_DIM)

In [348]:
INPUT_DIM

4129

In [349]:
def count_parameters(model):
    """Return the total number of scalar parameters that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 413,909 trainable parameters


In [350]:
def calculate_accuracy(y_pred, y):
    """Fraction of samples whose highest-scoring class equals the label.

    Parameters
    ----------
    y_pred : torch.Tensor
        Scores with classes along dimension 1.
    y : torch.Tensor
        Integer class labels, one per row of ``y_pred``.

    Returns
    -------
    torch.Tensor
        0-d float tensor: number of correct predictions / ``y.shape[0]``.
    """
    # keepdim=True yields an (N, 1) column; the labels are viewed to match it.
    predicted = y_pred.argmax(1, keepdim=True)
    targets = y.view_as(predicted)
    n_correct = predicted.eq(targets).sum()
    return n_correct.float() / y.shape[0]

In [351]:
def train(model, iterator, optimizer, criterion, device):
    """Run one optimisation pass over ``iterator``.

    Puts ``model`` in training mode, then for every batch: moves it to
    ``device``, zeroes gradients, runs the forward pass, computes loss and
    accuracy, back-propagates and steps the optimizer.

    Returns
    -------
    tuple of float
        ``(mean of per-batch losses, mean of per-batch accuracies)``; both
        are averaged over the number of batches, ``len(iterator)``.
    """
    model.train()

    loss_total, acc_total = 0, 0

    for features, targets in tqdm(iterator, desc="Training", leave=False):
        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()

        logits = model(features)
        batch_loss = criterion(logits, targets)
        batch_acc = calculate_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [352]:
from sklearn.metrics import f1_score

In [353]:
def evaluate(model, iterator, criterion, device):
    """Evaluate ``model`` on ``iterator`` without updating its weights.

    Puts the model in eval mode (disabling dropout) and runs every batch
    under ``torch.no_grad()``.

    Parameters
    ----------
    model : nn.Module
    iterator : iterable of ``(x, y)`` batches with a ``len``
    criterion : callable ``(y_pred, y) -> loss tensor``
    device : device the batches are moved to before the forward pass

    Returns
    -------
    tuple of float
        ``(mean of per-batch losses, weighted F1 score)``. Note that the
        second element is an F1 score, not an accuracy.
    """
    epoch_loss = 0
    labels = []
    predicts = []

    model.eval()

    with torch.no_grad():
        for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):

            x = x.to(device)
            y = y.to(device)

            y_pred = model(x)
            loss = criterion(y_pred, y)
            epoch_loss += loss.item()

            # Tensors living on a CUDA device cannot be converted with
            # .numpy() directly; copy them to host memory first.
            labels.extend(y.cpu().numpy())
            predicts.extend(torch.max(y_pred, 1).indices.cpu().numpy())

    return epoch_loss / len(iterator), f1_score(labels, predicts, average='weighted')

In [354]:
def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into minutes and seconds.

    Returns
    -------
    tuple of int
        ``(whole minutes, remaining whole seconds)``; both parts are
        truncated with ``int``.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [355]:
EPOCHS = 40

# Choose the device and move the model to it BEFORE building the optimizer,
# as recommended by the torch.optim documentation: the optimizer should be
# constructed from the parameters in their final location.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters())

# Lowest validation loss seen so far; the matching weights are checkpointed.
best_valid_loss = float('inf')

for epoch in trange(EPOCHS):

    start_time = time.monotonic()

    # train() returns (loss, accuracy); evaluate() returns (loss, weighted F1).
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_f1 = evaluate(model, valid_iterator, criterion, device)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut-loss-model.pt')

    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    # The training metric is accuracy, so it is labelled as such.
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid F1 score: {valid_f1*100:.2f}%')

  0%|          | 0/40 [00:00<?, ?it/s]

Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  1 | Epoch Time: 0m 6s
	Train Loss: 0.845 | Train F1 score: 74.54%
	Valid Loss: 0.646 | Valid F1 score: 75.47%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  2 | Epoch Time: 0m 5s
	Train Loss: 0.750 | Train F1 score: 76.35%
	Valid Loss: 0.639 | Valid F1 score: 75.36%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  3 | Epoch Time: 0m 6s
	Train Loss: 0.729 | Train F1 score: 76.65%
	Valid Loss: 0.615 | Valid F1 score: 75.63%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  4 | Epoch Time: 0m 6s
	Train Loss: 0.710 | Train F1 score: 77.05%
	Valid Loss: 0.608 | Valid F1 score: 75.79%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  5 | Epoch Time: 0m 5s
	Train Loss: 0.708 | Train F1 score: 77.21%
	Valid Loss: 0.606 | Valid F1 score: 75.81%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  6 | Epoch Time: 0m 6s
	Train Loss: 0.722 | Train F1 score: 77.22%
	Valid Loss: 0.604 | Valid F1 score: 75.53%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  7 | Epoch Time: 0m 6s
	Train Loss: 0.689 | Train F1 score: 77.27%
	Valid Loss: 0.604 | Valid F1 score: 74.89%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  8 | Epoch Time: 0m 6s
	Train Loss: 0.685 | Train F1 score: 77.42%
	Valid Loss: 0.603 | Valid F1 score: 76.05%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch:  9 | Epoch Time: 0m 6s
	Train Loss: 0.714 | Train F1 score: 77.38%
	Valid Loss: 0.600 | Valid F1 score: 76.07%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 10 | Epoch Time: 0m 6s
	Train Loss: 0.681 | Train F1 score: 77.27%
	Valid Loss: 0.594 | Valid F1 score: 75.51%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 11 | Epoch Time: 0m 6s
	Train Loss: 0.674 | Train F1 score: 77.27%
	Valid Loss: 0.597 | Valid F1 score: 75.77%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 12 | Epoch Time: 0m 6s
	Train Loss: 0.681 | Train F1 score: 77.31%
	Valid Loss: 0.593 | Valid F1 score: 75.85%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 13 | Epoch Time: 0m 6s
	Train Loss: 0.700 | Train F1 score: 77.31%
	Valid Loss: 0.593 | Valid F1 score: 75.55%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 14 | Epoch Time: 0m 5s
	Train Loss: 0.659 | Train F1 score: 77.58%
	Valid Loss: 0.600 | Valid F1 score: 74.61%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 15 | Epoch Time: 0m 6s
	Train Loss: 0.700 | Train F1 score: 77.24%
	Valid Loss: 0.593 | Valid F1 score: 75.44%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 16 | Epoch Time: 0m 6s
	Train Loss: 0.670 | Train F1 score: 77.47%
	Valid Loss: 0.589 | Valid F1 score: 75.97%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 17 | Epoch Time: 0m 5s
	Train Loss: 0.656 | Train F1 score: 77.56%
	Valid Loss: 0.590 | Valid F1 score: 75.17%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 18 | Epoch Time: 0m 6s
	Train Loss: 0.667 | Train F1 score: 77.61%
	Valid Loss: 0.585 | Valid F1 score: 75.73%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 19 | Epoch Time: 0m 6s
	Train Loss: 0.653 | Train F1 score: 77.56%
	Valid Loss: 0.590 | Valid F1 score: 75.40%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 20 | Epoch Time: 0m 5s
	Train Loss: 0.645 | Train F1 score: 77.57%
	Valid Loss: 0.592 | Valid F1 score: 75.41%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 21 | Epoch Time: 0m 5s
	Train Loss: 0.681 | Train F1 score: 77.58%
	Valid Loss: 0.587 | Valid F1 score: 75.71%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 22 | Epoch Time: 0m 5s
	Train Loss: 0.653 | Train F1 score: 77.69%
	Valid Loss: 0.592 | Valid F1 score: 75.68%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 23 | Epoch Time: 0m 5s
	Train Loss: 0.655 | Train F1 score: 77.54%
	Valid Loss: 0.591 | Valid F1 score: 75.42%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 24 | Epoch Time: 0m 5s
	Train Loss: 0.690 | Train F1 score: 77.77%
	Valid Loss: 0.593 | Valid F1 score: 75.12%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 25 | Epoch Time: 0m 5s
	Train Loss: 0.655 | Train F1 score: 77.61%
	Valid Loss: 0.590 | Valid F1 score: 76.02%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 26 | Epoch Time: 0m 5s
	Train Loss: 0.649 | Train F1 score: 77.48%
	Valid Loss: 0.584 | Valid F1 score: 75.49%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 27 | Epoch Time: 0m 5s
	Train Loss: 0.651 | Train F1 score: 77.40%
	Valid Loss: 0.587 | Valid F1 score: 75.80%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 28 | Epoch Time: 0m 5s
	Train Loss: 0.659 | Train F1 score: 77.68%
	Valid Loss: 0.589 | Valid F1 score: 76.03%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 29 | Epoch Time: 0m 5s
	Train Loss: 0.657 | Train F1 score: 77.56%
	Valid Loss: 0.589 | Valid F1 score: 75.92%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 30 | Epoch Time: 0m 5s
	Train Loss: 0.661 | Train F1 score: 77.75%
	Valid Loss: 0.589 | Valid F1 score: 75.62%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 31 | Epoch Time: 0m 6s
	Train Loss: 0.672 | Train F1 score: 77.86%
	Valid Loss: 0.593 | Valid F1 score: 75.78%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 32 | Epoch Time: 0m 5s
	Train Loss: 0.628 | Train F1 score: 78.16%
	Valid Loss: 0.587 | Valid F1 score: 75.61%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 33 | Epoch Time: 0m 5s
	Train Loss: 0.630 | Train F1 score: 78.13%
	Valid Loss: 0.588 | Valid F1 score: 75.63%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 34 | Epoch Time: 0m 5s
	Train Loss: 0.680 | Train F1 score: 77.65%
	Valid Loss: 0.590 | Valid F1 score: 75.88%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 35 | Epoch Time: 0m 5s
	Train Loss: 0.710 | Train F1 score: 77.64%
	Valid Loss: 0.592 | Valid F1 score: 75.83%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 36 | Epoch Time: 0m 5s
	Train Loss: 0.677 | Train F1 score: 77.73%
	Valid Loss: 0.594 | Valid F1 score: 75.96%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 37 | Epoch Time: 0m 5s
	Train Loss: 0.646 | Train F1 score: 77.64%
	Valid Loss: 0.594 | Valid F1 score: 75.68%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 38 | Epoch Time: 0m 5s
	Train Loss: 0.639 | Train F1 score: 77.89%
	Valid Loss: 0.591 | Valid F1 score: 75.70%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 39 | Epoch Time: 0m 5s
	Train Loss: 0.622 | Train F1 score: 78.04%
	Valid Loss: 0.587 | Valid F1 score: 75.91%


Training:   0%|          | 0/798 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch: 40 | Epoch Time: 0m 5s
	Train Loss: 0.631 | Train F1 score: 78.07%
	Valid Loss: 0.598 | Valid F1 score: 75.17%


In [356]:
# Restore the checkpoint with the lowest validation loss. map_location lets a
# checkpoint written on a GPU machine be loaded on a CPU-only one.
model.load_state_dict(torch.load('tut-loss-model.pt', map_location=device))

# This evaluates on the VALIDATION loader (the test split has no real
# labels), so the results are named accordingly: (loss, weighted F1).
valid_loss, valid_f1 = evaluate(model, valid_iterator, criterion, device)
valid_loss, valid_f1  # original note: "0.565   .7685" — presumably an earlier run's result

Evaluating:   0%|          | 0/42 [00:00<?, ?it/s]

(0.5843048642079035, 0.754850054920235)

In [357]:
import joblib

In [365]:
import torch

In [369]:
# NOTE(review): the saved output of this cell is a RuntimeError about CUDA
# deserialization even though map_location='cpu' is passed here, so the output
# presumably predates this version of the line — re-run to confirm it loads.
# `sentences` is not used by any later cell visible in this notebook.
sentences = torch.load('sentences.joblib', map_location='cpu')

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [71]:
def generate_answer(model, iterator=None, device=None):
    """Predict a class index for every sample yielded by ``iterator``.

    Parameters
    ----------
    model : nn.Module
        Trained classifier. It is switched to eval mode (dropout disabled)
        and left in that mode.
    iterator : iterable of ``(x, _)`` batches, optional
        Defaults to the notebook-level ``test_iterator``. The second element
        of each batch is ignored.
    device : torch.device or str, optional
        Device the inputs are moved to. Defaults to the device of the
        model's first parameter.

    Returns
    -------
    list
        One predicted class index per sample, in iteration order.
    """
    if iterator is None:
        iterator = test_iterator
    if device is None:
        device = next(model.parameters()).device

    predicts = []

    # Inference must not apply dropout or build an autograd graph.
    model.eval()
    with torch.no_grad():
        for (x, _) in tqdm(iterator):
            x = x.to(device)
            y_pred = model(x)
            # .cpu() is required before .numpy() when running on CUDA.
            predicts.extend(torch.max(y_pred, 1).indices.cpu().numpy())

    return predicts

In [72]:
def sumbit(submittions, sample_path='data/HeadHunter_sample_submit.csv',
           output_path='submittion.csv'):
    """Write predictions into a copy of the sample submission file.

    Parameters
    ----------
    submittions : sequence
        One prediction per row of the sample submission, in row order.
    sample_path : str, default 'data/HeadHunter_sample_submit.csv'
        CSV used as the template; its ``target`` column is overwritten.
    output_path : str, default 'submittion.csv'
        Where the filled-in CSV is written (without the index column).

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when given a path.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of template rows.
    """
    sub_df = pd.read_csv(sample_path)

    # Guard against silently mis-aligned or partially filled submissions.
    if len(submittions) != len(sub_df):
        raise ValueError(
            f'expected {len(sub_df)} predictions, got {len(submittions)}'
        )

    sub_df['target'] = submittions
    return sub_df.to_csv(output_path, index=False)

In [73]:
# Reload the best checkpoint; map_location makes a checkpoint written on a GPU
# machine loadable when only a CPU is available.
model.load_state_dict(torch.load('tut-loss-model.pt', map_location=device))
# Predict the test split and write the submission CSV.
predicts = generate_answer(model)
sumbit(predicts)

  0%|          | 0/396 [00:00<?, ?it/s]