In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import ParameterGrid
import torch.utils.data as data_utils
from nltk.tokenize import word_tokenize
from collections import defaultdict
import torch.optim as optim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import pymorphy2
from tqdm import tqdm

import warnings
# warnings.filterwarnings('ignore')

In [2]:
if torch.cuda.is_available(): 
    dev = "cuda:0" 
else: 
    dev = "cpu" 
device = torch.device(dev)
batch_size = 512

In [3]:
def save_file(path, arr):
    with open(path, 'wb') as f:
        np.save(f, arr)
    return

def load_file(path):
    with open(path, 'rb') as f:
        arr = np.load(f)
    return arr

## Preprocessing

In [4]:
def tokenize(s):
    from string import punctuation
    tokens = nltk.word_tokenize(s)
    return [w.lower() for w in tokens if w not in punctuation]

def morph_analyze(w, morph):
    return morph.parse(w)[0].normal_form

def build_vocab(data):
    vocab = dict()
    freqdist = nltk.FreqDist()
    for s in data:
        for w in s:
            freqdist[w] += 1
    cwords = freqdist.most_common(10_000)
    for i, w in enumerate(cwords):
        vocab[w[0]] = i + 1
    return vocab

def w2i(data, vocab):
    token_is = []
    for s in data:
        temp = []
        for w in s:
            if w in vocab.keys():
                temp.append(vocab[w])
        token_is.append(temp)
    return token_is

def pad(tokens, max_len):
    pad_i = 0
    x_pad = []
    for s in tokens:
        if len(s) < max_len:
            while len(s) < max_len:
                s.insert(len(s), pad_i)
            x_pad.append(s)
        else:
            x_pad.append(s[:max_len])
    return x_pad

In [5]:
train = pd.read_excel('../2/X_y_train.xlsx')
test = pd.read_excel('../2/X_y_test.xlsx')
X_train, y_train, X_test, y_test = (train.drop(['Class'], axis=1).Text, train.Class, 
                                    test.drop(['Class'], axis=1).Text, test.Class)
y_train = y_train.apply(lambda x: x if x == 1 else 0).values
y_test = y_test.apply(lambda x: x if x == 1 else 0).values

In [6]:
X_train_tokens = [tokenize(s) for s in X_train.values]
X_test_tokens = [tokenize(s) for s in X_test.values]
morph = pymorphy2.MorphAnalyzer()
X_train_tokens = [[morph_analyze(w, morph) for w in s] for s in X_train_tokens]
X_test_tokens = [[morph_analyze(w, morph) for w in s] for s in X_test_tokens]
vocab_ = build_vocab(X_train_tokens + X_test_tokens)
X_train_wi = w2i(X_train_tokens, vocab_)
X_test_wi = w2i(X_test_tokens, vocab_)

In [7]:
max_len = 20
X_train_wi = pad(X_train_wi, max_len=max_len)
X_test_wi = pad(X_test_wi, max_len=max_len)

# 2.1

In [8]:
class Net(nn.Module):
    
    def __init__(self,
                 vocab_size=len(vocab_), 
                 embed_dim=100, 
                 conv_layer_count=2, 
                 stride=1,
                 kernel_size=3,
                 seq_len=max_len, 
                 dropout=0):
        super().__init__()
        self.embed_dim = embed_dim
        self.seq_len = seq_len
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size+1,
                                      embedding_dim=embed_dim,
                                      padding_idx=0)
        
        self.conv1d_layers = nn.ModuleList([])
        out_shape = self.seq_len
        padding = kernel_size // 2
        for i in range(conv_layer_count):
            self.conv1d_layers.append(nn.Conv1d(in_channels=embed_dim, 
                                                out_channels=embed_dim,
                                                kernel_size=kernel_size,
                                                stride=stride, padding=padding))
            out_shape = 1 + (out_shape + 2 * padding - kernel_size) // stride
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(out_shape * embed_dim, 2)

    def forward(self, x):
        x = self.embedding(x)
        x = x.reshape(len(x), self.embed_dim, self.seq_len)
        for conv_layer in self.conv1d_layers:
            x = F.relu(conv_layer(x))
        x = x.reshape(x.shape[0], -1)
        logits = self.fc(self.dropout(x))
        return F.log_softmax(logits, dim=-1)

In [9]:
net = Net()
net(torch.tensor(X_train_wi[:5], dtype=torch.long))

tensor([[-0.7398, -0.6486],
        [-0.6804, -0.7060],
        [-0.7317, -0.6560],
        [-0.7459, -0.6431],
        [-0.7014, -0.6849]], grad_fn=<LogSoftmaxBackward0>)

In [10]:
def fit(net, num_epoch, trainset, optimizer, lr, scheduler, log=False):
#     loss_f = nn.CrossEntropyLoss()
    optimizer = optimizer(net.parameters(), lr=lr)
    
    for epoch in range(num_epoch):
        for data in trainset:
            X, y = data
            net.zero_grad()
            output = net(X)
            loss = F.cross_entropy(output, y.long())
            loss.backward()
            optimizer.step()
        if scheduler is not None: scheduler.step()
        
        if log:  print('loss ====> ', loss.item())
    return net

def predict(net, testset):
    ans = []
    with torch.no_grad():
        for data in testset:
            X, y = data
            output = net(X)
            for idx, i in enumerate(output):
                ans.append(i.cpu().data.numpy().argmax().item())
    return ans

In [11]:
param_grid = {
    "lr": [3e-4],
    "epochs": [5, 10],
    "optimizer": [optim.SGD, optim.AdamW],
    "batch_size": [batch_size],
    "layers_count": [3, 5],
    "kernel_size": [3, 5],
    "stride": [1, 3]
}
params_list = ParameterGrid(param_grid)
len(params_list)

32

In [12]:
# torch.backends.cudnn.enabled = False

configs = []

inputs_train = torch.tensor(X_train_wi, dtype=torch.int32).to(device)
targets_train = torch.tensor(y_train, dtype=torch.int32).to(device)

inputs_test = torch.tensor(X_test_wi, dtype=torch.int32).to(device)
targets_test = torch.tensor(y_test, dtype=torch.int32).to(device)

train = data_utils.TensorDataset(inputs_train, targets_train)
test = data_utils.TensorDataset(inputs_test, targets_test)

trainset = torch.utils.data.DataLoader(train, shuffle=True, batch_size=batch_size)
testset = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch_size)

# grid search
for params in tqdm(params_list):
    
    # get param for pass to network
    lr = params['lr']
    epochs = params['epochs']
    optimizer = params['optimizer']
    batch_size = params['batch_size']
    layers_count = params['layers_count']
    kernel_size = params['kernel_size']
    stride = params['stride']
    
    # net build
    net = Net(vocab_size=len(vocab_), 
              embed_dim=100, 
              conv_layer_count=layers_count, 
              stride=stride, 
              kernel_size=kernel_size, 
              seq_len=max_len)
    net.to(device)

    # fit
    net = fit(net, epochs, trainset, optimizer, lr, None, False)
    
    # predict
    ans = predict(net, testset)
    
    # add param in config
    config = [epochs, optimizer.__name__, batch_size, layers_count, 
              kernel_size, stride, f'{f1_score(y_test, ans, average="weighted"):.5f}']
    configs.append(config)
    
print()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [02:36<00:00,  4.89s/it]







In [13]:
df = pd.DataFrame(configs)
df.columns = ['epochs', 'optimizer', 'batch_size', 'layers_count', 'kernel_size', 'stride', 'f1_score']
df.sort_values(by='f1_score', ascending=False).reset_index(drop=True)

Unnamed: 0,epochs,optimizer,batch_size,layers_count,kernel_size,stride,f1_score
0,5,AdamW,512,5,5,3,0.81964
1,5,AdamW,512,5,5,1,0.79904
2,10,AdamW,512,5,5,3,0.77956
3,5,AdamW,512,3,3,3,0.77591
4,5,AdamW,512,5,3,3,0.74031
5,10,AdamW,512,3,5,1,0.72045
6,5,AdamW,512,5,3,1,0.72
7,5,AdamW,512,3,3,1,0.70036
8,10,AdamW,512,5,3,3,0.70036
9,10,AdamW,512,5,5,1,0.68


# 2.2

In [8]:
path_ = '../2/'
train = pd.read_excel(path_ + 'X_y_train.xlsx')
test = pd.read_excel(path_ + 'X_y_test.xlsx')
X_train, y_train, X_test, y_test = train.drop(['Class'], axis=1), train.Class, test.drop(['Class'], axis=1), test.Class
assert y_train.shape == (X_train.shape[0],) and y_test.shape == (X_test.shape[0], )
y_train = y_train.apply(lambda x: x if x == 1 else 0)
y_test = y_test.apply(lambda x: x if x == 1 else 0)


path_ = '../2/saved/'
__train_w2v_pretrain = load_file(path_ + '__train_w2v_pretrain.npy')
__test_w2v_pretrain = load_file(path_ + '__test_w2v_pretrain.npy')

__train_w2v = load_file(path_ + '__train_w2v.npy')
__test_w2v = load_file(path_ + '__test_w2v.npy')

__train_fasttext_500 = load_file(path_ + '__train_fasttext_500_10.npy')
__test_fasttext_500 = load_file(path_ + '__test_fasttext_500_10.npy')

__train_fasttext_pretrain = load_file(path_ + '__train_fasttext_pretrain.npy')
__test_fasttext_pretrain = load_file(path_ + '__test_fasttext_pretrain.npy')

In [9]:
__train_fasttext_pretrain.shape

(22348, 300)

In [18]:
class Net(nn.Module):
    
    def __init__(self,
                 embed_dim=300, 
                 conv_layer_count=2, 
                 stride=1,
                 kernel_size=3,
                 dropout=0):
        super().__init__()
        self.embed_dim = embed_dim
        
        self.conv1d_layers = nn.ModuleList([])
        out_shape = 1
        padding = kernel_size // 2
        for i in range(conv_layer_count):
            self.conv1d_layers.append(nn.Conv1d(in_channels=embed_dim, 
                                                out_channels=embed_dim,
                                                kernel_size=kernel_size,
                                                stride=stride, padding=padding))
            out_shape = 1 + (out_shape + 2 * padding - kernel_size) // stride
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(out_shape *  embed_dim, 2)

    def forward(self, x):
        x = x[:, :, None]
        for conv_layer in self.conv1d_layers:
            x = F.relu(conv_layer(x))
        x = x.reshape(x.shape[0], -1)
        logits = self.fc(self.dropout(x))
        return F.log_softmax(logits, dim=-1)

In [19]:
param_grid = {
    "embeddings": [(__train_w2v_pretrain, __test_w2v_pretrain), 
                   (__train_w2v, __test_w2v), 
                   (__train_fasttext_500, __test_fasttext_500),
                   (__train_fasttext_pretrain, __test_fasttext_pretrain)],
    "lr": [3e-4],
    "epochs": [5, 10],
    "optimizer": [optim.SGD, optim.AdamW],
    "batch_size": [512],
    "layers_count": [3, 5],
    "kernel_size": [3, 5],
    "stride": [1, 3]
}
params_list = ParameterGrid(param_grid)
len(params_list)

128

In [20]:
# f1_score(y_test, ans, average='weighted')

In [22]:
# torch.backends.cudnn.enabled = False

configs = []

# grid search
for params in tqdm(params_list):
    
    # get param for pass to network
    X_train, X_test = params['embeddings']
    lr = params['lr']
    epochs = params['epochs']
    optimizer = params['optimizer']
    batch_size = params['batch_size']
    layers_count = params['layers_count']
    kernel_size = params['kernel_size']
    stride = params['stride']
    
    inputs_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    targets_train = torch.tensor(y_train, dtype=torch.float32).to(device)

    inputs_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    targets_test = torch.tensor(y_test, dtype=torch.float32).to(device)

    train = data_utils.TensorDataset(inputs_train, targets_train)
    test = data_utils.TensorDataset(inputs_test, targets_test)

    trainset = torch.utils.data.DataLoader(train, shuffle=True, batch_size=batch_size)
    testset = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch_size)
    
    # net build
    net = Net(embed_dim=300, 
              conv_layer_count=layers_count, 
              stride=stride, 
              kernel_size=kernel_size)
    net.to(device)

    # fit
    net = fit(net, epochs, trainset, optimizer, lr, None, False)
    
    # predict
    ans = predict(net, testset)
    
    # add param in config
    config = [epochs, optimizer.__name__, batch_size, layers_count, 
              kernel_size, stride, f'{f1_score(y_test, ans, average="weighted"):.5f}']
    configs.append(config)
    
print()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [07:35<00:00,  3.56s/it]







In [23]:
df = pd.DataFrame(configs)
df.columns = ['epochs', 'optimizer', 'batch_size', 'layers_count', 'kernel_size', 'stride', 'f1_score']
df.sort_values(by='f1_score', ascending=False).reset_index(drop=True)

Unnamed: 0,epochs,optimizer,batch_size,layers_count,kernel_size,stride,f1_score
0,10,AdamW,512,3,5,3,0.85880
1,10,AdamW,512,3,3,3,0.85880
2,10,AdamW,512,3,3,1,0.85880
3,10,AdamW,512,5,5,3,0.84026
4,10,AdamW,512,5,5,1,0.84000
...,...,...,...,...,...,...,...
123,5,SGD,512,5,5,1,0.28986
124,10,SGD,512,3,3,1,0.28986
125,10,SGD,512,3,3,3,0.28986
126,10,SGD,512,5,5,3,0.28986


# 2.3

In [4]:
class Net(nn.Module):
    
    def __init__(self,
                 embed_dim=300, 
                 conv_layer_count=2, 
                 stride=1,
                 kernel_size=3,
                 dropout=0, 
                 init=None, 
                 reg=None,
                 norm=None, 
                 bs=None):
        super().__init__()
        self.embed_dim = embed_dim
        
        self.conv1d_layers = nn.ModuleList([])
        out_shape = 1
        padding = kernel_size // 2
        for i in range(conv_layer_count):
            self.conv1d_layers.append(nn.Conv1d(in_channels=embed_dim, 
                                                out_channels=embed_dim,
                                                kernel_size=kernel_size,
                                                stride=stride, 
                                                padding=padding))
            out_shape = 1 + (out_shape + 2 * padding - kernel_size) // stride
        self.fc = nn.Linear(out_shape *  embed_dim, 2)
        self.dropout = nn.Dropout1d(p=0.3) if reg == 'dropout' else nn.Dropout1d(p=.0)
        if norm.__name__ == 'LayerNorm':
            self.norm = norm((self.embed_dim, 1), device=device)
        else:
            self.norm = norm(self.embed_dim, device=device)
        for module in self.modules():
            if isinstance(module, nn.Linear) or isinstance(module, nn.Conv1d):
                init(module.weight)
            

    def forward(self, x):
        x = x[:, :, None]
        for conv_layer in self.conv1d_layers:
            x = F.relu(conv_layer(x))
            x = self.dropout(self.norm(x))
        x = x.reshape(x.shape[0], -1)
        logits = self.fc(x)
        return F.log_softmax(logits, dim=-1)

In [5]:
def fit(net, num_epoch, trainset, optimizer, lr, scheduler, weight_decay, log=False):
#     loss_f = nn.CrossEntropyLoss()
    optimizer = optimizer(net.parameters(), lr=lr, weight_decay=weight_decay)
    if scheduler.__name__ == 'ExponentialLR':
        scheduler = scheduler(optimizer, gamma=0.9)
    else:
        scheduler = scheduler(optimizer, milestones=list(range(0, epochs)), gamma=0.9)
    
    for epoch in range(num_epoch):
        for data in trainset:
            X, y = data
            net.zero_grad()
            output = net(X)
            loss = F.cross_entropy(output, y.long())
            loss.backward()
            optimizer.step()
        if scheduler is not None: scheduler.step()
        
        if log:  print('loss ====> ', loss.item())
    return net

def predict(net, testset):
    ans = []
    with torch.no_grad():
        for data in testset:
            X, y = data
            output = net(X)
            for idx, i in enumerate(output):
                ans.append(i.cpu().data.numpy().argmax().item())
    return ans

In [10]:
param_grid = {
    "embeddings": [(__train_w2v, __test_w2v), 
                   (__train_fasttext_500, __test_fasttext_500)],
    "lr": [3e-4],
    "epochs": [10, 20],
    "optimizer": [optim.AdamW],
    "batch_size": [512],
    "layers_count": [3],
    "kernel_size": [3],
    "stride": [1],
    "init": [torch.nn.init.kaiming_uniform_, torch.nn.init.xavier_uniform_],
    "reg": ['dropout', 'l2_reg'],
    "norm": [torch.nn.LayerNorm, torch.nn.BatchNorm1d],
    "scheduler": [torch.optim.lr_scheduler.ExponentialLR, torch.optim.lr_scheduler.MultiStepLR]
}
params_list = ParameterGrid(param_grid)
len(params_list)

64

In [11]:
# torch.backends.cudnn.enabled = False

configs = []

# grid search
for params in tqdm(params_list):
    
    # get param for pass to network
    X_train, X_test = params['embeddings']
    lr = params['lr']
    epochs = params['epochs']
    optimizer = params['optimizer']
    batch_size = params['batch_size']
    layers_count = params['layers_count']
    kernel_size = params['kernel_size']
    stride = params['stride']
    
    init = params['init']
    reg = params['reg']
    norm = params['norm']
    scheduler = params['scheduler']
    
    inputs_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    targets_train = torch.tensor(y_train, dtype=torch.float32).to(device)

    inputs_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    targets_test = torch.tensor(y_test, dtype=torch.float32).to(device)

    train = data_utils.TensorDataset(inputs_train, targets_train)
    test = data_utils.TensorDataset(inputs_test, targets_test)

    trainset = torch.utils.data.DataLoader(train, shuffle=True, batch_size=batch_size)
    testset = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch_size)
    
    # net build
    net = Net(embed_dim=300, 
              conv_layer_count=layers_count, 
              stride=stride, 
              kernel_size=kernel_size,
              init=init, 
              reg=reg,
              norm=norm,
              bs=batch_size)
    net.to(device)
    
    # fit
    net = fit(net, epochs, trainset, optimizer, lr, scheduler, 0.01 if reg == 'l2_reg' else 0, False)
    
    # predict
    ans = predict(net, testset)
    
    # add param in config
    config = [epochs, optimizer.__name__, batch_size, 
              layers_count, kernel_size, stride, 
              init.__name__, reg, norm.__name__, scheduler.__name__, f'{f1_score(y_test, ans, average="weighted"):.5f}']
    configs.append(config)
    
print()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [05:26<00:00,  5.10s/it]







In [12]:
df = pd.DataFrame(configs)
df.columns = ['epochs', 'optimizer', 'batch_size', 'layers_count', 
              'kernel_size', 'stride', 'init', 'reg', 'norm', 'scheduler', 'f1_score']
df.sort_values(by='f1_score', ascending=False).reset_index(drop=True)

Unnamed: 0,epochs,optimizer,batch_size,layers_count,kernel_size,stride,init,reg,norm,scheduler,f1_score
0,10,AdamW,512,3,3,1,kaiming_uniform_,dropout,LayerNorm,ExponentialLR,0.88000
1,10,AdamW,512,3,3,1,kaiming_uniform_,l2_reg,LayerNorm,MultiStepLR,0.88000
2,20,AdamW,512,3,3,1,xavier_uniform_,l2_reg,BatchNorm1d,ExponentialLR,0.82022
3,10,AdamW,512,3,3,1,xavier_uniform_,l2_reg,LayerNorm,MultiStepLR,0.80032
4,10,AdamW,512,3,3,1,xavier_uniform_,dropout,BatchNorm1d,MultiStepLR,0.80000
...,...,...,...,...,...,...,...,...,...,...,...
59,20,AdamW,512,3,3,1,kaiming_uniform_,dropout,BatchNorm1d,MultiStepLR,0.64058
60,20,AdamW,512,3,3,1,kaiming_uniform_,dropout,BatchNorm1d,ExponentialLR,0.64000
61,20,AdamW,512,3,3,1,xavier_uniform_,dropout,BatchNorm1d,MultiStepLR,0.63825
62,10,AdamW,512,3,3,1,xavier_uniform_,dropout,BatchNorm1d,MultiStepLR,0.62046
