In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import ParameterGrid
import torch.utils.data as data_utils
from nltk.tokenize import word_tokenize
from collections import defaultdict
import torch.optim as optim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import pymorphy2
from tqdm import tqdm

import warnings
# warnings.filterwarnings('ignore')

In [2]:
if torch.cuda.is_available(): 
    dev = "cuda:0" 
else: 
    dev = "cpu" 
device = torch.device(dev)
batch_size = 512

In [3]:
def save_file(path, arr):
    with open(path, 'wb') as f:
        np.save(f, arr)
    return

def load_file(path):
    with open(path, 'rb') as f:
        arr = np.load(f)
    return arr

In [4]:
def tokenize(s):
    from string import punctuation
    tokens = nltk.word_tokenize(s)
    return [w.lower() for w in tokens if w not in punctuation]

def morph_analyze(w, morph):
    return morph.parse(w)[0].normal_form

def build_vocab(data):
    vocab = dict()
    freqdist = nltk.FreqDist()
    for s in data:
        for w in s:
            freqdist[w] += 1
    cwords = freqdist.most_common(10_000)
    for i, w in enumerate(cwords):
        vocab[w[0]] = i + 1
    return vocab

def w2i(data, vocab):
    token_is = []
    for s in data:
        temp = []
        for w in s:
            if w in vocab.keys():
                temp.append(vocab[w])
        token_is.append(temp)
    return token_is

def pad(tokens, max_len):
    pad_i = 0
    x_pad = []
    for s in tokens:
        if len(s) < max_len:
            while len(s) < max_len:
                s.insert(len(s), pad_i)
            x_pad.append(s)
        else:
            x_pad.append(s[:max_len])
    return x_pad

In [5]:
train = pd.read_excel('../2/X_y_train.xlsx')
test = pd.read_excel('../2/X_y_test.xlsx')
X_train, y_train, X_test, y_test = (train.drop(['Class'], axis=1).Text, train.Class, 
                                    test.drop(['Class'], axis=1).Text, test.Class)
y_train = y_train.apply(lambda x: x if x == 1 else 0).values
y_test = y_test.apply(lambda x: x if x == 1 else 0).values

In [6]:
X_train_tokens = [tokenize(s) for s in X_train.values]
X_test_tokens = [tokenize(s) for s in X_test.values]
morph = pymorphy2.MorphAnalyzer()
X_train_tokens = [[morph_analyze(w, morph) for w in s] for s in X_train_tokens]
X_test_tokens = [[morph_analyze(w, morph) for w in s] for s in X_test_tokens]
vocab_ = build_vocab(X_train_tokens + X_test_tokens)
X_train_wi = w2i(X_train_tokens, vocab_)
X_test_wi = w2i(X_test_tokens, vocab_)

In [7]:
max_len = 20
X_train_wi = pad(X_train_wi, max_len=max_len)
X_test_wi = pad(X_test_wi, max_len=max_len)

# 2.1

In [8]:
class Net(nn.Module):
    
    def __init__(self,
                 vocab_size=len(vocab_), 
                 embed_dim=100,
                 seq_len=max_len, 
                 type_rnn=None,
                 hidden_size=None, 
                 bidirectional=None,
                 n_layer=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.seq_len = seq_len
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size+1,
                                      embedding_dim=embed_dim,
                                      padding_idx=0)
        
        self.rnn = type_rnn(
            input_size=self.embed_dim,
            hidden_size=hidden_size,
            num_layers=n_layer,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size * self.seq_len * (1 + bidirectional), 2)

    def forward(self, x):
        x = self.embedding(x)
        x = x.reshape(len(x), self.seq_len, self.embed_dim)
        x, _ = self.rnn(x)
        x = x.reshape(x.shape[0], -1)
        logits = self.fc(x)
        return F.log_softmax(logits, dim=-1)

In [9]:
net = Net(type_rnn=torch.nn.RNN,
          hidden_size=100, 
          bidirectional=True,
          n_layer=3)
net(torch.tensor(X_train_wi[:5], dtype=torch.long))

tensor([[-0.8628, -0.5481],
        [-0.7010, -0.6854],
        [-0.7844, -0.6096],
        [-0.8548, -0.5540],
        [-0.7894, -0.6053]], grad_fn=<LogSoftmaxBackward0>)

In [10]:
def fit(net, num_epoch, trainset, optimizer, lr, scheduler, log=False):
#     loss_f = nn.CrossEntropyLoss()
    optimizer = optimizer(net.parameters(), lr=lr)
    
    for epoch in range(num_epoch):
        for data in trainset:
            X, y = data
            net.zero_grad()
            output = net(X)
            loss = F.cross_entropy(output, y.long())
            loss.backward()
            optimizer.step()
        if scheduler is not None: scheduler.step()
        
        if log:  print('loss ====> ', loss.item())
    return net

def predict(net, testset):
    ans = []
    with torch.no_grad():
        for data in testset:
            X, y = data
            output = net(X)
            for idx, i in enumerate(output):
                ans.append(i.cpu().data.numpy().argmax().item())
    return ans

In [11]:
param_grid = {
    "lr": [3e-4],
    "epochs": [5, 10],
    "optimizer": [optim.AdamW],
    "batch_size": [512],
    "type_rnn": [torch.nn.RNN, torch.nn.LSTM, torch.nn.GRU],
    "layers_count": [3, 5],
    "bidirectional": [False, True],
    "hidden_size": [100, 300]
}
params_list = ParameterGrid(param_grid)
len(params_list)

48

In [12]:
# torch.backends.cudnn.enabled = False

configs = []

inputs_train = torch.tensor(X_train_wi, dtype=torch.int32).to(device)
targets_train = torch.tensor(y_train, dtype=torch.int32).to(device)

inputs_test = torch.tensor(X_test_wi, dtype=torch.int32).to(device)
targets_test = torch.tensor(y_test, dtype=torch.int32).to(device)

train = data_utils.TensorDataset(inputs_train, targets_train)
test = data_utils.TensorDataset(inputs_test, targets_test)

trainset = torch.utils.data.DataLoader(train, shuffle=True, batch_size=batch_size)
testset = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch_size)

# grid search
for params in tqdm(params_list):
    
    # get param for pass to network
    lr = params["lr"]
    epochs = params["epochs"]
    optimizer = params["optimizer"]
    batch_size = params["batch_size"]
    type_rnn = params["type_rnn"]
    n_layer = params["layers_count"]
    bidirectional = params["bidirectional"]
    hidden_size = params["hidden_size"]
    
    # net build
    net = Net(vocab_size=len(vocab_), 
              embed_dim=100,
              seq_len=max_len,
              type_rnn=type_rnn,
              hidden_size=hidden_size, 
              bidirectional=bidirectional,
              n_layer=n_layer)
    net.to(device)

    # fit
    net = fit(net, epochs, trainset, optimizer, lr, None, False)
    
    # predict
    ans = predict(net, testset)
    
    # add param in config
    config = [epochs, optimizer.__name__, batch_size, type_rnn.__name__, hidden_size,
              n_layer, bidirectional, f'{f1_score(y_test, ans, average="weighted"):.5f}']
    configs.append(config)
    
print()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [29:55<00:00, 37.41s/it]







In [13]:
df = pd.DataFrame(configs)
df.columns = ['epochs', 'optimizer', 'batch_size', 'type_rnn', 
              'hidden_size', 'n_layer', 'bidirectional', 'f1_score']
df.sort_values(by='f1_score', ascending=False).reset_index(drop=True)

Unnamed: 0,epochs,optimizer,batch_size,type_rnn,hidden_size,n_layer,bidirectional,f1_score
0,5,AdamW,512,LSTM,100,5,True,0.82022
1,5,AdamW,512,LSTM,100,3,False,0.80032
2,10,AdamW,512,LSTM,300,3,False,0.8
3,5,AdamW,512,GRU,100,3,False,0.79903
4,5,AdamW,512,LSTM,300,3,False,0.79185
5,5,AdamW,512,LSTM,100,5,False,0.77956
6,5,AdamW,512,LSTM,100,3,True,0.77956
7,10,AdamW,512,RNN,100,5,True,0.77815
8,10,AdamW,512,RNN,300,3,True,0.76
9,10,AdamW,512,GRU,100,3,True,0.76


## 2.2

In [14]:
path_ = '../2/'
train = pd.read_excel(path_ + 'X_y_train.xlsx')
test = pd.read_excel(path_ + 'X_y_test.xlsx')
X_train, y_train, X_test, y_test = train.drop(['Class'], axis=1), train.Class, test.drop(['Class'], axis=1), test.Class
assert y_train.shape == (X_train.shape[0],) and y_test.shape == (X_test.shape[0], )
y_train = y_train.apply(lambda x: x if x == 1 else 0)
y_test = y_test.apply(lambda x: x if x == 1 else 0)


path_ = '../2/saved/'
__train_w2v_pretrain = load_file(path_ + '__train_w2v_pretrain.npy')
__test_w2v_pretrain = load_file(path_ + '__test_w2v_pretrain.npy')

__train_w2v = load_file(path_ + '__train_w2v.npy')
__test_w2v = load_file(path_ + '__test_w2v.npy')

__train_fasttext_500 = load_file(path_ + '__train_fasttext_500_10.npy')
__test_fasttext_500 = load_file(path_ + '__test_fasttext_500_10.npy')

__train_fasttext_pretrain = load_file(path_ + '__train_fasttext_pretrain.npy')
__test_fasttext_pretrain = load_file(path_ + '__test_fasttext_pretrain.npy')

In [15]:
class Net(nn.Module):
    
    def __init__(self,
                 vocab_size=len(vocab_), 
                 embed_dim=300,
                 seq_len=max_len,
                 conv_layer_count=2, 
                 stride=1,
                 kernel_size=3,
                 type_rnn=None,
                 hidden_size=None, 
                 bidirectional=None,
                 n_layer=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.seq_len = seq_len
        
        self.conv1d_layers = nn.ModuleList([])
        out_shape = 1
        padding = kernel_size // 2
        for i in range(conv_layer_count):
            self.conv1d_layers.append(nn.Conv1d(in_channels=embed_dim,
                                                out_channels=embed_dim,
                                                kernel_size=kernel_size,
                                                stride=stride, 
                                                padding=padding))
            out_shape = 1 + (out_shape + 2 * padding - kernel_size) // stride
        
        self.rnn = type_rnn(
            input_size=self.embed_dim,
            hidden_size=hidden_size,
            num_layers=n_layer,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size * out_shape * (1 + bidirectional), 2)

    def forward(self, x):
        x = x[:, :, None]
        for conv_layer in self.conv1d_layers:
            x = F.relu(conv_layer(x))
        x = x.reshape(len(x), -1, self.embed_dim)
        x, _ = self.rnn(x)
        x = x.reshape(x.shape[0], -1)
        logits = self.fc(x)
        return F.log_softmax(logits, dim=-1)

In [16]:
net = Net(type_rnn=torch.nn.RNN,
          hidden_size=100, 
          bidirectional=True,
          n_layer=3)
net(torch.tensor(__train_w2v_pretrain[:5], dtype=torch.float32))

tensor([[-0.7234, -0.6638],
        [-0.7231, -0.6641],
        [-0.7237, -0.6635],
        [-0.7239, -0.6633],
        [-0.7234, -0.6638]], grad_fn=<LogSoftmaxBackward0>)

In [17]:
param_grid = {
    "embeddings": [(__train_w2v, __test_w2v), 
                   (__train_fasttext_500, __test_fasttext_500)],
    "lr": [3e-4],
    "epochs": [5, 10],
    "optimizer": [optim.AdamW],
    "batch_size": [batch_size],
    "layers_count": [3, 5],
    "kernel_size": [3],
    "stride": [1, 3],
    "type_rnn": [torch.nn.LSTM, torch.nn.GRU],
    "layers_count": [3, 5],
    "bidirectional": [True],
    "hidden_size": [100, 300]
}
params_list = ParameterGrid(param_grid)
len(params_list)

64

In [18]:
# torch.backends.cudnn.enabled = False

configs = []

# grid search
for params in tqdm(params_list):
    
    # get param for pass to network
    X_train, X_test = params["embeddings"]
    lr = params['lr']
    epochs = params['epochs']
    optimizer = params['optimizer']
    batch_size = params['batch_size']
    layers_count = params['layers_count']
    kernel_size = params['kernel_size']
    stride = params['stride']
    type_rnn = params["type_rnn"]
    n_layer = params["layers_count"]
    bidirectional = params["bidirectional"]
    hidden_size = params["hidden_size"]
    
    inputs_train = torch.tensor(X_train, dtype=torch.float32).to(device)
    targets_train = torch.tensor(y_train, dtype=torch.int32).to(device)

    inputs_test = torch.tensor(X_test, dtype=torch.float32).to(device)
    targets_test = torch.tensor(y_test, dtype=torch.int32).to(device)

    train = data_utils.TensorDataset(inputs_train, targets_train)
    test = data_utils.TensorDataset(inputs_test, targets_test)

    trainset = torch.utils.data.DataLoader(train, shuffle=True, batch_size=batch_size)
    testset = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch_size)
    
    # net build
    net = Net(vocab_size=len(vocab_), 
              embed_dim=300, 
              conv_layer_count=layers_count, 
              stride=stride, 
              kernel_size=kernel_size, 
              seq_len=max_len, 
              type_rnn=type_rnn,
              hidden_size=hidden_size, 
              bidirectional=bidirectional,
              n_layer=n_layer)
    net.to(device)

    # fit
    net = fit(net, epochs, trainset, optimizer, lr, None, False)
    
    # predict
    ans = predict(net, testset)
    
    # add param in config
    config = [epochs, optimizer.__name__, batch_size, layers_count, kernel_size, stride, 
              type_rnn.__name__, n_layer, bidirectional, hidden_size, f'{f1_score(y_test, ans):.5f}']
    configs.append(config)
    
print()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [06:43<00:00,  6.30s/it]







In [19]:
df = pd.DataFrame(configs)
df.columns = ['epochs', 'optimizer', 'batch_size', 'cnn_layer', 'kernel_size', 'stride', 
              'type_rnn', 'rnn_layer', 'bidirectional', 'hidden_size', 'f1_score']
df.sort_values(by='f1_score', ascending=False).reset_index(drop=True)

Unnamed: 0,epochs,optimizer,batch_size,cnn_layer,kernel_size,stride,type_rnn,rnn_layer,bidirectional,hidden_size,f1_score
0,10,AdamW,512,5,3,3,GRU,5,True,300,0.89286
1,5,AdamW,512,5,3,1,LSTM,5,True,100,0.89286
2,10,AdamW,512,3,3,3,GRU,3,True,100,0.87719
3,10,AdamW,512,5,3,1,GRU,5,True,100,0.85185
4,10,AdamW,512,5,3,1,LSTM,5,True,100,0.84211
...,...,...,...,...,...,...,...,...,...,...,...
59,10,AdamW,512,5,3,3,LSTM,5,True,300,0.70130
60,5,AdamW,512,5,3,1,LSTM,5,True,300,0.70130
61,5,AdamW,512,5,3,1,LSTM,5,True,300,0.70130
62,10,AdamW,512,3,3,1,GRU,3,True,300,0.68000
