In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# Обратите внимание
Я примонтировал эти данные: Fasttext Common Crawl (Russian)
Это эмбеддинги для русских слов

In [None]:
train = pd.read_csv('/kaggle/input/dmia-dl-nlp-2019/train.csv')
test = pd.read_csv('/kaggle/input/dmia-dl-nlp-2019/test.csv')

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

from nltk.tokenize import word_tokenize, wordpunct_tokenize
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
def process_text(text):
    """Lower-case ``text`` and tokenize it with NLTK's ``wordpunct_tokenize``.

    Returns the list of tokens produced by the tokenizer.
    """
    return wordpunct_tokenize(text.lower())

In [None]:
process_text('красивая мама мыла красивую раму')

In [None]:
# все наши тексты
texts = list(train.question.map(process_text)) + list(test.question.map(process_text))

In [None]:
# Build a token-frequency dictionary:
# for every token, count how many times it occurs across all of our texts.

word2freq = {}

for tokens in texts:
    for token in tokens:
        if token in word2freq:
            word2freq[token] += 1
        else:
            word2freq[token] = 1

# Прочитаем файл с векторами
В каждой строке этого файла стоит слово, а затем через пробел указано 300 чисел, которые соответствуют эмбеддингу этого слова.
Мы не будем читать все 2 000 000 слов, чтобы не хранить в памяти матрицу из 2 000 000 * 300 чисел. Мы будем добавлять в наш словарь только те слова, которые встречаются в наших текстах.

Мы будем добавлять в ```word2index``` слова и их индекс в матрице эмбеддингов

Также первым словом добавим специальный токен PAD, эмбеддинг которого будет состоять из нулей. Он нужен для дополнения наших предложений до нужной длины, чтобы составить батч из текстов.

In [None]:
# Maps a token to its row in the embedding matrix; row 0 is reserved for PAD.
word2index = {'PAD': 0}

# The context manager closes the file even if parsing fails half-way through.
# utf-8 is passed explicitly so decoding does not depend on the machine's locale.
with open('/kaggle/input/fasttest-common-crawl-russian/cc.ru.300.vec', encoding='utf-8') as word2vec_file:

    # Header line: "<number of words> <embedding dimension>"
    n_words, embedding_dim = map(int, word2vec_file.readline().split())

    # Zero vector for PAD
    vectors = [np.zeros((1, embedding_dim))]

    progress_bar = tqdm(desc='Read word2vec', total=n_words)

    for line in word2vec_file:

        progress_bar.update()

        current_parts = line.split()

        # Blank or truncated lines carry no token; skip them instead of
        # treating the first blank line as the end of the file.
        if len(current_parts) <= embedding_dim:
            continue

        # The last `embedding_dim` fields are the vector, everything before them is the token
        current_word = ' '.join(current_parts[:-embedding_dim])

        # Keep only tokens that occur in our texts. The `not in word2index` guard
        # matters: re-adding a token would append a vector without growing the
        # dictionary, shifting every later index off its row.
        if current_word in word2freq and current_word not in word2index:

            word2index[current_word] = len(word2index)

            current_vector = np.array([float(value) for value in current_parts[-embedding_dim:]])
            vectors.append(np.expand_dims(current_vector, 0))

    progress_bar.close()

# Final matrix: one row per entry of word2index
vectors = np.concatenate(vectors)

In [None]:
vectors.shape

In [None]:
# Coverage report: which tokens of our texts have no pretrained vector.
unk_words = [word for word in word2freq if word not in word2index]
unk_counts = [word2freq[word] for word in unk_words]

# Guard every ratio against empty inputs (no texts, or full coverage)
total_tokens = sum(word2freq.values())
n_unk = sum(unk_counts) * 100 / total_tokens if total_tokens else 0.0
unk_vocab_share = len(unk_words) * 100 / len(word2freq) if word2freq else 0.0
mean_unk_count = np.mean(unk_counts) if unk_counts else 0.0

sub_sample_unk_words = {word: word2freq[word] for word in unk_words}
# Unknown tokens ordered from most to least frequent
sorted_unk_words = sorted(sub_sample_unk_words, key=sub_sample_unk_words.get, reverse=True)

print('Мы не знаем {:.2f} % слов в датасете'.format(n_unk))
print('Количество неизвестных слов {} из {}, то есть {:.2f} % уникальных слов в словаре'.format(
    len(unk_words), len(word2freq), unk_vocab_share))
print('В среднем каждое встречается {:.2f} раз'.format(mean_unk_count))
print()
print('Топ 5 невошедших слов:')

# Slicing (rather than range(5)) is safe when fewer than five tokens are unknown
for word in sorted_unk_words[:5]:
    print(word, 'с количеством вхождениий -', word2freq[word])

In [None]:
class WordData(Dataset):
    """Dataset of texts stored as lists of embedding-row indices.

    Texts are tokenized and indexed once at construction time; padding or
    truncation to ``sequence_length`` happens lazily in ``__getitem__``.
    """

    def __init__(self, x_data, y_data, word2index, sequence_length=32, pad_token='PAD', verbose=True):
        """
        Args:
            x_data: iterable of raw text strings.
            y_data: indexable collection of targets, aligned with ``x_data``.
            word2index: mapping token -> row index in the embedding matrix;
                must contain ``pad_token``.
            sequence_length: fixed length every returned sequence is cut or padded to.
            pad_token: key of the padding entry in ``word2index``.
            verbose: show a progress bar while loading.
        """
        super().__init__()

        self.x_data = []
        self.y_data = y_data

        self.word2index = word2index
        self.sequence_length = sequence_length

        self.pad_token = pad_token
        self.pad_index = self.word2index[self.pad_token]

        self.load(x_data, verbose=verbose)

    @staticmethod
    def process_text(text):
        """Lower-case ``text`` and tokenize it with ``wordpunct_tokenize``."""
        return wordpunct_tokenize(text.lower())

    def load(self, data, verbose=True):
        """Tokenize and index every text of ``data``, appending to ``self.x_data``."""
        data_iterator = tqdm(data, desc='Loading data', disable=not verbose)

        for text in data_iterator:
            words = self.process_text(text)
            indexed_words = self.indexing(words)
            self.x_data.append(indexed_words)

    def indexing(self, tokenized_text):
        """Map tokens to embedding-row indices, silently dropping unknown tokens."""
        return [self.word2index[token] for token in tokenized_text if token in self.word2index]

    def padding(self, sequence):
        """Return ``sequence`` cut to ``sequence_length`` or right-padded with the PAD index."""
        # A negative repeat count yields an empty list, so long sequences get no padding
        return sequence[:self.sequence_length] + [self.pad_index] * (self.sequence_length - len(sequence))

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return ``(indices, target)``; ``indices`` is a LongTensor of length ``sequence_length``."""
        x = self.padding(self.x_data[idx])
        # Build the integer tensor directly: going through torch.Tensor (float32)
        # and .long() silently rounds indices larger than 2**24.
        x = torch.tensor(x, dtype=torch.long)

        y = self.y_data[idx]

        return x, y

In [None]:
# Hold out 15% of the labelled data for validation; the fixed seed makes the split reproducible
x_train, x_validation, y_train, y_validation = train_test_split(
    train.question, train.main_category, test_size=0.15, random_state=42)

# Train on the training part only. Fitting on the full `train` frame would put the
# validation rows into the training set, making validation loss, F1 and early
# stopping meaningless. (For a final submission, retrain on everything afterwards.)
train_dataset = WordData(list(x_train), list(y_train), word2index)
# shuffle=True: a new batch order every epoch
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)

# No drop_last here, so metrics are computed over every validation example
validation_dataset = WordData(list(x_validation), list(y_validation), word2index)
validation_loader = DataLoader(validation_dataset, batch_size=64)

# The test set has no labels; zeros are placeholders so the dataset can still return (x, y)
test_dataset = WordData(list(test.question), np.zeros((test.shape[0])), word2index)
test_loader = DataLoader(test_dataset, batch_size=64)

In [None]:
for x, y in train_loader:
    break

In [None]:
# x - это батч размером 64
x

In [None]:
# чтобы составить матрицу мы отрезали длинные предложения до 32 токенов, а короткие дополнили индексом PAD до нужной длины

In [None]:
x.shape

In [None]:
# наши таргеты
y.shape

In [None]:
n_classes = train.main_category.unique().shape[0]

In [None]:
vectors.shape

In [None]:
from torch import nn

In [None]:
class DeepAverageNetwork(torch.nn.Module):
    """Averages the word embeddings of a sequence and classifies the mean vector with an MLP."""

    def __init__(self, embedding_matrix, n_classes):
        """
        Args:
            embedding_matrix: 2-D array-like (vocabulary size x embedding dim) of
                pretrained word vectors; row ``i`` belongs to word index ``i``.
            n_classes: number of output logits.
        """
        super().__init__()

        # Load the pretrained vectors into the embedding layer.
        # from_pretrained freezes the matrix by default.
        self.embedding_layer = torch.nn.Embedding.from_pretrained(
            torch.as_tensor(embedding_matrix, dtype=torch.float32))

        # The MLP consumes the averaged embedding, so its input width must equal the
        # embedding dimension (a hard-coded 256 fails for 300-dimensional vectors).
        embedding_dim = self.embedding_layer.embedding_dim

        self.layers = torch.nn.Sequential(
            torch.nn.Linear(embedding_dim, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, n_classes),
        )

    def forward(self, x):
        """Map a LongTensor of word indices (batch, seq_len) to logits (batch, n_classes)."""
        # word indices -> word vectors
        x = self.embedding_layer(x)

        # Average over the sequence axis: one vector per sentence.
        # Note: PAD positions (zero vectors) are included in the mean, so every
        # sentence is averaged as if it had the full padded length.
        x = x.mean(dim=-2)

        # linear layers with ReLU in between
        return self.layers(x)

In [None]:
# инициализируем модель
model = DeepAverageNetwork(vectors, n_classes)

In [None]:
# Smoke test: push one batch through the model to check the forward pass runs.
# The model takes only the batch of indices and returns a single tensor of logits.
with torch.no_grad():
    pred = model(x)

print(pred)

In [None]:
embeddings = model.embedding_layer(x)

In [None]:
# эмбеддинги слов
# 64 - размер батча
# 32 - количество слов в примере
# 300 - размер эмбеддинга на каждое слово
embeddings.shape

In [None]:
# задаем девайс, где будет учиться модель
# если доступна гпу, то зададим гпу
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# The model has no softmax and returns raw logits: CrossEntropyLoss applies
# log-softmax itself. NLLLoss expects log-probabilities and must not be fed logits.
criterion = torch.nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

# Create the optimizer after the model has been moved to its device
optimizer = torch.optim.Adam(params=model.parameters())

In [None]:
epochs = 10
losses = []
# Start from +inf so the first epoch always counts as an improvement
best_test_loss = float('inf')

test_f1 = []

for n_epoch in range(epochs):

    train_losses = []
    test_losses = []
    test_targets = []
    test_pred_class = []

    progress_bar = tqdm(total=len(train_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))

    # ---- training pass ----
    model.train()

    for x, y in train_loader:

        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        pred = model(x)
        loss = criterion(pred, y)

        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        losses.append(loss.item())

        # Show the mean loss over the last 500 batches
        progress_bar.set_postfix(train_loss=np.mean(losses[-500:]))
        progress_bar.update(x.shape[0])

    progress_bar.close()

    # ---- validation pass ----
    model.eval()

    with torch.no_grad():

        for x, y in validation_loader:

            x = x.to(device)
            y = y.to(device)

            pred = model(x)

            test_losses.append(criterion(pred, y).item())

            # Convert to numpy explicitly instead of calling np.argmax on a torch tensor
            test_targets.append(y.cpu().numpy())
            test_pred_class.append(pred.argmax(dim=1).cpu().numpy())

    mean_test_loss = np.mean(test_losses)

    test_targets = np.concatenate(test_targets).squeeze()
    test_pred_class = np.concatenate(test_pred_class).squeeze()

    f1 = f1_score(test_targets, test_pred_class, average='micro')

    test_f1.append(f1)

    print()
    print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

    print('F1 test - {:.3f}'.format(f1))

    # Naive early stopping: stop the first time validation loss fails to improve
    if mean_test_loss < best_test_loss:
        best_test_loss = mean_test_loss
    else:
        print('Early stopping')
        break

In [None]:
# Predict a class index for every test example
model.eval()

predictions = []

with torch.no_grad():

    for x, _ in test_loader:

        x = x.to(device)

        # The model takes only the batch of indices and returns the logits
        pred = model(x)

        predictions.append(pred.argmax(dim=1).cpu().numpy())

predictions = np.concatenate(predictions).squeeze()

In [None]:
test['main_category'] = predictions

In [None]:
test = test[['index', 'main_category']]

In [None]:
test.head()

In [None]:
test.main_category.unique()

In [None]:
test.to_csv('submission.csv', index=False)

In [None]:
n_classes

In [None]:
class SentimentNet(nn.Module):
    """LSTM encoder followed by a 1-D convolution/pooling stack and a two-layer classifier.

    The classifier input width (128) assumes sequences of 32 tokens: four max-pools
    of size 2 shrink the time axis 32 -> 2, and 64 channels * 2 steps = 128 features.
    """

    def __init__(self, vocab_size, output_size, embedding_dim=300, hidden_dim=180, n_layers=4, drop_prob=0.2):
        """
        Args:
            vocab_size: despite the name, the pretrained embedding matrix (2-D
                array-like); it is loaded frozen into the embedding layer.
            output_size: number of output logits.
            embedding_dim: width of the word vectors (LSTM input size).
            hidden_dim: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            drop_prob: dropout probability for the LSTM and the classifier.
        """
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding.from_pretrained(torch.as_tensor(vocab_size, dtype=torch.float32))
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)

        # BatchNorm1d needs the channel count of the tensor it would normalise.
        # NOTE: bn1-bn3, cv, cv5, mp5, cv6 and avp are defined but not used by forward().
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.cv1 = nn.Conv1d(hidden_dim, 180, 3, padding=1)
        self.mp1 = nn.MaxPool1d(2)

        self.bn2 = nn.BatchNorm1d(180)
        self.cv2 = nn.Conv1d(180, 128, 3, padding=1)
        self.mp2 = nn.MaxPool1d(2)

        self.bn3 = nn.BatchNorm1d(128)
        self.cv3 = nn.Conv1d(128, 96, 3, padding=1)
        self.mp3 = nn.MaxPool1d(2)

        self.cv4 = nn.Conv1d(96, 64, 3, padding=1)

        self.cv = nn.Conv1d(64, 32, 3, padding=1)

        self.cv5 = nn.Conv1d(32, 28, 3, padding=1)
        self.mp5 = nn.MaxPool1d(2)
        self.cv6 = nn.Conv1d(28, 28, 3, padding=1)
        self.avp = nn.AvgPool1d(2)

        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(128, 64)
        self.ac1 = nn.ReLU()
        self.fc2 = nn.Linear(64, output_size)

    def forward(self, x, hidden1, hidden2):
        """Run the network on a batch of word indices.

        Args:
            x: integer tensor of word indices, (batch, seq_len).
            hidden1: initial (h, c) state for the LSTM, e.g. from ``init_hidden``.
            hidden2: passed through unchanged; kept for interface compatibility.

        Returns:
            Tuple ``(logits, hidden1, hidden2)`` where ``logits`` is
            (batch, output_size) and ``hidden1`` is the final LSTM state.
        """
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)

        lstm_out, hidden1 = self.lstm1(embeds, hidden1)

        # Conv1d expects (batch, channels, time)
        lstm_out = lstm_out.transpose(1, 2)

        lstm_out = self.cv1(lstm_out)
        lstm_out = self.mp1(lstm_out)

        lstm_out = self.cv2(lstm_out)
        lstm_out = self.mp2(lstm_out)

        lstm_out = self.cv3(lstm_out)
        lstm_out = self.mp3(lstm_out)

        lstm_out = self.cv4(lstm_out)
        # mp3 is reused here; max-pooling has no parameters
        lstm_out = self.mp3(lstm_out)

        # back to (batch, time, channels), then flatten per example
        lstm_out = lstm_out.transpose(1, 2)
        lstm_out = lstm_out.contiguous().view(batch_size, -1)

        out = self.fc1(lstm_out)
        out = self.dropout(out)
        out = self.ac1(out)

        out = self.fc2(out)

        out = out.view(batch_size, -1)

        return out, hidden1, hidden2

    def init_hidden(self, batch_size, hidden_dim):
        """Return a zero (h, c) pair of shape (n_layers, batch_size, hidden_dim).

        The tensors are created on the same device as the model's parameters,
        so no global ``device`` variable is needed.
        """
        weight = next(self.parameters()).data
        hidden = (weight.new_zeros(self.n_layers, batch_size, hidden_dim),
                  weight.new_zeros(self.n_layers, batch_size, hidden_dim))
        return hidden

In [None]:
class Attention(nn.Module):
    """Attention pooling over the step axis of a (batch, step_dim, feature_dim) tensor.

    Each step gets the score ``tanh(x_t . weight + b_t)``; the exponentiated
    scores are (optionally masked and) normalised per example and used as
    weights for a sum over the steps.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        """
        Args:
            feature_dim: size of the last axis of the input.
            step_dim: number of steps (size of axis 1 of the input).
            bias: add a learnable per-step bias to the scores.
            **kwargs: forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        # Projection vector, Kaiming-uniform initialised
        self.weight = nn.Parameter(nn.init.kaiming_uniform_(torch.zeros(feature_dim, 1)))

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Return the attention-weighted sum over steps, shape (batch, feature_dim).

        ``mask``, when given, is multiplied into the unnormalised weights.
        """
        # One scalar score per (example, step)
        flat_steps = x.contiguous().view(-1, self.feature_dim)
        scores = torch.mm(flat_steps, self.weight).view(-1, self.step_dim)

        if self.bias:
            scores = scores + self.b

        step_weights = torch.exp(torch.tanh(scores))

        if mask is not None:
            step_weights = step_weights * mask

        # The epsilon keeps the division finite when every step is masked out
        step_weights = step_weights / (torch.sum(step_weights, 1, keepdim=True) + 1e-10)

        return torch.sum(x * torch.unsqueeze(step_weights, -1), 1)

class SentimentNet(nn.Module):
    """BiLSTM -> BiGRU -> two 1-D convolutions -> attention pooling -> classifier."""

    def __init__(self, vocab_size, output_size, embedding_dim=300, hidden_dim=128, n_layers=4, drop_prob=0.2,
                 maxlen=32):
        """
        Args:
            vocab_size: despite the name, the pretrained embedding matrix (2-D
                array-like); it is loaded frozen into the embedding layer.
            output_size: number of output logits.
            embedding_dim: width of the word vectors (LSTM input size).
            hidden_dim: hidden size of the bidirectional LSTM.
            n_layers: number of stacked LSTM layers.
            drop_prob: dropout probability for the LSTM and the classifier.
            maxlen: sequence length the attention layer is built for; inputs must
                have exactly this many tokens.
        """
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding.from_pretrained(torch.as_tensor(vocab_size, dtype=torch.float32))

        # Sizes come from the constructor arguments rather than from notebook
        # globals (`embed_size`, `maxlen`) that are only defined in a later cell.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=True, dropout=drop_prob,
                             batch_first=True)
        self.lstm2 = nn.GRU(hidden_dim * 2, 64, bidirectional=True, batch_first=True)

        # The bidirectional GRU emits 2 * 64 = 128 channels
        self.cv1 = nn.Conv1d(128, 96, 3, padding=1)
        self.mp1 = nn.MaxPool1d(2)  # not used by forward()
        self.bn1 = nn.BatchNorm1d(128)
        self.ac0 = nn.ReLU()
        self.do1 = nn.Dropout()

        self.cv2 = nn.Conv1d(96, 128, 3, padding=1)
        self.mp2 = nn.MaxPool1d(2)  # not used by forward()
        self.bn2 = nn.BatchNorm1d(128)
        self.ac2 = nn.ReLU()
        self.do2 = nn.Dropout()

        self.attention_layer = Attention(128, maxlen)

        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(2*64, 96)
        self.bn0 = nn.BatchNorm1d(96)
        self.ac1 = nn.ReLU()
        self.fc2 = nn.Linear(96, output_size)
        # Not used by forward(): the network returns raw logits
        self.sigmoid = nn.Softmax(dim=1)

    def forward(self, x):
        """Map word indices (batch, maxlen) to logits (batch, output_size)."""
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)

        lstm_out, hidden = self.lstm1(embeds)
        lstm_out, hidden = self.lstm2(lstm_out)

        # Conv1d / BatchNorm1d expect (batch, channels, time)
        lstm_out = lstm_out.transpose(1, 2)

        lstm_out = self.bn1(lstm_out)
        lstm_out = self.cv1(lstm_out)
        lstm_out = self.do1(lstm_out)
        lstm_out = self.ac0(lstm_out)

        lstm_out = self.cv2(lstm_out)
        lstm_out = self.do2(lstm_out)
        lstm_out = self.bn2(lstm_out)
        lstm_out = self.ac2(lstm_out)

        # back to (batch, time, channels) for attention pooling over time
        lstm_out = lstm_out.transpose(1, 2)
        lstm_out = self.attention_layer(lstm_out)

        out = self.fc1(lstm_out)
        out = self.bn0(out)
        out = self.dropout(out)
        out = self.ac1(out)

        out = self.fc2(out)

        out = out.view(batch_size, -1)

        return out

    def init_hidden(self, batch_size):
        """Return a zero (h, c) pair of shape (n_layers, batch_size, hidden_dim).

        The tensors are created on the same device as the model's parameters.
        NOTE(review): forward() does not take a hidden state, and a bidirectional
        LSTM would need 2 * n_layers as the first dimension. Kept only for
        interface compatibility.
        """
        weight = next(self.parameters()).data
        hidden = (weight.new_zeros(self.n_layers, batch_size, self.hidden_dim),
                  weight.new_zeros(self.n_layers, batch_size, self.hidden_dim))
        return hidden

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
device

In [None]:
# NOTE: the four values below are not passed to SentimentNet, which uses its own defaults.
output_size = 1
embedding_dim = 300
hidden_dim = 128
n_layers = 2

# `embedding_matrix` is only created in a later cell; at this point of a top-to-bottom
# run the pretrained vectors live in `vectors`.
model = SentimentNet(vectors, n_classes)
model.to(device)

# NOTE: `lr` is not passed to Adam, which therefore runs with its default learning rate.
lr=0.01
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
criterion = criterion.to(device)

In [None]:
train_on_gpu = True
counter = 0
print_every = 1000
clip = 5  # max gradient norm
valid_loss_min = np.Inf

epochs = 15
losses = []
best_test_loss = 10.

test_f1 = []

batch_size = 64

for n_epoch in range(epochs):

    train_losses = []
    test_losses = []
    test_targets = []
    test_pred_class = []

    progress_bar = tqdm(total=len(train_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))

    # ---- training pass ----
    model.train()

    for inputs, labels in train_loader:
        counter += 1

        inputs, labels = inputs.to(device), labels.to(device)

        model.zero_grad()

        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()

        # Clip the gradient norm before the update
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        train_losses.append(loss.item())
        losses.append(loss.item())

        # Show the mean loss over the last 500 batches
        progress_bar.set_postfix(train_loss=np.mean(losses[-500:]))

        # Advance by the size of the current batch (not by a stale `x` left over from another cell)
        progress_bar.update(inputs.shape[0])

    progress_bar.close()

    # ---- validation pass ----
    model.eval()

    with torch.no_grad():

        for inp, lab in validation_loader:
            inp, lab = inp.to(device), lab.to(device)

            out = model(inp)

            test_losses.append(criterion(out, lab).item())

            # Convert to numpy explicitly instead of calling np.argmax on a torch tensor
            test_targets.append(lab.cpu().numpy())
            test_pred_class.append(out.argmax(dim=1).cpu().numpy())

    mean_test_loss = np.mean(test_losses)

    test_targets = np.concatenate(test_targets).squeeze()
    test_pred_class = np.concatenate(test_pred_class).squeeze()

    f1 = f1_score(test_targets, test_pred_class, average='micro')

    test_f1.append(f1)

    print()
    print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

    print('F1 test - {:.3f}'.format(f1))

In [None]:
# Loss and accuracy of the current model on the validation set
test_losses = []
num_correct = 0
num_seen = 0

model.eval()

with torch.no_grad():

    for inputs, labels in validation_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # The model takes only the inputs and returns logits
        output = model(inputs)

        test_losses.append(criterion(output, labels).item())

        # Multi-class prediction is the arg-max over the logits, not a rounded value
        pred = output.argmax(dim=1)
        num_correct += (pred == labels).sum().item()
        num_seen += labels.size(0)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
# Divide by the number of evaluated examples, not by the size of the unlabelled test set
test_acc = num_correct / max(num_seen, 1)
print("Test accuracy: {:.3f}%".format(test_acc*100))

In [None]:
# Hyper-parameters for the attention model defined in the cells below.
# Attention_Net reads `embed_size` and `maxlen` as globals.
embed_size = 300 # how big is each word vector
max_features = 244922 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 32 # max number of words in a question to use
batch_size = 64 # how many samples to process at once
n_epochs = 5 # how many times to iterate over all samples
# n_splits = 5 # Number of K-fold Splits
# SEED = 10
# debug = 0

In [None]:
# Alias only: both names refer to the same array. Deleting `vectors` would free no
# memory, would make this cell fail on a second run, and would break re-running the
# earlier cells that still use `vectors`.
embedding_matrix = vectors

In [None]:
class Attention(nn.Module):
    """Attention pooling over the step axis of a (batch, step_dim, feature_dim) tensor.

    Each step gets the score ``tanh(x_t . weight + b_t)``; the exponentiated
    scores are (optionally masked and) normalised per example and used as
    weights for a sum over the steps.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        """
        Args:
            feature_dim: size of the last axis of the input.
            step_dim: number of steps (size of axis 1 of the input).
            bias: add a learnable per-step bias to the scores.
            **kwargs: forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        # Projection vector, Kaiming-uniform initialised
        self.weight = nn.Parameter(nn.init.kaiming_uniform_(torch.zeros(feature_dim, 1)))

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Return the attention-weighted sum over steps, shape (batch, feature_dim).

        ``mask``, when given, is multiplied into the unnormalised weights.
        """
        # One scalar score per (example, step)
        flat_steps = x.contiguous().view(-1, self.feature_dim)
        scores = torch.mm(flat_steps, self.weight).view(-1, self.step_dim)

        if self.bias:
            scores = scores + self.b

        step_weights = torch.exp(torch.tanh(scores))

        if mask is not None:
            step_weights = step_weights * mask

        # The epsilon keeps the division finite when every step is masked out
        step_weights = step_weights / (torch.sum(step_weights, 1, keepdim=True) + 1e-10)

        return torch.sum(x * torch.unsqueeze(step_weights, -1), 1)

class Attention_Net(nn.Module):
    """Frozen pretrained embeddings -> BiLSTM -> BiGRU -> attention pooling -> MLP classifier.

    Reads the notebook-level ``embedding_matrix`` (pretrained vectors) and
    ``maxlen`` (sequence length the attention layer is built for).
    """

    def __init__(self, n_classes=28):
        """
        Args:
            n_classes: number of output logits (defaults to the previously hard-coded 28).
        """
        super(Attention_Net, self).__init__()

        # from_pretrained freezes the embedding matrix by default
        self.embedding = nn.Embedding.from_pretrained(torch.as_tensor(embedding_matrix, dtype=torch.float32))

        # NOTE: defined but not applied in forward()
        self.embedding_dropout = nn.Dropout2d(0.1)

        # Take the LSTM input width from the embedding matrix itself
        self.lstm = nn.LSTM(self.embedding.embedding_dim, 128, bidirectional=True, batch_first=True)
        self.lstm2 = nn.GRU(128*2, 64, bidirectional=True, batch_first=True)

        # The bidirectional GRU emits 2 * 64 = 128 features per step
        self.attention_layer = Attention(128, maxlen)

        self.linear = nn.Linear(64*2 , 64)
        self.relu = nn.ReLU()
        self.out = nn.Linear(64, n_classes)

    def forward(self, x):
        """Map word indices (batch, maxlen) to logits (batch, n_classes)."""
        # The embedding output is fed to the LSTM as is. A squeeze(unsqueeze(...))
        # round-trip would drop the batch dimension for a batch holding one sample.
        h_embedding = self.embedding(x)
        h_lstm, _ = self.lstm(h_embedding)
        h_lstm, _ = self.lstm2(h_lstm)
        h_lstm_atten = self.attention_layer(h_lstm)
        conc = self.relu(self.linear(h_lstm_atten))
        out = self.out(conc)
        return out


In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Build the attention model and move it to the selected device
model = Attention_Net()
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
# Hand only the trainable parameters to Adam (frozen ones have requires_grad == False)
optimizer = torch.optim.Adam((param for param in model.parameters() if param.requires_grad), lr=0.001)
criterion.to(device)

In [None]:
train_on_gpu = True
print_every = 1000
clip = 5  # max gradient norm
valid_loss_min = np.Inf

epochs = 3
losses = []
best_test_loss = 10.

test_f1 = []

batch_size = 64

for n_epoch in range(epochs):

    train_losses = []
    test_losses = []
    test_targets = []
    test_pred_class = []

    progress_bar = tqdm(total=len(train_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))

    # ---- training pass ----
    model.train()

    for inputs, labels in train_loader:

        inputs, labels = inputs.to(device), labels.to(device)

        model.zero_grad()

        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()

        # Clip the gradient norm before the update
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        train_losses.append(loss.item())
        losses.append(loss.item())

        # Show the mean loss over the last 500 batches
        progress_bar.set_postfix(train_loss=np.mean(losses[-500:]))

        # Advance by the size of the current batch (not by a stale `x` left over from another cell)
        progress_bar.update(inputs.shape[0])

    progress_bar.close()

    # ---- validation pass ----
    model.eval()

    with torch.no_grad():

        for inp, lab in validation_loader:
            inp, lab = inp.to(device), lab.to(device)

            out = model(inp)

            test_losses.append(criterion(out, lab).item())

            # Convert to numpy explicitly instead of calling np.argmax on a torch tensor
            test_targets.append(lab.cpu().numpy())
            test_pred_class.append(out.argmax(dim=1).cpu().numpy())

    mean_test_loss = np.mean(test_losses)

    test_targets = np.concatenate(test_targets).squeeze()
    test_pred_class = np.concatenate(test_pred_class).squeeze()

    f1 = f1_score(test_targets, test_pred_class, average='micro')

    test_f1.append(f1)

    print()
    print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

    print('F1 test - {:.3f}'.format(f1))

In [None]:
# Predict a class index for every test example with the trained model
model.eval()

predictions = []

with torch.no_grad():
    for x, _ in test_loader:
        x = x.to(device)
        pred = model(x).cpu()
        # the arg-max over the logits is the predicted class
        predictions.append(np.argmax(pred, axis=1))

predictions = np.concatenate(predictions).squeeze()

In [None]:
test['main_category'] = predictions

In [None]:
test = test[['index', 'main_category']]

In [None]:
test.to_csv('submission.csv', index=False)