In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['imdb_master.csv']


In [2]:
import nltk
from collections import Counter
import itertools

In [3]:
import torch

Рассмотрим преобразование текстов в удобоваримый для нейронной сети вид.<br>
А именно:<br>
    - Текст разбивается на слова (токенизация, знаки препинания считаются словами)<br>
    - Слова подсчитываются для формирования ограниченного словаря. Каждому слову сопоставляется определеннный номер в словаре. 
    Редким словам назначается специальный номер (эквивалентно замене редких слов на спец. слово <UNK> (неизвестное слово)). 
    - Последовательности слов преобразуются в последовательности номеров слов. Добавляются спец. слова для обозначения начала и конца текста. 
    - Полученные последовательности выравниваются по заданной максимальной длине через обрезание или дополнение номером спец.символа <PAD>


Класс для хранения текста в виде последовательности номеров и его закодированной метки

In [4]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, label_id):
        self.input_ids = input_ids
        self.label_id = label_id

Класс словаря. Метод word2id возвращает номер слова, id2word - наоборот, восстанавливает слово.

In [5]:
class Vocab:
    def __init__(self, itos, unk_index):
        self._itos = itos
        self._stoi = {word:i for i, word in enumerate(itos)}
        self._unk_index = unk_index
        
    def __len__(self):
        return len(self._itos)
    
    def word2id(self, word):
        idx = self._stoi.get(word)
        if idx is not None:
            return idx
        return self._unk_index
    
    def id2word(self, idx):
        return self._itos[idx]

In [6]:
from tqdm import tqdm_notebook

Интерфейс объекта, преобразующего тексты в последовательности номеров.
transform выполняет преобразование при помощи словаря.
fit_transform выучивает словарь из текста и возвращает такое же преобразование при помощи свежеполученного словаря.

In [7]:
class TextToIdsTransformer:
    def transform():
        raise NotImplementedError()
        
    def fit_transform():
        raise NotImplementedError()



Простая реализация данного интерфейса. Разбиение на слова производится с помощью библиотеки NLTK.
В словаре содержатся несколько спец. слов.
После токенизации, к полученной последовательности слов добавляются слева и справа спец. слова для начала и конца текста.

In [8]:
class SimpleTextTransformer(TextToIdsTransformer):
    def __init__(self, max_vocab_size):
        self.special_words = ['<PAD>', '</UNK>', '<S>', '</S>']
        self.unk_index = 1
        self.pad_index = 0
        self.vocab = None
        self.max_vocab_size = max_vocab_size
        
    def tokenize(self, text):
        return nltk.tokenize.word_tokenize(text.lower())
        
    def build_vocab(self, tokens):
        itos = []
        itos.extend(self.special_words)
        
        token_counts = Counter(tokens)
        for word, _ in token_counts.most_common(self.max_vocab_size - len(self.special_words)):
            itos.append(word)
            
        self.vocab = Vocab(itos, self.unk_index)
    
    def transform(self, texts):
        result = []
        for text in texts:
            tokens = ['<S>'] + self.tokenize(text) + ['</S>']
            ids = [self.vocab.word2id(token) for token in tokens]
            result.append(ids)
        return result
    
    def fit_transform(self, texts):
        result = []
        tokenized_texts = [self.tokenize(text) for text in texts]
        self.build_vocab(itertools.chain(*tokenized_texts))
        for tokens in tokenized_texts:
            tokens = ['<S>'] + tokens + ['</S>']
            ids = [self.vocab.word2id(token) for token in tokens]
            result.append(ids)
        return result

Строим экземпляр входных данных. Обеспечиваем длину последовательности номеров равной max_seq_len. 

In [9]:
def build_features(token_ids, label, max_seq_len, pad_index, label_encoding):
    if len(token_ids) >= max_seq_len:
        ids = token_ids[:max_seq_len]
    else:
        ids = token_ids + [pad_index for _ in range(max_seq_len - len(token_ids))]
    return InputFeatures(ids, label_encoding[label])
        

Собираем экземпляры в тензоры

In [10]:
def features_to_tensor(list_of_features):
    text_tensor = torch.tensor([example.input_ids for example in list_of_features], dtype=torch.long)
    labels_tensor = torch.tensor([example.label_id for example in list_of_features], dtype=torch.long)
    return text_tensor, labels_tensor

In [11]:
from sklearn import model_selection

In [12]:
imdb_df = pd.read_csv('../input/imdb_master.csv', encoding='latin-1')
dev_df = imdb_df[(imdb_df.type == 'train') & (imdb_df.label != 'unsup')]
test_df = imdb_df[(imdb_df.type == 'test')]
train_df, val_df = model_selection.train_test_split(dev_df, test_size=0.05, stratify=dev_df.label)

In [13]:
max_seq_len=200
classes = {'neg': 0, 'pos' : 1}

In [14]:
text2id = SimpleTextTransformer(10000)

train_ids = text2id.fit_transform(train_df['review'])
val_ids = text2id.transform(val_df['review'])
test_ids = text2id.transform(test_df['review'])

In [15]:
print(train_df.review.iloc[0][:160])
print(train_ids[0][:30])

I was seriously looking forward to seeing this film because it seemed truly promising from the coming attractions: Jim Carrey with Godlike powers was an idea th
[2, 18, 22, 627, 298, 966, 10, 325, 19, 28, 101, 16, 463, 374, 2466, 50, 4, 590, 1, 90, 1146, 3304, 24, 1, 1693, 22, 48, 333, 20, 109]


In [16]:
train_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(train_ids, train_df['label'])]

val_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(val_ids, val_df['label'])]

test_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(test_ids, test_df['label'])]

In [17]:
print(train_features[3].input_ids)

[2, 827, 4, 260, 5544, 50, 2332, 330, 441, 7, 1164, 272, 17, 42, 1175, 901, 26, 6, 169, 5, 827, 20, 45, 4, 497, 17, 165, 106, 5544, 11, 4055, 7, 2877, 24, 2910, 6, 169, 827, 162, 1448, 229, 7109, 14, 15, 12, 13, 14, 15, 12, 13, 195, 20, 53, 14, 15, 12, 13, 14, 15, 12, 13, 687, 5, 169, 160, 84, 4, 2757, 239, 26, 6, 20, 1717, 31, 56, 81, 100, 315, 52, 4, 1457, 170, 141, 6, 1, 11, 4, 693, 260, 26, 18, 158, 141, 130, 6, 54, 85, 8, 184, 701, 950, 76, 69, 184, 6, 27, 16, 22, 840, 6, 71, 61, 4, 3674, 9, 4, 26, 22, 10, 41, 840, 5, 16, 22, 118, 840, 10, 41, 4203, 14, 15, 12, 13, 14, 15, 12, 13, 4, 78, 290, 18, 164, 36, 1611, 25, 3083, 72, 79, 166, 19, 22, 286, 339, 18, 140, 1196, 16, 6, 38, 1040, 39, 300, 6, 6889, 1449, 60, 33, 439, 57, 531, 244, 339, 7, 16, 21, 36, 856, 154, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [18]:
train_tensor, train_labels = features_to_tensor(train_features)
val_tensor, val_labels = features_to_tensor(val_features)
test_tensor, test_labels = features_to_tensor(test_features)

In [19]:
print(train_tensor.size())

torch.Size([23750, 200])


In [20]:
print(len(text2id.vocab))

10000


In [25]:
from torch.utils.data import TensorDataset,DataLoader
import torch.nn as nn
import torch.nn.functional as F
import math
test_loader = DataLoader(TensorDataset(test_tensor,test_labels),batch_size=64)
train_loader = DataLoader(TensorDataset(train_tensor,train_labels),batch_size=64,shuffle = True)
val_loader = DataLoader(TensorDataset(val_tensor,val_labels),batch_size=64)

In [26]:
for xx,yy in train_loader:
    print(xx)
    print(yy)
    break

tensor([[   2,   18,   22,  ..., 1930, 8035,    5],
        [   2, 1217,    5,  ...,    0,    0,    0],
        [   2,   18,  158,  ...,    0,    0,    0],
        ...,
        [   2,  273,   54,  ...,   35, 4130,   34],
        [   2,   42,   96,  ...,   42,    6,   16],
        [   2,   18,  118,  ...,    0,    0,    0]])
tensor([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1])


In [116]:
class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.layer0 = nn.Embedding(10000,50)
        self.layer1 = nn.Sequential(
            nn.Conv1d(in_channels=50, out_channels=100, kernel_size=3,padding=2),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Conv1d(in_channels=100, out_channels=100, kernel_size=3,padding=2),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Conv1d(in_channels=100, out_channels=120, kernel_size=3,padding=2),
            nn.ReLU(),
            nn.MaxPool1d(4),)
        
        self.layer2 = nn.Sequential(
            nn.Linear(720,1), nn.Sigmoid()
        )
    def forward(self, x):
        x = self.layer0(x)
        x = x.transpose(1,2)
        y = self.layer2(self.layer1(x).view(x.size(0), -1))
        return y

In [117]:
def batch_res(y_pred_all,output):
    for i in output:
        if i>0.5:
            y_pred_all.append(1)
        else:
            y_pred_all.append(0)
def fit(model, train_dl,val_dl, lr, epoches,tolerance):
    model.cuda()
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_loss = 100
    cur_tol = tolerance
    for epoche in range(epoches):
        ep_loss = 0
        for xx,yy in train_dl:
            xx,yy = xx.cuda(), yy.cuda()
            optimizer.zero_grad()
            y_pred = model(xx)
            loss = criterion(y_pred, yy.float())
            loss.backward()
            ep_loss+=loss.item()
            optimizer.step()
        with torch.no_grad():
            val_loss=0
            for xx,yy in val_dl:
                xx,yy = xx.cuda(), yy.cuda()
                y_pred = model(xx)
                loss = criterion(y_pred, yy.float())
                val_loss+=loss.item()
            val_loss/=len(val_dl)
            if best_loss>= val_loss:
                best_loss = val_loss
                cur_tol = tolerance
                torch.save(model.state_dict(), "..\bestmodel.mod")
            else:
                cur_tol -= 1
            if cur_tol==0:
                model.load_state_dict(torch.load("..\bestmodel.mod"))
                break
        print(epoche,"    Loss: {}".format(ep_loss/len(train_dl)), "---->Val loss: {}".format(val_loss))
    print("Best loss is ", best_loss)
    print("Stop train.")
    model.cpu()

In [118]:
net = NN()
fit(net,train_loader,val_loader,0.005,15,10)

  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))


0     Loss: 0.5119494117235625 ---->Val loss: 0.3479331016540527
1     Loss: 0.28308007721939393 ---->Val loss: 0.3333878993988037
2     Loss: 0.20783451081363744 ---->Val loss: 0.37650136426091196
3     Loss: 0.17892269694036053 ---->Val loss: 0.41734743416309356
4     Loss: 0.1534567443824183 ---->Val loss: 0.8312504947185516
5     Loss: 0.09580146791713853 ---->Val loss: 1.1243581086397172
6     Loss: 0.10376980108639566 ---->Val loss: 0.6242071568965912
7     Loss: 0.10075629055399889 ---->Val loss: 0.7970703899860382
8     Loss: 0.06316325119616757 ---->Val loss: 0.9806081369519234
9     Loss: 0.041872288351402845 ---->Val loss: 0.9513038605451584
10     Loss: 0.03286363384300315 ---->Val loss: 1.1349822551012039
Best loss is  0.3333878993988037
Stop train.


In [119]:
net.load_state_dict(torch.load("..\bestmodel.mod"))
y_true = []
y_pred = []
for xx,yy in test_loader:
    net.cuda()
    xx,yy = xx.cuda(), yy.cuda()
    out = net(xx)
    batch_res(y_pred,out)
    y_true.extend(yy.tolist())
net.cpu()
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))
#print(confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85     12500
           1       0.86      0.83      0.85     12500

   micro avg       0.85      0.85      0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

