In [1]:
import numpy as np
import pandas as pd
import io, sys, os
from tqdm import tqdm

import torch
import torch.nn as nn
import torchvision

from livelossplot import PlotLosses
from sklearn.metrics import classification_report, f1_score
import torchtext

from jupyterthemes import jtplot
jtplot.style()

# Make the sibling ``helpers`` directory importable. ``os.path.join`` builds the
# path portably; the previous literal '..\helpers' relied on the invalid escape
# sequence '\h' (a DeprecationWarning/SyntaxWarning on recent Pythons) and on
# Windows path separators.
sys.path.insert(0, os.path.join('..', 'helpers'))
# NOTE(review): wildcard import. Names used later in this notebook that are not
# defined here (normalize, Dataset, get_embedding_matrix, get_class_weights,
# pad_sequences) presumably come from this module — confirm against helpers.py.
from helpers import *

# Prefer the first CUDA device when one is available, otherwise run on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

Using TensorFlow backend.


In [2]:
# Read the tab-separated training file, run the helper ``normalize`` over every
# conversation turn, and persist the result for the torchtext pipeline below.
dataset = pd.read_csv('../data/train.txt', sep='\t')
for turn_column in ('turn1', 'turn2', 'turn3'):
    dataset[turn_column] = dataset[turn_column].apply(normalize)
dataset.to_csv('train_norm.csv', index=False)

# The helper ``Dataset`` wrapper is built from the same raw file.
data = Dataset('../data/train.txt')

In [3]:
# Commented-out alternative: fit a Keras Tokenizer over all three turn columns.
# from keras.preprocessing.text import Tokenizer
# tokenizer = Tokenizer(num_words=None, filters='')
# tokenizer.fit_on_texts(dataset['turn1'].tolist() + dataset['turn2'].tolist() + dataset['turn3'].tolist())
# word_index = tokenizer.word_index


# Text field shared by the three turns; tokens are obtained by whitespace splitting.
_text = torchtext.data.Field(
#     include_lengths = True,
#     pad_token = ' ',
    tokenize = lambda x: x.split()
)

# Label field, flagged as the prediction target.
_label = torchtext.data.Field(is_target = True)

# Load the normalised CSV written earlier. Columns are mapped positionally; the
# 'id' column is mapped to None so no field is attached to it, and the header row
# is skipped.
_data = torchtext.data.TabularDataset(
    path = 'train_norm.csv',
    format = 'csv',
    fields = [
        ('id', None),
        ('turn1', _text),
        ('turn2', _text),
        ('turn3', _text),
        ('label', _label)
    ],
    skip_header = True
)

# Build the vocabularies from the loaded examples.
# NOTE(review): specials_first=False presumably keeps the special tokens after the
# real label classes so the classes get the lowest indices — TODO confirm.
_label.build_vocab(_data, specials_first=False)
_text.build_vocab(_data)

In [4]:
# Build two embedding matrices keyed by the torchtext vocabulary (token -> index),
# one per vector file, and cast them to float32.
# NOTE(review): get_embedding_matrix comes from helpers; the trailing 100 / 300
# arguments are presumably the vector dimensions — TODO confirm.
custom_embeddings = get_embedding_matrix(dict(_text.vocab.stoi), 'custom_vectors_V2.5_2500.txt', 100).float()
fasttext_embeddings = get_embedding_matrix(dict(_text.vocab.stoi), 'vectors.txt', 300).float()

batch_size = 128
# 80/20 split of the torchtext dataset.
train, test = _data.split(split_ratio=0.8)
# NOTE(review): torchtext.data.batch yields chunks lazily, so this object can only
# be iterated once. The names train, test and train_batches are also rebound by the
# torch.utils.data cell further down.
train_batches = torchtext.data.batch(
    train,
    batch_size = batch_size,
)

no custom vector for  <unk>
no custom vector for  <pad>
no custom vector for  <unk>
no custom vector for  <pad>


In [5]:
# data, turn1_len, turn2_len, turn3_len = make_tensors(dataset)

In [6]:
class Model(nn.Module):
    """Three-turn conversation classifier.

    Each turn is embedded with two trainable pretrained embedding tables
    (concatenated along the feature axis), encoded by its own 2-layer
    bidirectional LSTM, and the three encodings are concatenated and passed
    through a small feed-forward head that emits 4 class logits.

    Args:
        fasttext_matrix: 2-D float tensor of pretrained vectors, one row per
            vocabulary index.
        custom_matrix: 2-D float tensor of pretrained vectors with the same
            number of rows (the same indices are looked up in both tables).
        padding_idx: optional index whose embedding row receives no gradient.
            Defaults to ``None``, which matches what the previous code actually
            did: it passed ``padding_idx=0`` to a throwaway ``nn.Embedding`` and
            then called the ``from_pretrained`` classmethod on it, which builds
            a brand-new layer without any padding index.
    """

    def __init__(self, fasttext_matrix, custom_matrix, padding_idx=None):
        super(Model, self).__init__()

        # Build the layers directly from the pretrained weights; freeze=False
        # keeps them trainable. No randomly initialised table is allocated first.
        self.fasttext = nn.Embedding.from_pretrained(
            fasttext_matrix, freeze=False, padding_idx=padding_idx
        )
        self.custom = nn.Embedding.from_pretrained(
            custom_matrix, freeze=False, padding_idx=padding_idx
        )

        # Width of one token after concatenating both embeddings.
        embedding_dim = fasttext_matrix.shape[1] + custom_matrix.shape[1]
        hidden_dim = embedding_dim * 2

        def new_lstm():
            """Create one per-turn encoder (2-layer bidirectional LSTM, batch first)."""
            return nn.LSTM(
                embedding_dim,
                hidden_dim,
                num_layers=2,
                dropout=0.5,
                bidirectional=True,
                batch_first=True,
            )

        self.bi_lstm1 = new_lstm()
        self.bi_lstm2 = new_lstm()
        self.bi_lstm3 = new_lstm()

        # Each encoder contributes 2 * hidden_dim = 4 * embedding_dim features,
        # so the three concatenated turns give 12 * embedding_dim inputs.
        self.final = nn.Sequential(
            nn.Linear(embedding_dim * 12, embedding_dim * 3),
            nn.ReLU(inplace=True),
            nn.Linear(embedding_dim * 3, 4),
        )

    def _embed(self, turn):
        """Look ``turn`` up in both tables and concatenate along the feature axis."""
        return torch.cat((self.fasttext(turn), self.custom(turn)), 2)

    def forward(self, turn1, turn2, turn3):
        """Return class logits of shape (batch, 4).

        Args:
            turn1, turn2, turn3: integer index tensors of shape (batch, seq_len);
                the three turns may have different sequence lengths.
        """
        encoders = (self.bi_lstm1, self.bi_lstm2, self.bi_lstm3)
        turns = (turn1, turn2, turn3)

        # Keep the LSTM output sequence ([0]) and read its last time step.
        # NOTE(review): at the last time step the top layer's backward direction
        # has taken a single step; consider using the final hidden states (h_n)
        # if a full backward pass summary is wanted.
        encoded = [
            lstm(self._embed(turn))[0][:, -1, :]
            for lstm, turn in zip(encoders, turns)
        ]

        return self.final(torch.cat(encoded, 1))

In [7]:
# Instantiate the model and move it to the target device *before* creating the
# optimizer, as the torch.optim documentation recommends, so the optimizer is
# constructed over the parameters in their final location.
model = Model(fasttext_embeddings, custom_embeddings).to(device)

# Cross-entropy with per-class weights computed by the helper from the label
# column of the wrapped dataset.
loss = nn.CrossEntropyLoss(weight=get_class_weights(data.data['label']).to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, amsgrad=True)

In [8]:
# 80/20 random split of the helper dataset. The test size is derived by
# subtraction so the two lengths always sum to len(data); truncating both
# int(0.8 * n) and int(0.2 * n) can leave the sum short of n, which makes
# random_split raise.
n_train = int(0.8 * len(data))
n_test = len(data) - n_train
train, test = torch.utils.data.random_split(data, (n_train, n_test))

train_batches = torch.utils.data.DataLoader(
    train,
    batch_size = 48,
    shuffle = True,
    pin_memory = True,
    num_workers = 1
)

# Evaluation does not need shuffling; a fixed order keeps runs comparable.
test_batches = torch.utils.data.DataLoader(
    test,
    batch_size = 48,
    shuffle = False,
    pin_memory = True,
    num_workers = 1
)

model.to(device)
live_loss = PlotLosses()

In [9]:
def _encode_batch(batch):
    """Turn one DataLoader batch into model-ready tensors on ``device``.

    Assumes the layout used by the original training loop: ``batch[1]``,
    ``batch[2]`` and ``batch[3]`` hold the raw strings of the three turns and
    ``batch[4]`` the label strings — TODO confirm against ``Dataset.__getitem__``
    in helpers.

    Returns:
        (t1, t2, t3, y): three padded LongTensors of token ids, one per turn,
        and a 1-D LongTensor of class indices.
    """
    def to_padded_ids(texts):
        # Tokenise to id sequences, pad to a common length, stack into one tensor.
        sequences = data.tokenizer.texts_to_sequences(texts)
        return torch.stack([torch.tensor(x).long() for x in pad_sequences(sequences)]).to(device)

    # Shift the label ids down by one, as the original loop did (presumably the
    # label tokenizer's word_index is 1-based).
    labels = [data.label_tokenizer.word_index[x] - 1 for x in batch[4]]
    y = torch.tensor(labels).long().to(device)
    return to_padded_ids(batch[1]), to_padded_ids(batch[2]), to_padded_ids(batch[3]), y


for epoch in range(30):
    # ---- training pass ----
    model.train()
    train_errors = []
    for batch in tqdm(train_batches):
        t1, t2, t3, y = _encode_batch(batch)

        pred = model(t1, t2, t3)
        error = loss(pred, y)

        optimizer.zero_grad()
        error.backward()
        optimizer.step()
        train_errors.append(error.item())

    # ---- evaluation pass ----
    # A single pass over the test set collects both the loss and the predictions.
    # eval() disables dropout and no_grad() avoids building the autograd graph.
    model.eval()
    test_errors = []
    test_predictions = []
    test_actual = []
    with torch.no_grad():
        for batch in test_batches:
            t1, t2, t3, y = _encode_batch(batch)

            pred = model(t1, t2, t3)
            test_errors.append(loss(pred, y).item())
            test_predictions.append(torch.argmax(pred, dim=1).cpu().numpy())
            test_actual.append(y.cpu().numpy())

    # Report plain Python floats; a train-set F1 could be added the same way by
    # running the evaluation pass over train_batches.
    live_loss.update({
        'train_loss': float(np.mean(train_errors)),
        'test_loss': float(np.mean(test_errors)),
        'test_f1': f1_score(np.concatenate(test_actual), np.concatenate(test_predictions), average='weighted'),
    })
    live_loss.draw()

  0%|                                                                                                                                            | 0/503 [00:00<?, ?it/s]

TypeError: Traceback (most recent call last):
  File "c:\users\dhruv\appdata\local\programs\python\python36\lib\site-packages\torch\utils\data\dataloader.py", line 138, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "c:\users\dhruv\appdata\local\programs\python\python36\lib\site-packages\torch\utils\data\dataloader.py", line 138, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "c:\users\dhruv\appdata\local\programs\python\python36\lib\site-packages\torch\utils\data\dataset.py", line 103, in __getitem__
    return self.dataset[self.indices[idx]]
  File "..\helpers\helpers.py", line 181, in __getitem__
    return self.data[self.data['id']==index].values[0].tolist()
  File "c:\users\dhruv\appdata\local\programs\python\python36\lib\site-packages\pandas\core\ops.py", line 1283, in wrapper
    res = na_op(values, other)
  File "c:\users\dhruv\appdata\local\programs\python\python36\lib\site-packages\pandas\core\ops.py", line 1169, in na_op
    raise TypeError("invalid type comparison")
TypeError: invalid type comparison


In [None]:
# Scratch cell: pull one batch and slice its first element column-wise into the
# three turns followed by a single label column.
# NOTE(review): turn1_len / turn2_len / turn3_len are not assigned anywhere in the
# visible notebook (the make_tensors call that would produce them is commented
# out) — confirm they exist before running this cell.
batch = next(iter(train_batches))
t1 = batch[0].narrow(1, 0, turn1_len).to(device)
t2 = batch[0].narrow(1, turn1_len, turn2_len).to(device)
t3 = batch[0].narrow(1, turn1_len + turn2_len, turn3_len).to(device)

# Label: the one column immediately after the three turn slices.
y = batch[0].narrow(1, turn1_len + turn2_len + turn3_len, 1).to(device)

In [None]:
# Scratch cell: run turn 1 through both embedding tables, concatenate on the
# feature axis and feed the result to the first LSTM.
# Note that _t2 is the *custom* embedding of t1, not an embedding of turn 2, and
# that assigning to `_` shadows IPython's last-output variable.
_t1 = model.fasttext(t1)
_t2 = model.custom(t1)
_ = torch.cat((_t1, _t2), dim=2)
model.bi_lstm1(_.to(device))

In [None]:
# Display the fasttext embedding of turn 1.
_t1

In [None]:
# Display the custom embedding of turn 1.
_t2

In [None]:
# Display the concatenated embedding.
_

In [None]:
# Scratch cell: replace the batch with random integer ids in [0, 100) of shape
# (32, 240) and exercise the embedding + LSTM stack on synthetic input.
batch[0] = (torch.rand((32, 240)) * 100).long()

# NOTE(review): turn1_len / turn2_len / turn3_len are not assigned in the visible
# notebook — confirm they exist before running.
t1 = batch[0].narrow(1, 0, turn1_len).to(device)
t2 = batch[0].narrow(1, turn1_len, turn2_len).to(device)
t3 = batch[0].narrow(1, turn1_len + turn2_len, turn3_len).to(device)

y = batch[0].narrow(1, turn1_len + turn2_len + turn3_len, 1).to(device)
# Stand-alone embedding layers whose weights are immediately replaced by the
# pretrained matrices. NOTE(review): 16366 is presumably the vocabulary size —
# TODO confirm it matches the matrices' first dimension.
em1 = nn.Embedding(16366, 300)
em1.weight = nn.Parameter(fasttext_embeddings.float())
em1.to(device)
em2 = nn.Embedding(16366, 100)
em2.weight = nn.Parameter(custom_embeddings)
em2.to(device)
# Embed random ids of shape (32, 40) with each table and concatenate the features.
_t1 = em1((torch.rand((32, 40)) * 100).long().to(device))
_t2 = em2((torch.rand((32, 40)) * 100).long().to(device))
_ = torch.cat((_t1, _t2), dim=2)
# Run the same input through all three encoders, keeping the first element of
# each LSTM's return value.
l1 = model.bi_lstm1(_.to(device))[0]
l2 = model.bi_lstm2(_.to(device))[0]
l3 = model.bi_lstm3(_.to(device))[0]

In [None]:
# Display the label tensor flattened to one dimension.
y.view(-1)

In [None]:
# Display the class weights the helper computes from the label column.
get_class_weights(dataset['label'])

In [None]:
# Dump the fasttext embedding matrix to CSV.
# NOTE(review): numpy is already imported in the first cell; this import is redundant.
import numpy as np
np.savetxt("fasttext.csv", fasttext_embeddings.numpy(), delimiter=",")


In [None]:
# Display the first row among those whose id is at most 5.
dataset[ dataset['id'] <= 5].values[0]

In [None]:
# Scratch cell: rebind train_batches to a loader over the *entire* dataset
# (`data`, not the `train` split) using 8 worker processes.
# NOTE(review): running this after the split cell means later training would also
# see the held-out examples — confirm this is intended.
train_batches = torch.utils.data.DataLoader(
    data,
    batch_size = 48,
    shuffle = True,
    pin_memory = True,
    num_workers = 8
)

In [12]:
# Numericalise turn1 of the first example in the first batch with the text field.
# NOTE(review): this reads a `.turn1` attribute, so it presumably expects
# train_batches to be the torchtext batching built earlier rather than a
# torch DataLoader — confirm which binding is live when running.
_text.numericalize([next(iter(train_batches))[0].turn1]).squeeze()

tensor([ 88, 700,  76, 152])

In [None]:
# Display the id sequence the helper dataset's tokenizer produces for a sample phrase.
data.tokenizer.texts_to_sequences(['all the best'])[0]

In [None]:
# Display row 0 of the custom embedding matrix.
custom_embeddings[0]

In [9]:
# Check that the first batch is iterable (the result is only an iterator object).
iter((next(iter(train_batches))))

<list_iterator at 0x1d3d4e2ae80>