In [18]:
import os
import re
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.sampler import SubsetRandomSampler
from model.classifier import RNNClassifier
from torchsummary import summary
from dataset.chat_dataset import preprocess_data, ChatDataset, create_vocab
from collections import Counter

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Loading and Processing Chat Data

Chat data should be in the dataset folder and named "_chat.txt". First open the text file in VSCode and check if [U+200E] characters are present, if so remove all occurences.

In [19]:
path = os.path.abspath(os.getcwd())
chat_dir = os.path.join(path, "dataset")
sender_indices = preprocess_data(chat_dir)

# 2. Tokenizing Data and Creating Vocabulary

Now that we have preprocessed the data we can create our vocabulary.

In [20]:
vocab, tokenized_data, lines = create_vocab(chat_dir, sender_indices)

# 3. Creating the Dataset


In [24]:
indexed_data = []
for tokens, label in tokenized_data:
    indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
    # the token that is not in vocab get assigned <unk>
    indexed_data.append((indices, label))


In [27]:
combined_data = []

for i in range(len(lines)):
    data = (lines[i], tokenized_data[i][0], indexed_data[i][0], indexed_data[i][1])
    combined_data.append(data)


dataset = ChatDataset(combined_data)

# 4. Creating the DataLoaders

In [28]:
def collate(batch):
    assert isinstance(batch, list)
    data = pad_sequence([b['data'] for b in batch])
    lengths = torch.tensor([len(b['data']) for b in batch])
    label = torch.stack([b['label'] for b in batch])
    return {
        'data': data,
        'label': label,
        'lengths': lengths
    }

In [29]:
batch_size = 16
validation_split = .2
shuffle_dataset = True
random_seed = 42

dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = DataLoader(dataset, batch_size=batch_size, 
                                           sampler=train_sampler, collate_fn=collate)
val_loader = DataLoader(dataset, batch_size=batch_size,
                        sampler=valid_sampler, collate_fn=collate)

# 5. Create and Train Classifier

## Evaluation Metrics

In [30]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [42]:
@torch.no_grad()
def compute_accuracy(model, data_loader):
    corrects = 0
    total = 0
    device = next(model.parameters()).device

    for i, x in enumerate(data_loader):
        input = x['data'].to(device)
        lengths = x['lengths']
        label = x['label'].to(device)
        pred = model(input, lengths)
        pred = torch.argmax(pred, dim = 1)
        corrects += torch.count_nonzero(torch.eq(pred, label))
        total += label.numel()

        if i > 0 and i % 100 == 0:
            print('Step {} / {}'.format(i, len(data_loader)))

    return corrects / total

## Train Model

In [33]:
model_save_path = os.path.join(path, "model", "chat_model.p")

# model = torch.load(model_save_path)
model = RNNClassifier(len(vocab), 75, 32, 10, num_layers=1)

# Move model to the device we are using
model = model.to(device)
gclip = 10

def train(model, optimizer, train_loader, val_loader, loss_func, epochs=10):
    model.train()
    for epoch_id in range(epochs):
        for i, batch in enumerate(train_loader, 1):
            data, labels, lengths = batch['data'].to(
                device), batch['label'].to(device), batch['lengths'].to(device)
            optimizer.zero_grad()
            outputs = model(data, lengths)
            outputs = outputs.to(device)
            loss = loss_func(outputs, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), gclip)
            optimizer.step()
        print(f'Epoch {epoch_id + 1} \t loss: {loss.item()}')

optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
loss_func = nn.CrossEntropyLoss()

In [35]:
train(model, optimizer, train_loader, val_loader, loss_func, epochs=20)

Epoch 1 	 loss: 1.7110021114349365
Epoch 2 	 loss: 1.5741403102874756
Epoch 3 	 loss: 1.617937445640564
Epoch 4 	 loss: 1.7969694137573242
Epoch 5 	 loss: 2.114994764328003
Epoch 6 	 loss: 1.7234978675842285
Epoch 7 	 loss: 2.0141217708587646
Epoch 8 	 loss: 1.7327349185943604
Epoch 9 	 loss: 2.35719895362854
Epoch 10 	 loss: 1.8483942747116089
Epoch 11 	 loss: 1.6028711795806885
Epoch 12 	 loss: 1.6984831094741821
Epoch 13 	 loss: 1.8550447225570679
Epoch 14 	 loss: 1.9911614656448364
Epoch 15 	 loss: 1.4938912391662598
Epoch 16 	 loss: 2.1137077808380127
Epoch 17 	 loss: 1.4977675676345825
Epoch 18 	 loss: 2.008235216140747
Epoch 19 	 loss: 1.6886136531829834
Epoch 20 	 loss: 1.1570345163345337


## Test Model

In [43]:

print("accuracy on test set: {}".format(compute_accuracy(model, val_loader)))

Step 100 / 4995
Step 200 / 4995
Step 300 / 4995
Step 400 / 4995
Step 500 / 4995
Step 600 / 4995
Step 700 / 4995
Step 800 / 4995
Step 900 / 4995
Step 1000 / 4995
Step 1100 / 4995
Step 1200 / 4995
Step 1300 / 4995
Step 1400 / 4995
Step 1500 / 4995
Step 1600 / 4995
Step 1700 / 4995
Step 1800 / 4995
Step 1900 / 4995
Step 2000 / 4995
Step 2100 / 4995
Step 2200 / 4995
Step 2300 / 4995
Step 2400 / 4995
Step 2500 / 4995
Step 2600 / 4995
Step 2700 / 4995
Step 2800 / 4995
Step 2900 / 4995
Step 3000 / 4995
Step 3100 / 4995
Step 3200 / 4995
Step 3300 / 4995
Step 3400 / 4995
Step 3500 / 4995
Step 3600 / 4995
Step 3700 / 4995
Step 3800 / 4995
Step 3900 / 4995
Step 4000 / 4995
Step 4100 / 4995
Step 4200 / 4995
Step 4300 / 4995
Step 4400 / 4995
Step 4500 / 4995
Step 4600 / 4995
Step 4700 / 4995
Step 4800 / 4995
Step 4900 / 4995
accuracy on test set: 0.3561703860759735


In [116]:

torch.save(model, model_save_path)

# Predict Input

In [None]:
from dataset.chat_dataset import tokenize
from time import sleep
model.eval()

pred_indices = {value:key for (key, value) in sender_indices.items()}

text = input("Enter text: ")
tokens = tokenize(text.lower())
indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
sequence = torch.tensor([indices]).permute(1,0).to(device)
pred = model.predict(sequence)
print(f'{pred_indices[pred.item()]}: {text}')