# Modelling

This notebook contains code for training an LSTM deep neural network for classifying danish, norwegian and swedish language.

In [12]:
import os
import nltk
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import softmax, relu
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

# Set this to true, if testing the pipeline
debug = False

nltk.download("punkt")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


device(type='cuda')

## Data preprocessing and preparation

In [13]:
df_train = pd.read_hdf("app/lang_model/data/train.hdf5")
df_test = pd.read_hdf("app/lang_model/data/test.hdf5")

len(df_train), len(df_test)

(64966, 16190)

In [14]:
# For debugging purposes use only part of the data
if debug:
    df_train = shuffle_df(df_train)
    df_test = shuffle_df(df_test)
    df_train = df_train.iloc[:int(len(df_train)/ 4)]
    df_test = df_test.iloc[:int(len(df_train)/ 4)]
    
df_train["label"].value_counts(), df_test["label"].value_counts()

(da    26153
 no    22866
 sv    15947
 Name: label, dtype: int64,
 da    6425
 no    5839
 sv    3926
 Name: label, dtype: int64)

In [15]:
def tokenize(line):
    words = nltk.word_tokenize(line)
    tokens = [word for word in words if word.isalnum()]
    return tokens

In [16]:
df_train["data"] = df_train["data"].apply(tokenize)
df_test["data"] = df_test["data"].apply(tokenize)

In [17]:
le = preprocessing.LabelEncoder()
le.fit(df_train["label"].values)
print(le.classes_)
df_train["y"] = le.transform(df_train["label"])
df_test["y"] = le.transform(df_test["label"])

['da' 'no' 'sv']


In [18]:
df_test

Unnamed: 0,data,label,y
1,"[på, næste, niveau, er, landet, inddelt, i, di...",da,0
2,"[landsdel, er, i, forbindelse, med, folketings...",da,0
3,"[den, omfatter, østjyllands, storkreds, nordjy...",da,0
4,"[ved, næste, folketingsvalg, vil, den, fordeli...",da,0
7,"[fysiske, egenskaber, beskrives, ofte, som, ob...",da,0
...,...,...,...
81123,"[regional, patterns, of, diversity, and, estim...",no,1
81125,"[and, gaston]",no,1
81137,"[duken, besto, tidligere, av, sammensydde, rei...",no,1
81143,"[lávvu, forveksles, ofte, med, bealljegoahti]",no,1


In [19]:
# Build vocabulary
vocab = set()
for line in df_train["data"].values:
    vocab.update(set(line))
for line in df_test["data"].values:
    vocab.update(set(line))

# Build a word to index lookup
w2i = {word: i for i, word in enumerate(vocab)}

In [20]:
import pickle

with open('app/lang_model/data/vocab/vocab_1.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(vocab, filehandle)

In [21]:
len(w2i)

164657

## Modelling

In [29]:
class LangModel(nn.Module):
    def __init__(self, vocab_size):
        super(LangModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, 64)

        self.rnn_1 = nn.LSTM(
            input_size=64,
            hidden_size=100,
            num_layers=2,
            bidirectional=True,
            batch_first=False,
        )

        self.l_out = nn.Sequential(
            nn.Linear(400, 200),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(200, 64),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(64),
            nn.Linear(64, 3),
        )

    def forward(self, x):
        out = {}
        # get embeddings
        x = self.embeddings(x)

        # output, hidden state
        x, _ = self.rnn_1(x)

        x = torch.cat((torch.mean(x, dim=0), torch.max(x, dim=0)[0]), dim=1)

        # classify
        out["out"] = softmax(self.l_out(x), dim=1)
        return out

net = LangModel(len(w2i)).to(device)
print(net)

LangModel(
  (embeddings): Embedding(164657, 64)
  (rnn_1): LSTM(64, 100, num_layers=2, bidirectional=True)
  (l_out): Sequential(
    (0): Linear(in_features=400, out_features=200, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=200, out_features=64, bias=True)
    (4): ReLU(inplace=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=64, out_features=3, bias=True)
  )
)


In [30]:
def create_input_batch(df_batch):
    # Get indices
    inputs = [[w2i[token] for token in row] for y, row in df_batch["data"].iteritems()]
    
    # Get the longest row
    longest = max([len(row) for row in inputs])

    # Make the rows equal size
    new_inputs = np.empty([len(df_batch), longest])
    for i in range(len(df_batch)):
        if len(inputs[i]) == 0:        
            new_inputs[i] = np.pad(inputs[i], (0, longest - len(inputs[i])), 'constant', constant_values=0)
        else:
            new_inputs[i] = np.pad(inputs[i], (0, longest - len(inputs[i])), 'wrap')

    inp = torch.Tensor(new_inputs.T).to(device).long()
    
    return inp

# Shuffle the rows of a pandas data frame
def shuffle_df(df):
    return df.sample(frac=1).reset_index(drop=True)

# Return an iterable over mini-batches
def batchify(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [31]:
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.0005)

In [32]:
accuracy_score(df_test["y"], np.random.randint(3, size=len(df_test)))

0.3339098208770846

In [33]:
def validate(net, df_test, batch_size, epoch):
    batches = batchify(df_test, batch_size)
    net.eval()
    loss = []
    out = []
    for df_batch in batches:
        inp = create_input_batch(df_batch)
        labels = torch.Tensor(df_batch['y'].values).to(device).long()
        output = net(inp)
        batch_loss = criterion(output['out'], labels)
        loss.append(batch_loss.item())
        _, pred = torch.max(output['out'].detach().cpu(), 1)
        out.extend(pred)
        
    mean_loss = np.mean(loss)
    accuracy = accuracy_score(df_test['y'], out)
        
    print(f"Validation loss after {epoch} epoch: {mean_loss}")
    print(f"Accuracy: {accuracy}")
    
    return mean_loss, accuracy

In [34]:
def save_best_model(net, experiment, accuracy):
    model_path = os.path.join("app", "lang_model", "data", "models", experiment)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
        
    path = os.path.join(model_path, f"{accuracy}_.pt")
    
    current_best = list(filter(lambda x: x.endswith(".pt"), os.listdir(model_path)))
    if len(current_best) == 0:
        torch.save(net.state_dict(), path)
        return True
        
    current_best_acc = float(current_best[0].split("_")[0])
    if accuracy > current_best_acc:
        torch.save(net.state_dict(), path)
        os.remove(os.path.join(model_path, current_best[0]))
        return True
    return False

In [None]:
epochs = 50
batch_size = 72
# Get the training data len
batches = batchify(df_train, batch_size)
length = sum(1 for x in batches)
# Model folder name
experiment = "lstm"

for epoch in range(epochs):
    print(f"Epoch: {epoch}")
    if epoch == 0:
        validate(net, df_test, batch_size, epoch)
    net.train()
    
    # Shuffle and batchify the data
    shuffled_df = shuffle_df(df_train)
    print(len(shuffled_df))
    counter = 0
    batches = batchify(shuffled_df, batch_size)
    
    for df_batch in batches:
        
        inp = create_input_batch(df_batch)
        labels = torch.Tensor(df_batch['y'].values).to(device).long()
        optimizer.zero_grad()

        output = net(inp)

        batch_loss = criterion(output['out'], labels)
        batch_loss.backward()
        optimizer.step()
        # Some informative output
        if(counter % 60 == 0):
            print(f"Iteration:{counter}/{length} loss: {batch_loss.item()}")
        
        counter += 1
        
    _, accuracy = validate(net, df_test, batch_size, epoch)
    # No early stopping needed, as only the best model is saved.
    saved = save_best_model(net, experiment, accuracy)
    if saved:
        print(f"New best model saved.")

Epoch: 0
Validation loss after 0 epoch: 1.0996856360965306
Accuracy: 0.3554663372452131
64966
Iteration:0/903 loss: 1.121610403060913
Iteration:60/903 loss: 0.8345963954925537
Iteration:120/903 loss: 0.80738365650177
Iteration:180/903 loss: 0.7406399846076965
Iteration:240/903 loss: 0.7735495567321777
Iteration:300/903 loss: 0.8141270279884338
Iteration:360/903 loss: 0.7172715067863464
Iteration:420/903 loss: 0.7312549948692322
Iteration:480/903 loss: 0.6257780194282532
Iteration:540/903 loss: 0.7009821534156799
Iteration:600/903 loss: 0.6390631794929504
Iteration:660/903 loss: 0.7216250896453857
Iteration:720/903 loss: 0.704325258731842
Iteration:780/903 loss: 0.7114463448524475
Iteration:840/903 loss: 0.7227805852890015
Iteration:900/903 loss: 0.619522750377655
Validation loss after 0 epoch: 0.6805010329352484
Accuracy: 0.8660901791229154
New best model saved.
Epoch: 1
64966
Iteration:0/903 loss: 0.7116522789001465
Iteration:60/903 loss: 0.6932767033576965
Iteration:120/903 loss: 0.6

Iteration:240/903 loss: 0.5652458667755127
Iteration:300/903 loss: 0.5525014400482178
Iteration:360/903 loss: 0.5691633820533752
Iteration:420/903 loss: 0.5654361844062805
Iteration:480/903 loss: 0.5896544456481934
Iteration:540/903 loss: 0.6049467921257019
Iteration:600/903 loss: 0.5721753239631653
Iteration:660/903 loss: 0.578392744064331
Iteration:720/903 loss: 0.5653485655784607
Iteration:780/903 loss: 0.5955420136451721
Iteration:840/903 loss: 0.6199498772621155
Iteration:900/903 loss: 0.577678918838501
Validation loss after 10 epoch: 0.6391105813450283
Accuracy: 0.9111179740580605
New best model saved.
Epoch: 11
64966
Iteration:0/903 loss: 0.5676463842391968
Iteration:60/903 loss: 0.5722043514251709
Iteration:120/903 loss: 0.5764386653900146
Iteration:180/903 loss: 0.5967780351638794
Iteration:240/903 loss: 0.579742431640625
Iteration:300/903 loss: 0.5953176021575928
Iteration:360/903 loss: 0.6182164549827576
Iteration:420/903 loss: 0.577894389629364
Iteration:480/903 loss: 0.620

Iteration:720/903 loss: 0.579704999923706
Iteration:780/903 loss: 0.5802895426750183
Iteration:840/903 loss: 0.5519291162490845
Iteration:900/903 loss: 0.6093069314956665
