In [101]:
import os
import nltk
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import softmax, relu
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
debug = False

nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/davide/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [102]:
df_train = pd.read_hdf("data/train.hdf5")
df_test = pd.read_hdf("data/test.hdf5")

len(df_train), len(df_test)

(39798, 10168)

In [103]:
if debug:
    df_train = shuffle_df(df_train)
    df_test = shuffle_df(df_test)
    df_train = df_train.iloc[:int(len(df_train)/ 4)]
    df_test = df_test.iloc[:int(len(df_train)/ 4)]
    
df_train["label"].value_counts(), df_test["label"].value_counts()

(no    14979
 da    14305
 sv    10514
 Name: label, dtype: int64,
 no    3887
 da    3637
 sv    2644
 Name: label, dtype: int64)

In [104]:
def tokenize(line):
    words = nltk.word_tokenize(line)
    tokens = [word for word in words if word.isalnum()]
    return tokens

In [105]:
df_train["data"] = df_train["data"].apply(tokenize)
df_test["data"] = df_test["data"].apply(tokenize)

In [133]:
le = preprocessing.LabelEncoder()
le.fit(df_train["label"].values)
print(le.classes_)
df_train["y"] = le.transform(df_train["label"])
df_test["y"] = le.transform(df_test["label"])

['da' 'no' 'sv']


In [134]:
df_test

Unnamed: 0,data,label,y
0,"[grønlands, politik]",da,0
7,"[ooa, var, en, åben, bevægelse, med, et, lands...",da,0
11,"[i, august, samlede, to, store, atomkraftmarch...",da,0
15,[den],da,0
19,"[november, begyndte, oaa, som, skulle, vise, a...",da,0
...,...,...,...
49944,"[i, en, del, tilfeller, ble, det, oppnådd, ell...",no,1
49945,"[prosessen, for, sosiale, og, politiske, endri...",no,1
49948,"[begrepet, henspiller, ikke, på, en, bestemt, ...",no,1
49949,[den],no,1


In [107]:
# Build vocabulary
vocab = set()
for line in df_train["data"].values:
    vocab.update(set(line))
for line in df_test["data"].values:
    vocab.update(set(line))

# Build a word to index lookup
w2i = {word: i for i, word in enumerate(vocab)}

In [108]:
len(w2i)

109018

In [126]:
class LangModel(nn.Module):
    def __init__(self, vocab_size):
        super(LangModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, 32)

        self.rnn_1 = nn.LSTM(
            input_size=32,
            hidden_size=100,
            num_layers=2,
            bidirectional=True,
            batch_first=False,
        )

        self.l_out = nn.Sequential(
            nn.Linear(400, 200),
            nn.Dropout(0.2),
            nn.ReLU(inplace=True),
            nn.Linear(200, 64),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(64),
            nn.Linear(64, 3),
        )

    def forward(self, x):
        out = {}
        # get embeddings
        x = self.embeddings(x)

        # output, hidden state
        x, _ = self.rnn_1(x)

        x = torch.cat((torch.mean(x, dim=0), torch.max(x, dim=0)[0]), dim=1)

        # classify
        out["out"] = softmax(self.l_out(x), dim=1)
        return out

net = LangModel(len(w2i))
print(net)

LangModel(
  (embeddings): Embedding(109018, 32)
  (rnn_1): LSTM(32, 100, num_layers=2, bidirectional=True)
  (l_out): Sequential(
    (0): Linear(in_features=400, out_features=200, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=200, out_features=64, bias=True)
    (4): ReLU(inplace=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=64, out_features=3, bias=True)
  )
)


In [127]:
def create_input_batch(df_batch):
    # Get indices
    inputs = [[w2i[token] for token in row] for y, row in df_batch["data"].iteritems()]
    
    # Get the longest row
    longest = max([len(row) for row in inputs])

    # Make the rows equal size
    new_inputs = np.empty([len(df_batch), longest])
    for i in range(len(df_batch)):
        if len(inputs[i]) == 0:        
            new_inputs[i] = np.pad(inputs[i], (0, longest - len(inputs[i])), 'constant', constant_values=0)
        else:
            new_inputs[i] = np.pad(inputs[i], (0, longest - len(inputs[i])), 'wrap')

    inp = torch.Tensor(new_inputs.T).long()
    
    return inp

# Shuffle the rows of a pandas data frame
def shuffle_df(df):
    return df.sample(frac=1).reset_index(drop=True)

# Return an iterable over mini-batches
def batchify(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [128]:
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.0005)

In [129]:
accuracy_score(df_test["y"], np.random.randint(3, size=len(df_test)))

0.3268095987411487

In [130]:
def validate(net, df_test, batch_size, epoch):
    batches = batchify(df_test, batch_size)
    net.eval()
    loss = []
    out = []
    for df_batch in batches:
        inp = create_input_batch(df_batch)
        labels = torch.Tensor(df_batch['y'].values).long()
        output = net(inp)
        batch_loss = criterion(output['out'], labels)
        loss.append(batch_loss.item())
        _, pred = torch.max(output['out'].detach(), 1)
        out.extend(pred)
        
    mean_loss = np.mean(loss)
    accuracy = accuracy_score(df_test['y'], out)
        
    print(f"Validation loss after {epoch} epoch: {mean_loss}")
    print(f"Accuracy: {accuracy}")
    
    return mean_loss, accuracy

In [131]:
def save_best_model(net, experiment, accuracy):
    model_path = os.path.join("app", "lang_model", "data", "models", experiment)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
        
    path = os.path.join(model_path, f"{accuracy}_.pt")
    
    current_best = list(filter(lambda x: x.endswith(".pt"), os.listdir(model_path)))
    if len(current_best) == 0:
        torch.save(net.state_dict(), path)
        return True
        
    current_best_acc = float(current_best[0].split("_")[0])
    if accuracy > current_best_acc:
        torch.save(net.state_dict(), path)
        os.remove(os.path.join(model_path, current_best[0]))
        return True
    return False

In [132]:
epochs = 50
batch_size = 56
batches = batchify(df_train, batch_size)
length = sum(1 for x in batches)
experiment = "LSTM"

for epoch in range(epochs):
    print(f"Epoch: {epoch}")
    if epoch == 0:
        validate(net, df_test, batch_size, epoch)
    net.train()
    shuffled_df = shuffle_df(df_train)
    print(len(shuffled_df))
    counter = 0
    batches = batchify(shuffled_df, batch_size)
    
    for df_batch in batches:
        
        inp = create_input_batch(df_batch)
        labels = torch.Tensor(df_batch['y'].values).long()
        optimizer.zero_grad()

        output = net(inp)

        batch_loss = criterion(output['out'], labels)
        batch_loss.backward()
        optimizer.step()
        if(counter % 60 == 0):
            print(f"Iteration:{counter}/{length} loss: {batch_loss.item()}")
            
        
        counter += 1
    _, accuracy = validate(net, df_test, batch_size, epoch)
    saved = save_best_model(net, experiment, accuracy)
    if saved:
        print(f"New best model saved.")

Epoch: 0
Validation loss after 0 epoch: 1.0981844984568083
Accuracy: 0.38227773406766324
39798
Iteration:0/711 loss: 1.0944849252700806
Iteration:60/711 loss: 0.9395195841789246
Iteration:120/711 loss: 0.8047592043876648
Iteration:180/711 loss: 0.7978276014328003
Iteration:240/711 loss: 0.7565897703170776
Iteration:300/711 loss: 0.6723564267158508
Iteration:360/711 loss: 0.7508235573768616
Iteration:420/711 loss: 0.7848092913627625
Iteration:480/711 loss: 0.7111197710037231
Iteration:540/711 loss: 0.7603636980056763
Iteration:600/711 loss: 0.7182220816612244
Iteration:660/711 loss: 0.7094685435295105
Validation loss after 0 epoch: 0.7192801839702732
Accuracy: 0.8258261211644374
New best model saved.
Epoch: 1
39798
Iteration:0/711 loss: 0.6990412473678589
Iteration:60/711 loss: 0.6683153510093689
Iteration:120/711 loss: 0.6749995350837708
Iteration:180/711 loss: 0.7053447961807251
Iteration:240/711 loss: 0.642067551612854
Iteration:300/711 loss: 0.7189406156539917
Iteration:360/711 loss

KeyboardInterrupt: 

In [116]:
import pickle

with open('app/lang_model/data/vocab/vocab.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(vocab, filehandle)