In [101]:
import os
import nltk
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import softmax, relu
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
debug = False

nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/davide/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [102]:
df_train = pd.read_hdf("data/train.hdf5")
df_test = pd.read_hdf("data/test.hdf5")

len(df_train), len(df_test)

(39798, 10168)

In [103]:
if debug:
    df_train = shuffle_df(df_train)
    df_test = shuffle_df(df_test)
    df_train = df_train.iloc[:int(len(df_train)/ 4)]
    df_test = df_test.iloc[:int(len(df_train)/ 4)]
    
df_train["label"].value_counts(), df_test["label"].value_counts()

(no    14979
 da    14305
 sv    10514
 Name: label, dtype: int64,
 no    3887
 da    3637
 sv    2644
 Name: label, dtype: int64)

In [104]:
def tokenize(line):
    words = nltk.word_tokenize(line)
    tokens = [word for word in words if word.isalnum()]
    return tokens

In [105]:
df_train["data"] = df_train["data"].apply(tokenize)
df_test["data"] = df_test["data"].apply(tokenize)

In [106]:
le = preprocessing.LabelEncoder()
le.fit(df_train["label"].values)
le.classes_
df_train["y"] = le.transform(df_train["label"])
df_test["y"] = le.transform(df_test["label"])

In [107]:
# Build vocabulary
vocab = set()
for line in df_train["data"].values:
    vocab.update(set(line))
for line in df_test["data"].values:
    vocab.update(set(line))

# Build a word to index lookup
w2i = {word: i for i, word in enumerate(vocab)}

In [108]:
len(w2i)

109018

In [109]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embeddings = nn.Embedding(len(w2i), 32)
        
        self.rnn_1 = nn.LSTM(input_size=32,
                         hidden_size=100,
                         num_layers=2,
                         bidirectional=True,
                         batch_first=False)
        
        self.l_out = nn.Sequential(
            nn.Linear(400, 200),
            nn.Dropout(.2),
            nn.ReLU(inplace=True),
            nn.Linear(200, 64),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(64),
            nn.Linear(64, 3)
        )
        
    def forward(self, x):
        out = {}
        # get embeddings
        x = self.embeddings(x)
        
        # output, hidden state
        x, hn = self.rnn_1(x)
        
        out['hidden'] = x = torch.cat((torch.mean(x, dim=0), torch.max(x, dim=0)[0]), dim=1)
        
        # classify
        out['out'] = softmax(self.l_out(x), dim=1)
        return out

net = Net()
print(net)

Net(
  (embeddings): Embedding(109018, 32)
  (rnn_1): LSTM(32, 100, num_layers=2, bidirectional=True)
  (l_out): Sequential(
    (0): Linear(in_features=400, out_features=200, bias=True)
    (1): Dropout(p=0.2, inplace=False)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=200, out_features=64, bias=True)
    (4): ReLU(inplace=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=64, out_features=3, bias=True)
  )
)


In [110]:
def create_input_batch(df_batch):
    # Get indices
    inputs = [[w2i[token] for token in row] for y, row in df_batch["data"].iteritems()]
    
    # Get the longest row
    longest = max([len(row) for row in inputs])

    # Make the rows equal size
    new_inputs = np.empty([len(df_batch), longest])
    for i in range(len(df_batch)):
        if len(inputs[i]) == 0:        
            new_inputs[i] = np.pad(inputs[i], (0, longest - len(inputs[i])), 'constant', constant_values=0)
        else:
            new_inputs[i] = np.pad(inputs[i], (0, longest - len(inputs[i])), 'wrap')

    inp = torch.Tensor(new_inputs.T).long()
    
    return inp

# Shuffle the rows of a pandas data frame
def shuffle_df(df):
    return df.sample(frac=1).reset_index(drop=True)

# Return an iterable over mini-batches
def batchify(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [111]:
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.0005)

In [112]:
accuracy_score(df_test["y"], np.random.randint(3, size=len(df_test)))

0.33723446105428795

In [113]:
def validate(net, df_test, batch_size, epoch):
    batches = batchify(df_test, batch_size)
    net.eval()
    loss = []
    out = []
    for df_batch in batches:
        inp = create_input_batch(df_batch)
        labels = torch.Tensor(df_batch['y'].values).long()
        output = net(inp)
        batch_loss = criterion(output['out'], labels)
        loss.append(batch_loss.item())
        _, pred = torch.max(output['out'].detach(), 1)
        out.extend(pred)
        
    mean_loss = np.mean(loss)
    accuracy = accuracy_score(df_test['y'], out)
        
    print(f"Validation loss after {epoch} epoch: {mean_loss}")
    print(f"Accuracy: {accuracy}")
    
    return mean_loss, accuracy

In [114]:
def save_best_model(net, experiment, accuracy):
    model_path = os.path.join("data", "models", experiment)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
        
    path = os.path.join(model_path, f"{accuracy}_.pt")
    
    current_best = list(filter(lambda x: x.endswith(".pt"), os.listdir(model_path)))
    if len(current_best) == 0:
        torch.save(net.state_dict(), path)
        return True
        
    current_best_acc = float(current_best[0].split("_")[0])
    if accuracy > current_best_acc:
        torch.save(net.state_dict(), path)
        os.remove(os.path.join(model_path, current_best[0]))
        return True
    return False

In [115]:
epochs = 50
batch_size = 56
batches = batchify(df_train, batch_size)
length = sum(1 for x in batches)
experiment = "LSTM"

for epoch in range(epochs):
    print(f"Epoch: {epoch}")
    if epoch == 0:
        validate(net, df_test, batch_size, epoch)
    net.train()
    shuffled_df = shuffle_df(df_train)
    print(len(shuffled_df))
    counter = 0
    batches = batchify(shuffled_df, batch_size)
    
    for df_batch in batches:
        
        inp = create_input_batch(df_batch)
        labels = torch.Tensor(df_batch['y'].values).long()
        optimizer.zero_grad()

        output = net(inp)

        batch_loss = criterion(output['out'], labels)
        batch_loss.backward()
        optimizer.step()
        if(counter % 60 == 0):
            print(f"Iteration:{counter}/{length} loss: {batch_loss.item()}")
            
        
        counter += 1
    _, accuracy = validate(net, df_test, batch_size, epoch)
    saved = save_best_model(net, experiment, accuracy)
    if saved:
        print(f"New best model saved.")

Epoch: 0
Validation loss after 0 epoch: 1.0959775428195575
Accuracy: 0.35769079464988196
39798
Iteration:0/711 loss: 1.1030023097991943
Iteration:60/711 loss: 0.8589862585067749
Iteration:120/711 loss: 0.8638518452644348
Iteration:180/711 loss: 0.8414367437362671
Iteration:240/711 loss: 0.7121663093566895
Iteration:300/711 loss: 0.7795535326004028
Iteration:360/711 loss: 0.790208637714386
Iteration:420/711 loss: 0.7073476910591125
Iteration:480/711 loss: 0.7201284170150757
Iteration:540/711 loss: 0.7387074828147888
Iteration:600/711 loss: 0.6602550148963928
Iteration:660/711 loss: 0.6929439306259155
Validation loss after 0 epoch: 0.7344094468997076
Accuracy: 0.8078284815106216
New best model saved.
Epoch: 1
39798
Iteration:0/711 loss: 0.6937600374221802
Iteration:60/711 loss: 0.6893138289451599
Iteration:120/711 loss: 0.6696398854255676
Iteration:180/711 loss: 0.6806906461715698
Iteration:240/711 loss: 0.692054808139801
Iteration:300/711 loss: 0.6445044279098511
Iteration:360/711 loss:

KeyboardInterrupt: 

In [80]:
def generate_batch(batch):
    label = torch.tensor([entry["y"] for _, entry in batch.iterrows()])
    text = [entry["data"] for _, entry in batch.iterrows()]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [78]:
x = df_train.iloc[:3]["data"].values
y = df_train.iloc[:3]["y"].values

batch = [x, y]

In [81]:
generate_batch(df_train.iloc[:3])

TypeError: expected Tensor as element 0 in argument 0, but got list

In [71]:
df_train.iloc[:3]

Unnamed: 0,data,label,y
1,"[politik, i, se, også, kategori, begivenheder, i]",da,0
2,"[politik, i, se, også, kategori, begivenheder, i]",da,0
3,"[organisationen, til, oplysning, om, atomkraft...",da,0
