In [4]:
import torch.nn as nn
import torch
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm
import numpy as np
import pickle

In [5]:
os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/Datasets/")
filename = "EuroparlNutidsr_trainset_verbs.csv"
print("Loading df...")
df = pd.read_csv(filename, encoding="UTF-8", sep=";")
pos = list(df["comment_text"].values)
labels = list(df["label"].values)

X_train, X_test, y_train, y_test = train_test_split(pos, labels, test_size=0.1, random_state=42)

unique_pos = ['NOUN','PUNCT','VERB','PRON','NUM','ADP','X','<PAD>','CCONJ','PROPN','AUX','SCONJ','INTJ','ADV','ADJ','PART','SYM','DET']

Loading df...


In [18]:
class NutidsrTokenizer():
    def __init__(self) -> None:
        print("Initializing tokenizer...")
        self.tokenize_table = {x: i for i, x in enumerate(unique_pos)}
        print("Tokenizer initialized.")

    def __call__(self, pos_list):
        for pos_string in tqdm(pos_list):
            splitted_pos = pos_string.split()
            numbers = [self.tokenize_table[x] for x in splitted_pos]
            yield [y for x in numbers for y in self._one_hot_encode(x)]
    
    def _one_hot_encode(self, number):
        return [1 if i == number else 0 for i in range(18)]

class NutidsrModel(nn.Module):
    def __init__(self) -> None:
        super(NutidsrModel, self).__init__()
        number_of_unique_pos = 18
        number_of_pos_including_padding = 21
        self.l1 = nn.Linear(number_of_pos_including_padding*number_of_unique_pos, 512)
        self.l2 = nn.Linear(512, 512)
        self.l3 = nn.Linear(512, 256)
        self.l4 = nn.Linear(256, 128)
        self.l5 = nn.Linear(128, 32)
        self.l6 = nn.Linear(32, 1)
        self.activation = nn.LeakyReLU(0.2)
        self.sigmoid = nn.Sigmoid()
        print(self.l1.weight.dtype)

    def forward(self, x):
        x = self.l1(x)
        x = self.activation(x)
        x = self.l2(x)
        x = self.activation(x)
        x = self.l3(x)
        x = self.activation(x)
        x = self.l4(x)
        x = self.sigmoid(x)
        x = self.l5(x)
        x = self.sigmoid(x)
        x = self.l6(x)
        x = self.sigmoid(x)
        return x

In [19]:
tokenizer = NutidsrTokenizer()
print("Tokenizing train...")
x_train_tokenized = list(tokenizer(X_train))
print("Tokenizing test...")
x_test_tokenized = list(tokenizer(X_test))
print("Done Tokenizing.")

Initializing tokenizer...
Tokenizer initialized.
Tokenizing train...


100%|██████████| 1716972/1716972 [01:27<00:00, 19601.62it/s]


Tokenizing test...


100%|██████████| 190775/190775 [00:06<00:00, 29603.96it/s]


Done Tokenizing.


In [32]:
EPOCHS, BATCH_SIZE = 3, 32

model = NutidsrModel()
optimizer = torch.optim.Adam(model.parameters(),lr=0.0002, betas=(0.5, 0.999))
device = "cpu"
loss_fn = nn.MSELoss()
torch.device(device)
model.to(device)
n_steps = len(x_train_tokenized) // BATCH_SIZE
eval_steps = len(x_test_tokenized) // BATCH_SIZE

print("Training model...")
print("INFO: Epochs: ", EPOCHS, ". Batch size: ", BATCH_SIZE, ". Steps pr. epoch: ", n_steps, ". Eval steps: ", eval_steps)
print("Total training steps: ", n_steps*EPOCHS)
print("Number of parameters: ", sum([len(param) for param in model.parameters()]))

torch.float32
Training model...
INFO: Epochs:  3 . Batch size:  32 . Steps pr. epoch:  53655 . Eval steps:  5961
Total training steps:  160965
Number of parameters:  2882


In [33]:
def get_batch(x, y):
    indexes = random.sample(range(len(x)), BATCH_SIZE)
    x_batch = [x[i] for i in indexes]
    y_batch = [y[i] for i in indexes]
    return x_batch, y_batch

def train_model(xb, yb):
    yb = torch.tensor(yb, dtype=torch.float32)
    optimizer.zero_grad()
    xb = torch.tensor(xb, dtype=torch.float32)
    output = model.forward(xb)
    loss = loss_fn(output, yb)
    loss.backward()
    optimizer.step()
    return loss

def test_model(eval_accuracies):
    accuracies = []
    for _ in tqdm(range(eval_steps)):
        xb, yb = get_batch(x_test_tokenized, y_test)
        xb = torch.tensor(xb, dtype=torch.float32)
        output = list(model.forward(xb))
        accuracy = sum([1 if to_binary(o) == y else 0 for (o,y) in zip(output, yb)])/len(output)
        accuracies.append(accuracy)
        del xb, yb, output, accuracy
    eval_accuracies.append(round(sum(accuracies)/len(accuracies)*100, 2))
    print("Eval accuracy: ", round(sum(accuracies)/len(accuracies)*100, 2), "%")
    return eval_accuracies

def to_binary(o):
    return 1 if float(o) > 0.5 else 0

def save_model(epoch):
    os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/FineTuneModels/SimpleNutidsrNN/")
    os.makedirs("simpleNNmodelsPT", exist_ok=True)
    torch.save(model.state_dict(), f"simpleNNmodelsPT/model_{epoch}.pt")

In [39]:
eval_accuracy = []
losses = []

print("Accuracy before training: ")
eval_accuracy = test_model(eval_accuracy)
save_model(0)

Accuracy before training: 


100%|██████████| 5961/5961 [00:08<00:00, 730.48it/s]

Eval accuracy:  57.34 %





In [41]:
for epoch in range(EPOCHS):
    temp_losses = []
    model.train()
    for i in tqdm(range(n_steps)):
        xb, yb = get_batch(x_train_tokenized, y_train)
        xb = torch.tensor(xb)
        loss = train_model(xb, yb)
        temp_losses.append(float(loss))
        if i % 3000 == 0:
            if len(temp_losses) > 0:
                losses.append(sum(temp_losses)/len(temp_losses))
                print(f"Loss at step {i}: {sum(temp_losses)/len(temp_losses)}")
            temp_losses = []
    
    print(f"Done with Epoch {epoch}")
    print("Evaluating...")
    model.eval()

    eval_accuracy = test_model(eval_accuracy)
    save_model(epoch+1)

  xb = torch.tensor(xb, dtype=torch.float32)
  return F.mse_loss(input, target, reduction=self.reduction)
  0%|          | 37/53655 [00:00<04:50, 184.36it/s]

Loss at step 0: 0.2540414035320282


  6%|▌         | 3025/53655 [00:16<04:11, 201.43it/s]

Loss at step 3000: 0.245047217592597


 11%|█         | 6032/53655 [00:30<03:52, 204.73it/s]

Loss at step 6000: 0.24456610860427222


 17%|█▋        | 9024/53655 [00:45<03:35, 207.08it/s]

Loss at step 9000: 0.24471970610320568


 22%|██▏       | 12031/53655 [01:00<03:20, 207.47it/s]

Loss at step 12000: 0.2444859666377306


 28%|██▊       | 15031/53655 [01:14<03:06, 207.39it/s]

Loss at step 15000: 0.24474409765998523


 34%|███▎      | 18042/53655 [01:29<02:49, 209.63it/s]

Loss at step 18000: 0.24494847591221333


 39%|███▉      | 21026/53655 [01:43<02:39, 204.59it/s]

Loss at step 21000: 0.24471258981029193


 45%|████▍     | 24036/53655 [01:58<02:24, 205.37it/s]

Loss at step 24000: 0.24476888142029443


 50%|█████     | 27040/53655 [02:12<02:13, 198.67it/s]

Loss at step 27000: 0.24439825842281182


 56%|█████▌    | 30037/53655 [02:26<01:57, 200.52it/s]

Loss at step 30000: 0.24475849476456643


 62%|██████▏   | 33037/53655 [02:41<01:37, 212.11it/s]

Loss at step 33000: 0.24494650533795356


 67%|██████▋   | 36030/53655 [02:55<01:24, 208.94it/s]

Loss at step 36000: 0.24447959209481876


 73%|███████▎  | 39041/53655 [03:10<01:09, 210.32it/s]

Loss at step 39000: 0.24437103914717834


 78%|███████▊  | 42040/53655 [03:24<00:55, 208.32it/s]

Loss at step 42000: 0.2445868814835946


 84%|████████▍ | 45028/53655 [03:38<00:41, 209.41it/s]

Loss at step 45000: 0.24434785095850628


 90%|████████▉ | 48031/53655 [03:53<00:26, 210.29it/s]

Loss at step 48000: 0.24472022205094496


 95%|█████████▌| 51024/53655 [04:07<00:12, 205.94it/s]

Loss at step 51000: 0.2446924748023351


100%|██████████| 53655/53655 [04:20<00:00, 206.21it/s]


Done with Epoch 0
Evaluating...


100%|██████████| 5961/5961 [00:07<00:00, 803.70it/s]


Eval accuracy:  57.14 %


  0%|          | 20/53655 [00:00<04:35, 194.52it/s]

Loss at step 0: 0.23887954652309418


  6%|▌         | 3031/53655 [00:14<04:02, 209.16it/s]

Loss at step 3000: 0.24492711263398328


 11%|█▏        | 6038/53655 [00:29<03:48, 208.58it/s]

Loss at step 6000: 0.24486575757463774


 17%|█▋        | 9038/53655 [00:43<03:34, 208.21it/s]

Loss at step 9000: 0.24467285052438578


 22%|██▏       | 12022/53655 [00:58<05:02, 137.52it/s]

Loss at step 12000: 0.2449960648616155


 23%|██▎       | 12247/53655 [01:00<03:23, 203.20it/s]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
plt.plot(losses)
plt.title("Losses")
plt.show()

In [None]:
plt.plot(eval_accuracy)
plt.title("Eval accuracy over time")
plt.show()