In [45]:
import os
import pandas as pd

In [46]:
# Training data: a ';'-separated CSV whose "comment_text" column holds a
# space-joined tag window and whose "label" column holds the target.
df = pd.read_csv("../Datasets/EuroparlNutidsr_trainset_verbs.csv", sep=";")
# Plain-list copies of both columns, in row order.
all_pos, all_labels = (list(df[column].values) for column in ("comment_text", "label"))
df

Unnamed: 0,comment_text,label
0,<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD...,0
1,PUNCT PRON AUX VERB NOUN DET ADJ NOUN ADP PUNC...,1
2,<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD...,0
3,NOUN VERB PRON PUNCT ADP ADV DET NOUN NOUN AUX...,0
4,<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD...,0
...,...,...
2461873,<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> PRON VERB ...,1
2461874,<PAD> PRON VERB SCONJ PRON AUX VERB ADV ADP PA...,1
2461875,AUX VERB ADV ADP PART VERB NOUN CCONJ ADP NOUN...,0
2461876,<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD...,0


In [47]:
# Read the whole corpus file from the dataset directory.
os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/Datasets/")
filename = "europarl-v7.da-en.da"
with open(filename, mode="r", encoding="UTF-8") as file:
    lines = file.readlines()
# Keep only the final 5000 lines, with newline characters trimmed from both
# ends (other whitespace is left untouched).
correct_sentences = [raw_line.strip("\n") for raw_line in lines[-5000:]]

In [48]:
# Peek at the first of the retained sentences.
correct_sentences[0]

'Liikanen, om De hurtigt vil afklare dette med Deres kollega, hr.'

In [49]:
# Switch to the project root: get_pos() below builds its cache path
# ("FineTuneModels/cache/...") relative to the current working directory.
os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/")

import pickle
import stanza
from tqdm import tqdm
import os



def get_pos(x):
    """Return the tagger output for every sentence in ``x``, cached on disk.

    The cache file lives at ``FineTuneModels/cache/pos_caching_<len(x)>.pkl``
    relative to the current working directory. If it exists it is loaded and
    returned as-is; otherwise every sentence is tagged with a stanza pipeline
    and the result is pickled to that path.

    Args:
        x: Sequence of sentences; each one is passed to ``get_pos_tags``.

    Returns:
        A list with one ``get_pos_tags`` result per sentence (or whatever the
        cache file contains on a cache hit).

    Note:
        The cache key is only ``len(x)``: two different inputs of the same
        length share one cache file, so a stale cache is returned silently.
        Delete the file when the input changes.
    """
    cache_path = f"FineTuneModels/cache/pos_caching_{len(x)}.pkl"
    if os.path.exists(cache_path):
        print("pos_caching.pkl already exists")
        # SECURITY: unpickling runs arbitrary code; only load caches that this
        # notebook wrote itself.
        with open(cache_path, "rb") as f:
            pos_list = pickle.load(f)
    else:
        pos_tagger = stanza.Pipeline("da", processors='tokenize,pos', use_gpu=True, cache_directory='./cache', tokenize_pretokenized=True, n_process=4)
        pos_list = [get_pos_tags(sentence, pos_tagger) for sentence in tqdm(x)]
        print(len(pos_list))
        print("Updating")
        # Create the cache directory first so a long tagging run is not lost
        # to a FileNotFoundError when the directory is missing.
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(pos_list, f)
        print("Updated")
    return pos_list

def get_pos_tags(sentence, pos_tagger):
    """Tag ``sentence`` and return one tuple per word.

    Args:
        sentence: Input handed unchanged to ``pos_tagger``.
        pos_tagger: Callable returning a document with ``.sentences``, each of
            which has ``.words`` exposing ``upos``, ``start_char``,
            ``end_char`` and ``feats``.

    Returns:
        A flat list, across all sentences of the document, of
        ``(upos, [start_char, end_char], feature_dict)`` tuples, where
        ``feature_dict`` is that word's entry from ``turn_features_to_dicts``.
    """
    doc = pos_tagger(sentence)
    # Flatten once so words and their feature dicts share the same index.
    # Indexing the flat feature list with a per-sentence position gives every
    # sentence after the first the features of the first sentence's words.
    words = [word for doc_sentence in doc.sentences for word in doc_sentence.words]
    features = [word.feats if word.feats else None for word in words]
    feature_dicts = turn_features_to_dicts(features)
    return [
        (word.upos, [word.start_char, word.end_char], feature_dict)
        for word, feature_dict in zip(words, feature_dicts)
    ]

def turn_features_to_dicts(features):
    """Parse ``"Key=Value|Key=Value"`` feature strings into dictionaries.

    The first ``Tense`` value seen anywhere in ``features`` is remembered.
    Any entry that has ``VerbForm`` but no ``Tense`` of its own receives that
    remembered tense, or ``"Pres"`` if no tense has been seen so far.

    Args:
        features: Iterable of feature strings; ``None`` or ``""`` marks a word
            without features.

    Returns:
        A list with one dict per input entry (``{}`` for missing features).

    Raises:
        ValueError: If a feature item contains no ``"="``.
    """
    feature_dicts = []
    current_tense = None
    for feature in features:
        # Treat an empty string like None; it used to crash on unpacking.
        if not feature:
            feature_dicts.append({})
            continue
        feature_dict = {}
        for current_feature in feature.split("|"):
            # Split on the first "=" only, so a value may itself contain "=".
            key, value = current_feature.split("=", 1)
            if key == "Tense" and current_tense is None:
                current_tense = value
            feature_dict[key] = value
        if "Tense" not in feature_dict and "VerbForm" in feature_dict:
            feature_dict["Tense"] = "Pres" if current_tense is None else current_tense
        feature_dicts.append(feature_dict)
    return feature_dicts

# NOTE: this rebinds `all_pos`; the list built from df["comment_text"] earlier
# in the notebook is no longer reachable under this name after this line.
all_pos = get_pos(correct_sentences)

pos_caching.pkl already exists


In [50]:
# Work from the dataset directory so the relative file names below resolve.
os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/Datasets/")
filename = "europarl-v7.da-en.da"
with open(filename, mode="r", encoding="UTF-8") as file:
    lines = list(file)

# Two pickled lookup tables used by the window-building loop:
# `nutids_r_stem` is indexed by a word, and the value it yields is then used
# to index `nutids_r_bøjninger`.
with open("nutids_r_bøjninger.pickle", mode="rb") as f:
    nutids_r_bøjninger = pickle.load(f)
with open("nutids_r_stem.pickle", mode="rb") as f:
    nutids_r_stem = pickle.load(f)

from tqdm import tqdm

# Output accumulators, one entry per kept sample.
testset, labels, padded_words = [], [], []

# Number of "<PAD>" tokens placed before / after each sentence.
padding_left, padding_right = 15, 5

def get_pos_tags(index):
    """Return the first field of every entry in ``all_pos[index]``.

    NOTE(review): this definition reuses the name of the two-argument
    ``get_pos_tags(sentence, pos_tagger)`` helper that ``get_pos`` calls.
    Once this cell has run, a cache miss inside ``get_pos`` would call this
    one-argument version instead; consider giving one of them its own name.
    """
    return [entry[0] for entry in all_pos[index]]

# Counters for candidates that are skipped, plus bookkeeping for "at".
og_index = 0                  # candidates skipped because the previous word is "og"
comma_right_before_index = 0  # candidates skipped because the previous word ends with ","
at_indexes = []               # positions (within the kept samples) preceded by "at"
at_index = -1                 # running index of the most recently kept sample

# Every window holds padding_left tokens, the target, and padding_right tokens.
window_size = padding_left + padding_right + 1

for y, line in enumerate(tqdm(correct_sentences)):
    if len(str(line)) < 1 or str(line) == "nan":
        continue
    line = line.strip("\n")
    true_words = line.split()
    pos = get_pos_tags(y)
    words = ["<PAD>"] * padding_left + pos + ["<PAD>"] * padding_right
    true_padded_words = ["<PAD>"] * padding_left + true_words + ["<PAD>"] * padding_right
    for i, word in enumerate(true_words):
        # Only words present in the stem table are candidates. Catch just the
        # lookup miss so real errors and Ctrl-C are no longer swallowed.
        try:
            stemmed = nutids_r_stem[word]
        except KeyError:
            continue
        # Word i of the sentence sits at index i + padding_left after padding.
        if word[-1] == "s" or words[i + padding_left] != "VERB":
            continue
        # The first word has no predecessor. Without this guard, index -1
        # would wrap around to the LAST word of the sentence.
        previous_word = true_words[i - 1] if i > 0 else ""
        if previous_word.lower().strip() == "og":
            og_index += 1
            continue
        if previous_word.endswith(","):
            comma_right_before_index += 1
            continue
        at_index += 1
        if previous_word.lower().strip() == "at":
            at_indexes.append(at_index)
        # Label 1 when the word equals the first entry stored for its stem.
        if nutids_r_bøjninger[stemmed][0] == word:
            labels.append(1)
        else:
            labels.append(0)
        # Slicing the padded lists from i puts the target at offset padding_left.
        testset.append(" ".join(words[i:i + window_size]))
        padded_words.append(" ".join(true_padded_words[i:i + window_size]))

100%|██████████| 5000/5000 [00:00<00:00, 44866.35it/s]


In [67]:
# Reduce every entry of `all_pos` to the first field of each of its tuples.
# NOTE(review): `all_pos` is assigned twice earlier in the notebook (from
# df["comment_text"], then from get_pos(correct_sentences)); this cell uses
# whichever binding the kernel currently holds. Confirm it is the intended one.
simple_all_pos = [[tup[0] for tup in sent_pos] for sent_pos in all_pos]

# Keep positions 11..16 only. In a `testset` window the target sits at
# position 15 (padding_left), so this is 4 tokens before it, the target
# itself, and 1 token after it.
new_testset = [" ".join(window.split()[11:17]) for window in testset]
new_pos = [" ".join(tags[11:17]) for tags in simple_all_pos]

from tqdm import tqdm
can_guess = 0

predictions = []

# Index of the FIRST occurrence of every distinct window in `new_pos`; this is
# the same answer `new_pos.index(window)` gives, without rescanning the list
# for every query.
first_index_by_window = {}
for position, window in enumerate(new_pos):
    first_index_by_window.setdefault(window, position)

# NOTE(review): `all_labels` comes from df["label"], while `new_pos` is derived
# from `all_pos`. The two are only aligned index-for-index if `all_pos` still
# holds one entry per row of `df`. Confirm before trusting the accuracy numbers.
for sent_pos in tqdm(new_testset):
    match_index = first_index_by_window.get(sent_pos)
    if match_index is None:
        # Window never seen: no prediction possible.
        predictions.append(None)
    else:
        can_guess += 1
        predictions.append(all_labels[match_index])

can_guess 

100%|██████████| 5921/5921 [00:00<00:00, 20490.42it/s]


680

In [68]:
# Sanity check: there should be exactly one prediction per label.
len(predictions), len(labels)

(5921, 5921)

In [69]:
import numpy as np

a = np.array(predictions)
b = np.array(labels)

# Tally each sample into exactly one of three buckets.
no_guess = correct = wrong = 0
for p, l in zip(predictions, labels):
    if p is None:
        no_guess += 1
    elif p == l:
        correct += 1
    else:
        wrong += 1

# Turn the raw counts into percentages of all samples.
no_guess, correct, wrong = (
    count / len(predictions) * 100 for count in (no_guess, correct, wrong)
)

no_guess, correct, wrong

(88.51545347069751, 4.847154196926195, 6.637392332376288)