In [68]:
import pandas as pd
import os
import torch
from transformers import Trainer, BertTokenizer
import numpy as np

In [69]:
# Switch into the folder holding the evaluation CSV; later cells change the
# working directory again before reading their own files.
os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/GrammatiktakDatasets/otherDatasets/")

# The file has no header row, so the two columns are named here.
test_sentences = pd.read_csv(
    "nutids_r.csv",
    names=["fake", "true"],
    header=None,
)

In [70]:
import pickle

# Both lookup tables are read relative to this directory.
os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/Datasets/")

# NOTE(security): pickle.load executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.

# nutids_r: indexed by a stemmed word; positions [0] and [1] of each entry are
# compared against / substituted for sentence words further down.
with open("nutids_r.pickle", mode="rb") as forms_file:
    nutids_r = pickle.load(forms_file)

# nutids_r_stem: indexed by a sentence word, yields the key used in nutids_r.
with open("nutids_r_stem.pickle", mode="rb") as stems_file:
    nutids_r_stem = pickle.load(stems_file)

In [71]:
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: mapping from feature name to an indexable collection with
            one entry per example. Must contain the key "input_ids", which
            determines the dataset length.
        labels: optional indexable collection with one label per example.
            ``None`` or an empty collection means "no labels".
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus "labels" if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of truthiness: `if labels:` raises
        # for multi-element numpy arrays and tensors, while an empty collection
        # must still be treated as "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" encodings."""
        return len(self.encodings["input_ids"])

In [91]:
# Normalise the "fake" column: trim whitespace, drop leading/trailing
# sentence punctuation, and lower-case.
sentences = [
    raw_sentence.strip().strip(".,?!").lower()
    for raw_sentence in test_sentences["fake"].tolist()
]
sentences[:5]

['rigtig mange glæde sig til at ser og inviterer familie og venner',
 'det er ikke altid nemt at forsvarer din opførsel',
 'mange drenge interessere sig for fodbold',
 'vil du inviterer alle dine veninder til fødselsdagen',
 'det er svært at vurderer hvor meget bilen er værd']

In [130]:
from tqdm import tqdm
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
import stanza

# Characters deleted from every word by correct_nutidsr.clean_input.
char = "*@;:!\"?«».,"

def init_model(model_path="nutidsrModel1", device=None):
    """Load the pickled model, put it in eval mode and wrap it in a Trainer.

    Args:
        model_path: file passed to ``torch.load``; resolved against the
            current working directory when relative.
        device: device string for ``model.to``. When ``None``, "mps" is used
            if the MPS backend is available, otherwise "cpu".

    Returns:
        A ``Trainer`` wrapping the loaded model.
    """
    if device is None:
        # Previously "mps" was hard-coded, which fails on machines without
        # Metal support; fall back to the CPU there.
        mps_backend = getattr(torch.backends, "mps", None)
        mps_ready = mps_backend is not None and mps_backend.is_available()
        device = "mps" if mps_ready else "cpu"

    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code; only load model files from a trusted source.
    # NOTE(review): recent PyTorch releases appear to default torch.load to
    # weights_only=True, which rejects fully pickled modules; confirm against
    # the installed version.
    model = torch.load(model_path, map_location=torch.device('cpu'))
    model.eval()
    model.to(device)
    return Trainer(model)

def load_constants():
    """Return the fixed settings as ``(SCOPE, PADDING, HALF_SCOPE, MAX_LENGTH)``.

    PADDING and HALF_SCOPE are both half of SCOPE; MAX_LENGTH is 21.
    """
    scope = 10
    half = scope // 2
    max_length = 21
    return scope, half, half, max_length

class correct_nutidsr():
    """Evaluate the fine-tuned classifier that chooses between two stored verb forms.

    For every sentence word that is a key of ``nutids_r_stem`` and is tagged
    VERB, a window of ``half_scope`` words on each side is built with the word
    itself replaced by its stem. The model's prediction for that window
    selects entry ``[0]`` or ``[1]`` of ``nutids_r[stem]``.

    Relies on these module-level names: ``nutids_r``, ``nutids_r_stem``,
    ``test_sentences``, ``char``, ``Dataset``, ``init_model`` and
    ``load_constants``.

    NOTE(review): ``convert_sentences_to_dataset`` labels a word 1 when it
    equals form ``[0]``, whereas ``convert_back_to_sentences`` writes form
    ``[0]`` when the prediction is 0. The reported metrics therefore compare
    predictions with the form found in the *input* sentence; confirm this
    polarity is intended.
    """

    def __init__(self):
        """Load tokenizer, model, POS tagger and constants (changes the working directory)."""
        os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/FineTuneModels/Models/")
        self.tokenizer = BertTokenizer.from_pretrained('Maltehb/danish-bert-botxo')
        self.model = init_model()
        # NOTE(review): cache_directory and n_process are not Pipeline options
        # I can confirm; verify that they have any effect.
        self.pos_tagger = stanza.Pipeline("da", processors='tokenize,pos', use_gpu=True, cache_directory='./cache', tokenize_pretokenized=True, n_process=4)
        self.scope, self.padding, self.half_scope, self.max_length = load_constants()

    def get_pos_tags(self, sentence):
        """Return the ``upos`` tag of every word the tagger finds in ``sentence``."""
        doc = self.pos_tagger(sentence)
        return [word.upos for tagged_sentence in doc.sentences for word in tagged_sentence.words]

    def add_padding(self, lsts):
        """Surround each word list with ``self.padding`` "<PAD>" tokens on both sides."""
        pad = ["<PAD>"] * self.padding
        return [pad + lst + pad for lst in lsts]

    def clean_input(self, pre_cleaned_result):
        """Delete every character listed in ``char`` from each word of each word list."""
        # Build the deletion table once instead of once per word.
        deletion_table = str.maketrans('', '', ''.join(char))
        post_cleaned_result = []
        for pre_cleaned_lst in tqdm(pre_cleaned_result):
            post_cleaned_result.append([word.translate(deletion_table) for word in pre_cleaned_lst])
        return post_cleaned_result

    def convert_sentences_to_dataset(self, post_cleaned_result):
        """Build one model input per candidate verb.

        Args:
            post_cleaned_result: padded word lists, one per sentence.

        Returns:
            ``(dataset, output_lst, original_words)`` where ``dataset`` holds
            the space-joined windows (centre word replaced by its stem),
            ``output_lst`` holds 1 when the sentence word equals
            ``nutids_r[stem][0]`` and 0 when it equals ``nutids_r[stem][1]``,
            and ``original_words`` holds, per sentence, the words that
            produced a sample. The first two are aligned index by index.
        """
        dataset = []
        original_words = []
        output_lst = []
        for current_lst in tqdm(post_cleaned_result):
            cur_original_words = []
            # Assumes the tagger returns exactly one tag per whitespace token
            # (the pipeline is created with tokenize_pretokenized=True).
            pos_tags = self.get_pos_tags(" ".join(current_lst))
            for x, current_word in enumerate(current_lst):
                # Only words present in the stem table are candidates; other
                # exceptions are no longer silently swallowed.
                try:
                    stemmed_word = nutids_r_stem[current_word]
                except KeyError:
                    continue
                if pos_tags[x] != "VERB":
                    continue
                forms = nutids_r[stemmed_word]
                if forms[0] == current_word:
                    output = 1
                elif forms[1] == current_word:
                    output = 0
                else:
                    # The word matches neither stored form. Skip it entirely
                    # instead of appending a stale label from an earlier word
                    # (or hitting a NameError on the very first sample).
                    print("ERROR")
                    continue
                current_dataset = current_lst[x - self.half_scope:x + self.half_scope + 1]
                current_dataset[self.half_scope] = stemmed_word
                dataset.append(" ".join(current_dataset))
                output_lst.append(output)
                cur_original_words.append(current_word)
            original_words.append(cur_original_words)
        return dataset, output_lst, original_words

    def convert_dataset_to_dataloader(self, dataset):
        """Wrap tokenized encodings in the module-level ``Dataset`` (no labels)."""
        return Dataset(dataset)

    def tokenize_sentences(self, sentences):
        """Tokenize the window strings with padding and truncation to ``self.max_length``."""
        return self.tokenizer(sentences, padding=True, truncation=True, max_length=self.max_length)

    def get_predictions(self, dataset):
        """Run the model and return the argmax over axis 1 of its raw predictions."""
        raw_predictions, _, _ = self.model.predict(dataset)
        return np.argmax(raw_predictions, axis=1)

    def from_sentences_to_dataset(self, sentences):
        """Pad, window and tokenize word lists.

        Returns:
            ``(test_dataset, output_lst, original_words, dataset)``: the
            tokenized ``Dataset``, the labels, the per-sentence source words
            and the untokenized window strings.
        """
        padded_sentences = self.add_padding(sentences)
        dataset, output_lst, original_words = self.convert_sentences_to_dataset(padded_sentences)
        X_tokenized = self.tokenize_sentences(dataset)
        test_dataset = self.convert_dataset_to_dataloader(X_tokenized)
        return test_dataset, output_lst, original_words, dataset

    def get_measures(self, predictions, output_lst, dataset):
        """Write per-sample results to CSV and return ``[f1, recall, precision, accuracy]``.

        Side effects: changes the working directory and writes
        "nutidsr_test_results.csv" there. F1, recall and precision are
        macro-averaged.
        """
        # The centre token of every window is the stemmed word.
        middle_words = [data.split()[self.half_scope] for data in dataset]
        df = pd.DataFrame({
                            "Dataset": dataset,
                            "Preds": predictions,
                            "Output": output_lst,
                            "Word": middle_words
                        })
        os.chdir("/Users/lucasvilsen/Desktop/GrammatikTAK/GrammatiktakDatasets/otherDatasets/")
        df.to_csv("nutidsr_test_results.csv", index=False)
        f1 = f1_score(output_lst, predictions, average="macro")
        recall = recall_score(output_lst, predictions, average="macro")
        precision = precision_score(output_lst, predictions, average="macro")
        accuracy = accuracy_score(output_lst, predictions)
        return [f1, recall, precision, accuracy]

    def print_measures(self, measures, measure_names):
        """Print each measure rounded to four decimals next to its name."""
        for measure, name in zip(measures, measure_names):
            print(f"{name}: {round(measure, 4)}")

    def split_sentences(self, sentences):
        """Split every sentence string on whitespace."""
        return [sent.split() for sent in sentences]

    def correct_sentence(self, test_sentences):
        """Run the full pipeline on sentence strings; prints results and returns None."""
        sentences = self.split_sentences(test_sentences)
        dataset, output, original_words, pre_tokenized_dataset = self.from_sentences_to_dataset(sentences)
        predictions = self.get_predictions(dataset)
        self.print_measures(self.get_measures(predictions, output, pre_tokenized_dataset), ["F1-score", "Recall", "Precision", "Accuracy"])
        self.convert_back_to_sentences(test_sentences, predictions, original_words)

    @staticmethod
    def _replace_whole_word(sentence, old_word, new_word):
        """Replace whitespace-delimited occurrences of ``old_word``, leaving substrings of other words alone."""
        import re  # local import: the notebook's import cells do not provide `re`
        pattern = r"(?<!\S)" + re.escape(old_word) + r"(?!\S)"
        # A callable replacement keeps backslashes in new_word literal.
        return re.sub(pattern, lambda _match: new_word, sentence)

    def convert_back_to_sentences(self, sentences, predictions, original_words_lst):
        """Apply the predictions to the sentences and compare with the reference column.

        Prediction 0 selects ``nutids_r[stem][0]``, anything else selects
        ``nutids_r[stem][1]``. Every sentence that differs from its reference
        is printed together with the reference, followed by the number of
        exact matches.

        The references are read from the module-level ``test_sentences["true"]``
        frame, not from an argument.
        """
        true_sentences = [x.strip().strip(".,?!").lower() for x in list(test_sentences["true"])]
        # Index into the flat prediction array; advances once per source word.
        i = 0
        correct = 0
        for sentence, true_sentence, original_words in zip(sentences, true_sentences, original_words_lst):
            replacements = []
            for original_word in original_words:
                stemmed_word = nutids_r_stem[original_word]
                if predictions[i] == 0:
                    correct_word = nutids_r[stemmed_word][0]
                else:
                    correct_word = nutids_r[stemmed_word][1]
                # Whole-token replacement: plain str.replace also rewrote the
                # word where it merely appeared inside a longer word.
                sentence = self._replace_whole_word(sentence, original_word, correct_word)
                replacements.append(f"{original_word} -> {correct_word}")
                i += 1
            if sentence == true_sentence:
                correct += 1
            else:
                print(sentence)
                print(true_sentence)

        print(f"Correct: {correct}/{len(true_sentences)}")

In [131]:
# Build the evaluator (its constructor loads the tokenizer, model and POS
# tagger), then run it on the normalised sentences; results are printed.
corrector = correct_nutidsr() 
corrector.correct_sentence(sentences)

2023-04-10 14:57:43 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-10 14:57:44 INFO: Loading these models for language: da (Danish):
| Processor | Package |
-----------------------
| tokenize  | ddt     |
| pos       | ddt     |

2023-04-10 14:57:44 INFO: Using device: cpu
2023-04-10 14:57:44 INFO: Loading: tokenize
2023-04-10 14:57:44 INFO: Loading: pos
2023-04-10 14:57:44 INFO: Done loading processors!
100%|██████████| 100/100 [00:14<00:00,  6.95it/s]


  0%|          | 0/13 [00:00<?, ?it/s]

F1-score: 0.8581
Recall: 0.8869
Precision: 0.8348
Accuracy: 0.9406
rigtig mange glæde sig til at ser og invitere familie og venner
rigtig mange glæder sig til at se og invitere familie og venner
de funden en løsning
de finder en løsning
jeg skreven en e-mail til min chef
jeg skriver en e-mail til min chef
han spille fodbold hver weekend
han spiller fodbold hver weekend
hun malt et billede til sin mor
hun maler et billede til sin mor
vi tagen bussen til skolen
vi tager bussen til skolen
han fløjen til paris i morgen
han flyver til paris i morgen
du given mig en gave på min fødselsdag
du giver mig en gave på min fødselsdag
han tagen elevatoren op til kontoret
han tager elevatoren op til kontoret
de sås en film i biografen
de ser en film i biografen
vi spille brætspil om aftenen
vi spiller brætspil om aftenen
du skreven en liste over tingene
du skriver en liste over tingene
du boende i en stor by
du bor i en stor by
jeg skreven en dagbog hver aften
jeg skriver en dagbog hver aften
han sås