In [3]:
from transformers import RobertaTokenizer, RobertaForCausalLM
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import os
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import lemmy
import stanza

In [4]:
# Danish stanza pipeline restricted to tokenisation + POS tagging.
# tokenize_pretokenized=True asks stanza to keep the caller's own token
# boundaries (see the stanza tokenization docs) instead of re-tokenising.
# use_gpu=True is only a request: the recorded run logged "Using device: cpu".
# NOTE(review): cache_directory and n_process do not look like stanza Pipeline
# options -- confirm they have any effect here.
pos_model = stanza.Pipeline(
    "da",
    processors='tokenize,pos',
    tokenize_pretokenized=True,
    use_gpu=True,
    cache_directory='./cache',
    n_process=4,
)

# Danish lemmatizer from lemmy; used to lemmatise the verbs that get_pos finds.
lemmatizer = lemmy.load("da")

2023-03-27 14:30:36 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-03-27 14:30:37 INFO: Loading these models for language: da (Danish):
| Processor | Package |
-----------------------
| tokenize  | ddt     |
| pos       | ddt     |

2023-03-27 14:30:37 INFO: Using device: cpu
2023-03-27 14:30:37 INFO: Loading: tokenize
2023-03-27 14:30:37 INFO: Loading: pos
2023-03-27 14:30:37 INFO: Done loading processors!


In [31]:
def get_pos(sentence):
    """Return the UPOS tag of every word in `sentence` as one flat list.

    `sentence` is handed unchanged to the module-level `pos_model` pipeline.
    The tags of all sentences in the resulting document are concatenated in
    document order.
    """
    parsed = pos_model(sentence)
    tags = []
    for parsed_sentence in parsed.sentences:
        for token in parsed_sentence.words:
            tags.append(token.upos)
    return tags

In [64]:
def _context_window(words, index, lemma, window, pad_token):
    """Join `window` words either side of `index`, with `lemma` in the centre, padded to a fixed size."""
    left = words[max(0, index - window):index]
    right = words[index + 1:index + 1 + window]
    tokens = (
        [pad_token] * (window - len(left))
        + left
        + [lemma]
        + right
        + [pad_token] * (window - len(right))
    )
    return " ".join(tokens)


def convert_sentence(sentences, window=4, pad_token="<pad>"):
    """Build one fixed-size, lowercased context window per verb in `sentences`.

    Each sentence is split on whitespace and tagged with `get_pos`. Every word
    tagged "VERB" produces one window: up to `window` words before it, the
    verb replaced by the first candidate from `lemmatizer.lemmatize`, and up
    to `window` words after it. Missing positions are filled with `pad_token`,
    so every window holds exactly ``2 * window + 1`` tokens. The joined window
    is lowercased as a whole (pad tokens included).

    Args:
        sentences: Iterable of sentences. Each item is passed through
            ``"".join``, so it may be a string or an iterable of string
            fragments that are concatenated without a separator.
        window: Number of context words kept on each side of the verb.
        pad_token: Filler used when the sentence is too short on a side.

    Returns:
        ``(dataset, number_of_verbs)``. `dataset` is the flat list of window
        strings in input order. `number_of_verbs` has exactly one entry per
        input sentence giving how many windows it contributed; sentences with
        no words, or whose tag count differs from their word count, contribute
        0 so the list stays aligned with `sentences`.
    """
    dataset = []
    number_of_verbs = []
    for raw in tqdm(sentences):
        text = "".join(raw)
        words = text.split()
        if not words:
            # Nothing to tag; avoid calling the tagger on empty input.
            number_of_verbs.append(0)
            continue

        pos = np.array(get_pos(text))
        if len(pos) != len(words):
            # Tags cannot be matched to words by position. Record a zero so the
            # per-sentence counts keep lining up with the input.
            number_of_verbs.append(0)
            continue

        verb_indices = np.where(pos == "VERB")[0]
        for i in verb_indices:
            lemma = lemmatizer.lemmatize(pos[i], words[i])[0]
            dataset.append(
                _context_window(words, int(i), lemma, window, pad_token).lower()
            )
        number_of_verbs.append(len(verb_indices))
    return dataset, number_of_verbs

In [80]:
# Load the evaluation pairs; the CSV is read without a header row and its two
# columns are named "wrong" and "right".
test_set = pd.read_csv(
    '../GrammatiktakDatasets/otherDatasets/nutids_r.csv',
    names=["wrong", "right"],
)
test_lines = [line for line in test_set["wrong"]]

# Only the first five sentences are converted here, as a quick smoke test.
first_lines = test_lines[:5]
dataset, number_of_verbs = convert_sentence(first_lines)
dataset

100%|██████████| 5/5 [00:00<00:00, 11.33it/s]


['glæde sig til at se og inviterer familie og',
 'til at ser og invitere familie og venner <pad>',
 'ikke altid nemt at forsvare din opførsel <pad> <pad>',
 '<pad> <pad> mange drenge interessere sig for fodbold <pad>',
 '<pad> <pad> vil du invitere alle dine veninder til',
 'det er svært at vurdere hvor meget bilen er']

In [81]:
# Prefer Apple's MPS backend (what this notebook was written for), but fall
# back to CUDA or CPU so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(security): torch.load unpickles the stored object, which can execute
# arbitrary code -- only load "tenseModel1" from a trusted source.
# map_location places the loaded model on the selected device.
tenseModel = torch.load("tenseModel1", map_location=torch.device(device))
tenseModel.eval()  # inference mode: disables dropout etc.

batch_size = 32

class CustomDataset(Dataset):
    """Dataset over tokenizer output, yielding tensors per example.

    `X_tokenized` must support ``["input_ids"]`` and ``["attention_mask"]``;
    `Y_tokenized`, when given, must support ``["input_ids"]`` and supplies the
    labels. Items are ``(input_ids, attention_mask)`` tuples, or
    ``(input_ids, attention_mask, labels)`` when labels are present.
    """

    def __init__(self, X_tokenized, Y_tokenized=None):
        self.input_ids = X_tokenized["input_ids"]
        self.attention_mask = X_tokenized["attention_mask"]
        # Labels are optional so the same class serves training and inference.
        self.labels = None if Y_tokenized is None else Y_tokenized["input_ids"]

    def __len__(self):
        # One example per row of input ids.
        return len(self.input_ids)

    def __getitem__(self, idx):
        ids = self.input_ids[idx]
        mask = self.attention_mask[idx]
        if self.labels is not None:
            target = self.labels[idx]
            return torch.tensor(ids), torch.tensor(mask), torch.tensor(target)
        return torch.tensor(ids), torch.tensor(mask)

In [90]:
tokenizer = RobertaTokenizer.from_pretrained("DDSC/roberta-base-danish")

# Pad every window to the longest one so the batch can become a single tensor.
# No truncation is requested: this tokenizer reports no predefined maximum
# length, so `truncation=True` without `max_length` only emitted a warning and
# fell back to no truncation anyway.
# NOTE(review): despite the name, this holds the tokenized `dataset` windows
# that are fed to the model for inference below.
X_train_tokenized = tokenizer(dataset, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [93]:
# Turn the tokenizer's lists into tensors for the forward pass.
input_ids = torch.tensor(X_train_tokenized['input_ids'])
attention_mask = torch.tensor(X_train_tokenized['attention_mask'])

# Forward pass without gradient tracking; all windows go through in one batch.
with torch.no_grad():
    outputs = tenseModel(
        input_ids.to(device),
        attention_mask=attention_mask.to(device),
    )

# Highest-scoring id at every position (argmax over the last logits axis,
# presumably the vocabulary axis).
predicted_labels = outputs.logits.argmax(dim=-1)

# Decode each row of predicted ids back to text, dropping special tokens.
predicted_texts = []
for ids in predicted_labels:
    predicted_texts.append(tokenizer.decode(ids, skip_special_tokens=True))

print(predicted_texts)


['glæde sig til at se og familie familie og', 'til at invitere familie og venner', 'ikke, fint at min', 'mange drengesere sig', 'vil jeg alle til', 'det er svært at vurdere hvor meget bilen er']


In [78]:
# Display the object returned by the model's forward pass for inspection.
# NOTE(review): this cell's execution count (78) is lower than the cells that
# define `outputs`, so the recorded output below may be stale -- re-run in order.
outputs

['logits', 'logits', 'logits', 'logits', 'logits', 'logits', 'logits']