In [16]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from transformers import TrainingArguments, Trainer, BertForSequenceClassification

Load model

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Geotrend/distilbert-base-da-cased")

Load Data (about 2 min)

In [4]:
df = pd.read_csv("/Users/lucasvilsen/Desktop/GrammatikTAK/Datasets/SentToLabel_15-5_Revisited.csv", sep=";")
print(len(df))

46423624


In [5]:
df = df[:100000]
print(len(df))

100000


In [6]:
data = df["data"].to_list()
labels = df["label"].to_list()

Prepare Data

In [7]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None, batch_size=16):
        self.encodings = encodings
        self.labels = labels
        self.batch_size = batch_size

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [8]:
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.1, random_state=1212)

In [9]:
start_time = time.time()
print("Tokenizing val:")
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True)
print("Time taken for validation: ", time.time() - start_time)
print("Expected total time ", ((time.time() - start_time) * 10) // 60, "min", ((time.time() - start_time) * 10) % 60, "sek")
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True)

Tokenizing val:
Time taken for validation:  0.5844769477844238
Expected total time  0.0 min 5.846250057220459 sek


In [10]:
batch_size = 32
epochs = 2

train_dataset = CustomDataset(X_train_tokenized, y_train, batch_size=batch_size)
val_dataset = CustomDataset(X_val_tokenized, y_val, batch_size=batch_size)

Prepare model

In [17]:
device = "mps"
torch.device(device)
model = BertForSequenceClassification.from_pretrained('Geotrend/distilbert-base-da-cased', num_labels=3)
model.to(device)
device

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Maltehb/danish-bert-botxo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'mps'

In [18]:
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

In [19]:
def compute_metrics(eval_preds):
    metric = load_metric("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [20]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

Train model

In [21]:
trainer.train()

  0%|          | 0/5626 [00:00<?, ?it/s]

KeyboardInterrupt: 

Save model

In [None]:
torch.save(model, './commaDistilBERT1')