In [1]:
import datasets
import torch
import evaluate
from datasets import load_dataset
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
emotion_dataset = datasets.load_from_disk('C:/Users/JGras/instagram-topics/instagram-topics/Data/dair-ai-amotion-de-test-train-val')

In [3]:
checkpoint = "distilbert/distilbert-base-german-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
def tokenize_function(sample):
  return tokenizer(sample["text"], truncation=True)

In [5]:
tokenized_datasets = emotion_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/27200 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3400 [00:00<?, ? examples/s]

Map:   0%|          | 0/3400 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=6)

model.safetensors:   0%|          | 0.00/270M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def compute_metrics(pred):
  # pred ist vom Typ EvalPrediction. Daher können wir auf die
  # label_ids als auch auf die predictions zugreifen
  labels = pred.label_ids
  # argmax gibt den Arrayindex mit der größten Nummer zurück 
  # dadurch wissen wir, welches Sentiment vorhergesagt wurde
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="macro")
  acc = accuracy_score(labels, preds)
  precision = precision_score(labels, preds, average="macro")
  recall = recall_score(labels, preds, average="macro")
  return {"precision": precision, "recall": recall, "acc": acc, "f1": f1}

In [10]:
batch_size = 8
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = f"{checkpoint}-dair-ai-de-emotion-finetuned"
training_args = TrainingArguments(output_dir = model_name, 
                                  num_train_epochs = 4, # Anzahl der Epochen, d.h. Trainingsdurchläufe
                                  learning_rate = 4e-5, # Lernrate: setzt man normalerweise immer auf 1e-5/2e-5/3e-5/4e-5
                                  per_device_train_batch_size = batch_size,
                                  per_device_eval_batch_size = batch_size,
                                  evaluation_strategy="epoch", # Evaluierung nach jeder Epoche
                                  disable_tqdm = False, # Fortschrittsbalken wird angezeigt
                                  logging_steps = logging_steps, # Anzahl der Updateschritte zwischen zwei Logs
                                  log_level="error")

In [11]:
train_dataset = tokenized_datasets["train"].shuffle().select(range(13600))
test_dataset = tokenized_datasets["test"].shuffle(42).select(range(3000))

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,No log,0.480986,0.814699,0.797002,0.851667,0.80274
2,0.526100,0.451948,0.818384,0.853106,0.872333,0.832954
3,0.526100,0.525375,0.831297,0.843538,0.880333,0.837095
4,0.177800,0.554152,0.850377,0.845027,0.888,0.847466


TrainOutput(global_step=6800, training_loss=0.35195834889131433, metrics={'train_runtime': 553.5894, 'train_samples_per_second': 98.268, 'train_steps_per_second': 12.283, 'total_flos': 732319596747072.0, 'train_loss': 0.35195834889131433, 'epoch': 4.0})

In [14]:
eval = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])

In [17]:
result = eval
params = {"model": "distilbert-base-german-cased"}
evaluate.save("./results/", **result, **params)

WindowsPath('results/result-2024_03_31-14_44_05.json')

In [16]:
trainer.save_model('distilbert-base-german-cased-dair-ai-emotion-de-finetuned')