In [41]:
import os
import sys
import pandas as pd

In [42]:
# Load the pre-split train / validation / test CSV files (paths are relative to the notebook).
train_df = pd.read_csv('../../data/german/train.csv')
val_df = pd.read_csv('../../data/german/valid.csv')
test_df = pd.read_csv('../../data/german/test.csv')

# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,text,label,source
0,@user korrekt! Verstehe sowas nicht...,negative,sb_10k
1,Einparken können die Aliens auch nicht! #schlefaz,neutral,sb_10k
2,Der Dubbletimepart von Julien war ja mal sowas...,positive,sb_10k
3,#Instachallenge #Day16 #what #i #am #reading #...,negative,sb_10k
4,Also gleich. Mach noch das Video fertig.,neutral,sb_10k


In [43]:
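# NOTE: optional stratified down-sampling of the training set (currently disabled).
# `test_size` below has no value yet; fill it in before re-enabling this cell.
# from sklearn.model_selection import train_test_split

# train_df, _ = train_test_split(
#     train_df,
#     test_size = ,
#     stratify = train_df['label'],
#     random_state = 42
# )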

In [44]:
# Integer class ids for the three sentiment labels.
mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

# Create a new column with mapped values.
# `Series.map` silently turns any label missing from `mapping` (including NaN) into NaN,
# which would only surface later as broken integer targets, so fail fast here instead.
for split_name, split_df in (('train', train_df), ('test', test_df), ('val', val_df)):
    unmapped = set(split_df['label'].unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"{split_name} split has labels missing from `mapping`: {sorted(unmapped, key=str)}"
        )
    # `split_df` is the same object as the corresponding *_df, so this adds the column in place.
    split_df['sentiment_num'] = split_df['label'].map(mapping).astype('int64')

In [45]:
# Preview the test split to confirm the new `sentiment_num` column.
test_df.head()

Unnamed: 0,text,label,source,sentiment_num
0,@user korrekt! Verstehe sowas nicht...,negative,sb_10k,0
1,Einparken können die Aliens auch nicht! #schlefaz,neutral,sb_10k,1
2,Der Dubbletimepart von Julien war ja mal sowas...,positive,sb_10k,2
3,#Instachallenge #Day16 #what #i #am #reading #...,negative,sb_10k,0
4,Also gleich. Mach noch das Video fertig.,neutral,sb_10k,1


In [46]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AdamW
from datasets import Dataset

# Fixed sequence length used when tokenizing (tweets are short).
MAX_LEN = 100

# Checkpoint id used for the tokenizer here and for the classifier loaded further down.
model_name = "bert-base-multilingual-uncased"
tok = BertTokenizer.from_pretrained(model_name)
# Converting to tensors
def tokenize(batch):
    """Tokenize the ``text`` column of ``batch`` into fixed-length PyTorch tensors.

    ``batch`` must support ``batch["text"].tolist()`` (the notebook passes pandas
    DataFrames). Each text is padded or truncated to ``MAX_LEN`` tokens by the
    module-level tokenizer ``tok``, whose result is returned as-is.
    """
    texts = batch["text"].tolist()
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
        "return_tensors": "pt",
    }
    return tok(texts, **tokenizer_options)

# Encode the training and validation splits with the shared `tokenize` helper.
train_encodings, val_encodings = (tokenize(split_df) for split_df in (train_df, val_df))



In [47]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to an indexable container;
    ``labels`` is an indexable sequence whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field at ``idx`` plus a long ``labels`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Pair each split's encodings with its integer sentiment targets.
train_dataset = SentimentDataset(
    encodings=train_encodings,
    labels=train_df["sentiment_num"].to_numpy(),
)
val_dataset = SentimentDataset(
    encodings=val_encodings,
    labels=val_df["sentiment_num"].to_numpy(),
)

In [48]:
from torch.utils.data import DataLoader

# One seed shared by the classifier-head initialisation and the Trainer.
SEED = 42

# NOTE(review): this DataLoader is not passed to the Trainer below (the Trainer receives
# `train_dataset` and batches it with `per_device_train_batch_size`); kept in case other
# cells rely on it.
train_dataloader = DataLoader(train_dataset, shuffle = True, batch_size = 8)

# The checkpoint ships without a classification head, so `classifier.weight` and
# `classifier.bias` are randomly initialised when the model is built. Seed torch first so
# that initialisation, and therefore the fine-tuning run, is repeatable.
torch.manual_seed(SEED)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Evaluate, log and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|███▎      | 115/345 [01:16<02:40,  1.43it/s]

{'loss': 0.9298, 'grad_norm': 9.266109466552734, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}



 33%|███▎      | 115/345 [01:20<02:40,  1.43it/s]

{'eval_loss': 0.8602445721626282, 'eval_runtime': 3.6004, 'eval_samples_per_second': 89.99, 'eval_steps_per_second': 5.833, 'epoch': 1.0}


 67%|██████▋   | 230/345 [02:40<01:26,  1.33it/s]

{'loss': 0.7246, 'grad_norm': 7.968637466430664, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}



 67%|██████▋   | 230/345 [02:45<01:26,  1.33it/s]

{'eval_loss': 0.8266333937644958, 'eval_runtime': 4.1609, 'eval_samples_per_second': 77.869, 'eval_steps_per_second': 5.047, 'epoch': 2.0}


100%|██████████| 345/345 [04:28<00:00,  1.36it/s]

{'loss': 0.5598, 'grad_norm': 16.82453727722168, 'learning_rate': 0.0, 'epoch': 3.0}



100%|██████████| 345/345 [04:32<00:00,  1.36it/s]

{'eval_loss': 0.8053461909294128, 'eval_runtime': 3.8702, 'eval_samples_per_second': 83.716, 'eval_steps_per_second': 5.426, 'epoch': 3.0}


100%|██████████| 345/345 [04:34<00:00,  1.26it/s]

{'train_runtime': 274.5602, 'train_samples_per_second': 20.094, 'train_steps_per_second': 1.257, 'train_loss': 0.7380588255066802, 'epoch': 3.0}





TrainOutput(global_step=345, training_loss=0.7380588255066802, metrics={'train_runtime': 274.5602, 'train_samples_per_second': 20.094, 'train_steps_per_second': 1.257, 'total_flos': 283514985469800.0, 'train_loss': 0.7380588255066802, 'epoch': 3.0})

In [49]:
# Re-run evaluation on the Trainer's `eval_dataset` (the validation split) and print the metrics dict.
metrics = trainer.evaluate()
print(metrics)

100%|██████████| 21/21 [00:03<00:00,  5.94it/s]

{'eval_loss': 0.8053461909294128, 'eval_runtime': 3.5619, 'eval_samples_per_second': 90.962, 'eval_steps_per_second': 5.896, 'epoch': 3.0}





In [50]:
# Run inference over the validation set; the result's `.predictions` and `.label_ids` are read in the metrics cell.
predictions = trainer.predict(val_dataset)

100%|██████████| 21/21 [00:04<00:00,  4.66it/s]


In [51]:
import numpy as np

from sklearn.metrics import accuracy_score, f1_score, classification_report

logits = predictions.predictions      # or predictions[0]
labels = predictions.label_ids        # or predictions[1]

# Predicted class id = index of the largest logit for each example.
y_pred = np.argmax(logits, axis=-1)

print("Accuracy:", accuracy_score(labels, y_pred))
print("F1 (macro):", f1_score(labels, y_pred, average="macro"))

# Take class ids and display names from `mapping` (ordered by id) instead of repeating them
# by hand, and pass the ids explicitly so the report still lines up when a class is absent
# from both the labels and the predictions.
class_names = sorted(mapping, key=mapping.get)
class_ids = [mapping[name] for name in class_names]

print(
    classification_report(
        labels,
        y_pred,
        labels=class_ids,
        target_names=class_names,
    )
)

Accuracy: 0.6882716049382716
F1 (macro): 0.6872524196478546
              precision    recall  f1-score   support

    negative       0.64      0.66      0.65       108
     neutral       0.72      0.63      0.67       108
    positive       0.71      0.78      0.74       108

    accuracy                           0.69       324
   macro avg       0.69      0.69      0.69       324
weighted avg       0.69      0.69      0.69       324

