In [1]:
import os
import sys
import pandas as pd
import re


In [2]:
test_df = pd.read_csv('../../data/german/test.csv')
train_df = pd.read_csv('../../data/german/train.csv')
val_df = pd.read_csv('../../data/german/valid.csv')


test_df.head()

Unnamed: 0,text,label,source
0,@user korrekt! Verstehe sowas nicht...,negative,sb_10k
1,Einparken kÃ¶nnen die Aliens auch nicht! #schlefaz,neutral,sb_10k
2,Der Dubbletimepart von Julien war ja mal sowas...,positive,sb_10k
3,#Instachallenge #Day16 #what #i #am #reading #...,negative,sb_10k
4,Also gleich. Mach noch das Video fertig.,neutral,sb_10k


In [3]:
# from sklearn.model_selection import train_test_split

# train_df, _ = train_test_split(
#     train_df,
#     test_size = 0.6 ,
#     stratify = train_df['label'],
#     random_state = 42
# )

In [4]:

mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

# Create a new column with mapped values
train_df['sentiment_num'] = train_df['label'].map(mapping)
test_df['sentiment_num'] = test_df['label'].map(mapping)
val_df['sentiment_num'] = val_df['label'].map(mapping)

In [5]:
url_pattern = re.compile(r"http\S+|www\.\S+")
mention_pattern = re.compile(r"@\w+")
hashtag_pattern = re.compile(r"#(\w+)")
rt_pattern = re.compile(r"^RT\s+@\w+:\s*")

def normalize_elongation(word, max_repeat=2):
    return re.sub(r"(.)\1{"+str(max_repeat)+r",}", r"\1"*max_repeat, word)

def clean_tweet(text: str) -> str:
    if not isinstance(text, str):
        return ""
    
    text = rt_pattern.sub("", text)
    text = url_pattern.sub(" URL ", text)
    text = mention_pattern.sub(" @usuario ", text)
    text = hashtag_pattern.sub(r"\1", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = text.lower()
    tokens = []
    for tok in text.split():
        tokens.append(normalize_elongation(tok))
    text = " ".join(tokens)

    return text



train_df["clean_text"] = train_df["text"].apply(clean_tweet)
test_df["clean_text"] = test_df["text"].apply(clean_tweet)
val_df["clean_text"] = val_df["text"].apply(clean_tweet)

In [None]:
import torch

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AdamW, XLMRobertaTokenizerFast, XLMRobertaForSequenceClassification
from datasets import Dataset

model_name = "xlm-roberta-base"
tok = XLMRobertaTokenizerFast.from_pretrained(model_name)

MAX_LEN = 128 
def tokenize(batch):
    return tok(
        batch["text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )

train_encodings = tokenize(train_df)
val_encodings = tokenize(val_df)
test_encodings = tokenize(test_df)


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = SentimentDataset(train_encodings, train_df["sentiment_num"].values)
val_dataset = SentimentDataset(val_encodings, val_df["sentiment_num"].values)
test_dataset = SentimentDataset(test_encodings, test_df["sentiment_num"].values)

In [8]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset, shuffle = True, batch_size = 8)
#change number of labels based on keep/drop neutral class

label2id = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

id2label = {v: k for k, v in label2id.items()}

num_labels = len(label2id)
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
training_args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 20%|â–ˆâ–ˆ        | 115/575 [01:29<06:39,  1.15it/s]

{'loss': 1.0165, 'grad_norm': 13.730364799499512, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


                                                 
 20%|â–ˆâ–ˆ        | 115/575 [01:33<06:39,  1.15it/s]

{'eval_loss': 0.8808032274246216, 'eval_runtime': 3.4314, 'eval_samples_per_second': 94.423, 'eval_steps_per_second': 6.12, 'epoch': 1.0}


 40%|â–ˆâ–ˆâ–ˆâ–ˆ      | 230/575 [03:07<04:40,  1.23it/s]

{'loss': 0.7953, 'grad_norm': 17.312267303466797, 'learning_rate': 1.2e-05, 'epoch': 2.0}


                                                 
 40%|â–ˆâ–ˆâ–ˆâ–ˆ      | 230/575 [03:11<04:40,  1.23it/s]

{'eval_loss': 0.6955017447471619, 'eval_runtime': 3.4792, 'eval_samples_per_second': 93.125, 'eval_steps_per_second': 6.036, 'epoch': 2.0}


 60%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ    | 345/575 [05:02<03:35,  1.07it/s]

{'loss': 0.6601, 'grad_norm': 31.852636337280273, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


                                                 
 60%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ    | 345/575 [05:06<03:35,  1.07it/s]

{'eval_loss': 0.6720032095909119, 'eval_runtime': 4.0417, 'eval_samples_per_second': 80.164, 'eval_steps_per_second': 5.196, 'epoch': 3.0}


 80%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ  | 460/575 [06:53<01:43,  1.12it/s]

{'loss': 0.5504, 'grad_norm': 23.26292610168457, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


                                                 
 80%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ  | 460/575 [06:57<01:43,  1.12it/s]

{'eval_loss': 0.695486307144165, 'eval_runtime': 4.1353, 'eval_samples_per_second': 78.351, 'eval_steps_per_second': 5.078, 'epoch': 4.0}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 575/575 [08:54<00:00,  1.08it/s]

{'loss': 0.4573, 'grad_norm': 18.49350357055664, 'learning_rate': 0.0, 'epoch': 5.0}


                                                 
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 575/575 [08:57<00:00,  1.08it/s]

{'eval_loss': 0.7033093571662903, 'eval_runtime': 3.1146, 'eval_samples_per_second': 104.028, 'eval_steps_per_second': 6.743, 'epoch': 5.0}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 575/575 [09:01<00:00,  1.06it/s]

{'train_runtime': 541.1271, 'train_samples_per_second': 16.992, 'train_steps_per_second': 1.063, 'train_loss': 0.695913430918818, 'epoch': 5.0}





TrainOutput(global_step=575, training_loss=0.695913430918818, metrics={'train_runtime': 541.1271, 'train_samples_per_second': 16.992, 'train_steps_per_second': 1.063, 'total_flos': 472524975783000.0, 'train_loss': 0.695913430918818, 'epoch': 5.0})

In [9]:
metrics = trainer.evaluate()
print(metrics)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 21/21 [00:03<00:00,  6.84it/s]

{'eval_loss': 0.7033093571662903, 'eval_runtime': 3.0868, 'eval_samples_per_second': 104.965, 'eval_steps_per_second': 6.803, 'epoch': 5.0}





In [10]:
predictions = trainer.predict(val_dataset)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 21/21 [00:03<00:00,  5.35it/s]


In [12]:
import numpy as np

from sklearn.metrics import accuracy_score, f1_score, classification_report

logits = predictions.predictions      # or predictions[0]
labels = predictions.label_ids        # or predictions[1]

y_pred = np.argmax(logits, axis=-1)

print("Accuracy:", accuracy_score(labels, y_pred))
print("F1 (macro):", f1_score(labels, y_pred, average="macro"))

print(
    classification_report(
        labels,
        y_pred,
        target_names=["negative", "neutral", "positive"]
    )
)

Accuracy: 0.7438271604938271
F1 (macro): 0.7423535727288263
              precision    recall  f1-score   support

    negative       0.71      0.74      0.72       108
     neutral       0.75      0.66      0.70       108
    positive       0.78      0.83      0.80       108

    accuracy                           0.74       324
   macro avg       0.74      0.74      0.74       324
weighted avg       0.74      0.74      0.74       324



Test Dataset

In [13]:
pred = trainer.predict(test_dataset)
y_true = test_df["sentiment_num"]
y_pred = np.argmax(pred.predictions, axis=1)


logits = pred.predictions   
labels = pred.label_ids   


print("Accuracy:", accuracy_score(labels, y_pred))
print("F1 (macro):", f1_score(labels, y_pred, average="macro"))

print(
    classification_report(
        labels,
        y_pred,
        #target_names=["negative", "neutral", "positive"]
        target_names=["negative","neutral", "positive"]
    )
)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 21/21 [00:02<00:00,  7.40it/s]


Accuracy: 0.7438271604938271
F1 (macro): 0.7436737945019444
              precision    recall  f1-score   support

    negative       0.70      0.75      0.73       108
     neutral       0.75      0.69      0.72       108
    positive       0.78      0.79      0.78       108

    accuracy                           0.74       324
   macro avg       0.74      0.74      0.74       324
weighted avg       0.74      0.74      0.74       324

