In [2]:
# ✅ XLM-RoBERTa Fine-Tuning Pipeline (Combined English+Dutch) and then BERT Fine tuned for combined, English and Dutch seperately

import os
# Throttle MPS memory growth
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from transformers import (
    XLMRobertaTokenizerFast,
    XLMRobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

# 1) Load & prepare your combined CSV
csv_path = "/Users/feysal/Downloads/combined_sentiment_training_data.csv"
df = pd.read_csv(csv_path)
df = df[df["real_sentiment"].isin([-1, 0, 1])].dropna(subset=["Cleaned Comment Text"])
df["label"] = df["real_sentiment"].map({-1: 0, 0: 1, 1: 2})

train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=42
)

train_ds = Dataset.from_pandas(train_df)
val_ds   = Dataset.from_pandas(val_df)

# 2) Load tokenizer & model
model_name = "xlm-roberta-base"
tokenizer  = XLMRobertaTokenizerFast.from_pretrained(model_name)
model      = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

# 3) Tokenization function (no cleaning, raw text)
def tokenize_batch(batch):
    return tokenizer(
        batch["Cleaned Comment Text"],
        truncation=True,
        max_length=128  # shorten for memory
    )

# Apply tokenization and drop unused columns
remove_cols = ["Cleaned Comment Text", "real_sentiment", "__index_level_0__"]
train_ds = train_ds.map(tokenize_batch, batched=True, remove_columns=remove_cols)
val_ds   = val_ds.map(tokenize_batch,   batched=True, remove_columns=remove_cols)

# 4) Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer)

# 5) Metrics computation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, preds)}

# 6) Training arguments
training_args = TrainingArguments(
    output_dir               = "./xlm-roberta-finetuned",
    eval_strategy            = "epoch",    # alias for evaluation_strategy
    save_strategy            = "epoch",
    per_device_train_batch_size   = 2,
    per_device_eval_batch_size    = 4,
    gradient_accumulation_steps   = 8,
    num_train_epochs         = 4,
    learning_rate            = 2e-5,
    weight_decay             = 0.01,
    load_best_model_at_end   = True,
    metric_for_best_model    = "accuracy",
    logging_steps            = 50,
    fp16                     = False      # fp16 not recommended on MPS
)

# 7) Initialize Trainer
trainer = Trainer(
    model            = model,
    args             = training_args,
    train_dataset    = train_ds,
    eval_dataset     = val_ds,
    tokenizer        = tokenizer,
    data_collator    = data_collator,
    compute_metrics  = compute_metrics,
)

# 8) Train & evaluate
trainer.train()
metrics = trainer.evaluate()
print("\nFinal validation metrics:", metrics)

# 9) Detailed classification report
raw_preds = trainer.predict(val_ds)
preds     = np.argmax(raw_preds.predictions, axis=-1)
labels    = raw_preds.label_ids

print("\nAccuracy:", accuracy_score(labels, preds))
print("\nClassification Report:\n", classification_report(labels, preds,
      target_names=["Negative","Neutral","Positive"]))
print("\nConfusion Matrix:\n", confusion_matrix(labels, preds))


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/676 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
0,1.0134,0.972862,0.535503
1,0.8602,0.956173,0.594675
2,0.8108,0.82248,0.647929
3,0.6758,0.82821,0.64645



Final validation metrics: {'eval_loss': 0.8224799633026123, 'eval_accuracy': 0.6479289940828402, 'eval_runtime': 4.113, 'eval_samples_per_second': 164.355, 'eval_steps_per_second': 41.089, 'epoch': 3.9955555555555557}

Accuracy: 0.6479289940828402

Classification Report:
               precision    recall  f1-score   support

    Negative       0.67      0.66      0.66       227
     Neutral       0.52      0.41      0.46       198
    Positive       0.70      0.83      0.76       251

    accuracy                           0.65       676
   macro avg       0.63      0.63      0.63       676
weighted avg       0.64      0.65      0.64       676


Confusion Matrix:
 [[149  47  31]
 [ 59  81  58]
 [ 14  29 208]]


In [9]:
# Install required libraries
!pip install -q transformers datasets scikit-learn
!pip install tf-keras

# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import classification_report, accuracy_score
from transformers import EarlyStoppingCallback

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
df = pd.read_csv('/Users/feysal/Downloads/Dutch_sample_manually_labelled - dutch_comments_with_mapped_sentiment.csv')  # adjust to actual filename
df = df[df["real_sentiment"].isin([-1, 0, 1])]  # Filter valid classes
df = df.dropna(subset=["Cleaned Comment Text"])

# Label mapping: -1 → 0, 0 → 1, 1 → 2
df["label"] = df["real_sentiment"].map({-1: 0, 0: 1, 1: 2})

# Split
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Tokenization
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["Cleaned Comment Text"], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df[["Cleaned Comment Text", "label"]])
val_dataset = Dataset.from_pandas(val_df[["Cleaned Comment Text", "label"]])

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    ignore_mismatched_sizes=True
)

# Freeze lower layers (optional but recommended)
for name, param in model.bert.named_parameters():
    if "encoder.layer.11" not in name and "pooler" not in name:
        param.requires_grad = False

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions)
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

#  Train
trainer.train()

# 📊 Evaluate
eval_results = trainer.evaluate()
print("\n✅ Final Accuracy:", eval_results["eval_accuracy"])

# Full report
predictions = trainer.predict(val_dataset)
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=-1)

print("\n🧾 Classification Report:\n", classification_report(y_true, y_pred, target_names=["Negative", "Neutral", "Positive"]))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Map:   0%|          | 0/825 [00:00<?, ? examples/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.818605,0.652174
2,No log,0.792228,0.657005
3,No log,0.776218,0.68599
4,No log,0.770521,0.705314
5,No log,0.768761,0.710145



✅ Final Accuracy: 0.7101449275362319

🧾 Classification Report:
               precision    recall  f1-score   support

    Negative       0.74      0.77      0.76        84
     Neutral       0.67      0.63      0.65        71
    Positive       0.71      0.71      0.71        52

    accuracy                           0.71       207
   macro avg       0.71      0.71      0.71       207
weighted avg       0.71      0.71      0.71       207



In [11]:
# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import classification_report, accuracy_score
from transformers import EarlyStoppingCallback

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
df = pd.read_csv('/Users/feysal/Downloads/sample_english_with_real_sentiment - sample_english_with_real_sentiment.csv-2.csv')  # adjust to actual filename
df = df[df["real_sentiment"].isin([-1, 0, 1])]  # Filter valid classes
df = df.dropna(subset=["Cleaned Comment Text"])

# Label mapping: -1 → 0, 0 → 1, 1 → 2
df["label"] = df["real_sentiment"].map({-1: 0, 0: 1, 1: 2})

# Split
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Tokenization
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["Cleaned Comment Text"], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df[["Cleaned Comment Text", "label"]])
val_dataset = Dataset.from_pandas(val_df[["Cleaned Comment Text", "label"]])

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    ignore_mismatched_sizes=True
)

# Freeze lower layers (optional but recommended)
for name, param in model.bert.named_parameters():
    if "encoder.layer.11" not in name and "pooler" not in name:
        param.requires_grad = False

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions)
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

#  Train
trainer.train()

# Evaluate
eval_results = trainer.evaluate()
print("\n✅ Final Accuracy:", eval_results["eval_accuracy"])

# Full report
predictions = trainer.predict(val_dataset)
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=-1)

print("\n🧾 Classification Report:\n", classification_report(y_true, y_pred, target_names=["Negative", "Neutral", "Positive"]))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Map:   0%|          | 0/1875 [00:00<?, ? examples/s]

Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.649287,0.763326
2,No log,0.638038,0.767591
3,No log,0.634282,0.763326
4,No log,0.63517,0.767591



✅ Final Accuracy: 0.767590618336887

🧾 Classification Report:
               precision    recall  f1-score   support

    Negative       0.79      0.80      0.79       143
     Neutral       0.71      0.54      0.62       127
    Positive       0.78      0.89      0.83       199

    accuracy                           0.77       469
   macro avg       0.76      0.74      0.75       469
weighted avg       0.76      0.77      0.76       469



In [13]:
# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import classification_report, accuracy_score
from transformers import EarlyStoppingCallback

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
df = pd.read_csv('/Users/feysal/Downloads/combined_sentiment_training_data.csv')  # adjust to actual filename
df = df[df["real_sentiment"].isin([-1, 0, 1])]  # Filter valid classes
df = df.dropna(subset=["Cleaned Comment Text"])

# Label mapping: -1 → 0, 0 → 1, 1 → 2
df["label"] = df["real_sentiment"].map({-1: 0, 0: 1, 1: 2})

# Split
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Tokenization
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["Cleaned Comment Text"], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df[["Cleaned Comment Text", "label"]])
val_dataset = Dataset.from_pandas(val_df[["Cleaned Comment Text", "label"]])

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    ignore_mismatched_sizes=True
)

# Freeze lower layers (optional but recommended)
for name, param in model.bert.named_parameters():
    if "encoder.layer.11" not in name and "pooler" not in name:
        param.requires_grad = False

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions)
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

#  Train
trainer.train()

# Evaluate
eval_results = trainer.evaluate()
print("\n✅ Final Accuracy:", eval_results["eval_accuracy"])

# Full report
predictions = trainer.predict(val_dataset)
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=-1)

print("\n🧾 Classification Report:\n", classification_report(y_true, y_pred, target_names=["Negative", "Neutral", "Positive"]))

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/676 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.696308,0.715976
2,No log,0.677827,0.730769
3,0.739800,0.672559,0.720414
4,0.739800,0.666443,0.724852



✅ Final Accuracy: 0.7307692307692307

🧾 Classification Report:
               precision    recall  f1-score   support

    Negative       0.76      0.73      0.75       227
     Neutral       0.68      0.51      0.58       198
    Positive       0.73      0.90      0.81       251

    accuracy                           0.73       676
   macro avg       0.73      0.72      0.71       676
weighted avg       0.73      0.73      0.72       676

