# **Preprocessing**

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, EarlyStoppingCallback, Trainer
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn

In [None]:
# Read the comments CSV; the whole file is registered under a single split named "data".
path = r"comments.csv"
loaded_splits = load_dataset("csv", data_files={"data": path})
dataset = loaded_splits["data"]

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
# The split is bound to its own name instead of overwriting `dataset`, so the raw
# data stays available and re-running this cell splits the raw data again rather
# than operating on the result of a previous split.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits["train"]
val_dataset = dataset_splits["test"]

In [None]:
# Display the training split's summary (column names and row count).
train_dataset

Dataset({
    features: ['CommentText', 'Sentiment'],
    num_rows: 819328
})

In [None]:
# Sentiment string -> integer class id. Built once rather than on every row.
SENTIMENT_TO_LABEL_ID = {"Negative": 0, "Neutral": 1, "Positive": 2}


def map_sentiment(example):
    """Add an integer ``label`` field derived from the row's ``Sentiment`` string.

    Args:
        example: A single row, as a mapping that contains a ``"Sentiment"`` key
            whose value is ``"Negative"``, ``"Neutral"`` or ``"Positive"``.

    Returns:
        The same ``example`` object, with ``example["label"]`` set to 0, 1 or 2.

    Raises:
        KeyError: If ``"Sentiment"`` is missing, or its value is not one of the
            three known strings (the message names the offending value).
    """
    sentiment = example["Sentiment"]
    try:
        example["label"] = SENTIMENT_TO_LABEL_ID[sentiment]
    except KeyError:
        # Same exception type as a plain dict lookup, but say which value was
        # rejected so bad rows in a large CSV can be tracked down.
        raise KeyError(
            f"Unexpected Sentiment value {sentiment!r}; "
            f"expected one of {sorted(SENTIMENT_TO_LABEL_ID)}"
        ) from None
    return example

In [None]:
# Attach the integer `label` column to the training and validation splits.
train_dataset, val_dataset = (
    train_dataset.map(map_sentiment),
    val_dataset.map(map_sentiment),
)

In [None]:
# Pretrained checkpoint identifier; the tokenizer here and the classification
# model loaded later both come from this same checkpoint.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``CommentText`` field with the module-level ``tokenizer``.

    Every sequence is truncated to, and padded up to, a fixed length of 64.

    Args:
        examples: Mapping with a ``"CommentText"`` entry holding the text(s)
            to encode (applied in batched mode by the caller).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    comments = examples["CommentText"]
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(comments, **encoding_options)

In [None]:
# Tokenize both splits in batched mode.
train_dataset, val_dataset = (
    train_dataset.map(tokenize_function, batched=True),
    val_dataset.map(tokenize_function, batched=True),
)

In [None]:
# Expose only the model inputs and the target column, formatted as torch tensors.
columns_to_keep = ["input_ids", "attention_mask", "label"]
for _split in (train_dataset, val_dataset):
    _split.set_format(type="torch", columns=columns_to_keep)

In [None]:
# Three output classes, matching the 0/1/2 ids that map_sentiment assigns to
# Negative/Neutral/Positive.
num_labels = 3
# Load the checkpoint as a sequence-classification model with `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# **Fine-tuning**


In [None]:
class CustomTrainer(Trainer):
    """Trainer that optimises label-smoothed cross-entropy on the model's logits.

    Attributes:
        loss_label_smoothing: Smoothing factor passed to the cross-entropy loss.
            Defaults to 0.1; override on a subclass or instance to tune it.
    """

    loss_label_smoothing = 0.1

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute label-smoothed cross-entropy between the logits and ``labels``.

        Args:
            model: The model to run the forward pass on.
            inputs: Mapping of model inputs; must contain a ``"labels"`` entry.
            return_outputs: If true, also return the model outputs.
            **kwargs: Extra arguments supplied by the caller; accepted and ignored.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: If ``inputs`` has no ``"labels"`` entry.
        """
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError(
                "CustomTrainer.compute_loss requires a 'labels' entry in `inputs`."
            )

        # Run the forward pass without the labels. Passing them would make the
        # model compute its own loss, which is never used here, and would put
        # that loss (not the logits) first in a tuple-style output, breaking
        # the `outputs[0]` fallback below. `inputs` itself is left unmodified.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]

        # Functional form: no loss module is constructed on every step.
        loss = nn.functional.cross_entropy(
            logits, labels, label_smoothing=self.loss_label_smoothing
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./result",
    # Evaluation, checkpointing and logging cadence (in steps); evaluation and
    # saving share the same 125-step interval.
    evaluation_strategy="steps",
    eval_steps=125,
    save_steps=125,
    logging_steps=100,
    load_best_model_at_end=True,
    # Batch sizing: 1176 samples per device, accumulated over 2 steps.
    per_device_train_batch_size=1176,
    per_device_eval_batch_size=1176,
    gradient_accumulation_steps=2,
    fp16=True,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=1e-5,
    weight_decay=0.05,
    warmup_steps=500,
)



In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    return {"accuracy": accuracy}

In [None]:
# Early stopping configured with a patience of 5.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

# Wire the model, arguments, data splits, metric function and callback together.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
125,0.6511,0.633147,0.799148
250,0.6465,0.629841,0.800257
375,0.6473,0.630979,0.800499
500,0.6432,0.629981,0.800389
625,0.6417,0.629567,0.800224
750,0.6352,0.628263,0.801345
875,0.633,0.628241,0.801751
1000,0.6305,0.627432,0.801465


TrainOutput(global_step=1044, training_loss=0.6392939940266226, metrics={'train_runtime': 1346.9264, 'train_samples_per_second': 1824.884, 'train_steps_per_second': 0.775, 'total_flos': 8.065899688502477e+16, 'train_loss': 0.6392939940266226, 'epoch': 2.9928263988522237})

In [None]:
# Write the model and the tokenizer into the same directory.
final_model_dir = "./youtube_sentiment_model_final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('./youtube_sentiment_model_final/tokenizer_config.json',
 './youtube_sentiment_model_final/special_tokens_map.json',
 './youtube_sentiment_model_final/tokenizer.json')