In [None]:
# Configure model, metrics, and Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Optional human-readable label maps. Leave both as None to keep the label
# names stored in the checkpoint, or fill them in together, e.g.:
#   id2label = {0: "negative", 1: "neutral", 2: "positive"}
#   label2id = {name: idx for idx, name in id2label.items()}
# Dataset labels must be integers starting at 0.
label2id = None
id2label = None

# Forward the maps only when they are actually defined. Passing `None` maps
# next to `num_labels` is, depending on the transformers version, either
# rejected as inconsistent with `num_labels` or overwrites the checkpoint's
# own label names with None.
model_kwargs = {"num_labels": num_labels}
if label2id is not None:
    model_kwargs["label2id"] = label2id
if id2label is not None:
    model_kwargs["id2label"] = id2label

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    # No effect when the head already has `num_labels` outputs; otherwise the
    # classification head is re-initialised instead of raising a size error.
    ignore_mismatched_sizes=True,
    **model_kwargs,
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. ``logits``
            is an array of per-class scores whose last axis indexes the
            classes, or a tuple whose first element is that array.

    Returns:
        dict: ``{"accuracy": ..., "f1_macro": ...}`` computed from the
        arg-max class of each row against ``labels``.
    """
    logits, labels = eval_pred
    # When a model returns extra outputs, predictions arrive as a tuple with
    # the logits first; unwrap it so argmax runs over the class scores only.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }

# Fine-tune the model, evaluate it on the validation split, and save the result.
import inspect  # stdlib; used only to probe the installed transformers API

RUN_DIR = "runs/roberta-sentiment"
FINETUNED_DIR = "models/roberta-sentiment-finetuned"

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever keyword the installed version accepts.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"

args = TrainingArguments(
    output_dir=RUN_DIR,
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    **{_eval_key: "epoch"},
)

# Likewise, `tokenizer=` on Trainer was superseded by `processing_class=`.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()
metrics = trainer.evaluate()

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded later with `from_pretrained`.
trainer.save_model(FINETUNED_DIR)
tokenizer.save_pretrained(FINETUNED_DIR)

# Keep this as the last expression: a notebook cell only renders its final value.
metrics

In [None]:
# Tokenize dataset using the RoBERTa tokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding

# Hub checkpoint whose tokenizer is loaded here.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize the ``text_column`` entries of ``batch``, truncating to 256 tokens."""
    texts = batch[text_column]
    return tokenizer(texts, max_length=256, truncation=True)

# Tokenize both splits in batches.
train_tok = train_ds.map(preprocess, batched=True)
val_tok = val_ds.map(preprocess, batched=True)

# Collator that pads the tokenized examples when they are batched.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Count the classes over both splits so a label that only occurs in the
# validation split is not missed.
label_values = set(train_ds.unique(label_column)) | set(val_ds.unique(label_column))
num_labels = len(label_values)

# The classification head indexes classes 0..num_labels-1, so fail early with
# a readable message instead of an obscure error in the middle of training.
if label_values != set(range(num_labels)):
    raise ValueError(
        f"Column {label_column!r} must contain the integer labels 0..{num_labels - 1}, "
        f"but found {label_values!r}. Map your labels to consecutive integer ids first."
    )

In [None]:
# Load dataset from CSV and prepare splits
from datasets import load_dataset

import os  # stdlib; lets the data path be overridden without editing the notebook

# Location of the CSV file. Set the SENTIMENT_CSV_PATH environment variable to
# point at your own copy; the default is the original hard-coded location.
csv_path = os.environ.get("SENTIMENT_CSV_PATH", r"d:\NLP_Project\twitter_sentiment_data.csv")
raw_ds = load_dataset("csv", data_files={"train": csv_path}, split="train")

# If your CSV has columns like 'text' and 'label' adjust here
text_column = "text"
label_column = "label"

# Fail early, with the available column names, if the configuration above does
# not match the file; later cells index the dataset by these names.
missing_columns = [name for name in (text_column, label_column) if name not in raw_ds.column_names]
if missing_columns:
    raise ValueError(
        f"Column(s) {missing_columns} not found in {csv_path!r}; "
        f"available columns: {raw_ds.column_names}"
    )

# Create validation split (10%); the fixed seed keeps the split reproducible.
dataset = raw_ds.train_test_split(test_size=0.1, seed=42)
train_ds = dataset["train"]
val_ds = dataset["test"]

# Inspect a sample
train_ds[0]

In [None]:
# Install training dependencies
import sys
!{sys.executable} -m pip install -U datasets scikit-learn

In [None]:
# Optional: install required packages (uncomment if needed)
## Note: Running installs from the notebook may require internet access.
## On Windows cmd, you can also install via terminal:
## pip install --upgrade pip
## pip install transformers torch sentencepiece

# If you prefer inline install, uncomment the following:
# import sys
# !{sys.executable} -m pip install -U pip
# !{sys.executable} -m pip install transformers torch sentencepiece

In [None]:
# Fix notebook progress bars and optional HF Xet support
# import sys
# !{sys.executable} -m pip install --upgrade ipywidgets jupyter jupyterlab notebook
# Optional: speed up Hugging Face downloads
# !{sys.executable} -m pip install "huggingface_hub[hf_xet]"
# Optional: silence symlink warning without enabling Developer Mode
# import os
# os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

In [1]:
# Initialize RoBERTa sentiment pipeline using CardiffNLP model
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch

# Hub checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Wrap model + tokenizer in a PyTorch text-classification pipeline.
# Device index 0 is used when CUDA is available, -1 (CPU) otherwise.
sentiment_pipeline = TextClassificationPipeline(
    framework="pt",
    device=-1 if not torch.cuda.is_available() else 0,
    tokenizer=tokenizer,
    model=model,
)

# Show the id -> label-name mapping stored in the model config.
labels = model.config.id2label
labels

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu


{0: 'negative', 1: 'neutral', 2: 'positive'}

In [2]:
# Quick test
# Sample sentences for a smoke test of the pipeline.
texts = [
    "I love this product!",
    "This is the worst experience ever.",
    "It's okay, nothing special.",
    "I hate you so much!",
    "I adore you so much!"
]

# With top_k=1 each input yields a one-element list holding its best label.
results = sentiment_pipeline(texts, top_k=1)
for sentence, ranked in zip(texts, results):
    best = ranked[0]
    print(f"{sentence} -> {best['label']} ({best['score']:.3f})")

I love this product! -> positive (0.985)
This is the worst experience ever. -> negative (0.945)
It's okay, nothing special. -> neutral (0.599)
I hate you so much! -> negative (0.935)
I adore you so much! -> positive (0.984)


# RoBERTa Sentiment Analysis
Set up the environment, initialize a RoBERTa-based sentiment pipeline, and optionally fine-tune the model on a labeled CSV dataset.