# RoBERTa-base with SMOTE and Hyperparameter Tuning
This notebook loads the data, preprocesses it, vectorizes it using a pretrained `roberta-base` model from Hugging Face, applies SMOTE to address class imbalance, and performs hyperparameter tuning.

In [9]:
#!pip install -q transformers datasets imbalanced-learn scikit-learn

In [10]:
#!pip install matplotlib
#!pip install torch

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.utils.class_weight import(
   compute_class_weight,
)
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoConfig,
    DataCollatorWithPadding,
    RobertaForSequenceClassification
)
import torch



from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Get pooled embeddings instead of input_ids for SMOTE
from transformers import RobertaModel
import torch

from transformers import RobertaTokenizer

from imblearn.over_sampling import SMOTE


In [12]:
#from google.colab import drive
#drive.mount('/content/drive')

# Read the training CSV from a path relative to the notebook's working directory.
# (Colab location, kept for reference:
#  "/content/drive/MyDrive/Text Mining/textmining/Project Data-20250507/train.csv")
train_path = "Project Data-20250507/train.csv"
train_df = pd.read_csv(train_path)

# Summarise the frame: (rows, columns) followed by the column names.
shape_summary = train_df.shape
column_names = train_df.columns.tolist()
print("Training data shape:", shape_summary)
print("\nTraining data columns:", column_names)

# The trailing expression renders the first rows as the cell output.
print("\nFirst 5 rows of training data:")
train_df.head()

Training data shape: (9543, 2)

Training data columns: ['text', 'label']

First 5 rows of training data:


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [13]:
# Clean text
def preprocess_text(text):
    """Normalise one raw text entry: lowercase, trim, and flatten newlines.

    Parameters
    ----------
    text : str
        Raw text of a single row. Values that are not strings (for example
        the float NaN that stands in for an empty CSV cell, or ``None``) are
        returned unchanged instead of raising ``AttributeError``, so that a
        later ``dropna`` can still remove them.

    Returns
    -------
    str
        The lowercased text with leading/trailing whitespace removed and
        every remaining newline replaced by a single space. Non-string
        input is returned as-is.
    """
    # Missing values are floats/None and have no .lower(); leave them alone.
    if not isinstance(text, str):
        return text
    return text.lower().strip().replace('\n', ' ')

# Run the cleaner over every entry of the text column, element by element.
train_df['text'] = train_df['text'].map(preprocess_text)

# Preview the cleaned frame; the trailing expression is the cell output.
print("\nFirst 5 rows of clean training data:")
train_df.head()


First 5 rows of clean training data:


Unnamed: 0,text,label
0,$bynd - jpmorgan reels in expectations on beyo...,0
1,$ccl $rcl - nomura points to bookings weakness...,0
2,"$cx - cemex cut at credit suisse, j.p. morgan ...",0
3,$ess: btig research cuts to neutral https://t....,0
4,$fnko - funko slides after piper jaffray pt cu...,0


In [14]:
#import pandas as pd

# Load dataset (update path if needed)
#df = pd.read_csv("train.csv")

# Drop rows missing either field (in place), then force labels to integers.
required_columns = ['text', 'label']
train_df.dropna(subset=required_columns, inplace=True)
train_df['label'] = train_df['label'].astype(int)

# Stratified 80/20 hold-out split over plain Python lists.
from sklearn.model_selection import train_test_split
X, y = train_df['text'].tolist(), train_df['label'].tolist()
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [15]:


# Tokenizer paired with the `roberta-base` checkpoint used throughout the notebook.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize_texts(texts, max_length=128, return_tensors="pt"):
    """Tokenize `texts` with the notebook-wide `tokenizer`.

    Padding and truncation are always enabled.

    Parameters
    ----------
    texts : str or list of str
        Text(s) handed to the tokenizer as-is.
    max_length : int, default 128
        Forwarded to the tokenizer as ``max_length``.
    return_tensors : str or None, default "pt"
        Forwarded to the tokenizer as ``return_tensors``; the default matches
        the original hard-coded value.

    Returns
    -------
    Whatever the tokenizer call returns for these arguments.
    """
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors=return_tensors,
        max_length=max_length,
    )

# Encode both splits with one shared set of tokenizer options
# (no return_tensors argument is passed here).
encode_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(X_train, **encode_kwargs)
val_encodings = tokenizer(X_val, **encode_kwargs)


In [16]:


# Frozen encoder used only to turn texts into fixed-size vectors for SMOTE.
model_eval = RobertaModel.from_pretrained('roberta-base')
model_eval.eval()

# Encode in mini-batches: pushing the whole training split through the model
# in a single forward pass needs memory proportional to the full split and
# can exhaust RAM. Each batch is tokenized and padded on its own.
EMBED_BATCH_SIZE = 32
embedding_chunks = []
with torch.no_grad():
    for start in range(0, len(X_train), EMBED_BATCH_SIZE):
        batch_texts = X_train[start:start + EMBED_BATCH_SIZE]
        batch_inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=128,
        )
        # Keep the hidden state at the first token position of every sequence.
        first_token_states = model_eval(**batch_inputs).last_hidden_state[:, 0, :]
        embedding_chunks.append(first_token_states.numpy())
train_embeddings = np.concatenate(embedding_chunks, axis=0)

# Reduce dimensionality before SMOTE.
# random_state pins PCA's randomized solver so the reduced features (and
# therefore the synthetic samples) are reproducible across runs.
scaled_embeddings = StandardScaler().fit_transform(train_embeddings)
X_reduced = PCA(n_components=50, random_state=42).fit_transform(scaled_embeddings)

# Now apply SMOTE
# NOTE(review): X_res / y_res are not referenced by any later cell visible in
# this notebook; the Trainer below is fed the original, un-resampled texts.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_reduced, y_train)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from torch.utils.data import Dataset

class RobertaDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to an indexable collection of
    per-example values; `labels` is an indexable collection of the same
    length. Each item is a dict of tensors, one per encoding field, plus a
    "labels" entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is added last, under the key "labels".
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Create datasets for Trainer.
# Reuse the encodings produced earlier with the same tokenizer arguments
# (truncation=True, padding=True, max_length=128) instead of tokenizing both
# splits a second time.
train_dataset = RobertaDataset(train_encodings, y_train)
val_dataset = RobertaDataset(val_encodings, y_val)


In [18]:
#pip install 'accelerate>=0.26.0'


In [19]:
%pip install 'accelerate>=0.26.0'

from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from transformers import TrainingArguments


def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1/precision/recall for the Trainer.

    Parameters
    ----------
    pred
        Object exposing ``label_ids`` (true labels) and ``predictions``
        (per-class scores; the class is taken as the argmax over the last
        axis).

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 scores a class that is never predicted (or never
    # present) as 0 without emitting an undefined-metric warning on every
    # evaluation pass.
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='macro', zero_division=0),
        'precision': precision_score(labels, preds, average='macro', zero_division=0),
        'recall': recall_score(labels, preds, average='macro', zero_division=0)
    }

# Sequence classifier on top of `roberta-base` with three output labels.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Training configuration: 3 epochs, batches of 16 for both training and
# evaluation, checkpoints under ./results and logs under ./logs.
training_kwargs = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
)
training_args = TrainingArguments(**training_kwargs)

# Wire the model, configuration, both datasets and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune; the returned value is displayed as the cell output.
trainer.train()


Note: you may need to restart the kernel to use updated packages.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# Score the validation split with the trained model.
predictions = trainer.predict(val_dataset)
y_true, y_pred = predictions.label_ids, predictions.predictions.argmax(-1)

# Per-class breakdown.
print("Validation Classification Report:")
report_text = classification_report(y_true, y_pred, digits=4)
print(report_text)

# Headline numbers: accuracy plus macro-averaged F1, precision and recall.
accuracy_value = accuracy_score(y_true, y_pred)
print(f"Accuracy:  {accuracy_value:.4f}")
macro_f1 = f1_score(y_true, y_pred, average='macro')
print(f"F1 Score:  {macro_f1:.4f}")
macro_precision = precision_score(y_true, y_pred, average='macro')
print(f"Precision: {macro_precision:.4f}")
macro_recall = recall_score(y_true, y_pred, average='macro')
print(f"Recall:    {macro_recall:.4f}")
