# RoBERTa with XGBoost

This notebook uses the pretrained transformer model cardiffnlp/twitter-roberta-base-2021-124m to embed tweets into vectors that capture contextual meaning, which are then used as input features for an XGBoost classifier.

We also used PCA and SMOTE to handle the imbalance in the data set and prepare the data better for XGBoost. 

# Imports

In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import uniform, randint


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

# Load train data
#train_path = "/content/drive/MyDrive/Text Mining/textmining/Project Data-20250507/train.csv"
train_path = "Project Data-20250507/train.csv"
train_df = pd.read_csv(train_path)

# View shape and features
print("Training data shape:", train_df.shape)
print("\nTraining data columns:", train_df.columns.tolist())

# Display first few rows
print("\nFirst 5 rows of training data:")
train_df.head()

Training data shape: (9543, 2)

Training data columns: ['text', 'label']

First 5 rows of training data:


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [None]:
# Load dataset
#df = pd.read_csv("train.csv")
train_df.dropna(subset=['text', 'label'], inplace=True)
train_df['label'] = train_df['label'].astype(int)

X = train_df['text'].tolist()
y = train_df['label'].tolist()

# Train/Val split
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")
model.eval()

def get_embeddings(texts):
    with torch.no_grad():
        inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=32, return_tensors="pt")
        outputs = model(**inputs)
        return outputs.last_hidden_state[:, 0, :].numpy()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Pr-Processing

Normalize, PCA and SMOTE 

In [None]:
# Get embeddings
X_train_embed = get_embeddings(X_train_texts)
X_val_embed = get_embeddings(X_val_texts)

# Normalize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_embed)
X_val_scaled = scaler.transform(X_val_embed)


In [None]:
# PCA
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

# SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_pca, y_train)


Also added random search for hyperparameter tuning inside XGBoost

# Initialize Model  

In [None]:
param_dist = {
    "n_estimators": randint(50, 300),
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.3),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4)
}

xgb = XGBClassifier(objective="multi:softprob", num_class=3, eval_metric="mlogloss", use_label_encoder=False)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    xgb, param_distributions=param_dist, n_iter=10,
    scoring="f1_macro", n_jobs=1,  # use 1 job to avoid serialization problems
    cv=cv, verbose=1, random_state=42
)

search.fit(X_train_resampled, y_train_resampled)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


# Model

Use the best parameters for XGBoost for Classification

In [None]:
best_model = search.best_estimator_
y_pred = best_model.predict(X_val_pca)

print("Best Parameters:", search.best_params_)
print("Validation Classification Report:")
print(classification_report(y_val, y_pred, digits=4))
print(f"Accuracy:  {accuracy_score(y_val, y_pred):.4f}")
print(f"F1 Score:  {f1_score(y_val, y_pred, average='macro'):.4f}")
print(f"Precision: {precision_score(y_val, y_pred, average='macro'):.4f}")
print(f"Recall:    {recall_score(y_val, y_pred, average='macro'):.4f}")


Best Parameters: {'colsample_bytree': np.float64(0.6053059844639466), 'learning_rate': np.float64(0.29266052670545584), 'max_depth': 8, 'n_estimators': 291, 'subsample': np.float64(0.7541666010159664)}
Validation Classification Report:
              precision    recall  f1-score   support

           0     0.6776    0.5764    0.6229       288
           1     0.7316    0.6727    0.7009       385
           2     0.8786    0.9312    0.9042      1236

    accuracy                         0.8256      1909
   macro avg     0.7626    0.7268    0.7427      1909
weighted avg     0.8186    0.8256    0.8207      1909

Accuracy:  0.8256
F1 Score:  0.7427
Precision: 0.7626
Recall:    0.7268
