# Classification
## MI201

##**Group 4** :
- Diego FLEURY CORRÊA DE MORAES
- Hazael SOLEDADE DE ARAUJO JUMONJI
- Lucas DE OLIVEIRA MARTIM

### Project 3 : **Sentiment Analysis Using LLMs**

In [1]:
# Data handling
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Ploting
import matplotlib.pyplot as plt
import seaborn as sns

# Text preprocessing
import re
import unicodedata

# BERT
from transformers import AutoTokenizer, AutoModel

# Deep learning
from torch.utils.data import Dataset, DataLoader
import torch

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Classic ML models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Model selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score

# Importing Data

In [2]:
train_full = pd.read_csv('processed_train.csv')
test_full = pd.read_csv('processed_test.csv')

X_train, X_val, y_train, y_val = train_test_split(train_full['Text'], train_full['Sentiment'], test_size=0.2, random_state=42)
X_test, y_test = test_full['Text'], test_full['Sentiment']

In [3]:
train_full.dropna(inplace=True)
test_full.dropna(inplace=True)

In [4]:
X_train.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [5]:
# Preprocessing the text

def preprocess_text(text):
    """
    Text preprocessing, removing accents, links, HTML, extra spaces and user names.

    - Converts to lowercase.
    - Removes accents.
    - Removes HTML tags.
    - Remove links (http, https, www).
    - Removes extra spaces.
    - Removes user names.
    """

    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()

    # Strip all accents
    text = ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')

    # Removes links (http, https, www)
    text = re.sub(r"http\S+|www\S+", "", text)

    # Removes HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Removes usernames
    text = re.sub(r"@\w+", "", text)

    # Removes line breaks and excessive whitespaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [6]:
X_train = X_train.apply(preprocess_text)
X_val = X_val.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

In [7]:
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
y_train = y_train.map(sentiment_mapping)
y_val = y_val.map(sentiment_mapping)
y_test = y_test.map(sentiment_mapping)

In [8]:
X_train

Unnamed: 0,Text
0,"oh, he`s hilarious. i`m just commenting on the..."
1,"thanks for trying i was hoping bud trillin, bu..."
2,after show at our house rocked! saying goodbye...
3,up at 4:30am west coast time..gettin ready to ...
4,my computer is so slooowww this morning. i thi...
...,...
21979,feels like warm things
21980,my best friend is in vegas without me
21981,- fire and urban at rock challenge
21982,a+ for effort though


In [9]:
y_train

Unnamed: 0,Sentiment
0,2
1,2
2,1
3,2
4,1
...,...
21979,1
21980,1
21981,1
21982,2


In [10]:
# Custom Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        """
        Args:
            texts (list): List of text samples.
            labels (list): List of sentiment labels (e.g., 0, 1).
            tokenizer (transformers.BertTokenizer): Tokenizer for BERT.
            max_length (int): Maximum length for tokenized sequences.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize and encode the text
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

# Extract embeddings for all data
def extract_embeddings(model, dataloader, device):
    """
    Extracts embeddings for all data using a pre-trained BERT model.

    Args:
        model (transformers.BertModel): Pre-trained BERT model.
        dataloader (DataLoader): DataLoader for the dataset.
        device (torch.device): Device to run the model on (CPU or GPU).

    Returns:
        torch.Tensor: A matrix of size (number_of_samples, embedding_size).
    """
    model.eval()  # Set the model to evaluation mode
    embeddings = []

    with torch.no_grad():  # Disable gradient computation
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Forward pass through BERT
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = outputs.pooler_output  # CLS token representation

            # Append embeddings to the list
            embeddings.append(pooled_output.cpu())

    # Combine all embeddings into a single matrix
    return torch.cat(embeddings, dim=0)

# Custom Dataset
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        """
        Args:
            texts (list): List of text samples.
            labels (list): List of sentiment labels (e.g., 0, 1).
            tokenizer (transformers.BertTokenizer): Tokenizer for BERT.
            max_length (int): Maximum length for tokenized sequences.
        """
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        # Tokenize and encode the text
        embeddings = self.embeddings[idx]
        label = self.labels[idx]

        return {
            "input_ids": embeddings.squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

In [11]:
# Hyperparameters
# PRETRAINED_MODEL = "bert-base-uncased"
PRETRAINED_MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
MAX_LENGTH = 128
BATCH_SIZE = 64
NUM_CLASSES = 3
LEARNING_RATE = 2e-5
EPOCHS = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

# Initialize the datasets
train_dataset = TextDataset(X_train, y_train, tokenizer, MAX_LENGTH)
val_dataset = TextDataset(X_val, y_val, tokenizer, MAX_LENGTH)

# Initialize the dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

train_embeddings = None
val_embeddings = None

load_embeddings = True

if not load_embeddings:
    # Get the bert model
    bert = AutoModel.from_pretrained(PRETRAINED_MODEL).to(device)

    # Extract embeddings (train)
    train_embeddings = extract_embeddings(bert, train_loader, device)
    train_embeddings =train_embeddings.cpu()

    # Extract embeddings (val)
    val_embeddings = extract_embeddings(bert, val_loader, device)
    val_embeddings =val_embeddings.cpu()

    # To save runtime next time
    np.save('roberta_train_embeddings.npy', train_embeddings)
    np.save('roberta_val_embeddings.npy', val_embeddings)
else:
    train_embeddings = np.load("roberta_train_embeddings.npy")
    val_embeddings = np.load("roberta_val_embeddings.npy")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [12]:
print("Valores únicos em y_train:", np.unique(y_train))
print("Tipo dos valores em y_train:", type(y_train[0]))

Valores únicos em y_train: [0 1 2]
Tipo dos valores em y_train: <class 'numpy.int64'>


In [13]:
train_embeddings.shape, val_embeddings.shape

((21984, 768), (5496, 768))

In [14]:
# Initialize the embedding datasets
embedding_train_dataset = EmbeddingDataset(train_embeddings, y_train)
embedding_val_dataset = EmbeddingDataset(val_embeddings, y_val)

# Initialize the embedding dataloaders
embedding_train_loader = DataLoader(embedding_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
embedding_val_loader = DataLoader(embedding_val_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Classical ML (SVM, Random Forest, XGBoost)

In [15]:
# from sklearn.decomposition import PCA, KernelPCA

# # explained_variance_ratio = .99
# n_components = 50

# # Aplicar PCA nos embeddings de treino
# # pca = PCA(n_components=explained_variance_ratio)
# kpca = KernelPCA(n_components = n_components, kernel = "rbf")

# train_embeddings_pca = kpca.fit_transform(train_embeddings)

# # Transformar os embeddings de validação/teste com o mesmo PCA treinado
# val_embeddings_pca = kpca.transform(val_embeddings)
# # test_embeddings_pca = pca.transform(test_embeddings)

In [19]:
! pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [17]:
!pip install numba



In [20]:
import umap
import numba

umap_model = umap.UMAP(n_components=50, n_neighbors=15, min_dist=0.1, metric = "cosine")
train_embeddings_umap = umap_model.fit_transform(train_embeddings)
val_embeddings_umap = umap_model.transform(val_embeddings)



In [23]:
umap_model = umap.UMAP(n_components=50, n_neighbors=30, min_dist=0.1, metric = "cosine")
train_embeddings_umap = umap_model.fit_transform(train_embeddings)
val_embeddings_umap = umap_model.transform(val_embeddings)



In [21]:
train_embeddings.shape, train_embeddings_umap.shape

((21984, 768), (21984, 50))

### SVM

In [24]:
from sklearn.linear_model import SGDClassifier

svm_sgd = SGDClassifier(loss='hinge', alpha=0.01, max_iter=2000, tol=1e-3)
svm_sgd.fit(train_embeddings_umap, y_train)

print(f"Accuracy: {svm_sgd.score(val_embeddings_umap, y_val):.4f}")


Accuracy: 0.7482


In [25]:
from sklearn.svm import LinearSVC

svm = LinearSVC(C=1.0, max_iter=5000)
svm.fit(train_embeddings_umap, y_train)

print(f"Validation Accuracy: {svm.score(val_embeddings_umap, y_val):.4f}")


Validation Accuracy: 0.7516


In [26]:
from sklearn.linear_model import LogisticRegression

# Criar e treinar o modelo de Regressão Logística
logreg = LogisticRegression(
    max_iter=500,       # Número máximo de iterações para convergência
    C=1.0,              # Regularização (1.0 = padrão, pode ajustar)
    solver="lbfgs",     # Otimizador (funciona bem para embeddings grandes)
    multi_class="multinomial",  # Para classificação multiclasse
    n_jobs=-1           # Usa múltiplos núcleos da CPU
)

logreg.fit(train_embeddings_umap, y_train)

# Avaliação
accuracy = logreg.score(val_embeddings_umap, y_val)
print(f"Validation Accuracy: {accuracy:.4f}")

# Fazer previsões
logreg_predictions = logreg.predict(val_embeddings_umap)




Validation Accuracy: 0.7547


In [42]:
# svm_params = {
#     'C': [0.1, 1, 10],
#     'kernel': ['linear']
# }

# svm_grid = GridSearchCV(SVC(), svm_params, cv=3, n_jobs=-1, verbose=10)
# svm_grid.fit(train_embeddings_pca, y_train)

# print(f"Best parameters: {svm_grid.best_params_}")
# print(f"Best score: {svm_grid.best_score_}")

# svm_best_model = svm_grid.best_estimator_
# svm_best_model.fit(train_embeddings_pca, y_train)
# svm_predictions = svm_best_model.predict(val_embeddings_pca)

# generalization_error_estimate = 1 - svm_best_model.score(val_embeddings_pca, y_val)
# print(f"Generalization Error Estimate: {generalization_error_estimate}")

### Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Criando e treinando o modelo Random Forest
rf = RandomForestClassifier(
    n_estimators=200,   # Número de árvores
    max_depth=10,       # Profundidade máxima das árvores (ajustável)
    n_jobs=-1,          # Usa todos os núcleos do processador
    random_state=42
)

rf.fit(train_embeddings_umap, y_train)

# Avaliação
accuracy = rf.score(val_embeddings_umap, y_val)
print(f"Validation Accuracy: {accuracy:.4f}")

# Fazer previsões
rf_predictions = rf.predict(val_embeddings_umap)


Validation Accuracy: 0.7576


### XGBoost

In [28]:
class_counts = np.bincount(y_train)

class_weights = {i: sum(class_counts) / class_counts[i] for i in range(len(class_counts))}

# Criar o vetor de pesos para cada amostra do y_train
sample_weights = np.array([class_weights[y] for y in y_train])

In [29]:
class_weights

{0: 3.540666774037687, 1: 2.475396914761851, 2: 3.188859878154917}

In [30]:
from xgboost import XGBClassifier

# Criar e treinar o modelo XGBoost
xgb = XGBClassifier(
    n_estimators=100,  # Número de árvores na floresta
    max_depth=5,       # Profundidade máxima das árvores
    learning_rate=0.01, # Taxa de aprendizado
    use_label_encoder=False,  # Evita warnings desnecessários
    eval_metric="mlogloss",  # Métrica para classificação multiclasse
    sample_weight = sample_weights # Balanceamento das classes
)

xgb.fit(train_embeddings_umap, y_train)

# Avaliação no conjunto de validação
accuracy = xgb.score(val_embeddings_umap, y_val)
print(f"Validation Accuracy: {accuracy:.4f}")

Parameters: { "sample_weight", "use_label_encoder" } are not used.



Validation Accuracy: 0.7578


# Neural Network

# LLM

### Fine Tuning with LoRA