# Dependencies

In [1]:
!pip install -q transformers datasets hmmlearn nltk

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/166.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [3]:
import os
import warnings
from dataclasses import dataclass

import nltk
import numpy as np
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from hmmlearn import hmm
from nltk.corpus import stopwords
from datasets import load_dataset
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Implementation with HMM


In [8]:
# Hyperparameters shared by the HMM cells of this notebook.
HP = dict(
    MAX_WORDS=10000,   # vocabulary size passed to imdb.load_data(num_words=...)
    DIM=300,           # embedding width; selects glove.6B.<DIM>d.txt
    HMM_STATES=3,      # n_components given to each GaussianHMM
    LIMIT_TRAIN=4000,  # number of training reviews taken from the start of the split
    LIMIT_TEST=500,    # number of test reviews taken from the start of the split
)

# Download the NLTK stop-word corpus only when it is not available locally.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

def load_glove(path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated coefficients.

    Args:
        path: Path to the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array. If a token
        occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a coefficient cannot be parsed as a float.
    """
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Split once per line (the file has hundreds of thousands of rows).
            parts = line.split()
            if not parts:
                # Blank or whitespace-only line (e.g. a trailing newline): nothing to parse.
                continue
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings

# Parse the GloVe file whose dimensionality matches HP['DIM'] into a
# word -> vector lookup. The path is relative to the working directory.
embeddings_index = load_glove(f'glove.6B.{HP["DIM"]}d.txt')

In [9]:
# IMDB reviews as sequences of integer word ids, loaded with num_words=HP['MAX_WORDS'].
(x_train_raw, y_train), (x_test_raw, y_test) = tf.keras.datasets.imdb.load_data(
    num_words=HP['MAX_WORDS']
)
word_index = tf.keras.datasets.imdb.get_word_index()

# Words excluded from the HMM input: NLTK's English stop words plus a few negations.
# NOTE(review): negations usually carry sentiment; confirm that dropping them is intended.
negation_words = {'no', 'not', 'nor', 'neither', 'never', "didn't", "isn't"}
stop_words = set(stopwords.words('english')).union(negation_words)

# Row k holds the GloVe vector of the word encoded as id k; rows without a vector stay zero.
embedding_matrix = np.zeros((HP['MAX_WORDS'] + 4, HP['DIM']))
valid_indices = set()

for token, rank in word_index.items():
    if rank >= HP['MAX_WORDS'] or token in stop_words:
        continue
    if token not in embeddings_index:
        continue
    # Ids in the loaded sequences are offset by 3 from word_index ranks
    # (presumably load_data's default index_from=3 — verify against the Keras docs).
    row = rank + 3
    embedding_matrix[row] = embeddings_index[token]
    valid_indices.add(row)

def vectorize_sequences(sequences, max_len=100):
    """Turn word-id sequences into per-review embedding matrices.

    For every sequence, ids not present in the module-level ``valid_indices``
    are discarded, the remainder is truncated to ``max_len`` ids, and the
    corresponding rows of the module-level ``embedding_matrix`` are gathered.

    Sequences left with no valid id are dropped entirely, so the returned
    lists can be shorter than ``sequences``.

    Args:
        sequences: Iterable of sequences of integer word ids.
        max_len: Maximum number of valid ids kept per sequence.

    Returns:
        Tuple ``(processed_seqs, lengths)``: a list of 2-D arrays (one row per
        kept id) and a parallel list with the number of rows of each array.
    """
    embedded = []
    kept_lengths = []

    for token_ids in sequences:
        kept = [tok for tok in token_ids if tok in valid_indices]
        kept = kept[:max_len]

        # Nothing usable in this review: skip it altogether.
        if not kept:
            continue

        embedded.append(embedding_matrix[kept])
        kept_lengths.append(len(kept))

    return embedded, kept_lengths

# Work on the first LIMIT_TRAIN / LIMIT_TEST reviews of each split.
train_raw_sub = x_train_raw[:HP['LIMIT_TRAIN']]
test_raw_sub = x_test_raw[:HP['LIMIT_TEST']]

X_train_seqs, train_lens = vectorize_sequences(train_raw_sub)
X_test_seqs, test_lens = vectorize_sequences(test_raw_sub)

# vectorize_sequences drops every review that has no valid token, wherever it
# sits in the list. Trimming the labels with [:len(...)] would therefore shift
# all labels after the first dropped review; select labels with the same
# keep/drop criterion instead.
train_kept = np.array(
    [any(idx in valid_indices for idx in seq) for seq in train_raw_sub], dtype=bool
)
test_kept = np.array(
    [any(idx in valid_indices for idx in seq) for seq in test_raw_sub], dtype=bool
)

y_train_sub = y_train[:HP['LIMIT_TRAIN']][train_kept]
y_test_sub = y_test[:HP['LIMIT_TEST']][test_kept]

# Sequences, lengths and labels must stay parallel.
assert len(y_train_sub) == len(X_train_seqs) == len(train_lens)
assert len(y_test_sub) == len(X_test_seqs) == len(test_lens)

In [10]:
class HMMSentiment:
    """Binary sentiment classifier made of two class-conditional Gaussian HMMs.

    One ``hmm.GaussianHMM`` is trained on the sequences of each label
    (0 and 1). A sequence is then rated by the difference between the scores
    the two models assign to it.

    Attributes:
        models: dict ``{0: GaussianHMM, 1: GaussianHMM}`` keyed by label.
    """

    def __init__(self, n_components=3, n_iter=20, random_state=None):
        """Create the two untrained HMMs.

        Args:
            n_components: Number of hidden states of each HMM.
            n_iter: Maximum number of training iterations of each HMM.
            random_state: Seed (or RandomState) forwarded to both HMMs for
                reproducible initialisation; ``None`` keeps the library default.
        """
        common_params = {
            'n_components': n_components,
            'covariance_type': "diag",
            'n_iter': n_iter,
            'verbose': False,
            'random_state': random_state,
        }

        self.models = {
            0: hmm.GaussianHMM(**common_params),
            1: hmm.GaussianHMM(**common_params)
        }

    def fit(self, X_seqs, lengths, y):
        """Train each HMM on the sequences carrying its label.

        Args:
            X_seqs: List of 2-D arrays, one per sequence (rows are observations).
            lengths: Number of rows of each array in ``X_seqs``.
            y: Label (0 or 1) of each sequence, parallel to ``X_seqs``.

        Raises:
            ValueError: If ``X_seqs``, ``lengths`` and ``y`` differ in length.
        """
        if not (len(X_seqs) == len(lengths) == len(y)):
            raise ValueError(
                "X_seqs, lengths and y must be parallel: got "
                f"{len(X_seqs)}, {len(lengths)} and {len(y)} items"
            )

        for label, model in self.models.items():
            indices = [i for i, t in enumerate(y) if t == label]

            if not indices:
                # The model for this label stays untrained; say so instead of
                # letting the failure surface later as silent zero scores.
                warnings.warn(
                    f"No training sequence with label {label}; its HMM was not fitted.",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue

            # hmmlearn expects all sequences stacked row-wise plus their lengths.
            X_concat = np.concatenate([X_seqs[i] for i in indices])
            lens_concat = [lengths[i] for i in indices]
            model.fit(X_concat, lens_concat)

    def predict_score_diff(self, X_seqs):
        """Return ``score_pos - score_neg`` for every sequence.

        Positive values mean the label-1 model rates the sequence higher than
        the label-0 model. Sequences whose scoring raises ``ValueError`` get a
        difference of 0; a single ``RuntimeWarning`` reports how many did.

        Args:
            X_seqs: Iterable of 2-D arrays, one per sequence.

        Returns:
            1-D float array with one score difference per input sequence.
        """
        scores = []
        n_failed = 0
        first_error = None

        for seq in X_seqs:
            try:
                s_pos = self.models[1].score(seq)
                s_neg = self.models[0].score(seq)
                scores.append(s_pos - s_neg)
            except ValueError as err:
                # Keep the neutral fallback, but remember that it happened.
                scores.append(0)
                n_failed += 1
                if first_error is None:
                    first_error = err

        if n_failed:
            warnings.warn(
                f"{n_failed} of {len(scores)} sequences could not be scored and "
                f"were assigned a difference of 0 (first error: {first_error})",
                RuntimeWarning,
                stacklevel=2,
            )

        return np.array(scores, dtype=float)

# The HMMs are created without an explicit random_state, so their random
# initialisation draws from NumPy's global RNG. Seeding it here makes
# Restart & Run All reproduce the same fitted models.
HMM_SEED = 42
np.random.seed(HMM_SEED)

model = HMMSentiment(n_components=HP['HMM_STATES'])
model.fit(X_train_seqs, train_lens, y_train_sub)

In [None]:
# Score differences (positive-class model minus negative-class model) for both splits.
train_scores = model.predict_score_diff(X_train_seqs)
scores = model.predict_score_diff(X_test_seqs)

# Tune the decision threshold on the TRAINING scores only. Picking the
# threshold that maximises accuracy on the test labels and then reporting that
# same accuracy leaks test information and overstates performance.
thresholds = np.linspace(np.percentile(train_scores, 5), np.percentile(train_scores, 95), 200)
accuracies = [accuracy_score(y_train_sub, (train_scores > t).astype(int)) for t in thresholds]
best_idx = np.argmax(accuracies)

best_thresh = thresholds[best_idx]

# Apply the frozen threshold to the held-out test reviews.
final_preds = (scores > best_thresh).astype(int)
test_acc = accuracy_score(y_test_sub, final_preds)

print("="*40)
print(f"[Minimalist HMM] Best Threshold: {best_thresh:.2f} | Acc: {test_acc:.4f}")
print("-" * 40)
print(classification_report(y_test_sub, final_preds, target_names=['Neg', 'Pos']))

# Implementation with LLM


In [None]:
# Batch size used for both tf.data pipelines built below.
BATCH_SIZE = 64
# Token length every review is padded/truncated to in tokenize_function.
MAX_LEN = 128

# Tokenizer of the same 'bert-base-uncased' checkpoint that the classifier is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Hugging Face 'imdb' dataset; its "train" and "test" splits are used below.
dataset = load_dataset('imdb')

def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch with the module-level tokenizer.

    Every text is padded or truncated to exactly ``MAX_LEN`` tokens.

    Args:
        examples: Mapping with a ``'text'`` entry (a batch of raw reviews when
            used with ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
    )
    return encoded

print("Tokenizando dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)


def _as_tf_dataset(split_name, shuffle):
    """Convert one tokenized split into a batched tf.data.Dataset of (features, label)."""
    return tokenized_datasets[split_name].to_tf_dataset(
        columns=["input_ids", "attention_mask", "token_type_ids"],
        label_cols=["label"],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
    )


# Training batches are shuffled; test batches keep the dataset order.
tf_train = _as_tf_dataset("train", shuffle=True)
tf_test = _as_tf_dataset("test", shuffle=False)

In [None]:
# Load 'bert-base-uncased' with a 2-label sequence-classification head.
# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# HMMSentiment instance — re-running the HMM evaluation cell after this one
# would hit the BERT model instead.
# NOTE(review): force_download=True re-fetches the weights on every run and
# use_safetensors=False opts out of the safetensors format; presumably a
# workaround for a loading/cache problem — confirm it is still needed.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    force_download=True,
    use_safetensors=False
)

# Adam with a small learning rate of 2e-5 for fine-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# Labels are integer class ids; from_logits=True means the loss expects
# unnormalised model outputs.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Reported under the key 'accuracy' (and 'val_accuracy'), which plot_history reads.
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.summary()

In [None]:
# Fine-tune for 4 epochs and keep the per-epoch metrics for plotting.
# NOTE(review): the test split doubles as validation data here, so the
# 'val_*' curves are test-set metrics rather than a separate hold-out.
history = model.fit(
    tf_train,
    validation_data=tf_test,
    epochs=4
)

In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    Args:
        history: Object whose ``history`` attribute maps ``'accuracy'``,
            ``'val_accuracy'``, ``'loss'`` and ``'val_loss'`` to per-epoch
            values (e.g. the return value of ``model.fit``).
    """
    logs = history.history
    train_acc = logs['accuracy']
    valid_acc = logs['val_accuracy']
    train_loss = logs['loss']
    valid_loss = logs['val_loss']
    epochs_range = range(len(train_acc))

    # One entry per panel: (train values, train label, validation values,
    # validation label, legend position, title).
    panels = [
        (train_acc, 'Treino Acc', valid_acc, 'Validação Acc',
         'lower right', 'Acurácia de Treino e Validação'),
        (train_loss, 'Treino Loss', valid_loss, 'Validação Loss',
         'upper right', 'Perda de Treino e Validação'),
    ]

    plt.figure(figsize=(12, 4))

    for position, panel in enumerate(panels, start=1):
        train_vals, train_label, valid_vals, valid_label, legend_loc, title = panel
        plt.subplot(1, 2, position)
        plt.plot(epochs_range, train_vals, label=train_label)
        plt.plot(epochs_range, valid_vals, label=valid_label)
        plt.legend(loc=legend_loc)
        plt.title(title)

    plt.show()

# Draw the learning curves recorded in `history` by the BERT fine-tuning run.
plot_history(history)