In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np

Data Preparation

In [2]:
# Load the train and test review CSV files.
# NOTE(review): on_bad_lines='skip' silently drops every row pandas cannot
# parse — confirm the row counts printed below match the expected dataset size.
df_train = pd.read_csv("imdb_reviews_train.csv", quoting=0, on_bad_lines='skip')
df_test = pd.read_csv("imdb_reviews_test.csv", quoting=0, on_bad_lines='skip')

print("Número de linhas no conjunto de treinamento:", len(df_train))
print("Número de linhas no conjunto de teste:", len(df_test))

# Class balance per split, printed as train pos/neg then test pos/neg.
for split_name, frame in (("train", df_train), ("test", df_test)):
    for tag, wording in (("pos", "positivas"), ("neg", "negativas")):
        n_reviews = (frame['label'] == tag).sum()
        print(f" {split_name} set tem {n_reviews} reviews {wording}")

Número de linhas no conjunto de treinamento: 21754
Número de linhas no conjunto de teste: 21996
 train set tem 10776 reviews positivas
 train set tem 10978 reviews negativas
 test set tem 10946 reviews positivas
 test set tem 11050 reviews negativas


In [3]:
# Remove duplicated reviews based on the 'text' column, keeping the first occurrence.
df_train = df_train.drop_duplicates(subset=['text'], keep='first')
df_test = df_test.drop_duplicates(subset=['text'], keep='first')

# Report the number of rows as a plain integer. DataFrame.count() returns a
# per-column Series of non-null counts, which is not what the message announces.
print("Número de linhas no conjunto de treinamento após remoção de duplicatas:", len(df_train))
print("Número de linhas no conjunto de teste após remoção de duplicatas:", len(df_test))

# Class balance after de-duplication.
print(f" train set tem {(df_train['label'] == 'pos').sum()} reviews positivas")
print(f" train set tem {(df_train['label'] == 'neg').sum()} reviews negativas")

print(f" test set tem {(df_test['label'] == 'pos').sum()} reviews positivas")
print(f" test set tem {(df_test['label'] == 'neg').sum()} reviews negativas")


Número de linhas no conjunto de treinamento após remoção de duplicatas: text     21662
label    21662
dtype: int64
Número de linhas no conjunto de teste após remoção de duplicatas: text     21814
label    21814
dtype: int64
 train set tem 10748 reviews positivas
 train set tem 10914 reviews negativas
 test set tem 10888 reviews positivas
 test set tem 10926 reviews negativas


In [4]:
# Lowercase the review text with the vectorised string accessor instead of a
# per-row Python lambda.
df_train['text'] = df_train['text'].str.lower()
df_test['text'] = df_test['text'].str.lower()

# Example: preview the first ten test reviews (only the last expression of a
# cell is displayed, so the train preview that used to sit mid-cell was a no-op).
df_test['text'][:10]

Unnamed: 0,text
0,logan lerman & dean collins iii of jack & bobb...
1,i have seen this film on a sunday evening and ...
2,two great stars and a legendary director creat...
3,i'm originally from brazil... the sad thing ab...
4,"""witchery"" is a decent little euro trash horro..."
5,the best so bad it's good movie ever made. rud...
6,"okay, i'll say it. this movie made me laugh so..."
7,this movie almost has everything. the action i...
8,this movie is ageless and would probably appea...
9,"great book, great movie, great soundtrack. fra..."


In [5]:
# Load the pretrained BERT tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text and produce input IDs and attention masks
def tokenize_data(texts, tokenizer, max_length=128):
    """Encode a collection of texts with ``tokenizer``.

    Args:
        texts: The texts to encode. May be a pandas Series / NumPy array
            (anything exposing ``.tolist()``), any other iterable of strings,
            or a single string (treated as a one-element batch).
        tokenizer: Callable tokenizer; it is invoked with the list of texts
            plus ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors`` keyword arguments.
        max_length: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated (``truncation=True``).

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the tokenizer's
        output. ``return_tensors='pt'`` requests PyTorch tensors.
    """
    # A bare string is iterable; wrap it so it is not split into characters.
    if isinstance(texts, str):
        text_list = [texts]
    elif hasattr(texts, "tolist"):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    encoding = tokenizer(
        text_list,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    return encoding.input_ids, encoding.attention_mask

# Encode the training and test reviews (default max_length of tokenize_data)
train_input_ids, train_attention_masks = tokenize_data(
    texts=df_train['text'],
    tokenizer=tokenizer,
)
test_input_ids, test_attention_masks = tokenize_data(
    texts=df_test['text'],
    tokenizer=tokenizer,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Convert the encoded tensors to NumPy arrays
train_input_ids_np, train_attention_masks_np = (
    train_input_ids.numpy(),
    train_attention_masks.numpy(),
)
test_input_ids_np, test_attention_masks_np = (
    test_input_ids.numpy(),
    test_attention_masks.numpy(),
)

# Labels: 'pos' is encoded as 1, every other value as 0
train_labels_np = (df_train['label'] == 'pos').astype('int64').to_numpy()
test_labels_np = (df_test['label'] == 'pos').astype('int64').to_numpy()

# Split the training set into train + validation (10% held out, fixed seed)
(
    train_input_ids_np, val_input_ids_np,
    train_attention_masks_np, val_attention_masks_np,
    train_labels_np, val_labels_np,
) = train_test_split(
    train_input_ids_np,
    train_attention_masks_np,
    train_labels_np,
    test_size=0.1,
    random_state=42,
)

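Usar BERT pré-treinado para classificação binária (sem fine-tuning)

Observação: o checkpoint `bert-base-uncased` não inclui uma cabeça de classificação treinada (os pesos `classifier.weight` e `classifier.bias` são inicializados aleatoriamente); portanto, sem fine-tuning, espera-se uma acurácia próxima do acaso.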

In [9]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Build the text-classification pipeline, on the first GPU when one is
# available (device=-1 selects the CPU). Inference over the whole test set on
# CPU is very slow.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="bert-base-uncased",
    batch_size=32,
    device=0 if torch.cuda.is_available() else -1,
)

# Get the predictions. Reviews longer than BERT's 512-token limit must be
# truncated, otherwise the forward pass fails on long inputs.
results = sentiment_pipeline(df_test["text"].tolist(), truncation=True, max_length=512)

# Convert the predicted label names to 0/1. The 'bert-base-uncased' config has
# no sentiment label names, so the pipeline reports the generic 'LABEL_0' /
# 'LABEL_1'; comparing only against 'POSITIVE' mapped every prediction to 0.
# 'POSITIVE' is still accepted so a sentiment-tuned checkpoint keeps working.
positive_labels = {"POSITIVE", "LABEL_1"}
predictions = [1 if result['label'] in positive_labels else 0 for result in results]

# Accuracy. The classification head of this checkpoint is randomly
# initialised (no fine-tuning), so a value near chance level is expected.
accuracy = accuracy_score(test_labels_np, predictions)
print(f"Acurácia no conjunto de teste: {accuracy:.4f}")

# Confusion matrix; labels=[0, 1] pins the row/column order to neg, pos.
conf_matrix = confusion_matrix(test_labels_np, predictions, labels=[0, 1])

fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["neg", "pos"],
    yticklabels=["neg", "pos"],
    ax=ax,
)
ax.set_ylabel('Real')
ax.set_xlabel('Previsto')
ax.set_title('Matriz de Confusão')
plt.show()

# First 5 examples with their predictions
for i in range(5):
    print(f"Texto: {df_test['text'].iloc[i]}")
    print(f"Rótulo Real: {test_labels_np[i]}")
    print(f"Rótulo Previsto: {predictions[i]}")
    print()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


KeyboardInterrupt: 