In [None]:
# Importações
import os

import pandas as pd
from datasets import load_dataset, DatasetDict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)


In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project layout on Google Drive: trained models are written under "Modelos",
# the class files and the CSVs derived from them live under "Classes"
project_path = "/content/drive/MyDrive/TrabalhoIA2"
model_save_path = project_path + "/Modelos"
data_path = project_path + "/Classes"

# Only the model output folder is created here (no-op when it already exists)
os.makedirs(model_save_path, exist_ok=True)


In [None]:
# Load the three class files (one sentence per line) that are later merged
# into a single dataset.
#
# The lines are read verbatim instead of going through pd.read_csv: the CSV
# parser interprets commas and double quotes inside a sentence as field
# separators / quoting, so ordinary punctuation either makes the parser reject
# the line or drops part of the sentence.
def _read_sentences(path):
    """Return a DataFrame with a single "text" column, one row per non-blank line.

    Args:
        path: Path of a UTF-8 text file holding one sentence per line.

    Returns:
        pandas.DataFrame with the column "text"; surrounding whitespace is
        stripped and blank lines are skipped.
    """
    # "utf-8-sig" reads plain UTF-8 and also discards a leading BOM if present
    with open(path, encoding="utf-8-sig") as handle:
        sentences = [line.strip() for line in handle if line.strip()]
    return pd.DataFrame({"text": sentences})


positive = _read_sentences(f"{data_path}/positive.txt")
negative = _read_sentences(f"{data_path}/negative.txt")
neutral = _read_sentences(f"{data_path}/neutral.txt")

In [None]:
# Attach the integer class id to every row of each class frame
negative = negative.assign(label=0)  # Negative
neutral = neutral.assign(label=1)    # Neutral
positive = positive.assign(label=2)  # Positive

In [None]:
# Stack the three class frames (positive, negative, neutral - in that order)
# into one frame with a fresh 0..n-1 index, then persist it next to the inputs
class_frames = [positive, negative, neutral]
df = pd.concat(class_frames, ignore_index=True)

combined_csv = f"{data_path}/dataset_completo.csv"
df.to_csv(combined_csv, index=False)

In [None]:
# Show how many rows each class contributes to the combined dataset
label_counts = df["label"].value_counts()
print("Distribuição de classes:", label_counts, sep="\n")

Distribuição de classes:
label
2    100
0    100
1    100
Name: count, dtype: int64


In [None]:
# Stratified 70 / 15 / 15 split: first hold out 30% of the rows, then cut that
# hold-out in half to obtain the validation and test sets
SPLIT_SEED = 42
train, temp = train_test_split(
    df, test_size=0.3, random_state=SPLIT_SEED, stratify=df["label"]
)
val, test = train_test_split(
    temp, test_size=0.5, random_state=SPLIT_SEED, stratify=temp["label"]
)


In [None]:
# Report the number of rows in each split
print(
    f"Tamanho do treino: {len(train)}",
    f"Tamanho da validação: {len(val)}",
    f"Tamanho do teste: {len(test)}",
    sep="\n",
)


Tamanho do treino: 210
Tamanho da validação: 45
Tamanho do teste: 45


In [None]:
# Write each split to its own CSV file (row index omitted)
for split_name, split_frame in (("train", train), ("val", val), ("test", test)):
    split_frame.to_csv(f"{data_path}/{split_name}.csv", index=False)

In [None]:
# Reload the saved CSVs as Hugging Face datasets. Each single-file load exposes
# its rows under the "train" key, so that key is unwrapped and the rows are
# re-registered under the split name used by the rest of the notebook.
split_files = {"train": "train.csv", "validation": "val.csv", "test": "test.csv"}
dataset = DatasetDict({
    split_name: load_dataset("csv", data_files=f"{data_path}/{file_name}")["train"]
    for split_name, file_name in split_files.items()
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Fine-tunes one pretrained checkpoint on the 3-class dataset and evaluates it
def train_model(model_name, dataset, model_save_path, seed=42):
    """Fine-tune `model_name` for 3-class sentiment classification and report test metrics.

    Args:
        model_name: Hugging Face checkpoint identifier handed to `from_pretrained`.
        dataset: DatasetDict with "train", "validation" and "test" splits, each
            holding a "text" column and an integer "label" column
            (0 = Negative, 1 = Neutral, 2 = Positive).
        model_save_path: Directory under which checkpoints, logs and the final
            model are written (one sub-folder per model).
        seed: Random seed applied before the model is built and forwarded to
            the training arguments, to make runs repeatable as far as the
            backend allows.

    Returns:
        None. The classification report is printed and the model is saved.
    """
    print(f"\nTreinando e avaliando o modelo: {model_name}")

    # Seed before instantiating the model: a classifier head that has to be
    # re-initialised gets its random weights at load time.
    set_seed(seed)

    # "org/name" identifiers contain a slash; flatten it so each model maps to
    # exactly one folder name.
    run_name = model_name.replace('/', '_')
    output_dir = f"{model_save_path}/{run_name}"

    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,  # Negative / Neutral / Positive
        # Checkpoints whose classifier head has a different number of outputs
        # (e.g. a 2-class sentiment head) cannot be loaded into a 3-class model
        # unless the mismatched head is allowed to be re-initialised.
        ignore_mismatched_sizes=True,
    )

    # Tokenize the data
    def tokenize_function(example):
        """Tokenize a batch of texts to fixed-length (128 token) inputs."""
        return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    # Rename "label" to "labels", the column name read back below for evaluation
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")

    # Training configuration
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version before upgrading.
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        logging_strategy="steps",
        logging_steps=50,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir=f"{model_save_path}/logs/{run_name}",
        report_to="none",
        seed=seed,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()

    # Evaluate on the held-out test split
    predictions = trainer.predict(tokenized_datasets["test"])
    preds = predictions.predictions.argmax(-1)

    print("\nRelatório de classificação:")
    print(classification_report(
        # label_ids are the true labels returned alongside the predictions
        predictions.label_ids,
        preds,
        # Pin the label set so target_names stays aligned even if a class is
        # absent from the test split or never predicted.
        labels=[0, 1, 2],
        target_names=["Negative", "Neutral", "Positive"]
    ))

    # Save the final model into the same folder used for the checkpoints
    trainer.save_model(output_dir)
    print(f"Modelo {model_name} salvo com sucesso em {model_save_path}!")

In [None]:
# Model 1: DistilBERT checkpoint (per its name, already fine-tuned on SST-2)
distilbert_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
train_model(distilbert_checkpoint, dataset, model_save_path)



Treinando e avaliando o modelo: distilbert-base-uncased-finetuned-sst-2-english


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.774762
2,No log,0.600595
3,No log,0.550512



Relatório de classificação:
              precision    recall  f1-score   support

    Negative       0.87      0.87      0.87        15
     Neutral       0.91      0.67      0.77        15
    Positive       0.79      1.00      0.88        15

    accuracy                           0.84        45
   macro avg       0.86      0.84      0.84        45
weighted avg       0.86      0.84      0.84        45

Modelo distilbert-base-uncased-finetuned-sst-2-english salvo com sucesso em /content/drive/MyDrive/TrabalhoIA2/Modelos!


In [None]:
# Model 2: Twitter RoBERTa sentiment checkpoint
roberta_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
train_model(roberta_checkpoint, dataset, model_save_path)


Treinando e avaliando o modelo: cardiffnlp/twitter-roberta-base-sentiment


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.083446
2,No log,0.01583
3,No log,0.006991



Relatório de classificação:
              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        15
     Neutral       1.00      0.93      0.97        15
    Positive       0.94      1.00      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

Modelo cardiffnlp/twitter-roberta-base-sentiment salvo com sucesso em /content/drive/MyDrive/TrabalhoIA2/Modelos!
