<a href="https://colab.research.google.com/github/Cleander/analise-de-sentimentos/blob/main/analise_de_sentimentos_bertimbal_pi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Análise de Sentimentos - PI 5

##Treinando o Modelo

In [None]:
pip install transformers datasets torch pandas

In [2]:
import pandas as pd

df = pd.read_csv('olist.csv')

df.head()

Unnamed: 0,original_index,review_text,review_text_processed,review_text_tokenized,polarity,rating,kfold_polarity,kfold_rating
0,97262,Perfeito....chegou antes do prazo.....,perfeito....chegou antes do prazo.....,"['perfeito', 'chegou', 'antes', 'do', 'prazo']",1.0,5,1,1
1,72931,Foi uma ótima compra! Chegou antes mesmo do pr...,foi uma otima compra! chegou antes mesmo do pr...,"['foi', 'uma', 'otima', 'compra', 'chegou', 'a...",1.0,5,1,1
2,19659,Recebi muito rapido e um otimo custo beneficio,recebi muito rapido e um otimo custo beneficio,"['recebi', 'muito', 'rapido', 'um', 'otimo', '...",1.0,5,1,1
3,43054,Recomendo,recomendo,['recomendo'],1.0,5,1,1
4,59202,Só veio uma capa comprei 3 aí paguei. Mais de ...,so veio uma capa comprei 3 ai paguei. mais de ...,"['so', 'veio', 'uma', 'capa', 'comprei', 'ai',...",0.0,1,1,1


In [3]:
df.isnull().sum()

Unnamed: 0,0
original_index,0
review_text,0
review_text_processed,1
review_text_tokenized,0
polarity,3665
rating,0
kfold_polarity,0
kfold_rating,0


In [4]:
print(f"Linhas antes da limpeza: {df.shape[0]}")
df = df.dropna(subset=['review_text_tokenized', 'polarity'])
print(f"Linhas após a limpeza: {df.shape[0]}")

Linhas antes da limpeza: 41744
Linhas após a limpeza: 38079


In [5]:
train_data = df[(df['kfold_polarity'] >= 2) & (df['kfold_polarity'] <= 8)]
val_data = df[df['kfold_polarity'] == 9]
test_data = df[df['kfold_polarity'] == 1]

print(f"Treinamento: {len(train_data)}")
print(f"Validação: {len(val_data)}")
print(f"Teste: {len(test_data)}")

Treinamento: 26656
Validação: 3808
Teste: 3808


In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [7]:
def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128)

train_encodings = tokenize_function(train_data['review_text_tokenized'].tolist())
val_encodings = tokenize_function(val_data['review_text_tokenized'].tolist())
test_encodings = tokenize_function(test_data['review_text_tokenized'].tolist())

In [8]:
import torch
from datasets import Dataset

train_labels = torch.tensor(train_data['polarity'].values, dtype=torch.long)
val_labels = torch.tensor(val_data['polarity'].values, dtype=torch.long)
test_labels = torch.tensor(test_data['polarity'].values, dtype=torch.long)

train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels
})


In [None]:
from transformers import BertForSequenceClassification

# Classificação binária (polaridade)
model = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=2)

In [10]:
from transformers import TrainingArguments
import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"f1": f1_score(p.label_ids, preds, average="macro")}

In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=4.840981590642851e-05, #Anterior: 2e-5
    per_device_train_batch_size=8, #Anterior: 16
    per_device_eval_batch_size=64, #Anterior: 64
    num_train_epochs=2, #Anterior: 2
    weight_decay=0.2755551526558927, #Anterior: 0.01
    gradient_accumulation_steps=1, #Anterior: 2
    warmup_ratio=0.04642400936685703, #Parâmetro não utilizado antes
    metric_for_best_model="f1", #Parâmetro não utilizado antes
    fp16=True,
)

In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcleandersilva[0m ([33mcleandersilva-portal-puc-campinas[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1
1,0.2313,0.199261,0.915399
2,0.1626,0.213406,0.93181


TrainOutput(global_step=6664, training_loss=0.2114469962102883, metrics={'train_runtime': 852.6658, 'train_samples_per_second': 62.524, 'train_steps_per_second': 7.815, 'total_flos': 3506744145838080.0, 'train_loss': 0.2114469962102883, 'epoch': 2.0})

##Avaliando e Salvando o Modelo

In [14]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.20289386808872223,
 'eval_f1': 0.9335099590097733,
 'eval_runtime': 6.0944,
 'eval_samples_per_second': 624.837,
 'eval_steps_per_second': 9.845,
 'epoch': 2.0}

In [15]:
from sklearn.metrics import accuracy_score, classification_report
import torch

def compute_metrics(dataset):
    predictions = trainer.predict(dataset)
    preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)
    labels = dataset["labels"]
    accuracy = accuracy_score(labels, preds)
    report = classification_report(labels, preds, target_names=["Negativo", "Positivo"])

    print(f"Acurácia: {accuracy:.4f}")
    print("Relatório de Classificação:\n", report)

compute_metrics(test_dataset)

Acurácia: 0.9435
Relatório de Classificação:
               precision    recall  f1-score   support

    Negativo       0.89      0.93      0.91      1140
    Positivo       0.97      0.95      0.96      2668

    accuracy                           0.94      3808
   macro avg       0.93      0.94      0.93      3808
weighted avg       0.94      0.94      0.94      3808



In [16]:
from google.colab import drive
drive.mount('/content/drive')

model.save_pretrained("/content/drive/MyDrive/sentiment_model_optuna_tuned_v1.1")
tokenizer.save_pretrained("/content/drive/MyDrive/sentiment_model_optuna_tuned_v1.1")

Mounted at /content/drive


('/content/drive/MyDrive/sentiment_model_optuna_tuned_v1.1/tokenizer_config.json',
 '/content/drive/MyDrive/sentiment_model_optuna_tuned_v1.1/special_tokens_map.json',
 '/content/drive/MyDrive/sentiment_model_optuna_tuned_v1.1/vocab.txt',
 '/content/drive/MyDrive/sentiment_model_optuna_tuned_v1.1/added_tokens.json')

In [17]:
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/sentiment_model_optuna_tuned_v1.1")
tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/sentiment_model_optuna_tuned_v1.1")

def predict_sentiment(texts):
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    outputs = model(**encodings)
    predictions = outputs.logits.argmax(dim=-1)
    return predictions

textos = ["Este úlitmo lançamento não foi legal", "Não podia ter comprado um produto melhor."]
predictions = predict_sentiment(textos)
print(predictions)

tensor([0, 1])


###Avaliando modelo antigo

In [None]:
from transformers import BertForSequenceClassification

model_path = "/content/drive/MyDrive/sentiment_model"

old_model = BertForSequenceClassification.from_pretrained(model_path)

In [None]:
from transformers import Trainer

old_trainer = Trainer(
    model=old_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
import torch

def compute_old_model_metrics(dataset):
    predictions = old_trainer.predict(dataset)
    preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)
    labels = dataset["labels"]
    accuracy = accuracy_score(labels, preds)
    report = classification_report(labels, preds, target_names=["Negativo", "Positivo"])

    print(f"Acurácia: {accuracy:.4f}")
    print("Relatório de Classificação:\n", report)

compute_old_model_metrics(test_dataset)

Acurácia: 0.9472
Relatório de Classificação:
               precision    recall  f1-score   support

    Negativo       0.90      0.93      0.91      1140
    Positivo       0.97      0.95      0.96      2668

    accuracy                           0.95      3808
   macro avg       0.93      0.94      0.94      3808
weighted avg       0.95      0.95      0.95      3808



##Fazendo Fine-tuning dos Hiperparâmetros

In [None]:
pip install optuna

In [None]:
from transformers import Trainer, TrainingArguments
import optuna
import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"f1": f1_score(p.label_ids, preds, average="macro")}

def model_init():
    return BertForSequenceClassification.from_pretrained(
        'neuralmind/bert-base-portuguese-cased',
        num_labels=2
    )

def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 5e-6, 5e-5, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
        "gradient_accumulation_steps": trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 3, 4]),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.3),
    }

optuna_args = TrainingArguments(
    output_dir="./optuna_test",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=64,
    fp16=True,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

optuna_trainer = Trainer(
    model_init=model_init,
    args=optuna_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print("Iniciando busca de hiperparâmetros com Optuna...")

best_run = optuna_trainer.hyperparameter_search(
    direction="maximize",
    n_trials=15,
    hp_space=hp_space,
    backend="optuna"
)

print("\nMelhores hiperparâmetros encontrados:")
for param, value in best_run.hyperparameters.items():
    print(f"{param}: {value}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-06-07 20:41:18,459] A new study created in memory with name: no-name-217af405-039f-40e1-80a2-dc33a658bf96


Iniciando busca de hiperparâmetros com Optuna...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcleandersilva[0m ([33mcleandersilva-portal-puc-campinas[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1
1,0.2704,0.17755,0.920306
2,0.1603,0.166818,0.929976
3,0.1476,0.176048,0.932205
4,0.1302,0.180885,0.932501


[I 2025-06-07 21:00:19,249] Trial 0 finished with value: 0.9325009789725797 and parameters: {'learning_rate': 6.229243288212518e-06, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.1701338063494077, 'gradient_accumulation_steps': 4, 'warmup_ratio': 0.010867739418388034}. Best is trial 0 with value: 0.9325009789725797.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁▇██
eval/loss,▆▁▆█
eval/runtime,▁▄▁█
eval/samples_per_second,█▅█▁
eval/steps_per_second,█▅█▁
train/epoch,▁▂▂▃▄▅▆▆▇██
train/global_step,▁▂▂▃▄▅▆▆▇██
train/grad_norm,▆█▁▅▁▇
train/learning_rate,█▇▅▄▂▁
train/loss,█▄▃▂▂▁

0,1
eval/f1,0.9325
eval/loss,0.18089
eval/runtime,6.9617
eval/samples_per_second,546.99
eval/steps_per_second,8.619
total_flos,7013488291676160.0
train/epoch,4.0
train/global_step,3332.0
train/grad_norm,4.93066
train/learning_rate,0.0


Epoch,Training Loss,Validation Loss,F1
1,0.2322,0.229666,0.902494
2,0.1799,0.183633,0.934565
3,0.1154,0.23546,0.932991


[I 2025-06-07 21:19:11,225] Trial 1 finished with value: 0.9329905171571024 and parameters: {'learning_rate': 4.793943064208717e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.024996744871575903, 'gradient_accumulation_steps': 1, 'warmup_ratio': 0.030215476310005106}. Best is trial 1 with value: 0.9329905171571024.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁██
eval/loss,▇▁█
eval/runtime,▁█▅
eval/samples_per_second,█▁▄
eval/steps_per_second,█▁▄
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/grad_norm,▁▂▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁█▁
train/learning_rate,██▇▇▆▆▆▅▅▄▄▄▃▃▃▂▂▁▁
train/loss,█▅▄▅▅▅▄▄▃▃▃▃▃▂▂▂▁▂▁

0,1
eval/f1,0.93299
eval/loss,0.23546
eval/runtime,6.88
eval/samples_per_second,553.489
eval/steps_per_second,8.721
total_flos,5260116218757120.0
train/epoch,3.0
train/global_step,9996.0
train/grad_norm,0.05339
train/learning_rate,0.0


Epoch,Training Loss,Validation Loss,F1
1,0.2168,0.191266,0.924728
2,0.1511,0.204708,0.934924


[I 2025-06-07 21:31:40,659] Trial 2 finished with value: 0.9349236272175826 and parameters: {'learning_rate': 4.840981590642851e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.2755551526558927, 'gradient_accumulation_steps': 1, 'warmup_ratio': 0.04642400936685703}. Best is trial 2 with value: 0.9349236272175826.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁█
eval/loss,▁█
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▂▃▃▄▄▄▅▆▆▇▇███
train/global_step,▁▂▂▃▃▄▄▄▅▆▆▇▇███
train/grad_norm,▁▁▁▁▃▂▂█▁▁▂▁▁
train/learning_rate,█▇▇▆▆▅▅▄▃▃▂▂▁
train/loss,█▅▄▄▄▃▃▂▂▁▂▁▁

0,1
eval/f1,0.93492
eval/loss,0.20471
eval/runtime,6.9062
eval/samples_per_second,551.39
eval/steps_per_second,8.688
total_flos,3506744145838080.0
train/epoch,2.0
train/global_step,6664.0
train/grad_norm,0.33203
train/learning_rate,0.0


Epoch,Training Loss,Validation Loss,F1
1,0.3923,0.183951,0.920418
2,0.1669,0.162985,0.931356
3,0.1537,0.17253,0.929249
4,0.1266,0.177117,0.930995
5,0.1163,0.189477,0.929354


[I 2025-06-07 21:52:34,686] Trial 3 finished with value: 0.9293537615197454 and parameters: {'learning_rate': 6.900536899259329e-06, 'per_device_train_batch_size': 16, 'num_train_epochs': 5, 'weight_decay': 0.19715091674742594, 'gradient_accumulation_steps': 2, 'warmup_ratio': 0.16703785909835495}. Best is trial 2 with value: 0.9349236272175826.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁█▇█▇
eval/loss,▇▁▄▅█
eval/runtime,▁█▄▅▄
eval/samples_per_second,█▁▅▄▅
eval/steps_per_second,█▁▅▄▅
train/epoch,▁▂▂▃▃▄▅▅▆▆▇███
train/global_step,▁▂▂▃▃▄▅▅▆▆▇███
train/grad_norm,▅▄▄▄▁▄▇█
train/learning_rate,▆█▇▆▄▃▂▁
train/loss,█▃▂▂▂▁▁▁

0,1
eval/f1,0.92935
eval/loss,0.18948
eval/runtime,6.9018
eval/samples_per_second,551.743
eval/steps_per_second,8.693
total_flos,8766860364595200.0
train/epoch,5.0
train/global_step,4165.0
train/grad_norm,7.7672
train/learning_rate,0.0


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1
1,No log,0.184819,0.925718
2,0.266100,0.178954,0.92736


[I 2025-06-07 22:00:03,060] Trial 4 finished with value: 0.9273597154391857 and parameters: {'learning_rate': 5.5608315124188e-06, 'per_device_train_batch_size': 16, 'num_train_epochs': 2, 'weight_decay': 0.25858127676884646, 'gradient_accumulation_steps': 4, 'warmup_ratio': 0.06527188568547647}. Best is trial 2 with value: 0.9349236272175826.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁█
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂██
train/global_step,▁▂██
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/f1,0.92736
eval/loss,0.17895
eval/runtime,6.8492
eval/samples_per_second,555.979
eval/steps_per_second,8.76
total_flos,3506744145838080.0
train/epoch,2.0
train/global_step,834.0
train/grad_norm,2.7468
train/learning_rate,0.0


Epoch,Training Loss,Validation Loss,F1
1,0.207,0.197827,0.904213


[I 2025-06-07 22:03:59,779] Trial 5 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇█
train/global_step,▁▄▇█
train/grad_norm,█▁▃
train/learning_rate,▁▇█
train/loss,█▁▁

0,1
eval/f1,0.90421
eval/loss,0.19783
eval/runtime,6.9627
eval/samples_per_second,546.917
eval/steps_per_second,8.617
train/epoch,1.0
train/global_step,1666.0
train/grad_norm,5.48393
train/learning_rate,3e-05
train/loss,0.207


Epoch,Training Loss,Validation Loss,F1
1,0.2047,0.198043,0.923112


[I 2025-06-07 22:07:56,620] Trial 6 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇█
train/global_step,▁▄▇█
train/grad_norm,▆▁█
train/learning_rate,▁█▇
train/loss,█▁▁

0,1
eval/f1,0.92311
eval/loss,0.19804
eval/runtime,6.8903
eval/samples_per_second,552.661
eval/steps_per_second,8.708
train/epoch,1.0
train/global_step,1666.0
train/grad_norm,5.98166
train/learning_rate,1e-05
train/loss,0.2047


Epoch,Training Loss,Validation Loss,F1
1,0.2636,0.166989,0.928225
2,0.1555,0.164098,0.930339


[I 2025-06-07 22:15:06,765] Trial 7 finished with value: 0.9303387707168559 and parameters: {'learning_rate': 1.1478196809318994e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2, 'weight_decay': 0.20422806059467968, 'gradient_accumulation_steps': 3, 'warmup_ratio': 0.12061593785961211}. Best is trial 2 with value: 0.9349236272175826.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁█
eval/loss,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▂▇██
train/global_step,▁▂▇██
train/grad_norm,▁█
train/learning_rate,█▁
train/loss,█▁

0,1
eval/f1,0.93034
eval/loss,0.1641
eval/runtime,6.9265
eval/samples_per_second,549.772
eval/steps_per_second,8.662
total_flos,3681449886597120.0
train/epoch,2.0
train/global_step,1112.0
train/grad_norm,2.7017
train/learning_rate,0.0


Epoch,Training Loss,Validation Loss,F1
1,0.2823,0.194362,0.901953


[I 2025-06-07 22:18:19,173] Trial 8 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁█
train/global_step,▁█
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
eval/f1,0.90195
eval/loss,0.19436
eval/runtime,6.8372
eval/samples_per_second,556.952
eval/steps_per_second,8.776
train/epoch,1.0
train/global_step,556.0
train/grad_norm,3.01455
train/learning_rate,2e-05
train/loss,0.2823


Epoch,Training Loss,Validation Loss,F1
1,0.1875,0.175195,0.927787
2,0.1392,0.174142,0.931062


[I 2025-06-07 22:26:55,095] Trial 9 finished with value: 0.931062017567208 and parameters: {'learning_rate': 2.395791502451541e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.012804252680517858, 'gradient_accumulation_steps': 3, 'warmup_ratio': 0.14925682238816249}. Best is trial 2 with value: 0.9349236272175826.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁█
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▃▃▅▇██
train/global_step,▁▃▃▅▇██
train/grad_norm,█▂▂▁
train/learning_rate,█▆▃▁
train/loss,█▃▂▁

0,1
eval/f1,0.93106
eval/loss,0.17414
eval/runtime,6.8777
eval/samples_per_second,553.672
eval/steps_per_second,8.724
total_flos,3681449886597120.0
train/epoch,2.0
train/global_step,2222.0
train/grad_norm,0.5978
train/learning_rate,0.0


Epoch,Training Loss,Validation Loss,F1
1,0.2116,0.228071,0.890534


[I 2025-06-07 22:31:25,808] Trial 10 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▄▇█
train/global_step,▁▄▇█
train/grad_norm,█▁▅
train/learning_rate,▁▅█
train/loss,█▁▁

0,1
eval/f1,0.89053
eval/loss,0.22807
eval/runtime,6.9458
eval/samples_per_second,548.242
eval/steps_per_second,8.638
train/epoch,1.0
train/global_step,1666.0
train/grad_norm,2.06268
train/learning_rate,5e-05
train/loss,0.2116


Epoch,Training Loss,Validation Loss,F1
1,0.2255,0.195136,0.917887


[I 2025-06-07 22:37:06,123] Trial 11 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▅▆▇█
train/global_step,▁▂▃▅▆▇█
train/grad_norm,▂█▁▁▂▇
train/learning_rate,█▇▅▄▂▁
train/loss,█▃▂▁▂▁

0,1
eval/f1,0.91789
eval/loss,0.19514
eval/runtime,6.8171
eval/samples_per_second,558.599
eval/steps_per_second,8.801
train/epoch,1.0
train/global_step,3332.0
train/grad_norm,7.75621
train/learning_rate,3e-05
train/loss,0.2255


Epoch,Training Loss,Validation Loss,F1
1,0.2186,0.213955,0.920821


[I 2025-06-07 22:42:44,199] Trial 12 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▅▆▇█
train/global_step,▁▂▃▅▆▇█
train/grad_norm,▂▁▁▁▁█
train/learning_rate,▃█▆▄▃▁
train/loss,█▄▂▂▁▁

0,1
eval/f1,0.92082
eval/loss,0.21395
eval/runtime,6.9521
eval/samples_per_second,547.75
eval/steps_per_second,8.631
train/epoch,1.0
train/global_step,3332.0
train/grad_norm,7.48538
train/learning_rate,2e-05
train/loss,0.2186


Epoch,Training Loss,Validation Loss,F1
1,0.2101,0.222612,0.915558


[I 2025-06-07 22:48:23,135] Trial 13 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▅▆▇█
train/global_step,▁▂▃▅▆▇█
train/grad_norm,▃▁▁▁▆█
train/learning_rate,█▇▅▄▂▁
train/loss,█▃▂▁▂▁

0,1
eval/f1,0.91556
eval/loss,0.22261
eval/runtime,6.8651
eval/samples_per_second,554.691
eval/steps_per_second,8.74
train/epoch,1.0
train/global_step,3332.0
train/grad_norm,1.20404
train/learning_rate,2e-05
train/loss,0.2101


Epoch,Training Loss,Validation Loss,F1
1,0.2293,0.231783,0.916564


[I 2025-06-07 22:54:01,807] Trial 14 pruned. 



Melhores hiperparâmetros encontrados:
learning_rate: 4.840981590642851e-05
per_device_train_batch_size: 8
num_train_epochs: 2
weight_decay: 0.2755551526558927
gradient_accumulation_steps: 1
warmup_ratio: 0.04642400936685703


##Utilizando o Modelo e a API

In [None]:
pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=25af2d0882f3a2815509c7ced48b376fba8956a16bd98c7deff7cc5ffe725443
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from fpdf import FPDF
from datetime import datetime
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
ig_user_id = "17841449666813574"
app_id = "1750857045465243"
app_secret = "3879b6aeb6718a852a5bed85f1ab5fde"
user_access_token = "EAAY4ZASwZCWJsBO9DOETVck86K7likqssFJ88bX96jfn00zEr2QWY40D1kqpJ9BtfDtGMCwZBr2C1iYj7mVPTPSQjIZA5p3shN7WTPyyYB9xpBlCkvmz23uFrL6iTz15hUl0d7ZAGwcZA0jicA0j1yghUofENIClhf70xzMwlrbVvJb8b0SLyPg9fcUvK4Jwgw09kC2rplqc0Ds0H2mAZDZD"

url = f"https://graph.facebook.com/v17.0/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={user_access_token}"
response = requests.get(url)
long_access_token = response.json()["access_token"]

base_url = f"https://graph.facebook.com/v17.0/{ig_user_id}/media?fields=id,caption,timestamp&access_token={long_access_token}"

if not os.path.exists('graficos'):
    os.makedirs('graficos')

In [None]:
def coletar_comentarios_por_publicacao():
    publicacoes = []
    response = requests.get(base_url)
    if response.status_code == 200:
        data = response.json()['data']
        for item in data:
            media_id = item['id']
            caption = item.get('caption', 'Sem legenda')
            timestamp = item.get('timestamp', None)

            comments_url = f'https://graph.facebook.com/v17.0/{media_id}/comments?fields=id,text,timestamp,username&access_token={long_access_token}'
            comments_response = requests.get(comments_url)

            comentarios = []
            if comments_response.status_code == 200:
                comments_data = comments_response.json().get('data', [])
                comentarios = [comment['text'] for comment in comments_data]
            else:
                print(f'Erro ao buscar comentários da mídia {media_id}')

            publicacoes.append({
                'media_id': media_id,
                'caption': caption,
                'comentarios': comentarios,
                'timestamp': timestamp
            })
    else:
        print('Erro ao buscar mídias:', response.text)

    return publicacoes

In [None]:
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/sentiment_model")
tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/sentiment_model")

In [None]:
def analisar_sentimentos(comentarios):
    resultados = []
    if comentarios:
        encodings = tokenizer(comentarios, padding=True, truncation=True, max_length=128, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**encodings)
            predictions = outputs.logits.argmax(dim=-1)

        for comentario, pred in zip(comentarios, predictions):
            sentimento = 'Positivo' if pred.item() == 1 else 'Negativo'
            resultados.append((comentario, sentimento))
    return resultados

In [None]:
def calcular_metricas(resultados):
    total = len(resultados)
    positivos = sum(1 for _, s in resultados if s == 'Positivo')
    negativos = total - positivos
    porcentagem_positivos = positivos / total * 100 if total else 0
    porcentagem_negativos = negativos / total * 100 if total else 0
    return positivos, negativos, porcentagem_positivos, porcentagem_negativos

In [None]:
def gerar_grafico_publicacao(caption, positivos, negativos, media_id):
    labels = ['Positivos', 'Negativos']
    sizes = [positivos, negativos]
    colors = ['#4CAF50', '#F44336']

    fig, ax = plt.subplots()
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
    ax.axis('equal')
    plt.title(caption[:50] + '...')
    caminho = f'graficos/{media_id}.png'
    plt.savefig(caminho)
    plt.close()
    return caminho

"""
def gerar_grafico_geral(total_positivos, total_negativos):
    labels = ['Positivos', 'Negativos']
    sizes = [total_positivos, total_negativos]
    colors = ['#4CAF50', '#F44336']

    fig, ax = plt.subplots()
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
    ax.axis('equal')
    plt.title('Distribuição Geral dos Sentimentos')
    caminho = 'graficos/geral.png'
    plt.savefig(caminho)
    plt.close()
    return caminho
"""

def gerar_grafico_temporal(resultados_temporais):
    if not resultados_temporais:
        return None

    resultados_ordenados = sorted(resultados_temporais, key=lambda x: x['data'])

    datas = [
      datetime.strptime(item['data'], '%Y-%m-%dT%H:%M:%S%z').strftime('%d/%m/%Y %H:%M')
      for item in resultados_ordenados
    ]

    porcentagens = [item['pct_positivos'] for item in resultados_ordenados]
    legends = [item['caption'][:30] + '...' if len(item['caption']) > 30 else item['caption'] for item in resultados_ordenados]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(datas, porcentagens, marker='o', color='#2196F3', linestyle='-')

    for i, txt in enumerate(legends):
        ax.annotate(txt, (i, porcentagens[i]), textcoords="offset points", xytext=(0,10),
                    ha='center', fontsize=8, rotation=45)

    ax.set_xticks(datas)
    ax.set_xticklabels(datas, rotation=45, ha='right', fontsize=8)

    ax.set_title('Evolução da Avaliação das Publicações ao Longo do Tempo')
    ax.set_xlabel('Data da Publicação')
    ax.set_ylabel('% de Comentários Positivos')
    ax.set_ylim(0, 100)
    ax.grid(True)

    caminho = 'graficos/grafico_temporal.png'
    plt.tight_layout()
    plt.savefig(caminho)
    plt.close()
    return caminho


In [None]:
#Função utilizada na solução provisória para os emojis dando erro ao gerar o pdf
def remove_emojis(text):
    return text.encode('latin-1', 'ignore').decode('latin-1')

In [None]:
class PDFRelatorio(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 16)
        self.cell(0, 10, 'Relatório de Análise de Sentimentos - Instagram', 0, 1, 'C')
        self.ln(10)

    def footer(self):
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Página {self.page_no()}', 0, 0, 'C')

    def add_publicacao(self, caption, positivos, negativos, porcentagem_positivos, porcentagem_negativos, grafico_path):
        self.set_font('Arial', 'B', 12)
        #Solução provisória para os emojis
        self.multi_cell(0, 10, remove_emojis(caption))
        self.set_font('Arial', '', 12)
        self.cell(0, 10, f'Positivos: {positivos} ({porcentagem_positivos:.2f}%)', 0, 1)
        self.cell(0, 10, f'Negativos: {negativos} ({porcentagem_negativos:.2f}%)', 0, 1)
        self.ln(3)
        self.image(grafico_path, w=150)
        self.ln(10)

    def add_conclusao_geral(self, total_positivos, total_negativos, pct_positivos, pct_negativos, grafico_path):
        self.add_page()
        self.set_font('Arial', 'B', 14)
        self.cell(0, 10, 'Resumo Geral', 0, 1, 'C')
        self.ln(5)
        self.set_font('Arial', '', 12)
        self.cell(0, 10, f'Total de Comentários Positivos: {total_positivos} ({pct_positivos:.2f}%)', 0, 1)
        self.cell(0, 10, f'Total de Comentários Negativos: {total_negativos} ({pct_negativos:.2f}%)', 0, 1)
        self.ln(5)
        self.image(grafico_path, w=150)
        self.ln(10)

        conclusao = 'Conclusão geral: '
        if pct_positivos > 70:
            conclusao += 'O perfil está muito bem avaliado!'
        elif pct_positivos > 40:
            conclusao += 'O perfil está com avaliação mista.'
        else:
            conclusao += 'O perfil está sendo mal avaliado.'

        self.multi_cell(0, 10, conclusao)

###Gerando o Relatório

In [None]:
publicacoes = coletar_comentarios_por_publicacao()
pdf = PDFRelatorio()
pdf.add_page()

print(publicacoes)

total_resultados = []
resultados_temporais = []

total_positivos = 0
total_negativos = 0

for publicacao in publicacoes:
    comentarios = publicacao['comentarios']
    caption = publicacao['caption']
    media_id = publicacao['media_id']

    if comentarios:
        resultados = analisar_sentimentos(comentarios)
        positivos, negativos, pct_positivos, pct_negativos = calcular_metricas(resultados)
        grafico_path = gerar_grafico_publicacao(caption, positivos, negativos, media_id)

        pdf.add_publicacao(caption, positivos, negativos, pct_positivos, pct_negativos, grafico_path)

        total_positivos += positivos
        total_negativos += negativos
        total_resultados.extend(resultados)

        resultados_temporais.append({
            'data': publicacao['timestamp'],
            'pct_positivos': pct_positivos,
            'caption': caption
        })

grafico_temporal_path = gerar_grafico_temporal(resultados_temporais)
pct_total_positivos = total_positivos / (total_positivos + total_negativos) * 100 if (total_positivos + total_negativos) else 0
pct_total_negativos = 100 - pct_total_positivos

pdf.add_conclusao_geral(total_positivos, total_negativos, pct_total_positivos, pct_total_negativos, grafico_temporal_path)

pdf.output('relatorio_sentimentos_instagram.pdf')

[{'media_id': '17982061616675940', 'caption': 'O que esse negócio de I.A tá ficando bom em foto é brincadeira 😳', 'comentarios': ['Que lindosss❤️❤️', '❤️❤️❤️', 'A IA nao colocou aliança na sua foto 😠', 'Amei!! ❤️❤️', 'ta roubando o emprego do vasco', 'Legal que na terceira foto a Le não tá de olho fechado mas a IA entendeu que tava hahahaha', 'A Porsche virou fusca kkkkkkkk', 'show de bola🙌❤️', 'parece o dj oreia', 'ficou parecido irmão 👏👏'], 'timestamp': '2025-03-31T15:00:00+0000'}, {'media_id': '18487181953049729', 'caption': 'Obrigado por essa vista maravilhosa!! 🥹', 'comentarios': ['Kkkkkkkkkkkkk', '😂😂😂😂😂', '👏👏👏muito  bom', 'O que importa é a companhia!!', '😂😂😂', 'Nuussssss......deu ate medo 😂😂'], 'timestamp': '2025-01-06T16:19:11+0000'}, {'media_id': '18044393930191067', 'caption': 'Eu e você, você e eu ♥️', 'comentarios': ['👏👏👏👏👏👏👏👏👏', '🔥🔥🔥🔥🔥🔥kkkkk', '👏👏👏👏👏🔥🔥🔥🔥', '💘💘💘💘', 'linducos', 'Lindos amooooo ❤️❤️❤️', 'Seus lindos ❤️❤️', 'Lindicos', 'Meu tudinho', 'Te amo muito lindeza❤️❤️❤

  plt.savefig(caminho)
  plt.savefig(caminho)
  plt.savefig(caminho)
  plt.savefig(caminho)
  plt.savefig(caminho)
  plt.savefig(caminho)
  plt.savefig(caminho)


''