In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score, precision_score, recall_score, f1_score

## Pré-processamento

In [None]:
# Read the three source CSV files (two with true news, one with fake news).
true = pd.read_csv('noticias/true.csv')
true_2 = pd.read_csv('noticias/true_news.csv')
fake = pd.read_csv('noticias/fake.csv')

# Stack them row-wise into one frame with a fresh 0..n-1 index.
news = pd.concat(
    [true, fake, true_2],
    ignore_index=True,
)
news.head()

In [None]:
# Inspect the last rows of the combined frame.
news.tail()

In [None]:
# Remove the metadata columns that are not used further down.
# The drop happens in place, so re-running this cell on the same frame
# raises KeyError (the columns are already gone).
news.drop(
    columns=['title', 'url', 'publisher_site', 'origin', 'publisher_name', 'date'],
    inplace=True,
)
news

In [None]:
# Count missing values per column.
news.isna().sum()

In [6]:
# Flip the label encoding: true=0, fake=1 becomes true=1, fake=0.
# Every value that is not equal to 0 ends up as 0.
# Not idempotent: with 0/1 labels, running this cell twice flips them back.
news['label'] = (news['label'] == 0).astype('int64')

In [None]:
# Show the distinct label values present after the flip.
news['label'].unique() 

In [None]:
# Bar chart with the number of rows per label value.
sns.countplot(x='label', data=news)

In [None]:
# Shuffle all rows with a fixed seed, then rebuild a clean 0..n-1 index.
news = (
    news
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
news

In [10]:
def text_cleaning(text):
    """Normalise a news text before tokenisation.

    Steps, in order: lowercase, strip URLs, strip punctuation, strip digits.
    URLs are removed *before* punctuation; otherwise the ``://`` and ``.``
    characters the URL pattern relies on would already be gone and the
    pattern could never match.

    Args:
        text: Raw text. Non-string values (e.g. ``None`` or a NaN from a
            missing cell) are treated as empty text.

    Returns:
        The cleaned string. Whitespace is left untouched, so removing a token
        may leave consecutive spaces behind.

    NOTE(review): the text is lowercased although the notebook loads a cased
    checkpoint with ``do_lower_case=False`` - confirm this is intended.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove URLs (must run while '://' and '.' are still present).
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove punctuation: anything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits.
    text = re.sub(r'\d', '', text)

    return text

In [None]:
# Run every text through text_cleaning, overwriting the column with the result.
news['text'] = news['text'].map(text_cleaning)
news

## Modelagem e treinamento

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and two-class classification model from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    do_lower_case=False,
    clean_up_tokenization_spaces=True,
)
model = BertForSequenceClassification.from_pretrained(
    'neuralmind/bert-base-portuguese-cased',
    num_labels=2,
)

# Trailing ';' keeps the model repr out of the cell output.
model.to(device);

In [None]:
# Plain Python lists of texts and labels, in row order.
noticias = news['text'].to_list()
rotulo = news['label'].to_list()

In [None]:
# Tokenise every text at once: truncate to at most 128 tokens, pad, and
# return PyTorch tensors.
encodings = tokenizer(
    noticias,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)

# torch.tensor already builds a fresh tensor that is detached from any graph.
labels = torch.tensor(rotulo)

# Sanity check: both counts should match.
print(len(encodings['input_ids']), len(labels))

In [None]:
# 80/20 split with a fixed seed. Token ids, labels and attention masks are
# passed together so all three are split with the same row selection.
(
    x_train, x_test,
    y_train, y_test,
    mask_train, mask_test,
) = train_test_split(
    encodings['input_ids'],
    labels,
    encodings['attention_mask'],
    test_size=0.2,
    random_state=1,
)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

In [52]:
class createDataset(Dataset):
    """Dataset pairing a mapping of encoded fields with a label tensor.

    Args:
        encodings: Mapping from field name to an indexable container; item
            ``index`` of every field belongs to the same sample.
        labels: Indexable container of label tensors; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return a dict with every encoded field of one sample plus 'labels'."""
        sample = {}
        for field in self.encodings:
            sample[field] = self.encodings[field][index]
        # Hand out a detached copy of the label rather than the stored tensor.
        sample['labels'] = self.labels[index].clone().detach()
        return sample

In [53]:
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            sample) and ``label_ids`` (the reference labels).

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1-score'.
    """
    expected = p.label_ids
    # The predicted class is the column with the highest score in each row.
    predicted = np.argmax(p.predictions, axis=1)

    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1-score', f1_score),
    )
    return {name: scorer(expected, predicted) for name, scorer in scorers}

In [54]:
# Regroup the split tensors under the field names the model expects.
train_encodings = dict(input_ids=x_train, attention_mask=mask_train)
test_encodings = dict(input_ids=x_test, attention_mask=mask_test)

# Wrap each split, together with its labels, in a Dataset.
train_dataset = createDataset(train_encodings, y_train)
test_dataset = createDataset(test_encodings, y_test)

In [None]:
# Bare string literal listing training options that are NOT in use: nothing
# in it is executed or passed to TrainingArguments. Kept for reference only.
'''
weight_decay=0.01,                
logging_dir='./logs',                              
save_steps=500,                   
save_total_limit=2,               
metric_for_best_model="precision",
seed=1, '''

In [None]:
# Train for 3 epochs, evaluate at the end of each epoch, write outputs to
# ./results and disable external experiment reporting.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    report_to="none",
)

# The test split doubles as the evaluation set used during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

## Avaliação

In [None]:
# Evaluate the fine-tuned model on the Trainer's eval_dataset (the test split).
trainer.evaluate()

In [None]:
# Persist the model and its tokenizer side by side in ./trained_model so they
# can be reloaded together later.
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

In [None]:
# Manual evaluation pass over the test split, followed by a classification
# report, the accuracy and a confusion-matrix heatmap.
true_labels = []
pred_labels = []

# Inference mode: disables dropout.
model.eval()

validation_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

with torch.no_grad():
    for batch in validation_dataloader:
        # Keep the batch labels under their own name so the notebook-level
        # `labels` tensor (all labels, consumed by the split cell) is not
        # overwritten by this loop.
        batch_labels = batch['labels'].cpu().numpy()

        # Only the model inputs go to the device; labels stay on the CPU.
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}

        outputs = model(**inputs)

        preds = torch.argmax(outputs.logits, dim=1).to('cpu').numpy()

        true_labels.extend(batch_labels)
        pred_labels.extend(preds)

# Pin both classes so the matrix is always 2x2 and lines up with the fixed
# tick labels below, even if one class is missing from the results.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1])
print(classification_report(true_labels, pred_labels))
bert_accuracy = accuracy_score(true_labels, pred_labels)
print(bert_accuracy)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='crest', cbar=True)
plt.xlabel('Rótulos previstos')
plt.ylabel('Rótulos verdadeiros')
plt.title('Matriz de confusão')
# Label 0 is fake ('Falso') and label 1 is true ('Verdadeiro').
plt.xticks([0.5, 1.5], ['Falso', 'Verdadeiro'])
plt.yticks([0.5, 1.5], ['Falso', 'Verdadeiro'])
plt.savefig('matriz-conf.png')
plt.show()

In [None]:
def predict(text):
    """Classify a single news text with the fine-tuned model.

    The text goes through the same preparation as the training data
    (``text_cleaning``, truncation to 128 tokens), so the model sees input
    shaped like what it was trained on and over-long texts cannot exceed the
    model's maximum sequence length.

    Args:
        text: Raw news text.

    Returns:
        "Verdadeiro" when the predicted label is 1, otherwise "Falso".
    """
    cleaned = text_cleaning(text)

    inputs = tokenizer(cleaned, truncation=True, max_length=128, return_tensors='pt')
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Do not rely on another cell having switched the model to eval mode:
    # with dropout active the prediction would not be deterministic.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Verdadeiro" if prediction == 1 else "Falso"

In [None]:
# Smoke test with an obviously false statement.
print(predict("O novo estudo afirma que a lua é feita de queijo."))

In [None]:
# Smoke test with a news-style sentence.
print(predict('A fumaça das queimadas que tem encoberto cidades do Norte, Centro-Oeste, Sudeste e Sul do país causa preocupação não só entre ambientalistas, mas também entre médicos e especialistas em saúde.'))

In [None]:
# NOTE(review): BertForPreTraining is imported here but not used in the
# visible cells - confirm whether it is still needed.
from transformers import BertForPreTraining
# Baseline: the pretrained checkpoint, loaded without this notebook's fine-tuning.
modelo_base = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased')
# Fine-tuned model reloaded from the directory it was saved to.
modelo_treinado = BertForSequenceClassification.from_pretrained('./trained_model')

In [None]:
# Score the baseline checkpoint on the test split with the same arguments and
# metric function used for the fine-tuned model.
trainer_base = Trainer(
    model=modelo_base,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

result_base = trainer_base.evaluate()

In [None]:
# Score the reloaded fine-tuned model on the test split with the same
# arguments and metric function used for the baseline.
trainer_treinado = Trainer(
    model=modelo_treinado,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

result_treinado = trainer_treinado.evaluate()

In [None]:
# Raw evaluation dicts for both models, side by side.
print("Modelo Base:", result_base)
print("Modelo Treinado em Notícias Falsas:", result_treinado)

In [None]:
# Reduce each evaluation dict to its four metrics, always in the order
# accuracy, precision, recall, F1 (the plotting cell relies on this order).
# Both names are rebound, so this cell can only run once per evaluation:
# a second run would no longer find the 'eval_*' keys.
result_base = {
    short_name: result_base[eval_key]
    for short_name, eval_key in (
        ('accuracy', 'eval_accuracy'),
        ('precision', 'eval_precision'),
        ('recall', 'eval_recall'),
        ('f1', 'eval_f1-score'),
    )
}

result_treinado = {
    short_name: result_treinado[eval_key]
    for short_name, eval_key in (
        ('Accuracy', 'eval_accuracy'),
        ('Precision', 'eval_precision'),
        ('Recall', 'eval_recall'),
        ('F1-score', 'eval_f1-score'),
    )
}

In [None]:
# Grouped bar chart comparing the baseline and the fine-tuned model on the
# four metrics. The value lists follow the insertion order of the result
# dicts: accuracy, precision, recall, F1.
metrics = ["Acurácia", "Precisão", "Revocação", "F1-Score"]
values_base = list(result_base.values())
values_treinado = list(result_treinado.values())

x = np.arange(len(metrics))
width = 0.35  # width of each bar; the two bars of a group sit side by side

fig, ax = plt.subplots(figsize=(10, 6))
bar1 = ax.bar(x - width/2, values_base, width, label='Modelo Base', color='royalblue')
bar2 = ax.bar(x + width/2, values_treinado, width, label='Modelo Treinado', color='seagreen')

ax.set_xlabel('Métricas')
ax.set_ylabel('Valores')
ax.set_title('Comparação entre os Modelos')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))

# Annotate every bar with its value. The metrics are fractions in [0, 1], so
# the '%' format type (which multiplies by 100) is used instead of appending
# a literal '%' to the raw fraction.
for bar in (*bar1, *bar2):
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, yval, f'{yval:.2%}', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('comp-modelos.png')
plt.show()