<a href="https://colab.research.google.com/github/EricRibeiroAlves/DeteccaoFakeNews/blob/main/Modelos/%5BAP_T%5D_TI_Modelo_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importação de Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
from google.colab import drive

# Tratamento

In [None]:
# Load the ISOT Fake News Dataset from Google Drive.
# Drive must be mounted before the CSV paths below can be read.
drive.mount('/content/drive')

# Locations of the two CSV files inside the mounted Drive
dat_fake = "/content/drive/MyDrive/Eng. Controle e Automação/8º Semestre/AP/dataset_FakeNews/Fake.csv"
dat_real = "/content/drive/MyDrive/Eng. Controle e Automação/8º Semestre/AP/dataset_FakeNews/True.csv"

# Read each source and tag every row with its class name right away
dt_fake = pd.read_csv(dat_fake)
dt_fake['label'] = 'fake'

dt_real = pd.read_csv(dat_real)
dt_real['label'] = 'real'

# Stack both sources into one frame, then keep only the two subjects of interest
df = pd.concat([dt_fake, dt_real], ignore_index=True)
df = df[df['subject'].isin(['News', 'worldnews'])]

# Drop every row that has a missing value, then show the class balance
df = df.dropna()
print(df['label'].value_counts())

# Pré-Processamento

In [None]:
# Encode the string labels as integers (fake=0, real=1).
# The encoded values go into a separate Series instead of overwriting
# df['label']: overwriting made this cell non-idempotent, because a second run
# would fit the encoder on the integers 0/1 and label_encoder.classes_ would
# no longer hold the class names used later as report target names.
label_encoder = LabelEncoder()
y_encoded = pd.Series(
    label_encoder.fit_transform(df['label']),
    index=df.index,  # keep rows aligned with df['text']
    name='label',
)

# Train/test split: 70% train, 30% test, fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    df['text'], y_encoded, test_size=0.3, random_state=42
)

# Tokenizer matching the pretrained checkpoint that is fine-tuned later
modelo_bert = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(modelo_bert)

def tokenizar_dados(textos, labels=None, max_length=256, batch_size=16):
    """Tokenize texts with the notebook-level ``tokenizer`` and batch them.

    Args:
        textos: Iterable of strings to tokenize; it is materialized with
            ``list`` before being handed to the tokenizer.
        labels: Optional labels aligned with ``textos``. When provided, the
            dataset yields ``(features, label)`` pairs; otherwise it yields
            only the feature dictionaries.
        max_length: Maximum sequence length; longer inputs are truncated.
        batch_size: Number of examples per batch. Defaults to 16, the value
            that used to be hard-coded.

    Returns:
        A batched ``tf.data.Dataset`` built from the tokenizer output.
    """
    encodings = tokenizer(
        list(textos),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='tf'
    )
    features = dict(encodings)

    # With labels the slices are (features, label) pairs; without, features only
    tensors = features if labels is None else (features, labels)
    return tf.data.Dataset.from_tensor_slices(tensors).batch(batch_size)

# Build the batched TensorFlow datasets for the train and test splits
train_data = tokenizar_dados(textos=x_train, labels=y_train)
test_data = tokenizar_dados(textos=x_test, labels=y_test)

# Modelo BERT

In [1]:
# Pretrained BERT with a fresh 2-class classification head
modelo = TFBertForSequenceClassification.from_pretrained(
    modelo_bert,
    num_labels=2
)

# Fine-tuning learning rate. The previous value (0.1) is several orders of
# magnitude too large for fine-tuning a pretrained transformer and makes
# training diverge; the BERT authors recommend values around 2e-5 to 5e-5.
LEARNING_RATE = 2e-5

# The model outputs raw logits, hence from_logits=True in the loss
modelo.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Training. The test split is passed as validation data only to monitor
# the run; nothing here selects a model based on it.
history = modelo.fit(
    train_data,
    epochs=1,  # increase if needed
    validation_data=test_data,
    verbose=1
)



NameError: name 'TFBertForSequenceClassification' is not defined

# Validação

In [None]:
# Run inference on the test batches and keep the raw class scores
saida = modelo.predict(test_data)
logits = saida.logits

# Predicted class is the index of the largest logit in each row
y_pred = tf.argmax(logits, axis=1).numpy()

# Softmax score of class index 1, used as the ranking score for ROC-AUC
prob_classe_1 = tf.nn.softmax(logits, axis=1)[:, 1].numpy()

# Summary metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, prob_classe_1)

print(f"Acurácia: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

# Per-class precision/recall/F1, labelled with the encoder's class names
relatorio = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(relatorio)
