<a href="https://colab.research.google.com/github/AdaliaFlores/DetectorSesgo/blob/main/Detector_de_sesgo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

from transformers import pipeline

# Cargamos un modelo preentrenado de análisis de sentimiento (como prueba)
classifier = pipeline("sentiment-analysis")

# Prueba con un texto
texto = "Todos los políticos son corruptos y solo piensan en robar"
resultado = classifier(texto)

print("Resultado:", resultado)




No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


Resultado: [{'label': 'NEGATIVE', 'score': 0.9867910146713257}]


In [1]:
import pandas as pd
import csv

# Cargar los archivos ignorando líneas corruptas y problemas de comillas
fake_df = pd.read_csv("DataSet_Misinfo_FAKE.csv", on_bad_lines='skip', quoting=csv.QUOTE_NONE, encoding='utf-8', engine='python')
true_df = pd.read_csv("DataSet_Misinfo_TRUE.csv", on_bad_lines='skip', quoting=csv.QUOTE_NONE, encoding='utf-8', engine='python')

# Etiquetas
fake_df["label"] = 1
true_df["label"] = 0

# Unir datasets
df = pd.concat([fake_df, true_df], ignore_index=True)

# Mostrar resumen
print(df["label"].value_counts())
df.head()


label
1    201765
0     88952
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,text,label
0,to all the people who voted for this a hole t...,you were wrong! 70-year-old men don t change ...,1
1,,,1
2,,,1
3,,,1
4,,,1


In [2]:
import pandas as pd

# Leer el archivo sin separar columnas (todo como una sola línea por fila)
true_df = pd.read_csv("DataSet_Misinfo_TRUE.csv", header=None, names=["text"], engine="python", on_bad_lines='skip')

# Eliminar filas vacías
true_df = true_df[true_df["text"].notnull()]
true_df = true_df[true_df["text"].str.len() > 10]

# Añadir la etiqueta de clase
true_df["label"] = 0

# Mostrar cuántos textos válidos hay
print(f"Número de textos verdaderos válidos: {len(true_df)}")
true_df.head()


Número de textos verdaderos válidos: 34941


Unnamed: 0,text,label
0.0,The head of a conservative Republican faction ...,0
1.0,Transgender people will be allowed for the fir...,0
2.0,The special counsel investigation of links bet...,0
3.0,Trump campaign adviser George Papadopoulos tol...,0
4.0,President Donald Trump called on the U.S. Post...,0


In [3]:
# Combinar columnas de texto en una sola (usando fillna para evitar errores)
df["texto"] = df["text"].fillna('') + " " + df["label"].fillna('').astype(str)

# Ahora nos quedamos solo con las filas que tengan texto real (más de 10 caracteres)
df = df[df["texto"].str.len() > 10]

# Eliminar duplicados (opcional)
df = df.drop_duplicates(subset="texto")

# Mostrar resumen
print(f"Número de textos válidos: {len(df)}")
df[["texto", "label"]].head()


Número de textos válidos: 247


Unnamed: 0,texto,label
0,you were wrong! 70-year-old men don t change ...,1
165,look at me! I m violating the U.S. flag code ...,1
274,she finishes.The whole thing sounds like an ...,1
509,honor the fact that he never wanted to see yo...,1
523,but at least he now admits it.Featured image ...,1


In [4]:
# Verificamos que fake_df esté limpio
fake_df = fake_df[fake_df["text"].notnull()]
fake_df = fake_df[fake_df["text"].str.len() > 10]
fake_df = fake_df.drop_duplicates(subset="text")
fake_df["label"] = 1

# Combinamos ambos datasets
df = pd.concat([fake_df, true_df], ignore_index=True)

# Barajamos el dataset para mezclar bien
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Mostramos el resultado
print("Distribución de clases final:")
print(df["label"].value_counts())
df.head()


Distribución de clases final:
label
0    34941
1      131
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,text,label
0,,Democratic Republic of Congo s main opposition...,0
1,,"QUICK, IS KLM a brand of milk, the call letter...",0
2,,Belgium will set up an English-language commer...,0
3,,The U.S. consumer financial watchdog on Thursd...,0
4,,U.S. President Donald Trump tweeted “SEE YOU I...,0


In [5]:
# Tomar todas las muestras de clase 1
fake_sample = fake_df

# Submuestrear clase 0 para que tenga igual cantidad que la clase 1
true_sample = true_df.sample(n=len(fake_sample), random_state=42)

# Combinar ambos conjuntos balanceados
df_balanced = pd.concat([fake_sample, true_sample], ignore_index=True)

# Mezclar
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Ver distribución
print(df_balanced["label"].value_counts())
df_balanced.head()


label
0    131
1    131
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,text,label
0,,Longer prison sentences for non-violent crimin...,0
1,,Sat in his hillside grocery shop in a Banglade...,0
2,,An Israeli court on Saturday freed without cha...,0
3,,"Pope Francis, in the first-ever papal address ...",0
4,so that if anything happens to me it s not g...,he added. WikiLeaks got access before he wa...,1


In [6]:
from sklearn.model_selection import train_test_split

# Dividir
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42, stratify=df_balanced["label"])

print(f"Entrenamiento: {len(train_df)} muestras")
print(f"Prueba: {len(test_df)} muestras")


Entrenamiento: 209 muestras
Prueba: 53 muestras


In [7]:
!pip install transformers datasets torch


Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m112.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached fsspec-2025.3.0-py3-none-any.whl (193 kB)
Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m94.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, tokenizers, transformers
  Attempting uninstall: fsspec
    Found existing installation: fsspe

In [8]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples, padding='max_length', truncation=True, max_length=128)

# Tokenizar textos de entrenamiento y prueba
train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()

test_texts = test_df['text'].tolist()
test_labels = test_df['label'].tolist()

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
import torch

class FakeNewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = FakeNewsDataset(train_encodings, train_labels)
test_dataset = FakeNewsDataset(test_encodings, test_labels)


In [14]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

# Cargar el modelo BERT con una capa final de clasificación
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Configurar los argumentos del entrenamiento (solo los compatibles con 4.5.3)
training_args = TrainingArguments(
    output_dir='./results',               # Carpeta para guardar resultados
    num_train_epochs=3,                   # Número de épocas
    per_device_train_batch_size=8,        # Batch size para entrenamiento
    per_device_eval_batch_size=8,         # Batch size para validación
    logging_dir='./logs',                 # Carpeta para logs
    logging_steps=10,                     # Cada cuántos pasos registrar logs
    seed=42,                               # Semilla para reproducibilidad
    report_to="none"
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset  # Si quieres evaluar durante el entrenamiento
)

trainer.train()


Step,Training Loss
10,0.5185
20,0.1228
30,0.0161
40,0.0983
50,0.0018
60,0.0015
70,0.0011
80,0.0009


TrainOutput(global_step=81, training_loss=0.09395186323600671, metrics={'train_runtime': 829.9419, 'train_samples_per_second': 0.755, 'train_steps_per_second': 0.098, 'total_flos': 41242657927680.0, 'train_loss': 0.09395186323600671, 'epoch': 3.0})

In [16]:
eval_result = trainer.evaluate()
print(eval_result)


{'eval_loss': 0.001126388437114656, 'eval_runtime': 19.264, 'eval_samples_per_second': 2.751, 'eval_steps_per_second': 0.363, 'epoch': 3.0}
