In [1]:
# Install PyTorch (and torchvision) if needed; --quiet trims pip's log output
!pip install torch torchvision --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import re
from collections import Counter

In [19]:
# Mount Google Drive inside the Colab runtime so the workbook below is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Path of the Excel workbook on the mounted drive.
ruta = '/content/drive/MyDrive/MeIA/PLN/fake_news.xlsx'

# Read the workbook into a DataFrame and preview its first rows.
df = pd.read_excel(ruta)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Id,Category,Topic,Source,Headline,Text,Link
0,1,Fake,Education,El Ruinaversal,"RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...","RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...",http://www.elruinaversal.com/2017/06/10/rae-in...
1,2,Fake,Education,Hay noticia,"La palabra ""haiga"", aceptada por la RAE","La palabra ""haiga"", aceptada por la RAE La Rea...",https://haynoticia.es/la-palabra-haiga-aceptad...
2,3,Fake,Education,El Ruinaversal,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,http://www.elruinaversal.com/2018/05/06/yordi-...
3,4,True,Education,EL UNIVERSAL,UNAM capacitará a maestros para aprobar prueba...,UNAM capacitará a maestros para aprobar prueba...,http://www.eluniversal.com.mx/articulo/nacion/...
4,5,Fake,Education,Lamula,pretenden aprobar libros escolares con conteni...,Alerta: pretenden aprobar libros escolares con...,https://redaccion.lamula.pe/2018/06/19/memoria...


In [21]:
# Tokenization and vocabulary creation
def tokenize(text):
    """Split raw text into lower-case word tokens with punctuation removed.

    Args:
        text: The text to tokenize. ``None`` and float NaN (how pandas
            represents an empty spreadsheet cell) produce no tokens; any
            other non-string value is converted with ``str()`` first.

    Returns:
        list[str]: The whitespace-separated pieces of the lower-cased text
        after every character that is neither a word character nor
        whitespace has been deleted.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    return text.split()

# Normalise the header names (trim whitespace, lower-case) and tokenize every text.
df.columns = df.columns.str.strip().str.lower()
df['tokens'] = df['text'].apply(tokenize)

# Token frequencies over all rows.
counter = Counter(token for row_tokens in df['tokens'] for token in row_tokens)

# Words are numbered from 2 in descending order of frequency; ids 0 and 1
# are reserved for the padding and unknown-word markers.
vocab = {
    word: word_id
    for word_id, (word, _) in enumerate(counter.most_common(), start=2)
}
vocab.update({'<PAD>': 0, '<UNK>': 1})

def encode(tokens):
    """Translate a token list into vocabulary ids.

    Looks each token up in the module-level ``vocab``; tokens that are not
    present map to id 1.
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, 1))
    return ids
df['input_ids'] = df['tokens'].apply(encode)

In [23]:
# Inspect the current column names of the DataFrame.
print(df.columns)


Index(['id', 'category', 'topic', 'source', 'headline', 'text', 'link',
       'tokens', 'input_ids'],
      dtype='object')


In [24]:
# Map every category string to an integer class id.
le = LabelEncoder()
df['label_id'] = le.fit(df['category']).transform(df['category'])

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['input_ids'],
    df['label_id'],
    test_size=0.3,
    random_state=42,
)

In [25]:
class TextDataset(Dataset):
    """In-memory dataset pairing token-id sequences with integer labels.

    Args:
        texts: Iterable of token-id sequences. Each one is stored as a 1-D
            ``torch.long`` tensor.
        labels: Integer class ids, one per text. Anything exposing
            ``tolist()`` (pandas Series, NumPy array, tensor) is accepted,
            as are plain Python sequences.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """
    def __init__(self, texts, labels):
        self.texts = [torch.tensor(t, dtype=torch.long) for t in texts]
        # Series/arrays expose tolist(); fall back to list() for lists and tuples.
        label_values = labels.tolist() if hasattr(labels, "tolist") else list(labels)
        self.labels = torch.tensor(label_values, dtype=torch.long)
        # __len__ is driven by the labels, so a mismatch would otherwise surface
        # later as an IndexError or as silently ignored texts.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` tensors stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]

def collate_fn(batch):
    """Assemble ``(sequence, label)`` pairs into a pair of batch tensors.

    Sequences are padded at the end with 0 up to the longest sequence in the
    batch.

    Returns:
        tuple: ``(padded, labels)`` where ``padded`` is laid out batch-first
        and ``labels`` holds one entry per sample.
    """
    sequences, targets = map(list, zip(*batch))
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets)

# Wrap each split in a dataset, then batch it in pairs; only the training
# loader shuffles.
train_ds, test_ds = TextDataset(X_train, y_train), TextDataset(X_test, y_test)
train_dl = DataLoader(train_ds, collate_fn=collate_fn, shuffle=True, batch_size=2)
test_dl = DataLoader(test_ds, collate_fn=collate_fn, batch_size=2)

In [26]:
# CNN with embeddings
class CNNWithEmbedding(nn.Module):
    """Text classifier: embedding -> 1-D convolution -> ReLU -> global max pool -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output logits.
        num_filters: Output channels of the convolution (default 100).
        kernel_size: Width of the convolution window in tokens (default 5).
    """
    def __init__(self, vocab_size, embed_dim, num_classes, num_filters=100, kernel_size=5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv = nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=kernel_size)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(num_filters, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Long tensor of token ids, batch on the first axis and sequence
                position on the second.

        Returns:
            Tensor with one row of ``num_classes`` logits per input sequence.
        """
        # The convolution cannot run on a sequence shorter than its kernel, so
        # short (or empty) inputs are extended on the right with the padding id.
        shortfall = self.conv.kernel_size[0] - x.size(1)
        if shortfall > 0:
            filler = x.new_full((x.size(0), shortfall), self.embedding.padding_idx)
            x = torch.cat([x, filler], dim=1)
        x = self.embedding(x)
        # Conv1d expects channels before sequence positions.
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x = self.relu(x)
        # Keep the strongest activation of every filter, then drop the length-1 axis.
        x = self.pool(x).squeeze(2)
        return self.fc(x)

In [27]:
# Seed PyTorch's global random generator before any weights are created so
# that weight initialisation (and later draws from that generator, such as
# DataLoader shuffling) repeat from run to run. 42 matches the split's random_state.
torch.manual_seed(42)

# One embedding row per vocabulary entry and one output logit per label class.
model = CNNWithEmbedding(vocab_size=len(vocab), embed_dim=50, num_classes=len(le.classes_))
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [29]:
# Train for 10 epochs, printing the mean batch loss and the training accuracy
# after each one.
for epoch in range(10):
    print(f"Época {epoch}:\n")
    model.train()
    # Per-epoch accumulators: summed batch losses, correct predictions, samples seen.
    total_loss = 0
    correct = 0
    total = 0

    for xb, yb in train_dl:
        # Reset gradients before computing the ones for this batch.
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Accuracy bookkeeping: the predicted class is the index of the largest output.
        predicted = torch.argmax(pred, dim=1)
        correct += (predicted == yb).sum().item()
        total += yb.size(0)

    # Loss is averaged over batches (not samples); accuracy is a percentage.
    avg_loss = total_loss / len(train_dl)
    accuracy = correct / total * 100

    print(f"Pérdida promedio: {avg_loss:.4f}")
    print(f"Precisión sobre entrenamiento: {accuracy:.2f}%\n")


Época 0:

Pérdida promedio: 0.4428
Precisión sobre entrenamiento: 98.52%

Época 1:

Pérdida promedio: 0.3080
Precisión sobre entrenamiento: 98.52%

Época 2:

Pérdida promedio: 0.8295
Precisión sobre entrenamiento: 97.46%

Época 3:

Pérdida promedio: 0.3650
Precisión sobre entrenamiento: 97.89%

Época 4:

Pérdida promedio: 1.0726
Precisión sobre entrenamiento: 98.31%

Época 5:

Pérdida promedio: 0.3782
Precisión sobre entrenamiento: 98.94%

Época 6:

Pérdida promedio: 0.4170
Precisión sobre entrenamiento: 98.94%

Época 7:

Pérdida promedio: 0.0000
Precisión sobre entrenamiento: 100.00%

Época 8:

Pérdida promedio: 0.0000
Precisión sobre entrenamiento: 100.00%

Época 9:

Pérdida promedio: 0.0000
Precisión sobre entrenamiento: 100.00%



In [30]:
# Run the model over the test loader with gradient tracking disabled and
# gather predicted and true class ids.
model.eval()
all_preds, all_true = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        output = model(xb)
        pred = output.argmax(dim=1)
        all_preds += pred.tolist()
        all_true += yb.tolist()

# Per-class precision / recall / F1, labelled with the original category names.
print(classification_report(y_true=all_true, y_pred=all_preds, target_names=le.classes_))

              precision    recall  f1-score   support

        Fake       0.75      0.86      0.80       103
        True       0.83      0.70      0.76       100

    accuracy                           0.78       203
   macro avg       0.79      0.78      0.78       203
weighted avg       0.79      0.78      0.78       203



In [31]:
def predecir_texto(texto, model, vocab, le):
    """Classify one raw text and return its label.

    The text is lower-cased, stripped of punctuation and split on whitespace,
    then mapped to vocabulary ids. Texts with fewer tokens than the model's
    convolution window (including empty texts) are right-padded so the
    forward pass does not fail.

    Args:
        texto: Raw text to classify.
        model: ``nn.Module`` that maps a ``(1, seq_len)`` long tensor to one
            row of class scores. It is switched to eval mode.
        vocab: Token -> id mapping. Its ``'<UNK>'`` and ``'<PAD>'`` entries
            are used for unknown words and padding (falling back to 1 and 0
            when absent).
        le: Object whose ``classes_`` sequence maps a class index to a label.

    Returns:
        The entry of ``le.classes_`` at the index of the highest score.
    """
    unk_id = vocab.get('<UNK>', 1)
    pad_id = vocab.get('<PAD>', 0)

    # Preprocess and tokenize
    texto = texto.lower()
    tokens = re.sub(r"[^\w\s]", "", texto).split()
    input_ids = [vocab.get(t, unk_id) for t in tokens]

    # A Conv1d layer rejects inputs shorter than its (dilated) kernel, so pad
    # up to the widest window found in the model; at least one token is
    # always supplied.
    min_len = max(
        [m.dilation[0] * (m.kernel_size[0] - 1) + 1
         for m in model.modules() if isinstance(m, nn.Conv1d)],
        default=1,
    )
    input_ids += [pad_id] * (min_len - len(input_ids))  # no-op when already long enough

    # Batch of size 1
    tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)

    # Inference
    model.eval()
    with torch.no_grad():
        salida = model(tensor)
        pred = torch.argmax(salida, dim=1).item()

    # Decode the label
    return le.classes_[pred]


In [34]:
# Classify a sample sentence with the trained model and print the predicted label.
texto_nuevo = "Científicos mexicanos descubren una cura definitiva para la diabetes con extracto de nopal silvestre, y aseguran resultados en menos de 7 días."
print(predecir_texto(texto_nuevo, model, vocab, le))


Fake
