In [1]:
# Fake News Classification Pipeline
# =================================
# Este script implementa un pipeline de clasificación de titulares de noticias
# para distinguir entre noticias reales (1) y falsas (0).
# Incluye preprocesamiento, vectorización con TF-IDF y embeddings (si está disponible), comparativa de modelos,
# y opción de análisis de sentimiento. Además, se proporciona un boceto básico para una app en Streamlit.

import pandas as pd
import numpy as np
import sys
import subprocess
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# ---------- Preprocesamiento ----------
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Limpia y normaliza texto: minusculas, elimina HTML, URLs y puntuación,
    tokeniza, elimina stopwords y lematiza.
    """
    def __init__(self, language='english'):
        self.stopwords = set(stopwords.words(language))
        self.lemma = WordNetLemmatizer()

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'<.*?>', ' ', text)                     # quita HTML
        text = re.sub(r'http\S+|www\S+', ' ', text)          # quita URLs
        text = re.sub(r'[^a-z\s]', ' ', text)                 # quita puntuación y números
        tokens = text.split()
        tokens = [self.lemma.lemmatize(tok) for tok in tokens if tok not in self.stopwords]
        return ' '.join(tokens)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.apply(self.clean_text)

In [4]:
# ---------- Carga de datos ----------
train = pd.read_csv("training_data.csv", sep="\t")
test = pd.read_csv("testing_data.csv", sep="\t")

In [5]:
train.columns = ['label', 'title']
test.columns = ['label', 'title']

train['label'] = train['label'].map({'FAKE': 0, 'REAL': 1})
test['label'] = test['label'].map({'FAKE': 0, 'REAL': 1})

print("Columnas del dataset de entrenamiento:", train.columns)

Columnas del dataset de entrenamiento: Index(['label', 'title'], dtype='object')


In [6]:
# Asumimos columnas en train: 'title', 'label'; test: 'title' y marcador en 'label' = 2
X = train['title']
y = train['label']

In [7]:
# Verificar y corregir la columna 'label' en el DataFrame de entrenamiento
if train['label'].isna().all():
    # Intentar mapear los valores de la columna 'label' en base a los datos originales
    train['label'] = train['title'].apply(lambda x: 0 if 'FAKE' in x.upper() else (1 if 'REAL' in x.upper() else np.nan))

# Actualizar X y y después de corregir la columna 'label'
X = train['title']
y = train['label']

# Eliminar filas con valores NaN en y
X_clean = X[~y.isna()]
y_clean = y[~y.isna()]

# Verificar que X_clean y y_clean no estén vacíos
if X_clean.empty or y_clean.empty:
    raise ValueError("Los datos de entrada están vacíos después de eliminar valores NaN. Verifique los datos de entrada.")



In [8]:
# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42, stratify=y_clean)

In [9]:
# ---------- Definición de pipelines ----------

# Pipeline con TF-IDF + Logistic Regression
tfidf_pipeline = Pipeline([
    ('preprocess', TextPreprocessor()),
    ('tfidf', TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
    ('clf', LogisticRegression(solver='liblinear', random_state=42))
])

In [13]:
from sentence_transformers import SentenceTransformer


In [14]:
modelo = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
# Pipeline con embeddings preentrenados + Random Forest

from sentence_transformers import SentenceTransformer
class EmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
    def fit(self, X, y=None): return self
    def transform(self, X):
        return np.array(self.model.encode(X.tolist(), show_progress_bar=True))

embedding_pipeline = Pipeline([
    ('preprocess', TextPreprocessor()),
    ('embed', EmbeddingTransformer()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [16]:
# ---------- Entrenamiento y evaluación ----------

def evaluate_model(pipeline, X_tr, X_te, y_tr, y_te):
    pipeline.fit(X_tr, y_tr)
    preds = pipeline.predict(X_te)
    proba = pipeline.predict_proba(X_te)[:,1] if hasattr(pipeline, 'predict_proba') else None
    print("Classification Report:\n", classification_report(y_te, preds))
    print("Confusion Matrix:\n", confusion_matrix(y_te, preds))
    if proba is not None:
        print(f"ROC AUC: {roc_auc_score(y_te, proba):.3f}")

print("\n--- Evaluando TF-IDF + LogisticRegression ---")
evaluate_model(tfidf_pipeline, X_train, X_test, y_train, y_test)

print("\n--- Evaluando Embeddings + RandomForest ---")
evaluate_model(embedding_pipeline, X_train, X_test, y_train, y_test)


--- Evaluando TF-IDF + LogisticRegression ---
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.94      0.97        52
         1.0       0.96      1.00      0.98        77

    accuracy                           0.98       129
   macro avg       0.98      0.97      0.98       129
weighted avg       0.98      0.98      0.98       129

Confusion Matrix:
 [[49  3]
 [ 0 77]]
ROC AUC: 0.997

--- Evaluando Embeddings + RandomForest ---


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.94      0.96        52
         1.0       0.96      0.99      0.97        77

    accuracy                           0.97       129
   macro avg       0.97      0.96      0.97       129
weighted avg       0.97      0.97      0.97       129

Confusion Matrix:
 [[49  3]
 [ 1 76]]
ROC AUC: 0.997


In [17]:
# ---------- Análisis de Sentimientos (opcional) ----------
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

def sentiment_scores(texts):
    sia = SentimentIntensityAnalyzer()
    return [sia.polarity_scores(t) for t in texts]

# Ejemplo:
print(sentiment_scores(X_test[:5]))

[{'neg': 0.25, 'neu': 0.75, 'pos': 0.0, 'compound': -0.4601}, {'neg': 0.0, 'neu': 0.792, 'pos': 0.208, 'compound': 0.2732}, {'neg': 0.258, 'neu': 0.517, 'pos': 0.225, 'compound': -0.2244}, {'neg': 0.287, 'neu': 0.612, 'pos': 0.101, 'compound': -0.6444}, {'neg': 0.154, 'neu': 0.846, 'pos': 0.0, 'compound': -0.2942}]


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\alber\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
