In [1]:
# ==============================================================
# LAB 5 - Minería de Textos y Análisis de Sentimiento
# Script integral con:
#   Parte 1: Snapshot del dataset
#   Parte 2: Preprocesamiento (conservar palabra de hashtag, tratar "911", lematización)
#   Parte 3: Unigramas por clase + WordCloud + Top-10 barras
#   Parte 4: Bigramas y Trigramas por clase
#   Parte 5: Modelo preliminar (TF-IDF 1-2-gramas + Regresión Logística)
# Produce: CSVs, PNGs y prints de métricas en consola
# ==============================================================

# -----------------------
# 0) IMPORTS Y CONFIG
# -----------------------
import os
import re
import html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
from wordcloud import WordCloud

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import bigrams, trigrams
from nltk.stem import WordNetLemmatizer

# SKLEARN (Modelo preliminar)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

# --- Descargas NLTK robustas según la versión instalada ---
import nltk
from nltk.data import find

def ensure_nltk(resource, quiet=True):
    try:
        find(resource)
    except LookupError:
        # map a resource path to its download key
        key = resource.split("/", 1)[-1].split("/")[0]  # e.g., "punkt" or "punkt_tab"
        nltk.download(key, quiet=quiet)

# Tokenizers
ensure_nltk('tokenizers/punkt')
# Algunas versiones nuevas de NLTK requieren también 'punkt_tab'
try:
    ensure_nltk('tokenizers/punkt_tab')
except Exception:
    pass  # en versiones viejas no existe; ignorar

# Stopwords, WordNet y OMW
ensure_nltk('corpora/stopwords')
ensure_nltk('corpora/wordnet')
ensure_nltk('corpora/omw-1.4')


# Asegurar descargas necesarias de NLTK
nltk.download('punkt', quiet=True)
# NOTA: No usar 'punkt_tab'
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Carpeta de salida
OUT_DIR = "outputs_lab5"
os.makedirs(OUT_DIR, exist_ok=True)


# ------------------------------------------------------------
# 1) CARGA DE DATOS + SNAPSHOT (PARTE 1)
#    - Dimensiones, nulos, distribución de target, muestra
# ------------------------------------------------------------
def snapshot_dataset(df: pd.DataFrame, out_dir: str = OUT_DIR) -> None:
    """Imprime y guarda un resumen rápido del dataset."""
    print("\n=== SNAPSHOT DEL DATASET ===")
    print("Dimensiones:", df.shape, "\n")

    nulls = (df.isna().mean() * 100).round(2).sort_values(ascending=False)
    print("Nulos por columna (%):\n", nulls, "\n")

    if 'target' in df.columns:
        print("Distribución de target (conteo):\n", df['target'].value_counts(dropna=False), "\n")
        print("Distribución de target (%):\n", (df['target'].value_counts(normalize=True) * 100).round(2), "\n")
    else:
        print("⚠️ Columna 'target' no encontrada. Se omitirá su distribución.\n")

    sample_cols = [c for c in ['id', 'keyword', 'location', 'text', 'target'] if c in df.columns]
    sample_df = df[sample_cols].sample(min(5, len(df)), random_state=42)
    print("Muestra de filas (5 o menos):\n", sample_df, "\n")

    # Guardar snapshot a disco
    nulls.to_csv(os.path.join(out_dir, "snapshot_nulls.csv"))
    if 'target' in df.columns:
        df['target'].value_counts(dropna=False).to_csv(os.path.join(out_dir, "snapshot_target_counts.csv"))
        (df['target'].value_counts(normalize=True) * 100).round(2).to_csv(
            os.path.join(out_dir, "snapshot_target_perc.csv")
        )
    sample_df.to_csv(os.path.join(out_dir, "snapshot_sample.csv"), index=False)


# ----------------------------------------------------------------
# 2) PREPROCESAMIENTO (PARTE 2)
#    Objetivo:
#     - minúsculas, quitar URL y menciones
#     - conservar la PALABRA del hashtag (remover solo '#')
#     - tratar emojis/símbolos
#     - preservar "911" como token (ej. 'nineoneone') para no perder semántica
#     - remover números sueltos (con excepción del token preservado)
#     - eliminar stopwords y lematizar
#     - ejemplos Antes/Después
# ----------------------------------------------------------------
lemmatizer = WordNetLemmatizer()
STOPWORDS = set(w for w in stopwords.words('english') if w not in {"no","not","never"})


def preprocess_text(text: str) -> str:
    """Limpia y normaliza texto siguiendo las decisiones del avance."""
    if pd.isna(text):
        return ""

    # 1) Unescape y minúsculas
    text = html.unescape(str(text)).lower()

    # 2) Conservar palabra del hashtag: quitar el símbolo '#' pero no la palabra
    text = re.sub(r'#', ' ', text)

    # 3) Quitar URLs y menciones
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)

    # 4) Quitar emojis/símbolos del plano multilingüe
    text = re.sub(r'[\U00010000-\U0010FFFF]', ' ', text)  # emojis extendidos

    # 5) Dejar solo letras/números/espacios (remover punct)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # 6) Preservar "911" como palabra informativa (marcado temporal)
    text = re.sub(r'\b911\b', ' nineoneone ', text)

    # 7) Tokenizar
    tokens = word_tokenize(text)

    # 8) Remover números sueltos (excepto nuestro marcador)
    tokens = [t for t in tokens if not (t.isdigit() and t != 'nineoneone')]

    # 9) Stopwords + Lematización
    clean = []
    for w in tokens:
        if w in STOPWORDS:
            continue
        base = lemmatizer.lemmatize(w)
        clean.append(base)

    return ' '.join(clean)


def demo_preprocess_examples(df: pd.DataFrame, k: int = 5) -> None:
    """Imprime ejemplos Antes/Después para documentar el efecto del preprocesamiento."""
    print("\n=== EJEMPLOS ANTES/DESPUÉS (Preprocesamiento) ===")
    if 'text' not in df.columns:
        print("⚠️ Columna 'text' no encontrada. No se pueden generar ejemplos.")
        return
    examples = df['text'].dropna().sample(min(k, df['text'].dropna().shape[0]), random_state=13).tolist()
    for raw in examples:
        print("\nRAW  :", raw)
        print("CLEAN:", preprocess_text(raw))


# ----------------------------------------------------------------
# 3) UNIGRAMAS POR CLASE + WORDCLOUD + BARRAS TOP-10 (PARTE 3)
# ----------------------------------------------------------------
def get_word_frequencies(text_series: pd.Series) -> Counter:
    all_words = ' '.join(text_series.astype(str)).split()
    return Counter(all_words)

def save_wordcloud_and_barchart(freq: Counter, title: str, out_prefix: str, topn: int = 10) -> None:
    """Genera y guarda WordCloud y gráfico de barras Top-N a disco."""
    # WordCloud
    wc = WordCloud(width=1000, height=500, background_color='white').generate_from_frequencies(freq)
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'WordCloud - {title}')
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, f"{out_prefix}_wordcloud.png"))
    plt.close()

    # Barras Top-N
    top_items = dict(freq.most_common(topn))
    plt.figure(figsize=(8, 4))
    plt.bar(top_items.keys(), top_items.values())
    plt.xticks(rotation=45, ha='right')
    plt.title(f'Top {topn} Words - {title}')
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, f"{out_prefix}_top{topn}_bar.png"))
    plt.close()


# ---------------------------------------------------
# 4) BIGRAMAS / TRIGRAMAS POR CLASE (PARTE 4)
# ---------------------------------------------------
def get_bigram_frequencies(text_series: pd.Series) -> Counter:
    all_bigrams = []
    for text in text_series.astype(str):
        toks = text.split()
        all_bigrams.extend(list(bigrams(toks)))
    return Counter(all_bigrams)

def get_trigram_frequencies(text_series: pd.Series) -> Counter:
    all_trigrams = []
    for text in text_series.astype(str):
        toks = text.split()
        all_trigrams.extend(list(trigrams(toks)))
    return Counter(all_trigrams)


# ------------------------------------------------------------
# 5) MODELO PRELIMINAR (PARTE 5)
#    - Split estratificado
#    - TF-IDF (1-2-gramas)
#    - Regresión Logística
#    - Métricas + Matriz de Confusión + Error Analysis
# ------------------------------------------------------------
def baseline_model(df_clean: pd.DataFrame) -> None:
    if 'cleaned_text' not in df_clean.columns or 'target' not in df_clean.columns:
        print("⚠️ Faltan columnas 'cleaned_text' y/o 'target'. No se entrena baseline.")
        return

    X = df_clean['cleaned_text'].fillna("")
    y = df_clean['target'].astype(int)

    # Split estratificado
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Vectorizador: unigrams + bigrams
    vec = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=20000)
    Xtr = vec.fit_transform(X_train)
    Xva = vec.transform(X_valid)

    # Regresión Logística (solver robusto; aumentar max_iter para convergencia)
    clf = LogisticRegression(max_iter=1000, solver='liblinear')
    clf.fit(Xtr, y_train)
    pred = clf.predict(Xva)

    print("\n=== MÉTRICAS (Validación) ===")
    print(classification_report(y_valid, pred, digits=4))

    # Matriz de confusión
    cm = confusion_matrix(y_valid, pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    fig, ax = plt.subplots(figsize=(4, 4))
    disp.plot(ax=ax, values_format='d')
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "confusion_matrix.png"))
    plt.close()

    # Mini error analysis (hasta 5)
    miss_idx = np.where(pred != y_valid.values)[0][:5]
    print("\n=== ERRORES DE EJEMPLO ===")
    for i in miss_idx:
        print(f"\nTrue={y_valid.values[i]} Pred={pred[i]}")
        print(X_valid.iloc[i])

    vocab = pd.Series(vec.vocabulary_).sort_values()
    vocab.to_csv(os.path.join(OUT_DIR, "tfidf_vocabulary.csv"))


# ==============================================================
# MAIN: ORQUESTA TODO EL FLUJO
# ==============================================================
if __name__ == "__main__":
    # ---- Cargar dataset ----

    df = pd.read_csv("tweets.csv")

    # ---- Parte 1: Snapshot ----
    snapshot_dataset(df, OUT_DIR)

    # ---- Parte 2: Preprocesamiento ----
    print("Aplicando preprocesamiento a la columna 'text' -> 'cleaned_text' ...")
    if 'text' not in df.columns:
        raise ValueError("El CSV debe contener la columna 'text'.")
    demo_preprocess_examples(df, k=5)
    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # Guardar preprocesado
    clean_path = os.path.join(OUT_DIR, "tweets_cleaned.csv")
    df.to_csv(clean_path, index=False)
    print(f"Archivo preprocesado guardado en: {clean_path}")

    # ---- Parte 3: Unigramas por clase ----
    if 'target' in df.columns:
        disaster = df[df['target'] == 1]['cleaned_text']
        non_disaster = df[df['target'] == 0]['cleaned_text']

        # Frecuencias
        disaster_freq = get_word_frequencies(disaster)
        non_disaster_freq = get_word_frequencies(non_disaster)

        # Exportar CSVs
        pd.DataFrame(disaster_freq.items(), columns=['word', 'count']).to_csv(
            os.path.join(OUT_DIR, "unigrams_disaster.csv"), index=False
        )
        pd.DataFrame(non_disaster_freq.items(), columns=['word', 'count']).to_csv(
            os.path.join(OUT_DIR, "unigrams_non_disaster.csv"), index=False
        )

        # Visuales (WordCloud + Barras)
        save_wordcloud_and_barchart(disaster_freq, "Disaster (target=1)", "disaster_unigrams", topn=10)
        save_wordcloud_and_barchart(non_disaster_freq, "Non-Disaster (target=0)", "non_disaster_unigrams", topn=10)

        print("Top 10 palabras disaster:", disaster_freq.most_common(10))
        print("Top 10 palabras non-disaster:", non_disaster_freq.most_common(10))
        print("Palabras en común (unigrams):", set(disaster_freq.keys()) & set(non_disaster_freq.keys()))
    else:
        print("⚠️ Columna 'target' no encontrada. Se omite análisis por clase (Partes 3-5).")

    # ---- Parte 4: Bigramas / Trigramas por clase ----
    if 'target' in df.columns:
        disaster_bigram = get_bigram_frequencies(disaster)
        non_disaster_bigram = get_bigram_frequencies(non_disaster)
        disaster_trigram = get_trigram_frequencies(disaster)
        non_disaster_trigram = get_trigram_frequencies(non_disaster)

        # Exportar CSVs
        pd.DataFrame([(' '.join(k), v) for k, v in disaster_bigram.items()],
                     columns=['bigram', 'count']).to_csv(
            os.path.join(OUT_DIR, "bigrams_disaster.csv"), index=False
        )
        pd.DataFrame([(' '.join(k), v) for k, v in non_disaster_bigram.items()],
                     columns=['bigram', 'count']).to_csv(
            os.path.join(OUT_DIR, "bigrams_non_disaster.csv"), index=False
        )
        pd.DataFrame([(' '.join(k), v) for k, v in disaster_trigram.items()],
                     columns=['trigram', 'count']).to_csv(
            os.path.join(OUT_DIR, "trigrams_disaster.csv"), index=False
        )
        pd.DataFrame([(' '.join(k), v) for k, v in non_disaster_trigram.items()],
                     columns=['trigram', 'count']).to_csv(
            os.path.join(OUT_DIR, "trigrams_non_disaster.csv"), index=False
        )

        print("Top 10 bigramas disaster:", disaster_bigram.most_common(10))
        print("Top 10 bigramas non-disaster:", non_disaster_bigram.most_common(10))
        print("Top 10 trigramas disaster:", disaster_trigram.most_common(10))
        print("Top 10 trigramas non-disaster:", non_disaster_trigram.most_common(10))

    # ---- Parte 5: Modelo preliminar ----
    if 'target' in df.columns:
        baseline_model(df)



=== SNAPSHOT DEL DATASET ===
Dimensiones: (7613, 5) 

Nulos por columna (%):
 location    33.27
keyword      0.80
id           0.00
text         0.00
target       0.00
dtype: float64 

Distribución de target (conteo):
 target
0    4342
1    3271
Name: count, dtype: int64 

Distribución de target (%):
 target
0    57.03
1    42.97
Name: proportion, dtype: float64 

Muestra de filas (5 o menos):
         id      keyword               location  \
2644  3796  destruction                    NaN   
2227  3185       deluge                    NaN   
5448  7769       police                     UK   
132    191   aftershock                    NaN   
6845  9810       trauma  Montgomery County, MD   

                                                   text  target  
2644  So you have a new weapon that can cause un-ima...       1  
2227  The f$&amp;@ing things I do for #GISHWHES Just...       0  
5448  DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...       1  
132   Aftershock back to school kick

In [2]:
# === Punto 6: Modelos de clasificación para "disaster" (1) vs "no disaster" (0) ===
# - Pipelines con TF-IDF (unigramas + bigramas) para capturar contexto corto.
# - Modelos: Multinomial Naive Bayes, Regresión Logística, Linear SVM, Random Forest (baseline).
# - Búsqueda rápida de hiperparámetros (GridSearchCV) y evaluación con métricas.
# - Guarda el mejor modelo para usarlo en el Punto 7 (función clasificar_tweet).

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import joblib

# 1) Tomar el DataFrame existente

df = globals().get('df', None)
if df is None:
    # fallback opcional: si no tienes df en memoria, intenta cargar train.csv desde el cwd
    df = pd.read_csv('train.csv')  # ajusta la ruta si la tienes en otra carpeta

text_col = 'clean_text' if 'clean_text' in df.columns else 'text'
target_col = 'target'

# Asegurar tipos
df[text_col] = df[text_col].astype(str)
y = df[target_col].astype(int)
X = df[text_col]

# 2) Split estratificado (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# 3) Vectorizador base (palabras): unigrams + bigrams para dar algo de "contexto"
base_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.98,
    strip_accents='unicode',
    lowercase=True
)

# 4) Definir modelos (pipelines)
pipelines = {
    "NaiveBayes": Pipeline([
        ("tfidf", base_tfidf),
        ("clf", MultinomialNB())
    ]),
    "LogisticRegression": Pipeline([
        ("tfidf", base_tfidf),
        ("clf", LogisticRegression(max_iter=200, n_jobs=None))
    ]),
    "LinearSVM": Pipeline([
        ("tfidf", base_tfidf),
        ("clf", LinearSVC())
    ]),
    "RandomForest": Pipeline([
        ("tfidf", base_tfidf),
        ("clf", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1))
    ])
}

# 5) Grids pequeños (rápidos) para no tardar demasiado
param_grids = {
    "NaiveBayes": {
        "clf__alpha": [0.5, 1.0]
    },
    "LogisticRegression": {
        "clf__C": [0.5, 1.0, 2.0],
        "clf__penalty": ["l2"],
        "clf__solver": ["liblinear", "lbfgs"]  # lbfgs ignora 'penalty' si no es l2, por eso fijamos l2
    },
    "LinearSVM": {
        "clf__C": [0.5, 1.0, 2.0]
    },
    "RandomForest": {
        "clf__max_depth": [None, 20, 40],
        "clf__min_samples_split": [2, 5]
    }
}

# 6) Entrenar, evaluar y comparar
results = []
best_models = {}

for name, pipe in pipelines.items():
    grid = GridSearchCV(
        pipe,
        param_grids[name],
        scoring="f1",  # F1 balancea precision/recall (dataset algo desbalanceado)
        cv=5,
        n_jobs=-1,
        verbose=0
    )
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", zero_division=0)

    results.append({
        "modelo": name,
        "best_params": grid.best_params_,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    })
    best_models[name] = grid

# 7) Tabla de resultados ordenada por F1
results_df = pd.DataFrame(results).sort_values(by="f1", ascending=False).reset_index(drop=True)
results_df

# 8) Mostrar reporte y matriz de confusión del mejor
best_name = results_df.iloc[0]["modelo"]
print(f"\nMejor modelo por F1: {best_name} | Params: {best_models[best_name].best_params_}\n")

best_clf = best_models[best_name]
y_pred_best = best_clf.predict(X_test)

print("== Classification report ==\n")
print(classification_report(y_test, y_pred_best, digits=4))

print("== Confusion matrix ==")
print(confusion_matrix(y_test, y_pred_best))

# 9) (Opcional) Inspección de características top del mejor modelo si es lineal (LR o SVM)
def top_features_linear(grid, top_k=20):
    try:
        vec = grid.best_estimator_["tfidf"]
        clf = grid.best_estimator_["clf"]
        if hasattr(clf, "coef_"):
            feature_names = np.array(vec.get_feature_names_out())
            coefs = clf.coef_.ravel()
            top_pos_idx = np.argsort(coefs)[-top_k:][::-1]
            top_neg_idx = np.argsort(coefs)[:top_k]
            print("\nTop + (asoc. a clase 1 = desastre):")
            for i in top_pos_idx:
                print(f"{feature_names[i]:<25s} {coefs[i]:.4f}")
            print("\nTop - (asoc. a clase 0 = no desastre):")
            for i in top_neg_idx:
                print(f"{feature_names[i]:<25s} {coefs[i]:.4f}")
        else:
            print("El clasificador no es lineal o no expone coeficientes.")
    except Exception as e:
        print(f"No se pudieron extraer features: {e}")

if best_name in ["LogisticRegression", "LinearSVM"]:
    top_features_linear(best_models[best_name], top_k=20)

# 10) Guardar el mejor modelo para el Punto 7
joblib.dump(best_clf.best_estimator_, "modelo_disaster_best.joblib")
print("\nModelo guardado en: modelo_disaster_best.joblib")



Mejor modelo por F1: LinearSVM | Params: {'clf__C': 0.5}

== Classification report ==

              precision    recall  f1-score   support

           0     0.8206    0.8631    0.8413       869
           1     0.8046    0.7492    0.7759       654

    accuracy                         0.8142      1523
   macro avg     0.8126    0.8061    0.8086      1523
weighted avg     0.8137    0.8142    0.8132      1523

== Confusion matrix ==
[[750 119]
 [164 490]]

Top + (asoc. a clase 1 = desastre):
hiroshima                 2.1612
fires                     1.7468
storm                     1.6168
floods                    1.5633
in                        1.5514
wildfire                  1.5429
train                     1.4685
near                      1.4378
massacre                  1.4345
earthquake                1.4066
tornado                   1.3945
buildings                 1.3363
california                1.3292
police                    1.3017
japan                     1.2729
fire   

In [3]:
# === Punto 7: Función para clasificar un tweet nuevo ===
# - Carga el mejor modelo (Pipeline TF-IDF + clf).
# - Usa tu preprocess_text si ya existe en el notebook; si no, aplica una limpieza equivalente.
# - Devuelve etiqueta (0/1), texto limpio y una "confianza" (score sigmoide del margin SVM).

import re, html, joblib, numpy as np, pandas as pd

# 1) Cargar el modelo entrenado
MODEL_PATH = "modelo_disaster_best.joblib"
modelo = joblib.load(MODEL_PATH)

# 2) Fallback de preprocesamiento
def _default_preprocess(text: str) -> str:
    if pd.isna(text):
        return ""
    t = html.unescape(str(text)).lower()
    # conservar palabra del hashtag
    t = re.sub(r'#', ' ', t)
    # quitar URLs y menciones
    t = re.sub(r'http\S+|www\S+|https\S+', ' ', t)
    t = re.sub(r'@\w+', ' ', t)
    # preservar 911 como token legible
    t = re.sub(r'\b911\b', ' nineoneone ', t)
    # quitar puntuación y números sueltos
    t = re.sub(r'[^\w\s]', ' ', t)  # puntuación
    t = re.sub(r'\d+', ' ', t)      # números
    # espacios
    t = re.sub(r'\s+', ' ', t).strip()
    return t

_preprocess_fn = globals().get('preprocess_text', _default_preprocess)

# 3) Función principal
def clasificar_tweet(tweet: str) -> dict:
    """
    Recibe un tweet crudo y devuelve:
      - pred: 1 = desastre, 0 = no desastre
      - label: 'disaster'/'no disaster'
      - clean: texto preprocesado
      - confidence: score ∈ (0,1) derivado de decision_function (no calibrado)
    """
    clean = _preprocess_fn(tweet)
    pred = int(modelo.predict([clean])[0])

    # "confianza" basada en margin (sigmoide del decision_function).
    confidence = None
    if hasattr(modelo, "decision_function"):
        margin = float(modelo.decision_function([clean])[0])
        confidence = float(1.0 / (1.0 + np.exp(-margin)))  # monotónica, no calibrada
    elif hasattr(modelo, "predict_proba"):
        confidence = float(modelo.predict_proba([clean])[0][1])

    return {
        "tweet": tweet,
        "clean": clean,
        "pred": pred,
        "label": "disaster" if pred == 1 else "no disaster",
        "confidence": None if confidence is None else round(confidence, 4),
    }

# 4) Clasificador por lotes (útil para probar varios)
def clasificar_varios(tweets: list[str]) -> pd.DataFrame:
    cleans = [ _preprocess_fn(t) for t in tweets ]
    preds = modelo.predict(cleans)
    # intentar confianza si se puede
    confs = None
    if hasattr(modelo, "decision_function"):
        margins = modelo.decision_function(cleans)
        confs = 1.0 / (1.0 + np.exp(-np.asarray(margins, dtype=float)))
    elif hasattr(modelo, "predict_proba"):
        confs = modelo.predict_proba(cleans)[:, 1]
    df_out = pd.DataFrame({
        "tweet": tweets,
        "clean": cleans,
        "pred": preds.astype(int),
        "label": np.where(preds==1, "disaster", "no disaster"),
    })
    if confs is not None:
        df_out["confidence"] = np.round(confs, 4)
    return df_out

# 5) Prueba rápida
ejemplos = [
    "Wildfire spreading near the hills, people evacuating now!",
    "This cake is a disaster 😂 but tastes amazing lol",
    "Earthquake reported 20 miles north of the city. Stay safe.",
]
clasificar_varios(ejemplos)


Unnamed: 0,tweet,clean,pred,label,confidence
0,"Wildfire spreading near the hills, people evac...",wildfire spreading near hill people evacuating,1,disaster,0.7693
1,This cake is a disaster 😂 but tastes amazing lol,cake disaster taste amazing lol,0,no disaster,0.2767
2,Earthquake reported 20 miles north of the city...,earthquake reported mile north city stay safe,1,disaster,0.7094


In [4]:
# === Punto 8: Análisis de sentimiento (positivo/negativo/neutral) ===
# - Usa VADER (NLTK). Si no está disponible, cae a TextBlob.
# - PRESERVA emojis y emoticonos (no usamos 'clean_text' aquí), quitamos solo URLs/mentions.
# - Devuelve: sent_neg, sent_neu, sent_pos, sent_compound, sent_label (pos/neu/neg),
#             pos_count, neg_count (conteos de palabras según léxico).

import re, html, numpy as np, pandas as pd

# 0) Asegurar df y columnas
assert 'text' in df.columns, "No encuentro la columna 'text' en df."
if 'target' not in df.columns:
    raise ValueError("No encuentro la columna 'target' en df.")

# 1) Preprocesamiento ligero SOLO para sentimiento (conservar emojis y 'tonos')
def sentiment_prepare(s: str) -> str:
    """Mantiene emojis/emoticonos; quita URLs y @mentions; conserva hashtags como palabra."""
    if pd.isna(s):
        return ""
    t = str(s)
    t = html.unescape(t)
    # quitar URLs y menciones
    t = re.sub(r'http\S+|www\S+|https?\S+', ' ', t)
    t = re.sub(r'@\w+', ' ', t)
    # convertir #palabra -> palabra (conserva la palabra)
    t = t.replace('#', ' ')
    # colapsar espacios
    t = re.sub(r'\s+', ' ', t).strip()
    return t

df['text_for_sent'] = df['text'].astype(str).apply(sentiment_prepare)

# 2) Backend de sentimiento: VADER -> TextBlob (fallback)
backend = None
sia = None
try:
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer
    try:
        _ = SentimentIntensityAnalyzer()
    except LookupError:
        nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()
    backend = "vader"
except Exception as e:
    try:
        from textblob import TextBlob
        backend = "textblob"
    except Exception as e2:
        raise RuntimeError("No se encontró VADER ni TextBlob. Instala al menos uno para continuar.")

print(f"Usando backend de sentimiento: {backend}")

# 3) Funciones de puntaje y conteo de palabras positivas/negativas
def scores_vader(txt: str):
    sc = sia.polarity_scores(txt)  # dict: neg, neu, pos, compound
    return sc['neg'], sc['neu'], sc['pos'], sc['compound']

def counts_vader(txt: str):
    # cuenta tokens cuya valencia en el léxico VADER sea >0 o <0
    tokens = re.findall(r"\w+", txt.lower())
    pos = sum(1 for t in tokens if t in sia.lexicon and sia.lexicon[t] > 0)
    neg = sum(1 for t in tokens if t in sia.lexicon and sia.lexicon[t] < 0)
    return pos, neg

def scores_textblob(txt: str):
    from textblob import TextBlob
    pol = TextBlob(txt).sentiment.polarity  # [-1,1]
    # aproximación a componentes
    pos = max(0.0, pol)
    neg = max(0.0, -pol)
    neu = max(0.0, 1.0 - (pos + neg))
    return neg, neu, pos, float(pol)

def counts_textblob(txt: str):
    from textblob import TextBlob
    tokens = re.findall(r"\w+", txt.lower())
    pos = neg = 0
    for t in tokens:
        pol = TextBlob(t).sentiment.polarity
        if pol > 0:
            pos += 1
        elif pol < 0:
            neg += 1
    return pos, neg

if backend == "vader":
    score_fn = scores_vader
    count_fn = counts_vader
else:
    score_fn = scores_textblob
    count_fn = counts_textblob

# 4) Aplicar a todo el dataset
neg_neu_pos_comp = df['text_for_sent'].apply(score_fn)
df[['sent_neg','sent_neu','sent_pos','sent_compound']] = pd.DataFrame(neg_neu_pos_comp.tolist(), index=df.index)

pos_neg_counts = df['text_for_sent'].apply(count_fn)
df[['pos_count','neg_count']] = pd.DataFrame(pos_neg_counts.tolist(), index=df.index)

# 5) Etiqueta de sentimiento por umbrales estándar de VADER (funcionan también con fallback)
def label_from_compound(c):
    if c >= 0.05:
        return "positive"
    elif c <= -0.05:
        return "negative"
    else:
        return "neutral"

df['sent_label'] = df['sent_compound'].apply(label_from_compound)

# 6) Resumen rápido (para tu informe)
print(df[['sent_neg','sent_neu','sent_pos','sent_compound','pos_count','neg_count','sent_label']].head())

print("\nDistribución de sentimiento (tweets):")
print(df['sent_label'].value_counts(dropna=False).to_frame('count'))

print("\nCrosstab sentimiento vs. target (0=no desastre, 1=desastre):")
print(pd.crosstab(df['sent_label'], df['target']))


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Usando backend de sentimiento: vader
   sent_neg  sent_neu  sent_pos  sent_compound  pos_count  neg_count  \
0     0.000     0.851     0.149         0.2732          1          0   
1     0.286     0.714     0.000        -0.3400          0          1   
2     0.095     0.905     0.000        -0.2960          0          1   
3     0.000     1.000     0.000         0.0000          0          0   
4     0.000     1.000     0.000         0.0000          0          0   

  sent_label  
0   positive  
1   negative  
2   negative  
3    neutral  
4    neutral  

Distribución de sentimiento (tweets):
            count
sent_label       
negative     3735
neutral      1945
positive     1933

Crosstab sentimiento vs. target (0=no desastre, 1=desastre):
target         0     1
sent_label            
negative    1854  1881
neutral     1092   853
positive    1396   537


In [7]:
# === Punto 9: Top 10 más negativos/positivos + comparación por categoría ===
import numpy as np
import pandas as pd

# 0) Asegurar columnas necesarias
req_cols = {'id','text','target','sent_compound','sent_label'}
faltan = req_cols - set(df.columns)
if faltan:
    raise ValueError(f"Faltan columnas en df para el punto 9: {faltan}")

# 1) Etiqueta humana de categoría
df['target_label'] = np.where(df['target']==1, 'desastre', 'no desastre')

# 2) Top 10 más negativos / más positivos (según sent_compound)
cols_show = ['id','target','target_label','sent_label','sent_compound','text']

top10_neg = df.sort_values('sent_compound', ascending=True).head(10)[cols_show]
top10_pos = df.sort_values('sent_compound', ascending=False).head(10)[cols_show]

print("=== TOP 10 TWEETS MÁS NEGATIVOS ===")
display(top10_neg.reset_index(drop=True))

print("\n=== TOP 10 TWEETS MÁS POSITIVOS ===")
display(top10_pos.reset_index(drop=True))

# 3) ¿Son los de desastre más negativos?
#    Tasas por categoría
totals = df.groupby('target').size().rename('total')
negatives = df[df['sent_label']=='negative'].groupby('target').size().rename('negativos')
positives = df[df['sent_label']=='positive'].groupby('target').size().rename('positivos')
neutrals  = df[df['sent_label']=='neutral'].groupby('target').size().rename('neutrales')

summary = pd.concat([totals, negatives, positives, neutrals], axis=1).fillna(0).astype(int)
summary['p_neg'] = summary['negativos'] / summary['total']
summary['p_pos'] = summary['positivos'] / summary['total']
summary['p_neu'] = summary['neutrales'] / summary['total']

print("\n=== Resumen por categoría (0=no desastre, 1=desastre) ===")
display(summary)

p_neg_0 = summary.loc[0,'p_neg']
p_neg_1 = summary.loc[1,'p_neg']
print(f"\nTasa de negativos en NO DESASTRE (0): {p_neg_0:.3f}")
print(f"Tasa de negativos en DESASTRE (1):   {p_neg_1:.3f}")
print(f"Diferencia (1 - 0): {p_neg_1 - p_neg_0:+.3f}")

# 4) (Opcional) Significancia estadística: Chi-cuadrado y Cramér's V
try:
    from scipy.stats import chi2_contingency
    tab = pd.crosstab(df['target'], df['sent_label'])  # filas: target, columnas: sentiment
    chi2, p, dof, exp = chi2_contingency(tab.values)
    n = tab.values.sum()
    k = min(tab.shape) - 1  # min(r-1,c-1)
    cramer_v = np.sqrt(chi2 / (n * k)) if k > 0 else np.nan
    print("\n=== Chi-cuadrado sentimiento x categoría ===")
    print(tab)
    print(f"chi2={chi2:.3f}, dof={dof}, p-value={p:.3e}, Cramér's V={cramer_v:.3f}")
except Exception as e:
    print("\n(No se pudo calcular chi-cuadrado/Cramér V):", e)


=== TOP 10 TWEETS MÁS NEGATIVOS ===


Unnamed: 0,id,target,target_label,sent_label,sent_compound,text
0,10689,0,no desastre,negative,-0.9883,wreck? wreck wreck wreck wreck wreck wreck wre...
1,9172,1,desastre,negative,-0.9686,@Abu_Baraa1 Suicide bomber targets Saudi mosqu...
2,9166,1,desastre,negative,-0.9623,Suicide bomber kills 15 in Saudi security site...
3,9137,1,desastre,negative,-0.9595,? 19th Day Since 17-Jul-2015 -- Nigeria: Suici...
4,9159,1,desastre,negative,-0.9552,17 killed in SÛªArabia mosque suicide bombing...
5,4213,0,no desastre,negative,-0.9549,at the lake \n*sees a dead fish*\nme: poor lit...
6,682,1,desastre,negative,-0.9538,illegal alien released by Obama/DHS 4 times Ch...
7,2225,1,desastre,negative,-0.9524,Bomb Crash Loot Riot Emergency Pipe Bomb Nucle...
8,9765,1,desastre,negative,-0.95,Bomb head? Explosive decisions dat produced mo...
9,9940,1,desastre,negative,-0.9493,@cspan #Prez. Mr. President you are the bigges...



=== TOP 10 TWEETS MÁS POSITIVOS ===


Unnamed: 0,id,target,target_label,sent_label,sent_compound,text
0,10028,0,no desastre,positive,0.973,Check out 'Want Twister Tickets AND A VIP EXPE...
1,9345,0,no desastre,positive,0.9564,@thoutaylorbrown I feel like accidents are jus...
2,8989,1,desastre,positive,0.9471,TodayÛªs storm will pass; let tomorrowÛªs li...
3,4541,0,no desastre,positive,0.9423,@batfanuk we enjoyed the show today. Great fun...
4,4844,0,no desastre,positive,0.9423,@batfanuk we enjoyed the show today. Great fun...
5,8994,0,no desastre,positive,0.9376,Free Ebay Sniping RT? http://t.co/B231Ul1O1K L...
6,3525,1,desastre,positive,0.9356,@Raishimi33 :) well I think that sounds like a...
7,1453,0,no desastre,positive,0.9345,I'm not a Drake fan but I enjoy seeing him bod...
8,9386,0,no desastre,positive,0.9344,@duchovbutt @Starbuck_Scully @MadMakNY @davidd...
9,8759,0,no desastre,positive,0.93,Super sweet and beautiful :) https://t.co/TUi9...



=== Resumen por categoría (0=no desastre, 1=desastre) ===


Unnamed: 0_level_0,total,negativos,positivos,neutrales,p_neg,p_pos,p_neu
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,4342,1854,1396,1092,0.426992,0.321511,0.251497
1,3271,1881,537,853,0.575054,0.16417,0.260777



Tasa de negativos en NO DESASTRE (0): 0.427
Tasa de negativos en DESASTRE (1):   0.575
Diferencia (1 - 0): +0.148

=== Chi-cuadrado sentimiento x categoría ===
sent_label  negative  neutral  positive
target                                 
0               1854     1092      1396
1               1881      853       537
chi2=265.885, dof=2, p-value=1.836e-58, Cramér's V=0.187


In [6]:
# === Punto 10: Variable "negatividad" + reentrenar y comparar ===
# Definición de negatividad: usamos sent_neg (proporción negativa de VADER), rango [0,1].
# Comparamos SVM (texto) vs SVM (texto + negatividad) con el MISMO split (X_train/X_test).

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

# --- 0) Chequeos y preparación
assert 'sent_neg' in df.columns, "Ejecuta primero la Parte 8 (VADER) para tener 'sent_neg'."

# Variable de negatividad (0..1)
df['negatividad'] = df['sent_neg'].fillna(0.0)

text_col   = 'clean_text' if 'clean_text' in df.columns else 'text'
target_col = 'target'

# Reusar el MISMO split que ya hiciste en el punto 6
if not all(k in globals() for k in ['X_train','X_test','y_train','y_test']):
    raise RuntimeError("No encuentro X_train/X_test del punto 6. Ejecuta primero el bloque del punto 6.")

idx_tr = X_train.index
idx_te = X_test.index

Xtr_text = df.loc[idx_tr, text_col].astype(str)
Xte_text = df.loc[idx_te, text_col].astype(str)
ytr = y_train.astype(int)
yte = y_test.astype(int)

# --- 1) Vectorizador texto (mismos hiperparámetros del punto 6)
vec = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=2,
    max_df=0.98,
    strip_accents='unicode',
    lowercase=True
)
Xtr_tfidf = vec.fit_transform(Xtr_text)
Xte_tfidf = vec.transform(Xte_text)

# --- 2) BASELINE: SVM con texto solamente
svm_base = LinearSVC()
grid_base = GridSearchCV(
    svm_base,
    {'C': [0.5, 1.0, 2.0]},
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=0
)
grid_base.fit(Xtr_tfidf, ytr)
yhat_base = grid_base.predict(Xte_tfidf)

def metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0
    )
    return acc, prec, rec, f1

m_base = metrics(yte, yhat_base)

# --- 3) EXTENDIDO: concatenar negatividad como feature numérica (sparse) al TF-IDF
Xtr_neg = csr_matrix(df.loc[idx_tr, 'negatividad'].values.reshape(-1,1))
Xte_neg = csr_matrix(df.loc[idx_te, 'negatividad'].values.reshape(-1,1))

Xtr_aug = hstack([Xtr_tfidf, Xtr_neg], format='csr')
Xte_aug = hstack([Xte_tfidf, Xte_neg], format='csr')

svm_aug = LinearSVC()
grid_aug = GridSearchCV(
    svm_aug,
    {'C': [0.5, 1.0, 2.0]},
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=0
)
grid_aug.fit(Xtr_aug, ytr)
yhat_aug = grid_aug.predict(Xte_aug)

m_aug = metrics(yte, yhat_aug)

# --- 4) Resumen comparativo
comp = pd.DataFrame(
    {
        "modelo": ["SVM texto", "SVM texto+negatividad"],
        "C_best": [grid_base.best_params_['C'], grid_aug.best_params_['C']],
        "accuracy": [m_base[0], m_aug[0]],
        "precision": [m_base[1], m_aug[1]],
        "recall": [m_base[2], m_aug[2]],
        "f1": [m_base[3], m_aug[3]],
    }
).round(4)
comp["Δ_accuracy"] = (comp.loc[1,"accuracy"] - comp.loc[0,"accuracy"]).round(4)
comp["Δ_recall"]   = (comp.loc[1,"recall"]   - comp.loc[0,"recall"]).round(4)
comp["Δ_f1"]       = (comp.loc[1,"f1"]       - comp.loc[0,"f1"]).round(4)
comp

print("\n== Reporte del modelo EXTENDIDO (texto+negatividad) ==")
print(f"Mejor C: {grid_aug.best_params_['C']}")
print(classification_report(yte, yhat_aug, digits=4))
print("Matriz de confusión (extendido):")
print(confusion_matrix(yte, yhat_aug))

# --- 5) Guardar el modelo extendido para usarlo si lo quieres en una versión v2 de clasificador
import joblib
artifacts = {
    "vectorizer": vec,
    "svm": grid_aug.best_estimator_,
    "C": grid_aug.best_params_['C'],
    "note": "Este modelo espera TF-IDF de texto + una columna extra con negatividad (sent_neg)."
}
joblib.dump(artifacts, "modelo_disaster_withneg.joblib")
print("\nModelo extendido guardado en: modelo_disaster_withneg.joblib")



== Reporte del modelo EXTENDIDO (texto+negatividad) ==
Mejor C: 0.5
              precision    recall  f1-score   support

           0     0.8187    0.8677    0.8425       869
           1     0.8090    0.7446    0.7755       654

    accuracy                         0.8148      1523
   macro avg     0.8138    0.8062    0.8090      1523
weighted avg     0.8145    0.8148    0.8137      1523

Matriz de confusión (extendido):
[[754 115]
 [167 487]]

Modelo extendido guardado en: modelo_disaster_withneg.joblib
