In [None]:
# ==============================================================
# LAB 5 - Minería de Textos y Análisis de Sentimiento
# Script integral con:
#   Parte 1: Snapshot del dataset
#   Parte 2: Preprocesamiento (conservar palabra de hashtag, tratar "911", lematización)
#   Parte 3: Unigramas por clase + WordCloud + Top-10 barras
#   Parte 4: Bigramas y Trigramas por clase
#   Parte 5: Modelo preliminar (TF-IDF 1-2-gramas + Regresión Logística)
# Produce: CSVs, PNGs y prints de métricas en consola
# ==============================================================

# -----------------------
# 0) IMPORTS Y CONFIG
# -----------------------
import os
import re
import html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter
from wordcloud import WordCloud

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import bigrams, trigrams
from nltk.stem import WordNetLemmatizer

# SKLEARN (Modelo preliminar)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

# --- Descargas NLTK robustas según la versión instalada ---
import nltk
from nltk.data import find

def ensure_nltk(resource, quiet=True):
    """Download an NLTK resource only when it cannot be located locally.

    Args:
        resource: NLTK resource path, e.g. ``"tokenizers/punkt"``.
        quiet: Forwarded to ``nltk.download``.
    """
    try:
        find(resource)
    except LookupError:
        # The download key is the path component right after the category:
        # "tokenizers/punkt" -> "punkt", "corpora/omw-1.4" -> "omw-1.4".
        head, sep, tail = resource.partition("/")
        package = (tail if sep else head).partition("/")[0]
        nltk.download(package, quiet=quiet)

# Tokenizers
ensure_nltk('tokenizers/punkt')
# Some newer NLTK versions also require 'punkt_tab'.
try:
    ensure_nltk('tokenizers/punkt_tab')
except Exception:
    pass  # best effort: the resource does not exist for old versions

# Stopwords, WordNet and OMW
ensure_nltk('corpora/stopwords')
ensure_nltk('corpora/wordnet')
ensure_nltk('corpora/omw-1.4')

# NOTE: the resources above are fetched by ensure_nltk() only when they are
# missing, so no unconditional nltk.download() calls are repeated here.

# Output folder for every CSV/PNG produced by the script
OUT_DIR = "outputs_lab5"
os.makedirs(OUT_DIR, exist_ok=True)


# ------------------------------------------------------------
# 1) CARGA DE DATOS + SNAPSHOT (PARTE 1)
#    - Dimensiones, nulos, distribución de target, muestra
# ------------------------------------------------------------
def snapshot_dataset(df: pd.DataFrame, out_dir: str = OUT_DIR) -> None:
    """Print a quick summary of the dataset and save it to CSV files.

    The summary covers the shape, the percentage of nulls per column, the
    distribution of the ``target`` column (when present) and a random sample
    of up to five rows.

    Args:
        df: Dataset to summarise.
        out_dir: Folder that receives the ``snapshot_*.csv`` files. It is
            created if it does not exist yet.
    """
    print("\n=== SNAPSHOT DEL DATASET ===")
    print("Dimensiones:", df.shape, "\n")

    nulls = (df.isna().mean() * 100).round(2).sort_values(ascending=False)
    print("Nulos por columna (%):\n", nulls, "\n")

    # Compute each distribution once and reuse it for printing and saving.
    target_counts = None
    target_perc = None
    if 'target' in df.columns:
        target_counts = df['target'].value_counts(dropna=False)
        target_perc = (df['target'].value_counts(normalize=True) * 100).round(2)
        print("Distribución de target (conteo):\n", target_counts, "\n")
        print("Distribución de target (%):\n", target_perc, "\n")
    else:
        print("⚠️ Columna 'target' no encontrada. Se omitirá su distribución.\n")

    sample_cols = [c for c in ['id', 'keyword', 'location', 'text', 'target'] if c in df.columns]
    sample_df = df[sample_cols].sample(min(5, len(df)), random_state=42)
    print("Muestra de filas (5 o menos):\n", sample_df, "\n")

    # Save the snapshot to disk; make sure a caller-supplied folder exists.
    os.makedirs(out_dir, exist_ok=True)
    nulls.to_csv(os.path.join(out_dir, "snapshot_nulls.csv"))
    if target_counts is not None:
        target_counts.to_csv(os.path.join(out_dir, "snapshot_target_counts.csv"))
        target_perc.to_csv(os.path.join(out_dir, "snapshot_target_perc.csv"))
    sample_df.to_csv(os.path.join(out_dir, "snapshot_sample.csv"), index=False)


# ----------------------------------------------------------------
# 2) PREPROCESAMIENTO (PARTE 2)
#    Objetivo:
#     - minúsculas, quitar URL y menciones
#     - conservar la PALABRA del hashtag (remover solo '#')
#     - tratar emojis/símbolos
#     - preservar "911" como token (ej. 'nineoneone') para no perder semántica
#     - remover números sueltos (con excepción del token preservado)
#     - eliminar stopwords y lematizar
#     - ejemplos Antes/Después
# ----------------------------------------------------------------
lemmatizer = WordNetLemmatizer()
# English stopwords minus the negations, so "no", "not" and "never" are kept as tokens.
STOPWORDS = set(stopwords.words('english')) - {"no", "not", "never"}


def preprocess_text(text: str) -> str:
    """Clean and normalise one raw text into a space-joined string of lemmas.

    Steps: HTML-unescape and lowercase, drop URLs and @mentions, keep the
    word of each hashtag (only '#' is removed), strip everything that is not
    a lowercase ASCII letter, digit or whitespace, replace a standalone "911"
    with the token 'nineoneone', tokenize, drop purely numeric tokens, drop
    stopwords and lemmatize.

    Args:
        text: Raw text; a missing value (NaN/None) is accepted.

    Returns:
        The cleaned text, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""

    # 1) Unescape HTML entities and lowercase
    text = html.unescape(str(text)).lower()

    # 2) Remove URLs and mentions first, while '#' is still attached, so a
    #    URL fragment ("http://x/#frag") does not leak "frag" as a token.
    #    'www' must start a word and be followed by a dot, otherwise words
    #    such as "awwww" would be truncated.
    text = re.sub(r'http\S+|\bwww\.\S+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)

    # 3) Keep the hashtag word: drop only the '#' symbol
    text = text.replace('#', ' ')

    # 4) Keep only letters/digits/whitespace. This removes punctuation and
    #    also every emoji/symbol, since none of them is in the allowed set.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # 5) Preserve "911" as an informative word so it survives digit removal
    text = re.sub(r'\b911\b', ' nineoneone ', text)

    # 6) Tokenize
    tokens = word_tokenize(text)

    # 7) Drop standalone numbers and stopwords, then lemmatize.
    #    ('nineoneone' is alphabetic, so the digit filter never touches it.)
    clean = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if not tok.isdigit() and tok not in STOPWORDS
    ]

    return ' '.join(clean)


def demo_preprocess_examples(df: pd.DataFrame, k: int = 5) -> None:
    """Print raw/cleaned pairs for a random sample of the 'text' column.

    Args:
        df: Dataset; a warning is printed when it has no 'text' column.
        k: Maximum number of non-null texts to show.
    """
    print("\n=== EJEMPLOS ANTES/DESPUÉS (Preprocesamiento) ===")
    if 'text' in df.columns:
        texts = df['text'].dropna()
        n_examples = min(k, texts.shape[0])
        # Fixed seed so the same examples are shown on every run.
        for raw in texts.sample(n_examples, random_state=13).tolist():
            print("\nRAW  :", raw)
            print("CLEAN:", preprocess_text(raw))
    else:
        print("⚠️ Columna 'text' no encontrada. No se pueden generar ejemplos.")


# ----------------------------------------------------------------
# 3) UNIGRAMAS POR CLASE + WORDCLOUD + BARRAS TOP-10 (PARTE 3)
# ----------------------------------------------------------------
def get_word_frequencies(text_series: pd.Series) -> Counter:
    """Count whitespace-separated tokens across every entry of the series."""
    counts = Counter()
    for doc in text_series.astype(str):
        counts.update(doc.split())
    return counts

def save_wordcloud_and_barchart(freq: Counter, title: str, out_prefix: str, topn: int = 10) -> None:
    """Render a WordCloud and a Top-N bar chart and save both as PNG files.

    The files are written to ``OUT_DIR`` as ``{out_prefix}_wordcloud.png``
    and ``{out_prefix}_top{topn}_bar.png``.

    Args:
        freq: Token frequencies to plot.
        title: Text appended to both figure titles.
        out_prefix: File-name prefix for the two PNG files.
        topn: Number of most common tokens shown in the bar chart.
    """
    # WordCloud.generate_from_frequencies raises ValueError when it receives
    # no words, so skip both figures instead of aborting the whole run.
    if not freq:
        print(f"⚠️ Sin frecuencias para '{title}'. Se omiten WordCloud y gráfico de barras.")
        return

    # WordCloud
    wc = WordCloud(width=1000, height=500, background_color='white').generate_from_frequencies(freq)
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'WordCloud - {title}')
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, f"{out_prefix}_wordcloud.png"))
    plt.close()

    # Top-N bars; pass plain lists to matplotlib rather than dict views
    top_items = freq.most_common(topn)
    labels = [word for word, _ in top_items]
    counts = [count for _, count in top_items]
    plt.figure(figsize=(8, 4))
    plt.bar(labels, counts)
    plt.xticks(rotation=45, ha='right')
    plt.title(f'Top {topn} Words - {title}')
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, f"{out_prefix}_top{topn}_bar.png"))
    plt.close()


# ---------------------------------------------------
# 4) BIGRAMAS / TRIGRAMAS POR CLASE (PARTE 4)
# ---------------------------------------------------
def get_bigram_frequencies(text_series: pd.Series) -> Counter:
    """Count adjacent token pairs, computed separately within each entry."""
    counts = Counter()
    for doc in text_series.astype(str):
        # Pairs never span two entries because each one is split on its own.
        counts.update(bigrams(doc.split()))
    return counts

def get_trigram_frequencies(text_series: pd.Series) -> Counter:
    """Count adjacent token triples, computed separately within each entry."""
    counts = Counter()
    for doc in text_series.astype(str):
        # Triples never span two entries because each one is split on its own.
        counts.update(trigrams(doc.split()))
    return counts


# ------------------------------------------------------------
# 5) MODELO PRELIMINAR (PARTE 5)
#    - Split estratificado
#    - TF-IDF (1-2-gramas)
#    - Regresión Logística
#    - Métricas + Matriz de Confusión + Error Analysis
# ------------------------------------------------------------
def baseline_model(df_clean: pd.DataFrame) -> None:
    """Train and evaluate a TF-IDF (1-2 grams) + logistic regression baseline.

    Prints a classification report and up to five misclassified validation
    texts, and writes ``confusion_matrix.png`` and ``tfidf_vocabulary.csv``
    to ``OUT_DIR``.

    Args:
        df_clean: Dataset with the columns 'cleaned_text' and 'target'. When
            either is missing a warning is printed and nothing is trained.
    """
    required = ('cleaned_text', 'target')
    if any(col not in df_clean.columns for col in required):
        print("⚠️ Faltan columnas 'cleaned_text' y/o 'target'. No se entrena baseline.")
        return

    texts = df_clean['cleaned_text'].fillna("")
    labels = df_clean['target'].astype(int)

    # Stratified 80/20 split with a fixed seed
    X_train, X_valid, y_train, y_valid = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Unigram + bigram TF-IDF features, fitted on the training split only
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=20000)
    train_matrix = vectorizer.fit_transform(X_train)
    valid_matrix = vectorizer.transform(X_valid)

    # Logistic regression classifier
    model = LogisticRegression(max_iter=1000, solver='liblinear')
    model.fit(train_matrix, y_train)
    predictions = model.predict(valid_matrix)

    print("\n=== MÉTRICAS (Validación) ===")
    print(classification_report(y_valid, predictions, digits=4))

    # Confusion matrix figure
    matrix = confusion_matrix(y_valid, predictions)
    fig, ax = plt.subplots(figsize=(4, 4))
    ConfusionMatrixDisplay(
        confusion_matrix=matrix, display_labels=model.classes_
    ).plot(ax=ax, values_format='d')
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "confusion_matrix.png"))
    plt.close()

    # Mini error analysis: first five validation positions predicted wrongly
    y_true = y_valid.values
    wrong_positions = np.flatnonzero(predictions != y_true)[:5]
    print("\n=== ERRORES DE EJEMPLO ===")
    for pos in wrong_positions:
        print(f"\nTrue={y_true[pos]} Pred={predictions[pos]}")
        print(X_valid.iloc[pos])

    # Export the fitted vocabulary (term -> column index), sorted by index
    pd.Series(vectorizer.vocabulary_).sort_values().to_csv(
        os.path.join(OUT_DIR, "tfidf_vocabulary.csv")
    )


# ==============================================================
# MAIN: ORQUESTA TODO EL FLUJO
# ==============================================================
if __name__ == "__main__":
    # ---- Load the dataset ----
    df = pd.read_csv("tweets.csv")

    # ---- Part 1: snapshot ----
    snapshot_dataset(df, OUT_DIR)

    # ---- Part 2: preprocessing ----
    print("Aplicando preprocesamiento a la columna 'text' -> 'cleaned_text' ...")
    if 'text' not in df.columns:
        raise ValueError("El CSV debe contener la columna 'text'.")
    demo_preprocess_examples(df, k=5)
    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # Save the preprocessed dataset
    clean_path = os.path.join(OUT_DIR, "tweets_cleaned.csv")
    df.to_csv(clean_path, index=False)
    print(f"Archivo preprocesado guardado en: {clean_path}")

    # Parts 3-5 all need the 'target' column, so they share a single guard.
    if 'target' not in df.columns:
        print("⚠️ Columna 'target' no encontrada. Se omite análisis por clase (Partes 3-5).")
    else:
        # ---- Part 3: unigrams per class ----
        disaster = df.loc[df['target'] == 1, 'cleaned_text']
        non_disaster = df.loc[df['target'] == 0, 'cleaned_text']

        disaster_freq = get_word_frequencies(disaster)
        non_disaster_freq = get_word_frequencies(non_disaster)

        # Export one CSV per class
        _unigram_exports = (
            ("unigrams_disaster.csv", disaster_freq),
            ("unigrams_non_disaster.csv", non_disaster_freq),
        )
        for _fname, _counts in _unigram_exports:
            pd.DataFrame(_counts.items(), columns=['word', 'count']).to_csv(
                os.path.join(OUT_DIR, _fname), index=False
            )

        # Figures (WordCloud + bars)
        save_wordcloud_and_barchart(disaster_freq, "Disaster (target=1)", "disaster_unigrams", topn=10)
        save_wordcloud_and_barchart(non_disaster_freq, "Non-Disaster (target=0)", "non_disaster_unigrams", topn=10)

        print("Top 10 palabras disaster:", disaster_freq.most_common(10))
        print("Top 10 palabras non-disaster:", non_disaster_freq.most_common(10))
        print("Palabras en común (unigrams):", set(disaster_freq) & set(non_disaster_freq))

        # ---- Part 4: bigrams / trigrams per class ----
        disaster_bigram = get_bigram_frequencies(disaster)
        non_disaster_bigram = get_bigram_frequencies(non_disaster)
        disaster_trigram = get_trigram_frequencies(disaster)
        non_disaster_trigram = get_trigram_frequencies(non_disaster)

        # Export CSVs; each n-gram tuple is joined into one space-separated string
        _ngram_exports = (
            ('bigram', "bigrams_disaster.csv", disaster_bigram),
            ('bigram', "bigrams_non_disaster.csv", non_disaster_bigram),
            ('trigram', "trigrams_disaster.csv", disaster_trigram),
            ('trigram', "trigrams_non_disaster.csv", non_disaster_trigram),
        )
        for _label, _fname, _counts in _ngram_exports:
            _rows = [(' '.join(gram), n) for gram, n in _counts.items()]
            pd.DataFrame(_rows, columns=[_label, 'count']).to_csv(
                os.path.join(OUT_DIR, _fname), index=False
            )

        print("Top 10 bigramas disaster:", disaster_bigram.most_common(10))
        print("Top 10 bigramas non-disaster:", non_disaster_bigram.most_common(10))
        print("Top 10 trigramas disaster:", disaster_trigram.most_common(10))
        print("Top 10 trigramas non-disaster:", non_disaster_trigram.most_common(10))

        # ---- Part 5: preliminary model ----
        baseline_model(df)



=== SNAPSHOT DEL DATASET ===
Dimensiones: (7613, 5) 

Nulos por columna (%):
 location    33.27
keyword      0.80
id           0.00
text         0.00
target       0.00
dtype: float64 

Distribución de target (conteo):
 target
0    4342
1    3271
Name: count, dtype: int64 

Distribución de target (%):
 target
0    57.03
1    42.97
Name: proportion, dtype: float64 

Muestra de filas (5 o menos):
         id      keyword               location  \
2644  3796  destruction                    NaN   
2227  3185       deluge                    NaN   
5448  7769       police                     UK   
132    191   aftershock                    NaN   
6845  9810       trauma  Montgomery County, MD   

                                                   text  target  
2644  So you have a new weapon that can cause un-ima...       1  
2227  The f$&amp;@ing things I do for #GISHWHES Just...       0  
5448  DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...       1  
132   Aftershock back to school kick