In [1]:
# ======================================================
# NLP en Español - Pipeline Colab Ready (TF-IDF, Coseno, LDA)
# ======================================================

# ---------- 0) Instalación de dependencias ----------
!pip -q install spacy nltk scikit-learn
!python -m spacy download es_core_news_sm -q

# ---------- 1) Imports y recursos ----------
import re, html, unicodedata, warnings
from typing import List, Iterable, Tuple
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.snowball import SpanishStemmer

# ---------- 2) Stopwords y configuración ----------
def _fold_accents(word: str) -> str:
    """Return *word* with combining marks removed (e.g. "más" -> "mas")."""
    return "".join(
        c for c in unicodedata.normalize("NFKD", word) if not unicodedata.combining(c)
    )


def _with_folded(words: Iterable[str]) -> set:
    """Return *words* as a set, plus the accent-free spelling of each entry."""
    words = set(words)
    return words | {_fold_accents(w) for w in words}


# Text has its accents stripped before the stopword filter runs, so an entry
# spelled with an accent ("más", "también", "jamás") would never match a token.
# Each set therefore carries both the original and the accent-free spelling.
SPANISH_STOPWORDS = _with_folded(stopwords.words("spanish"))
# Negations are kept in the text, never filtered out as stopwords.
NEGATIONS = _with_folded({"no","nunca","jamás","ni"})
CUSTOM_STOPWORDS = {"muy","lo","el","mi","este","esto","esta","rt","via"}
ALL_STOPWORDS = SPANISH_STOPWORDS.union(CUSTOM_STOPWORDS) - NEGATIONS

# spaCy (lemmatizer) con fallback a stemming
# Lemmatize with spaCy when the Spanish model can be loaded; otherwise fall
# back to NLTK stemming (USE_SPACY is switched off in that case).
USE_SPACY = True
nlp = None
# Bound unconditionally so `stemmer` exists whichever branch below runs.
stemmer = SpanishStemmer()
try:
    import spacy
    nlp = spacy.load("es_core_news_sm", disable=["ner","parser"])
except Exception as e:
    # Best-effort: any failure importing spaCy or loading the model selects
    # the stemming path instead of aborting the notebook.
    print("spaCy no disponible, usando stemming NLTK. Motivo:", e)
    USE_SPACY = False

# ---------- 3) Preprocesamiento ----------
# http(s) links and bare "www." links, up to the next whitespace.
URL = re.compile(r"(https?://\S+|www\.\S+)")
# @mentions. The lookbehind requires that "@" is not preceded by a word
# character, so the domain part of an e-mail address ("juan@correo.com")
# is not mistaken for a mention.
USER = re.compile(r"(?<!\w)@\w+")
# Hashtags; group 1 is the tag text without the leading "#".
HASHTAG = re.compile(r"#(\w+)")
# Integers and decimals written with "." or "," as the separator.
NUM = re.compile(r"\b\d+([.,]\d+)?\b")
# Runs of whitespace, used to collapse spacing.
MULTISPACE = re.compile(r"\s+")

def strip_accents(s: str, preserve: str = "ñÑ") -> str:
    """Remove diacritics from *s*, keeping the characters listed in *preserve*.

    By default "ñ"/"Ñ" survive: in Spanish they are distinct letters, not
    accented variants of "n", so folding them would merge different words
    ("año" vs "ano").

    Args:
        s: Text to fold.
        preserve: Precomposed characters that are left untouched. Pass ""
            to strip every combining mark.

    Returns:
        The text with combining marks dropped from every non-preserved
        character.
    """
    pieces = []
    # Compose first so a decomposed "n" + U+0303 is recognised as "ñ".
    for ch in unicodedata.normalize("NFC", s):
        if ch in preserve:
            pieces.append(ch)
            continue
        pieces.extend(
            c for c in unicodedata.normalize("NFKD", ch) if not unicodedata.combining(c)
        )
    return "".join(pieces)

def normalize_repeats(token: str, max_rep=2) -> str:
    """Shorten runs of one repeated character to *max_rep* occurrences.

    Only runs longer than *max_rep* are touched, e.g. "holaaaa" -> "holaa"
    with the default of 2.
    """
    long_run = f"(.)\\1{{{max_rep},}}"

    def shorten(match):
        # Rebuild the run from the captured character.
        return match.group(1) * max_rep

    return re.sub(long_run, shorten, token)

def normalize_basic(t: str) -> str:
    """Lowercase *t*, strip accents and replace noisy spans with placeholders.

    URLs become " url ", @mentions " usuario ", "#tag" becomes
    " hashtag tag " and numbers become " num ".

    Args:
        t: Raw input text.

    Returns:
        The normalized text; whitespace is not collapsed here.
    """
    # Decode HTML entities BEFORE lowercasing. A numeric reference such as
    # "&#193;" is unchanged by lower() and would otherwise decode to an
    # uppercase letter, which the lowercase-only tokenizer treats as a
    # separator and drops.
    t = html.unescape(t).lower()
    t = strip_accents(t)
    t = URL.sub(" url ", t)
    t = USER.sub(" usuario ", t)
    t = HASHTAG.sub(lambda m: f" hashtag {m.group(1)} ", t)
    t = NUM.sub(" num ", t)
    return t

def tokenize(t: str) -> List[str]:
    """Return the maximal runs of lowercase Spanish letters found in *t*.

    Anything outside a-z, ñ, á, é, í, ó, ú, ü (digits, punctuation,
    uppercase, whitespace) acts as a separator and is discarded.
    """
    return re.findall(r"[a-zñáéíóúü]+", t)

def clean_tokens(tokens: Iterable[str]) -> List[str]:
    """Filter *tokens*, dropping stopwords and one-character tokens.

    Each token first has long character runs shortened. Negation words are
    always kept, whatever their length.
    """
    kept = []
    for raw in tokens:
        word = normalize_repeats(raw)
        # Negations bypass both the stopword filter and the length filter.
        is_negation = word in NEGATIONS
        if is_negation or (word not in ALL_STOPWORDS and len(word) >= 2):
            kept.append(word)
    return kept

def lemma_or_stem(tokens: Iterable[str]) -> List[str]:
    """Reduce *tokens* to lemmas (spaCy) or stems (NLTK Snowball).

    spaCy is used when USE_SPACY is set and a pipeline is loaded; otherwise
    every token is stemmed. Negation words are passed through unchanged in
    both modes.

    Args:
        tokens: Cleaned tokens of a single document.

    Returns:
        The lemmas or stems. On the spaCy path the result follows the
        tokens spaCy produces for the space-joined input, and empty lemmas
        are skipped.
    """
    if USE_SPACY and nlp is not None:
        lemmas = []
        for tok in nlp(" ".join(tokens)):
            if tok.text in NEGATIONS:
                lemmas.append(tok.text)
                continue
            # Fall back to the surface form when spaCy gives no lemma.
            lemma = (tok.lemma_ or tok.text).strip()
            if lemma:
                lemmas.append(lemma)
        return lemmas

    # Build the stemmer once per call instead of once per token.
    stem = SpanishStemmer().stem
    return [t if t in NEGATIONS else stem(t) for t in tokens]

def preprocess_text(text: str) -> str:
    """Run the full cleaning pipeline on one document.

    Steps: basic normalization, tokenization, stopword filtering and
    lemmatization/stemming. The tokens are returned joined by single spaces.
    """
    tokens = lemma_or_stem(clean_tokens(tokenize(normalize_basic(text))))
    joined = " ".join(tokens)
    # Collapse any whitespace that survived inside individual tokens.
    return MULTISPACE.sub(" ", joined).strip()

def preprocess_corpus(docs: Iterable[str]) -> List[str]:
    """Apply preprocess_text to every document, preserving order."""
    return list(map(preprocess_text, docs))

# ---------- 4) Vectorizadores ----------
def build_tfidf(ngram_range=(1,2), min_df=1, max_df=0.95, sublinear_tf=True):
    """Create an unfitted TfidfVectorizer with the given settings.

    The token pattern accepts single-character tokens, unlike the
    vectorizer's own default.
    """
    options = {
        "ngram_range": ngram_range,
        "min_df": min_df,
        "max_df": max_df,
        "sublinear_tf": sublinear_tf,
        "token_pattern": r"(?u)\b\w+\b",
    }
    return TfidfVectorizer(**options)

def build_bow(ngram_range=(1,2), min_df=1, max_df=0.95, binary=False):
    """Create an unfitted CountVectorizer (bag of words) with the given settings.

    The token pattern accepts single-character tokens, unlike the
    vectorizer's own default.
    """
    options = {
        "ngram_range": ngram_range,
        "min_df": min_df,
        "max_df": max_df,
        "binary": binary,
        "token_pattern": r"(?u)\b\w+\b",
    }
    return CountVectorizer(**options)

# ---------- 5) Utilidades ----------
def top_topic_words(lda, feature_names, n_top=10) -> List[List[Tuple[str,float]]]:
    """List the *n_top* highest-weighted features of every topic.

    Args:
        lda: Object exposing ``components_``, one weight row per topic.
        feature_names: Feature name for each column of ``components_``.
        n_top: Maximum number of features reported per topic.

    Returns:
        One list per topic of ``(feature, weight)`` pairs, heaviest first.
    """
    def ranked(weights):
        # Ascending argsort, reversed, then cut to the requested size.
        best = np.argsort(weights)[::-1][:n_top]
        return [(feature_names[pos], float(weights[pos])) for pos in best]

    return [ranked(row) for row in lda.components_]

def most_similar_pairs(X_tfidf, topk=3):
    """Find the *topk* most cosine-similar document pairs.

    Args:
        X_tfidf: Document-term matrix, one row per document.
        topk: Number of pairs to return.

    Returns:
        ``(pairs, S)`` where ``pairs`` is a list of ``(i, j, similarity)``
        with ``i < j`` sorted by decreasing similarity, and ``S`` is the full
        similarity matrix. NOTE: the diagonal of ``S`` is overwritten with
        -1, so it does not hold the self-similarity.
    """
    S = cosine_similarity(X_tfidf)
    np.fill_diagonal(S, -1)  # keeps a document from matching itself

    # Strict upper triangle: every unordered pair exactly once, row by row.
    rows, cols = np.triu_indices(S.shape[0], k=1)
    candidates = [(int(i), int(j), float(S[i, j])) for i, j in zip(rows, cols)]
    candidates.sort(key=lambda item: item[2], reverse=True)
    return candidates[:topk], S

# ---------- 6) Demo con tu corpus (reemplaza por el tuyo si deseas) ----------
# Sample corpus: replace it with your own documents if you like.
docs = [
    "La Inteligencia Artificial avanza rápidamente en salud.",
    "Los hospitales usan NLP para analizar historias clínicas.",
    "El fútbol es un deporte popular en Latinoamérica.",
    "Los bancos utilizan modelos de lenguaje para contratos.",
    "La selección ganó un partido importante en Quito.",
    "La IA mejora los procesos financieros en bancos.",
    "Los pacientes reciben diagnósticos con apoyo de NLP.",
    "El equipo de Guayaquil obtuvo la victoria en la final."
]

# a) Preprocessing
docs_clean = preprocess_corpus(docs)

print("=== Ejemplos preprocesados ===")
for o, c in zip(docs, docs_clean):
    print(f"- ORIG: {o}\n  CLEAN: {c}\n")

# b) TF-IDF + cosine similarity
tfidf = build_tfidf(ngram_range=(1,2), min_df=1, max_df=0.95)
X_tfidf = tfidf.fit_transform(docs_clean)

pairs, S = most_similar_pairs(X_tfidf, topk=3)
print("=== Pares más similares (coseno) ===")
for i, j, v in pairs:
    print(f"({i}, {j}) = {v:.3f}")

# c) LDA on bag-of-words counts
bow = build_bow(ngram_range=(1,2), min_df=1, max_df=0.95)
X_bow = bow.fit_transform(docs_clean)
lda = LatentDirichletAllocation(n_components=2, random_state=42, learning_method="batch")
doc_topic = lda.fit_transform(X_bow)

feat = bow.get_feature_names_out()
topic_words = top_topic_words(lda, feat, n_top=10)

print("\n=== Tópicos (palabras top) ===")
for k, topic in enumerate(topic_words):
    print(f"Topic {k}: " + ", ".join(w for w,_ in topic))

print("\n=== Pertenencia doc → tópico ===")
for i, dist in enumerate(doc_topic):
    # Report the most probable topic for each document.
    k = int(np.argmax(dist))
    print(f"Doc {i}: Topic {k} (p={dist[k]:.3f})")

# d) (Optional) Matrices as DataFrames for quick inspection
try:
    show = display  # available in IPython/Colab sessions
except NameError:
    show = print    # plain Python: fall back to text output

try:
    print("\n=== Matriz de similitud (primeros 4x4) ===")
    sim_df = pd.DataFrame(S)
    show(sim_df.iloc[:4, :4].round(3))

    print("\n=== Matriz doc-topic ===")
    dt_df = pd.DataFrame(doc_topic, columns=[f"Topic_{i}" for i in range(doc_topic.shape[1])])
    show(dt_df.round(3))
except Exception as exc:
    # This step is optional, so report the problem instead of hiding it.
    print("No se pudieron mostrar las matrices:", exc)

print("\nListo. Puedes reemplazar 'docs' por tu propio corpus y volver a ejecutar.")


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/12.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/12.9 MB[0m [31m32.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/12.9 MB[0m [31m52.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m7.5/12.9 MB[0m [31m72.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m12.9/12.9 MB[0m [31m138.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may nee

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Ejemplos preprocesados ===
- ORIG: La Inteligencia Artificial avanza rápidamente en salud.
  CLEAN: inteligencia artificial avanzar rapidamente salud

- ORIG: Los hospitales usan NLP para analizar historias clínicas.
  CLEAN: hospital usar nlp analizar historia clinica

- ORIG: El fútbol es un deporte popular en Latinoamérica.
  CLEAN: futbol deporte popular latinoamerico

- ORIG: Los bancos utilizan modelos de lenguaje para contratos.
  CLEAN: banco utilizar modelo lenguaje contrato

- ORIG: La selección ganó un partido importante en Quito.
  CLEAN: seleccion gano partido importante quito

- ORIG: La IA mejora los procesos financieros en bancos.
  CLEAN: ia mejorar proceso financiero banco

- ORIG: Los pacientes reciben diagnósticos con apoyo de NLP.
  CLEAN: paciente recibir diagnosticos apoyo nlp

- ORIG: El equipo de Guayaquil obtuvo la victoria en la final.
  CLEAN: equipo guayaquil obtener victoria final

=== Pares más similares (coseno) ===
(3, 5) = 0.081
(1, 6) = 0.073
(0, 

Unnamed: 0,0,1,2,3
0,-1.0,0.0,0.0,0.0
1,0.0,-1.0,0.0,0.0
2,0.0,0.0,-1.0,0.0
3,0.0,0.0,0.0,-1.0



=== Matriz doc-topic ===


Unnamed: 0,Topic_0,Topic_1
0,0.946,0.054
1,0.953,0.047
2,0.933,0.067
3,0.052,0.948
4,0.946,0.054
5,0.942,0.058
6,0.052,0.948
7,0.946,0.054



Listo. Puedes reemplazar 'docs' por tu propio corpus y volver a ejecutar.
