In [1]:
import os
import logging

import gensim.corpora as corpora
from gensim.models import LdaMulticore, CoherenceModel

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# -------------------------------------------------------
# 0) Logging, so gensim's progress messages show up on the console
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# -------------------------------------------------------
# 1) User parameters

# Root folder that is scanned for .txt files, and the folder that receives
# every artefact produced by this script.
INPUT_DIR = r"D:\Transcripciones_Mañaneras_StopWords"
OUTPUT_DIR = INPUT_DIR + r"\Results"

# LDA training settings (passed to LdaMulticore).
NUM_TOPICS = 10
PASSES = 10
CHUNK_SIZE = 2000
WORKERS = 4  # number of workers for LdaMulticore

# Vocabulary pruning thresholds (passed to Dictionary.filter_extremes).
NO_BELOW = 5    # discard terms that appear in fewer than 5 documents
NO_ABOVE = 0.5  # discard terms that appear in more than 50% of the documents

# Make sure the output folder exists before anything is written into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------------------------------------------------------
# 2) Load documents: every .txt file under INPUT_DIR is one document. The
#    original note states the text is already preprocessed, so splitting on
#    whitespace is all that is done to obtain the tokens.
documents = []
file_count = 0

# OUTPUT_DIR sits inside INPUT_DIR; remember its normalised path so the walk
# below can stay out of it and never ingest a generated file as a transcript.
_output_dir_key = os.path.normcase(os.path.abspath(OUTPUT_DIR))

for root, dirs, files in os.walk(INPUT_DIR):
    # Editing `dirs` in place steers os.walk: drop the results folder and sort
    # the rest so the traversal (and thus document order) is reproducible.
    dirs[:] = sorted(
        d for d in dirs
        if os.path.normcase(os.path.abspath(os.path.join(root, d))) != _output_dir_key
    )
    for fname in sorted(files):
        if not fname.lower().endswith(".txt"):
            continue
        file_count += 1
        path = os.path.join(root, fname)
        # utf-8-sig reads plain UTF-8 unchanged but strips a leading BOM, which
        # would otherwise end up glued to the first token of the document.
        with open(path, encoding="utf-8-sig") as f:
            tokens = f.read().split()
        # Files with no tokens are counted as read but not kept as documents.
        if tokens:
            documents.append(tokens)

# Fail early with a specific message: no input files at all vs. only empty ones.
if file_count == 0:
    raise RuntimeError(f"No se encontró ningún .txt bajo {INPUT_DIR}")
if not documents:
    raise RuntimeError(f"Ningún documento con tokens válidos en {INPUT_DIR}")

print(f"✓ Archivos leídos: {file_count}")
print(f"✓ Documentos con tokens: {len(documents)}")

# -------------------------------------------------------
# 3) Build the dictionary and filter out extreme terms
dictionary = corpora.Dictionary(documents)
print(f"→ Términos únicos antes de filtrar: {len(dictionary)}")

# Prune the vocabulary using the NO_BELOW / NO_ABOVE document-frequency limits.
dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE)

vocab_size = len(dictionary)
print(f"→ Términos únicos después de filtrar: {vocab_size}")
# An empty vocabulary makes every later step meaningless, so stop here.
if vocab_size == 0:
    raise RuntimeError("Diccionario vacío tras filter_extremes(). Ajusta NO_BELOW/NO_ABOVE.")

# -------------------------------------------------------
# 4) Build the bag-of-words corpus, discarding empty documents
# Convert each document exactly once (the conversion is the expensive part)
# and keep only non-empty results. Because empty documents are dropped,
# `corpus` may be shorter than `documents` and indices need not line up.
_bows = (dictionary.doc2bow(doc) for doc in documents)
corpus = [bow for bow in _bows if bow]

print(f"✓ Corpus BOW contiene {len(corpus)} documentos")
if not corpus:
    raise RuntimeError("Todos los documentos quedaron vacíos tras doc2bow().")

# -------------------------------------------------------
# 5) Train the LDA model in parallel
# NOTE(review): the captured run log shows one dispatched chunk covering all
# 1337 documents, and CHUNK_SIZE (2000) is larger than the corpus — confirm
# whether the extra workers actually receive work with these settings.
lda_settings = {
    "corpus": corpus,
    "id2word": dictionary,
    "num_topics": NUM_TOPICS,
    "passes": PASSES,
    "chunksize": CHUNK_SIZE,
    "workers": WORKERS,
    "random_state": 42,  # fixed seed
    "eta": 'auto',
    "decay": 0.5,
}
lda_model = LdaMulticore(**lda_settings)

# -------------------------------------------------------
# 6) Print the topics and compute the coherence score
print("\n--- Tópicos extraídos ---")
topic_summaries = lda_model.print_topics(num_topics=NUM_TOPICS, num_words=10)
for idx, description in topic_summaries:
    print(f"Tópico {idx:>2}: {description}")

# c_v coherence is computed from the tokenised texts, not the BoW corpus.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=documents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence = coherence_model.get_coherence()
print(f"\nCoherencia del modelo (c_v): {coherence:.4f}")

# -------------------------------------------------------
# 7) Save the interactive pyLDAvis visualization as a standalone HTML file
html_path = os.path.join(OUTPUT_DIR, "lda_visualization.html")

vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, html_path)

print(f"\n[✓] Visualización guardada en:\n    {html_path}")

# -------------------------------------------------------
# 8) Render and save one word cloud per topic
# NOTE(review): titles and file names here are 1-based, while the topic list
# printed in step 6 uses the model's 0-based ids — "Tópico 1" below is topic 0.
for topic_number in range(1, NUM_TOPICS + 1):
    # Top 20 (word, weight) pairs of the topic, as a word -> weight mapping.
    word_weights = dict(lda_model.show_topic(topic_number - 1, topn=20))
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate_from_frequencies(word_weights)

    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(f"Tópico {topic_number}")

    png_path = os.path.join(OUTPUT_DIR, f"wordcloud_topic_{topic_number}.png")
    fig.savefig(png_path, dpi=300, bbox_inches='tight')
    # Close the figure right away so figures do not pile up across topics.
    plt.close(fig)
    print(f"[✓] Nube Tópico {topic_number} → {png_path}")

print("\n¡Proceso completado!")


2025-04-30 14:08:46,664 : INFO : adding document #0 to Dictionary<0 unique tokens: []>


✓ Archivos leídos: 1337
✓ Documentos con tokens: 1337


2025-04-30 14:09:05,242 : INFO : built Dictionary<77171 unique tokens: ['abandonado', 'abogado', 'aborda', 'abordar', 'abrir']...> from 1337 documents (total 7455166 corpus positions)
2025-04-30 14:09:05,245 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<77171 unique tokens: ['abandonado', 'abogado', 'aborda', 'abordar', 'abrir']...> from 1337 documents (total 7455166 corpus positions)", 'datetime': '2025-04-30T14:09:05.245375', 'gensim': '4.3.3', 'python': '3.12.8 (tags/v3.12.8:2dc476b, Dec  3 2024, 19:30:04) [MSC v.1942 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'created'}
2025-04-30 14:09:05,437 : INFO : discarding 53386 tokens: [('abrir', 1039), ('acabar', 1217), ('acaparada', 1), ('acción', 1172), ('aceptar', 976), ('aclarar', 873), ('acompañar', 776), ('acordar', 936), ('acto', 971), ('actualmente', 696)]...
2025-04-30 14:09:05,440 : INFO : keeping 23785 tokens which were in no less than 5 and no more than 668 (=50.0%) documents


→ Términos únicos antes de filtrar: 77171


2025-04-30 14:09:05,635 : INFO : resulting dictionary: Dictionary<23785 unique tokens: ['abandonado', 'abogado', 'aborda', 'abordar', 'abusar']...>


→ Términos únicos después de filtrar: 23785


2025-04-30 14:09:24,568 : INFO : using symmetric alpha at 0.1
2025-04-30 14:09:24,583 : INFO : using serial LDA version on this node
2025-04-30 14:09:24,651 : INFO : running online LDA training, 10 topics, 10 passes over the supplied corpus of 1337 documents, updating every 8000 documents, evaluating every ~1337 documents, iterating 50x with a convergence threshold of 0.001000
2025-04-30 14:09:24,664 : INFO : training LDA model using 4 processes


✓ Corpus BOW contiene 1337 documentos


2025-04-30 14:10:01,967 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1337/1337, outstanding queue size 1
2025-04-30 14:10:33,846 : INFO : topic #4 (0.100): 0.002*"centavo" + 0.002*"medicamento" + 0.002*"vacuna" + 0.002*"homicidio" + 0.001*"litro" + 0.001*"arma" + 0.001*"tramo" + 0.001*"aéreo" + 0.001*"ministro" + 0.001*"enfermedad"
2025-04-30 14:10:33,849 : INFO : topic #5 (0.100): 0.002*"vacuna" + 0.002*"centavo" + 0.002*"medicamento" + 0.001*"litro" + 0.001*"refinería" + 0.001*"consulta" + 0.001*"tramo" + 0.001*"crédito" + 0.001*"margen" + 0.001*"avión"
2025-04-30 14:10:33,853 : INFO : topic #2 (0.100): 0.003*"centavo" + 0.002*"vacuna" + 0.002*"homicidio" + 0.002*"electricidad" + 0.002*"medicamento" + 0.002*"litro" + 0.002*"tramo" + 0.001*"avión" + 0.001*"ministro" + 0.001*"robo"
2025-04-30 14:10:33,857 : INFO : topic #1 (0.100): 0.003*"centavo" + 0.002*"homicidio" + 0.002*"vacuna" + 0.002*"litro" + 0.002*"arma" + 0.001*"robo" + 0.001*"tramo" + 0.001*"medicamento


--- Tópicos extraídos ---
Tópico  0: 0.012*"vacuna" + 0.005*"homicidio" + 0.005*"vacunación" + 0.005*"dosis" + 0.004*"tendencia" + 0.004*"robo" + 0.004*"vacunar" + 0.003*"medicamento" + 0.003*"doloso" + 0.003*"aéreo"
Tópico  1: 0.004*"robo" + 0.004*"ducto" + 0.003*"barril" + 0.003*"arma" + 0.002*"refinería" + 0.002*"homicidio" + 0.002*"deuda" + 0.002*"víctima" + 0.002*"tratado" + 0.002*"toma"
Tópico  2: 0.003*"homicidio" + 0.003*"ministro" + 0.003*"mentira" + 0.003*"arma" + 0.002*"detenido" + 0.002*"falso" + 0.002*"droga" + 0.002*"magistrado" + 0.002*"detención" + 0.002*"tribunal"
Tópico  3: 0.031*"vacuna" + 0.012*"vacunación" + 0.010*"dosis" + 0.008*"vacunar" + 0.003*"ruta" + 0.003*"educativo" + 0.003*"argentina" + 0.003*"fase" + 0.002*"maestra" + 0.002*"canciller"
Tópico  4: 0.007*"medicamento" + 0.004*"enfermedad" + 0.003*"medicina" + 0.003*"especialidad" + 0.002*"niña" + 0.002*"clínica" + 0.002*"epidemia" + 0.002*"coronavirus" + 0.002*"consulta" + 0.002*"abasto"
Tópico  5: 0.002*"

2025-04-30 14:22:54,774 : INFO : 1 batches submitted to accumulate stats from 64 documents (418552 virtual)
2025-04-30 14:22:55,058 : INFO : 2 batches submitted to accumulate stats from 128 documents (841223 virtual)
2025-04-30 14:22:55,312 : INFO : 3 batches submitted to accumulate stats from 192 documents (1248476 virtual)
2025-04-30 14:22:55,642 : INFO : 4 batches submitted to accumulate stats from 256 documents (1635679 virtual)
2025-04-30 14:22:56,046 : INFO : 5 batches submitted to accumulate stats from 320 documents (2035334 virtual)
2025-04-30 14:22:56,475 : INFO : 6 batches submitted to accumulate stats from 384 documents (2451145 virtual)
2025-04-30 14:22:56,904 : INFO : 7 batches submitted to accumulate stats from 448 documents (2829924 virtual)
2025-04-30 14:22:57,380 : INFO : 8 batches submitted to accumulate stats from 512 documents (3198689 virtual)
2025-04-30 14:22:57,847 : INFO : 9 batches submitted to accumulate stats from 576 documents (3545478 virtual)
2025-04-30 14


Coherencia del modelo (c_v): 0.4524

[✓] Visualización guardada en:
    D:\Transcripciones_Mañaneras_StopWords\Results\lda_visualization.html
[✓] Nube Tópico 1 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_1.png
[✓] Nube Tópico 2 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_2.png
[✓] Nube Tópico 3 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_3.png
[✓] Nube Tópico 4 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_4.png
[✓] Nube Tópico 5 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_5.png
[✓] Nube Tópico 6 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_6.png
[✓] Nube Tópico 7 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_7.png
[✓] Nube Tópico 8 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_8.png
[✓] Nube Tópico 9 → D:\Transcripciones_Mañaneras_StopWords\Results\wordcloud_topic_9.png
[✓] Nube Tópico 10 → D:\Transcripciones_Mañaneras_StopWo