In [None]:
%pip install ipykernel

In [1]:
from datasets import load_dataset
from gensim.corpora import Dictionary  # Mapeo entre palabras e ids
from gensim.models import LdaModel  # Cargar modeo LDA
from gensim.parsing.preprocessing import STOPWORDS  # Lista de palabras de parada
from gensim.test.utils import datapath  # Utilidad para guardar y cargar modelos
from gensim.utils import (
    simple_preprocess,
)  # Convertir un documento en una lista de tokens
import nltk  # Natural Language Toolkit
from nltk.stem import WordNetLemmatizer  # Lematizado
import os

# Download the WordNet data needed for lemmatization
nltk.download("wordnet")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\berna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the "train" split of the glnmario/news-qa-summarization dataset
dataset = load_dataset(
    path="glnmario/news-qa-summarization",
    split="train",
)

In [3]:
# Collect the raw text (the "story" field) of every article in the dataset
stories = [article["story"] for article in dataset]

In [7]:
def preprocess(text, lemmatizer=None):
    """Tokenize ``text`` and return its lemmatized content tokens.

    The text is split with gensim's ``simple_preprocess``. Tokens that appear
    in ``STOPWORDS`` or that have three characters or fewer are dropped, and
    the remaining tokens are lemmatized.

    Args:
        text: Raw document text.
        lemmatizer: Optional object exposing ``lemmatize(token)``. When it is
            omitted, one ``WordNetLemmatizer`` is created and shared by every
            token of this call (instead of building a new one per token).

    Returns:
        The list of lemmatized tokens, in the order they appear in ``text``.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in simple_preprocess(text)
        if token not in STOPWORDS and len(token) > 3
    ]
# Tokenize, filter and lemmatize every story
processed_docs = [preprocess(raw_text) for raw_text in stories]

In [27]:
# Build the mapping between tokens and integer ids from the tokenized stories
dictionary = Dictionary(processed_docs)
# Convert each tokenized story to its bag-of-words form: (token id, count) pairs
corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]

able action actor afghan afghan afghan afghan afghan afghan afghan afghan afghan afghan afghan afghan afghan afghanistan afghanistan afghanistan afghanistan afghanistan afghanistan afghanistan afghanistan afghanistan afghanistan alleyway approach army army army army army aroma asylum asylum authorized baking banter battalion battered bazaar beam beria beria best better british british british british british british british british british british british build business center changing clearly closest clothes colthup colthup come comfortable complex complex compound conduct cost count countryside course creating creating cross cultural cultural culture culture culture david difficult directly distinctive duty easily easy elder elder elder england england english entering environment exercise explains explains farmhouse farmhouse fazel feature fight flatbread fled force force force freshly ghurkha give going grape green grilled ground ground group hand handle hawk heart helmand helmand 

In [13]:
# Train the LDA topic model on the bag-of-words corpus
lda_model = LdaModel(
    corpus=corpus,  # Bag-of-words documents to train on
    id2word=dictionary,  # Maps each token id back to its word
    num_topics=10,  # Number of topics to extract
    passes=15,  # Passes over the whole corpus; more passes mean longer training
    update_every=1,  # How often the model parameters are updated during training
    alpha="auto",  # Let the model learn the document-topic prior from the corpus
    per_word_topics=True,  # Also compute the most likely topics for each word
    random_state=666,  # Fixed seed for reproducibility
)

In [17]:
# Print every topic together with its six highest-weighted words
topics = lda_model.print_topics(num_words=6)
for topic_summary in topics:
    print(topic_summary)

(0, '0.032*"said" + 0.015*"military" + 0.014*"official" + 0.010*"attack" + 0.010*"force" + 0.009*"afghanistan"')
(1, '0.020*"child" + 0.018*"said" + 0.017*"school" + 0.012*"health" + 0.011*"student" + 0.010*"woman"')
(2, '0.032*"said" + 0.014*"court" + 0.009*"year" + 0.009*"case" + 0.007*"told" + 0.007*"family"')
(3, '0.019*"said" + 0.008*"people" + 0.008*"church" + 0.007*"group" + 0.007*"government" + 0.007*"country"')
(4, '0.009*"year" + 0.009*"like" + 0.007*"time" + 0.007*"people" + 0.006*"say" + 0.005*"world"')
(5, '0.012*"team" + 0.010*"game" + 0.010*"world" + 0.010*"year" + 0.009*"player" + 0.008*"second"')
(6, '0.048*"said" + 0.013*"police" + 0.008*"people" + 0.006*"home" + 0.006*"told" + 0.006*"city"')
(7, '0.018*"said" + 0.013*"iran" + 0.012*"government" + 0.011*"state" + 0.010*"election" + 0.010*"country"')
(8, '0.031*"said" + 0.020*"flight" + 0.016*"plane" + 0.012*"airport" + 0.011*"pilot" + 0.010*"airline"')
(9, '0.021*"said" + 0.016*"obama" + 0.014*"president" + 0.009*"sta

In [46]:
# Assign every document in the corpus to its dominant topic and gather, per
# topic, the text of all the documents assigned to it.
topic_documents = {}
for doc in corpus:
    # Topic distribution of the document; the per-word outputs are not used here
    doc_topics, _word_topics, _phi_values = lda_model.get_document_topics(
        doc, per_word_topics=True
    )

    # Dominant topic = the (topic id, probability) pair with the highest probability
    topic_num, _prop_topic = max(doc_topics, key=lambda pair: pair[1])

    # Rebuild the document text from its bag-of-words form, repeating each
    # word as many times as it occurs in the document
    document = " ".join(
        dictionary[word_id] for word_id, freq in doc for _ in range(freq)
    )

    topic_documents.setdefault(topic_num, []).append(document)

# Join the documents of each topic with a space. Concatenating them directly
# would fuse the last word of one document with the first word of the next.
topics_dict = {
    topic_num: " ".join(documents)
    for topic_num, documents in topic_documents.items()
}

In [52]:
from wordcloud import WordCloud
# Render one word cloud per topic and save it as a PNG inside "wordclouds"
os.makedirs("wordclouds", exist_ok=True)

for topic_num, topic_text in topics_dict.items():
    cloud = WordCloud(
        width=800,
        height=800,
        background_color="white",
        stopwords=STOPWORDS,
        collocations=False,  # Count single words only, no two-word collocations
        min_font_size=10,
    )
    cloud.generate(topic_text)
    cloud.to_file(f"wordclouds/topic_{topic_num}.png")