In [8]:
import importlib
import numpy as np
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher, Matcher

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models

# spaCy pipeline loaded by package name. Its vocabulary backs the token
# Matcher, and the pipeline itself is passed to text_to_list to process titles.
nlp_topics = spacy.load("es_core_news_md")

In [9]:
# Load the news table; the first CSV column is used as the index.
data_file = "./data/PuertoMontt_chile_2021-07-01_2021-12-31_FILTRADO.csv"
df = pd.read_csv(data_file, index_col=0)

# Token patterns for multi-word topic candidates.
pattern_1 = [{"POS": "NOUN"}, {"LOWER": "de"}, {"POS": "NOUN"}]  # noun + "de" + noun
pattern_2 = [{"POS": "NOUN"}, {"POS": "ADJ"}]  # noun + adjective

# The matcher shares the vocabulary of the spaCy pipeline; rules are
# registered in the same order as they are declared above.
topicmatcher = Matcher(nlp_topics.vocab)
for rule_name, rule_pattern in (("NOUN-de-NOUN", pattern_1),
                                ("NOUN-ADJ", pattern_2)):
    topicmatcher.add(rule_name, [rule_pattern])

In [10]:
# Display the loaded frame; the rendered output below abbreviates the
# middle rows with "...".
df

Unnamed: 0_level_0,title,date
id_news,Unnamed: 1_level_1,Unnamed: 2_level_1
21443510.0,Para series menores de Puerto Varas y Puerto M...,2021-08-13
21469264.0,Puerto Montt: Hoy comienza el ciclo de concier...,2021-12-14
21940337.0,Más de 2500 rezagados fueron vacunados este fi...,2021-07-25
21469757.0,Así fue la marcha por el 18 de Octubre en Puer...,2021-10-19
47390948.0,De los artistas José Reyes y Gastón Ampuero: H...,2021-11-05
...,...,...
21384707.0,¡Castro te quiero ver de pie…!,2021-12-10
21391930.0,Víctor Hugo Catalán – poeta y narrador,2021-08-29
21378371.0,Jordi Valenzuela Muñoz: una apuesta joven para...,2021-10-16
47522784.0,"""El cambio climático puede tener grandes efect...",2021-08-24


In [5]:
def text_to_list(noticia, nlp, matcher):
    """Extract topic-candidate terms from a single news text.

    The returned list contains, in this order:

    1. the text of every token whose ``pos_`` is ``"NOUN"``;
    2. the text of every entity labelled ``"PER"`` that contains a space
       (i.e. multi-word person names);
    3. the text of every span reported by ``matcher``.

    Parameters
    ----------
    noticia : str
        Text to process. Any non-string value (e.g. a NaN coming from a
        missing CSV cell, or ``None``) yields an empty list and ``nlp`` is
        not called.
    nlp : callable
        Called as ``nlp(noticia)``. The result must be iterable over tokens
        exposing ``pos_`` and ``text``, expose ``ents`` (items with
        ``label_`` and ``text``) and support ``doc[start:end]`` slicing.
    matcher : callable
        Called as ``matcher(doc)``; must return an iterable of
        ``(match_id, start, end)`` triples.

    Returns
    -------
    list of str
        The collected terms. Processing is best-effort: if any step raises,
        the error and the offending text are printed and the terms gathered
        up to that point are returned.
    """
    list_of_words = []

    # Missing titles arrive as non-string values; skip them explicitly
    # instead of relying on the pipeline to raise.
    if not isinstance(noticia, str):
        return list_of_words

    try:
        doc = nlp(noticia)

        # Single nouns.
        list_of_words.extend(
            token.text for token in doc if token.pos_ == "NOUN")

        # Multi-word person names.
        list_of_words.extend(
            ent.text for ent in doc.ents
            if ent.label_ == "PER" and " " in ent.text)

        # Phrases matched by the token patterns.
        list_of_words.extend(
            doc[start:end].text for _match_id, start, end in matcher(doc))

    except Exception as e:
        # Keep the batch running: report the failure and return what we have.
        print(e)
        print("Noticia:\n", noticia)

    return list_of_words

In [12]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA model per candidate topic count and score it with c_v coherence.

    The candidate topic counts are ``range(start, limit, step)``, so ``limit``
    itself is never used as a topic count.

    Parameters:
    ----------
    dictionary : Gensim dictionary, used as ``id2word`` and by the coherence model
    corpus : Gensim corpus the LDA models are trained on
    texts : List of input texts passed to the coherence model
    limit : Exclusive upper bound for the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    A two-element list ``[model_list, coherence_values]``:
    model_list : List of LDA topic models, one per topic count
    coherence_values : c_v coherence of each model, in the same order
    """
    # Settings shared by every model; only the topic count varies.
    lda_settings = dict(
        corpus=corpus,
        id2word=dictionary,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    model_list = []
    coherence_values = []
    for topic_count in range(start, limit, step):
        lda = gensim.models.ldamodel.LdaModel(
            num_topics=topic_count, **lda_settings)
        model_list.append(lda)

        scorer = CoherenceModel(
            model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    # Returned as a list (not a tuple): callers concatenate it with other lists.
    return [model_list, coherence_values]


In [None]:
# Fit the LDA topic-count sweep over the news titles and store the results.
RUN_LDA = True
if RUN_LDA:
    # Author's timing note: 2708 news items took 3:14 minutes.
    lda_results = []

    noticias = df["title"]
    n = len(noticias)

    # Topic counts tried are range(start, limit, step); limit is capped at 10.
    start, step = 2, 2
    limit = min(n, 10)

    print(
        f"\tDesde {start} hasta {limit} cada {step}.")

    # Titles -> lists of candidate terms -> dictionary -> bag-of-words corpus.
    noticias_procesadas = [
        text_to_list(noticia, nlp_topics, topicmatcher)
        for noticia in noticias
    ]
    id2word = corpora.Dictionary(noticias_procesadas)
    dataset = list(map(id2word.doc2bow, noticias_procesadas))

    # Each entry is [id2word, dataset, model_list, coherence_values].
    lda_results.append([
        id2word,
        dataset,
        *compute_coherence_values(
            dictionary=id2word,
            corpus=dataset,
            texts=noticias_procesadas,
            start=start,
            limit=limit,
            step=step,
        ),
    ])


In [None]:
# Plot c_v coherence against the number of topics for every stored LDA run.
# Requires `lda_results`, which is only created by the LDA cell.
PLOT_COHERENCE = True
if PLOT_COHERENCE:
    for _, _, model_list, coherence_values in lda_results:
        # Read the topic counts from the fitted models so the x axis always
        # matches the sweep that actually ran, whatever start/step it used.
        topic_counts = [model.num_topics for model in model_list]

        fig, ax = plt.subplots()
        ax.grid(True)
        ax.scatter(topic_counts, coherence_values, s=300)
        ax.set_xlabel("Num Topics")
        ax.set_ylabel("Coherence score")
        # The previous title used an undefined variable (`comuna`).
        ax.set_title("LDA coherence (c_v) by number of topics")
        plt.show()