# Most frequent words

In [10]:
import pandas as pd
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
import numpy as np
from pattern.text.es import singularize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [8]:
all_df = pd.read_csv('df_2019.csv')
all_df.shape

(70352, 8)

In [21]:
def select_secondary_keywords_ngrams(txt_list: list[str], ngram_min: int, ngram_max: int) -> list[str]:
    # Stopword removal
    stop_words = set(stopwords.words('spanish'))
    txt1 = []
    for line in txt_list:
        txt1.append(' '.join([x for
                            x in nltk.word_tokenize(line) if
                            (x not in stop_words)]))

    # Getting trigrams
    vectorizer = CountVectorizer(ngram_range=(ngram_min, ngram_max))
    X1 = vectorizer.fit_transform(txt1)
    features = (vectorizer.get_feature_names_out())

    # Applying TFIDF
    vectorizer = TfidfVectorizer(ngram_range=(ngram_min, ngram_max))
    X2 = vectorizer.fit_transform(txt1)
    scores = (X2.toarray())

    # Getting top ranking features
    sums = X2.sum(axis=0)
    data1 = []
    for col, term in enumerate(features):
        data1.append((term, sums[0, col]))
    ranking = pd.DataFrame(data1, columns=['term', 'rank'])
    words = (ranking.sort_values('rank', ascending=False))
    return words

frequent_words = select_secondary_keywords_ngrams(all_df.Texto, 5, 10)  #[0:max_n]
print("\n\nWords head : \n", frequent_words)



Words head : 
                                                       term       rank
2485237                      muy buenos días todas ustedes  21.768304
2991055          presidente alberto rodríguez sdp noticias  13.231604
499756            buenos días presidente alberto rodríguez  12.165195
499757        buenos días presidente alberto rodríguez sdp  11.887507
1222269              días presidente alberto rodríguez sdp  11.887507
...                                                    ...        ...
2228265  lugar terminaran plazo ampliaran meses francis...   0.039746
2228266  lugar terminaran plazo ampliaran meses francis...   0.039746
2228267  lugar terminaran plazo ampliaran meses francis...   0.039746
1316250  empresa productiva subsidiaria seis veracruz f...   0.039746
2443345  modificatorios ampliaron plazos gasoductos lug...   0.039746

[4174012 rows x 2 columns]


# Unsupervised classification of all conversations

In [23]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words("spanish"))
X = vectorizer.fit_transform(frequent_words["term"])
true_k = 2 # Number of groups
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 va
 si
 vamos
 mil
 gobierno
 presidente
 entonces
 van
 millones
 ahora
Cluster 1:
 méxico
 ciudad
 unidos
 gobierno
 pueblo
 país
 va
 aquí
 presidente
 corrupción
