# Most frequent words

In [10]:
import pandas as pd
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
import numpy as np
from pattern.text.es import singularize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [8]:
all_df = pd.read_csv('df_2019.csv')
all_df.shape

(70352, 8)

In [24]:
def select_secondary_keywords_ngrams(txt_list: list[str], ngram_min: int, ngram_max: int) -> list[str]:
    # Stopword removal
    stop_words = set(stopwords.words('spanish'))
    txt1 = []
    for line in txt_list:
        txt1.append(' '.join([x for
                            x in nltk.word_tokenize(line) if
                            (x not in stop_words)]))

    # Getting trigrams
    vectorizer = CountVectorizer(ngram_range=(ngram_min, ngram_max))
    X1 = vectorizer.fit_transform(txt1)
    features = (vectorizer.get_feature_names_out())

    # Applying TFIDF
    vectorizer = TfidfVectorizer(ngram_range=(ngram_min, ngram_max))
    X2 = vectorizer.fit_transform(txt1)
    scores = (X2.toarray())

    # Getting top ranking features
    sums = X2.sum(axis=0)
    data1 = []
    for col, term in enumerate(features):
        data1.append((term, sums[0, col]))
    ranking = pd.DataFrame(data1, columns=['term', 'rank'])
    words = (ranking.sort_values('rank', ascending=False))
    return words

frequent_words = select_secondary_keywords_ngrams(all_df.Texto, 5, 5)  #[0:max_n]
print("\n\nWords head : \n", frequent_words)



Words head : 
                                                      term       rank
490262                      muy buenos días todas ustedes  22.715261
590480          presidente alberto rodríguez sdp noticias  19.111745
98522                 buenos días señor presidente buenos  19.100927
241959                  días señor presidente buenos días  18.284120
98351            buenos días presidente alberto rodríguez  17.291307
...                                                   ...        ...
317838     fiscalía general república unidad inteligencia   0.096225
172275     cruces subgerente regional energéticos sureste   0.096225
49522              ampliaran meses francisco moreno núñez   0.096225
803801  veracruz firmó convenios modificatorios ampliaron   0.096225
521570        olvera rico subgerente regional energéticos   0.096225

[824576 rows x 2 columns]


# Unsupervised classification of all conversations

In [25]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words("spanish"))
X = vectorizer.fit_transform(frequent_words["term"])
true_k = 4 # Number of groups
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 ser
 gente
 va
 puede
 van
 debe
 mucha
 vamos
 entonces
 humilde
Cluster 1:
 si
 usted
 sé
 va
 ver
 saber
 puede
 preguntarle
 presidente
 gobierno
Cluster 2:
 dinero
 quiero
 presupuesto
 lavado
 decir
 va
 pueblo
 hacer
 gobierno
 si
Cluster 3:
 va
 vamos
 mil
 méxico
 gobierno
 presidente
 entonces
 país
 ahora
 van
