# Embeddings Neuronales


**TODO**

### Importanción de librería requeridas

In [1]:
import os

import warnings
import pickle
from collections import Counter

import numpy as np
import pandas as pd

import gensim.corpora as corpora
from gensim.models import FastText

from clustering_utils import vectorize, mbkmeans_clusters

warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline


### Definición de variables globales

In [2]:
TEXT_FILE_READ = 'docs/preprocessing_reddit_data.csv'
TEXT_SAVE_FILE = 'docs/reddit_data_fasttext.csv'
FILENAME_PICKLE = "docs/tmpreddit.pickle"
n_clusters = 120

### Lectura de los comentarios de Reddit

Los comentarios fueron previamente preprocesados (Ver en TODO).

In [3]:
with open(FILENAME_PICKLE, 'rb') as f:
    df = pickle.load(f)


### Vocabulario

In [4]:
# Create Dictionary
id2word = corpora.Dictionary(df['lemma_tokens'])

# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)

# Creating a corpus object
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

processed_corpus = df['lemma_tokens']


### Entrenamiento del modelo FastText

In [5]:
model = FastText(sentences=processed_corpus, vector_size=100, window=5, min_count=1, workers=4)
model.train(processed_corpus, total_examples=len(processed_corpus), epochs=100)

(18382405, 18695800)

In [6]:
# algunas predicciones
#model.get_nearest_neighbors('peron')

In [7]:
#model.get_nearest_neighbors('cristina')

In [8]:
#model.get_nearest_neighbors('nestor')

In [9]:
#model.get_nearest_neighbors('neoliberal')

In [10]:
#model.get_nearest_neighbors('malvinas')

In [11]:
#model.get_analogies('cristina', 'peronista', 'alberto')

In [12]:
#model.wv.most_similar('mapuche')

### Generación de vectores desde documentos

In [13]:

vectorized_docs = vectorize(processed_corpus, model=model)
len(vectorized_docs), len(vectorized_docs[0])



(27791, 100)

### Generación de clusters

In [14]:
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=n_clusters,
    mb=500,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    "text": df["body"].values,
    "tokens": [" ".join(text) for text in processed_corpus],
    "cluster": cluster_labels
})



For n_clusters = 120
Silhouette coefficient: 0.02
Inertia:1531358.956483711
Silhouette values:
    Cluster 64: Size:247 | Avg:0.21 | Min:0.05 | Max: 0.37
    Cluster 32: Size:32 | Avg:0.19 | Min:0.07 | Max: 0.31
    Cluster 77: Size:3 | Avg:0.18 | Min:0.17 | Max: 0.20
    Cluster 78: Size:11 | Avg:0.17 | Min:-0.11 | Max: 0.39
    Cluster 109: Size:155 | Avg:0.12 | Min:-0.06 | Max: 0.31
    Cluster 99: Size:21 | Avg:0.12 | Min:-0.11 | Max: 0.32
    Cluster 72: Size:61 | Avg:0.12 | Min:-0.03 | Max: 0.29
    Cluster 112: Size:198 | Avg:0.11 | Min:-0.00 | Max: 0.21
    Cluster 59: Size:64 | Avg:0.11 | Min:0.02 | Max: 0.30
    Cluster 63: Size:166 | Avg:0.10 | Min:-0.06 | Max: 0.29
    Cluster 24: Size:312 | Avg:0.09 | Min:0.00 | Max: 0.19
    Cluster 38: Size:183 | Avg:0.08 | Min:-0.10 | Max: 0.29
    Cluster 110: Size:101 | Avg:0.08 | Min:-0.06 | Max: 0.25
    Cluster 47: Size:88 | Avg:0.08 | Min:-0.05 | Max: 0.23
    Cluster 50: Size:93 | Avg:0.08 | Min:-0.03 | Max: 0.26
    Cluster 65: 

### *Top terms* por cluster (basado en los centroides de los clusters)

In [15]:
print("Most representative terms per cluster (based on centroids):")
for i in range(n_clusters):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=10)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")



Most representative terms per cluster (based on centroids):
Cluster 0: thought though without its thank wouldnt wouldn´t theory big—though theyll 
Cluster 1: dolar dólar bipolar olar amándolar \-volar embolar inmolar enrolar suelar 
Cluster 2: esperar 2)esperar esperaré esperabar desesperar señal_esperar esperaba esperáramo espera esperabas 
Cluster 3: ecuación creación cremación usurpación nación oxigenación inclinación alienación asignación apelación 
Cluster 4: poner iponer ponerl disponer oponer reponer exponer ponerlo imponer oner 
Cluster 5: sátirar tirar juirar girar lirar \*mirar elvirar admirar quirar dirar 
Cluster 6: vómito rarito jaimito mito 6to agito humito dígito ojito rito 
Cluster 7: salir salúdalir saliar saliva sali salio salimos salia pasalir salsar 
Cluster 8: recomendado recordado recomendariar recomendar recordar recomendo recomendario recomendaría recorrido recomienda 
Cluster 9: obtener detener \-tener abstener retener ener sostener wiener deshacer desperdiciac

### *Top terms* por cluster (basado en las palabras más frecuentes)

In [16]:
for i in range(n_clusters):
    tokens_per_cluster = ""
    most_frequent = Counter(" ".join(df_clusters.query(f"cluster == {i}")["tokens"]).split()).most_common(5)
    for t in most_frequent:
        tokens_per_cluster += f"{t[0]}({str(t[1])}) "
    print(f"Cluster {i}: {tokens_per_cluster}")


Cluster 0: ⠀(685) the(70) of(34) and(33) to(32) 
Cluster 1: dolar(90) dólar(54) subir(19) peso(13) dólares(11) 
Cluster 2: esperar(111) espert(27) pasar(6) respuesta(5) superar(4) 
Cluster 3: nación(18) elección(12) inflación(12) acción(10) argentino(10) 
Cluster 4: poner(223) él(42) pasar(8) suponer(7) gente(6) 
Cluster 5: tirar(118) mirar(59) tiro(31) tira(11) pasar(8) 
Cluster 6: gato(30) orto(24) dame(12) listo(11) querer(10) 
Cluster 7: salir(259) sal(13) calle(11) correr(10) sacar(10) 
Cluster 8: pasar(76) paso(59) semana(54) mañana(37) lindo(36) 
Cluster 9: tener(147) hacer(112) querer(13) él(12) año(8) 
Cluster 10: gente(71) hablar(52) pensar(50) cosa(49) persona(48) 
Cluster 11: interés(17) internet(13) interno(13) interesante(10) control(10) 
Cluster 12: gobierno(194) nacional(60) terrorista(32) mapuch(18) él(15) 
Cluster 13: buscar(141) encontrar(16) gente(14) busca(12) laburo(12) 
Cluster 14: llamar(44) amar(6) mar(3) atención(3) amigo(2) 
Cluster 15: milei(182) peronismo(8

### Recupere los documentos más representativos (basados en los centroides de los clústeres) para un cluster en particular

In [17]:
test_cluster = 0
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:10]:
    print( df["body"].values[d])
    print("-------------")

A Stanley thermos is always appreciated, though it's common down here.. Mac&Chesse, Peanut butter, Maple Syrup (a bit canadian), odd bbq sauces (there's little variety here)... among the things that can survive an international trip.
-------------
Cdo le doy boton derecho en steam me dice. \-Apex Legends. \-Monster Hunter Rise DEMO. \-Wallpaper Engine. \-Muse Dash. \-Slay the Spire. EDIT: I AM THE STORM THAT IS APPROACHING \*bangs\*
-------------
Pawg stands for phat ass white girl. In this context, phat is an alternative slang spelling for fat. A pawg is a woman who has a big—though it’s not necessarily fat—butt.. Edit: Corrijo copy&paste fallido
-------------
Argentina loves DauT.. I feel we have many things in common with balkan countries, like terrible goverments.
-------------
A society grows great when old men plant trees in whose shade they shall never sit.
-------------
HAHAHAAHAHAHAAHAHAHAHAHAHAAH. Oh, wait, you were serious? Let me laugh harder. ###HAHAAHAHAHAHAHAHAHHAHAHAHAH

In [18]:

#print(len(vectorized_docs))
#print(vectorized_docs[0])

test_v = vectorize([['defender', 'peso', 'siente', 'corazón', 'compro', 'pesos', 'tasa', 'fijo', 'año']], model=model)
prediction = clustering.predict(test_v)
print(prediction)

[56]


In [19]:
reddit = pd.read_csv(TEXT_FILE_READ)

def get_cluster(row):
    test_v = vectorize([row], model=model)
    return clustering.predict(test_v)

reddit['cluster'] = reddit.apply(lambda row: get_cluster(row['lemma_tokens']) , axis = 1) 


In [20]:
# Show
reddit.head(10)

Unnamed: 0,score,id,flair,comms_num,body,comment_parent_id,is_replay,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,lemma_tokens,body_preprocessing,cluster
0,1,hfw14mt,Discusion🧐,1,todo para decir que tapaste el baño. tira un b...,q44kw3,False,,,,,,,,,"['tapastir', 'baño', 'tirar', 'balde', 'aguo']",tapastir baño tirar balde aguo,[96]
1,1,hfw41eh,Discusion🧐,0,"sopapa primero master, si hay tapón te vas a t...",hfw14mt,True,,,,,,,,,"['sopapa', 'master', 'tapón', 'va', 'teñir', '...",sopapa master tapón va teñir medio,[96]
2,1,hfw1ao2,Discusion🧐,0,"Usas la sopapa, o tiras agua caliente con un b...",q44kw3,False,,,,,,,,,"['sopapo', 'tira', 'agua', 'caliente', 'balde']",sopapo tira agua caliente balde,[96]
3,1,hfw3jof,Discusion🧐,2,Lo que he probado que siempre me dio resultado...,q44kw3,False,,,,,,,,,"['probado', 'resultado', 'sellar', 'boca', 'in...",probado resultado sellar boca inodoro tirar ca...,[96]
4,1,hfw6v4i,Discusion🧐,0,Estas cobrando por dar mantenimiento y no sabe...,q44kw3,False,,,,,,,,,"['cobrar', 'mantenimiento', 'carajo', 'kjjjjjj...",cobrar mantenimiento carajo kjjjjjjjjj vivirio...,[96]
5,1,hfw26iv,Discusion🧐,0,"Si tenes algo con punta, metelo y hace un poco...",q44kw3,False,,,,,,,,,"['tén', 'punto', 'metelo', 'fuerza', 'romper',...",tén punto metelo fuerza romper tapo baño tirar...,[96]
6,1,hfw2gof,Discusion🧐,1,"Con una manguera para regar el jardín, si tene...",q44kw3,False,,,,,,,,,"['regar', 'jardín', 'tén', 'pod', 'probar']",regar jardín tén pod probar,[96]
7,1,hfw5s13,Discusion🧐,0,"despues regas el jardin y se lava sola, solo q...",hfw2gof,True,,,,,,,,,"['rega', 'jardin', 'lava', 'tenés', 'lavarte',...",rega jardin lava tenés lavarte mano pulgar chorro,[96]
8,1,hfw3air,Discusion🧐,0,La respuesta real es que se venden unos caños ...,q44kw3,False,,,,,,,,,"['respuesta', 'real', 'vender', 'caño', 'alamb...",respuesta real vender caño alambrado decir cañ...,[96]
9,7,hfvxa6w,Discusion🧐,3,Mi alfajor favorito es el Havana,q443eo,False,,,,,,,,,"['alfajor', 'favorito', 'haván']",alfajor favorito haván,[96]


In [21]:
reddit.to_csv(TEXT_SAVE_FILE, index=False)

In [24]:
cluster_path = 'docs/test/fasttext_comments_per_cluster/'

os.makedirs(cluster_path,exist_ok=True)

for i in range(n_clusters):
    reddit[(reddit["cluster"] == i)][['flair', 'body']].to_csv(cluster_path + str(i) + '.csv')
