# Embeddings Neuronales


**TODO**

### Importanción de librería requeridas

In [1]:
import gensim.corpora as corpora
from gensim.models import Word2Vec

from sklearn.cluster import KMeans
import numpy as np

import pickle
import pandas as pd

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline


### Definición de variables globales

In [2]:
TEXT_FILE_READ = 'docs/preprocessing_reddit_data.csv'
TEXT_SAVE_FILE = 'docs/reddit_data_lda.csv'
FILENAME_PICKLE = "docs/tmpreddit.pickle"


### Lectura de los comentarios de Reddit

Los comentarios fueron previamente preprocesados (Ver en TODO).

In [3]:
with open(FILENAME_PICKLE, 'rb') as f:
    df = pickle.load(f)


### Vocabulario

In [4]:
# Create Dictionary
id2word = corpora.Dictionary(df['lemma_tokens'])

# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)

# Creating a corpus object
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]


In [5]:
processed_corpus = df['lemma_tokens']


### Entrenamiento del modelo Word2Vec

In [6]:
model = Word2Vec(sentences=processed_corpus, vector_size=100, window=5, min_count=1, workers=4)
model.train(processed_corpus, total_examples=len(processed_corpus), epochs=100)
model.save("word2vec.model")

In [7]:
word_vecs = []
vocabulary = list(model.wv.key_to_index)

for key in model.wv.key_to_index:
    word_vecs.append(model.wv[key])


### Entrenamiento del modelo Word2Vec


In [8]:
# generamos los clústers

n_clusters = 70

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
X_wvkm = kmeans.fit_transform(word_vecs)
y_wvkm = kmeans.predict(word_vecs)

In [9]:
for cluster in range(n_clusters):
    mask = X_wvkm[y_wvkm == cluster]
    idx_sort = np.argsort(X_wvkm[:,cluster])
    words = [vocabulary[x] for x in idx_sort[:20]]

    print("Clúster %d:" % cluster, end='')
    print()
    for w in words:
        print(' %s' % w, end='')
    print()

Clúster 0:
 manteco ajo estofado bifir vinagre té pechuga caloría napolitán remplazo cafir higo salso dandy aderezo oliva bife ganache zapallito zanahorio
Clúster 1:
 shotcito tirante baratisimo gesticular wrigth iamc munnnn diz asamblea bonitas.com millsbeelaneiii dharma queremir mote dolaaaar potensia caius messirve cerveceer latorrir
Clúster 2:
 his train other every less goes common |:-|:-| about thaber without friend leave whose trading| even expensive dick little away
Clúster 3:
 mep devaluar prendario prestación equilibrar berso imprimar quiebra irian arbitrar puedas apalancandote estable gán ganancio hajajar ciudadanir debio venderlo retiran
Clúster 4:
 kink co2 cópulo carvajal shaming dolaaaar ajajajajaj creíque lamentar lipomodal diversidat ononononononononoo"#"#"¡="¡"¡=¡1 hmmmm asteroidir http siiiiiii jsjasj ofertar freelancers clothes
Clúster 5:
 halagador fiche migratorio adhesivo afrodescendiente afirmacion traiganmir minecraft solcito shifteado serpiente orejudo subayud

In [10]:
# algunas predicciones

model.wv.most_similar("rucula")

[('jajajajjajaajaj', 0.945906937122345),
 ('laconcho', 0.9091038703918457),
 ('bottle', 0.9089493155479431),
 ('branding', 0.8754062652587891),
 ('meanies', 0.8094251155853271),
 ('submarinar', 0.8011689186096191),
 ('golden', 0.7291918396949768),
 ('oct', 0.7138594388961792),
 ('frondizi', 0.7132689356803894),
 ('eyes', 0.7089555263519287)]

In [11]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features
    
vectorized_docs = vectorize(processed_corpus, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(27791, 100)

In [12]:


def mbkmeans_clusters(
	X, 
    k, 
    mb, 
    print_silhouette_values, 
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [13]:
clustering, cluster_labels = mbkmeans_clusters(
	X=vectorized_docs,
    k=50,
    mb=500,
    print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
    "text": df["body"].values,
    "tokens": [" ".join(text) for text in processed_corpus],
    "cluster": cluster_labels
})

For n_clusters = 50
Silhouette coefficient: -0.00
Inertia:1004188.0895273283
Silhouette values:
    Cluster 29: Size:485 | Avg:0.25 | Min:0.01 | Max: 0.43
    Cluster 20: Size:239 | Avg:0.13 | Min:-0.07 | Max: 0.36
    Cluster 46: Size:253 | Avg:0.12 | Min:-0.12 | Max: 0.36
    Cluster 25: Size:168 | Avg:0.11 | Min:-0.11 | Max: 0.36
    Cluster 24: Size:221 | Avg:0.10 | Min:-0.08 | Max: 0.32
    Cluster 35: Size:178 | Avg:0.10 | Min:-0.08 | Max: 0.32
    Cluster 14: Size:188 | Avg:0.10 | Min:-0.09 | Max: 0.29
    Cluster 23: Size:203 | Avg:0.10 | Min:-0.09 | Max: 0.32
    Cluster 11: Size:291 | Avg:0.09 | Min:-0.10 | Max: 0.31
    Cluster 18: Size:148 | Avg:0.09 | Min:-0.09 | Max: 0.33
    Cluster 16: Size:312 | Avg:0.09 | Min:-0.10 | Max: 0.30
    Cluster 43: Size:299 | Avg:0.09 | Min:-0.07 | Max: 0.29
    Cluster 26: Size:327 | Avg:0.08 | Min:-0.11 | Max: 0.31
    Cluster 31: Size:355 | Avg:0.07 | Min:-0.10 | Max: 0.28
    Cluster 40: Size:265 | Avg:0.07 | Min:-0.11 | Max: 0.28
    C

In [14]:
print("Most representative terms per cluster (based on centroids):")
for i in range(50):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: ver video bolu\*\ perooooo soné 
Cluster 1: seguir ⢧ ⠣ imbécil ⡾ 
Cluster 2: llegar punto ⢧ ⠣ ⡾ 
Cluster 3: equilibrar dólares mep arbitrar pesos 
Cluster 4: viejo ⢧ ⠣ ⡾ ⡰ 
Cluster 5: peronismo ⡾ atribuis ⢧ peronista 
Cluster 6: problema acá mundo ⢧ suecia 
Cluster 7: pasar ⢧ ⡾ ⠣ ⡰ 
Cluster 8: tén ⠣ ⢧ locador pod 
Cluster 9: vo ⢧ ⠣ seguí ⡰ 
Cluster 10: \-mr argument supiste clothes agarrenme 
Cluster 11: venir mínimo infierno\ ⠣ entero 
Cluster 12: alberto ginastera jajjaj \*big brain 
Cluster 13: gracias ⢧ ⠣ ⡾ alegrar 
Cluster 14: va catre decís coquetar aflojo 
Cluster 15: votar voto ganar oficialismo pegarl 
Cluster 16: pensar ⢧ ⠣ ⡾ opinar 
Cluster 17: ⢧ ⡾ ⠣ ⡰ vivir 
Cluster 18: seguro automotor pegajoso letrar derechogenial/ 
Cluster 19: comer naaaaaah detox servira ocu 
Cluster 20: poner empaquetado persiga ropo lucifer 
Cluster 21: año tener ⢧ ⠣ ⡾ 
Cluster 22: decir ⢧ ⡾ ⠣ ⡰ 
Cluster 23: quedar marmota ⢧ ⡰ ⡾ 
C

In [16]:
test_cluster = 3
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:10]:
    print( df["body"].values[d])
    print("-------------")

Sanguijuelas mas grandes no puede haber
-------------
alibaba no es el "mayorista"? compra en aliexpress. No te olvides que al dolar tenes que sumarle el 65% de impuestos y cuando llegue lo que compres pagas el 50% de lo que supere 50usd en impuestos de importacion
-------------
Les están queriendo sacar a algunos *(por ahora algunos)* exchanges la capacidad de recibir transferencias bancarias. O sea, les cortan a los clientes del exchange el ingreso o extracción de pesos o dólares con el CBU
-------------
Imagínate unos botines del Relámpago Marquinhos
-------------
El banco central tiene que arbitrar bonos para equilibrar mep y cclno es solo dólar bcra.. EDIT: También tienen que ver que no se descalabre senebi por que hay importadores que están yendo justo ahí a buscar
-------------
hacele un whois al dominio y reportalo ante el ISP que corresponda, denunciandolo como estafa/fraude
-------------
Cada Macri es un shotcito de tequila
-------------
16 Años imposible bro, pero un préstam

In [9]:
reddit = pd.read_csv(TEXT_FILE_READ)

def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # row = sorted(row, key=lambda x: (x[1]), reverse=True) # old line
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
                #ent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4)]), ignore_index=True)
                #print(sent_topics_df)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    #sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution']

    # Add original text to the end of the output
    #contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, texts], axis=1)
    return(sent_topics_df)

df_topic_sents_keywords = format_topics_sentences(ldamodel=base_model, corpus=corpus, texts=reddit)

In [10]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
#df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,index,Dominant_Topic,Perc_Contribution,Topic_Keywords,score,id,flair,comms_num,body,comment_parent_id,is_replay,lemma_tokens,body_preprocessing
0,0,12.0,0.2531,"él, recordar, pegar, único, robar, barrio, pen...",1,hgw77qe,Política🏛️,0,"Iba a decir, bue si lo saco de su bolsillo... ...",q9imco,False,"['bue', 'saco', 'bolsillo', 'recorder', 'hdp',...",bue saco bolsillo recorder hdp mantener alcanz...
1,1,2.0,0.2598,"perro, nik, meme, gobierno, explicar, it, teni...",1,hgw7dci,Política🏛️,0,Se volvio un meme el bot del dolar?,hgw666m,True,"['volvio', 'meme', 'dolar']",volvio meme dolar
2,2,26.0,0.3279,"falacia, decir, gratis, k, joda, país, mandar,...",1,hgw69er,Humor:snoo_joy:,0,Este Esteban Lamothe estaba en la ficción de u...,q9i4uj,False,"['ester', 'lamothe', 'ficción', 'villo', 'acá'...",ester lamothe ficción villo acá comedia políti...
3,3,6.0,0.4349,"pobre, servir, él, comida, tenés, culpa, onda,...",1,hgw6zvd,Meme💩,0,Eso porque son todos útos chupa bija.. Venga e...,hgw2528,True,"['úto', 'chupa', 'bijo', 'venir', 'ban', 'nedf...",úto chupa bijo venir ban nedflanducacion
4,4,21.0,0.8089,"re, cabeza, él, morir, pibes, papa, hambre, ri...",1,hgw24ns,Meme💩,0,mas verso burgués que Maximo no hay. Es la rep...,q9hut7,False,"['verso', 'burgués', 'maximo', 'representación']",verso burgués maximo representación
5,5,12.0,0.3722,"él, recordar, pegar, único, robar, barrio, pen...",1,hgw38x8,Meme💩,0,Ayudar con comida? Na mejor unos afiches a tod...,q9hut7,False,"['ayudar', 'comida', 'na', 'afich', 'color']",ayudar comida na afich color
6,6,17.0,0.3461,"the, of, necesitar, you, fácil, and, to, creer...",1,hgw2rml,Meme💩,1,¿Por qué si es un cerdo tiene 6 patas?,q9hut7,False,"['cerdo', 'pata']",cerdo pata
7,7,22.0,0.3515,"ah, /s, peronista, paso, x200b, mes, cagar, él...",1,hgw3wei,Meme💩,0,"Mira, soy tan capitalista que por 15 mil pesos...",q9hut7,False,"['mira', 'capitalisto', 'pesos', 'corrijo', 'c...",mira capitalisto pesos corrijo color
8,8,24.0,0.4082,"milei, pasar, debate, votar, voto, mujer, izqu...",1,hgw78bv,Meme💩,0,Swinetaur libertario de Darkest Perónia. Ruin ...,q9hut7,False,"['swinetaur', 'libertario', 'darkest', 'peróni...",swinetaur libertario darkest perónia ruin come...
9,9,23.0,0.4081,"él, foto, ver, libertad, sacar, feriado, tomar...",1,hgw6rim,Meme💩,0,como no pueden contra elllll. lo ensucian vamo...,q9hut7,False,"['elllll', 'ensuciar', 'milie', 'bastar', 'k']",elllll ensuciar milie bastar k


In [11]:
df_dominant_topic.to_csv(TEXT_SAVE_FILE, index=False)