# Grouping similar words

In [1]:
%%time
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.wrappers import FastText
from difflib import get_close_matches
from gensim import corpora, models
from nltk import SnowballStemmer

from core.utils import get_closest_vector
from sklearn.cluster import DBSCAN

import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import numpy as np
import unidecode
import umap
import nltk

%load_ext autoreload
%autoreload 2

CPU times: user 19 s, sys: 829 ms, total: 19.9 s
Wall time: 19.4 s


### Reading fast-text bin file (Memory Expensive)

In [2]:
# Relative path to the fastText binary model loaded in the next cell.
# NOTE(review): "sbwc" presumably refers to the Spanish Billion Words Corpus embeddings — confirm.
wordvectors_file = './files/fasttext-sbwc.bin'

In [3]:
%%time
# Load the full fastText binary model into memory. This is the expensive step
# of the notebook: the recorded run below took about 4.5 minutes of wall time.
# NOTE(review): `gensim.models.wrappers` appears to have been removed in gensim 4.x —
# confirm the pinned gensim version before re-running on a fresh environment.
wordvector_bin = FastText.load_fasttext_format(wordvectors_file)

CPU times: user 4min 18s, sys: 6.72 s, total: 4min 24s
Wall time: 4min 25s


### Reading data

In [4]:
# Load the emotion entries; the `emo_id` and `name` columns are extracted in the next cell.
emotions = pd.read_excel('./data/emotions.xlsx', engine='openpyxl')

In [5]:
# Two parallel arrays taken from the same frame: position i in `emo_ids`
# is the identifier of the word at position i in `emo_vector`.
emo_ids = emotions['emo_id'].values
emo_vector = emotions['name'].values

In [6]:
# Number of entries read from the `name` column.
len(emo_vector)

4076

### Initial Clustering

In [7]:
def cluster_vector(emo_vector, eps=0.5, min_samples=2, random_state=None):
    """Embed words, project them to 2-D with UMAP and cluster them with DBSCAN.

    The words are looked up in the module-level ``wordvector_bin`` model, the
    resulting vectors are reduced with UMAP, and DBSCAN is run on the reduced
    coordinates. An interactive scatter plot (one point per word, coloured by
    cluster label, word shown on hover) is displayed as a side effect.

    Parameters
    ----------
    emo_vector : sequence of str
        Words to embed and cluster.
    eps : float, default 0.5
        DBSCAN neighbourhood radius, measured in the UMAP-reduced space.
    min_samples : int, default 2
        DBSCAN ``min_samples``; the default matches the previously hard-coded value.
    random_state : int or None, default None
        Seed forwarded to ``umap.UMAP``. UMAP is stochastic, so cluster ids
        change between runs unless a seed is given. ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        DBSCAN cluster label for each word, in input order.
    """
    embeddings = wordvector_bin[emo_vector]
    mapper = umap.UMAP(random_state=random_state)
    reduced = mapper.fit_transform(embeddings)
    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(reduced)
    fig = px.scatter(
        x=reduced[:, 0],
        y=reduced[:, 1],
        hover_name=emo_vector,
        color=clustering.labels_,
    )
    fig.show()
    return clustering.labels_

In [24]:
# eps=0.2 overrides the function's default radius of 0.5.
clustering_labels = cluster_vector(emo_vector, eps=0.2)

### Removing unhelpful clusters, normalizing and looking for roots (stemming)

Taking into account the clustering over the UMAP projection, all clusters except 0, 1, 2, 4, 5 and 13 should be removed (this includes the noise label -1).

**Note: cluster numbers can vary between runs, since UMAP is a stochastic algorithm.**

In [26]:
def delete_clusters(partial_emo_vector, clustering_labels, ids, keep_indices=None):
    """Drop every entry whose cluster label is listed in ``keep_indices``.

    NOTE: despite its name, ``keep_indices`` holds the cluster labels to
    REMOVE. The parameter name is kept unchanged for backward compatibility.

    Parameters
    ----------
    partial_emo_vector : array-like
        Words, aligned position by position with ``clustering_labels``.
    clustering_labels : array-like
        Cluster label of each word.
    ids : array-like
        Identifier of each word, aligned with the other two inputs.
    keep_indices : iterable or None, default None
        Cluster labels whose entries are discarded. ``None`` or an empty
        iterable discards nothing.

    Returns
    -------
    tuple of numpy.ndarray
        ``(words, labels, ids)`` restricted to the surviving entries, in the
        original order.
    """
    # Accept plain lists as well as arrays; boolean-mask indexing needs ndarrays.
    partial_emo_vector = np.asarray(partial_emo_vector)
    clustering_labels = np.asarray(clustering_labels)
    ids = np.asarray(ids)

    # A None default avoids sharing one mutable list object between calls.
    labels_to_drop = np.asarray([] if keep_indices is None else list(keep_indices))
    if labels_to_drop.size == 0:
        return partial_emo_vector, clustering_labels, ids

    # One mask for all labels instead of re-filtering the three arrays per label.
    survivors = ~np.isin(clustering_labels, labels_to_drop)
    return partial_emo_vector[survivors], clustering_labels[survivors], ids[survivors]

In [28]:
# Show which cluster labels exist in this run before choosing what to drop.
print(np.unique(clustering_labels))
# Hand-picked labels to discard (every label except 0, 1, 2, 4, 5 and 13).
# These ids are only valid for the clustering run whose labels are printed above.
remove_cluster_num = [-1,3,6,7,8,9,10,11,12,14,15,16]

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]


In [29]:
# Filter words, labels and ids together so the three arrays stay aligned.
# `keep_indices` receives the labels to drop (see `delete_clusters`).
# Note that `clustering_labels` is rebound to its filtered version here.
clean_emo_vector, clustering_labels, emo_ids_pos = delete_clusters(
    partial_emo_vector=emo_vector,
    clustering_labels=clustering_labels,
    ids=emo_ids,
    keep_indices=remove_cluster_num,
)

After this, we apply the `unidecode(word)` function to normalize each string, i.e., to remove accent marks (tildes) and other special characters.

In [30]:
# Transliterate every surviving word to plain ASCII, one output per input word.
emo_vector_unicode = list(map(unidecode.unidecode, clean_emo_vector))

In [31]:
# Distinct raw words vs. distinct words left after cluster removal and ASCII normalization.
n_original = len(np.unique(emo_vector))
n_normalized = len(np.unique(emo_vector_unicode))
print('Original:{}\nAfter Normalization: {}'.format(n_original, n_normalized))

Original:4076
After Normalization: 3758


### Identifying emotions

The final `emo_vector_unicode` has a lot of words which are not related to emotions or sentiments. This is because some entries in the survey were filled in with free text. In an attempt to capture all possible emotions, we split that free text into single words (assuming that any of them could be a potential emotion).

Now it is time to discriminate between them. We use the [Spanish Emotion Lexicon](http://www.cic.ipn.mx/~sidorov/#SEL) (SEL) to filter the words.

In [47]:
# Load the Spanish Emotion Lexicon (SEL) spreadsheet.
SEL_df = pd.read_excel('./files/SEL.xlsx', engine='openpyxl')
# The lexicon words ('Palabra' column) are the candidates for fuzzy matching below.
standard_emotions = SEL_df['Palabra']
# Display the distinct emotion categories present in the lexicon.
SEL_df['Categoría'].unique()

array(['Alegría', 'Enojo', 'Miedo', 'Repulsión', 'Sorpresa', 'Tristeza'],
      dtype=object)

In order to find matches between the survey words and the standard ones, we use `get_close_matches`, which compares words using a similarity criterion.

In [48]:
# Map each SEL word to its category once, instead of scanning the whole
# DataFrame for every survey word. `setdefault` keeps the first category when a
# word appears on several rows, which is what the former `.values[0]` lookup returned.
category_by_word = {}
for sel_word, sel_category in zip(SEL_df['Palabra'], SEL_df['Categoría']):
    category_by_word.setdefault(sel_word, sel_category)

# The fuzzy match is deterministic, so repeated survey words reuse the first result.
match_cache = {}

# Outputs, aligned position by position with `emo_vector_unicode`.
# 'NR' marks a word with no SEL entry at or above the similarity cutoff.
emo_vector_matched = []
categories = []
for word in emo_vector_unicode:
    if word not in match_cache:
        closest = get_close_matches(word, standard_emotions, n=1, cutoff=0.85)
        if closest:
            match_cache[word] = (closest[0], category_by_word[closest[0]])
        else:
            match_cache[word] = ('NR', 'NR')
    matched_word, matched_category = match_cache[word]
    emo_vector_matched.append(matched_word)
    categories.append(matched_category)

In [75]:
# One row per surviving survey word: its id, normalized spelling,
# closest SEL word ('NR' if none) and that word's SEL category.
partial = pd.DataFrame({
    'emo_id': emo_ids_pos,
    'name': emo_vector_unicode,
    'match': emo_vector_matched,
    'macro': categories,
})

In [76]:
# Keep only the words that were matched to an SEL entry, then preview a random sample.
has_sel_match = partial['match'] != 'NR'
emo_selected = partial[has_sel_match]
emo_selected.sample(n=10)

Unnamed: 0,emo_id,name,match,macro
1706,1846,indignados,indignado,Enojo
817,849,comprension,comprensivo,Alegría
92,93,angustiada,angustiado,Miedo
1428,1557,consideracion,consideración,Alegría
1769,1911,aficion,aficionar,Alegría
2554,2739,libres,libre,Alegría
3490,3785,convulsion,convulsión,Enojo
1570,1707,impaciente,impacientar,Enojo
1813,1958,enojar,enojar,Enojo
2564,2749,emocionante,emocionante,Alegría


In [77]:
# Distinct-word counts at each stage: raw input, after cluster removal and
# ASCII normalization, and distinct SEL words that were matched.
n_original = len(np.unique(emo_vector))
n_normalized = len(np.unique(emo_vector_unicode))
n_sel_matched = len(np.unique(emo_selected['match']))
print('Original:{}\nAfter Normalization: {}\nAfter SEL: {}'.format(n_original,
                                                                    n_normalized,
                                                                    n_sel_matched))

Original:4076
After Normalization: 3758
After SEL: 497


### Filtering person–emotion records and exporting results

In [66]:
# Load the person–emotion table; its `emo_id` column is used in the next cell to filter rows.
person_id = pd.read_excel('./data/persons_emotion.xlsx', engine='openpyxl')

In [69]:
# Keep only the person–emotion rows whose emotion survived the SEL filter.
# `emo_selected` has no 'id' column (its columns are emo_id, name, match, macro),
# so the lookup must use 'emo_id'; the former 'id' key raised a KeyError.
person_id_v2 = person_id[person_id['emo_id'].isin(emo_selected['emo_id'])]

In [78]:
# Write the SEL-matched emotions to the working directory, without the DataFrame index.
emo_selected.to_csv('emotions_v2.csv', index=False)

In [79]:
# Write the filtered person–emotion rows to the working directory, without the DataFrame index.
person_id_v2.to_csv('persons_emotion_v2.csv', index=False)