# Grouping similar words

In [1]:
%%time
from difflib import get_close_matches
from core.utils import get_closest_vector

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

CPU times: user 530 ms, sys: 1.13 s, total: 1.66 s
Wall time: 482 ms


### Reading data

In [2]:
def clean_alt_list(list_):
    list_ = str(list_)
    list_ = list_.replace('[', '')
    list_ = list_.replace(']', '')
    list_ = list_.replace("'", '')
    list_ = list_.split(',')
    return list_

In [3]:
emotions = pd.read_csv('./data/emotions_table.csv', low_memory=False)

In [4]:
emotions['name_tokens'] = emotions['name_tokens'].apply(lambda x: clean_alt_list(x))

### Identifying emotions

The final `emo_vector_unicode` has a lot of words which are not related with emotions or sentiments. This is because some entries in the survey was filled with free-text. In attemp to capture all posible emotions we splitted that free text in singular words (assumming that all of them could be a potencial emotion).

Now It is time to discriminate them. We use [Spanish Emotion Lexicon](http://www.cic.ipn.mx/~sidorov/#SEL)(SEL) to filter words.

In [5]:
SEL_df = pd.read_excel('./files/SEL.xlsx', engine='openpyxl')
standard_emotions = SEL_df['Palabra']
SEL_df['Categoría'].unique()

array(['Alegría', 'Enojo', 'Miedo', 'Repulsión', 'Sorpresa', 'Tristeza'],
      dtype=object)

In order to find matches between survey words and the standard ones, we use `get_close_matches` which compare words using similarity criteria.

In [7]:
%%time
emo_vector_matched = []
final_category = []
# iterate over the list of tokens
for word in emotions['name_tokens']:
    closest = []
    categories = []
    # for each word within the list (some people wrote sentences instead of a single word)
    for w in word:
        # get the closest emotion from SEL dictonary
        closest_word = get_close_matches(w.strip(), standard_emotions,n=1, cutoff=0.7)
        if closest_word != []:
            # if we match some emotion then save its category
            cat = SEL_df[SEL_df['Palabra']==closest_word[0]]['Categoría']
            categories.append(cat.values[0])
            closest.append(closest_word[0])
        else:
            continue
    
    # at the end of the process... check if the response has a category
    if closest == []:
        final_category.append('')        
        emo_vector_matched.append('')
    else:
        final_category.append(categories[0])        
        emo_vector_matched.append(closest[0])

CPU times: user 14min 28s, sys: 43.1 ms, total: 14min 28s
Wall time: 14min 28s


In [40]:
print('Non categorized values: {:.1f} %'.format(emotions[emotions['macro'] == ''].shape[0]/emotions.shape[0]*100))

Non categorized values: 4.3 %


In [31]:
emotions['macro'] = final_category

In [41]:
emotions.sample(3)

Unnamed: 0,id,diag_id,ind_id,name,name_tokens,macro,exp,exp_tokens,is_online,source_id
31277,31277,ENC_U_4602875782409625234,,tristeza,[tristeza],Tristeza,por las personas afectadas y muertas en las pr...,"['personas', 'afectadas', 'muertas', 'protestas']",0,
38750,38750,ENC_U_4562196059036899654,,rabia,[rabia],Enojo,se mostro lo malo que pasaba y no el movimient...,"['mostro', 'malo', 'pasaba', 'movimiento', 'si']",0,
35406,35406,ENC_U_4518001924212637009,,preocupacion/incertidumbre,"[preocupacion, incertidumbre]",Miedo,"ante la falta de soluciones, ver pasar el tiem...","['falta', 'soluciones', 'ver', 'pasar', 'tiemp...",0,


## TO CSV

In [42]:
%%time
emotions = emotions.to_csv('./out/emotions.csv', index=False)

CPU times: user 323 ms, sys: 28 ms, total: 351 ms
Wall time: 352 ms
