# Grouping similar words

In [1]:
%%time
from difflib import get_close_matches
from core.utils import get_closest_vector

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

CPU times: user 798 ms, sys: 1.02 s, total: 1.82 s
Wall time: 844 ms


### Reading data

In [2]:
def clean_alt_list(list_):
    list_ = str(list_)
    list_ = list_.replace('[', '')
    list_ = list_.replace(']', '')
    list_ = list_.replace("'", '')
    list_ = list_.split(',')
    return list_

In [3]:
emotions = pd.read_csv('./data/emotions.csv', low_memory=False)

In [4]:
emotions['name_tokens'] = emotions['name_tokens'].apply(lambda x: clean_alt_list(x))

### Identifying emotions

The final `emo_vector_unicode` has a lot of words which are not related with emotions or sentiments. This is because some entries in the survey was filled with free-text. In attemp to capture all posible emotions we splitted that free text in singular words (assumming that all of them could be a potencial emotion).

Now It is time to discriminate them. We use [Spanish Emotion Lexicon](http://www.cic.ipn.mx/~sidorov/#SEL)(SEL) to filter words.

In [5]:
SEL_df = pd.read_excel('./files/SEL.xlsx', engine='openpyxl')
standard_emotions = SEL_df['Palabra']
SEL_df['Categoría'].unique()

array(['Alegría', 'Enojo', 'Miedo', 'Repulsión', 'Sorpresa', 'Tristeza'],
      dtype=object)

In order to find matches between survey words and the standard ones, we use `get_close_matches` which compare words using similarity criteria.

In [6]:
%%time
emo_vector_matched = []
final_category = []
# iterate over the list of tokens
for word in emotions['name_tokens']:
    closest = []
    categories = []
    # for each word within the list (some people wrote sentences instead of a single word)
    for w in word:
        # get the closest emotion from SEL dictonary
        closest_word = get_close_matches(w.strip(), standard_emotions,n=1, cutoff=0.7)
        if closest_word != []:
            # if we match some emotion then save its category
            cat = SEL_df[SEL_df['Palabra']==closest_word[0]]['Categoría']
            categories.append(cat.values[0])
            closest.append(closest_word[0])
        else:
            continue
    
    # at the end of the process... check if the response has a category
    if closest == []:
        final_category.append('')        
        emo_vector_matched.append('')
    else:
        final_category.append(categories[0])        
        emo_vector_matched.append(closest[0])

CPU times: user 18min 41s, sys: 276 ms, total: 18min 41s
Wall time: 18min 43s


In [7]:
print('Non categorized values: {:.1f} %'.format(emotions[emotions['macro'] == ''].shape[0]/emotions.shape[0]*100))

Non categorized values: 0.0 %


In [8]:
emotions['macro'] = final_category

In [10]:
emotions.sample(3)

Unnamed: 0,id,diag_id,ind_id,name,name_tokens,macro,exp,exp_tokens,is_online
64318,64318,,3193551.0,cansancio,[cansancio],,el exceso de trabajo,"['exceso', 'trabajo']",False
56998,56998,ENC_U_3183683,,inseguridad,[inseguridad],Miedo,porque no hay respaldo de la autoridad.,"['respaldo', 'autoridad']",True
51689,51689,ENC_U_4543348788894499680,,inseguridad,[inseguridad],Miedo,"es terrible sentarse inseguro en todas partes,...","['terrible', 'sentarse', 'inseguro', 'todas', ...",True


In [11]:
emotions.to_csv('./out/emotions.csv', index=False)

## TO CSV

In [42]:
%%time
emotions = emotions.to_csv('./out/emotions.csv', index=False)

CPU times: user 323 ms, sys: 28 ms, total: 351 ms
Wall time: 352 ms
