# Data cleaning 

In [3]:
import pandas as pd
import os

import numpy as np

In [76]:
from use_cases.contributions import create_table_contributions, to_sql as con_to_sql
# 
from use_cases.emotions import create_table_emotions, to_sql as emo_to_sql 
from use_cases.personal_needs import create_table_personal_needs, to_sql as per_need_to_sql
from use_cases.dialogues import create_table_dialogues
from use_cases.country_needs import create_table_country_needs, to_sql as count_need_to_sql
from use_cases.persons import create_table_persons
from use_cases.individuals import create_table_individuals
from use_cases.pairs import create_pair_token
from use_cases.persons_dialogues import create_table_persons_dialogues

import use_cases.utils.textools as tt


# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None 

# Reload imported modules automatically before each cell executes, so edits to the
# project modules imported above are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading Survey Data

In [77]:
# Earlier debugging variant, kept for reference: read './data/BBDD_Dialogos.csv' with
# pd.read_csv(survey_path, low_memory=False), optionally adding chunksize=100 and
# taking only the first chunk.

# Dialogue survey workbook. Passing a list of sheet names makes pandas return a
# {sheet name: DataFrame} dict from a single read_excel call, so the workbook is not
# re-opened and re-parsed once per sheet.
survey_path = './data/BBDD_Dialogos.xlsx'
dialogue_sheets = pd.read_excel(
    survey_path,
    sheet_name=['Hoja1', 'Filtro Diálogos', 'Filtro Personas'],
)
survey = dialogue_sheets['Hoja1']
filter_dialogue = dialogue_sheets['Filtro Diálogos']
filter_person = dialogue_sheets['Filtro Personas']

# Individual consultation workbook, online variant (per the file name).
ind_online_survey_path = './data/Consulta_Individual_online_v2.xlsx'
ind_online_survey = pd.read_excel(ind_online_survey_path, sheet_name='Sheet1')

# Individual consultation workbook, 'CONSULTAS' sheet.
ind_survey_path = './data/Base_final_digitación_Consultas.xlsx'
ind_survey = pd.read_excel(ind_survey_path, sheet_name='CONSULTAS')

In [6]:
# Preview two randomly drawn rows of the person filter sheet.
filter_person.sample(n=2)

Unnamed: 0,LP_RUN1,es_rut,es_reg_civil,Fallecido,Valido
24302,JkGgXeqedAjR,1,1,0,1
67916,52y5ydej+WPl,1,1,0,1


## Processing Use Cases
1. Emotions
2. Contributions
3. Country needs
4. Personal/family needs

### Persons Table (Dialogue)

In [7]:
%%time
# Build the persons table from the dialogue survey sheet ('Hoja1') and the
# 'Filtro Personas' sheet. Slow step: the recorded run took about 3min 21s wall time.
persons_table = create_table_persons(survey, filter_person)

CPU times: user 3min 20s, sys: 696 ms, total: 3min 21s
Wall time: 3min 21s


In [8]:
# Preview two randomly drawn rows of the persons table.
persons_table.sample(n=2)

Unnamed: 0,id,diag_id,age,sex,level,comuna_id,age_range
6287,qdsleups/7ng,enc_u_4562236951337321785,54,f,Media completa. Técnica incompleta,2301,45-60
8397,xie5ox64r1yq,enc_u_4586421525012801114,52,f,Técnica completa,5703,45-60


### Dialogues

In [9]:
%%time
# Build the dialogues table from the dialogue survey sheet ('Hoja1') and the
# 'Filtro Diálogos' sheet.
dialogues_table = create_table_dialogues(survey, filter_dialogue)

CPU times: user 2.79 s, sys: 23.8 ms, total: 2.81 s
Wall time: 2.84 s


In [10]:
# Preview two randomly drawn rows of the dialogues table.
dialogues_table.sample(n=2)

Unnamed: 0,id,date,init_time,end_time,location,address,comuna_id,n_members,group_name,valid
11279,enc_u_4586158230802625534,2020-02-25 00:00:00,18:00:00,20:41:00,sede social jj.vv. salitre bajo,monjitas 480,2101,8,usuarios de oficina movil de bienes nacionales,True
9838,enc_u_4553717923124475874,2020-01-13 00:00:00,18:30:00,19:30:00,sede,la espiga 2844,13201,5,usuarios de oficina movil de bienes nacionales,True


### Persons Dialogues

In [11]:
%%time
# Split the persons table into an updated persons table plus a person<->dialogue
# link table (columns `person_id`, `diag_id` in the preview output).
# NOTE(review): `persons_table` is rebound to the function's first return value, so
# re-running this cell on its own feeds the already-processed table back in —
# confirm that create_table_persons_dialogues tolerates that, or re-run from the
# "Persons Table" cell.
persons_table, persons_dialogues_table = create_table_persons_dialogues(persons_table)

CPU times: user 67.3 ms, sys: 21 µs, total: 67.4 ms
Wall time: 65.8 ms


In [12]:
# Preview two randomly drawn rows of the person<->dialogue link table.
persons_dialogues_table.sample(n=2)

Unnamed: 0,person_id,diag_id
6464,gltmvqkc2oxo,enc_u_4611224809295305281
9008,smfppraspymz,enc_u_3172042


### Individuals

In [78]:
%%time
# Build the individuals table from the two individual-consultation frames:
# the online workbook first, then the 'CONSULTAS' sheet.
individuals_table = create_table_individuals(ind_online_survey, ind_survey)

CPU times: user 12 s, sys: 59.7 ms, total: 12.1 s
Wall time: 12 s


In [81]:
# Preview two randomly drawn rows of the individuals table.
individuals_table.sample(n=2)

Unnamed: 0,id,date,age,comuna_id,level,age_range,online


In [82]:
# `out_dir` is otherwise first assigned in the "Saving `.csv`" section further down,
# so on a fresh kernel (Restart & Run All) this cell used to fail with a NameError.
# Define and create the output directory here so the cell is self-contained.
out_dir = './out'
os.makedirs(out_dir, exist_ok=True)

# Write the individuals table without the DataFrame index column.
individuals_table.to_csv(os.path.join(out_dir, 'individuals.csv'), index=False)

### Emotions

In [15]:
%%time
# Build the emotions table. Slow step: the recorded run took about 3min 16s wall time.
# NOTE(review): the second argument here is the path string `ind_survey_path`, whereas
# the personal-needs and contributions cells pass the loaded `ind_survey` frame in the
# same position — confirm which one create_table_emotions expects.
emotions_table = create_table_emotions(survey, ind_survey_path, ind_online_survey)

CPU times: user 2min 58s, sys: 16.3 s, total: 3min 15s
Wall time: 3min 16s


In [16]:
# Preview two randomly drawn rows of the emotions table.
emotions_table.sample(n=2)

Unnamed: 0,id,diag_id,ind_id,name,name_tokens,macro,exp,exp_tokens,is_online
2142,51673,ENC_U_4543957710998990369,,temor,[temor],temor,futuro incierto\nno hay solucion,"[futuro, incierto, solucion]",True
11694,24084,ENC_U_4523401707314210356,,decepcion,[decepcion],decepcion,me decepcionan algunos comentarios o reflexion...,"[decepcionan, comentarios, reflexiones, person...",True


### Emotion Pair

In [17]:
%%time
# Derive token pairs from the `exp_tokens` column of the emotions table.
# NOTE(review): 'emotion_id' shows up as a column of the result in the preview output;
# presumably it carries the id of the originating emotions row — confirm in
# use_cases.pairs.
emotion_pair = create_pair_token(emotions_table, 'exp_tokens', 'emotion_id')

CPU times: user 14.8 s, sys: 88 ms, total: 14.9 s
Wall time: 14.9 s


In [18]:
# Preview three randomly drawn rows of the emotion token-pair table.
emotion_pair.sample(n=3)

Unnamed: 0,id,emotion_id,word_1,word_2
215439,215440,39094,desastres,pais
244736,244737,44661,pueda,ser
48239,48240,8636,familia,obligacion


### Country Needs

In [21]:
%%time
# Build the country-needs table. Slowest step: the recorded run took about 6min 9s
# wall time.
# NOTE(review): unlike the other builders, this call receives the loaded `ind_survey`
# frame AND both individual-workbook paths — confirm against the signature of
# create_table_country_needs that this mix of frames and paths is intended.
country_needs = create_table_country_needs(survey, ind_survey, ind_survey_path, ind_online_survey_path)

CPU times: user 5min 42s, sys: 26.8 s, total: 6min 9s
Wall time: 6min 9s


In [30]:
# Preview two randomly drawn rows of the country-needs table.
country_needs.sample(n=2)

Unnamed: 0,id,diag_id,ind_id,name,name_tokens,macro,exp,exp_tokens,role,role_tokens,actor,priority,is_online
20187,93955,enc_u_3222804,,agua,[agua],,,,,,"privados, presidente, congreso",,False
14378,108318,,3301389.0,educacion gratuita,"[educacion, gratuita]",,,,,,,,False


### Country Need Role Pair

In [31]:
%%time
# Derive token pairs from the `role_tokens` column of the country-needs table;
# the result exposes a `country_need_id` column (see the preview output).
country_need_role_pair = create_pair_token(country_needs, 'role_tokens', 'country_need_id')

CPU times: user 18.7 s, sys: 71.8 ms, total: 18.7 s
Wall time: 18.8 s


In [32]:
# Preview two randomly drawn rows of the country-need role token pairs.
country_need_role_pair.sample(n=2)

Unnamed: 0,id,country_need_id,word_1,word_2
278270,278271,93817,crear,proyectos
252302,252303,87726,transforme,pais


### Country Need Explanation Pair

In [34]:
%%time
# Derive token pairs from the `exp_tokens` column of the country-needs table;
# the result exposes a `country_need_id` column (see the preview output).
country_need_exp_pair = create_pair_token(country_needs, 'exp_tokens', 'country_need_id')

CPU times: user 19 s, sys: 48.6 ms, total: 19 s
Wall time: 19 s


In [35]:
# Preview two randomly drawn rows of the country-need explanation token pairs.
country_need_exp_pair.sample(n=2)

Unnamed: 0,id,country_need_id,word_1,word_2
384572,384573,87613,desinformacion,adquirirlas
214993,214994,46621,bajar,valores


### Family/Personal Needs

In [45]:
%%time
# Build the personal/family needs table from the dialogue survey and both loaded
# individual-consultation frames. Slow step: about 3min 37s wall time in the recorded run.
personal_needs = create_table_personal_needs(survey, ind_survey, ind_online_survey)

CPU times: user 3min 19s, sys: 18.1 s, total: 3min 37s
Wall time: 3min 37s


In [48]:
# Preview three randomly drawn rows of the personal/family needs table.
personal_needs.sample(n=3)

Unnamed: 0,id,diag_id,ind_id,name,name_tokens,exp,exp_tokens,macro,priority,is_online
11,66690,,2740587.0,estabilidad laboral y economica.,"[estabilidad, laboral, economica]",no cuento con trabajo.,"[cuento, trabajo]",estabilidad laboral y economica.,2,False
143,53681,,2822942.0,economicas,[economicas],el dinero no es suficiente considerando los ac...,"[dinero, suficiente, considerando, actuales, g...",economicas,1,False
7738,41054,ENC_U_4556171544345385351,,que adultos mayores puedan tener mejores pensi...,"[adultos, mayores, puedan, tener, mejores, pen...",familias donde hay adultos mayores muchas vece...,"[familias, adultos, mayores, muchas, veces, en...",que adultos mayores puedan tener mejores pensi...,0,False


### Family/Personal Need Pair

In [49]:
%%time
# Derive token pairs from the `exp_tokens` column of the personal-needs table;
# the result exposes a `personal_need_id` column (see the preview output).
personal_need_pair = create_pair_token(personal_needs, 'exp_tokens', 'personal_need_id')

CPU times: user 17.3 s, sys: 48.3 ms, total: 17.3 s
Wall time: 17.3 s


In [50]:
# Preview three randomly drawn rows of the personal-need token pairs.
personal_need_pair.sample(n=3)

Unnamed: 0,id,personal_need_id,word_1,word_2
425731,425732,78124,salir,region
302944,302945,50108,preocupados,gran
354576,354577,61489,horrible,sistema


### Contributions

In [51]:
%%time
# Build the contributions table from the dialogue survey and both loaded
# individual-consultation frames. About 1min 27s wall time in the recorded run.
contributions = create_table_contributions(survey, ind_survey, ind_online_survey)

CPU times: user 1min 19s, sys: 7.61 s, total: 1min 26s
Wall time: 1min 27s


In [54]:
# Preview three randomly drawn rows of the contributions table.
contributions.sample(n=3)

Unnamed: 0,id,diag_id,ind_id,text,tokens,macro,is_online
2529,69149,,3113163.0,informarse de la situacion trabajando como has...,"[informarse, situacion, trabajando, hoy, aunqu...",informarse de la situacion trabajando como has...,False
11661,51633,ENC_U_4523071849223379090,,respuesta sin completar,"[respuesta, completar]",respuesta sin completar,False
4076,57372,ENC_U_4553396672352500132,,.,,.,False


## Saving `.csv`

In [55]:
# Directory that receives every exported .csv/.sql file; created on first use and
# left untouched when it already exists.
out_dir = './out'
os.makedirs(name=out_dir, exist_ok=True)

In [60]:
# Output file name -> table. Each table is written to `out_dir` as CSV without the
# DataFrame index column, in the order listed here.
csv_exports = {
    'contributions.csv': contributions,
    'personal_need_pair.csv': personal_need_pair,
    'personal_needs.csv': personal_needs,
    'country_need_exp_pair.csv': country_need_exp_pair,
    'country_need_role_pair.csv': country_need_role_pair,
    'country_needs.csv': country_needs,
    'emotion_pair.csv': emotion_pair,
    'emotions.csv': emotions_table,
    'individuals.csv': individuals_table,
    'dialogues.csv': dialogues_table,
    'persons.csv': persons_table,
    'persons_dialogues.csv': persons_dialogues_table,
}
for csv_name, table in csv_exports.items():
    table.to_csv(os.path.join(out_dir, csv_name), index=False)

## Saving `.sql`

In [61]:
# (writer, table, file name) triples: each use-case module ships its own `to_sql`
# writer (imported at the top under an alias), called as writer(table, target_path).
sql_exports = [
    (con_to_sql, contributions, 'contributions.sql'),
    (emo_to_sql, emotions_table, 'emotions.sql'),
    (count_need_to_sql, country_needs, 'country_needs.sql'),
    (per_need_to_sql, personal_needs, 'personal_need.sql'),
]
for write_sql, table, sql_name in sql_exports:
    write_sql(table, os.path.join(out_dir, sql_name))

# TODO: delete everything below (scratch cells)

In [53]:
# NOTE(review): `emo_frame` is never assigned in the visible part of this notebook, so
# this cell only works with leftover kernel state.
# Run tt.clean_alt_list over both token columns, then replace the missing-value
# markers ('nr', 'nan', 'NR', 'NaN', NaN) with empty strings.
for token_column in ('name_tokens', 'exp_tokens'):
    emo_frame[token_column] = emo_frame[token_column].apply(tt.clean_alt_list)
missing_markers = dict.fromkeys(('nr', 'nan', 'NR', 'NaN', np.nan), '')
emo_frame = emo_frame.replace(missing_markers)

In [54]:
# NOTE(review): `personal_need_frame` is never assigned in the visible part of this
# notebook, so this cell only works with leftover kernel state.
# Run tt.clean_alt_list over both token columns, cast `priority` to int, then replace
# the missing-value markers ('nr', 'nan', 'NR', 'NaN', NaN) with empty strings.
for token_column in ('name_tokens', 'exp_tokens'):
    personal_need_frame[token_column] = personal_need_frame[token_column].apply(tt.clean_alt_list)
personal_need_frame['priority'] = personal_need_frame['priority'].astype(int)
missing_markers = dict.fromkeys(('nr', 'nan', 'NR', 'NaN', np.nan), '')
personal_need_frame = personal_need_frame.replace(missing_markers)

In [55]:
# NOTE(review): `con_frame` is never assigned in the visible part of this notebook, so
# this cell only works with leftover kernel state.
# Run tt.clean_alt_list over the token column, then replace the missing-value markers
# ('nr', 'nan', 'NR', 'NaN', NaN) with empty strings.
cleaned_tokens = con_frame['tokens'].apply(tt.clean_alt_list)
con_frame['tokens'] = cleaned_tokens
missing_markers = dict.fromkeys(('nr', 'nan', 'NR', 'NaN', np.nan), '')
con_frame = con_frame.replace(missing_markers)

In [67]:
# NOTE(review): `coun_needs_frame` is never assigned in the visible part of this
# notebook, so this cell only works with leftover kernel state.
# Order matters here: `priority` is converted first, the missing-value markers are
# blanked next, and only then are the token columns passed through tt.clean_alt_list.
coun_needs_frame['priority'] = coun_needs_frame['priority'].apply(tt.str_to_int)
missing_markers = dict.fromkeys(('nr', 'nan', 'NR', 'NaN', np.nan), '')
coun_needs_frame = coun_needs_frame.replace(missing_markers)
for token_column in ('name_tokens', 'exp_tokens', 'role_tokens'):
    coun_needs_frame[token_column] = coun_needs_frame[token_column].apply(tt.clean_alt_list)


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [69]:
# NOTE(review): scratch cell. `con_frame` is not assigned in the visible notebook, and
# the file is written into ./data rather than the `out_dir` used by the "Saving `.sql`" cell.
con_to_sql(con_frame,'./data/contributions.sql' )

In [70]:
# NOTE(review): scratch cell. `emo_frame` is not assigned in the visible notebook, and
# the file is written into ./data rather than the `out_dir` used by the "Saving `.sql`" cell.
emo_to_sql(emo_frame,'./data/emotions.sql')

In [73]:
# NOTE(review): scratch cell. `coun_needs_frame` is not assigned in the visible notebook,
# and the file is written into ./data rather than the `out_dir` used by the "Saving `.sql`" cell.
count_need_to_sql(coun_needs_frame,'./data/country_needs.sql')

In [74]:
# NOTE(review): scratch cell. `personal_need_frame` is not assigned in the visible notebook,
# and the file is written into ./data rather than the `out_dir` used by the "Saving `.sql`" cell.
per_need_to_sql(personal_need_frame,'./data/personal_need.sql')