In [69]:
import os
import json
import pandas as pd
import pickle
import string
import re
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 

from IPython.display import clear_output

refs:
* https://www.geeksforgeeks.org/fuzzywuzzy-python-library/

python levenshtein installation on windows is not straightforward: see this link: 
* https://stackoverflow.com/questions/13200330/how-to-install-python-levenshtein-on-windows
* or simply do: 
```python
conda install -c conda-forge python-levenshtein
```

In [2]:
path = "../outputs/"

### get tweets

In [12]:
main_inputs = "../../inputs/"
inputs = '../inputs/'

In [13]:
main_inputs

'../../inputs/'

In [14]:
data_path  = main_inputs + 'resultado.csv.bz2'
data = pd.read_csv(data_path, header = None) 

In [15]:
data.columns = ["date", "user", "message", "0", "1", "2", "3", "4", "class", "location"]

In [16]:
data['date'] = pd.to_datetime(data['date'])

In [17]:
data = data[data['date'] > '2016-01-01 00:00:01']

#### pickle

In [None]:
data = pickle.load(open(resultado, 'rb'))

# Levenshtein distance

We are going to use WRatio to measure word distance.
* WRatio handles lower and upper case, and also punctuation.
* PartialRatio handles punctuation.
* Ratio is the simplest option.

## test

In [40]:
test1 = "xiiii, dores no corpo podem indicar contração do vírus Chikungunya! Mas gás de xisto é ok"
test2 = "aedes aegypti pode portar 4 tipos distintos de vírus, tal como xikungunya"
test3 = "Chikungunya Chicungunya Chicungunia Chicungonia Chicongonia xicongonia xicogonia xicara chicon"

In [46]:
text = test3.lower().split()
distances_dict = {}
for word in text:
    if word.startswith("chi") or word.startswith("xi"):
        if word not in distances_dict.keys():
            edit_distance = fuzz.WRatio(word, "chikungunya")
            if edit_distance > 40:
                distances_dict[word] = fuzz.WRatio(word, "chikungunya")

In [47]:
distances_dict

{'chikungunya': 100,
 'chicungunya': 91,
 'chicungunia': 82,
 'chicungonia': 73,
 'chicongonia': 64,
 'xicongonia': 48,
 'chicon': 60}

## with data

In [36]:
data.head()

Unnamed: 0,date,user,message,0,1,2,3,4,class,location
6724589,2017-01-01 00:02:07,raulgloger,@victorprusch_ ba nem me fala !! Que zika esse...,0.038131,0.381141,0.0,0.239816,0.340912,2,"{u'd': 1517412736274, u'v': 1}"
6724590,2017-01-01 00:08:51,sete1lagoas8,"#ginecologista em Sete Lagoas, Gestantes e Ado...",0.080857,0.030312,0.424166,0.111154,0.35351,4,"{u'city': u'Sete Lagoas', u'country': u'Brasil..."
6724591,2017-01-01 00:11:40,MGsites,"#ginecologista em Sete Lagoas, Gestantes e Ado...",0.073776,0.027657,0.387017,0.205092,0.306459,4,"{u'city': u'Uberlandia', u'country': u'Brasil'..."
6724592,2017-01-01 00:15:03,andrefm,"RT @ikmkoji: RT g1 ""Ministério da Saúde divulg...",0.246583,0.149596,0.037452,0.041049,0.52532,3,"{u'city': u'Guarulhos', u'country': u'Brasil',..."
6724593,2017-01-01 00:17:27,taeminhoca,RT @thejungguk: meu 2016 foi mais parado que f...,0.550304,0.052972,0.043543,0.117504,0.235677,1,"{u'd': 1517412736465, u'v': 1}"


In [37]:
texts = list(data['message'])

In [38]:
len(texts)

247357

chikungunya
* reference: https://portalarquivos2.saude.gov.br/images/pdf/2018/agosto/21/Publicacao-BE-2018-SE-30.pdf
* peak year = 2017 
* peak week = week 17 = 23-30/4/2017
* occurrence timeframe = 1-33 = 1/1 a 20/8

In [34]:
data = data[data['date'] > '2017-01-01 00:00:01']
data = data[data['date'] < '2017-08-21 00:00:01']

In [147]:
percentil = int(len(texts)/100)
distances_dict = {}
count = 0
for text in texts:
    count += 1
    if count % percentil*10 == 0: print(int(count/percentil),'% done')
    if count % (percentil-1) == 0: clear_output()
    for word in text.lower().split():
        if word.startswith("chi") or word.startswith("xi"):
            word = word.strip(string.punctuation) 
            word = re.split('[\.\,\;\\\s\/\:\-\(\&\?\!\"]', word)[0]
            word = word.encode('ascii', 'ignore').decode('ascii') # remove emoticons
            #print(word)
            if word not in distances_dict.keys():
                if len(word)>7:
                    edit_distance = fuzz.WRatio(word, "chikungunya")
                    if edit_distance > 55:
                        distances_dict[word] = edit_distance
    

100 % done


In [148]:
distances = sorted(distances_dict.items(), reverse=True, key=lambda kv: kv[1])

In [150]:
len(distances)

141

In [149]:
distances

[('chikungunya', 100),
 ('chickungunya', 96),
 ('chinkungunya', 96),
 ('chikungunyua', 96),
 ('chikungunhya', 96),
 ('chikunguniya', 96),
 ('chikunguya', 95),
 ('chikunguny', 95),
 ('chikununya', 95),
 ('chikugunya', 95),
 ('chikungnya', 95),
 ('chikungunyada', 92),
 ('chicungunya', 91),
 ('chikungunha', 91),
 ('chikungunia', 91),
 ('chikongunya', 91),
 ('chikunguyna', 91),
 ('chikugungya', 91),
 ('chinkugunya', 91),
 ('chinungunya', 91),
 ('chigungunya', 91),
 ('chikingunya', 91),
 ('chikengunya', 91),
 ('chinkunguya', 91),
 ('chilungunya', 91),
 ('chikungua', 90),
 ('chikungya', 90),
 ('chikungun', 90),
 ('chickungunha', 87),
 ('chickungunia', 87),
 ('chikanguniya', 87),
 ('chinkungunha', 87),
 ('chikunguinha', 87),
 ('chikinguniya', 87),
 ('chincungunya', 87),
 ('chinkongunya', 87),
 ('chikungoonya', 87),
 ('chicugunya', 86),
 ('chikungyia', 86),
 ('chicungnya', 86),
 ('chicunguya', 86),
 ('chikugunia', 86),
 ('chikunguia', 86),
 ('chikunguha', 86),
 ('chigugunya', 86),
 ('chikugunh