# dengue-anvil_classifier
This classifier uses data from JSON delivered on October 2019 (dengue_fixed.json). To speed up retrieval, we used a filtered database from a MongoDB collection (ufmg_filtered)

In [3]:
import pandas as pd
import time
import re
import json
import os
from pprint import pprint
from IPython.display import clear_output

In [4]:
path = '../../inputs/'
outputs = '../outputs/'

# configure MongoDB

In [5]:
import pymongo
from pymongo import MongoClient
import dns
client = MongoClient()

In [6]:
db = client.twitter
collection_ufmg = db.ufmg_filtered

# retrieve data from Mongo

In [30]:
file_len = 7503436
objects = collection_ufmg.find({})
tweets_list = []
data = pd.DataFrame()
count = 0
for obj in objects:
    date = obj['date']
    if isinstance(date, int):
        date = date/1000
    else: date = time.mktime(time.strptime(date[:10], '%Y-%m-%d'))
    if date >= 1451617260: # 1451617260 = 2016-01-01
        obj['date'] = time.strftime('%Y-%m-%d', time.localtime(date))
        obj['text'] = re.sub(r'\\', '', obj['text'])
        if 'extended_tweet' in obj: 
            obj['extended_tweet'] = re.sub(r'\\', '', obj['extended_tweet']['full_text'])
        tweets_list.append(obj)

    count += 1
    if count % (int(file_len/1000)) == 0:
        
        #here I reset the list to save memory usage
        if tweets_list: data = data.append(tweets_list, ignore_index=True)
        tweets_list = []
        
        clear_output()
        
        frac = count/file_len*100
        print("%.1f" % frac, "% done", sep="")

100.0% done


# filter by date

reference: https://portalarquivos2.saude.gov.br/images/pdf/2018/agosto/21/Publicacao-BE-2018-SE-30.pdf

* zika
    * peak year = 2016 
    * peak week = week 7 = 14-21/2/2016
    * occurrence timeframe = 7-14 = 14/2 a 9/4
* chikungunya
    * peak year = 2017 
    * peak week = week 17 = 23-30/4/2017
    * occurrence timeframe = 10-17 = 5/3 a 29/4
* dengue
    * peak year = 2016, but e used 2018 to add variability from other years
    * occurrence timeframe = 14-21 = 1/4 a 26/5

In [120]:
data = data.copy()

removing 2018 for now because I want to compare with older anvil_input.

notice that on older input, I chose different dates:
```python
data = data.loc[(data['date'] >= '2018-04-04 00:00:01') & (data['date'] <= '2018-05-27 23:59:59')]
```

In [27]:
data = data[(data['date'] >= '2016-02-14 00:00:01') & (data['date'] <= '2016-05-28 23:59:59') \
                 | (data['date'] >= '2017-01-01 00:00:01') & (data['date'] <= '2017-08-20 23:59:59') \
                 | (data['date'] >= '2018-04-01 00:00:01') & (data['date'] <= '2018-05-27 23:59:59') \
    ]
data = data[data['lang'] == 'pt']

In [28]:
print(len(data))
data.tail()

1055095


Unnamed: 0,_id,class_campanha,class_exp_pessoal,class_informacao,class_opiniao,class_parodia,coordinates,date,extended_tweet,lang,location,place,screen_name,text,user_id
1502217,1000931012069724160,0.031485,0.504642,0.111274,0.200243,0.152354,,2018-05-27,,pt,na merda,,Bwliebar_,minha irma brigando pq eu postei que Deus e zi...,2551219417
1502218,1000933497240981507,0.114587,0.10478,0.085701,0.220957,0.473975,,2018-05-27,@Crente_Quadrado preciso conversar com noe sob...,pt,"Paraiso do Norte, Brasil",,bielrobati,@Crente_Quadrado preciso conversar com noe sob...,180148246
1502219,1000934479580590080,0.064492,0.575904,0.160251,0.077597,0.121755,,2018-05-27,,pt,"Sao Leopoldo, Brasil",,CarLouhs,"Mano, que dor no meu pescoco acho que e dengue",949441490790551554
1502220,1000934567551922176,0.036469,0.071213,0.560594,0.117393,0.214331,,2018-05-27,,pt,Costa Rica,,gds506,Zancudo Aedes aegypti genera resistencia a ins...,15686478
1502221,1000934642193780738,0.01315,0.417638,0.247061,0.086871,0.235281,,2018-05-27,,pt,"Sao Paulo, Brasil",,duda_senam,RT @fluminenseraiz: VAI TOMAR NO CU O FLUMINEN...,532225289


In [29]:
data_json = data.to_json(orient='records')

In [31]:
file = os.path.join(outputs,'tweets_filtered.json.bz2')
data = pd.read_json(file)

#### filter by 7 weeks  timespan on each year
Those weeks correspond to a peak for each virus

In [32]:
data = data[(data['date'] >= '2016-02-14 00:00:01') & (data['date'] <= '2016-04-09 23:59:59') \
                 | (data['date'] >= '2017-03-05 00:00:01') & (data['date'] <= '2017-04-29 23:59:59') \
                 | (data['date'] >= '2018-04-01 00:00:01') & (data['date'] <= '2018-05-27 23:59:59') \
    ]
data = data[data['lang'] == 'pt']

In [45]:
print(len(data))

1055095


In [41]:
data_section = data[(data['date'] >= '2016-02-14 00:00:01') & (data['date'] <= '2016-04-09 23:59:59')].sample(10000)
data_section = data_section.append(data[(data['date'] >= '2017-03-05 00:00:01') & (data['date'] <= '2017-04-29 23:59:59')].sample(10000))
data_section = data_section.append(data[(data['date'] >= '2018-04-01 00:00:01') & (data['date'] <= '2018-05-27 23:59:59')].sample(10000))

In [50]:
data = data_section
print(len(data))

30000


# save a sample as input for training
We need 5000 samples. First I get 6000 to remove possible duplicates.

In [51]:
sample = data.sample(6000)
sample.drop_duplicates(subset=['text'], keep='first', inplace=True)
sample = sample.sample(5000)

In [52]:
tweets_object = []
count = 0
for index, row in sample.iterrows():
    sample_dict = {}
    sample_dict['id'] = row['_id']
    if isinstance(row['extended_tweet'], str):
        text = re.sub('\n+', ' ', row['extended_tweet'])
        #print(text)
    else:
        text = re.sub('\n+', ' ', row['text'])
    sample_dict['message'] = text
    sample_dict['count'] = count
    tweets_object.append(sample_dict)
    count += 1

In [55]:
len(tweets_object)

5000

#### save

In [56]:
file = os.path.join(outputs, 'tweets_anvil_input.json')

#### evaluate sample

In [60]:
print(len(sample[(sample['date'] >= '2016-02-14 00:00:01') & (sample['date'] <= '2016-04-09 23:59:59')]))
print(len(sample[(sample['date'] >= '2017-03-05 00:00:01') & (data['date'] <= '2017-04-29 23:59:59')]))
print(len(sample[(sample['date'] >= '2018-04-01 00:00:01') & (data['date'] <= '2018-05-27 23:59:59')]))

1770
1723
1507


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [62]:
len(sample[sample.duplicated(['text'])])

0

In [66]:
for index, row in sample.iterrows(): 
    if row[9] != 'pt': print(row[9], '-----', row[1])
print('If nothing was printed, all tweets are identified as in portuguese language')

If nothing was printed, all tweets are identified as in portuguese language


# test

check id that should have extended_tweet

In [None]:
file_len = 7503436
obj = collection_ufmg.find_one({"_id": 718277523306594300})
print(obj)

In [35]:
tweets_object = json.load(open(file, 'r'))

In [37]:
tweets_object[0]['message']

'@Charbrevolution https://t.co/vwZsyRqyAz https://t.co/RSU6IJsNTZ https://t.co/kklzwhHCWU https://t.co/Xzji1Oj3GY https://t.co/7B4q8bN3UC https://t.co/Bwq4Zxf4h5 https://t.co/zLs9DOMLVY  https://t.co/R0ZjWz5WxZ'

In [66]:
cut_texts = []
long_texts = []
count_long = 0
count_short = 0
for obj in tweets_object:
    text = obj['message']
    if len(text)>140: 
        count_long += 1
        long_texts.append([obj['id'],text])
    elif len(text)>=137 and len(text)<=140: 
        count_short += 1
        cut_texts.append([obj['id'],text])

In [67]:
print(count_short)
print(count_long)

544
28
