## Pré Processamento
### Carregando dados e fazendo limpeza básica

In [1]:
import pandas as pd
import numpy as np

In [2]:
sheets = ['mar-abr', 'abr-mai', 'mai-jun', 'jun-jul', 'jul-ago', 'ago-set']
cols = ['id','conversation_id','created_at','date','time','timezone','user_id','username','name','tweet','categoria','desinfo','mentions','urls','replies_count','retweets_count','likes_count','hashtags','link','retweet','quote_url','user_rt_id','user_rt','retweet_id','reply_to','popularidade']
raw_data = '../data/raw/tweets_aos_fatos_saude.xlsx'

In [3]:
mar_abr = pd.read_excel(raw_data, sheet_name=sheets[0])
abr_mai = pd.read_excel(raw_data, sheet_name=sheets[1])
mai_jun = pd.read_excel(raw_data, sheet_name=sheets[2])
jun_jul = pd.read_excel(raw_data, sheet_name=sheets[3])
jul_ago = pd.read_excel(raw_data, sheet_name=sheets[4])
ago_set = pd.read_excel(raw_data, sheet_name=sheets[5])

In [4]:
mar_abr.drop(mar_abr.columns[0:2], axis=1, inplace=True)
mar_abr.rename(columns={'desinf': 'desinfo'}, inplace=True)
abr_mai.drop(abr_mai.columns[0:2], axis=1, inplace=True)
mai_jun.drop(mai_jun.columns[0:2], axis=1, inplace=True)
jun_jul.drop(jun_jul.columns[0:2], axis=1, inplace=True)
jul_ago.drop(jul_ago.columns[0:2], axis=1, inplace=True)
ago_set.drop(ago_set.columns[0:2], axis=1, inplace=True)

In [5]:
frames = [mar_abr, abr_mai, mai_jun, jun_jul, jul_ago, ago_set]
df = pd.concat(frames)

In [6]:
df.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'tweet', 'categoria', 'desinfo',
       'mentions', 'urls', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'link', 'retweet', 'quote_url', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'popularidade'],
      dtype='object')

In [7]:
df.shape

(11988, 26)

In [8]:
# remover colunas user_rt_id, user_rt, retweet_id
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,11988.0,1.271208e+18,1.853411e+16,1.239025e+18,1.25255e+18,1.270801e+18,1.286719e+18,1.30348e+18
conversation_id,11988.0,1.271199e+18,1.852944e+16,1.239025e+18,1.25255e+18,1.270801e+18,1.28671e+18,1.30348e+18
created_at,11988.0,1591915000000.0,4418876000.0,1584241000000.0,1587466000000.0,1591818000000.0,1595613000000.0,1599609000000.0
user_id,11988.0,3.903617e+17,5.179402e+17,790680.0,59534430.0,1171550000.0,1.003301e+18,1.297712e+18
replies_count,11988.0,138.7674,529.7275,0.0,12.0,32.0,93.0,22842.0
retweets_count,11988.0,551.8212,1271.789,0.0,65.0,174.0,500.25,28259.0
likes_count,11988.0,2649.046,6454.799,63.0,393.0,902.5,2301.25,163235.0
user_rt_id,0.0,,,,,,,
user_rt,0.0,,,,,,,
retweet_id,0.0,,,,,,,


In [9]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
date,11988,167,2020-07-07,533
time,11988,11312,20:45:00,4
timezone,11988,3,Hora oficial do Brasil,6605
username,11988,3413,ailtonbenedito,168
name,11988,3400,🇧🇷Ailton Benedito,168
tweet,11988,11868,e se a gente der CLOROQUINA para os CNPJS??,8
categoria,600,9,favor,365
desinfo,156,3,desinf,92
mentions,11987,946,[],9675
urls,11988,3320,[],8083


In [10]:
df.drop(['user_rt_id', 'user_rt', 'retweet_id'], axis=1, inplace=True)

In [11]:
df.desinfo.value_counts(dropna=False)

NaN           11832
desinf           92
desinfo          55
depoimento        9
Name: desinfo, dtype: int64

In [12]:
df.desinfo = df.desinfo.replace('desinf', 'desinfo')

In [13]:
df.desinfo = df.desinfo.replace('desinfo', 0)
df.desinfo = df.desinfo.fillna(1)

In [14]:
#apagar depoimentos
df.drop(df[df.desinfo=='depoimento'].index, axis=0, inplace=True)
df.desinfo = df.desinfo.astype(int)

In [15]:
df.desinfo.value_counts(dropna=False)

1    11791
0      143
Name: desinfo, dtype: int64

In [16]:
print(df.timezone.unique())
df.timezone = 'GMT-3'

['Hora oficial do Brasil' '-3' 'UTC']


In [17]:
df.urls = df.urls.replace('[]', np.nan) 

In [18]:
df.hashtags = df.hashtags.replace('[]', np.nan)

In [19]:
df.mentions = df.mentions.replace('[]', np.nan)

In [20]:
df.isnull().sum()

id                     0
conversation_id        0
created_at             0
date                   0
time                   0
timezone               0
user_id                0
username               0
name                   0
tweet                  0
categoria          11362
desinfo                0
mentions            9628
urls                8043
replies_count          0
retweets_count         0
likes_count            0
hashtags           10499
link                   0
retweet                0
quote_url          11202
reply_to               0
popularidade           0
dtype: int64

### Limpar o texto dos tweets para etapa posterior

In [21]:
from nltk.corpus import stopwords
import nltk
import bs4 as bs4 
import string
import re
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

def clean_text(sentence):
    
    # Taken and amended from this notebook https://www.kaggle.com/madz2000/nlp-using-glove-embeddings-99-87-accuracy
    # Full credits to the author
    
    tmp_sentence = bs4.BeautifulSoup(sentence,"html.parser")
    sentence = tmp_sentence.get_text()
    sentence = re.sub('\[[^]]*\]', '', sentence)
    sentence = re.sub(r'http\S+', '', sentence)
    
    stop_words = set(stopwords.words('portuguese'))
    puncts = list(string.punctuation)
    stop_words.update(puncts)
    
    temp = []
    for j in sentence.split():
        if j.strip().lower() not in stop_words:
            temp.append(j.strip())
    sentence = " ".join(temp)
    return sentence

In [22]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/flavia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
df['tweet'] = df['tweet'].apply(clean_text)

In [24]:
df.to_csv('tweets.csv', index=False)

### Em `exploratory.ipynb` os dados continuam sendo tratados para treinar o modelo