# Data preprocessing

In [9]:
import pandas as pd
import pickle as pkl
import tweepy as tw

## Preparing the data scrapped from the twitter API
It's necessary to select the useful information scrapped from the tweets and organize it in a dataframe with columns that can be used in visual data exploration and in the models.

In [131]:
#opening API resulted data
with open('brute_scrapping.pkl', 'rb') as f:
    tweets_data = pkl.load(f)

In [153]:
#create dataframe from tweets data
#going to use the following information from tweets:
#### created_at : date of post
#### full_text: tweet text
#### hashtags: all hashtags in the tweet
#### user_mentions: all mentions in the tweet
#### media_type: type of the media in the tweet, if there is media
#### status_reply: if the tweet is a reply to a status
#### name: username @
#### retweet_count: number of retweets
#### favorite_count: number of retweets
tweets_dict = {}
columns_list = ['created_at', 'full_text', 'hashtags', 'user_mentions', 'media_type', 
                'status_reply','name', 'retweet_count', 'favorite_count'] 
for var in columns_list:
    tweets_dict[var] = []

for user in ['jairbolsonaro', 'bolsonarosp', 'flaviobolsonaro', 'carlosbolsonaro']:
    for tweet in tweets_data[user]:
        for var in columns_list:
            if var == 'hashtags':
                aux = '/'.join([u['text'] for u in tweet._json['entities'][var]])
                if aux == "":
                    tweets_dict[var].append("None")
                else:
                    tweets_dict[var].append(aux)
            elif var == "user_mentions":
                aux = '/'.join([u['screen_name'] for u in tweet._json['entities'][var]])
                if aux== "":
                    tweets_dict[var].append("None")
                else:
                    tweets_dict[var].append(aux)
            elif var == "status_reply":
                tweets_dict[var].append(1 if tweet._json['in_reply_to_status_id'] != None else 0)
            elif var == "name":
                tweets_dict[var].append(user)
            elif var == "media_type":
                if 'media' in list(tweet._json['entities'].keys()):
                    tweets_dict[var].append(tweet._json['extended_entities']['media'][0]['type'])
                else:
                    tweets_dict[var].append(None)
            else:
                tweets_dict[var].append(tweet._json[var])

In [223]:
tweets_df = pd.DataFrame(tweets_dict)
tweets_df.head()

Unnamed: 0,created_at,full_text,hashtags,user_mentions,media_type,status_reply,name,retweet_count,favorite_count
0,Mon Jul 27 20:51:13 +0000 2020,"-Edifício Joelma/SP, 1974.\n\n-Sgt CASSANIGA s...",,,video,0,jairbolsonaro,3154,16202
1,Mon Jul 27 11:10:36 +0000 2020,- Água para quem tem sede.\n- Liberdade para u...,,,video,0,jairbolsonaro,8101,37357
2,Sun Jul 26 20:18:19 +0000 2020,"@tarcisiogdf @MInfraestrutura 🤝🇧🇷, Ministro!",,tarcisiogdf/MInfraestrutura,,1,jairbolsonaro,1074,16840
3,Sun Jul 26 15:40:39 +0000 2020,2- @MinEconomia @MinCidadania @onyxlorenzoni @...,,MinEconomia/MinCidadania/onyxlorenzoni/MEC_Com...,photo,1,jairbolsonaro,1337,6383
4,Sun Jul 26 15:39:47 +0000 2020,1- Acompanhe as redes sociais! @secomvc @fabio...,,secomvc/fabiofaria5555/tarcisiogdf/MInfraestru...,photo,0,jairbolsonaro,3287,14836


In [224]:
tweets_df['date'] = pd.to_datetime(tweets_df.created_at)
tweets_df['year'] = tweets_df.date.apply(lambda x : x.year)
tweets_df['month'] = tweets_df.date.apply(lambda x : x.month)
tweets_df['day'] = tweets_df.date.apply(lambda x : x.day)
tweets_df['hour'] = tweets_df.date.apply(lambda x : x.hour)
tweets_df['minute'] = tweets_df.date.apply(lambda x : x.minute)
tweets_df['weekday'] = tweets_df.date.apply(lambda x : x.weekday)
tweets_df['has_hashtags'] = tweets_df.hashtags.apply(lambda x : 1 if x != "None" else 0)
tweets_df['has_mentions'] = tweets_df.user_mentions.apply(lambda x : 1 if x != "None" else 0)
tweets_df['has_media'] = tweets_df.media_type.apply(lambda x : 1 if x != "None" else 0)
tweets_df.drop(columns = ['created_at'], inplace = True)

In [225]:
tweets_df.head()

Unnamed: 0,full_text,hashtags,user_mentions,media_type,status_reply,name,retweet_count,favorite_count,date,year,month,day,hour,minute,weekday,has_hashtags,has_mentions,has_media
0,"-Edifício Joelma/SP, 1974.\n\n-Sgt CASSANIGA s...",,,video,0,jairbolsonaro,3154,16202,2020-07-27 20:51:13+00:00,2020,7,27,20,51,0,0,0,1
1,- Água para quem tem sede.\n- Liberdade para u...,,,video,0,jairbolsonaro,8101,37357,2020-07-27 11:10:36+00:00,2020,7,27,11,10,0,0,0,1
2,"@tarcisiogdf @MInfraestrutura 🤝🇧🇷, Ministro!",,tarcisiogdf/MInfraestrutura,,1,jairbolsonaro,1074,16840,2020-07-26 20:18:19+00:00,2020,7,26,20,18,6,0,1,1
3,2- @MinEconomia @MinCidadania @onyxlorenzoni @...,,MinEconomia/MinCidadania/onyxlorenzoni/MEC_Com...,photo,1,jairbolsonaro,1337,6383,2020-07-26 15:40:39+00:00,2020,7,26,15,40,6,0,1,1
4,1- Acompanhe as redes sociais! @secomvc @fabio...,,secomvc/fabiofaria5555/tarcisiogdf/MInfraestru...,photo,0,jairbolsonaro,3287,14836,2020-07-26 15:39:47+00:00,2020,7,26,15,39,6,0,1,1


In [240]:
tweets_df.to_csv("preprocessed_tweets.csv", sep = "~")