# Scraping and preparing data

In [13]:
import pandas as pd
import tweepy as tw
import pickle as pkl
import getpass

## Kaggle data
The Kaggle dataset _Jair Bolsonaro Twitter Data_ contains a collection of about three thousand tweets from President Jair Bolsonaro.

In [14]:
# Read the downloaded tweet export and turn its `date` column into pandas datetimes.
kraggle_data = pd.read_csv("bolsonaro_tweets.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
kraggle_data.head()

Unnamed: 0,date,text,likes,retweets,link
0,2020-04-29,O @govbr utilizará R$ 500 milhões para a aquis...,4250,808,https://twitter.com/jairbolsonaro/status/12555...
1,2020-04-29,"13- Mais de 17,7 mil brasileiros repatriados;\...",12314,1830,https://twitter.com/jairbolsonaro/status/12554...
2,2020-04-29,"10- 30,1 mil brasileiros curados da covid-19;...",13232,1854,https://twitter.com/jairbolsonaro/status/12554...
3,2020-04-29,"7- Suporte psicológico para equipes do SUS, v...",5121,941,https://twitter.com/jairbolsonaro/status/12554...
4,2020-04-29,4- Conclusão do Hospital de Campanha do Govern...,4247,882,https://twitter.com/jairbolsonaro/status/12554...


In [15]:
# Summarize how many tweets were loaded and the date span they cover.
# str(timestamp)[:10] keeps only the leading YYYY-MM-DD part of each bound.
n_tweets = len(kraggle_data)
first_day = str(kraggle_data.date.min())[:10]
last_day = str(kraggle_data.date.max())[:10]
print(f"The time period of {n_tweets} tweets are from {first_day} to {last_day}.")

The time period of 3120 tweets are from 2010-04-01 to 2020-04-29.


## Twitter scraping
We can also obtain tweets from Jair Bolsonaro, Flávio Bolsonaro, Carlos Bolsonaro and Eduardo Bolsonaro through the Twitter API using Tweepy. However, the API only permits access to the most recent 3,200 tweets of each account, so this is not the full history of tweets.

In [3]:
# Ask for the credential with a named prompt so it cannot be confused with the
# other secrets requested in this notebook; getpass does not echo the input.
consumer_key = getpass.getpass("Twitter API consumer key: ")

········


In [4]:
# Ask for the credential with a named prompt so it cannot be confused with the
# other secrets requested in this notebook; getpass does not echo the input.
consumer_secret = getpass.getpass("Twitter API consumer secret: ")

········


In [5]:
# Ask for the credential with a named prompt so it cannot be confused with the
# other secrets requested in this notebook; getpass does not echo the input.
access_token = getpass.getpass("Twitter API access token: ")

········


In [6]:
# Ask for the credential with a named prompt so it cannot be confused with the
# other secrets requested in this notebook; getpass does not echo the input.
access_token_secret = getpass.getpass("Twitter API access token secret: ")

········


In [7]:
# Build the tweepy client from the four credentials collected with getpass:
# the consumer key/secret create the OAuth handler, and the access token pair
# is attached to it afterwards.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy wait for the
# rate-limit window to reset instead of raising — confirm for the installed version.
api = tw.API(auth, wait_on_rate_limit=True)

In [20]:
# Request every available timeline tweet for each account from the Twitter API.
# `screen_name=` is passed instead of `id=`: `id` is ambiguous (numeric user ID
# or handle) and is no longer accepted by tweepy >= 4, while `screen_name` is
# understood by both the 3.x and 4.x releases.
# tweet_mode="extended" asks for untruncated tweets, whose text is later read
# from the `full_text` field.
twitter_data = {}
for user in ['jairbolsonaro', 'bolsonarosp', 'flaviobolsonaro', 'carlosbolsonaro']:
    timeline = tw.Cursor(api.user_timeline, screen_name=user, tweet_mode="extended")
    # Cursor.items() pages through the timeline; materialize it as a list per account.
    twitter_data[user] = list(timeline.items())

In [22]:
# Persist the raw API results (dict of user -> list of tweepy statuses) as a pickle.
# The file name used to be "brute_scrapping,pkl" (comma instead of dot), which
# produced a file without a .pkl extension; "wb" replaces "wb+" because the
# handle is only written to, never read.
# Pickle files can execute arbitrary code on load: only unpickle this file if
# it comes from a trusted source.
with open("brute_scrapping.pkl", "wb") as f:
    pkl.dump(twitter_data, f)

In [36]:
# Build a tidy DataFrame from the raw API results.
# First flatten the per-account timelines into (account, raw JSON payload)
# pairs, preserving the account order and each timeline's own order.
accounts = ['jairbolsonaro', 'bolsonarosp', 'flaviobolsonaro', 'carlosbolsonaro']
payloads = [
    (account, status._json)
    for account in accounts
    for status in twitter_data[account]
]

# One list per output column, all aligned on the order of `payloads`.
user_name = [account for account, _ in payloads]
date = [body['created_at'] for _, body in payloads]
text = [body['full_text'] for _, body in payloads]
place = [body['place'] for _, body in payloads]
retweets = [body['retweet_count'] for _, body in payloads]
favorites = [body['favorite_count'] for _, body in payloads]

# `created_at` arrives as a string; convert it to pandas datetimes for the frame.
API_data = pd.DataFrame({
    'name': user_name,
    'text': text,
    'date': pd.to_datetime(date),
    'place': place,
    'retweets': retweets,
    'favorites': favorites,
})
API_data.head()

Unnamed: 0,name,text,date,place,retweets,favorites
0,jairbolsonaro,"-Edifício Joelma/SP, 1974.\n\n-Sgt CASSANIGA s...",2020-07-27 20:51:13+00:00,,3154,16202
1,jairbolsonaro,- Água para quem tem sede.\n- Liberdade para u...,2020-07-27 11:10:36+00:00,,8101,37357
2,jairbolsonaro,"@tarcisiogdf @MInfraestrutura 🤝🇧🇷, Ministro!",2020-07-26 20:18:19+00:00,,1074,16840
3,jairbolsonaro,2- @MinEconomia @MinCidadania @onyxlorenzoni @...,2020-07-26 15:40:39+00:00,,1337,6383
4,jairbolsonaro,1- Acompanhe as redes sociais! @secomvc @fabio...,2020-07-26 15:39:47+00:00,,3287,14836


In [51]:
# Write the assembled frame to disk, using "~" as the field separator.
API_data.to_csv(path_or_buf='API_data.csv', sep="~")