# Coleta de dados no twitter: Exemplo utilizando a linguagem Python

In [3]:
# Libs to catch and treat data:
import tweepy           
import numpy as np      
from textblob import TextBlob as tb

# getting access to saved twitter keys at credentials.py:
from credentials import *

In [2]:
# API Setup
def twitter_setup():
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    return api

In [3]:
# Starting API
api = twitter_setup()

# Setting keywords
keywords = ('covid OR covid-19 OR corona OR coronavirus')

In [31]:
# Searching for tweets with defined keywords
# result_type --> mixed(default), recent, popular
#tweets = api.search(q=keywords, count=10, result_type='mixed', tweets_mode='extended', lang='pt')
tweets = api.search(q=keywords, count=10, result_type='mixed')

In [32]:
tweets

[Status(_api=<tweepy.api.API object at 0x0000014BE3CEDC88>, _json={'created_at': 'Mon Jun 22 13:45:23 +0000 2020', 'id': 1275062328971497472, 'id_str': '1275062328971497472', 'text': 'Because of MAIL-IN BALLOTS, 2020 will be the most RIGGED Election in our nations history - unless this stupidity is… https://t.co/Ps0AaQkk9s', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/Ps0AaQkk9s', 'expanded_url': 'https://twitter.com/i/web/status/1275062328971497472', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'metadata': {'result_type': 'popular', 'iso_language_code': 'en'}, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 25073877, 'id_str': '25073877', 'name': 'Donald J. Trump', 'sc

In [33]:
for tweet in tweets:
    print(f'Usuário: {tweet.user.screen_name}')
    print(f'    Tweet: {tweet.text}')

Usuário: realDonaldTrump
    Tweet: Because of MAIL-IN BALLOTS, 2020 will be the most RIGGED Election in our nations history - unless this stupidity is… https://t.co/Ps0AaQkk9s
Usuário: JoeBiden
    Tweet: It's worth repeating that this weekend, President Trump admitted he directed his administration to slow down corona… https://t.co/kLqjsXBGaA
Usuário: funder
    Tweet: BREAKING: Trump just confirmed he slowed coronavirus testing because it looked bad for him. Americans got sick and… https://t.co/gLmWK45iaA
Usuário: anne_tappe
    Tweet: RT @garci32: CNN said that Italy recorded the lowest COVID 19 infection since March and Jake Tapper asked the expert what Italy did  that t…
Usuário: RubenEslaiman
    Tweet: RT @JefaturaPba: El gobernador @Kicillofok y el ministro de Salud @DrDanielGollan visitaron el primer Centro de Telellamadas COVID-19 de la…
Usuário: Karkouch3
    Tweet: Covid-19: Trump persiste sur la baisse du dépistage, ses experts le contredisent

https://t.co/k7lk9Ciwvv
Usu

In [34]:
# Polarity variable
analysis = None

# List to store scores
tweets_score = []

In [35]:
for tweet in tweets:
    print(f'** {tweet.text}')
    analysis = tb(tweet.text)
    polarity = analysis.sentiment.polarity
    tweets_score.append(polarity)

** Because of MAIL-IN BALLOTS, 2020 will be the most RIGGED Election in our nations history - unless this stupidity is… https://t.co/Ps0AaQkk9s
** It's worth repeating that this weekend, President Trump admitted he directed his administration to slow down corona… https://t.co/kLqjsXBGaA
** BREAKING: Trump just confirmed he slowed coronavirus testing because it looked bad for him. Americans got sick and… https://t.co/gLmWK45iaA
** RT @garci32: CNN said that Italy recorded the lowest COVID 19 infection since March and Jake Tapper asked the expert what Italy did  that t…
** RT @JefaturaPba: El gobernador @Kicillofok y el ministro de Salud @DrDanielGollan visitaron el primer Centro de Telellamadas COVID-19 de la…
** Covid-19: Trump persiste sur la baisse du dépistage, ses experts le contredisent

https://t.co/k7lk9Ciwvv
** RT @rioenlinea: Tras visita de Mario Rozas cinco Carabineros de Los Ríos dieron positivo a Coronavirus https://t.co/9CJJq7lYT9 #Valdiviacl…
** RT @MSNBC: Ahead of Pres. 

In [36]:
print(f'Vetor de polaridade: {tweets_score}')

Vetor de polaridade: [-0.04999999999999999, -0.05185185185185188, -0.33809523809523806, 0.0, 0.0, 0.0, 0.0, 0.0, -0.15555555555555559, 0.375]


In [37]:
print(f'MÉDIA DE SENTIMENTO: {str(np.mean(tweets_score))}')

MÉDIA DE SENTIMENTO: -0.022050264550264553


#### Análise de polaridade para tweets que não foram postados em inglês

In [38]:
polarities = []
for tweet in tweets:
    analysis = tb(tweet.text)
    # Verificar se o tweet esta em inglês, se não estiver, traduzir
    if analysis.detect_language() != 'en':
        traducao = tb(str(analysis.translate(to='en')))
        print(f'Texto traduzido: {traducao}')
        polarity = traducao.sentiment.polarity
    else:
        polarity = analysis.sentiment.polarity
    polarities.append(polarity)

Texto traduzido: RT @JefaturaPba: Governor @Kicillofok and Health Minister @DrDanielGollan visited the first COVID-19 Call Center of the ...
Texto traduzido: Covid-19: Trump persists on lower screening, experts contradict him

https://t.co/k7lk9Ciwvv
Texto traduzido: RT @rioenlinea: After Mario Rozas' visit, five Carabineros de Los Ríos tested positive for Coronavirus https://t.co/9CJJq7lYT9 # Valdiviacl…
Texto traduzido: RT @ alejandrocipol1: They tell me about Prevention of COVID-19, a friend called 148 with symptoms, reporting that he was with a relative of a…


In [39]:
print(f'Vetor de polaridade: {tweets_score}')

Vetor de polaridade: [-0.04999999999999999, -0.05185185185185188, -0.33809523809523806, 0.0, 0.0, 0.0, 0.0, 0.0, -0.15555555555555559, 0.375]


In [40]:
print(f'MÉDIA DE SENTIMENTO: {str(np.mean(tweets_score))}')

MÉDIA DE SENTIMENTO: -0.022050264550264553


## Armazenar os tweets coletados

In [41]:
import json

In [44]:
status = tweets[0]

#convert to string
json_str = json.dumps(status._json)

#deserialise string to python object
parsed = json.loads(json_str)

In [45]:
type(tweets)

tweepy.models.SearchResults

In [46]:
type(json_str)

str

In [47]:
type(parsed)

dict

In [49]:
datasetPath = 'D:\DataScience\Bootcamp-Analista-de-Dados-IGTI\Modulo-03\Exercícios\Datasets'

# oppened on append mode
with open(datasetPath+'\\tweets_keywords.json', 'a', encoding='utf-8') as filename:
    status = tweet
    
    #convert to string
    json_str = json.dumps(status._json)
    
    #deserialise string to python object
    parsed = json.loads(json_str)
    
    #save tweet on file
    json.dump(parsed, filename, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

## Armazenar no MongoDB

In [50]:
import pymongo

In [52]:
#connect to Mongo
con = pymongo.MongoClient('localhost', 27017)

#select database
db = con.twitterdb

# select colection to store the tweets
collection = db.tweets_keywords

In [58]:
#lists the quantity of tweets in the collection
db.tweets_keywords.count_documents({})

10

In [56]:
i = 0
for tweet in tweets:
    db.tweets_keywords.insert_one(tweet._json)
    i+=1

In [57]:
print(f'Quantidade de tweets inseridos: {i}')

Quantidade de tweets inseridos: 10


In [59]:
#lists the quantity of tweets in the collection
db.tweets_keywords.count_documents({})

10