In [15]:

####################
#Author: brandon chiazza
#version: 1.0
#purpose: to call a twitter api and return results
#documentation: https://developer.twitter.com/en/docs
#####################


import pandas as pd
import requests
import json
import base64
import s3fs 
import twitter_keys #this is a custom reference module to a package containing twitter keys

# build the "<client key>:<client secret>" credential and base64-encode it for the Basic auth header
key_secret = f'{twitter_keys.client_key}:{twitter_keys.client_secret}'.encode('ascii')
b64_encoded_key = base64.b64encode(key_secret).decode('ascii')

# API root, and the oauth2 token endpoint underneath it
base_url = 'https://api.twitter.com/'
auth_url = f'{base_url}oauth2/token'

# headers for the token request -- the encoded credential travels in the Authorization header
auth_headers = {
    'Authorization': f'Basic {b64_encoded_key}',
    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
}

# form body: request a token through the client_credentials grant
auth_data = dict(grant_type='client_credentials')

# POST the token request
auth_resp = requests.post(url=auth_url, headers=auth_headers, data=auth_data)

# show the HTTP status as the cell output (200 = OK)
auth_resp.status_code




200

In [20]:

# Keys in data response are token_type (bearer) and access_token (your access token)
# parse the response body once and reuse it for both the key listing and the token lookup
auth_payload = auth_resp.json()
print(auth_payload.keys())

access_token = auth_payload['access_token']


#every search call authenticates with the bearer token obtained above
search_headers = {
    'Authorization': 'Bearer {}'.format(access_token)
}

#enter search parameters for coronavirus example. This looks for "covid-19" in the most recent tweets.
#the standard search endpoint returns at most 100 tweets per request, so ask for that maximum
query_params = {
    'q': 'covid-19',
    'result_type': 'recent',
    'count': 100
}


#identify search url path and save
search_url = '{}1.1/search/tweets.json'.format(base_url)


#run search using get request
search_resp = requests.get(search_url, headers=search_headers, params=query_params)

#check status code of GET request
search_resp.status_code


dict_keys(['token_type', 'access_token'])


200

In [17]:
#decode the search response and echo each tweet's text, followed by a blank line, as a sanity check
twitter_data = search_resp.json()
for status in twitter_data['statuses']:
    print(status['text'], end='\n\n')

RT @sonaliranade: About 100 to 120 million blue-collar workers, accounting for over 70-80% of the industry, have gone without income in the…

川崎病とCOVID-19ってなんか似てる

RT @takeshi_local1: ＜COVID-19に伴う美ヶ原・武石地域および周辺地域の営業内容変更、冬季閉鎖の延長について＞
※今後の政府の対応により、休館期間が変更になる場合がります。詳細は各公式HPをご確認ください。
#上田市 #長和町 #美ヶ原 #ビーナスライン…

RT @Minsa_Peru: ACTUALIZACIÓN | Esta es la situación del coronavirus #COVID19 en Perú hasta las 00:00 horas del 27 de abril. #PerúEstáEnNue…

RT @RobinLloyd99: what is meant by a "safe reopening" with COVID-19? what is definition of safe here? I don't see this explained/defined in…

Amid the COVID-19 crisis, here's how Midwest VCs are approaching investing. https://t.co/x5YqWCRgaW

RT @helpandsupportf: @BlueDart_ @BlueDartCares Why aren't you starting retail customer medicine courier pickups from our pharmacy of items…

RT @TheScenestar: Sad news for music lovers that didn't get to visit the original location in Hollywood before it closed down due to COVID-…

RT @WSJopinion: Social-med

In [51]:
# load the list of tweet records into a data frame

df = pd.DataFrame(data=twitter_data['statuses'])

# display the first row to confirm the load worked
df.head(n=1)

Unnamed: 0,created_at,id,id_str,text,truncated,entities,metadata,source,in_reply_to_status_id,in_reply_to_status_id_str,...,retweeted_status,is_quote_status,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,quoted_status_id,quoted_status_id_str
0,Tue Apr 28 03:44:01 +0000 2020,1254979655855009793,1254979655855009793,RT @sonaliranade: About 100 to 120 million blu...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,{'created_at': 'Tue Apr 28 03:37:17 +0000 2020...,False,4,0,False,False,False,en,,


In [70]:
# we can use pandas to put data directly into an s3 bucket
# TODO: replace the placeholder below with the real destination, e.g. 's3://my-bucket/folder/tweets.csv'
s3 = 's3://<your-bucket>/<path>/twitter_covid19.csv'
# to_csv takes the destination as its first argument (path_or_buf) and only accepts a
# single-character separator, so a single pipe is used as the delimiter
df.to_csv(s3, sep='|', index=False)

In [61]:
# optional flattening nested json with json_normalize
# pandas >= 1.0 exposes the function as pd.json_normalize (the pandas.io.json import path is deprecated);
# fall back to the old location only on older pandas. The name json_normalize stays bound for later cells.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize

#lets look at the users column
display(df['user'][:5])

#and lets print the type of a single entry in that column

type(df['user'][2])


0    {'id': 357267251, 'id_str': '357267251', 'name...
1    {'id': 1084828411170680833, 'id_str': '1084828...
2    {'id': 138413510, 'id_str': '138413510', 'name...
3    {'id': 799377726, 'id_str': '799377726', 'name...
4    {'id': 2268076370, 'id_str': '2268076370', 'na...
Name: user, dtype: object

dict

In [68]:
# each entry of the user column is a nested dict, which is awkward to work with later on
# json_normalize expands those dicts into a flat table

user_table = json_normalize(df['user'])

user_table.head(n=5)

Unnamed: 0,id,id_str,name,screen_name,location,description,url,protected,followers_count,friends_count,...,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,following,follow_request_sent,notifications,translator_type,entities.description.urls,entities.url.urls
0,357267251,357267251,RIDE ON ‏‎ - AAP anti Fenku,pavan_sethi,,"RTs can be interpreted as endorsements, irony...",,False,2838,753,...,True,False,True,False,,,,none,[],
1,1084828411170680833,1084828411170680833,しもベイベ🐝⋆︎*ﾟ∗1y5m♀,myproudmusume,,2018.11生まれ・娘さま専用しもべ妖精のアカウントです🐝⋆︎*ﾟ∗惚気多めで趣味多め。無...,,False,91,1057,...,True,False,True,False,,,,none,[],
2,138413510,138413510,てつ,ezelsapientia,静岡県浜松市,ロイヤルエンフィールドのヒマラヤンと、スーパーカブ110（JA07）に乗るおじさん。キャンプ...,https://t.co/FIX7e5xFhH,False,455,976,...,True,True,False,False,,,,none,[],"[{'url': 'https://t.co/FIX7e5xFhH', 'expanded_..."
3,799377726,799377726,Lucía,Lucia_Reyes_,Perú,Comunicación Social || @UNMSM_ || Music and la...,,False,113,127,...,True,False,False,False,,,,none,[],
4,2268076370,2268076370,Richard Hunt,rfjhunt,new york,production artiste,,False,132,139,...,True,False,True,False,,,,none,[],
