In [12]:

####################
#Author: brandon chiazza
#version: 1.0
#purpose: to call a twitter api and return results
#documentation: https://developer.twitter.com/en/docs
#####################

import pandas as pd
import requests
import json
import base64
import s3fs 
import time
import twitter_keys #this is a custom reference module to a package containing twitter keys

%config IPCompleter.greedy=True


key_secret = '{}:{}'.format(twitter_keys.client_key, twitter_keys.client_secret).encode('ascii')
b64_encoded_key = base64.b64encode(key_secret)
b64_encoded_key = b64_encoded_key.decode('ascii')

#identify base url and oauth token path
base_url = 'https://api.twitter.com/' #base url for authentication
auth_url = '{}oauth2/token'.format(base_url)

#share header information -- encoding is ascii
auth_headers = {
    'Authorization': 'Basic {}'.format(b64_encoded_key),
    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
}

#pass clientcredentials
auth_data = {
    'grant_type': 'client_credentials'
}

#send authentication using requests - POST request
auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data)

#check response status. 200 = OK
auth_resp.status_code




200

In [27]:

#Keys in data response are token_type (bearer) and access_token (your access token)
print(auth_resp.json().keys())

access_token = auth_resp.json()['access_token']


search_headers = {
    'Authorization': 'Bearer {}'.format(access_token)    
}

#enter search parameters for coronavirus example. This looks for "covid-19" in the 1000 most recent tweets
query_params = {
    'q': 'covid-19',
    'result_type': 'recent',
    'count': 100, #update here to get more/less than 1000 returns
    'lang': 'en' #filters by english language only
}


#identify search url path and save 
search_url = '{}1.1/search/tweets.json'.format(base_url)


#run search using get request
search_resp = requests.get(search_url, headers=search_headers, params=query_params)

#check status code of GET request
search_resp.status_code


dict_keys(['token_type', 'access_token'])


200

In [28]:
#print text from result to verify  
twitter_data = search_resp.json()

for x in twitter_data['statuses']:
    print(x['text'] + '\n')
    break #prints after one iteration and stops, remove break to see all 1000

I've kind of made it a poin to not talk too much about my personal life on my account. To every rule however there… https://t.co/YBuxagRxNf



In [42]:
# move data into data frame 
df = pd.DataFrame(twitter_data['statuses'])
str(df.text).encode()
# show one record to verify import 
#df.head(1)

b'0     I\'ve kind of made it a poin to not talk too mu...\n1     Two men in Georgia drank disinfectants in effo...\n2     RT @JHorstman45: .@IowaStandard: "Numbers don\xe2\x80\x99...\n3     RT @B52Malmet: A million cases of #COVID-19 an...\n4     RT @SkyNews: For 6,600 care home deaths we sim...\n                            ...                        \n95    RT @UPI: 9 die in Peru prison riot amid fears ...\n96    RT @SafetyPinDaily: Wisconsin Election Winner ...\n97    RT @ClaraJeffery: The Vietnam War traumatized ...\n98    RT @julie_kelly2: While joggers are hounded of...\n99    "Experts condemn two California doctors for sh...\nName: text, Length: 100, dtype: object'

In [46]:
# we can use pandas to put data directly into an s3 bucket
#prepare csv file name   
filename = 's3:/lab-03/'#specify location of s3:/{my-bucket}/
groupname= 'Group_1_' #name of your group
datetime = time.strftime("%Y%m%d%H%M%S") #timestamp
filenames3 = "%s%s%s.csv"%(filename,groupname,datetime) #name of the filepath and csv file
print(filenames3)
#'C:/example_file.csv'
df.text.to_csv(path_or_buf=filenames3, header= True, index=False, line_terminator ='\n')

s3:/lab-03/Group_1_20200428193107.csv


UnicodeEncodeError: 'charmap' codec can't encode characters in position 125-128: character maps to <undefined>

In [47]:
# optional flatening nested json with json_normalize
from pandas.io.json import json_normalize

#lets look at the users column
display(df.user[:5])

#and lets print its dtype
type(df.user[2])


0    {'id': 801624973, 'id_str': '801624973', 'name...
1    {'id': 355470606, 'id_str': '355470606', 'name...
2    {'id': 14825402, 'id_str': '14825402', 'name':...
3    {'id': 817670481117663233, 'id_str': '81767048...
4    {'id': 95455068, 'id_str': '95455068', 'name':...
Name: user, dtype: object

dict

In [48]:
# this column contains nested data which will be hard for us to manage later
# we can flatten it with the json_normalize function

user_table = json_normalize(df.user)

user_table.head()

  after removing the cwd from sys.path.


Unnamed: 0,id,id_str,name,screen_name,location,description,url,protected,followers_count,friends_count,...,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,following,follow_request_sent,notifications,translator_type,entities.description.urls,entities.url.urls
0,801624973,801624973,Ethan Akrie,xethan123x,,,,False,119,528,...,True,False,True,False,,,,none,[],
1,355470606,355470606,JimmyBear 🗽 🐻😷🧻👬 🏳️‍🌈 🇺🇸 ☮️ 🍕 🚀 🌊,JimmyBear2,"Nutley, New Jersey",#COVID19 😷#StayHomeSavesLives #AloneTogether #...,,False,13925,13466,...,True,True,False,False,,,,none,[],
2,14825402,14825402,Kerith Gaines,TheRightWife,"South Carolina, USA",America First. Build the Wall. Eat your veggies!,,False,1487,1387,...,False,False,False,False,,,,none,[],
3,817670481117663233,817670481117663233,Painterly Breeze,painterlybreeze,,Old hippie and veteran who will soon be off to...,,False,114,292,...,True,False,True,False,,,,none,[],
4,95455068,95455068,Ari,IhavnoHandle,IhavNoLocation,IhavnoBio,https://t.co/H5B8gfUnyu,False,537,338,...,True,True,False,False,,,,none,[],"[{'url': 'https://t.co/H5B8gfUnyu', 'expanded_..."
