# Using Twitter API to Pull Tweets

This notebook provides the workflow for pulling the tweets that are then used to build the political-candidate money-flow knowledge graph.

In [1]:
import pandas as pd
import numpy as np
import ast
import time
import compress_json
import json

In [2]:
# Load the candidate table (one row per candidate, including the Twitter username) from CSV.
# NOTE(review): `cand_twitters` is rebound to a plain list of usernames later in this notebook.

cand_twitters = pd.read_csv("candidate_twitters.csv")

In [3]:
# Preview the first rows of the candidate table.
cand_twitters.head()

Unnamed: 0,Name,Twitter_username,Account_start_time,Account_ID,Sex,Birthplace,Birthday,Age,Instagram_username,Political_party
0,A. Donald McEachin,RepMcEachin,2017-01-03T00:00:00Z,8.16181e+17,male,Germany,1961-10-10T00:00:00Z,59.0,repmceachin,Democratic Party
1,Aaron Michlewitz,RepMichlewitz,2010-06-27T00:00:00Z,160246973.0,male,United States of America,1978-01-01T00:00:00Z,42.0,,Democratic Party
2,Aaron Peskin,AaronPeskin,2010-11-13T00:00:00Z,215369273.0,male,United States of America,1964-06-17T00:00:00Z,56.0,apeskin52,Democratic Party
3,Aaron Peña,AaronPena,2007-10-31T00:00:00Z,9843332.0,male,United States of America,1959-06-08T00:00:00Z,61.0,,Republican Party
4,Aaron Schock,aaronschock,2009-03-12T00:00:00Z,23951197.0,male,United States of America,1981-05-28T00:00:00Z,39.0,aaronschock,Republican Party


In [4]:
# Import the Twarc2 client and the expansions helpers from the twarc library, plus json and os
from twarc import Twarc2, expansions
import json
import os

# The bearer token is a secret, so it is read from the environment instead of
# being stored in the notebook. Set it before starting Jupyter, e.g.
#   export TWITTER_BEARER_TOKEN="<your bearer token>"
# Any token that was ever pasted directly into this notebook should be revoked
# and regenerated, because notebook files are routinely shared and committed.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
if not bearer_token:
    raise RuntimeError(
        "Set the TWITTER_BEARER_TOKEN environment variable to your Twitter API bearer token."
    )

# Initialize the client that the tweet-pulling functions below rely on
client = Twarc2(bearer_token=bearer_token)

### Define Twitter API functions to pull tweets

In [5]:
def get_tweets(twitter_user, max_pages=1):
    """Return timeline tweets for one Twitter account as a flat list.

    Args:
        twitter_user: Account identifier forwarded to ``client.timeline`` as
            ``user`` (this notebook passes Twitter usernames).
        max_pages: Maximum number of result pages to read. The default of 1
            keeps only the first page returned; pass ``None`` to read every
            page that is returned.

    Returns:
        A list with one flattened tweet per entry, in the order received.
        The list is empty when no page is returned or when ``max_pages`` is
        zero or negative.
    """
    if max_pages is not None and max_pages <= 0:
        return []

    # The timeline call hands back the user's tweets one page at a time
    user_timeline = client.timeline(user=twitter_user)

    user_tweets = []
    for page_number, page in enumerate(user_timeline, start=1):
        # The Twitter API v2 returns the Tweet information and the user, media etc. separately,
        # so expansions.flatten is used to get all the information for each tweet in a single JSON
        user_tweets.extend(expansions.flatten(page))
        # Stop as soon as the page budget is used up so no further pages are requested
        if max_pages is not None and page_number >= max_pages:
            break

    return user_tweets

In [6]:
def get_mentions(twitter_user, max_pages=1, max_tweets=100):
    """Return tweets that mention one Twitter account as a flat list.

    Args:
        twitter_user: Account identifier forwarded to ``client.mentions`` as
            ``user`` (this notebook passes Twitter usernames).
        max_pages: Maximum number of result pages to read. The default of 1
            keeps only the first page returned; pass ``None`` for no page limit.
        max_tweets: Maximum number of tweets to collect in total. The default
            of 100 matches the previous hard-coded cap; pass ``None`` for no
            tweet limit.

    Returns:
        A list with one flattened tweet per entry, in the order received.
        The list is empty when no page is returned or when either limit is
        zero or negative.
    """
    if max_pages is not None and max_pages <= 0:
        return []
    if max_tweets is not None and max_tweets <= 0:
        return []

    # The mentions call hands back the tweets mentioning the user one page at a time
    user_mentions = client.mentions(user=twitter_user)

    mention_tweets = []
    for page_number, page in enumerate(user_mentions, start=1):
        # The Twitter API v2 returns the Tweet information and the user, media etc. separately,
        # so expansions.flatten is used to get all the information for each tweet in a single JSON
        for tweet in expansions.flatten(page):
            # Tweet budget reached: return immediately so no further pages are requested
            if max_tweets is not None and len(mention_tweets) >= max_tweets:
                return mention_tweets
            mention_tweets.append(tweet)
        # Page budget reached
        if max_pages is not None and page_number >= max_pages:
            break

    return mention_tweets

## Load in the curated senator races and senator member twitters processed from GetCandidates.ipynb

In [7]:
# Read the two curated tables in one pass: sitting members first, then the race candidates.
senate_members, senate_races = (
    pd.read_csv(csv_path)
    for csv_path in (
        "senator_members_twitters_curated.csv",
        "senator_races_twitters_curated.csv",
    )
)

In [8]:
# Preview the first rows of the curated members table.
senate_members.head()

Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,occupancy,last_means,bioname,Last name,Name,Twitter_username
0,117,President,99912,99,0,USA,200,0.0,0.0,"TRUMP, Donald John",Trump,Donald Trump,realDonaldTrump
1,117,President,99912,99,0,USA,200,0.0,0.0,"TRUMP, Donald John",Trump,Donald Trump,POTUS
2,117,President,99913,99,0,USA,100,0.0,0.0,"BIDEN, Joseph Robinette, Jr.",Biden,Joe Biden,JoeBiden
3,117,Senate,42102,41,0,AL,200,,,"TUBERVILLE, Thomas Hawley (Tommy)",Tuberville,Tommy Tuberville,TTuberville
4,117,Senate,94659,41,0,AL,200,,,"SHELBY, Richard C.",Shelby,Richard Shelby,SenShelby


In [9]:
# Preview the first rows of the curated races table.
senate_races.head()

Unnamed: 0,STATE,First name,Last name,Party,Incumbent,Full name,Name,Twitter_username
0,Alaska,Lisa,Murkowski,Rep,Y,Lisa Murkowski,Lisa Murkowski,lisamurkowski
1,Arizona,Mark,Kelly,Dem,Y,Mark Kelly,Mark Kelly,CaptMarkKelly
2,Arizona,Mark,Kelly,Dem,Y,Mark Kelly,Mike Kelly,MikeKellyPA
3,Arkansas,John,Boozman,Rep,y,John Boozman,John Boozman,Boozman4AR
4,Arkansas,John,Boozman,Rep,y,John Boozman,John Boozman,JohnBoozman


In [10]:
# Keep a single row per person: when a Name appears more than once
# (e.g. several Twitter accounts), only its first row is retained.
senate_members = senate_members.drop_duplicates(subset='Name', keep='first')
senate_races = senate_races.drop_duplicates(subset='Name', keep='first')

In [11]:
# Number of member rows left after de-duplicating by Name.
len(senate_members)

95

In [12]:
# Number of race rows left after de-duplicating by Name.
len(senate_races)

35

In [13]:
# Discard the member columns that are not carried into the combined candidate table.
senate_members = senate_members.drop(
    columns=[
        'congress',
        'chamber',
        'Last name',
        'bioname',
        'occupancy',
        'district_code',
        'last_means',
    ]
)

In [14]:
# Preview the members table after the column drop.
senate_members.head()

Unnamed: 0,icpsr,state_icpsr,state_abbrev,party_code,Name,Twitter_username
0,99912,99,USA,200,Donald Trump,realDonaldTrump
2,99913,99,USA,100,Joe Biden,JoeBiden
3,42102,41,AL,200,Tommy Tuberville,TTuberville
4,94659,41,AL,200,Richard Shelby,SenShelby
5,40300,81,AK,200,Lisa Murkowski,lisamurkowski


In [15]:
# Discard the separate name-part columns; the remaining Name column identifies each candidate.
senate_races = senate_races.drop(
    columns=[
        'First name',
        'Last name',
        'Full name',
    ]
)

In [16]:
# Preview the races table after the column drop.
senate_races.head()

Unnamed: 0,STATE,Party,Incumbent,Name,Twitter_username
0,Alaska,Rep,Y,Lisa Murkowski,lisamurkowski
1,Arizona,Dem,Y,Mark Kelly,CaptMarkKelly
2,Arizona,Dem,Y,Mike Kelly,MikeKellyPA
3,Arkansas,Rep,y,John Boozman,Boozman4AR
5,California,Dem,y,Alex Padilla,alexpadilla4ca


In [17]:
# Stack the members table on top of the races table; columns that exist in only
# one of the two are filled with NaN for the rows of the other.
all_cand_data = pd.concat((senate_members, senate_races), axis=0)

In [18]:
# An account can appear in both tables; keep only the first row for each Twitter username.
all_cand_data = all_cand_data.drop_duplicates(subset='Twitter_username', keep='first')

In [19]:
# Number of distinct Twitter usernames across the members and races tables.
len(all_cand_data)

107

In [20]:
# Preview the combined, de-duplicated candidate table.
all_cand_data.head()

Unnamed: 0,icpsr,state_icpsr,state_abbrev,party_code,Name,Twitter_username,STATE,Party,Incumbent
0,99912.0,99.0,USA,200.0,Donald Trump,realDonaldTrump,,,
2,99913.0,99.0,USA,100.0,Joe Biden,JoeBiden,,,
3,42102.0,41.0,AL,200.0,Tommy Tuberville,TTuberville,,,
4,94659.0,41.0,AL,200.0,Richard Shelby,SenShelby,,,
5,40300.0,81.0,AK,200.0,Lisa Murkowski,lisamurkowski,,,


In [21]:
# Collect the Twitter usernames of the combined table into a plain Python list.
# NOTE(review): this rebinds `cand_twitters`, which earlier in the notebook held the
# DataFrame read from candidate_twitters.csv.
cand_twitters = list(all_cand_data['Twitter_username'])

## Get recent tweets of senators/candidates with the Twitter API by looping through their Twitter usernames

In [57]:
# Pull the timeline tweets of every account, keyed by Twitter username.
politician_tweets = {}
# Accounts whose pull failed, mapped to the error, so gaps in the data stay visible.
failed_tweet_pulls = {}
for twitter_id in cand_twitters:
    try:
        politician_tweets[twitter_id] = get_tweets(twitter_id)
    except Exception as err:
        # Best effort: one failing account must not abort the whole run. Only
        # Exception is caught, so interrupting the kernel (KeyboardInterrupt)
        # still stops the loop instead of being silently swallowed.
        failed_tweet_pulls[twitter_id] = repr(err)
        print("Skipping {}: {!r}".format(twitter_id, err))

In [24]:
# Peek at the first five accounts and how many tweets were pulled for each.
# A dict cannot be sliced directly, so a list of its items is sliced instead;
# showing counts rather than the tweets themselves keeps the output short.
{twitter_id: len(pulled) for twitter_id, pulled in list(politician_tweets.items())[:5]}

NameError: name 'politician_tweets' is not defined

### Save politician tweets as a JSON file

In [64]:
import json

# Persist the per-account timeline tweets so the API does not have to be queried again.
with open('politician_user_tweets.json', mode='w') as tweets_file:
    json.dump(politician_tweets, tweets_file)

## Get recent tweets mentioning the senators/candidates using their Twitter usernames

In [23]:
# Pull the tweets mentioning every account, keyed by Twitter username.
mention_tweets = {}
# Accounts whose pull failed, mapped to the error, so gaps in the data stay visible.
failed_mention_pulls = {}
for twitter_id in cand_twitters:
    try:
        mention_tweets[twitter_id] = get_mentions(twitter_id)
    except Exception as err:
        # Best effort: one failing account must not abort the whole run. Only
        # Exception is caught, so interrupting the kernel (KeyboardInterrupt)
        # still stops the loop instead of being silently swallowed.
        failed_mention_pulls[twitter_id] = repr(err)
        print("Skipping {}: {!r}".format(twitter_id, err))

In [25]:
import json

# Persist the per-account mention tweets so the API does not have to be queried again.
with open('politician_mention_tweets.json', mode='w') as mentions_file:
    json.dump(mention_tweets, mentions_file)

## Load the saved JSON files and compress them with compress_json

In [7]:
# Reload the saved timeline tweets and write them back out gzip-compressed.
# The file is opened in a `with` block so the handle is always closed, and the
# `politician_tweets` name is no longer rebound to a file object.
with open('politician_user_tweets.json') as tweets_file:
    tweets = json.load(tweets_file)
compress_json.dump(tweets, "politician_direct_tweets.json.gz")

In [8]:
# Reload the saved mention tweets and write them back out gzip-compressed.
# The file is opened in a `with` block so the handle is always closed, and the
# `mention_tweets` name is no longer rebound to a file object.
with open('politician_mention_tweets.json') as mentions_file:
    tweets = json.load(mentions_file)
compress_json.dump(tweets, "politician_mention_tweets.json.gz")