Twitter API
=========================================



In [1]:
# !pip install tweepy
# install twitter module for python

In [2]:
import os 
from pprint import pprint
import json
import tweepy
import numpy as np
import pandas as pd
import text2emotion as te

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/farhat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/farhat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/farhat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Authorization

In [3]:
#api keys
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""


auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

api = tweepy.API(auth,wait_on_rate_limit=True)

### Note: Code will not work without applying the keys

# Choosing Samples

Given the limitations of the free Twitter API, we chose to sample the 5 counties that had the highest proportion of votes for Democratic and Republican parties respectively. Interestingly, all these counties are in close proximity to one another.

In [4]:
county_data = pd.read_csv("county_politics.csv")

## Top 5 Democratic Majority Counties in CA

In [5]:
dems = county_data[county_data["majority"]=="Democrat"]
dem_short = dems.sort_values("dem_pct", ascending= False).head() #find 5 highest proportion democrat voting counties
dem_short

Unnamed: 0,county,dem_pct,rep_pct,majority,lat,long
37,San Francisco,85.26%,12.72%,Democrat,37.779026,-122.419906
20,Marin,82.33%,15.79%,Democrat,38.040914,-122.619964
0,Alameda,79.83%,17.62%,Democrat,37.609029,-121.899142
43,Santa Cruz,78.44%,18.49%,Democrat,37.050096,-121.990591
40,San Mateo,77.89%,20.20%,Democrat,37.496904,-122.333057


In [36]:
dem_short[["county","dem_pct","rep_pct","majority"]]

Unnamed: 0,county,dem_pct,rep_pct,majority
37,San Francisco,85.26%,12.72%,Democrat
20,Marin,82.33%,15.79%,Democrat
0,Alameda,79.83%,17.62%,Democrat
43,Santa Cruz,78.44%,18.49%,Democrat
40,San Mateo,77.89%,20.20%,Democrat


## Top 5 Republican Majority Counties in CA

In [6]:
reps = county_data[county_data["majority"]=="Republican"]
rep_short = reps.sort_values("rep_pct", ascending= False).head() #find 5 highest proportion democrat voting counties
rep_short

Unnamed: 0,county,dem_pct,rep_pct,majority,lat,long
17,Lassen,23.24%,74.47%,Republican,40.768558,-120.730998
24,Modoc,26.33%,71.19%,Republican,41.545049,-120.7436
51,Tehama,31.02%,66.62%,Republican,40.125133,-122.201553
44,Shasta,32.28%,65.41%,Republican,40.796512,-121.997919
10,Glenn,35.36%,62.52%,Republican,39.591277,-122.377866


In [37]:
rep_short[["county","dem_pct","rep_pct","majority"]]

Unnamed: 0,county,dem_pct,rep_pct,majority
17,Lassen,23.24%,74.47%,Republican
24,Modoc,26.33%,71.19%,Republican
51,Tehama,31.02%,66.62%,Republican
44,Shasta,32.28%,65.41%,Republican
10,Glenn,35.36%,62.52%,Republican


# Retrieving Tweets

We made a function that makes it easier to retrieve Tweets about a topic given a certain location. We believe our version is a bit more readable than the API, especially when we want to carry out these queries for multiple locations. Currently, it is not possible to retrive tweets from multiple coordinates using the Twitter API natively.

In [7]:
def get_topic_tweets(topic,lat,long):
    '''Searches tweets about a topic (in the form of string) given coordinates'''
    tweet_list = []
    response = api.search_tweets(str(topic),
                                geocode = str(lat) + "," + str(long) + ",100km",
                                count = 50,
                                result_type="recent",
                                lang="en")
    for i in response: 
        tweet_list.append(i.text.split("https")[0])
    
    return tweet_list

# Retrieving Republican County Tweets

In [8]:
rep_twt = [] #create an empty list to store all republican tweets

for i in range (0,5):
    rep_twt += get_topic_tweets("vaccines",rep_short.iloc[i,4],rep_short.iloc[i,5]) 
    #search each county for vaccine tweets

BadRequest: 400 Bad Request
215 - Bad Authentication data.

In [None]:
rep_twt_df = pd.DataFrame(rep_twt, columns = ["tweet"]).drop_duplicates() #create dataframe for tweets, drops duplicates

#rep_twt_df.to_csv("republican_tweets.csv",index = False) #save tweets as csv so we don't have to query all the time

# Preparing Republican County Tweet Data for Sentiment Analysis

In [9]:
rep_twt_df = pd.read_csv("republican_tweets.csv") #read in csv that was retrieved earlier

In [20]:
rep_twt_df.head()

Unnamed: 0,tweet
0,Dems set up clash with GOP by pairing Covid re...
1,"@covid_priest @christogrozev @mod_russia Oh, i..."
2,"Trucker convoy laps Washington, DC, beltway to..."
3,Biden’s ‘test to treat’ covid plan draws prais...
4,@Kenny_Wallace My kids dad for covid pneumonia...


In [11]:
rep_no_dupes = list(rep_twt_df.tweet) #drop all duplicate tweets

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/farhat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/farhat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/farhat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
len(rep_no_dupes) #find the length of republican tweets

113

In [22]:
rep_twt_master = " ".join(rep_no_dupes) #join all the tweets as one long string
rep_emotion = te.get_emotion(rep_twt_master) #run text2emotion and get emotions for the republican tweets as a whole
rep_emotion

{'Happy': 0.16, 'Angry': 0.04, 'Surprise': 0.22, 'Sad': 0.21, 'Fear': 0.36}

# Retrieving Democrat County Tweets

In [None]:
dem_twt = []

#since all the top 5 democratic counties are within close proximity, 
#we choose to use only one location within 100km radius, which is San Francisco.
#this should cover the top 5 democratic voted counties in terms of percentage

response = api.search_tweets("covid", geocode = str(37.779026) + "," + str(-122.419906) + ",100km",
                    count = 200,
                    result_type="recent",
                    lang="en")

for i in response: 
    dem_twt.append(i.text.split("https")[0])


In [None]:
len(dem_twt) #number of tweets

In [None]:
dem_twt.head() #

# Preparing Democrat County Tweets

In [23]:
dem_twt_df = pd.read_csv("dem_tweets.csv") #here we 

In [24]:
#dem_twt_df = pd.DataFrame(dem_twt, columns = ["tweet"]).drop_duplicates() 
#create dataframe for tweets, drops duplicates


#dem_twt_df.to_csv("dem_tweets.csv",index = False) 
#save to csv so we don't have to query the API for our analysis every time

In [25]:
dem_no_dupes = list(dem_twt_df.tweet) #drop all duplicate tweets
dem_twt_master = " ".join(dem_no_dupes) #join them as one master string

In [32]:
dem_twt_master

'Hawaii becomes the final state to drop a mask mandate. Meanwhile in other news 1500 people/day are dying from covid right now. Right NOW!!! “She was told by one representative that a ‘young thing’ like her could not be disabled by the virus.”\n\n“There were…  Is anyone else just getting decimated by (not covid) sicknesses since starting to do more things recently after a 2…  Fascinating conversation happening now between Alexander Zaitchik and @lmlauramarsh for @newrepublic about missed o…  Four thousand more cases, four deaths related to COVID-19 since Friday.  An incredible and inspiring story indeed! #CorsiRosenthalBox @CorsIAQ @JimRosenthal4 #covid #innovation #impact  California’s COVID positive test rate plunges to the lowest point since July -- but the state is still reporting 17…  How to cope with the anxiety of no longer wearing a mask...  Influence of exercise and vitamin D on the immune system against Covid-19: an integrative review of current literat…  Been thinking about 

In [12]:
dem_twt

['Hawaii becomes the final state to drop a mask mandate. Meanwhile in other news 1500 people/day are dying from covid right now. Right NOW!!!',
 '“She was told by one representative that a ‘young thing’ like her could not be disabled by the virus.”\n\n“There were… ',
 'Is anyone else just getting decimated by (not covid) sicknesses since starting to do more things recently after a 2… ',
 'Fascinating conversation happening now between Alexander Zaitchik and @lmlauramarsh for @newrepublic about missed o… ',
 'Four thousand more cases, four deaths related to COVID-19 since Friday. ',
 'An incredible and inspiring story indeed! #CorsiRosenthalBox @CorsIAQ @JimRosenthal4 #covid #innovation #impact ',
 'California’s COVID positive test rate plunges to the lowest point since July -- but the state is still reporting 17… ',
 'How to cope with the anxiety of no longer wearing a mask... ',
 'Influence of exercise and vitamin D on the immune system against Covid-19: an integrative review of curre

z

In [26]:
len(dem_no_dupes) #number of tweets from democrat counties after removing dupes

96

In [27]:
dem_emotion = te.get_emotion(dem_twt_master) #get emotion values for 
dem_emotion 

{'Happy': 0.13, 'Angry': 0.07, 'Surprise': 0.2, 'Sad': 0.28, 'Fear': 0.33}

# Preparing Emotion DataFrame for Both Counties

In [28]:
dem_emo_df = pd.DataFrame([list(dem_emotion.keys()),
              list(dem_emotion.values())]).T #create dataframe using the emotion values


dem_emo_df["majority"] = "Democrat" #assign the corresponding majority

dem_emo_df.columns = ["Emotion","Value","Majority"] #rename the columns

dem_emo_df

Unnamed: 0,Emotion,Value,Majority
0,Happy,0.13,Democrat
1,Angry,0.07,Democrat
2,Surprise,0.2,Democrat
3,Sad,0.28,Democrat
4,Fear,0.33,Democrat


In [29]:
rep_emo_df = pd.DataFrame([list(rep_emotion.keys()),
              list(rep_emotion.values())]).T #create dataframe using the emotion values

rep_emo_df["majority"] = "Republican" #assign the corresponding majority

rep_emo_df.columns = ["Emotion","Value","Majority"] #rename the columns 

rep_emo_df

Unnamed: 0,Emotion,Value,Majority
0,Happy,0.16,Republican
1,Angry,0.04,Republican
2,Surprise,0.22,Republican
3,Sad,0.21,Republican
4,Fear,0.36,Republican


In [30]:
all_emotes = pd.concat([dem_emo_df,rep_emo_df]) #concatenate dataframes for each political majority into one
all_emotes

Unnamed: 0,Emotion,Value,Majority
0,Happy,0.13,Democrat
1,Angry,0.07,Democrat
2,Surprise,0.2,Democrat
3,Sad,0.28,Democrat
4,Fear,0.33,Democrat
0,Happy,0.16,Republican
1,Angry,0.04,Republican
2,Surprise,0.22,Republican
3,Sad,0.21,Republican
4,Fear,0.36,Republican


In [None]:
#all_emotes.to_csv("tweet_emotions.csv",index=False)

In [34]:
import plotly.express as px
fig = px.bar(all_emotes, x="Emotion", y="Value", color="Majority", barmode = "group",
            title = "Emotions of Tweets Regarding COVID, Democratic Counties (n=99) vs. Republican Counties(n=103)")

fig.show()