# DS700 Final Project
#### Analysis of public sentiment toward Optum vs. UHC, based on Twitter tweets

In [1]:
import tweepy
import pandas as pd
import numpy as np
import re
import pickle
from textblob import TextBlob

In [2]:
# read the CSV file holding the Twitter API credentials into a DataFrame
creds= pd.read_csv('credentials.csv')

In [3]:
# inspect the credential table: column names, then its (rows, columns) shape
print(creds.columns)
print(creds.shape)

Index(['consumerKey', 'consumerSecret', 'tokenKey', 'tokenSecret'], dtype='object')
(1, 4)


In [4]:
# pull each credential out of the row labelled 0 of the credential table
con_key = creds.at[0, 'consumerKey']
con_secret = creds.at[0, 'consumerSecret']
acc_token = creds.at[0, 'tokenKey']
acc_secret = creds.at[0, 'tokenSecret']

In [5]:
# Build an OAuth handler from the consumer key/secret, then attach the
# access token/secret to it.
auth = tweepy.OAuthHandler(consumer_key=con_key, 
                           consumer_secret=con_secret)
auth.set_access_token(acc_token, acc_secret)

# Create the API handle from the authenticated handler; the cells below
# use `api` for every Twitter request.
api = tweepy.API(auth)

In [6]:
# confirm the credentials work by displaying the name of the authenticated user
api.me().name

'Adam Hendel'

In [7]:
def gather_tweets(terms, num_needed=2000, api_client=None):
    '''Search Twitter for each term in turn and collect the matching tweets.

    Each term is paged through, 100 English-language results per request,
    until the search returns nothing more or a request fails; the function
    then moves on to the next term. Collection stops as soon as at least
    `num_needed` tweets are held, so the result can be slightly longer than
    `num_needed` (whole pages are kept).

    The result is NOT de-duplicated: one tweet can match several terms,
    e.g. terms=['texas', 'tx'].

    Parameters
    ----------
    terms : iterable of str
        Search queries, tried in the given order.
    num_needed : int
        Stop once this many tweets have been collected.
    api_client : object, optional
        Object exposing `search(q=..., count=..., lang=..., max_id=...)`.
        Defaults to the notebook-level `api` handle.

    Returns
    -------
    list
        The tweet objects returned by the search calls, in retrieval order.
    '''
    client = api if api_client is None else api_client
    tweets = []
    for term in terms:
        # The paging cursor belongs to one term only. Resetting it here
        # (rather than only when a term is exhausted) keeps a failed request
        # from leaking a stale max_id into the next term's first search.
        last_id = None
        while True:  # keep paging while this term still yields tweets
            if len(tweets) >= num_needed:  # requirement met, stop everything
                print('Reached  Limit: {}'.format(len(tweets)))
                return tweets

            params = {'q': term, 'count': 100, 'lang': 'en'}
            if last_id is not None:
                # only ask for tweets strictly older than the last one seen;
                # the first page of a term is requested without any max_id
                params['max_id'] = str(last_id - 1)

            try:
                query = client.search(**params)
                print('{} : {} tweets'.format(term, len(query)))
            except tweepy.TweepError as e:
                # best effort: report the failure and move on to the next term
                print("Error", e)
                break

            if not query:
                print('No more tweets under "{}"'.format(term))
                break

            tweets.extend(query)
            last_id = query[-1].id  # oldest tweet of the page just fetched

    tot_tweets = len(tweets)
    if tot_tweets < num_needed:
        print('Exhausted Search: Found {} Tweets. . .'.format(tot_tweets))
    return tweets  # every term is exhausted: return whatever was found

In [8]:
# Search terms per brand, tried in the order listed.
# NOTE(review): the '%23' / '%' inside the hashtag terms looks like leftover
# URL-encoding -- confirm these queries match the intended hashtags.
optum_terms = [
    '#%23optum',
    '@optum',
    'optum',
]
uhc_terms = [
    '#%uhc',
    '#%myuhc',
    '@AskUHC',
    'unitedhealthcare',
]

In [9]:
# collect Optum tweets (gather_tweets stops at its default num_needed of 2000)
optumTweets = gather_tweets(optum_terms)

#%23optum : 24 tweets
#%23optum : 0 tweets
No more tweets under "#%23optum"
@optum : 100 tweets
@optum : 100 tweets
@optum : 26 tweets
@optum : 0 tweets
No more tweets under "@optum"
optum : 100 tweets
optum : 100 tweets
optum : 100 tweets
optum : 100 tweets
optum : 100 tweets
optum : 100 tweets
optum : 89 tweets
optum : 100 tweets
optum : 100 tweets
optum : 96 tweets
optum : 100 tweets
optum : 100 tweets
optum : 80 tweets
optum : 0 tweets
No more tweets under "optum"
Exhausted Search: Found 1515 Tweets. . .


In [10]:
# collect UHC tweets (gather_tweets stops at its default num_needed of 2000)
uhcTweets = gather_tweets(uhc_terms)

#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
#%uhc : 100 tweets
Reached  Limit: 2000


In [11]:
# save the raw tweet objects to disk for later use, one pickle file per brand
for _path, _tweets in (('optum_tweets.pkl', optumTweets),
                       ('uhc_tweets.pkl', uhcTweets)):
    with open(_path, 'wb') as output:
        pickle.dump(_tweets, output, pickle.HIGHEST_PROTOCOL)

# Text Cleaning & Processing

In [2]:
# Reload the raw tweet objects pickled earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code -- only load files this
# notebook wrote itself, never pickles from an untrusted source.
# The handle is named `fh` so the builtin `input` is not shadowed for the
# rest of the kernel session.
with open('optum_tweets.pkl', 'rb') as fh:
    optum = pickle.load(fh)
with open('uhc_tweets.pkl', 'rb') as fh:
    uhc = pickle.load(fh)

In [3]:
# how many Optum tweets carry a geolocation, and what share of all Optum tweets that is
geos = sum(1 for tweet in optum if tweet.geo is not None)
print(geos, geos/len(optum))

4 0.00261437908496732


In [4]:
# same geolocation check for the UHC tweets: count, then share of all UHC tweets
geos = sum(1 for tweet in uhc if tweet.geo is not None)
print(geos, geos/len(uhc))

0 0.0


## Geolocation data is very sparse, so it is excluded from the analysis

In [5]:
def count_emoji(tweets, emoji=':)'):
    '''Count the tweets whose text contains `emoji`.

    A tweet is counted once no matter how many times the emoji occurs in
    it, so the result is a number of tweets, not a number of emoji.

    Parameters
    ----------
    tweets : iterable
        Tweet objects exposing a `.text` string.
    emoji : str
        Substring to look for; defaults to the ':)' smiley.

    Returns
    -------
    int
        Number of tweets containing `emoji`.
    '''
    # honour the `emoji` argument instead of a hard-coded ':)'
    return sum(1 for tweet in tweets if emoji in tweet.text)

In [6]:
def drop_char(tweets):
    '''Strip a fixed set of punctuation characters from every tweet string.

    Returns a new list with one cleaned string per input string, in the
    same order. Characters outside the set (letters, digits, spaces,
    '!' and '?') are left untouched.
    '''
    dropChar='."#$%&\'()*+,-/:;<=>@[\\]^_`{|}~…'
    # a translation table that maps every unwanted character to "deleted"
    removal_table = str.maketrans('', '', dropChar)
    return [text.translate(removal_table) for text in tweets]

In [7]:
def tweet_scrub(tweets):
    '''Remove urls, hashtags, usernames and retweet annotations from tweets.

    For every tweet string this:
      1. deletes each token starting with `http`, `www`, `#` or `@`, and a
         stand-alone `RT ` marker together with the token that follows it;
      2. replaces every run of non-ASCII characters with a single space;
      3. collapses runs of whitespace and trims both ends.

    Tweets left with fewer than two characters are dropped, so the result
    can be shorter than the input.

    Parameters
    ----------
    tweets : iterable of str

    Returns
    -------
    list of str
        The cleaned tweets, in input order.
    '''
    # `\bRT ` only matches the retweet marker as its own word; without the
    # word boundary, "SMART people" would lose "RT people".
    token_pattern = re.compile(r"(http|www|#|@|\bRT )\S+")
    non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')

    cleaned = []
    for tweet in tweets:
        # drop urls, hashtags, @targets and retweet markers
        clean_tweet = token_pattern.sub("", tweet)
        clean_tweet = non_ascii_pattern.sub(' ', clean_tweet)
        # drop extra white space
        clean_tweet = ' '.join(clean_tweet.split())
        if len(clean_tweet) > 1:  # remove tweets that are 0 or 1 char long
            cleaned.append(clean_tweet)
    return cleaned

In [8]:
def screen_org(tweets, orgList):
    '''Return the text of every tweet not authored by a listed organization.

    A tweet is treated as coming from an organization when any entry of
    `orgList` occurs, ignoring case, inside the author's user name. Those
    tweets are skipped; for all others the `.text` is collected in order.
    '''
    def _from_org(tweet):
        # case-insensitive substring match of each org fragment against the user name
        return any(org.upper() in tweet.user.name.upper() for org in orgList)

    return [tweet.text for tweet in tweets if not _from_org(tweet)]

In [9]:
def process_tweets(tweets, orgList):
    '''Turn a list of tweet objects into a list of cleaned tweet strings.

    Steps, in order:
      1. drop tweets whose author name matches an entry of `orgList`
         (screen_org), keeping only the tweet text;
      2. drop exact duplicate texts, keeping the first occurrence;
      3. strip urls, hashtags, usernames and retweet markers (tweet_scrub);
      4. strip special characters (drop_char).

    Parameters
    ----------
    tweets : iterable
        Tweet objects exposing `.text` and `.user.name`.
    orgList : iterable of str
        Name fragments identifying the organizations' own accounts.

    Returns
    -------
    list of str
        Cleaned tweet texts, in the order they were first seen.
    '''
    # remove tweets from the originating organization
    tweet_list = screen_org(tweets, orgList)

    # Remove duplicates on the RAW text, before scrubbing, as the original
    # design intends. dict.fromkeys keeps first-seen order, whereas
    # list(set(...)) yields a hash-dependent order that changes between
    # kernel sessions and makes any later slicing of the result
    # irreproducible.
    tweet_list = list(dict.fromkeys(tweet_list))

    # clean up the texts
    tweets_clean = tweet_scrub(tweet_list)

    # drop special characters
    tweets_clean = drop_char(tweets_clean)

    return tweets_clean

In [10]:
# one column per brand; rows hold the count_emoji result and the total tweet count
emoji_df = pd.DataFrame(
    {brand: [count_emoji(brand_tweets), len(brand_tweets)]
     for brand, brand_tweets in (('Optum', optum), ('UHC', uhc))},
    index=['num_emojis', 'num_tweets'],
)

In [11]:
# show the emoji summary table, then save it to emoji_results.csv
print(emoji_df)
emoji_df.to_csv('emoji_results.csv')

            Optum   UHC
num_emojis      0     9
num_tweets   1530  2000


In [12]:
# clean up the tweet objects
# `uhg` holds name fragments of the companies' own accounts; process_tweets
# drops every tweet whose author name contains one of them (case-insensitive)
uhg = ['optum', 'uhc', 'uhg', 'unitedhealth']
optum_clean = process_tweets(tweets=optum, orgList=uhg)
uhc_clean = process_tweets(tweets=uhc, orgList=uhg)

In [13]:
# how many tweets are left after cleaning: Optum first, then UHC
print(len(optum_clean))
print(len(uhc_clean))

942
1225


In [14]:
# save cleaned tweets to a dataframe, one column per brand
# both lists are cut to the length of the shorter one so the columns line up;
# tweets beyond that length in the longer list are left out, and a row simply
# pairs the i-th Optum tweet with the i-th UHC tweet (they are otherwise unrelated)
minLen = min(len(optum_clean), len(uhc_clean))
tweet_df = pd.DataFrame({'optumTweets':optum_clean[:minLen],
                         'uhcTweets':uhc_clean[:minLen]})

In [15]:
# write the cleaned tweets to tweets_cleaned.csv without the row index
tweet_df.to_csv('tweets_cleaned.csv', index=False)

# Sentiment Analysis

In [16]:
# read clean tweets in
# Force every cell to stay a string: with pandas' default NA handling an
# empty tweet, or one that reads "NA", "null", "nan", ..., comes back as a
# float NaN, which is not valid text input for the sentiment step.
tweet_df=pd.read_csv('tweets_cleaned.csv', dtype=str, keep_default_na=False)
tweet_df.columns

Index(['optumTweets', 'uhcTweets'], dtype='object')

In [17]:
# conduct sentiment on each tweet, save results
# Only the text columns are scored. Skipping the derived '<col>_pol' /
# '<col>_sub' columns keeps this cell re-runnable: on a second run the frame
# already holds those numeric columns, and they must not be fed to TextBlob.
text_cols = [col for col in tweet_df.columns
             if not col.endswith(('_pol', '_sub'))]
for col in text_cols:
    # one sentiment result (polarity, subjectivity) per tweet in this column
    sentiments = [TextBlob(text).sentiment for text in tweet_df[col]]
    tweet_df['{}_pol'.format(col)] = [s.polarity for s in sentiments]
    tweet_df['{}_sub'.format(col)] = [s.subjectivity for s in sentiments]

In [18]:
# inspect data: display 15 randomly chosen rows
# NOTE(review): no random_state is passed, so a re-run shows different rows
tweet_df.sample(n=15)

Unnamed: 0,optumTweets,uhcTweets,optumTweets_pol,optumTweets_sub,uhcTweets_pol,uhcTweets_sub
231,SHYFT Analytics is another garbage Optum VC co...,UHC every day at 512557623940038,0.5,0.5,0.0,0.0
692,Do you ever forget important questions to ask ...,Absolutely the key to tackling malaria and HI...,0.45,0.75,0.05,0.516667
214,unhoptum venture fund is a fantastic idea,20 para 400 waaa v UHC BADLION FFA Road To 40...,0.4,0.9,0.0,0.0
52,Does your get diaper rash? What remedies work ...,The donut hole should be against the law They ...,-0.1875,0.5,0.0,0.0
906,Your baby will a lot in their first year Find ...,Nick carpigaming,0.375,0.416667,0.0,0.0
267,Look at NHS supply chain CSUs OptumUH Big4 manag,All people have the right to health no matter ...,0.0,0.0,0.142857,0.267857
85,Employee Wellnes Is A Culture At Optum Gayatri...,FFA pls,0.0,0.0,0.0,0.0
874,Optum Ventures launched will invest 250 millio...,Uhc for 4k pls,0.0,0.0,0.0,0.0
358,UNH after announcement of 250M fund to invest ...,why does everyone on Hypixel Solo UHC !! play ...,-0.05,0.2,0.0,0.0
860,ICYMI Through new Optum Ventures unit UnitedHe...,I liked a video GER UHC Bedwars SkyWars und mehr,0.068182,0.227273,0.6,0.8


In [19]:
# save to file: for analysis in R
# columns[2:] skips the first two columns by position, so only the columns
# after them are printed and written
print(tweet_df.columns[2:])
tweet_df[tweet_df.columns[2:]].to_csv('optum_uhc_sentiments.csv', index=False)

Index(['optumTweets_pol', 'optumTweets_sub', 'uhcTweets_pol', 'uhcTweets_sub'], dtype='object')
