In [2]:
import re
import os
import csv
import time
import json
import string
import tweepy
import datetime
import warnings

import pprint
import numpy as np
import pandas as pd
import seaborn as sns
import preprocessor as pre
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook as tqdm

from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pandas import Panel
# Register tqdm's progress_apply on pandas objects (used by the cleaning cells below).
tqdm.pandas()
# Let long text columns (tweet bodies) display up to 500 characters before truncation.
pd.set_option('max_colwidth',500)

# Silence every warning category for the whole notebook.
# NOTE(review): this also hides pandas' chained-assignment warnings — confirm that is intended.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

# Base directory for every input/output file in this notebook (must end with '/',
# because paths are built by plain string concatenation).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ROOT_DATA = 'D:/Statistical Programming Projects/Social Network and Sentiment Analysis/data/'


# Plot defaults: figure size, line/marker sizes, whitegrid theme, font scale and font family.
# The calls are order-sensitive (later calls override earlier settings), so keep the sequence.
sns.set(rc={'figure.figsize':(9,6),'lines.linewidth': 5, 'lines.markersize': 10})
plt.style.use('seaborn-whitegrid')
sns.set_context("notebook", font_scale=1.2)
sns.set_style("whitegrid",{"font.family": ["Corbel"]})

### Load Twitter Dev credentials

In [11]:
# Load the Twitter developer credentials by executing every line of twitter.txt
# as Python code in the notebook namespace.
# presumably the file holds assignments for consumer_key, consumer_secret,
# access_token and access_token_secret (the names the auth cell reads) — verify.
# NOTE(review): exec() runs arbitrary code from the file; only use with a file you
# fully control, and consider parsing key/value pairs (or env vars) instead.
with open(ROOT_DATA+'twitter.txt', 'r') as textFile:
    for line in textFile:
        exec(line)

In [3]:
# Build the OAuth handler from the app's consumer key/secret, then attach the
# user-level access token/secret to it.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# API client: block (and report) when the rate limit is hit instead of failing,
# with a long request timeout.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
    timeout=1000,
)

### Set date parameters and file save location

In [4]:
# CSV file the scraped tweets are appended to (and later re-read from)
uhc_tweets = ROOT_DATA + "uhc_data.csv"

# Date window passed to the search call as its since/until arguments
start_date = datetime.datetime(year=2020, month=4, day=18)
end_date = datetime.datetime(year=2020, month=4, day=29)

# Column order of the tweet CSV; every row dict written by write_tweets uses these keys.
COLS = [
    'id',              # unique id of the tweet
    'tweeter_handle',  # screen name of the author
    'init_reach',      # author's follower count
    'timestamp',       # tweet creation time
    'orig_tweet',      # full text of the tweet
    'likes',           # favourite count
    'retweets',        # retweet count
    'hashtags',        # comma-separated hashtags used
    'mentions',        # comma-separated screen names mentioned
    'location',        # author's profile location
    'retweeted',       # id of the tweet being retweeted/quoted, if any
    'reply',           # id of the tweet being replied to, if any
]
# Text emoticons expressing a positive mood
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Text emoticons expressing a negative mood
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Emoji patterns
# Matches runs of emoji / pictographic symbols so they can be stripped from text.
# The previous catch-all range U+24C2..U+1F251 covered far more than emoji: it
# swallowed CJK, Hangul, Arabic presentation forms, full-width letters, etc.
# It is replaced by the symbol blocks inside that range; every range below is a
# subset of what was matched before, so nothing is newly matched.
# NOTE(review): if a downstream step relied on this pattern to drop non-Latin
# scripts, add an explicit filter for that instead.
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"  # dingbats
         u"\U000024C2"             # circled latin capital M
         u"\U00002600-\U000026FF"  # miscellaneous symbols
         u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
         u"\U0001F000-\U0001F251"  # tiles, playing cards, enclosed alphanumerics/ideographs
         "]+", flags=re.UNICODE)

# Every known emoticon, happy or sad
emoticons = emoticons_happy | emoticons_sad

### Method to scrape and tabulate tweets

In [5]:
def _entity_csv(tweet, kind, field):
    """Join `field` of every `kind` entity attached to `tweet` into a ', '-separated string."""
    return ", ".join(entity[field] for entity in tweet['entities'][kind])


def _tweet_row(tweet):
    """Build a COLS-shaped row describing `tweet` (a status JSON dict) as a standalone tweet."""
    author = tweet['user']
    return {
        'id': tweet['id_str'],
        'tweeter_handle': author['screen_name'],
        'init_reach': author['followers_count'],
        'timestamp': tweet['created_at'],
        'orig_tweet': tweet['full_text'],
        'likes': tweet['favorite_count'],
        'retweets': tweet['retweet_count'],
        'hashtags': _entity_csv(tweet, 'hashtags', 'text'),
        'mentions': _entity_csv(tweet, 'user_mentions', 'screen_name'),
        'location': author['location'],
        'retweeted': None,
        'reply': None,
    }


def _locate_embedded(status, pp):
    """Find the tweet a retweet/quote refers to.

    Returns (embedded_key, retweeted_id, media, aberrant):
    embedded_key is the key of the embedded status to read; media is True when a
    retweet carries no 'retweeted_status' and the id had to be looked up in the
    media entities; aberrant is True when even the media entities are missing.
    """
    retweeted_id = None
    media = False
    aberrant = False

    if 'quoted_status' in status:
        embedded_key = 'quoted_status'
        retweeted_id = status[embedded_key]['id_str']
    else:
        embedded_key = 'retweeted_status'
        try:
            retweeted_id = status[embedded_key]['id_str']
        except KeyError:
            # No embedded status: fall back to the source id stored on a media entity.
            media = True
            try:
                retweeted_id = [details['source_status_id_str']
                                for details in status['entities']['media']
                                if 'source_status_id' in details][0]
            except KeyError:
                aberrant = True
            except IndexError:
                # Media present but none points at a source tweet: dump it for inspection.
                pp.pprint(status)

    return embedded_key, retweeted_id, media, aberrant


def _rows_for_status(status, pp):
    """Turn one status JSON dict into the list of CSV rows to append for it.

    An original tweet yields one row. A retweet or quote yields the embedded
    tweet (when it is available) followed by the retweet/quote action itself.
    """
    text = status['full_text']
    is_retweet = text.startswith('RT @')

    if not (is_retweet or 'quoted_status' in status):
        # Original tweet. Note that 'reply' is only recorded for retweets/quotes.
        return [_tweet_row(status)]

    embedded_key, retweeted_id, media, aberrant = _locate_embedded(status, pp)

    rows = []
    if not media:
        # Plain retweet/quote: the referenced tweet is embedded, save it as its own row.
        rows.append(_tweet_row(status[embedded_key]))

    # The retweet/quote action. A pure retweet carries no content of its own, so
    # its text/engagement fields are blanked; an aberrant retweet keeps its text
    # and mentions because the original could not be located.
    action = _tweet_row(status)
    if is_retweet:
        action['likes'] = 0
        action['retweets'] = 0
        action['hashtags'] = ''
        if not aberrant:
            action['orig_tweet'] = ''
            action['mentions'] = ''
    action['retweeted'] = retweeted_id
    action['reply'] = status.get('in_reply_to_status_id_str')
    rows.append(action)
    return rows


def write_tweets(keyword, file):
    """Search Twitter for `keyword` and append every hit to the CSV at `file`.

    Parameters
    ----------
    keyword : str
        Search query handed to the API's search endpoint.
    file : str
        Path of the CSV to append to. It is created with a COLS header row when
        it does not exist yet (or is empty).

    Reads the module-level `api`, `COLS`, `start_date` and `end_date`.
    Returns None; the result is the rows appended to `file`.
    """
    pp = pprint.PrettyPrinter(indent=2)

    needs_header = not os.path.exists(file) or os.path.getsize(file) == 0

    # Open once for the whole crawl instead of once per row. newline='' is what the
    # csv module requires so embedded newlines in tweet text are written verbatim.
    with open(file=file, mode="a", encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=COLS, lineterminator='\n')
        if needs_header:
            writer.writeheader()
            csvfile.flush()

        # count / page limit are set far above anything reachable so the cursor
        # simply runs until the results are exhausted.
        # NOTE(review): the endpoint presumably caps `count` much lower — confirm.
        pages = tweepy.Cursor(api.search
                              ,q=keyword
                              ,tweet_mode='extended'
                              ,include_rts=True
                              ,count=1000000
                              ,lang="en"
                              ,since=start_date
                              ,until=end_date).pages(100000000000000)

        for page in pages:
            for status in page:
                for row in _rows_for_status(status._json, pp):
                    writer.writerow(row)
            # Persist each page so a crash or long rate-limit sleep loses little.
            csvfile.flush()

In [6]:
# Search query: any of the common COVID-19 spellings
uhc_keywords = '(covid OR covid19 OR coronavirus OR cov19 OR cov-19)'

# Use the cached CSV when it exists; scrape only when it is missing. Only the
# "file not found" case triggers a scrape — any other failure (interrupt, parse
# error, ...) is surfaced instead of silently starting a long crawl.
try:
    covid_tweets = pd.read_csv(uhc_tweets)
except FileNotFoundError:
    # call the scraper with the query and the output path, then load its result
    write_tweets(uhc_keywords, uhc_tweets)
    covid_tweets = pd.read_csv(uhc_tweets)

In [7]:
# 'retweeted' holds tweet ids but contains blanks, so read_csv parsed it as
# float64; ids above 2**53 are rounded there and would no longer match the 'id'
# column. Re-read that single column as text to get the exact digits back, and
# keep the 'nan' string marker the rest of this cell expects for missing values.
# (covid_tweets was loaded from this same file, so rows line up positionally.)
covid_tweets.retweeted = (
    pd.read_csv(uhc_tweets, usecols=['retweeted'], dtype=str)
      .retweeted
      .fillna('nan')
      .values
)

# Convert Twitter's 'Sat Apr 18 10:00:00 +0000 2020' timestamps to 'Y-m-d H:M:S'
covid_tweets.timestamp = covid_tweets.timestamp.apply(
    lambda x: time.strftime('%Y-%m-%d %H:%M:%S',
                            time.strptime(x, '%a %b %d %H:%M:%S +0000 %Y')))

# Rows that still carry raw 'RT @...' text (missing text counts as "no")
fix_retweet_handle = covid_tweets[covid_tweets.orig_tweet.str.startswith('RT @', na=False)]

def add_retweet_id(status_id):
    """Fill in `retweeted` for the tweet `status_id` by text-matching its source.

    The tweet's text is expected to look like 'RT @handle: <original text>'.
    The first 30 characters of the original text are searched (as a literal
    substring) among the tweets authored by `handle` in the module-level
    `covid_tweets`; the first matching id is stored, as a string, in the
    `retweeted` column of every row with id `status_id` (NaN when nothing
    matches). Rows whose text is missing or contains no @handle are left as-is.

    Always returns True so it can be used with Series.apply.
    """
    is_target = covid_tweets.id == status_id

    # Prefer the first non-missing text among the rows sharing this id.
    texts = covid_tweets.loc[is_target, 'orig_tweet'].unique().tolist()
    known_texts = [text for text in texts if pd.notnull(text)]
    if not known_texts:
        return True
    orig_tweet = known_texts[0]

    handles = re.findall(r'@(\w+)', orig_tweet)
    if not handles:
        return True
    tweet_handle = handles[0]

    # Leading snippet of the retweeted text, without the 'RT @handle: ' prefix.
    tweet = orig_tweet.replace('RT @' + tweet_handle + ": ", '')[0:30]

    # regex=False: tweet text is data, not a pattern ('(', '?', '+' ... are literal);
    # na=False: rows without text can never be the source.
    is_source = ((covid_tweets.tweeter_handle == tweet_handle)
                 & covid_tweets.orig_tweet.str.contains(tweet, regex=False, na=False))
    retweet_id = covid_tweets.id[is_source].unique().tolist()

    covid_tweets.loc[is_target, ['retweeted']] = \
        str(retweet_id[0]) if len(retweet_id) > 0 else np.nan
    return True

print("\nFix retweet IDs for media")
done = fix_retweet_handle.id.progress_apply(add_retweet_id)

# Rows whose text is still a raw 'RT @...' are pure retweet actions: wipe their
# content fields so only the link to the original remains. The mask is computed
# once, before orig_tweet itself is cleared, and applied with .loc — assigning
# through df.col[mask] is chained indexing and may write to a temporary copy.
is_raw_retweet = covid_tweets.orig_tweet.str.startswith('RT @', na=False)
covid_tweets.loc[is_raw_retweet, ['hashtags', 'mentions']] = np.nan
covid_tweets.loc[is_raw_retweet, ['likes', 'retweets']] = 0
covid_tweets.loc[is_raw_retweet, 'orig_tweet'] = np.nan

# Ids that occur on more than one row, with how many rows each has
# (columns: 'id', 'counts'; fresh 0..n-1 index).
rows_per_id = covid_tweets.groupby('id').tweeter_handle.count()
repeated_tweets = (
    rows_per_id[rows_per_id > 1]
    .rename('counts')
    .reset_index()
)

# Split the data into rows with a unique id and rows that need de-duplicating,
# both ordered by id, handle and timestamp (descending).
sort_keys = ['id', 'tweeter_handle', 'timestamp']
is_repeated = covid_tweets.id.isin(repeated_tweets.id)
ok_tweets = covid_tweets[~is_repeated].sort_values(sort_keys, ascending=False)
tweets_to_clean = covid_tweets[is_repeated].sort_values(sort_keys, ascending=False)


def clean_tweet(tweet_id):
    """Collapse all rows of `tweets_to_clean` sharing `tweet_id` into one row.

    Likes and retweets take the maximum over the duplicates; `retweeted` takes
    the first value that is not the string 'nan' (NaN if there is none); every
    other column takes its first distinct value in row order.

    Returns a one-row DataFrame (index 0) with the twelve tweet columns.
    """
    dupes = tweets_to_clean.loc[tweets_to_clean.id == tweet_id]

    def first_seen(column):
        # first distinct value of the column, in the frame's current row order
        return dupes[column].unique().tolist()[0]

    linked = [ref for ref in dupes.retweeted.unique().tolist() if ref != 'nan']
    replies = dupes.reply.unique().tolist()

    record = {'id': tweet_id}
    for column in ('tweeter_handle', 'init_reach', 'timestamp', 'orig_tweet'):
        record[column] = first_seen(column)
    record['likes'] = dupes.likes.max()
    record['retweets'] = dupes.retweets.max()
    for column in ('hashtags', 'mentions', 'location'):
        record[column] = first_seen(column)
    record['retweeted'] = linked[0] if linked else np.nan
    record['reply'] = replies[0] if replies else np.nan

    return pd.DataFrame(record, index=[0])

print("\nRemove duplicate records")
# One collapsed single-row frame per repeated id; pd.concat rejects an empty
# list, so fall back to an empty frame when there were no duplicates at all.
cleaned_frames = repeated_tweets.id.progress_apply(clean_tweet).tolist()
cleaned_tweets = (pd.concat(cleaned_frames, axis=0) if cleaned_frames
                  else tweets_to_clean.iloc[0:0])

last_weeks_covid_tweets = pd.concat([ok_tweets, cleaned_tweets], axis=0)
last_weeks_covid_tweets = last_weeks_covid_tweets.sort_values(['id', 'tweeter_handle', 'timestamp'],
                                                              ascending=False)
last_weeks_covid_tweets.reset_index(inplace=True, drop=True)
# Turn the 'nan' string marker back into a real missing value (.loc instead of
# chained indexing, which may write to a temporary copy).
last_weeks_covid_tweets.loc[last_weeks_covid_tweets.retweeted == 'nan', 'retweeted'] = np.nan

# Retweet links pointing at a tweet that is not part of the data set
tweets = last_weeks_covid_tweets.id.unique().tolist()
known_ids = set(tweets)  # set lookup instead of scanning the list for every link
retweeted_tweets = last_weeks_covid_tweets.retweeted.unique().tolist()
retweeted_tweets = [int(tweet) for tweet in retweeted_tweets if not pd.isnull(tweet)]
unmatched_retweets = [str(tweet) for tweet in retweeted_tweets if tweet not in known_ids]

# A row with its own text (e.g. a quote) is kept but loses the dangling link ...
has_dangling_link = last_weeks_covid_tweets.retweeted.isin(unmatched_retweets)
last_weeks_covid_tweets.loc[has_dangling_link & last_weeks_covid_tweets.orig_tweet.notnull(),
                            'retweeted'] = np.nan
# ... while a text-less retweet of an unknown tweet is dropped altogether.
last_weeks_covid_tweets = last_weeks_covid_tweets[~last_weeks_covid_tweets.retweeted.isin(unmatched_retweets)]
last_weeks_covid_tweets.to_csv(ROOT_DATA+"covid_nlp_data.csv", index=False, encoding="utf-8")


Fix retweet IDs for media


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))



Remove duplicate records


HBox(children=(FloatProgress(value=0.0, max=4707.0), HTML(value='')))


