# Twitter data preparation

In [76]:
import twitter
import json
import sys
import pandas as pd
from datetime import datetime

In [3]:
ACCESS_TOKEN = ''
ACCESS_SECRET = ''
CONSUMER_KEY = ''
CONSUMER_SECRET = ''

## Twitter API + python-twitter package

In [4]:
# set up
api = twitter.Api(consumer_key=CONSUMER_KEY,
                  consumer_secret=CONSUMER_SECRET,
                  access_token_key=ACCESS_TOKEN,
                  access_token_secret=ACCESS_SECRET)

### Feb 27th - Mar 8th tweets 

In [125]:
tweets_profile = []
dates = pd.date_range("2020-02-27", "2020-03-09").strftime("%Y-%m-%d")
for day in dates:
    results = api.GetSearch(
        raw_query=f"q=COVID-19&lang=en&result_type=recent&until={day}&count=100")
    tweets_profile += results

len(tweets_profile)

1100

In [126]:
user_id = []
user_name = []
tweet_time = []
texts = []
for tweet in tweets_profile:
    if tweet.retweeted_status==None:
        texts.append(tweet.text)#.encode('utf-8'))
    else: 
        texts.append(tweet.retweeted_status.text)#encode('utf-8')
    
    user_id.append(tweet.id)
    user_name.append(tweet.user.screen_name)
    tweet_time.append(tweet.created_at)

In [127]:
df_twitter = pd.DataFrame(data = list(zip(user_id, user_name, tweet_time, texts)), 
                          columns = ["Id", "userName", "tweetTime", "text"])
df_twitter.head(5)

Unnamed: 0,Id,userName,tweetTime,text
0,1233180008212049920,SpeakUp_MHA,Thu Feb 27 23:59:59 +0000 2020,Wow - great way to look after small businesses...
1,1233180007767298048,AnaBamazing,Thu Feb 27 23:59:59 +0000 2020,"Friends, there's still no reason to panic abou..."
2,1233180007171751942,Ph0nograf,Thu Feb 27 23:59:59 +0000 2020,1) Interesting day. Last night @realDonaldTrum...
3,1233180005733105665,lunars_art,Thu Feb 27 23:59:59 +0000 2020,"Friends, there's still no reason to panic abou..."
4,1233180000872026114,Fmohnigeria,Thu Feb 27 23:59:58 +0000 2020,The Federal Ministry of Health has confirmed a...


In [128]:
df_twitter.shape

(1100, 4)

In [129]:
# extract weekday, date, and time from previous dataset
df_twitter["weekday"] = df_twitter["tweetTime"].apply(lambda x: x[0:3])
df_twitter["tweetTime"] = df_twitter["tweetTime"]\
                        .apply(lambda x: pd.to_datetime(datetime.strptime(x, '%a %b %d %H:%M:%S %z %Y')))

df_twitter["Date"] = df_twitter["tweetTime"].dt.date

df_twitter['Time'] = df_twitter["tweetTime"].dt.time

df_twitter.head(5)

Unnamed: 0,Id,userName,tweetTime,text,weekday,Date,Time
0,1233180008212049920,SpeakUp_MHA,2020-02-27 23:59:59+00:00,Wow - great way to look after small businesses...,Thu,2020-02-27,23:59:59
1,1233180007767298048,AnaBamazing,2020-02-27 23:59:59+00:00,"Friends, there's still no reason to panic abou...",Thu,2020-02-27,23:59:59
2,1233180007171751942,Ph0nograf,2020-02-27 23:59:59+00:00,1) Interesting day. Last night @realDonaldTrum...,Thu,2020-02-27,23:59:59
3,1233180005733105665,lunars_art,2020-02-27 23:59:59+00:00,"Friends, there's still no reason to panic abou...",Thu,2020-02-27,23:59:59
4,1233180000872026114,Fmohnigeria,2020-02-27 23:59:58+00:00,The Federal Ministry of Health has confirmed a...,Thu,2020-02-27,23:59:58


In [130]:
df_twitter.to_csv('df_twitter_feb27tomar8.csv',index=False)

## Twitter API + tweepy package

In [134]:
import tweepy

In [211]:
# auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
# auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
# api = tweepy.API(auth, wait_on_rate_limit=True)

# searched_tweets = [status for status in 
#                     tweepy.Cursor(api.search, 
#                         q="#COVID-19",
#                         since="2020-03-06",
#                         until="2020-03-07",lang='en').items(10000)]
# len(searched_tweets)

10000

In [216]:
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

searched_tweets = [status for status in 
                    tweepy.Cursor(api.search, 
                        q="#COVID-19",
                        since="2020-03-09",
                        until="2020-03-10",lang='en').items(10000)]
len(searched_tweets)

10000

In [217]:
user_id = []
user_name = []
tweet_time = []
texts = []
likes = []
retweets = []
location = []

for tweet in searched_tweets:
    try:
        if tweet.retweeted_status==None:
            texts.append(tweet.text)
        else: 
            texts.append(tweet.retweeted_status.text)
    except AttributeError:
        texts.append(tweet.text)
    
    user_id.append(tweet.id)
    user_name.append(tweet.user.screen_name)
    tweet_time.append(tweet.created_at)
    
    likes.append(tweet.user.favourites_count)
    retweets.append(tweet.retweet_count)
    location.append(tweet.user.location)    


In [218]:
df_twitter = pd.DataFrame(data = list(zip(user_id, user_name, tweet_time, texts, likes,
                                         retweets, location)), 
                          columns = ["Id", "userName", "tweetTime", "text", "likes",
                                    "retweets", "usrLocation"])
df_twitter.head(5)

Unnamed: 0,Id,userName,tweetTime,text,likes,retweets,usrLocation
0,1237166273454198784,SummitSheriffCO,2020-03-09 23:59:59,The number one question I get is what Colorada...,1754,43,"Summit County, Colorado"
1,1237166272112017408,GGreediguts,2020-03-09 23:59:59,Publishing: We're hiring!\nMe: Great! Is it re...,1810,2016,Always seem to be in Flyover Country
2,1237166271428464642,Deplorable4Trum,2020-03-09 23:59:59,I don't recall H1N1 in 2009 being hyped at any...,52681,638,
3,1237166269352288259,minbyunh,2020-03-09 23:59:58,COVID-19 Outbreak Donations\n\nChanyeol- 50mil...,242,3336,nowhere
4,1237166268978999296,ClaudeUbanan,2020-03-09 23:59:58,After hearing and seeing the President last ni...,8546,31,


In [195]:
df_twitter.shape

(10000, 7)

In [219]:
df_twitter["Date"] = df_twitter["tweetTime"].dt.date
df_twitter['Time'] = df_twitter["tweetTime"].dt.time

df_twitter.head(5)

Unnamed: 0,Id,userName,tweetTime,text,likes,retweets,usrLocation,Date,Time
0,1237166273454198784,SummitSheriffCO,2020-03-09 23:59:59,The number one question I get is what Colorada...,1754,43,"Summit County, Colorado",2020-03-09,23:59:59
1,1237166272112017408,GGreediguts,2020-03-09 23:59:59,Publishing: We're hiring!\nMe: Great! Is it re...,1810,2016,Always seem to be in Flyover Country,2020-03-09,23:59:59
2,1237166271428464642,Deplorable4Trum,2020-03-09 23:59:59,I don't recall H1N1 in 2009 being hyped at any...,52681,638,,2020-03-09,23:59:59
3,1237166269352288259,minbyunh,2020-03-09 23:59:58,COVID-19 Outbreak Donations\n\nChanyeol- 50mil...,242,3336,nowhere,2020-03-09,23:59:58
4,1237166268978999296,ClaudeUbanan,2020-03-09 23:59:58,After hearing and seeing the President last ni...,8546,31,,2020-03-09,23:59:58


In [220]:
df_twitter.to_csv('df_twitter.csv',index=False)