In [1]:
import os
import tweepy
import pandas as pd
from datetime import datetime, timedelta, date
from time import sleep

In [2]:
## Twitter credentials
# Read from the environment so that secrets never have to be pasted into (and
# saved with) the notebook. Export TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
# TWITTER_ACCESS_KEY and TWITTER_ACCESS_SECRET before starting the kernel.
# A missing variable falls back to the empty string.
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')
ACCESS_KEY = os.environ.get('TWITTER_ACCESS_KEY', '')
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', '')

In [3]:
# Twitter screen names to crawl, grouped by country.
# Insertion order is the order in which countries and accounts are iterated.
off_accts = {
    'United States': [
        'WhiteHouse',
        'USAGov',
        'US_FDA',
        'CDCgov',
    ],
    'India': [
        'MoHFW_INDIA',
        'COVIDNewsByMIB',
        'TwitterIndia',
        'rashtrapatibhvn',
        'MoRD_GOI',
    ],
    'Brazil': [
        'govbrazil',
        'govbr',
        'jairbolsonaro',
        'fabiofaria',
        'minsaude',
    ],
    'Mexico': [
        'SSalud_mx',
        'GobiernoMX',
        'MexGov_GTA',
        'lopezobrador_',
        'SRE_mx',
        'm_ebrard',
    ],
}

In [4]:
## Criteria for COVID-19 data
# Each list is written as plain keywords followed by hashtags; the two parts are
# concatenated so the resulting list order is keywords first, hashtags second.
# NOTE(review): '#coronaviras' and '#stayatfomesafe' look like misspellings -
# confirm whether they are intentional before changing them.
general = [
    'ncov', 'coronavirus', 'covid', 'sars-cov-2', 'New coronavirus',
    'unknown pneumonia', 'corona', '2019-nCov', 'COVID-19', 'delta variant',
] + [
    '#Coronavirus', '#covid19', '#covid', '#corona', '#coronaviras',
    '#corona-virus', '#covid19-virus', '#sarscov2',
]

# Topic name -> keywords and hashtags for that topic.
key_dict = {
    'Closure': [
        'work from home', 'WFH', 'social distance', 'stayhome',
        'gatherings restrictions', 'lockdown', 'quarantine', 'reopen',
    ] + [
        '#stayatfomesafe', '#SocialDistancing', '#Quarantine',
    ],
    'Econ': [
        'International support', 'debt relief', 'fiscal measures', 'GDP', 'recession',
    ] + [
        '#stimulus', '#income', '#support', '#Debtrelief', '#Fiscalmeasures',
        '#export', '#COVID19Economy', '#unemployment',
    ],
    'Vaccination': [
        'vaccine', 'vaccination', 'vaccinating', 'vaccinate', 'immunization',
        'covidvaccine', 'covid19vaccine',
    ] + [
        '#vaccinate', '#GetVaccinated', '#VaccinesWork', '#vaccine',
    ],
    'Health': [
        'testing', 'mask mandate', 'masks', 'mask', 'vaccine development',
        'protect elderly',
    ] + [
        '#testing', '#Tracing', '#MaskOn', '#MaskOff', '#PCR', '#PCRTEST',
        '#MaskMandate',
    ],
}

In [5]:
# Authenticate with the user-context OAuth 1a credentials and build the API client.
auth = tweepy.OAuthHandler(CONSUMER_KEY,CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
# wait_on_rate_limit makes tweepy sleep until the rate-limit window resets
# instead of raising, so a long multi-account crawl is not aborted part-way.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [6]:
def _fetch_timeline(client, acct, page_size=200):
    """Collect every tweet `client.user_timeline` returns for `acct`.

    Pages backwards through the timeline: each request after the first passes
    `max_id` set to the id of the last tweet of the previous page minus one, so
    no tweet is returned twice. Stops at the first empty page, which means an
    account with no tweets yields an empty list.
    """
    collected = []
    max_id = None
    while True:
        params = {'screen_name': acct, 'count': page_size, 'tweet_mode': 'extended'}
        if max_id is not None:
            print(f"getting tweets before {max_id}")
            params['max_id'] = max_id

        page = client.user_timeline(**params)
        if not page:
            return collected

        collected.extend(page)
        # The last tweet of a page is treated as the oldest one seen so far.
        max_id = page[-1].id - 1
        print(f"{len(collected)} tweets downloaded so far")


def _tweet_to_record(tweet):
    """Flatten one tweet object into a dict keyed by the CSV column names."""
    try:
        # A retweet exposes the original tweet as `retweeted_status`; prefer its
        # text. (The attribute used to be misspelled `retweet_status`, so this
        # branch never ran and the retweet's own full_text was always stored.)
        text = tweet.retweeted_status.full_text
    except AttributeError:
        text = tweet.full_text

    return {
        'username': tweet.user.screen_name,
        'retweets': tweet.retweet_count,
        'likes': tweet.favorite_count,
        'text': text,
        'hashtags': [tag['text'] for tag in tweet.entities['hashtags']],
        'time': tweet.created_at,
    }


def acc_crawler(inputs, api_client=None):
    """Download the timeline of every listed account and save one CSV per account.

    Parameters
    ----------
    inputs : dict
        Maps a country name to a list of Twitter screen names.
    api_client : optional
        Object exposing `user_timeline(**kwargs)`. Defaults to the notebook-level
        `api` client when omitted.

    Side effects
    ------------
    Writes `<cwd>/Data/Tweets/<country>/<account>/<account>_tweets.csv` with the
    columns username, retweets, likes, text, hashtags and time. Missing
    directories are created. Progress is printed to stdout.
    """
    client = api if api_client is None else api_client
    columns = ['username', 'retweets', 'likes', 'text', 'hashtags', 'time']

    for country, accts in inputs.items():
        print('##### Crawling tweets in ', country, '#####')

        for acct in accts:
            savingdir = os.path.join(os.getcwd(), 'Data', 'Tweets', country, acct)
            # makedirs also creates the missing Data/Tweets/<country> parents,
            # which a plain mkdir would fail on.
            os.makedirs(savingdir, exist_ok=True)

            print('##### Crawling tweets in ', acct, '#####')

            list_tweets = _fetch_timeline(client, acct)

            # Build the frame once from all records rather than appending row by row.
            db = pd.DataFrame([_tweet_to_record(tweet) for tweet in list_tweets],
                              columns=columns)
            print(len(db), 'records are returned for', acct)

            filename = '{}_tweets.csv'.format(acct)
            db.to_csv(os.path.join(savingdir, filename))
            print('{} written'.format(filename))

In [7]:
# Crawl every account listed in off_accts; one CSV per account is written under
# <current working directory>/Data/Tweets/<country>/<account>/.
acc_crawler(off_accts)

##### Crawling tweets in  Mexico #####
##### Crawling tweets in  SSalud_mx #####
getting tweets before 1436389011556220929
400 tweets downloaded so far
getting tweets before 1434335477465862144
600 tweets downloaded so far
getting tweets before 1432146054800097282
800 tweets downloaded so far
getting tweets before 1429931331522371586
1000 tweets downloaded so far
getting tweets before 1427644625238609926
1200 tweets downloaded so far
getting tweets before 1425483420621316109
1400 tweets downloaded so far
getting tweets before 1423374419762761735
1600 tweets downloaded so far
getting tweets before 1421266861103230984
1800 tweets downloaded so far
getting tweets before 1419017037196693507
2000 tweets downloaded so far
getting tweets before 1417084302240608262
2200 tweets downloaded so far
getting tweets before 1415030770306007039
2400 tweets downloaded so far
getting tweets before 1412884380205338628
2600 tweets downloaded so far
getting tweets before 1411083367580422151
2800 tweets down