In [None]:
# For Tweepy API - needed to connect
import pickle
import os
import time
from tweepy import OAuthHandler
from tweepy import API
from tweepy import TweepError

# For Twitterscraper
from twitterscraper import query_tweets

# For dataframes
import datetime as dt
import pandas as pd

In [None]:
# Enter Twitter API info the first time running this notebook, then delete.
# Credentials will be saved into and loaded from a separate pkl file.
# NOTE(security): unpickling executes whatever the file contains, so only
# load a credentials file that this notebook itself wrote.
if not os.path.exists('secret_twitter_credentials.pkl'):
    # First run: persist the values typed in below so they can be removed
    # from the notebook afterwards.
    Twitter = {}
    Twitter['Consumer Key'] = ''
    Twitter['Consumer Secret'] = ''
    Twitter['Access Token'] = ''
    Twitter['Access Token Secret'] = ''
    with open('secret_twitter_credentials.pkl', 'wb') as f:
        pickle.dump(Twitter, f)
else:
    # Later runs: read the saved credentials back. The context manager
    # closes the file handle as soon as the load finishes.
    with open('secret_twitter_credentials.pkl', 'rb') as f:
        Twitter = pickle.load(f)

auth = OAuthHandler(Twitter['Consumer Key'], Twitter['Consumer Secret'])
auth.set_access_token(Twitter['Access Token'], Twitter['Access Token Secret'])

api = API(auth)

# If the authentication was successful, you should
# see the name of the account print out
print(api.me().name)

In [None]:
# Column-wise store for collected tweets: every field starts with its own
# empty list, and the lists are filled in parallel as tweets are appended.
tweets_dict = {
    field: []
    for field in ('timestamp', 'id', 'text', 'user',
                  'likes', 'replies', 'retweets', 'query')
}

In [None]:
def query_by_month(query, y, m, limit=None):
    """Scrape tweets matching ``query`` for every day of one calendar month.

    One twitterscraper search is issued per day, running from that day
    (``begindate``) to the following day (``enddate``). The search for the
    last day of the month therefore ends on the 1st of the next month
    (January of ``y + 1`` when ``m`` is 12).

    Every tweet returned is appended to the module-level ``tweets_dict``.
    That dictionary is not cleared here, so the returned frame also contains
    anything collected by earlier calls.

    Parameters
    ----------
    query
        Search term handed to ``query_tweets``; also stored in the ``query``
        column of every collected row.
    y : int
        Year to search.
    m : int
        Month to search (1-12).
    limit : optional
        Forwarded unchanged as ``limit`` to each daily ``query_tweets`` call.

    Returns
    -------
    pandas.DataFrame
        All rows currently held in ``tweets_dict``, indexed by ``timestamp``.

    Raises
    ------
    ValueError
        If ``y`` and ``m`` do not form a valid date (from ``datetime.date``).
    """
    one_day = dt.timedelta(days=1)

    # Walk the month with real date arithmetic so month lengths, leap years
    # and the December -> January rollover need no special cases.
    begin = dt.date(y, m, 1)
    while begin.month == m:
        end = begin + one_day

        # Run twitterscraper query for this single day
        for tweet in query_tweets(query, begindate=begin, enddate=end, limit=limit):
            # Append info to tweets_dict
            tweets_dict['timestamp'].append(tweet.timestamp)
            tweets_dict['id'].append(tweet.id)
            tweets_dict['text'].append(tweet.text)
            tweets_dict['user'].append(tweet.user)
            tweets_dict['likes'].append(tweet.likes)
            tweets_dict['replies'].append(tweet.replies)
            tweets_dict['retweets'].append(tweet.retweets)
            tweets_dict['query'].append(query)

        # Pause between daily searches
        time.sleep(1)

        # Advance the search window by one day
        begin = end

    # Save to dataframe
    tweets = pd.DataFrame(tweets_dict)
    tweets.set_index('timestamp', inplace=True)
    return tweets

In [None]:
# Enter desired values for query
query = 'poweroutage'
year = 2012
month = 10
limit = 10  # handed to query_by_month as its `limit` argument

# Run query (limit must be passed explicitly; otherwise query_by_month
# falls back to its default of limit=None and the value above is ignored)
tweets = query_by_month(query, year, month, limit=limit)

In [None]:
# Check number of total vs unique observations:
# "Total" is the row count of the frame, "Unique" the number of distinct
# values in the 'id' column. If Total is larger, some ids occur in more
# than one row.
print(f"Total: {tweets.shape[0]}")
print(f"Unique: {tweets['id'].nunique()}")
# Last expression of the cell: preview the first rows of the frame
tweets.head()

In [None]:
# Remove repeated rows, then report the same two counts again.
# drop_duplicates compares column values only; the timestamp index is ignored.
tweets = tweets.drop_duplicates()
print(f"Total: {tweets.shape[0]}")
print(f"Unique: {tweets['id'].nunique()}")