In [None]:
import os
import json
import requests
import time
import urllib.parse
import config
import pandas as pd
import feather

# 1) Creating the API connection and defining functions

In [None]:
#The bearer token is kept outside the notebook, in config.py
bearer_token = config.bearer_token
#Authorization header that is passed to every API call in this notebook
headers = {"Authorization": f"Bearer {bearer_token}"}

def main(query, fields, headers, print_yes=True, max_retries=1, retry_wait=300):
    """
    Send one GET request to the Twitter v2 full-archive search endpoint
    (/2/tweets/search/all) and return the decoded JSON response.
    
    Inputs:
    query = String variable. This is the main query, for example
            '#banking lang:en -is:retweet'. Pass it unencoded, it is
            percent-encoded inside this function.
    fields = String variable. Everything that follows the query in the
            url, already url-encoded and separated by '&'. For example:
            'tweet.fields=author_id,created_at&max_results=500'.
    headers = Authorization containing the bearer token. Dictionary.
    print_yes = Boolean whether you want to print the url, the status
            code of the first response and the JSON output.
    max_retries = Integer. How many times a 503 response is retried
            before giving up. The default of 1 is the original behaviour.
    retry_wait = Number of seconds to sleep before each retry.
            The default of 300 is the original behaviour.
    
    Returns:
    The API response decoded from JSON (the result of response.json()).
    
    Raises:
    Exception(status_code, response_text) if the last response received
    does not have status code 200.
    """
    #Percent-encode the query so that characters such as '#', ':' and spaces survive in the url
    encoded_query = urllib.parse.quote(query)
    #Create the url for the API request from the encoded query and the fields
    url = "https://api.twitter.com/2/tweets/search/all?query={}&{}".format(encoded_query, fields)
    
    if print_yes:
        print(url)
    
    #Connect to the Twitter API endpoint using the query url and headers
    response = requests.request("GET", url, headers=headers)
    if print_yes:
        print(response.status_code)
    
    #If Twitter's servers are overloaded, wait a while before making the request again
    retries_left = max_retries
    while response.status_code == 503 and retries_left > 0:
        print("Twitter's service is currently unavailable, "
              f"I will try again in {retry_wait / 60:g} minutes.")
        time.sleep(retry_wait)
        response = requests.request("GET", url, headers=headers)
        retries_left -= 1

    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    
    #Decode the body once and reuse it for both printing and returning
    payload = response.json()
    if print_yes:
        print(json.dumps(payload, indent=4, sort_keys=True))
    
    return payload

In [None]:
def getRateLimit(headers):
    """
    Ask the v1.1 rate_limit_status endpoint for the current quota of
    the '/tweets/search/all' endpoint.
    
    Inputs:
    headers = Authorization containing the bearer token. Dictionary.
    
    Returns:
    The entry response.json()['resources']['tweets']['/tweets/search/all'].
    The download loop in this notebook reads its 'remaining' (requests
    left in the current window) and 'reset' (timestamp of the reset) keys.
    
    Raises:
    Exception(status_code, response_text) if the status code is not 200.
    """
    rate_limit_url = 'https://api.twitter.com/1.1/application/rate_limit_status.json?'
    reply = requests.request("GET", rate_limit_url, headers=headers)
    
    if reply.status_code == 200:
        search_all_quota = reply.json()['resources']['tweets']['/tweets/search/all']
        return search_all_quota
    
    raise Exception(reply.status_code, reply.text)

In [None]:
def merge_and_clean(df, df_users):
    """
    Join the tweets with their authors and tidy up the result.
    
    Inputs:
    df = Dataframe of tweets. Must contain the columns 'id', 'author_id',
            'created_at' and the four tweet 'public_metrics.*' counts
            (retweet, reply, like, quote).
    df_users = Dataframe of users. Must contain the columns 'id',
            'created_at' and the four user 'public_metrics.*' counts
            (followers, following, tweet, listed).
    
    Returns:
    A new dataframe with one row per tweet whose author is present in
    df_users (inner join), without the id columns, with the dates as
    datetime64[ns] truncated to the day and with the counts as int32.
    
    Raises:
    KeyError if one of the expected columns is missing.
    """
    #Merge the two dataframes, drop the id columns that are no longer needed
    #and give the two 'created_at' columns descriptive names:
    df_cleaned = (
        df.merge(df_users, left_on='author_id', right_on='id')
          .drop(['id_x', 'author_id', 'id_y'], axis=1)
          .rename(columns={'created_at_x': 'tweet_created_at',
                           'created_at_y': 'user_created_at'})
    )
    
    #Truncate the timestamps to the day (YYYY-MM-DD) so that they are easier to handle later on.
    #The values are parsed as UTC and made timezone-naive explicitly, instead of relying on
    #astype('datetime64[ns]') for the timezone-aware -> naive conversion (believed to raise
    #on pandas >= 2 for 'Z'-suffixed strings; confirm against the installed version).
    for date_column in ('tweet_created_at', 'user_created_at'):
        df_cleaned[date_column] = (
            pd.to_datetime(df_cleaned[date_column], utc=True)
              .dt.tz_localize(None)
              .dt.normalize()
              .astype('datetime64[ns]')
        )
    
    #Convert int64 to int32 to save space:
    count_columns = [
        'public_metrics.retweet_count',
        'public_metrics.reply_count',
        'public_metrics.like_count',
        'public_metrics.quote_count',
        'public_metrics.followers_count',
        'public_metrics.following_count',
        'public_metrics.tweet_count',
        'public_metrics.listed_count',
    ]
    df_cleaned = df_cleaned.astype({column: 'int32' for column in count_columns})
    
    return df_cleaned

# 2) Download the whole dataset to the local computer

In [None]:
#Display the current rate-limit entry for '/tweets/search/all' before starting the download
getRateLimit(headers)

In [None]:
#Query string that is passed to main()
query = '#banking lang:en -is:retweet'
#Response fields, page size and time window, joined into one '&'-separated string
fields_orig = "&".join([
    "tweet.fields=author_id,created_at,public_metrics",
    "expansions=author_id",
    "user.fields=username,verified,created_at,public_metrics,description",
    "max_results=500",
    "start_time=2009-01-01T00%3A00%3A00Z",
    "end_time=2013-01-01T00%3A00%3A00Z",
])

#Request the first page of results:
result = main(query, fields_orig, headers, print_yes=False)
#The tweets are read from 'data' and their authors from 'includes' -> 'users'
df = pd.json_normalize(result['data'])
df_users = pd.json_normalize(result['includes']['users'])
df_merged = merge_and_clean(df, df_users)

#Token that requests the next page.
#Note: this lookup raises KeyError if the response has no 'next_token'.
next_token = result['meta']['next_token']

#Counter of the requests made so far
index = 1

In [None]:
#Download the remaining pages one by one, starting from the current next_token.
#The loop stops when a response no longer contains a next_token, so re-running
#this cell after it has finished does not download the last page again.
print(f'You currently have {df_merged.shape[0]} tweets in your dataframe \n')

#Pages are collected in a list and concatenated in batches. DataFrame.append
#(removed in pandas 2.0) copied the whole dataframe on every single request.
pages = [df_merged]

try:
    while next_token:
        #Update the fields with the new next_token:
        fields = fields_orig + '&next_token=' + next_token
        
        result = main(query, fields, headers, print_yes = False)
        
        #Define temporary dataframes for this page and queue the merged page (tweets&users):
        df_temp = pd.json_normalize(result['data'])
        df_user_temp = pd.json_normalize(result['includes']['users'])
        df_merged_temp = merge_and_clean(df_temp, df_user_temp)
        pages.append(df_merged_temp)
        
        #Requests are limited to 1 request per 1 second:
        time.sleep(1)
        
        #Update the index after making another request:
        index += 1
        if index%100==0:
            #Fold the queued pages into the main dataframe before reporting progress:
            df_merged = pd.concat(pages, ignore_index = True)
            pages = [df_merged]
            print(f'I have made {index} requests and downloaded {df_merged.shape[0]} tweets')
            print('Dataframe length:', df_merged.shape[0])
            #Provide the last date in the dataframe to know where I'm at
            last_date = str(df_merged.iloc[-1, df_merged.columns.get_loc('tweet_created_at')])[:10]
            print('Dataframe last date:', last_date)
            #Calculate the total size of the dataframe:
            size = df_merged.memory_usage(deep=True).sum()/1000
            print('Dataframe size (kb):', size, '\n')
            
        #Check how far you are from the rate limit:    
        if index%10==0:
            rate_limit = getRateLimit(headers)
            #If there are fewer than 11 requests left before the limit, sleep until the reset:
            if rate_limit['remaining']<11:
                #Calculate the sleep time; never negative, because time.sleep() rejects negative values:
                sleep_time = max(0, int((rate_limit['reset']-time.time())+5))
                print(f'I have made {index} requests and will now sleep for {int(sleep_time)} seconds \n')
                time.sleep(sleep_time)
                
        #Get the next token; None on the last page, which ends the loop:
        next_token = result['meta'].get('next_token')


except KeyError as e:
    #An expected key or column was missing from a response. Everything downloaded so far
    #is kept, and next_token still points at the page that failed.
    print('There is no', str(e))

finally:
    #Always fold the queued pages into df_merged, also when the loop was interrupted:
    df_merged = pd.concat(pages, ignore_index = True)
    print(f'You have downloaded {df_merged.shape[0]} tweets and the loop is now finished')

In [None]:
#Drop the 'withheld.*' columns. They come straight from the API responses, so they are not
#guaranteed to exist; errors='ignore' skips the missing ones instead of raising KeyError
#and also makes this cell safe to run more than once.
df_merged = df_merged.drop(['withheld.copyright', 'withheld.country_codes_x', 
                            'withheld.country_codes_y',], axis=1, errors='ignore')

In [None]:
#Save the final dataframe in three formats under the same base name
output_base = 'Datasets/#banking09-12'
df_merged.to_csv(f'{output_base}.csv')
df_merged.to_pickle(f'{output_base}.pickle')
df_merged.to_feather(f'{output_base}.ftr')