## <center> <h1>doLoop Tech</h1> </center>
<center> <h1> Twitter REST API </h1> </center>

## Step 1: Relevant packages

In [2]:
import importlib
import os
import keys      # contains personal credentials that identifies user [see example in `keys.py` file]
import json
import re
import pandas as pd
import csv
#pd.set_option('display.max_colwidth', -1)

In [3]:
from twython import Twython, TwythonError # library for Twitter API wrapper

In [4]:
# Re-execute the local `keys` module so that edits made to keys.py are picked
# up, then grab the credential dictionary it exposes.
importlib.reload(keys)
keychain = keys.keychain

## Step 2: Getting credentials

In [5]:
def init(keychain):
    """ Pull the four Twitter credentials out of the user keychain.

        Parameters:
          keychain: mapping with a 'twitter' entry that holds the keys
                    'api_key', 'api_secret', 'access_token' and
                    'access_token_secret'

        Return Value: tuple (api_key, api_secret, access_token,
                      access_token_secret), in that order
    """
    twitter_keys = keychain['twitter']
    wanted = ('api_key', 'api_secret', 'access_token', 'access_token_secret')
    return tuple(twitter_keys[name] for name in wanted)

In [None]:
# Show the docstring of `init` (IPython `?` help syntax; uncomment to use)
#?init

In [6]:
def APIload(api_key,api_secret,access_token,access_token_s):
    """ Build the Twython client that the rest of the notebook uses
        to query the API.

        Parameters:
           api_key: Client key
        api_secret: Client secret (keep hidden from application)
      access_token: Access token from Twitter server
    access_token_s: Token secret for server

        Return Value: Twython object constructed from the four
                      credentials, passed positionally in the order above
    """
    return Twython(api_key, api_secret, access_token, access_token_s)

In [7]:
# Unpack the four credentials from the keychain and build the Twython client
# (`Auth`) that the cells below pass to every query function.
key,secret,a_token,a_secret = init(keychain)
Auth = APIload(key,secret,a_token,a_secret)

## Step 3: Getting tweets

### Part 1: Parameter initialization and helper functions

In [8]:
# Search query string; see Twitter's search-operator documentation for the
# available operators (e.g. the `-filter:retweets` term used here).
query = 'Twitter -filter:retweets'

# Accumulators filled in place by the retrieval functions (one entry per tweet).
text, location, date, time = [], [], [], []

# Per-query files in which the search metadata of the newest ("Top"),
# first ("Init") and oldest ("Bottom") request is stored between sessions.
file_top = f'Top_metadata:{query}.json'
file_init = f'Init_metadata:{query}.json'
file_bottom = f'Bottom_metadata:{query}.json'

In [13]:
def getLimit(auth):
    """ Function to get count of remaining requests to the
        twitter server, read from the 'x-rate-limit-remaining'
        header of the most recent API call made through `auth`.

        Parameters:
          auth: twython authorization object

        Return Value: number of requests remaining (int), or None
                      (after printing "Rate limit error!") when the
                      header is not available. Callers must handle
                      the None case before comparing the result.
    """
    remaining = auth.get_lastfunction_header('x-rate-limit-remaining')
    # Identity check: only a genuinely missing header counts as an error.
    if remaining is None:
        print("Rate limit error!")
        return None
    return int(remaining)

In [14]:
def retrieve_since_id(metadata):
    """ Function to get since_id parameter from search metadata.
        Since_id is used to keep track of the top of the tweet
        list which is needed to obtain newer incoming tweets.

        Parameters:
          metadata: json (dict) containing the search metadata from
                    previous server query; its 'refresh_url' entry
                    is parsed

        Return Value: since_id parameter as an int

        Raises:
          KeyError:   metadata has no 'refresh_url' entry
          ValueError: 'refresh_url' carries no numeric since_id
    """
    refresh_url = metadata['refresh_url']
    # Accept since_id anywhere in the query string (first or later parameter,
    # with or without a trailing '&') and require at least one digit so that
    # int() can never be handed an empty string.
    match = re.search(r'[?&]since_id=(\d+)', refresh_url)
    if match is None:
        raise ValueError(
            "no since_id parameter found in refresh_url: {!r}".format(refresh_url))
    return int(match.group(1))

In [15]:
def retrieve_max_id(metadata):
    """ Function to get max_id parameter from search metadata.
        Max_id is used to keep track of the bottom of the tweet
        list which is needed to obtain the next batch of tweets.

        Parameters:
          metadata: json (dict) containing the search metadata from
                    previous server query; its 'next_results' entry
                    is parsed

        Return Value: max_id parameter as an int, or None (after
                      printing "Result error.") when the metadata has
                      no usable 'next_results' entry
    """
    try:
        next_results = metadata['next_results']
        # Accept max_id at any position in the query string and require
        # at least one digit.
        match = re.search(r'[?&]max_id=(\d+)', next_results)
    except (KeyError, TypeError):
        # Missing key, metadata that is not a mapping, or a non-string value.
        # Deliberately narrower than a bare `except:` so that Ctrl-C and
        # unrelated errors are no longer swallowed.
        match = None
    if match is None:
        print("Result error.")
        return None
    return int(match.group(1))

In [16]:
def run_once(f):
    """ Decorator that lets the wrapped function execute only on its
        first call.

        The first call is forwarded to `f` and its result returned;
        every later call does nothing and returns None. The flag is set
        BEFORE `f` runs, so a first call that raises still counts as
        "run". The state is exposed as `wrapper.has_run` and can be
        reset to False by hand to allow another run.

        Parameters:
          f: function to guard

        Return Value: wrapper function carrying f's name and docstring
    """
    import functools  # local import: the notebook has no top-level functools import

    @functools.wraps(f)  # keep __name__/__doc__ so help (`?name`) still works
    def wrapper(*args, **kwargs):
        if wrapper.has_run:
            return None
        wrapper.has_run = True
        return f(*args, **kwargs)

    wrapper.has_run = False
    return wrapper

@run_once
def fornewquery(auth,query,text,location,date,time):
    """ Function to make the very first server call for a query: gets
        the most recent tweets (max 100), appends them to the given
        lists and saves the search metadata to
        'Init_metadata:<query>.json' as the startpoint for later calls.

        Note: decorated with run_once, so only the first call in a
              session does anything; later calls return None
              immediately.

        Parameters:
              auth: twython api object
             query: search query
              text: tweet text list to be appended
          location: tweet location list to be appended
              date: tweet date list to be appended
              time: tweet time list to be appended

        Return Value: None (results are delivered through the appended
                      lists and the metadata file). If the request
                      fails, the error is printed and nothing is
                      appended or written.
    """
    try:
        result = auth.search(q=query,count=100,lang='en',tweet_mode='extended')
    except TwythonError as e:
        # Without a response there is nothing to append or save; bail out
        # instead of failing later on an unbound result variable.
        print(e)
        return

    Limit = getLimit(auth)
    print("Query successful. Requests remaining: ",Limit)

    for status in result['statuses']:
        created_at = status['created_at']
        text.append(status['full_text'])
        location.append(status['user']['location'])
        # Fixed-position slices of the timestamp string; presumably the
        # "Wed Aug 27 13:08:45 +0000 2008" layout -> "Aug 27" / "13:08:45".
        date.append(created_at[4:10])
        time.append(created_at[11:19])

    file = "Init_metadata:"+query+".json"
    with open(file, 'w') as outfile:
        json.dump(result['search_metadata'], outfile)
    return

In [17]:
def boom(auth,query,metadata,text,location,date,time):
    """ Function to retrieve older tweets, page by page, until the
        rate limit is used up, the results run out or a request
        fails. Each request asks for up to 100 tweets (rate limit
        resets every 15 mins).

        Note: Make sure to use drop_duplicates() in Step 4
              before loading into SQL db.

        Parameters:
              auth: twython api object (at least one API call must
                    already have been made with it, because the
                    remaining-request count is read from the last
                    response headers)
             query: search query
          metadata: json containing the search metadata from
                    previous server query
              text: tweet text list to be appended
          location: tweet location list to be appended
              date: tweet date list to be appended
              time: tweet time list to be appended

        Return Value: search metadata of the last successful request,
                      for further retrieval. If no request succeeds,
                      the metadata stored in
                      'Bottom_metadata:<query>.json' is returned, or
                      the `metadata` argument when that file cannot
                      be read. The returned value is also written
                      back to that file.
    """
    file = 'Bottom_metadata:'+query+'.json'

    # Establish a fallback so metadata_end is always bound, even when the
    # loop below never completes a request.
    try:
        with open(file) as f:
            metadata_end = json.load(f)
    except (OSError, ValueError):
        print("Nothing to load from endpoint json.")
        metadata_end = metadata

    Limit = getLimit(auth)

    # getLimit returns None when the header is missing; treat that as
    # "cannot continue" rather than comparing None with 0.
    while Limit is not None and Limit > 0:
        max_id = retrieve_max_id(metadata)
        if max_id is None:
            # No further page advertised; querying without max_id would
            # restart from the newest tweets and only produce duplicates.
            break
        try:
            object1 = auth.search(q=query,
                                  count=100,lang='en',tweet_mode='extended',max_id=max_id)
        except Exception as ex:
            # Best effort: report and stop. Continuing would re-append the
            # previous page (or hit an unbound name on the first iteration).
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)
            break

        Limit = getLimit(auth)
        print("Query successful. Requests remaining: ",Limit)

        metadata = object1['search_metadata']
        for status in object1['statuses']:
            created_at = status['created_at']
            text.append(status['full_text'])
            location.append(status['user']['location'])
            # Fixed-position slices of the timestamp string; presumably the
            # "Wed Aug 27 13:08:45 +0000 2008" layout -> "Aug 27" / "13:08:45".
            date.append(created_at[4:10])
            time.append(created_at[11:19])
        metadata_end = metadata
    print("Terminated.")

    # Persist the position reached so a later session can resume from it.
    try:
        with open(file, 'w') as outfile:
            json.dump(metadata_end, outfile)
    except (OSError, TypeError, ValueError) as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)
    return (metadata_end)

In [18]:
def get_new_tweets(auth,query,metadata,text,location,date,time):
    """ Function to get new tweets posted since the server call
        described by `metadata`, for a particular query. Requests
        are repeated (up to 100 tweets each) until one comes back
        empty or fails.

        Parameters:
              auth: twython api object
             query: search query
          metadata: json containing the search metadata from
                    previous server query
              text: tweet text list to be appended
          location: tweet location list to be appended
              date: tweet date list to be appended
              time: tweet time list to be appended

        Return Value: search metadata of the last successful request,
                      which is also saved to
                      'Top_metadata:<query>.json'. If not even the
                      first request succeeds, the content of that
                      file is returned, or the `metadata` argument
                      when the file cannot be read.
    """
    file = 'Top_metadata:'+query+'.json'
    since_id = retrieve_since_id(metadata) # can't initialize getLimit before call to the server
    newest = None  # metadata of the last request that succeeded, if any
    try:
        latest = auth.search(q=query,count=100,lang='en',tweet_mode='extended',since_id=since_id)
        newest = latest['search_metadata']
        if not latest['statuses']:
            print("No new results. Requests remaining: ",getLimit(auth))

        while latest['statuses']:
            print("Query successful. Requests remaining: ",getLimit(auth))
            for status in latest['statuses']:
                created_at = status['created_at']
                text.append(status['full_text'])
                location.append(status['user']['location'])
                # Fixed-position slices of the timestamp string; presumably the
                # "Wed Aug 27 13:08:45 +0000 2008" layout -> "Aug 27" / "13:08:45".
                date.append(created_at[4:10])
                time.append(created_at[11:19])
            since_id = retrieve_since_id(newest)
            latest = auth.search(q=query,count=100,lang='en',tweet_mode='extended',since_id=since_id)
            newest = latest['search_metadata']
    except TwythonError as e:
        print(e)
        if newest is None:
            # Nothing succeeded: fall back to the previously saved position,
            # and to the caller's metadata if no file exists yet, instead of
            # crashing on a missing file.
            try:
                with open(file) as f:
                    return json.load(f)
            except (OSError, ValueError):
                return metadata

    # Save the position reached, including after a mid-run failure, so
    # tweets already appended are not fetched again next time.
    with open(file, 'w') as outfile:
        json.dump(newest, outfile)
    return(newest)

### Part 2: Getting the data

In [None]:
# The startpoint should only be called ONCE for each query.
# It appends the first batch of tweets to the lists and writes the
# startpoint metadata file (file_init) that the next cell loads.
fornewquery(Auth,query,text,location,date,time)

In [None]:
# Load the search metadata saved by the startpoint call for this query.
with open(file_init) as f:
    startpoint_metadata = json.load(f)

In [None]:
# Inspect the loaded startpoint metadata.
print(startpoint_metadata)

In [None]:
# Only for the first "newer tweets" call after the startpoint: the
# startpoint metadata serves as the reference position.
top_metadata = get_new_tweets(Auth,query,startpoint_metadata,text,location,date,time)

In [None]:
# Reload the newest-position metadata from file_top (written by
# get_new_tweets), e.g. when top_metadata is not in memory.
with open(file_top) as f:
    top_metadata = json.load(f)

In [None]:
# For all future "newer tweets" calls for the query: continue from the
# newest position reached so far.
top_metadata = get_new_tweets(Auth,query,top_metadata,text,location,date,time)

In [None]:
# For the first batch of older tweets: page backwards starting from the
# startpoint metadata.
endpoint_metadata = boom(Auth,query,startpoint_metadata,text,location,date,time)

In [None]:
# Reload the oldest-position metadata from file_bottom (written by boom),
# e.g. when endpoint_metadata is not in memory.
with open(file_bottom) as f:
    endpoint_metadata = json.load(f)

In [None]:
# For all future calls for the query: first fetch tweets newer than the
# top position, then keep paging older tweets from the bottom position.
top_metadata = get_new_tweets(Auth,query,top_metadata,text,location,date,time)
endpoint_metadata = boom(Auth,query,endpoint_metadata,text,location,date,time)