## Raw Tweet Ingestion

In [5]:
import os
import sys
import json
import time
import tweepy
import datetime
import pandas as pd

def _parse_int_in_range(text, low, high):
    """Return ``text`` as an int if it lies in ``[low, high]``, else ``None``."""
    try:
        value = int(text)
    except (TypeError, ValueError):
        return None
    return value if low <= value <= high else None


def check_args():
    """Validate the command-line arguments and return ``(nruns, ntweets)``.

    Expected invocation::

        python ingest_tweets.py <nruns[1:12]> <ntweets=200[100:1000]>

    ``nruns`` is required and must be an integer from 1 to 12 (inclusive).
    ``ntweets`` is optional, defaults to 200, and must be an integer from
    100 to 1000 (inclusive, matching the usage text).

    Returns:
        A tuple ``(nruns, ntweets)`` of ints.

    Raises:
        SystemExit: after printing an error and the usage line, when the
            arguments are missing, too many, non-numeric, or out of range.
    """
    usage = (
        "\nUSAGE: python ingest_tweets.py "
        "<nruns[1:12]> <ntweets=200[100:1000]>"
    )
    # sys.argv[0] is the script name; only the remainder are user arguments.
    args = sys.argv[1:]

    if not args:
        print(
            "ERROR: No default number of runs, "
            "must supply integer between 1 and 12:" + usage
        )
        sys.exit(1)
    if len(args) > 2:
        print("ERROR: Too many arguments, expected at most 2:" + usage)
        sys.exit(1)

    # Arguments arrive as strings, so convert before range-checking.
    nruns = _parse_int_in_range(args[0], 1, 12)
    if nruns is None:
        print(
            "ERROR: Number of runs must be an integer between 1 and 12:"
            + usage
        )
        sys.exit(1)

    # Second argument is optional: fall back to the default number of tweets.
    if len(args) == 1:
        return (nruns, 200)

    ntweets = _parse_int_in_range(args[1], 100, 1000)
    if ntweets is None:
        print(
            "ERROR: Number of tweets must be an integer between 100 and 1000:"
            + usage
        )
        sys.exit(1)

    return (nruns, ntweets)

def initiate_api():
    """Create an authenticated tweepy API client from the local config file.

    Reads ``./.conf/config.json`` (relative to the current working
    directory) and uses its ``CONSUMER_KEY``, ``CONSUMER_SECRET``,
    ``ACCESS_KEY`` and ``ACCESS_SECRET`` entries for OAuth.

    Returns:
        A ``tweepy.API`` object constructed with ``wait_on_rate_limit=True``.
    """
    with open('./.conf/config.json', 'r') as config_file:
        credentials = json.load(config_file)

    # Application (consumer) credentials first, then the user access token.
    consumer_key = credentials["CONSUMER_KEY"]
    consumer_secret = credentials["CONSUMER_SECRET"]
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)

    access_key = credentials["ACCESS_KEY"]
    access_secret = credentials["ACCESS_SECRET"]
    handler.set_access_token(access_key, access_secret)

    return tweepy.API(handler, wait_on_rate_limit=True)

def collect_tweets(api, ntweets):
    """Collect emoji-labelled English tweets, half negative and half positive.

    Two searches are run through ``tweepy.Cursor(api.search, ...)``: one with
    a query of negative emojis (labelled ``-1``) and one with a query of
    positive emojis (labelled ``1``). Each search is limited to
    ``ntweets // 2`` items.

    Args:
        api: object whose ``search`` attribute is handed to ``tweepy.Cursor``
            (the value returned by ``initiate_api`` in this file).
        ntweets: total number of tweets requested; anything accepted by
            ``int()``.

    Returns:
        A list of rows ``[id_str, created_at, screen_name, text, polarity]``,
        negative rows first, then positive rows.
    """
    # NOTE(review): 😭 is listed among the negative emojis and is also
    # excluded by the trailing -(😭) clause of both queries — confirm intent.
    neg_query = '(🤬 OR 🤮 OR 😡 OR 😤 OR 🥺 OR 🤢 OR 😣 OR \
                  😟 OR 😣 OR 🤔 OR 🤥 OR 😫 OR 🤮 OR 🥵 OR \
                  😨 OR 😰 OR 😭 OR 😥 OR 🙁 OR 😩) \
            AND \
                -(😃 OR 😄 OR 😁 OR 🥰 OR 😊 OR ❤️ OR 💋 OR \
                  😍 OR 😂 OR 😎 OR 🤣 OR 😘 OR 😇 OR 🙃 OR \
                  😉 OR 😇 OR 🤩 OR 😃 OR 😄 OR 🙂) \
            AND \
                -(😭)' # unclear polarity

    pos_query = '(😃 OR 😄 OR 😁 OR 🥰 OR 😊 OR ❤️ OR 💋 OR \
                  😍 OR 😂 OR 😎 OR 🤣 OR 😘 OR 😇 OR 🙃 OR \
                  😉 OR 😇 OR 🤩 OR 😃 OR 😄 OR 🙂) \
            AND \
                -(🤬 OR 🤮 OR 😡 OR 😤 OR 🥺 OR 🤢 OR 😣 OR \
                  😟 OR 😣 OR 🤔 OR 🤥 OR 😫 OR 🤮 OR 🥵 OR \
                  😨 OR 😰 OR 😭 OR 😥 OR 🙁 OR 😩) \
            AND \
                -(😭)' # unclear polarity

    # The cursor limit is an item count, so it must be a whole number:
    # true division would hand it a float (e.g. 100.5 for an odd request).
    per_class = int(ntweets) // 2

    tweets = []
    if per_class <= 0:
        # Nothing to fetch; also avoids passing a non-positive limit, which
        # may not bound the cursor on every tweepy version.
        return tweets

    # Same collection logic for both polarities; only query and label differ.
    for query, polarity in ((neg_query, -1), (pos_query, 1)):
        # NOTE(review): monitor_rate_limit / wait_on_rate_limit are forwarded
        # to the search call unchanged from the original code; they look like
        # client options rather than search parameters — confirm against the
        # tweepy docs.
        cursor = tweepy.Cursor(api.search,
                               q=query,
                               include_entities=True,
                               monitor_rate_limit=True,
                               wait_on_rate_limit=True,
                               lang="en")
        for status in cursor.items(per_class):
            tweets.append([status.id_str,
                           status.created_at,
                           status.user.screen_name,
                           status.text,
                           polarity])
    return tweets

def save_raw_tweets(tweets):
    """Write tweet rows to a timestamped CSV file under ``data/raw/tweets``.

    Args:
        tweets: sequence of 5-item rows matching the columns
            ``ID, Timestamp, User, Text, Polarity``.

    The output directory is relative to the current working directory and is
    created when missing. The file is named ``<YYYYmmdd_HHMMSS>_tweets.csv``
    from the local time at the moment of saving, and is written without the
    DataFrame index.
    """
    column_names = ["ID", "Timestamp", "User", "Text", "Polarity"]
    frame = pd.DataFrame(tweets, columns=column_names)

    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = os.path.join("data", "raw", "tweets")

    # Create the nested output folders on first use.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    target = os.path.join(out_dir, stamp + "_tweets.csv")
    frame.to_csv(target, index=False)

def twitter_bot(api, ntweets):
    """Run one collect-and-save cycle and print the time the file was saved.

    Args:
        api: API client passed through to ``collect_tweets``.
        ntweets: total number of tweets passed through to ``collect_tweets``.
    """
    # Fetch the batch and persist it straight away.
    save_raw_tweets(collect_tweets(api, ntweets))

    saved_at = datetime.datetime.now().strftime("%H:%M:%S")
    print('New file saved at: ', saved_at)

def main(nruns, ntweets):
    """Start the API client and run the twitter bot ``nruns`` times.

    Between consecutive runs the function sleeps for 915 seconds (announced
    as a 15 minute wait); there is no sleep after the final run.

    Args:
        nruns: number of collection runs to perform (an int).
        ntweets: number of tweets requested per run, passed to
            ``twitter_bot``.
    """
    start_clock = datetime.datetime.now().strftime("%H:%M:%S")
    print('api started at: ', start_clock)

    # start API
    api = initiate_api()

    for run_no in range(1, nruns + 1):

        print('Run No.', run_no)
        run_clock = datetime.datetime.now().strftime("%H:%M:%S")
        print('twitter bot run at: ', run_clock)

        # run bot
        twitter_bot(api, ntweets)

        # pause before the next run; nothing to wait for after the last one
        if run_no < nruns:
            print('waiting 15 mins...')
            time.sleep(915)

# Script entry point: only runs when the file is executed directly.
if __name__=="__main__":
    
    # Read the run settings from the command line; check_args() calls
    # sys.exit() itself when the arguments are rejected.
    nruns, ntweets = check_args()
    
    # Run the ingestion loop with those settings.
    main(nruns, ntweets)

In [13]:
## TODO: increase the collection rate; check the API docs for the allowed limits first