## Raw Tweet Ingestion

In [1]:
import os
import json
import time
import tweepy
import datetime
import pandas as pd

In [2]:
def initiate_api():
    
    with open('./.conf/config.json', 'r') as f:
        config = json.load(f)
        
    auth = tweepy.OAuthHandler(config["CONSUMER_KEY"], config["CONSUMER_SECRET"])
    auth.set_access_token(config["ACCESS_KEY"], config["ACCESS_SECRET"])
    api = tweepy.API(auth, wait_on_rate_limit=True)
    
    return api

In [3]:
def collect_tweets(api, N):
    
    neg_query = '(🤬 OR 🤮 OR 😡 OR 😤 OR 🥺 OR 🤢 OR 😣 OR \
                  😟 OR 😣 OR 🤔 OR 🤥 OR 😫 OR 🤮 OR 🥵 OR \
                  😨 OR 😰 OR 😭 OR 😥 OR 🙁 OR 😩) \
            AND \
                -(😃 OR 😄 OR 😁 OR 🥰 OR 😊 OR ❤️ OR 💋 OR \
                  😍 OR 😂 OR 😎 OR 🤣 OR 😘 OR 😇 OR 🙃 OR \
                  😉 OR 😇 OR 🤩 OR 😃 OR 😄 OR 🙂) \
            AND \
                -(😭)' # unclear polarity

    pos_query = '(😃 OR 😄 OR 😁 OR 🥰 OR 😊 OR ❤️ OR 💋 OR \
                  😍 OR 😂 OR 😎 OR 🤣 OR 😘 OR 😇 OR 🙃 OR \
                  😉 OR 😇 OR 🤩 OR 😃 OR 😄 OR 🙂) \
            AND \
                -(🤬 OR 🤮 OR 😡 OR 😤 OR 🥺 OR 🤢 OR 😣 OR \
                  😟 OR 😣 OR 🤔 OR 🤥 OR 😫 OR 🤮 OR 🥵 OR \
                  😨 OR 😰 OR 😭 OR 😥 OR 🙁 OR 😩) \
            AND \
                -(😭)' # unclear polarity                  
    
    tweets = []
    for status in tweepy.Cursor(api.search,
                                q=neg_query,
                                include_entities=True,
                                monitor_rate_limit=True, 
                                wait_on_rate_limit=True,
                                lang="en").items(N/2):

        tweets.append([status.id_str,
                       status.created_at, 
                       status.user.screen_name, 
                       status.text,
                       -1])
        
    for status in tweepy.Cursor(api.search,
                                q=pos_query,
                                include_entities=True,
                                monitor_rate_limit=True, 
                                wait_on_rate_limit=True,
                                lang="en").items(N/2):

        tweets.append([status.id_str,
                       status.created_at, 
                       status.user.screen_name, 
                       status.text,
                       1])     
        
    return tweets

In [4]:
def save_raw_tweets(tweets):

    tweets_df = pd.DataFrame(tweets, 
                             columns=["ID", "Timestamp", "User", "Text", "Polarity"])
    
    now_prefix = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join("data","raw","tweets")
    
    if not os.path.exists(filepath):
        os.makedirs(filepath)
        
    filename = ''.join([now_prefix, "_tweets.csv"])
    
    tweets_df.to_csv(os.path.join(filepath, filename), index=False)

In [5]:
def twitter_bot(api, N):
    
    tweets = collect_tweets(api, N)  
    save_raw_tweets(tweets)
    
    t = datetime.datetime.now().strftime("%H:%M:%S")
    print('tweet file saved at: ', str(t))

In [6]:
def main(X, N):

    # debug 
    t1 = datetime.datetime.now().strftime("%H:%M:%S")
    print('api started at: ', str(t1))
    
    # start API
    api = initiate_api()
    
    for i in range(X):
        
        # debug
        print('Run No.', str(i+1))
        t2 = datetime.datetime.now().strftime("%H:%M:%S")
        print('twitter bot run at: ', str(t2))
        
        # run bot
        twitter_bot(api, N)
  
        # wait 15 min if not last iteration
        if i < range(X):
            print('waiting 15 mins...')
            time.sleep(915)

In [None]:
# collect X amt of times, N tweets

main(X=2, N=200)

api started at:  22:22:17
Run No. 1
twitter bot run at:  22:22:17
tweet file saved at:  22:22:23
waiting 15 mins...
Run No. 2
twitter bot run at:  22:37:38
tweet file saved at:  22:37:44
waiting 15 mins...


In [18]:
## test for duplication, increase rate, check docs