## The code in this notebook will:
## 1. take a tab-separated .txt file with one Tweet per line, where the first column is the Tweet ID and the third column is its label (relevant variable: 'FILENAME')
## 2. check a local cache to see if we already captured that Tweet (relevant variable: 'CACHE_FNAME')
## 3. make a call to Twitter API using Tweepy, if we haven't captured that Tweet already & add it to cache (relevant variable: 'CACHE_DICTION')
## 4. after collecting all possible Tweets, convert what's in CACHE_DICTION to a pandas DataFrame, manipulate it a bit, & print out a sample of 5 Tweets (relevant variable: 'cache_df')

In [65]:
import secret_data
import json
import tweepy
import pandas as pd

In [66]:
#Start getting keys & secrets for running Twitter user
#These are read from an importable module named 'secret_data' (e.g. secret_data.py);
#you will need your own Twitter user with these four values defined there to run this
CONSUMER_KEY, CONSUMER_SECRET = secret_data.CONSUMER_KEY, secret_data.CONSUMER_SECRET
ACCESS_TOKEN, ACCESS_SECRET = secret_data.ACCESS_TOKEN, secret_data.ACCESS_SECRET
#End getting keys & secrets for running Twitter user

In [67]:
#Start cache setup
#Load the on-disk cache into CACHE_DICTION, or start with an empty cache if it cannot be loaded
CACHE_FNAME = 'twitter_cache.json'
try:
    #'with' guarantees the file is closed even if parsing fails
    with open(CACHE_FNAME, 'r') as cache_file:
        CACHE_DICTION = json.load(cache_file) #this is a dict
except (OSError, ValueError):
    #OSError: cache file missing/unreadable; ValueError: contents are not valid JSON
    #(json.JSONDecodeError is a ValueError subclass). Anything else, e.g. Ctrl-C, propagates.
    CACHE_DICTION = {}
#End cache setup

In [68]:
#Start OAuth code
#Build the Tweepy client from the four credentials defined above
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET) #handler created from the consumer key/secret
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET) #attach the user's access token/secret to the handler
api = tweepy.API(auth) #client object; get_tweet() calls api.get_status() on it
#End OAuth code

In [69]:
#file containing tweet ids, read line by line by read_in_tweet_ids()
#each line is split on tabs: column 0 is used as the Tweet ID and column 2 as the label
#alternative input file, currently disabled:
# FILENAME = 'test_tweet_ids_10.txt'
FILENAME = 'test_tweet_ids.txt'

In [70]:
#Start funct to grab vars from Tweet
def get_tweet_vars(tweet):
    """Look up the fields of interest on a Tweet dict and report any lookup failure.

    The values are collected for potential use but are not returned; the
    function always returns None. If any lookup raises, the exception and the
    offending Tweet are printed instead of propagating.

    Args:
        tweet: mapping expected to hold 'text', 'in_reply_to_status_id_str',
            'in_reply_to_screen_name' and an 'entities' mapping with
            'hashtags', 'symbols', 'user_mentions' and 'urls'.
    """
    top_level_keys = ('text', 'in_reply_to_status_id_str', 'in_reply_to_screen_name')
    entity_keys = ('hashtags', 'symbols', 'user_mentions', 'urls')
    try:
        #lookups happen in a fixed order: top-level fields first, then each entities field
        collected = {key: tweet[key] for key in top_level_keys}
        collected.update({key: tweet['entities'][key] for key in entity_keys})
    except Exception as e:
        print('Exception in get_tweet_vars:{}\nProblematic Tweet:{}\n\n'.format(e, tweet))
#End funct to grab vars from Tweet

In [71]:
#Start funct for cache check
def get_tweet(found_id, label, original_related='original'):
    """Return the Tweet dict for `found_id`, using the cache first and the Twitter API on a miss.

    On a cache miss the Tweet is requested with api.get_status(). If that call
    fails, the exception is printed and an empty entry is cached instead. The
    entry is tagged with 'label' and 'original_related', stored in
    CACHE_DICTION, and the whole cache is rewritten to CACHE_FNAME. If the
    Tweet is a reply, the Tweet it replies to is fetched the same way.

    Args:
        found_id: Tweet ID; used as the cache key and passed to api.get_status().
        label: value stored under the entry's 'label' key.
        original_related: value stored under the entry's 'original_related'
            key. Defaults to 'original'; the recursive call for a replied-to
            Tweet passes 'related'.

    Returns:
        The dict stored in CACHE_DICTION for `found_id` (may hold only the two
        tag keys if the API call failed).
    """
    if found_id in CACHE_DICTION:
        #if we get strange results in cache like missing child tweets then we may need to add the call to get_tweet() here
        return CACHE_DICTION[found_id]

    try:
        resp = api.get_status(found_id) #resp is a class 'tweepy.models.Status'
        #round-trip through JSON so the cached value is a plain, serialisable dict
        json_obj = json.loads(json.dumps(resp._json))
    except Exception as e:
        json_obj = {}
        message = 'No worries. Empty entry has been made in cache.'
        print('Exception in get_tweet:{}\nProblematic Tweet:{}\n\n{}\n'.format(e, found_id,message))

    json_obj['label'] = label
    json_obj['original_related'] = original_related
    CACHE_DICTION[found_id] = json_obj #new cache entry: key = 'found_id', value = tagged Tweet dict

    #persist the whole cache after every miss; 'with' closes the file even if the write fails
    with open(CACHE_FNAME, "w") as fw:
        fw.write(json.dumps(CACHE_DICTION))

    #start recursive call to get_tweet() a.k.a. this funct if current tweet is in reply to another tweet
    #.get() is used because an empty entry (failed API call) has no 'in_reply_to_status_id_str' key
    in_reply_id = json_obj.get('in_reply_to_status_id_str')
    if in_reply_id is not None:
        #NOTE(review): the replied-to Tweet has no label of its own in the input file, so it
        #inherits this Tweet's label and is marked 'related' - confirm this is the intended tagging
        get_tweet(in_reply_id, label, 'related')
    #end recursive call to get_tweet()

    return json_obj
#End funct for cache check

In [72]:
#Start funct to read in Tweet IDs from file
def read_in_tweet_ids():
    """Read FILENAME line by line and call get_tweet() for every Tweet ID found.

    Each line is split on tabs; column 0 is the Tweet ID and column 2 is the
    label (needed for training of model). Lines with fewer than three columns,
    such as blank lines, are skipped. The label is passed on exactly as read,
    so it may still carry the line's trailing newline.

    Returns:
        None. Results accumulate in the cache maintained by get_tweet().
    """
    with open(FILENAME, 'r') as infile:
        for line in infile:
            fields = line.split('\t')
            if len(fields) < 3:
                #blank or malformed row: skip it instead of aborting the whole run with IndexError
                continue
            found_id, label = fields[0], fields[2]
            try:
                get_tweet(found_id, label)
            except Exception:
                #deliberately silent: printing here was causing duplicate messages when get_tweet failed.
                #Only Exception is caught so Ctrl-C (KeyboardInterrupt) can still stop a long run.
                pass
#End funct to read in Tweet IDs from file

## Here we call the functions above to grab Tweets from either the cache or Twitter API

In [73]:
#process every ID listed in FILENAME; results land in CACHE_DICTION via get_tweet()
#note: read_in_tweet_ids() has no return statement, so 'check' is always None
check = read_in_tweet_ids()

In [74]:
#build a DataFrame from CACHE_DICTION, then flip it so each Tweet ID is a row and each Tweet field is a column
cache_df = pd.DataFrame(CACHE_DICTION).T

In [78]:
#strip '\n' char from label vals
#bracket indexing is used for the assignment: attribute-style access (cache_df.label = ...) is
#only safe when the column already exists and its name does not clash with a DataFrame attribute
cache_df['label'] = cache_df['label'].str.strip('\n')

In [80]:
#show up to 5 random Tweets; capping at len(cache_df) avoids the ValueError that
#sample(5) raises when the cache holds fewer than 5 rows
cache_df.sample(min(5, len(cache_df)))

Unnamed: 0,contributors,coordinates,created_at,entities,favorite_count,favorited,geo,id,id_str,in_reply_to_screen_name,...,lang,original_related,place,retweet_count,retweeted,retweeted_status,source,text,truncated,user
344592305476497408,,,,,,,,,,,...,,original,,,,,,,,
348304709448261633,,,,,,,,,,,...,,original,,,,,,,,
333342132385492994,,,,,,,,,,,...,,original,,,,,,,,
340624255521804289,,,Sat Jun 01 00:22:05 +0000 2013,"{'hashtags': [], 'symbols': [], 'user_mentions...",0.0,False,,3.406242555218043e+17,3.406242555218043e+17,,...,en,original,,0.0,False,,"<a href=""http://www.cloudhopper.com/"" rel=""nof...",Baclofen really is for recovering alcoholics.,False,"{'id': 345964998, 'id_str': '345964998', 'name..."
340283349706874880,,,Fri May 31 01:47:27 +0000 2013,"{'hashtags': [], 'symbols': [], 'user_mentions...",0.0,False,,3.402833497068749e+17,3.402833497068749e+17,,...,en,original,,0.0,False,,"<a href=""http://www.echofon.com/"" rel=""nofollo...",So it seems I'm one of those friends you can t...,False,"{'id': 21277554, 'id_str': '21277554', 'name':..."
