Background:
----------

The following script was modified from https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./

Goals:
--------

1) Connect to Twitter REST API

2) Download tweets matching the search query: "clinical trials" OR #clinicaltrials OR #clinicaltrial

Summary:
-------

A total of ~270,000 tweets were collected over a six-month period.

In [None]:
# Import modules
import tweepy
import sys
import jsonpickle
import os
import json
import time

from pymongo import Connection
from datetime import datetime

In [None]:
# Access API
#
# Builds an application-only (OAuth 2 bearer token) tweepy client and stores
# it in ``api``. The script exits with status -1 if authentication fails.

# Enter API access information
API_key = ''
API_secret = ''

# Create the application-only auth handler. The handler requests the bearer
# token while it is being constructed, so bad credentials surface here as a
# TweepError. (A tweepy.API object is always truthy, so testing ``not api``
# afterwards can never detect an authentication failure.)
try:
    auth = tweepy.AppAuthHandler(API_key, API_secret)
except tweepy.TweepError as e:
    print("Can't Authenticate: " + str(e))
    sys.exit(-1)

# Create Twitter API wrapper that sleeps (and says so) when the rate limit
# is reached instead of raising an error
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [None]:
# Connect to MongoDB

# Open a connection to the MongoDB server running on this machine
connection = Connection('localhost', 27017)

# Select the database called "TwitterStream"
db = connection['TwitterStream']

# Make sure a unique index on the "id" field exists
# NOTE(review): the index is built on the "tweets" collection, while the
# documents are written to "RESTClinicalTrials" (keyed by "_id") -- confirm
# this index is still wanted.
db['tweets'].ensure_index("id", unique=True, dropDups=True)

# Tweets are stored in the "RESTClinicalTrials" collection
collection = db['RESTClinicalTrials']

In [None]:
# Get keywords
#
# Search configuration read by the download loop.

# Collect tweets that are in English
language = ['en']

# Search for tweets that contain the phrase "clinical trials" or either of
# the two hashtags. Only the phrase is quoted; the query previously ended
# with a stray, unbalanced double quote.
searchQuery = ['"clinical trials" OR #clinicaltrials OR #clinicaltrial']

# Maximum total tweets to collect in one run
maxTweets = 45000

# Maximum tweets to collect per query (100 is the API limit)
tweetsPerQry = 100

# The prefix of the name of the text files where tweets will be stored
fprefix = 'REST'

In [None]:
# define min and max
#
# Paging bounds for the search: tweet IDs between sinceId (lower bound) and
# max_id (upper bound) are requested.

# If results from a specific ID onwards are required, set sinceId to that ID.
# Otherwise default to no lower limit and go as far back as the API allows.
sinceId = None

# If results below a specific ID are required, set max_id to that ID.
# Otherwise default to no upper limit and start from the most recent tweet
# matching the search query. Any value <= 0 means "no upper limit".
# (Plain int instead of the Python 2-only ``-1L`` literal, which is a syntax
# error on Python 3; Python 2 promotes ints to long automatically.)
max_id = -1

In [None]:
# obtain tweets
#
# Pages backwards through the search results (newest first), writing every
# tweet as one JSON line to a timestamped file and inserting a reduced record
# into the MongoDB collection, until maxTweets is reached, the search returns
# nothing more, or tweepy reports an error.

# Send print output to a timestamped log file for the duration of the run.
# The original stream is remembered so it can be restored afterwards.
# NOTE(review): this path is under /Users/edwinreyes/ while the tweet dump
# below is under /Users/er/ -- confirm which home directory is intended.
original_stdout = sys.stdout
sys.stdout = open('/Users/edwinreyes/Desktop/REST_ClincalTrials/' + fprefix + "TweetCount" + '.' + time.strftime('%Y-%m-%d_%H-%M-%S') + '.json', 'a+')

# initialize counter
tweetCount = 0

# The search endpoint takes one query string and one language code (parameter
# name ``lang``). The configuration holds both as lists, so flatten them here;
# several query terms are combined with OR.
if isinstance(searchQuery, (list, tuple)):
    query = ' OR '.join(searchQuery)
else:
    query = searchQuery
if isinstance(language, (list, tuple)):
    lang = language[0] if language else None
else:
    lang = language

try:
    # open the file that receives the raw tweets, one JSON document per line
    with open('/Users/er/Desktop/REST_ClincalTrials/' + fprefix + '.' + time.strftime('%Y-%m-%d_%H-%M-%S') + '.json', 'a+') as f:

        # keep requesting pages until the maximum number of tweets is reached
        while tweetCount < maxTweets:

            try:
                # Only send the paging bounds that are actually set
                search_kwargs = {'q': query, 'count': tweetsPerQry, 'lang': lang}
                if max_id > 0:
                    # max_id is inclusive, so ask for everything strictly
                    # older than the last tweet already seen
                    search_kwargs['max_id'] = str(max_id - 1)
                if sinceId:
                    search_kwargs['since_id'] = sinceId

                # Use api.search to return tweets that match the query
                new_tweets = api.search(**search_kwargs)

                # an empty page means the search is exhausted
                if not new_tweets:
                    print("No more tweets found")
                    break

                for tweet in new_tweets:

                    # tweet._json is already the decoded JSON payload
                    t = tweet._json

                    # write the full tweet as one JSON line
                    f.write(jsonpickle.encode(t, unpicklable=False) + '\n')

                    # not all tweets carry an expanded url
                    try:
                        expanded_url = t['entities']['urls'][0]['expanded_url']
                    except (KeyError, IndexError, TypeError):
                        expanded_url = ''

                    # reduced record stored in MongoDB, keyed by the tweet id
                    # NOTE(review): the 'user_screenName ' key has a trailing
                    # space; kept as-is because documents already stored use
                    # it -- confirm before renaming.
                    tweets = {
                        '_id': t['id_str'],
                        'text': t['text'],
                        'created_at': t['created_at'],
                        'user_name': t['user']['name'],
                        'user_screenName ': t['user']['screen_name'],
                        'user_id': t['user']['id'],
                        'user_location': t['user']['location'],
                        'expanded_url': expanded_url,
                    }

                    # insert to mongodb
                    collection.insert(tweets)

                # counting tweets
                tweetCount += len(new_tweets)

                # the oldest tweet of this page is the upper bound of the next
                # (the page is known to be non-empty here)
                max_id = new_tweets[-1].id

            except tweepy.TweepError as e:

                # if any tweepy error occurs, report it and stop
                print("some error : " + str(e))
                break

    # print number of tweets downloaded
    print("Downloaded {0} tweets".format(tweetCount))

finally:
    # close the log file and give print output back to the original stream,
    # even when the run ends with an unexpected error
    sys.stdout.close()
    sys.stdout = original_stdout