In [8]:
import twitter
import urlparse
import pandas as pd

# parallel print
from pprint import pprint as pp

In [36]:
# Load the twitter API keys
twitter_tokens = pd.read_csv("../twitter_tokens.csv")
twitter_tokens.keys()

Index([u'consumer_key', u' consumer_secret', u'access_token',
       u'access_secret'],
      dtype='object')

In [37]:

class TwitterAPI(object):
    """
        TwitterAPI class allows the Connection to Twitter via OAuth
        once you have registered with Twitter and receive the
        necessary credentials.
    """
    # Initialize key variables and get the twitter credentials
    def __init__(self):
        consumer_key = twitter_tokens.values.flatten()[0]
        consumer_secret = twitter_tokens.values.flatten()[1]
        access_token = twitter_tokens.values.flatten()[2]
        access_secret = twitter_tokens.values.flatten()[3]
        
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_secret = access_secret
        
    # Authenticate credentials with Twitter using OAuth
        self.auth = twitter.oauth.OAuth(access_token, access_secret, 
                                        consumer_key, consumer_secret)
        
        
    # Create registered Twitter API
        self.api = twitter.Twitter(auth=self.auth)
        
        
    # Search Twitter with query q (i.e "ApacheSpark") and max result
    def searchTwitter(self, q, max_res=10, **kwargs):
        search_results = self.api.search.tweets(q=q, count=10, **kwargs)
        statuses = search_results['statuses']
        max_results = min(1000, max_res)
        
        for _ in range(10):
            try:
                next_results = search_results['search_metadata']['next_results']
            except KeyError as e:
                break
            
            next_results = urlparse.parse_qsl(next_results[1:])
            kwargs = dict(next_results)
            
            search_results = self.api.search.tweets(**kwargs)
            statuses += search_results['statuses']
            
            if len(statuses) > max_results:
                break
            
        return statuses
    
    
    
    # Parse tweets as it is collected to extract ID, creation date, userID, tweet text
    def parseTweets(self, statuses):
        tweetx = [(status['id'],
                   status['created_at'],
                   status['user']['id'],
                   status['user']['name'],
                   url['expanded_url'],
                   status['text']) 
                    for status in statuses 
                      for url in status['entities']['urls']
                 ]
        return tweetx
    

In [38]:
# Instantiate the class with the required authentication
obj = TwitterAPI()

In [39]:
# Run a query on the search tern
twtx = obj.searchTwitter("ApacheSpark")

# Parse the tweets
parsed_tweetx = obj.parseTweets(twtx)

In [41]:
# Display output of parsed tweets
print("Lenth of parsed tweets: {} \n\n".format(len(parsed_tweetx)))

pp(parsed_tweetx[:3])


Lenth of parsed tweets: 16 


[(714932809383567360,
  u'Tue Mar 29 21:50:39 +0000 2016',
  17712257,
  u'Reynold Xin',
  u'https://databricks.com/blog/2016/03/22/spark-trending-in-the-stack-overflow-survey.html',
  u'RT @dwhitena: I knew it, to be trendy I must integrate @apachespark with @reactjs.  https://t.co/NXCsMsQ7ab @databricks #maybenot https://t\u2026'),
 (714932725707366400,
  u'Tue Mar 29 21:50:19 +0000 2016',
  1872921709,
  u'EnterpriseTech',
  u'http://ow.ly/102Xcb',
  u'#IBM ports #ApacheSpark to #zSystems mainframe https://t.co/Aupm4sqO4C'),
 (714931536269877250,
  u'Tue Mar 29 21:45:35 +0000 2016',
  22486605,
  u'Chris Collins',
  u'https://medium.com/@ivanermilov/scalable-spark-hdfs-setup-using-docker-2fd0ffa1d6bf#.omqb4l4w3',
  u'RT @docker: Creating a scalable HDFS/@apachespark setup using @Docker &amp; #docker-compose by Ivan Ermilov: https://t.co/fSE4a5dEJ7')]


In [7]:
# Display result using parallel processing
pp(twtx[1])

{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Tue Mar 29 06:10:29 +0000 2016',
 u'entities': {u'hashtags': [{u'indices': [65, 72], u'text': u'Python'},
                             {u'indices': [74, 82], u'text': u'BigData'}],
               u'symbols': [],
               u'urls': [{u'display_url': u'mapr.com/blog/churn-pre\u2026',
                          u'expanded_url': u'https://www.mapr.com/blog/churn-prediction-pyspark-using-mllib-and-ml-packages',
                          u'indices': [83, 106],
                          u'url': u'https://t.co/ieqDXN6yyI'}],
               u'user_mentions': [{u'id': 12391902,
                                   u'id_str': u'12391902',
                                   u'indices': [3, 11],
                                   u'name': u'Daniel Villanueva',
                                   u'screen_name': u'dvillaj'},
                                  {u'id': 1551361069,
                                   u'id_str': u'155136106