# Stream Twitter data and save it to MongoDB

Steps <br/>
1. First, we create a subclass of tweepy's `StreamListener` class and put our custom stop condition in its `on_data` method.
2. Create an instance of this class
3. Create an instance of the tweepy Stream class, which will stream the tweets.
    * We pass in our authentication credentials (the `auth` handler) so that Twitter allows us to connect.
    * We pass in our stream_listener so that our callback functions are called. 
4. Start streaming tweets by calling the filter method. Pass in a list of terms to filter on, as the API requires.

In [1]:
# import libraries
import tweepy
import json
from pymongo import MongoClient

In [2]:
# MongoDB settings: server URI, database name and collection name.
MONGO_HOST = 'mongodb://localhost'
dbase = 'twitterdb'
db_col = 'worldcup2019'

# Client for the local MongoDB server; the stream listener uses it together
# with dbase / db_col to pick the collection it writes to.
client = MongoClient(MONGO_HOST)

In [3]:
# Number of tweets to capture in one streaming run; the stream listener
# compares its running tweet count against this value to decide when to stop.
TWEETS_TO_CAPTURE = 100

In [4]:
# Keywords handed to the stream filter's `track` argument.
WORDS = [
    'cricket',
    'worldcup',
    'worldcup2019',
]

In [5]:
# Load the Twitter API keys from the local `credentials` module and build the
# OAuth handler used to authenticate the stream.
# The four keys are imported by name instead of with a wildcard, so it is
# clear where each one comes from and the import cannot silently overwrite
# other names in this namespace (e.g. `client` or `auth`).
from credentials import (
    ACCESS_SECRET,
    ACCESS_TOKEN,
    CONSUMER_KEY,
    CONSUMER_SECRET,
)

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

In [6]:
# Custom stream listener built on tweepy.StreamListener, the class tweepy
# provides for receiving messages from the Twitter Streaming API.
class StreamListener(tweepy.StreamListener):
    """Store streamed messages in MongoDB and stop after a fixed count.

    Each message received in ``on_data`` is decoded from JSON and inserted
    into the ``client[dbase][db_col]`` collection. Streaming is stopped once
    ``TWEETS_TO_CAPTURE`` messages have been stored.
    """

    def __init__(self, api=None):
        """Set up the listener.

        Args:
            api: Optional tweepy API object, forwarded to the base class.
        """
        # Delegate to the base class instead of the former ``api or API()``:
        # ``API`` is never imported in this file, so building a listener
        # without an ``api`` argument raised NameError.
        super().__init__(api=api)
        self.num_tweets = 0

    def on_connect(self):
        """Print a confirmation once the streaming connection is open."""
        print("You are now connected to the streaming API.")

    def on_error(self, status_code):
        """Print the error status code and stop streaming.

        Returns:
            False, which tells tweepy to disconnect the stream.
        """
        print('An Error has occured: ' + repr(status_code))
        return False

    def on_data(self, data):
        """Decode one raw stream message and store it in MongoDB.

        Args:
            data: Raw JSON text of a single stream message.

        Returns:
            False once TWEETS_TO_CAPTURE messages have been stored (this
            stops the stream), True otherwise.
        """
        # Never store more than the requested number of messages, even if
        # another message is delivered after the limit was reached.
        if self.num_tweets >= TWEETS_TO_CAPTURE:
            return False

        try:
            # data is a 'str'; convert it to a dict.
            datajson = json.loads(data)

            # Use the database / collection. If they don't exist, MongoDB
            # creates them on first insert.
            col = client[dbase][db_col]
            col.insert_one(datajson)
        except Exception as e:
            # Best effort: report the problem and keep streaming. A message
            # that could not be stored is not counted.
            print(e)
            return True

        # Count only after a successful insert. on_status returns False as
        # soon as the limit is reached, so exactly TWEETS_TO_CAPTURE
        # documents are stored (previously one extra document was inserted
        # before the stream stopped).
        if not self.on_status(data):
            print(str(TWEETS_TO_CAPTURE) + ' tweets captured.')
            return False
        return True

    def on_status(self, status):
        """Count one stored message and report whether to keep streaming.

        Args:
            status: The message that was just stored (not inspected here).

        Returns:
            True while fewer than TWEETS_TO_CAPTURE messages have been
            counted, False once the limit has been reached.
        """
        self.num_tweets += 1
        if self.num_tweets <= TWEETS_TO_CAPTURE and self.num_tweets % 10 == 0:
            # just to see some progress...
            print('Numer of tweets captured so far: {}'.format(self.num_tweets))
        return self.num_tweets < TWEETS_TO_CAPTURE


In [7]:
%%time
#let's see how long it takes

# Build the listener, handing it a tweepy API object created with
# wait_on_rate_limit=True.
stream_listener = StreamListener(
    api=tweepy.API(wait_on_rate_limit=True),
)

# Attach the listener to a stream that authenticates with `auth`.
streamer = tweepy.Stream(
    auth=auth,
    listener=stream_listener,
)

# Start streaming, filtered by the keywords in WORDS.
streamer.filter(track=WORDS)

You are now connected to the streaming API.
Numer of tweets captured so far: 10
Numer of tweets captured so far: 20
Numer of tweets captured so far: 30
Numer of tweets captured so far: 40
Numer of tweets captured so far: 50
Numer of tweets captured so far: 60
Numer of tweets captured so far: 70
Numer of tweets captured so far: 80
Numer of tweets captured so far: 90
Numer of tweets captured so far: 100
100 tweets captured.
Wall time: 23 s
