# Crawler Twitter Data

This notebook extracts tweets from the Twitter Streaming API and stores them in a file.

### Important note
You will need to create a `config.txt` file in the folder with your Twitter API credentials or introduce them manually in the code.

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from collections import Counter
from config import *

In [None]:
# Read the Twitter API credentials from config.txt.
# Each non-blank line is expected to hold whitespace-separated fields, with the
# token value in the second field; the first field is ignored.
# Expected line order: access token, access token secret, consumer key,
# consumer secret.
tokens = []
with open("config.txt", "r") as f:  # context manager closes the file for us
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at the end of file)
            continue
        tokens.append(fields[1])

if len(tokens) < 4:
    raise ValueError(
        "config.txt must contain 4 credential lines, found {}".format(len(tokens))
    )

## Define tokens for Twitter API
access_token1 = tokens[0]
access_token_secret1 = tokens[1]

consumer_key1 = tokens[2]
consumer_secret1 = tokens[3]

In [None]:
#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API
from tweepy import Cursor
import json
import datetime

In [None]:
# Build the OAuth handler from the consumer credentials, then attach the
# user access token pair to it.
auth = OAuthHandler(consumer_key=consumer_key1, consumer_secret=consumer_secret1)
auth.set_access_token(key=access_token1, secret=access_token_secret1)

# API client bound to the handler above, with both rate-limit options enabled
# (wait for the limit to reset, and notify while waiting).
api = API(
    auth_handler=auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [None]:
import time

class MyStreamListener(StreamListener):
    """
    Twitter stream listener that stores incoming tweets in a file.

    Every status whose text does not start with 'RT' is appended to the
    output file as one JSON document per line. Once ``stop_condition``
    tweets have been stored, ``on_status`` returns False, which asks
    tweepy to disconnect the stream.
    """

    def __init__(self, api, OUTPUT_FILENAME, stop_condition=10):
        """
        Initialize the listener.

        Parameters
        ----------
        api : tweepy.API
            API object forwarded to the tweepy ``StreamListener`` base class.
        OUTPUT_FILENAME : str
            Path of the file tweets are appended to.
        stop_condition : int, optional
            Number of stored tweets after which the stream is stopped
            (default 10).
        """
        # Forward the API object so the base listener uses the caller's
        # authenticated client instead of building a default one.
        super(MyStreamListener, self).__init__(api)

        # Number of tweets written to the file so far
        self.num_tweets = 0

        # Destination file
        self.filename = OUTPUT_FILENAME

        # Number of stored tweets at which streaming stops
        self.stop_condition = stop_condition

    def on_status(self, status):
        """
        Handle a single incoming status.

        Retweets (text starting with 'RT') are skipped; any other tweet is
        appended to the output file as a JSON line and counted.

        Returns
        -------
        bool
            True to keep the stream running, False once ``stop_condition``
            tweets have been stored.
        """
        tweet = status._json

        # Avoid retweets for our scraping; check before opening the file so
        # skipped tweets cost no file I/O.
        if tweet['text'].startswith('RT'):
            return True

        with open(self.filename, "a") as f:
            f.write(json.dumps(tweet) + '\n')
        self.num_tweets += 1

        # Periodic progress report
        if self.num_tweets % 1000 == 0:
            print("We have crawled {} tweets".format(self.num_tweets))

        # Keep streaming only while fewer than stop_condition tweets are
        # stored (strict '<' so exactly stop_condition tweets are collected).
        return self.num_tweets < self.stop_condition

    def on_error(self, status):
        """
        Handle an error reported by the stream.

        Prints the received status, waits 10 seconds and returns True so the
        stream is not disconnected (returning False in on_error disconnects
        the stream).
        """
        print(status)
        time.sleep(10)

        return True

Here we download 100,000 tweets (excluding retweets) related to COVID-19 by tracking the keywords `["coronavirus", "covid"]`.

In [None]:
%%time

# File the collected tweets are appended to (one JSON document per line)
OUTPUT_FILENAME = "../../data/tweets.json"
# Number of stored tweets after which the listener stops the stream
stop_condition = 100000

l = MyStreamListener(api, OUTPUT_FILENAME, stop_condition)
# Tweepy's Stream class takes the authentication info and our custom listener
stream = Stream(auth=api.auth, listener=l)

# Keywords to track
TRACKING_KEYWORDS = ["coronavirus", "covid"]

# is_async=False runs the stream in the current thread, so this call blocks
# until the stream is disconnected.
# The stray `since_id` argument was removed: it was a syntax error (positional
# argument after keyword arguments) and is not a parameter of Stream.filter.
stream.filter(
    track=TRACKING_KEYWORDS,
    is_async=False,
    languages=["en"],
)