# ETL WITH TWITTER DATA


## Part 1

### Import libraries

In [1]:
# importing the required libraries
import sys
import pandas as pd
import csv
import time
import tweepy
import api_keys as k

### Retrieve API access details and authenticate user

In [2]:
# Twitter API credentials: the four OAuth values are read from the
# `credentials` mapping of the local api_keys module (imported as `k`),
# so no secret is written in this notebook.
consumer_key, consumer_secret, access_key, access_secret = (
    k.credentials[name]
    for name in ('consumer_key', 'consumer_secret', 'access_key', 'access_secret')
)

# Build the OAuth handler from the consumer pair, attach the access token
# pair, then create the API client used by the rest of the notebook.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

### Scrape Data and Store in a DataFrame

In [3]:
# Module-level list kept from the original cell. The function below works on
# its own local variables and never reads or writes this name.
tweets = []

def text_query_to_csv(api,text_query,count,max_requests):
    """Search Twitter for ``text_query`` and return the hits as a DataFrame.

    Despite its name, this function does not write a CSV file; it only
    builds and returns the DataFrame.

    Parameters
    ----------
    api
        Authenticated tweepy API object. The search endpoint is looked up as
        ``api.search_tweets`` first (the name used by tweepy 4.x) and falls
        back to ``api.search`` (tweepy 3.x), so either version works.
    text_query
        Query string passed to the search endpoint as ``q``.
    count
        Maximum number of tweets to pull; passed to ``Cursor.items``.
    max_requests
        Accepted for backward compatibility with existing callers.
        It is currently not used by this function.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns listed in ``column_names`` below.
        When the search yields nothing the frame is empty but still has
        all columns.
    """
    # Resolve the search method once so both tweepy generations are supported.
    search_method = getattr(api, 'search_tweets', None)
    if search_method is None:
        search_method = api.search

    # Lazily paginated iterator over at most `count` matching tweets.
    matching_tweets = tweepy.Cursor(search_method, q=text_query).items(count)

    # Pull the fields of interest out of every tweet object. The order here
    # must stay aligned with `column_names`.
    tweets_list = [
        [
            tweet.created_at, tweet.id, tweet.text, tweet.source,
            tweet.coordinates, tweet.retweet_count, tweet.favorite_count,
            tweet.user.name, tweet.user.screen_name, tweet.user.location,
            tweet.user.friends_count, tweet.user.verified,
            tweet.user.description, tweet.user.followers_count,
        ]
        for tweet in matching_tweets
    ]

    # Add or remove column names here whenever fields are added to or removed
    # from the list above.
    column_names = [
        'Datetime', 'Tweet Id', 'Text', 'Source', 'Coordinates',
        'Retweet Count', 'Like Count', 'Username', 'Twitter Handle',
        'Location', 'Following', 'Verification Status', 'Description',
        'Followers',
    ]
    tweets_df = pd.DataFrame(tweets_list, columns=column_names)

    return tweets_df
  

In [4]:
# Search settings:
#   text_query   - term sent to the Twitter search; save_csv also uses it
#                  to name one of the CSV files it writes
#   count        - upper bound on the number of tweets pulled from the search
#   max_requests - forwarded to text_query_to_csv
text_query, count, max_requests = 'andre_bonzoe', 200, 3

# Run the search. The result is the DataFrame returned by text_query_to_csv;
# no CSV file is written at this point.
response = text_query_to_csv(
    api=api,
    text_query=text_query,
    count=count,
    max_requests=max_requests,
)

In [5]:
def save_csv(tweets_df, query=None):
    """Write ``tweets_df`` to two CSV files in the current working directory.

    The files are:

    * ``'<query>-tweets.csv'``, which is overwritten on every call;
    * ``'tweets_downloaded_<YYYYmmdd_HHMMSS>.csv'``, stamped with the local
      time of the call.

    Both files are written without the DataFrame index, so reading them back
    with ``pd.read_csv`` yields exactly the original columns (no extra
    ``Unnamed: 0`` column).

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        The frame to persist.
    query : str, optional
        Text used in the name of the first file. When omitted, the
        notebook-level ``text_query`` variable is used, as before.

    Returns
    -------
    None
        The timestamp string used in the second file name is printed.
    """
    if query is None:
        # Backward-compatible default: fall back to the notebook global.
        query = text_query
    tweets_df.to_csv('{}-tweets.csv'.format(query), sep=',', index=False)

    # Timestamp for the archive copy, e.g. '20201217_205941'.
    ct_string = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    print(ct_string)

    file_name = f"tweets_downloaded_{ct_string}.csv"
    tweets_df.to_csv(file_name, index=False)


In [6]:
# Persist the scraped tweets; save_csv prints the timestamp it puts in the
# archive file name.
save_csv(tweets_df=response)

20201217_205941


## Part 2

### Collecting and storing data in MongoDB

In [7]:
from pymongo import MongoClient
# Connect to the MongoDB server on localhost:27017 and select the 'twitter'
# database from it.
client = MongoClient(host='localhost', port=27017)
db = client['twitter']
# Bare last expression so the cell output shows the Database repr.
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'twitter')

In [8]:
class MongoDB(object):
    """Small helper bound to one MongoDB collection, used to load CSV files."""

    def __init__(self, dBName=None, collectionName=None, mongoClient=None):
        """
        :param dBName: Name of the database to select on the client
        :param collectionName: Name of the collection inside that database
        :param mongoClient: Optional client object to use. When omitted, the
            notebook-level ``client`` created in an earlier cell is used,
            which is the original behaviour.
        """
        self.dBName = dBName
        self.collectionName = collectionName

        # `client` (the notebook global) is only looked up when no client
        # was passed in explicitly.
        self.client = client if mongoClient is None else mongoClient
        self.DB = self.client[self.dBName]
        self.collection = self.DB[self.collectionName]

    def InsertData(self, path=None):
        """
        Read a CSV file and insert every row as one document.

        :param path: Path of the CSV file (read as UTF-8)
        :return: None
        """
        # The context manager closes the file handle even if parsing fails.
        with open(path, encoding='utf8') as f:
            tweets_df = pd.read_csv(f)
        data = tweets_df.to_dict('records')

        if not data:
            # insert_many rejects an empty document list, so skip the call
            # instead of crashing on a CSV that has no data rows.
            print("No rows found in {}; nothing was inserted.".format(path))
            return

        self.collection.insert_many(data)
        print("The collection has been uploded to server...")
        
if __name__ == '__main__':
    # Load a previously saved CSV export into the 'raw_tweets' collection of
    # the 'tweets_db' database.
    # NOTE(review): the file name is hard-coded and does not match the
    # timestamp printed by save_csv earlier in this notebook (20201217_205941);
    # confirm this is the intended file before re-running.
    mongodb = MongoDB('tweets_db', 'raw_tweets')
    mongodb.InsertData('tweets_downloaded_20201217_204545.csv')
        

The collection has been uploded to server...
