# An Extensive Guide to collecting Tweets from Twitter API v2 for Analysis using Python

## List:

1. Introduction
2. Pre-requisites to start
3. Bearer Token
4. Create Headers
5. Create URL
6. Connect to Endpoint
7. Save results to CSV
8. Tweet collection, explained

## 1. Introduction

## 2. Pre-requisites

In [None]:
import requests
import os
import json
import pandas as pd
import csv
import datetime
import dateutil.parser
import unicodedata
import time
import re

# Matches either one character that is not an ASCII letter, digit, space or
# tab, or a URL-like token of the form "<scheme>://<non-whitespace>".
# Written as a raw string so the regex escapes are not parsed as (invalid)
# Python string escapes.
_URL_OR_SYMBOL_RE = re.compile(r"[^0-9A-Za-z \t]|\w+://\S+")


def remove_url(txt):
    """Return *txt* with URLs and non-alphanumeric characters stripped.

    Tweets returned by the Twitter API often embed URLs, which can hurt the
    accuracy of sentiment analysis tools. Note that this helper does more
    than remove URLs: every character that is not an ASCII letter, digit,
    space or tab (punctuation, emoji, accented letters, newlines, ...) is
    removed as well, and runs of whitespace are collapsed to a single space.

    Requires: a string.
    Ensures: the cleaned string, with no leading or trailing whitespace.
    """
    return " ".join(_URL_OR_SYMBOL_RE.sub("", txt).split())

## 3. Bearer Token

In [None]:
# The bearer token is a secret and must never be committed to source control.
# It is read from the environment instead, e.g. set it once per session with
#     os.environ["TWITTER_BEARER_TOKEN"] = "<your token>"
# or export it in the shell before starting the notebook.
# NOTE(review): a token was previously hard-coded here; treat it as leaked and
# regenerate it in the Twitter developer portal.
def auth():
    """Return the Twitter API bearer token.

    The token is read from the ``TWITTER_BEARER_TOKEN`` environment variable.

    Returns:
        The bearer token string.

    Raises:
        RuntimeError: if the environment variable is unset or empty.
    """
    token = os.environ.get("TWITTER_BEARER_TOKEN")
    if not token:
        raise RuntimeError(
            "TWITTER_BEARER_TOKEN is not set; export your Twitter API bearer "
            "token in the environment before calling auth()."
        )
    return token

## 4. Create Headers

In [None]:
def create_headers(bearer_token):
    """Build the HTTP headers that authenticate a request with a bearer token.

    Returns a dict with a single ``Authorization`` entry of the form
    ``"Bearer <bearer_token>"``.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

## 5. Create URL

In [None]:
def create_url(keyword, start_date, end_date, max_results):
    """Build the endpoint URL and query parameters for a full-archive search.

    Args:
        keyword: the search query string sent as ``query``.
        start_date: value sent as ``start_time``.
        end_date: value sent as ``end_time``.
        max_results: number of tweets requested per API call, sent as
            ``max_results``. (Previously this argument was ignored and 500
            was always requested.)

    Returns:
        A ``(search_url, query_params)`` tuple, where ``query_params`` is a
        dict of request parameters.
    """
    search_url = "https://api.twitter.com/2/tweets/search/all"

    query_params = {
        'query': keyword,
        'start_time': start_date,
        'end_time': end_date,
        'max_results': max_results,
        'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
        'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
        'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
        'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
        # Placeholder for the pagination token; presumably overwritten by the
        # code that sends the request -- confirm against the caller.
        'next_token': {},
    }
    return (search_url, query_params)

## 6. Save results to CSV

In [None]:
def _tweet_to_row(tweet):
    """Flatten one tweet dict from the API response into a list of CSV cells."""
    metrics = tweet['public_metrics']

    # IDs are kept as plain strings: encoding them to bytes would make the
    # csv writer emit their repr (e.g. "b'123'") instead of the ID itself.
    row = [
        dateutil.parser.parse(tweet['created_at']),   # 1) Time
        tweet['id'],                                  # 2) Tweet ID
        tweet['author_id'],                           # 3) Author ID
        tweet['text'],                                # 4) Tweet text (unmodified)
        # 5) Place ID, when a geo object with a place_id exists
        (tweet.get('geo') or {}).get('place_id', " "),
        tweet['lang'],                                # 6) Language of tweet
        tweet['conversation_id'],                     # 7) Conversation ID
        # 8) ID of the user being replied to, when the tweet is a reply
        tweet.get('in_reply_to_user_id', " "),
        # 9) Tweet metrics
        metrics['retweet_count'],
        metrics['reply_count'],
        metrics['like_count'],
        metrics['quote_count'],
    ]

    # Context annotations are optional; only the first three are recorded.
    for annotation in tweet.get('context_annotations', [])[:3]:
        domain = annotation['domain']
        entity = annotation['entity']
        # Fold the entity name to plain ASCII, then decode back to str so the
        # CSV cell holds text rather than a bytes repr.
        entity_name = (
            unicodedata.normalize('NFKD', entity['name'])
            .encode('ascii', 'ignore')
            .decode('ascii')
        )
        row += [
            domain['id'],
            domain['name'],
            # Tolerate a missing description rather than abort the batch.
            domain.get('description', " "),
            entity['id'],
            entity_name,
        ]

    return row


def save_to_csv(json_response, fileName):
    """Append the tweets contained in one API response to a CSV file.

    One row is written per tweet with the columns: creation time, tweet ID,
    author ID, text, place ID, language, conversation ID, in-reply-to user
    ID, retweet/reply/like/quote counts, followed by up to three context
    annotations (domain ID, domain name, domain description, entity ID,
    entity name each). Missing optional values are written as a single
    space. No header row is written.

    Args:
        json_response: decoded JSON response (a dict). Tweets are read from
            its ``data`` list; a response without ``data`` is treated as
            containing no tweets.
        fileName: path of the CSV file; it is opened in append mode and
            created if it does not exist.

    Side effects:
        Prints the number of tweets written for this call.
    """
    tweets = json_response.get('data', [])

    # The context manager guarantees the file is closed even if a malformed
    # tweet raises part-way through the batch.
    with open(fileName, "a", newline="", encoding='utf-8-sig') as csv_file:
        writer = csv.writer(csv_file)
        for tweet in tweets:
            writer.writerow(_tweet_to_row(tweet))

    # Print the number of tweets for this iteration
    print("Tweets in API call: ", len(tweets))