In [1]:
# For sending GET requests to the API
import requests

# For saving access tokens and for file management when creating and adding to the dataset
import os

# For dealing with json responses we receive from the API
import json

# For displaying the data after
import pandas as pd

# For saving the response data in CSV format
import csv

# For parsing the dates received from Twitter into readable formats
import datetime
import dateutil.parser
import unicodedata

#To add wait time between requests
import time

In [3]:
def auth():
    """Return the bearer token used to authenticate API requests.

    The token is read from the ``TWITTER_BEARER_TOKEN`` environment variable
    so that a real credential never has to be saved inside the notebook.
    When the variable is unset or empty, the placeholder string is returned.

    Returns:
        str: the bearer token, or the ``"XXXX-XXXX-XXXX-XXXX"`` placeholder.
    """
    # `or` (rather than a .get default) also covers a variable set to "".
    return os.environ.get("TWITTER_BEARER_TOKEN") or "XXXX-XXXX-XXXX-XXXX"

def create_headers(bearer_token):
    """Build the HTTP headers dict that carries the bearer token.

    Args:
        bearer_token: token placed after the ``Bearer`` scheme.

    Returns:
        dict: a single-entry mapping with the ``Authorization`` header.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

def create_url(keyword, start_date, end_date, max_results = 10):
    """Assemble the search endpoint URL and its query parameters.

    Args:
        keyword: search query, sent as the ``query`` parameter.
        start_date: sent as ``start_time``.
        end_date: sent as ``end_time``.
        max_results: sent as ``max_results`` (defaults to 10).

    Returns:
        tuple: ``(search_url, query_params)`` where ``query_params`` is a dict.
        Its ``next_token`` entry is only an empty placeholder; the real
        pagination token is supplied when the request is sent.
    """
    # Change to the endpoint you want to collect data from.
    # With academic research access: "https://api.twitter.com/2/tweets/search/all"
    search_url = "https://api.twitter.com/2/tweets/search/recent"

    # Field lists are kept as sequences for readability and joined with commas below.
    expansions = ('author_id', 'in_reply_to_user_id', 'geo.place_id')
    tweet_fields = ('id', 'text', 'author_id', 'in_reply_to_user_id', 'geo',
                    'conversation_id', 'created_at', 'lang', 'public_metrics',
                    'referenced_tweets', 'reply_settings', 'source')
    user_fields = ('id', 'name', 'username', 'created_at', 'description',
                   'public_metrics', 'verified')
    place_fields = ('full_name', 'id', 'country', 'country_code', 'geo',
                    'name', 'place_type')

    # Change params based on the endpoint you are using.
    query_params = {}
    query_params['query'] = keyword
    query_params['start_time'] = start_date
    query_params['end_time'] = end_date
    query_params['max_results'] = max_results
    query_params['expansions'] = ','.join(expansions)
    query_params['tweet.fields'] = ','.join(tweet_fields)
    query_params['user.fields'] = ','.join(user_fields)
    query_params['place.fields'] = ','.join(place_fields)
    query_params['next_token'] = {}

    return (search_url, query_params)

def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the endpoint and return the decoded JSON body.

    Args:
        url: endpoint URL.
        headers: HTTP headers dict (carries the bearer token).
        params: query parameters dict, as produced by ``create_url``. It is
            not modified.
        next_token: pagination token for the page to fetch; ``None`` requests
            the first page.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: with ``(status_code, response_text)`` as arguments when the
            response status is not 200.
    """
    # Work on a copy so the caller's dict is left untouched.
    query = dict(params)
    if next_token is None:
        # Drop any placeholder instead of sending an empty/None token.
        query.pop('next_token', None)
    else:
        query['next_token'] = next_token

    response = requests.request("GET", url, headers = headers, params = query, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [4]:
def append_to_csv(json_response, fileName):
    """Append one CSV row per tweet in a search response to ``fileName``.

    Columns written, in order: author id, created_at (parsed to a datetime),
    tweet id, lang, like_count, quote_count, reply_count, retweet_count,
    source, text. Geolocation is intentionally not written.

    Args:
        json_response: decoded search response. Tweets are read from its
            ``data`` list; a response without ``data`` adds no rows.
        fileName: path of the CSV file, opened in append mode (created if
            it does not exist).

    Returns:
        None. The number of rows added is printed.
    """
    counter = 0

    # NOTE(review): a page with no matches appears to come back without a
    # 'data' key at all, so treat a missing key as "no tweets" rather than
    # raising KeyError - confirm against the API reference.
    tweets = json_response.get('data', [])

    # The context manager closes the file even if a malformed tweet raises.
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        for tweet in tweets:
            metrics = tweet['public_metrics']

            # 'lang' and 'source' are read defensively: some keys might not
            # exist for some tweets, and an empty cell is better than losing
            # the whole page.
            res = [
                tweet['author_id'],
                dateutil.parser.parse(tweet['created_at']),
                tweet['id'],
                tweet.get('lang', ''),
                metrics['like_count'],
                metrics['quote_count'],
                metrics['reply_count'],
                metrics['retweet_count'],
                tweet.get('source', ''),
                tweet['text'],
            ]
            csvWriter.writerow(res)
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

In [5]:
#Inputs for tweets
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = "Colombia lang:es -is:retweet"

# Collection window: one 24-hour period per day, day_start..day_end inclusive.
year = 2022
month = 2 # feb
day_start = 15
day_end = 21

# Build the period bounds with real date arithmetic instead of string
# interpolation, so single-digit days are zero-padded and a period that ends
# on the last day of the month rolls over to the first day of the next one.
_TIMESTAMP_FORMAT = '%Y-%m-%dT00:00:00.000Z'
_one_day = datetime.timedelta(days = 1)
_period_days = [datetime.date(year, month, day_start) + offset * _one_day
                for offset in range(day_end - day_start + 1)]

# start_list[i] / end_list[i] are the bounds of period i.
start_list = [day.strftime(_TIMESTAMP_FORMAT) for day in _period_days]
end_list = [(day + _one_day).strftime(_TIMESTAMP_FORMAT) for day in _period_days]
max_results = 100


def gatherTweetsPerPeriod(start_time, end_time, filename, desired_tweets = 600 ):
    """Page through the search endpoint for one period and save tweets to CSV.

    Uses the module-level ``keyword`` and ``headers``. Requests are repeated,
    following the pagination token, until at least ``desired_tweets`` tweets
    were collected, the results are exhausted, or a request fails.

    Args:
        start_time: lower bound of the period, passed to ``create_url``.
        end_time: upper bound of the period, passed to ``create_url``.
        filename: CSV file to append to. The header row is written only when
            the file is new or empty.
        desired_tweets: number of tweets to aim for. Because tweets arrive in
            whole pages, the final count can exceed this.

    Returns:
        None. Progress and the newest tweet id of the last page are printed.
    """
    #Total number of tweets we collected from the loop
    collected = 0
    # Defined up front so the final status line can always be printed, even
    # when the very first request fails or the loop never runs.
    since_id = None
    next_token = None

    # Only a new/empty file gets the header; appending to an existing dataset
    # must not insert a second header row in the middle of the data.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    with open(filename, "a", newline="", encoding='utf-8') as csvFile:
        if needs_header:
            csv.writer(csvFile).writerow(['author id', 'created_at', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])

    # The search endpoint only accepts page sizes from 10 to 100, so clamp
    # instead of sending a value it would reject.
    max_results = max(10, min(desired_tweets, 100))

    while collected < desired_tweets:
        try:
            url = create_url(keyword, start_time, end_time, max_results)
            json_response = connect_to_endpoint(url[0], headers, url[1], next_token=next_token)
            meta = json_response['meta']
            result_count = meta['result_count']

            # Save the page BEFORE looking at the pagination token: the last
            # page has no 'next_token', and its tweets must still be written.
            if result_count:
                since_id = meta.get('newest_id', since_id)
                append_to_csv(json_response, filename)
                collected += result_count

            # Save the token to use for next call; no token means no more pages.
            next_token = meta.get('next_token')
            if next_token is None:
                break

            # Wait between requests
            time.sleep(1)
        except Exception as error:
            # Best effort: report the failure and keep what was collected so far.
            print("Caught Exception:")
            print(error)
            break
    print(f"Last Query Newest ID: {since_id}")


In [6]:
from math import ceil
import os


# Overall number of tweets wanted across all periods.
total_target = int(1e5)
# Split the target evenly over however many periods are configured, rather
# than hard-coding the count, so changing the date range keeps this correct.
per_day = ceil(total_target / len(start_list))
period = 6 # index into start_list / end_list (0-6 for the seven periods)
filename = f"period{period}.csv"
# Remove a previously existing file so this run starts from a clean dataset.
if os.path.exists(filename):
    os.remove(filename)
print(f"Gathering {per_day} tweets for period {start_list[period]}-{end_list[period]} (period {period})")
gatherTweetsPerPeriod(start_list[period],end_list[period],desired_tweets = per_day, filename=filename)

Gathering 14286 tweets for period 2022-02-21T00:00:00.000Z-2022-02-22T00:00:00.000Z (period 6)
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  99
Endpoint Response Code: 200
# of Tweets added from this response:  99
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  99
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  99
Endpoint Response Code: 200
# of Tweets added from this response:  99
Endpoint Response Code: 200
# of Tweets added from this response:  99
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  99
Endpoint Response Code: 200
# of Tweets added from this resp

Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoint Response Code: 200
# of Tweets added from this response:  99
Endpoint Response Code: 200
# of Tweets added from this response:  100
Endpoin

In [7]:
import pandas as pd

# Load the seven per-period files (period0.csv ... period6.csv) and stack them.
period_frames = [pd.read_csv(f'period{index}.csv') for index in range(7)]
df = pd.concat(period_frames)

# Keep only the tweet id, its language and its text.
kept_columns = ["id", "lang", "tweet"]
df = df[kept_columns]

# Report whether any tweet id occurs more than once, and the total row count.
unique = df["id"].is_unique
print(f"Unique tweets?: {unique}")
print(len(df))
df.head()

Unique tweets?: True
100604


Unnamed: 0,id,lang,tweet
0,1493736882106245120,es,@JuanCar99077589 @elojodiestro @intiasprilla Y...
1,1493736869108006912,es,@NairoQuinCo no tiene ni puta idea como es la ...
2,1493736868755611649,es,@Enrique_GomezM deje me decirle con todo res...
3,1493736858999787521,es,@PizarroMariaJo @MovimientoMAIS @UP_Colombia @...
4,1493736858769108992,es,Mal día para el duquecito en el parlamento eur...
