# Scraping More than 500 Tweets
https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a

In [1]:
import csv
import json
import os
import re

import dateutil
import dateutil.parser
import requests

## Query parameters
https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all

In [2]:
# functions
def create_headers(bearer_token = None):
    """Build the HTTP headers that authenticate a Twitter API request.

    Parameters
    ----------
    bearer_token : str, optional
        Bearer token to send. When omitted, the value of the
        ``TWITTER_BEARER_TOKEN`` environment variable is used, so the secret
        never has to be pasted into the notebook.

    Returns
    -------
    dict
        A single-entry dict: ``{"Authorization": "Bearer <token>"}``.

    Raises
    ------
    ValueError
        If no token was passed and the environment variable is unset or empty.
    """
    # MUST HAVE YOUR OWN BEARER TOKEN FROM TWITTER'S API
    if bearer_token is None:
        bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")

    # Fail here with a clear message instead of sending "Bearer None"
    if not bearer_token:
        raise ValueError(
            "No bearer token available: pass bearer_token or set the "
            "TWITTER_BEARER_TOKEN environment variable"
        )

    return {"Authorization": "Bearer {}".format(bearer_token)}

def create_url(query, start_date, end_date, max_results = 10):
    """Return the search endpoint and the query parameters for one request.

    Parameters
    ----------
    query : search query string, passed through unchanged
    start_date, end_date : values sent as ``start_time`` / ``end_time``
    max_results : value sent as ``max_results`` (defaults to 10)

    Returns
    -------
    tuple
        ``(search_url, query_params)`` where ``query_params`` is a dict.
    """
    # full archive access
    endpoint = "https://api.twitter.com/2/tweets/search/all"

    # Requested fields, grouped by the parameter they are sent under
    expansions = ('author_id', 'in_reply_to_user_id', 'geo.place_id')
    tweet_fields = ('text', 'geo', 'created_at', 'referenced_tweets')
    user_fields = ('name', 'username', 'created_at', 'verified')
    place_fields = ('full_name', 'id', 'country', 'country_code', 'name')

    request_params = {
        'query': query,
        'start_time': start_date,
        'end_time': end_date,
        'max_results': max_results,
        'expansions': ','.join(expansions),
        'tweet.fields': ','.join(tweet_fields),
        'user.fields': ','.join(user_fields),
        'place.fields': ','.join(place_fields),
        # Placeholder only: connect_to_endpoint assigns the real token
        'next_token': {},
    }
    return endpoint, request_params


def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to the endpoint and return the decoded JSON body.

    Parameters
    ----------
    url : endpoint URL
    headers : dict of HTTP headers (see create_headers)
    params : dict of query parameters (the params object from create_url);
        it is not modified
    next_token : pagination token for the page to fetch, or None for the
        first page
    timeout : seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely

    Returns
    -------
    The parsed JSON response.

    Raises
    ------
    Exception
        With ``(status_code, response_text)`` as arguments when the response
        status is not 200.
    """
    # Work on a copy so the caller's dict is left untouched
    request_params = dict(params)
    # A None token is left out of the query string by requests
    request_params['next_token'] = next_token

    response = requests.request("GET", url, headers = headers,
                                params = request_params, timeout = timeout)
    print("Endpoint Response Code: " + str(response.status_code))

    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


def append_to_csv(json_response, fileName):
    """Append one CSV row per tweet in a search response.

    Each row is ``[username, replied_to, location, verified, created_at, text]``.
    Fields that cannot be resolved fall back to ``" "`` (or ``False`` for
    ``verified``).

    Parameters
    ----------
    json_response : dict
        Decoded search response. Tweets are read from ``'data'``; author and
        place details are read from ``'includes'['users']`` and
        ``'includes'['places']``. Any of these keys may be absent.
    fileName : str
        Path of the CSV file to open OR create; rows are appended.

    Returns
    -------
    None. The number of rows written is printed.
    """
    tweets = json_response.get('data', [])
    includes = json_response.get('includes', {})

    # Index users and places by id once instead of rescanning them per tweet
    users_by_id = {user['id']: user for user in includes.get('users', [])}
    places_by_id = {place['id']: place for place in includes.get('places', [])}

    counter = 0

    # The with-block closes the file even if a row fails part-way through
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile)

        for tweet in tweets:
            # Time created
            created_at = dateutil.parser.parse(tweet['created_at'])

            # Tweet text
            text = tweet['text']

            # replied_to: the first @mention found anywhere in the text
            mentions = re.findall(r"@(\w+)", text)
            replied_to = mentions[0] if mentions else " "

            # Username and verified
            author = users_by_id.get(tweet.get('author_id'), {})
            username = author.get('username', " ")
            verified = author.get('verified', False)

            # Location: reset for every tweet so a value from an earlier
            # tweet can never leak into this row
            location = " "
            place_id = (tweet.get('geo') or {}).get('place_id')
            if place_id is not None:
                place = places_by_id.get(place_id)
                if place is not None:
                    location = place.get('name', " ")

            # Assemble all data in a list and append it to the CSV file
            csvWriter.writerow([username, replied_to, location, verified, created_at, text])
            counter += 1

    # Print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)

### Fill out parameters and query


In [3]:
# QUERY
# The search query is kept in a text file next to the notebook.
# Query syntax reference: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query
with open("network_query.txt", 'r') as query_file:
    query = query_file.read()

# Echo the query so the exact text sent to the API is recorded in the output
print(query)

((russell westbrook) OR westbrick) -is:retweet -has:links lang:en


# Requesting more than 500 tweets
**Time Intervals**
- 1 week periods starting at start_interval for the specified n_weeks

**2018-19**
start_interval = datetime(2018, 10, 16)
end_interval = datetime(2018, 10, 22, 23, 59, 59)
n_weeks = 28
max_results = 75
max_count = 70

**2019-20**
start_interval = datetime(2019, 10, 22)
end_interval = datetime(2019, 10, 28, 23, 59, 59)
n_weeks = 40
max_results = 50
max_count = 45

**2020-21**
start_interval = datetime(2020, 10, 22)
end_interval = datetime(2020, 10, 28, 23, 59, 59)
n_weeks = 30
max_results = 65
max_count = 55

**2021-22**
start_interval = datetime(2021, 10, 19)
end_interval = datetime(2021, 10, 25, 23, 59, 59)
n_weeks = 26
max_results = 78
max_count = 68

In [27]:
from datetime import datetime, timedelta

# First one-week window: midnight on the first day through 23:59:59 on the seventh
start_interval = datetime(2021, 10, 19)
end_interval = datetime(2021, 10, 25, 23, 59, 59)

# Number of consecutive weekly windows to request, counted from start_interval
n_weeks = 26

# One (start, end) tuple per week, each an ISO-format timestamp with a "Z" suffix,
# e.g. ("2021-10-19T00:00:00Z", "2021-10-25T23:59:59Z")
request_dates = [
    ((start_interval + shift).isoformat() + "Z", (end_interval + shift).isoformat() + "Z")
    for shift in (timedelta(weeks=week) for week in range(n_weeks))
]

request_dates

In [33]:
import time
import csv
import dateutil
import os

# Page size sent with every request (up to 500).
# n_weeks * max_results is roughly the maximum total number of tweets.
max_results = 77

# Per-week quota: stop paging a week once this many tweets are saved.
# max_count should be ~5 less than max_results
max_count = 68

# Running total of tweets saved across all weeks
total_tweets = 0

# Output file
filename = '2021_22 network tweets.csv'

# Remove any file left over from a previous run; a missing file is not an error
try:
    os.remove(filename)
except OSError:
    pass

# Column names for the data being saved
fields = ['username', 'replied_to_username', 'location', 'verified', 'created_at', 'text']

# Write the header row, then release the file so append_to_csv can reopen it
with open(filename, "a", newline="", encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(fields)

headers = create_headers()

for week_start, week_end in request_dates:
    count = 0
    next_token = None

    # Keep paging through this week until the quota is met or the pages run out
    while count < max_count:
        print("-------------------")
        print("Token: ", next_token)
        search_url, query_params = create_url(query, week_start, week_end, max_results)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        meta = json_response['meta']
        result_count = meta['result_count']
        has_results = result_count is not None and result_count > 0
        more_pages = 'next_token' in meta

        if more_pages:
            # Save the token to use for next call
            next_token = meta['next_token']
            print("Next Token: ", next_token)
            should_save = has_results and next_token is not None
        else:
            should_save = has_results
            if should_save:
                print("-------------------")

        if should_save:
            print("Start Date: ", datetime.fromisoformat(week_start[:-1]).strftime("%m/%d/%Y"))
            append_to_csv(json_response, filename)
            count += result_count
            total_tweets += result_count
            print("Total # of Tweets added: ", total_tweets)
            print("-------------------")
            time.sleep(1)

        if not more_pages:
            # This was the final page for the week: pause, then move to the next time period
            next_token = None
            time.sleep(1)
            break

        time.sleep(1)

print("Total number of results: ", total_tweets)
print("COMPLETE")

-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpdv6n01mnti3r1sc9c2j0tus9g48t
Start Date:  10/19/2021
# of Tweets added from this response:  74
Total # of Tweets added:  74
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpdv7w68eykidrrcfglw33aix7bbel
Start Date:  10/26/2021
# of Tweets added from this response:  69
Total # of Tweets added:  143
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpdv8qu8d3sqhvgo4vrw1bg78fjxx9
Start Date:  11/02/2021
# of Tweets added from this response:  70
Total # of Tweets added:  213
-------------------
-------------------
Token:  None
Endpoint Response Code: 200
Next Token:  b26v89c19zqg8o3fpdy5kcbqalna6ojkrm7t9x4pbkrjx
Start Date:  11/09/2021
# of Tweets added from this response:  72
Total # of Tweets added:  285
-------------------
-------------------
Token:  None
Endpoint Response Code: 

In [9]:
import pandas as pd

# CSV of collected tweets to inspect
filename = '2016_17 network tweets.csv'

# Load the file into a DataFrame with pandas' default parsing options
df = pd.read_csv(filepath_or_buffer=filename)

In [10]:
# Preview the first rows of the loaded tweets
df.head()

Unnamed: 0,username,replied_to_username,location,verified,created_at,text
0,athegreat584,NBA,,False,2016-10-31 23:59:50+00:00,@NBA \nRussell Westbrook is simply a machine; ...
1,rxjjy96,,,False,2016-10-31 23:56:44+00:00,If Russell Westbrook don't get MVP this season...
2,duvaljr,,,False,2016-10-31 23:55:25+00:00,Nothing about Russell Westbrook game changed. ...
3,prince_nueve,,,False,2016-10-31 23:53:09+00:00,Russell Westbrook and Damian Lillard are ballers
4,MontezzAllen313,,Chicago,False,2016-10-31 23:43:38+00:00,Will Russell Westbrook win MVP this season?
