<a href="https://colab.research.google.com/github/sabinagio/what-s-the-matter/blob/main/twitter_api_data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter API data collection - proof of concept

In [1]:
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses we receive from the API
import json
# For displaying the data after
import pandas as pd
# For saving the response data in CSV format
import csv
# For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata
#To add wait time between requests
import time

In [2]:
os.environ['TOKEN'] = 'AAAAAAAAAAAAAAAAAAAAAFwFgQEAAAAAOjRNg8PpoZ29uTo2Z%2By0K7DEM3s%3DI1NuBq4xOCsYtDMg8nos58Io1kQKgdW80FSEKaaFcSK69TueFy'

In [3]:
def auth():
    return os.getenv('TOKEN')

In [4]:
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

In [5]:
def create_url(keyword, max_results=100):
    
    search_url = "https://api.twitter.com/2/tweets/search/recent" #Change to the endpoint you want to collect data from

    #change params based on the endpoint you are using
    query_params = {'query': keyword, 
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id', 
                    'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,possibly_sensitive,public_metrics,referenced_tweets,reply_settings,source,text,withheld', 
                    'user.fields': 'id,name,username,created_at,description,public_metrics,verified', 
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type', 
                    'next_token': {}}
    return (search_url, query_params)

In [6]:
def connect_to_endpoint(url, headers, params, next_token=None):
    params['next_token'] = next_token   #params object received from create_url function
    response = requests.request("GET", url, headers = headers, params = params)
    print("Endpoint Response Code: " + str(response.status_code))

    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
        
    return response.json()

In [7]:
#Inputs for the request
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = '''fear OR fearful OR afraid OR scared OR terrified OR worry OR worried OR anxiety OR anxious \
  OR distress OR concern OR dismay OR strain OR stress OR tension -"nothing to fear" -"fear not" -"don't worry" -"no worries" lang:en'''

In [8]:
### Small idea space
# perhaps use "I fear", "My fear" 
# remove items that have "fear not"
# How do we find what people fear? 
# Do not include "nothing to fear"

In [9]:
url = create_url(keyword)
json_response = connect_to_endpoint(url[0], headers, url[1])

Endpoint Response Code: 200


In [10]:
json_response

{'data': [{'public_metrics': {'retweet_count': 0,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0},
   'context_annotations': [{'domain': {'id': '130',
      'name': 'Multimedia Franchise',
      'description': "Franchises which span multiple forms of media like 'Harry Potter'"},
     'entity': {'id': '986247671663898625',
      'name': 'Star Wars',
      'description': 'This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable.\t\t\t'}},
    {'domain': {'id': '131',
      'name': 'Unified Twitter Taxonomy',
      'description': 'A taxonomy view into the Semantic Core knowledge graph'},
     'entity': {'id': '839544274442051584', 'name': 'Entertainment'}},
    {'domain': {'id': '131',
      'name': 'Unified Twitter Taxonomy',
      'description': 'A taxonomy view into the Semantic Core knowledge graph'},
     'entity': {'id': '986247671663898625',
      'name': 'Star Wars',
      'description': 'This e

In [11]:
# Figure out how to convert the JSON response to Pandas dataframe
json_response.keys()

dict_keys(['data', 'includes', 'meta'])

In [12]:
json_response["data"][0].keys()

dict_keys(['public_metrics', 'context_annotations', 'conversation_id', 'created_at', 'referenced_tweets', 'possibly_sensitive', 'text', 'author_id', 'entities', 'reply_settings', 'source', 'id', 'in_reply_to_user_id'])

In [13]:
json_response["data"][0]["public_metrics"]

{'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}

In [14]:
len(json_response["data"])

10

In [15]:
json_response["data"][0]

{'public_metrics': {'retweet_count': 0,
  'reply_count': 0,
  'like_count': 0,
  'quote_count': 0},
 'context_annotations': [{'domain': {'id': '130',
    'name': 'Multimedia Franchise',
    'description': "Franchises which span multiple forms of media like 'Harry Potter'"},
   'entity': {'id': '986247671663898625',
    'name': 'Star Wars',
    'description': 'This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable.\t\t\t'}},
  {'domain': {'id': '131',
    'name': 'Unified Twitter Taxonomy',
    'description': 'A taxonomy view into the Semantic Core knowledge graph'},
   'entity': {'id': '839544274442051584', 'name': 'Entertainment'}},
  {'domain': {'id': '131',
    'name': 'Unified Twitter Taxonomy',
    'description': 'A taxonomy view into the Semantic Core knowledge graph'},
   'entity': {'id': '986247671663898625',
    'name': 'Star Wars',
    'description': 'This entity includes all conversation about the franch

> Entity annotations (NER): Entities are comprised of people, places, products, and organizations. Entities are delivered as part of the entity payload section. They are programmatically assigned based on what is explicitly mentioned (named-entity recognition) in the Tweet text.

> Entity annotations are programmatically defined entities that are nested within the entities field and are reflected as annotations in the payload. Each annotation has a confidence score and an indication of where in the Tweet text the entities were identified (start and end fields).

> The entity annotations can have the following types:
> - Person - Barack Obama, Daniel, or George W. Bush
> - Place - Detroit, Cali, or "San Francisco, California"
> - Product - Mountain Dew, Mozilla Firefox
> - Organization - Chicago White Sox, IBM
> - Other - Diabetes, Super Bowl 50

In [16]:
json_response["data"][0]["entities"]

{'mentions': [{'start': 0, 'end': 8, 'username': 'TheJag1', 'id': '606348384'},
  {'start': 9, 'end': 25, 'username': 'martindempseyyy', 'id': '85535213'},
  {'start': 26, 'end': 39, 'username': 'allanmcgraw2', 'id': '1386560216'}]}

> Context annotations: Derived from the analysis of a Tweet’s text and will include a domain and entity pairing which can be used to discover Tweets on topics that may have been previously difficult to surface. At present, we’re using a list of 80+ domains to categorize Tweets. A CSV file of the available context annotation entities is available for download at our [Github repository](https://github.com/twitterdev/twitter-context-annotations).

In [17]:
json_response["data"][0]["context_annotations"]

[{'domain': {'id': '130',
   'name': 'Multimedia Franchise',
   'description': "Franchises which span multiple forms of media like 'Harry Potter'"},
  'entity': {'id': '986247671663898625',
   'name': 'Star Wars',
   'description': 'This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable.\t\t\t'}},
 {'domain': {'id': '131',
   'name': 'Unified Twitter Taxonomy',
   'description': 'A taxonomy view into the Semantic Core knowledge graph'},
  'entity': {'id': '839544274442051584', 'name': 'Entertainment'}},
 {'domain': {'id': '131',
   'name': 'Unified Twitter Taxonomy',
   'description': 'A taxonomy view into the Semantic Core knowledge graph'},
  'entity': {'id': '986247671663898625',
   'name': 'Star Wars',
   'description': 'This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable.\t\t\t'}},
 {'domain': {'id': '131',
   'name': 'Unified Twitter T

In [18]:
json_response["includes"]['users'][1].keys()

dict_keys(['description', 'public_metrics', 'verified', 'id', 'username', 'name', 'created_at'])

In [19]:
json_response["includes"]['users'][0]

{'description': 'Professional Fixer, Glaswegian, PTFC & Maryhill FC Fan getting away with it in Norge, resident at Dr Goldsteins Surgery. Life aint a rehearsal!!!',
 'public_metrics': {'followers_count': 418,
  'following_count': 835,
  'tweet_count': 8598,
  'listed_count': 6},
 'verified': False,
 'id': '24150343',
 'username': 'Norgethistle',
 'name': 'NorgeThistle🍺🥃🚬🇳🇴🏴\U000e0067\U000e0062\U000e0073\U000e0063\U000e0074\U000e007f',
 'created_at': '2009-03-13T09:39:41.000Z'}

In [20]:
len(json_response["includes"]['users'])

11

In [21]:
json_response["meta"].keys()

dict_keys(['newest_id', 'oldest_id', 'result_count', 'next_token'])

In [22]:
json_response["meta"]

{'newest_id': '1563111510595227649',
 'oldest_id': '1563111507193638917',
 'result_count': 10,
 'next_token': 'b26v89c19zqg8o3fpz8l5hem6o96dbgkafxkaaofnphfh'}

In [23]:
# Collect tweets / user data into a dataframe
def extract_data(json_response, data_type="tweet"):

  # Select the relevant data to extract
  data_dict = {"tweet": json_response["data"], "user": json_response["includes"]["users"]}
  data = data_dict[data_type]

  # Extract the dataframe columns
  columns_list = list(data[0].keys())
  columns_list.extend(list(data[0]["public_metrics"].keys()))
  columns_list.remove("public_metrics")
  df = pd.DataFrame(columns=columns_list)

  for row in data:
    # Extract the public metrics data
    public_metrics = row["public_metrics"]

    for key in public_metrics.keys():
      row[key] = public_metrics[key]
    row.pop("public_metrics")

    # Add tweet to dataframe with unpacked public metrics
    df = df.append(row, ignore_index=True)
  return df

In [24]:
tweet_data = extract_data(json_response, data_type="tweet")
tweet_data

Unnamed: 0,context_annotations,conversation_id,created_at,referenced_tweets,possibly_sensitive,text,author_id,entities,reply_settings,source,id,in_reply_to_user_id,retweet_count,reply_count,like_count,quote_count,attachments
0,"[{'domain': {'id': '130', 'name': 'Multimedia ...",1563089494475124736,2022-08-26T10:29:59.000Z,"[{'type': 'replied_to', 'id': '156311065075033...",False,@TheJag1 @martindempseyyy @allanmcgraw2 Fear i...,24150343,"{'mentions': [{'start': 0, 'end': 8, 'username...",everyone,Twitter for iPhone,1563111510595227649,606348384.0,0,0,0,0,
1,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1563111509995053061,2022-08-26T10:29:59.000Z,"[{'type': 'retweeted', 'id': '1562779004679372...",False,RT @ValaAfshar: The mesmerizing beauty of phys...,563697218,"{'urls': [{'start': 96, 'end': 119, 'url': 'ht...",everyone,Twitter for Android,1563111509995053061,,4674,0,0,0,{'media_keys': ['7_1139447559418732544']}
2,,1563111509898981376,2022-08-26T10:29:59.000Z,"[{'type': 'retweeted', 'id': '1562770800121241...",False,RT @JR1991JR: @ELHopkins He was too worried ab...,392678207,"{'mentions': [{'start': 3, 'end': 12, 'usernam...",everyone,Twitter for Android,1563111509898981376,,1,0,0,0,
3,,1563111509823488002,2022-08-26T10:29:59.000Z,"[{'type': 'retweeted', 'id': '1562988398716936...",False,RT @kurikondeshu: Not afraid cause u r here wi...,1317916968641470464,"{'urls': [{'start': 88, 'end': 111, 'url': 'ht...",everyone,Twitter for iPhone,1563111509823488002,,628,0,0,0,{'media_keys': ['3_1562988389128818688']}
4,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",1563111509492137985,2022-08-26T10:29:59.000Z,"[{'type': 'retweeted', 'id': '1563095938050076...",False,RT @NewsweekPak: Despite government assurances...,15943583,"{'mentions': [{'start': 3, 'end': 15, 'usernam...",everyone,Twitter for iPhone,1563111509492137985,,196,0,0,0,
5,,1563111509173014530,2022-08-26T10:29:59.000Z,"[{'type': 'retweeted', 'id': '1562985395344019...",False,RT @LittleMissNotes: little miss anxiety,1547588508071514112,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",everyone,Twitter for iPhone,1563111509173014530,,1072,0,0,0,
6,,1563111508615495680,2022-08-26T10:29:59.000Z,,False,#BeARealHeroWhen I went to donate blood for th...,1378253669674668033,"{'urls': [{'start': 259, 'end': 282, 'url': 'h...",everyone,Twitter for Android,1563111508615495680,,0,0,0,0,{'media_keys': ['3_1563111499702620166']}
7,,1563111508133158917,2022-08-26T10:29:59.000Z,,False,Fear is the little-death that brings total obl...,1505563960233570304,,everyone,Twitter for Android,1563111508133158917,,0,0,0,0,
8,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1563111507918893063,2022-08-26T10:29:59.000Z,"[{'type': 'retweeted', 'id': '1563015163774570...",False,RT @headlineplanet: Current Top 5 on Spotify's...,1215912303117459456,"{'annotations': [{'start': 37, 'end': 43, 'pro...",everyone,Twitter for Android,1563111507918893063,,1768,0,0,0,
9,,1563111507193638917,2022-08-26T10:29:59.000Z,"[{'type': 'retweeted', 'id': '1563098268011155...",False,RT @kolibabski: I'm feeling anxious and overwh...,1333751501194362880,"{'mentions': [{'start': 3, 'end': 14, 'usernam...",everyone,Twitter for iPhone,1563111507193638917,,2,0,0,0,


In [26]:
user_data = extract_data(json_response, data_type="user")
user_data

Unnamed: 0,description,verified,id,username,name,created_at,followers_count,following_count,tweet_count,listed_count
0,"Professional Fixer, Glaswegian, PTFC & Maryhil...",False,24150343,Norgethistle,NorgeThistle🍺🥃🚬🇳🇴🏴󠁧󠁢󠁳󠁣󠁴󠁿,2009-03-13T09:39:41.000Z,418,835,8598,6
1,"Likes the truth. Hates liars, hangers on, self...",False,606348384,TheJag1,Calum Nicol,2012-06-12T12:09:13.000Z,98,359,13037,0
2,FJBA Original Account | ENGENE🔀 | ARMY💜 | SM S...,False,563697218,fritzjanross,FJ | laban,2012-04-26T12:49:33.000Z,723,1160,65027,4
3,I Got Happiness LFC supporter since 1965 YNWA ...,False,392678207,RobertGEdge,Robert G Edge (Ex Labour Party Member),2011-10-17T12:14:11.000Z,4823,3110,472542,6
4,,False,1317916968641470464,Cherry14222913,Cherry,2020-10-18T19:54:41.000Z,1,55,640,0
5,"Justice for all living beings, peace for all. ...",False,15943583,bmw325ia,Muhammad Shafi,2008-08-22T11:09:19.000Z,572,1440,37142,25
6,💫🌖The sun watches what I do but the moon knows...,False,1547588508071514112,1PrettyGuardian,✨🌙 Ashley Herrera ✨🌙,2022-07-14T14:27:15.000Z,19,71,708,0
7,mukesh,False,1378253669674668033,MonuRaj72275511,Monu Rajput,2021-04-03T07:51:31.000Z,4,19,134,0
8,"by grace, she'll make it ❦︎ {📱📱 — reason: l...",False,1505563960233570304,savedbysebongs,angel ☽︎,2022-03-20T15:16:58.000Z,321,2787,9686,0
9,BLACKPINK fan account,False,1215912303117459456,Sportyrosie,em,2020-01-11T08:24:28.000Z,12,222,17306,1


In [None]:
# Attempt to get all of the tweets that match the query and save into a dataframe 

#Inputs for the request
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = '''fear OR fearful OR afraid OR scared OR terrified OR worry OR worried OR anxiety OR anxious \
  OR distress OR concern OR dismay OR strain OR stress OR tension -"nothing to fear" -"fear not" -"don't worry" -"no worries" lang:en'''
url = create_url(keyword)

# First query
json_response = connect_to_endpoint(url[0], headers, url[1])
next_token = json_response["meta"]

while next_token:


In [None]:
### CODE FROM AN ARTICLE - TO BE USED FOR INSPIRATION IN GETTING RECURRENT REQUESTS

#Inputs for tweets
bearer_token = auth()
headers = create_headers(bearer_token)
keyword = "xbox lang:en"
start_list =    ['2021-01-01T00:00:00.000Z',
                 '2021-02-01T00:00:00.000Z',
                 '2021-03-01T00:00:00.000Z']

end_list =      ['2021-01-31T00:00:00.000Z',
                 '2021-02-28T00:00:00.000Z',
                 '2021-03-31T00:00:00.000Z']
max_results = 500

#Total number of tweets we collected from the loop
total_tweets = 0

# Create file
csvFile = open("data.csv", "a", newline="", encoding='utf-8')
csvWriter = csv.writer(csvFile)

#Create headers for the data you want to save, in this example, we only want save these columns in our dataset
csvWriter.writerow(['author id', 'created_at', 'geo', 'id','lang', 'like_count', 'quote_count', 'reply_count','retweet_count','source','tweet'])
csvFile.close()

for i in range(0,len(start_list)):

    # Inputs
    count = 0 # Counting tweets per time period
    max_count = 100 # Max tweets per time period
    flag = True
    next_token = None
    
    # Check if flag is true
    while flag:
        # Check if max_count reached
        if count >= max_count:
            break
        print("-------------------")
        print("Token: ", next_token)
        url = create_url(keyword, start_list[i],end_list[i], max_results)
        json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
        result_count = json_response['meta']['result_count']

        if 'next_token' in json_response['meta']:
            # Save the token to use for next call
            next_token = json_response['meta']['next_token']
            print("Next Token: ", next_token)
            if result_count is not None and result_count > 0 and next_token is not None:
                print("Start Date: ", start_list[i])
                append_to_csv(json_response, "data.csv")
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(5)                
        # If no next token exists
        else:
            if result_count is not None and result_count > 0:
                print("-------------------")
                print("Start Date: ", start_list[i])
                append_to_csv(json_response, "data.csv")
                count += result_count
                total_tweets += result_count
                print("Total # of Tweets added: ", total_tweets)
                print("-------------------")
                time.sleep(5)
            
            #Since this is the final request, turn flag to false to move to the next time period.
            flag = False
            next_token = None
        time.sleep(5)
print("Total number of results: ", total_tweets)