In [2]:
! pip install emoji --upgrade
! pip install twarc
! pip install twarc-csv



In [4]:
# Load packages

In [3]:
import os
import glob
import logging
from tqdm import tqdm
import ast
import requests
import os
import json
import pandas as pd
import time

import emoji


# Prerequisite functions

In [47]:
def set_logger(log_file_path="debug.log", level="INFO"):
# def set_logger(log_file_path="debug.log", level="DEBUG"):
    logger = logging.getLogger()
    logger.setLevel(level)
    scream_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(log_file_path)
    logger.addHandler(scream_handler)
    logger.addHandler(file_handler)
    return logger
    


try:
    # print(len(logger.handlers))
    while len(logger.handlers) > 1:
        logger.handlers.pop(0)
        # print(len(logger.handlers))
except:
    pass

logger = set_logger()

def get_api_token(token_path):    
    try:
        with open(token_path, "r") as f:
            logger.debug("token_path: %s" % token_path)
            lines = f.readlines()
            logger.debug("lines in the file: %s" % lines)

            lines = [line.split(": ")[-1][:-1] for line in lines]
        return lines

    except Exception as e:
        logger.error("Error: %s" % str(e))




#-------------------- merge results -------------------#          
def find_place_id(row):
    # print(row)
    
    cell_text = row.get("geo", "")
    # print(cell_text)
    if len(cell_text) > 1:
        place_dict = ast.literal_eval(cell_text)
    else:
        return ""
    # print(place_dict)
    if isinstance(place_dict, dict):
        place_id = place_dict.get("place_id", "")
        if len(place_id) > 1:
            return place_id

def clean_tweets(row):
    
    text = row['text'].replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').strip()
    return text

def refine_data(df):
    df['place_id'] = df.apply(find_place_id, axis=1)
    df['text'] = df.apply(clean_tweets, axis=1)
    
    return df

def find_media_row(row, df_media):
    cell_text = row["attachments"]
    if len(cell_text) > 1:
        attachments_dict = ast.literal_eval(cell_text)
    else:
        return ""
    
    if isinstance(attachments_dict, dict):
        media_keys = attachments_dict.get("media_keys", "")
        media_rows = []
        # print(df_media)
        # print(attachments_dict)
        for key in media_keys:
            key = str(key)
            if len(key) > 1:
                # print(key)
                # print(df_media['media_table_media_key'])
                row = df_media[df_media['media_table_media_key']==key].iloc[0].to_json()
                # print(df_media[df_media['media_table_media_key']==key])
                media_rows.append(row)
        # print(media_rows)
        return media_rows
    return ""

def get_lonlat(row):
    row["lon"] = ""
    row["lat"] = ""
#     print('row[places_table_geo]:', row["places_table_geo"])
    if len(row["places_table_geo"]) > 1:
        geo_dict = ast.literal_eval(row["places_table_geo"])
#         print('geo_dict:', geo_dict)
        bbox = geo_dict.get("bbox", [])
        if len(bbox) == 4:
            row["lon"] = (bbox[0] + bbox[2]) / 2
            row["lat"] = (bbox[1] + bbox[3]) / 2
    return row

def merge_results(saved_path):
    data_files = glob.glob(os.path.join(saved_path, "*_data.csv"))
    logger.info("Start to merge %d filles." % len(data_files))
    all_df = []
    for d in tqdm(data_files[:]):
        df_data = pd.read_csv(d)
        print(d)
        df_data = df_data.fillna("")
        df_data = refine_data(df_data)

        df_merged = df_data        

        # process places file
        places_csv = d.replace("data.csv", "includes_places.csv")
        if os.path.exists(places_csv):
            df_places = pd.read_csv(places_csv).fillna("")
            new_column_name = {name: "places_table_" + name for name in df_places.columns}
            df_places = df_places.rename(columns=new_column_name)        
            df_merged = pd.merge(df_merged, df_places, how='left', left_on="place_id", right_on="places_table_id")

        # process tweets file
        tweets_csv = d.replace("data.csv", "includes_tweets.csv")
        if os.path.exists(tweets_csv):
            df_tweets = pd.read_csv(tweets_csv).fillna("")
            df_tweets["text"] = df_tweets["text"].str.replace("\n", " ")
            new_column_name = {name: "tweets_table_" + name for name in df_tweets.columns}
            df_tweets = df_tweets.rename(columns=new_column_name)      
            df_merged = pd.merge(df_merged, df_tweets, how='left', left_on="id", right_on="tweets_table_id")

        # process users file
        users_csv = d.replace("data.csv", "includes_users.csv")
        if os.path.exists(tweets_csv):
            df_users = pd.read_csv(users_csv).fillna("")
            df_users["description"] = df_users["description"].str.replace("\n", " ")
            new_column_name = {name: "users_table_" + name for name in df_users.columns}
            df_users = df_users.rename(columns=new_column_name)     
            df_merged = pd.merge(df_merged, df_users, how='left', left_on="author_id", right_on="users_table_id")        

        # process media file
        media_csv = d.replace("data.csv", "includes_media.csv")
        if os.path.exists(media_csv):
            df_media = pd.read_csv(media_csv).fillna("")
            df_media['media_key'] = df_media['media_key'].astype(str)
            new_column_name = {name: "media_table_" + name for name in df_media.columns}
            df_media = df_media.rename(columns=new_column_name)  
            df_merged["media_table_rows"] = df_merged.apply(find_media_row, args=(df_media,), axis=1)
        

        df_merged = df_merged.fillna("")
        df_merged.replace("\n", " ")
        df_merged = df_merged.drop_duplicates(subset=['id'], keep='last')
        # print(len(df_places))
        # return df_merged
        all_df.append(df_merged)

    print("\nGenerating final CSV file, including %d small CSV files." % len(all_df))
    print("\nPlease wait...")

    final_df = pd.concat(all_df).fillna("")
    final_df = final_df.apply(get_lonlat, axis=1).reset_index()
    final_file = os.path.join(saved_path, "merged.csv.gz")
    final_df.to_csv(final_file, index=False)
    logger.info("\nSaved merged tweets in %s ." % final_file)

    return final_df

# Set tokens

Put your Twitter API tokens in the ```tweet_api_keys.txt``` file in the same directory of this notebook in the following format:
```
Consumer API Key: XXXX
Consumer API Secret Key: XXXX
Bearer Token: XXXX
Access Token: XXXX
Access Token Secret: XXXX
```

In [10]:
token_path = r'J:\Research\tweet_download\tweet_api_keys.txt'

tokens = get_api_token(token_path)

consumer_key = tokens[0]
consumer_secret = tokens[1]
bearer_token = tokens[2]
access_token = tokens[3]
access_token_secret = tokens[4]

# Download tweets

The following cell is a exmaple query to download tweets in Australia with a keyword of "vaccine" since 2021-01-01 to 2021-06-01.

Please set ```query```, ```start_time```, ```end_time```, ```saved_path```, and ```max_results``` (10 - 500).

See these pages to building a query: 

[Building queries for Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#examples)

[Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all)




In [37]:
# a exmaple query to download tweets in Australia with a keyword of "vaccine" since 2020-01-01
# keyword = "vaccine"

# query = f"({keyword}) place_country:AU -is:retweet"
# query = f"({keyword}) place_country:AU"
# query = "(vaccin OR vaccination OR vaccine OR vaccinate) place_country:AU"

query = "telemedicine  OR telehealth  OR telecare"

# query = f"({keyword})"
start_time = "2021-11-29T20:00:01Z"
end_time = "2021-11-30T00:00:01Z"
max_results = 100   # max_results can be 500 if do not request the field: context_annotations

# since_id = "139819805172285849"  # cannot used with start/end_time!


# borrow from Twitter:
# https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py
search_url = "https://api.twitter.com/2/tweets/search/all"


# saved_path = os.path.join(os.getcwd(), "saved_tweets")
saved_path = r"downloaded_tweets_test"
os.makedirs(saved_path, exist_ok=True)

In [42]:
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers


def connect_to_endpoint(url, headers, params):
    response = requests.request("GET", search_url, headers=headers, params=params)
    # print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)   
    return response.json()

def save_search(json_response, saved_path):
    try:
        if not os.path.exists(saved_path):
            os.mkdir(saved_path)

        meta = json_response['meta']
        data = json_response['data']
        includes = json_response['includes']
        basename = f"{meta['oldest_id']}_{meta['newest_id']}_{meta['result_count']}"

        data_filename = os.path.join(saved_path, basename + "_data.csv")
        df = pd.DataFrame(data)
        df.to_csv(data_filename, index=False)
        result_count = meta['result_count']
        result_count = str(result_count)
        logger.info("Saved %s tweets in: %s" % (result_count, data_filename))

        for key in includes.keys():
            includes_filename = os.path.join(saved_path, basename + f"_includes_{key}.csv")
            df = pd.DataFrame(includes[key])
            df.to_csv(includes_filename, index=False)
    except Exception as e:
        logger.error(e, exc_info=True)

def execute_download(saved_path=os.getcwd()):
    
    start_timer = time.perf_counter()

    next_token = 'start'
    search_url = "https://api.twitter.com/2/tweets/search/all"
    headers = create_headers(bearer_token)
    total = 0
    query_params = {'query': query, \
                    "max_results": str(max_results), \
                    'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id', \
                    'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld', \
                    'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type', \
                    "user.fields": 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',\
                    "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics", \
                    "poll.fields": "duration_minutes,end_datetime,id,options,voting_status", \
                    "start_time": start_time, \
                    "end_time": end_time, \
                    # "since_id":since_id, \  # cannot used with start/end_time!
                    }

    while next_token != "":
        try:
            
            json_response = connect_to_endpoint(search_url, headers, query_params)
#             df = pd.DataFrame(json_response['data'])
            save_search(json_response, saved_path)
            
            total += int(json_response['meta']['result_count'])
            logger.info("Downloaded %s tweets in total." % total)


            next_token = json_response['meta'].get('next_token', "")
            if next_token == "":
                print("No next page! Exit.")
                return

            query_params.update({"next_token": next_token})            
#             time.sleep(1)
        
        except Exception as e:
            logger.error(e, exc_info=True)
            
            print(e)
            
            now = time.perf_counter()
            
            time_window = 15 * 60 # seconds
            
            if 'Too Many Requests' in json_response.text:
                elapsed_time = int(now - start_timer)
                need_to_wait_time = time_window - elapsed_time
                print(f'Too Many Requests, waiting for {need_to_wait_time} seconds.')
                time.sleep(need_to_wait_time)
                
            continue


execute_download(saved_path=saved_path)
merge_df = merge_results(saved_path)
merge_df

Saved 100 tweets in: downloaded_tweets_test\1465458465405648903_1465470516379021313_100_data.csv
Downloaded 100 tweets in total.
Saved 97 tweets in: downloaded_tweets_test\1465443799593734146_1465458273432182785_97_data.csv
Downloaded 197 tweets in total.
Saved 99 tweets in: downloaded_tweets_test\1465429190065463312_1465443617347039232_99_data.csv
Downloaded 296 tweets in total.
Saved 99 tweets in: downloaded_tweets_test\1465419376308359170_1465428976810213382_99_data.csv
Downloaded 395 tweets in total.
Saved 40 tweets in: downloaded_tweets_test\1465410276778553347_1465419287460519941_40_data.csv
Downloaded 435 tweets in total.
Start to merge 5 filles.


No next page! Exit.


 40%|█████████████████████████████████▌                                                  | 2/5 [00:00<00:00, 19.04it/s]

downloaded_tweets_test\1465410276778553347_1465419287460519941_40_data.csv
downloaded_tweets_test\1465419376308359170_1465428976810213382_99_data.csv
downloaded_tweets_test\1465429190065463312_1465443617347039232_99_data.csv
downloaded_tweets_test\1465443799593734146_1465458273432182785_97_data.csv


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 17.67it/s]


downloaded_tweets_test\1465458465405648903_1465470516379021313_100_data.csv

Generating final CSV file, including 5 small CSV files.

Please wait...
row[places_table_geo]: nan


TypeError: object of type 'float' has no len()

In [48]:
# execute_download(saved_path=saved_path)
merge_df = merge_results(saved_path)
merge_df

Start to merge 5 filles.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 27.02it/s]


downloaded_tweets_test\1465410276778553347_1465419287460519941_40_data.csv
downloaded_tweets_test\1465419376308359170_1465428976810213382_99_data.csv
downloaded_tweets_test\1465429190065463312_1465443617347039232_99_data.csv
downloaded_tweets_test\1465443799593734146_1465458273432182785_97_data.csv
downloaded_tweets_test\1465458465405648903_1465470516379021313_100_data.csv

Generating final CSV file, including 5 small CSV files.

Please wait...



Saved merged tweets in downloaded_tweets_test\merged.csv.gz .


Unnamed: 0,index,text,id,entities,source,reply_settings,possibly_sensitive,author_id,created_at,conversation_id,...,geo,places_table_name,places_table_country_code,places_table_geo,places_table_country,places_table_place_type,places_table_id,places_table_full_name,lon,lat
0,0,Telehealth takes off but security concerns per...,1465419287460519941,"{'hashtags': [{'start': 51, 'end': 59, 'tag': ...",TwinyBots,everyone,False,1439265257894170630,2021-11-29T20:35:58.000Z,1465419287460519941,...,,,,,,,,,,
1,1,how #telepsychiatry saved this family #telemed...,1465419037652008961,"{'hashtags': [{'start': 4, 'end': 19, 'tag': '...",Twitter Web App,everyone,False,511022548,2021-11-29T20:34:59.000Z,1465419037652008961,...,,,,,,,,,,
2,2,RT @CaliforniaTRC: You won't want to miss this...,1465418791098093575,"{'hashtags': [{'start': 126, 'end': 137, 'tag'...",Twitter Web App,everyone,False,437081670,2021-11-29T20:34:00.000Z,1465418791098093575,...,,,,,,,,,,
3,3,@audrey_earth Yep- The clinic my husband &amp;...,1465418439804305418,"{'mentions': [{'start': 0, 'end': 13, 'usernam...",Twitter for Android,everyone,False,48845861,2021-11-29T20:32:36.000Z,1464963974571646976,...,,,,,,,,,,
4,4,Gov. Noem pushes back on narrative GOP is anti...,1465418362708705283,"{'mentions': [{'start': 137, 'end': 152, 'user...",Twitter for iPhone,everyone,False,19020689,2021-11-29T20:32:18.000Z,1465418362708705283,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,95,9 out of 10 Healthcare Organizations Provide T...,1465458929383751683,"{'urls': [{'start': 121, 'end': 144, 'url': 'h...",IFTTT,everyone,False,369107103,2021-11-29T23:13:30.000Z,1465458929383751683,...,,,,,,,,,,
431,96,Gov. Noem pushes back on narrative GOP is anti...,1465458743806676995,"{'annotations': [{'start': 0, 'end': 8, 'proba...",Twitter for iPhone,everyone,False,1362186522062110720,2021-11-29T23:12:46.000Z,1465458743806676995,...,,,,,,,,,,
432,97,RT @4fitnesshealthy: Comparing Video-Based; Te...,1465458518425751557,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",Healthymes Retweeter,everyone,False,2238159482,2021-11-29T23:11:52.000Z,1465458518425751557,...,,,,,,,,,,
433,98,RT @4fitnesshealthy: Telehealth-delivered diet...,1465458467141996547,"{'annotations': [{'start': 112, 'end': 125, 'p...",Healthymes Retweeter,everyone,False,2238159482,2021-11-29T23:11:40.000Z,1465458467141996547,...,,,,,,,,,,


In [50]:
next_token = 'start'
search_url = "https://api.twitter.com/2/tweets/search/all"
headers = create_headers(bearer_token)
total = 0
query_params = {'query': query, \
                "max_results": str(max_results), \
                'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id', \
                'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld', \
                'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type', \
                "user.fields": 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',\
                "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics", \
                "poll.fields": "duration_minutes,end_datetime,id,options,voting_status", \
                "start_time": start_time, \
                "end_time": end_time, \
                # "since_id":since_id, \  # cannot used with start/end_time!
                }


In [51]:
json_response = connect_to_endpoint(search_url, headers, query_params)

In [55]:
#json_response.keys()   # ['data', 'includes', 'meta']
json_response['meta']

{'newest_id': '1465470516379021313',
 'oldest_id': '1465458465405648903',
 'result_count': 100,
 'next_token': 'b26v89c19zqg8o3fpdy7o6jo5ebu3ya8iq2kaptu7erul'}

In [57]:
json_response['includes'].keys()  # dict_keys(['users', 'tweets', 'media', 'places', 'polls'])

dict_keys(['users', 'tweets', 'media', 'places', 'polls'])

In [61]:
json_response['includes'].keys()

dict_keys(['users', 'tweets', 'media', 'places', 'polls'])

In [60]:
json_response['data'][0]

{'reply_settings': 'everyone',
 'conversation_id': '1465183027043069959',
 'entities': {'mentions': [{'start': 0,
    'end': 16,
    'username': 'jessica_leigh75',
    'id': '2862477860'},
   {'start': 17,
    'end': 29,
    'username': 'EmergencyBK',
    'id': '1153682741780598785'}]},
 'source': 'Twitter for iPad',
 'created_at': '2021-11-29T23:59:32.000Z',
 'id': '1465470516379021313',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 0,
  'like_count': 2,
  'quote_count': 0},
 'author_id': '470651164',
 'context_annotations': [{'domain': {'id': '123',
    'name': 'Ongoing News Story',
    'description': "Ongoing News Stories like 'Brexit'"},
   'entity': {'id': '1220701888179359745', 'name': 'COVID-19'}}],
 'in_reply_to_user_id': '2862477860',
 'referenced_tweets': [{'type': 'replied_to', 'id': '1465362278488834049'}],
 'possibly_sensitive': False,
 'text': '@jessica_leigh75 @EmergencyBK 🤣😂 The new COVID safe medical appointments, Telehealth etc., have been a boon, but can b