In [76]:
# ! pip install emoji --upgrade
# ! pip install twarc
# ! pip install twarc-csv

In [4]:
# Load packages

In [77]:
import os
import glob
import logging
from tqdm import tqdm
import ast
import requests
import os
import json
import pandas as pd
import time

import emoji


# Prerequisite functions

In [88]:
def set_logger(log_file_path="debug.log", level="INFO"):
# def set_logger(log_file_path="debug.log", level="DEBUG"):
    logger = logging.getLogger()
    logger.setLevel(level)
    scream_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(log_file_path)
    logger.addHandler(scream_handler)
    logger.addHandler(file_handler)
    return logger
    


try:
    # print(len(logger.handlers))
    while len(logger.handlers) > 1:
        logger.handlers.pop(0)
        # print(len(logger.handlers))
except:
    pass

logger = set_logger()

def get_api_token(token_path):    
    try:
        with open(token_path, "r") as f:
            logger.debug("token_path: %s" % token_path)
            lines = f.readlines()
            logger.debug("lines in the file: %s" % lines)

            lines = [line.split(": ")[-1][:-1] for line in lines]
        return lines

    except Exception as e:
        logger.error("Error: %s" % str(e))




#-------------------- merge results -------------------#          
def find_place_id(row):
    # print(row)
    
    cell_text = row.get("geo", "")
    # print(cell_text)
    if len(cell_text) > 1:
        place_dict = ast.literal_eval(cell_text)
    else:
        return ""
    # print(place_dict)
    if isinstance(place_dict, dict):
        place_id = place_dict.get("place_id", "")
        if len(place_id) > 1:
            return place_id

def clean_tweets(row):
    
    text = row['text'].replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').strip()
    return text

def find_poll_id(row):
    
    text = row['text'].replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').strip()
    return text

def refine_data(df):
    df['place_id'] = df.apply(find_place_id, axis=1)
    df['text'] = df.apply(clean_tweets, axis=1)
    
    return df

def find_media_row(row, df_media):
    cell_text = row["attachments"]
    if len(cell_text) > 1:
        attachments_dict = ast.literal_eval(cell_text)
    else:
        return ""
    
    if isinstance(attachments_dict, dict):
        media_keys = attachments_dict.get("media_keys", "")
        media_rows = []
        # print(df_media)
        # print(attachments_dict)
        for key in media_keys:
            key = str(key)
            if len(key) > 1:
                # print(key)
                # print(df_media['media_table_media_key'])
                row = df_media[df_media['media_table_media_key']==key].iloc[0].to_json()
                # print(df_media[df_media['media_table_media_key']==key])
                media_rows.append(row)
        # print(media_rows)
        return media_rows
    return ""

def get_lonlat(row):
    row["lon"] = ""
    row["lat"] = ""
#     print('row[places_table_geo]:', row["places_table_geo"])
    if len(row["places_table_geo"]) > 1:
        geo_dict = ast.literal_eval(row["places_table_geo"])
#         print('geo_dict:', geo_dict)
        bbox = geo_dict.get("bbox", [])
        if len(bbox) == 4:
            row["lon"] = (bbox[0] + bbox[2]) / 2
            row["lat"] = (bbox[1] + bbox[3]) / 2
    return row

def merge_results(saved_path):
    data_files = glob.glob(os.path.join(saved_path, "*_data.csv"))
    logger.info("Start to merge %d filles." % len(data_files))
    all_df = []
    for d in tqdm(data_files[:]):
        df_data = pd.read_csv(d)
        print(d)
        df_data = df_data.fillna("")
        df_data = refine_data(df_data)

        df_merged = df_data        

        # process places file
        places_csv = d.replace("data.csv", "includes_places.csv")
        if os.path.exists(places_csv):
            df_places = pd.read_csv(places_csv).fillna("")
            new_column_name = {name: "places_table_" + name for name in df_places.columns}
            df_places = df_places.rename(columns=new_column_name)        
            df_merged = pd.merge(df_merged, df_places, how='left', left_on="place_id", right_on="places_table_id")

        # process tweets file
        tweets_csv = d.replace("data.csv", "includes_tweets.csv")
        if os.path.exists(tweets_csv):
            df_tweets = pd.read_csv(tweets_csv).fillna("")
            df_tweets["text"] = df_tweets["text"].str.replace("\n", " ")
            new_column_name = {name: "tweets_table_" + name for name in df_tweets.columns}
            df_tweets = df_tweets.rename(columns=new_column_name)      
            df_merged = pd.merge(df_merged, df_tweets, how='left', left_on="id", right_on="tweets_table_id")

        # process users file
        users_csv = d.replace("data.csv", "includes_users.csv")
        if os.path.exists(tweets_csv):
            df_users = pd.read_csv(users_csv).fillna("")
            df_users["description"] = df_users["description"].str.replace("\n", " ")
            new_column_name = {name: "users_table_" + name for name in df_users.columns}
            df_users = df_users.rename(columns=new_column_name)     
            df_merged = pd.merge(df_merged, df_users, how='left', left_on="author_id", right_on="users_table_id")        

        # process media file
        media_csv = d.replace("data.csv", "includes_media.csv")
        if os.path.exists(media_csv):
            df_media = pd.read_csv(media_csv).fillna("")
            df_media['media_key'] = df_media['media_key'].astype(str)
            new_column_name = {name: "media_table_" + name for name in df_media.columns}
            df_media = df_media.rename(columns=new_column_name)  
            df_merged["media_table_rows"] = df_merged.apply(find_media_row, args=(df_media,), axis=1)
            
       # process poll file
        poll_csv = d.replace("data.csv", "includes_poll.csv")
        if os.path.exists(poll_csv):
            df_poll = pd.read_csv(poll_csv).fillna("")
            df_poll['poll_ids'] = df_poll['poll_ids'].astype(str)
            new_column_name = {name: "poll_table_" + name for name in df_poll.columns}
            df_poll = df_poll.rename(columns=new_column_name)  
            df_poll["poll_table_rows"] = df_poll.apply(find_poll_row, args=(df_poll,), axis=1)    
        

        df_merged = df_merged.fillna("")
        df_merged.replace("\n", " ")
        df_merged = df_merged.drop_duplicates(subset=['id'], keep='last')
        # print(len(df_places))
        # return df_merged
        all_df.append(df_merged)

    print("\nGenerating final CSV file, including %d small CSV files." % len(all_df))
    print("\nPlease wait...")

    final_df = pd.concat(all_df).fillna("")
    final_df = final_df.apply(get_lonlat, axis=1).reset_index()
    final_file = os.path.join(saved_path, "merged.csv")
    final_df.to_csv(final_file, index=False)
    logger.info("\nSaved merged tweets in %s ." % final_file)

    return final_df

# Set tokens

Put your Twitter API tokens in the ```tweet_api_keys.txt``` file in the same directory of this notebook in the following format:
```
Consumer API Key: XXXX
Consumer API Secret Key: XXXX
Bearer Token: XXXX
Access Token: XXXX
Access Token Secret: XXXX
```

In [83]:
token_path = r'J:\Research\tweet_download\tweet_api_keys.txt'

tokens = get_api_token(token_path)

consumer_key = tokens[0]
consumer_secret = tokens[1]
bearer_token = tokens[2]
access_token = tokens[3]
access_token_secret = tokens[4]

# Download tweets

The following cell is a exmaple query to download tweets in Australia with a keyword of "vaccine" since 2021-01-01 to 2021-06-01.

Please set ```query```, ```start_time```, ```end_time```, ```saved_path```, and ```max_results``` (10 - 500).

See these pages to building a query: 

[Building queries for Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#examples)

[Search Tweets](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all)




In [87]:
# a exmaple query to download tweets in Australia with a keyword of "vaccine" since 2020-01-01
# keyword = "vaccine"

# query = f"({keyword}) place_country:AU -is:retweet"
# query = f"({keyword}) place_country:AU"
# query = "(vaccin OR vaccination OR vaccine OR vaccinate) place_country:AU"

query = "telemedicine  OR telehealth  OR telecare"

# query = f"({keyword})"
start_time = "2021-11-29T20:00:01Z"
end_time = "2021-11-30T00:00:01Z"
max_results = 500   # max_results can be 500 if do not request the field: context_annotations

# since_id = "139819805172285849"  # cannot used with start/end_time!


# borrow from Twitter:
# https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py
search_url = "https://api.twitter.com/2/tweets/search/all"


# saved_path = os.path.join(os.getcwd(), "saved_tweets")
saved_path = r"downloaded_tweets_test"
os.makedirs(saved_path, exist_ok=True)

In [94]:
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers


def connect_to_endpoint(url, headers, params):
    response = requests.request("GET", search_url, headers=headers, params=params)
    # print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)   
    return response.json()

def save_search(json_response, saved_path):
    try:
        if not os.path.exists(saved_path):
            os.mkdir(saved_path)

        meta = json_response['meta']
        data = json_response['data']
        includes = json_response['includes']
        basename = f"{meta['oldest_id']}_{meta['newest_id']}_{meta['result_count']}"

        data_filename = os.path.join(saved_path, basename + "_data.csv")
        df = pd.DataFrame(data)
        for c in df.columns:
            df[c] = df[c].astype(str)
            df[c] = df[c].str.replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').str.strip()
        df.to_csv(data_filename, index=False)
        result_count = meta['result_count']
        result_count = str(result_count)
        logger.info("Saved %s tweets in: %s" % (result_count, data_filename))

        for key in includes.keys():
            includes_filename = os.path.join(saved_path, basename + f"_includes_{key}.csv")
            df = pd.DataFrame(includes[key])
            for c in df.columns:
                df[c] = df[c].astype(str)
                df[c] = df[c].str.replace('\n',' ').replace(",", ";").replace('\r', '').replace('\t', ' ').str.strip()
            df.to_csv(includes_filename, index=False)
    except Exception as e:
        logger.error(e, exc_info=True)

def execute_download(saved_path=os.getcwd()):
    
    start_timer = time.perf_counter()

    next_token = 'start'
    search_url = "https://api.twitter.com/2/tweets/search/all"
    headers = create_headers(bearer_token)
    total = 0
    query_params = {'query': query, \
                    "max_results": str(max_results), \
                    'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id', \
                    
                     # HAVE context_annotations, max_results can be only 100
                    'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld', \
                    
                    # NO context_annotations,  max_results can be 500
                    'tweet.fields': 'attachments,author_id,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld', \

                    'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type', \
                    "user.fields": 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',\
                    "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics", \
                    "poll.fields": "duration_minutes,end_datetime,id,options,voting_status", \
                    "start_time": start_time, \
                    "end_time": end_time, \
                    # "since_id":since_id, \  # cannot used with start/end_time!
                    }

    while next_token != "":
        try:
            
            json_response = connect_to_endpoint(search_url, headers, query_params)
#             df = pd.DataFrame(json_response['data'])
            save_search(json_response, saved_path)
            
            total += int(json_response['meta']['result_count'])
            logger.info("Downloaded %s tweets in total." % total)


            next_token = json_response['meta'].get('next_token', "")
            if next_token == "":
                print("No next page! Exit.")
                return

            query_params.update({"next_token": next_token})            
#             time.sleep(1)
        
        except Exception as e:
            logger.error(e, exc_info=True)
            
            print(e)
            
            now = time.perf_counter()
            
            time_window = 15 * 60 # seconds
            
            if 'Too Many Requests' in json_response.text:
                elapsed_time = int(now - start_timer)
                need_to_wait_time = time_window - elapsed_time
                print(f'Too Many Requests, waiting for {need_to_wait_time} seconds.')
                time.sleep(need_to_wait_time)
                
            continue


execute_download(saved_path=saved_path)
merge_df = merge_results(saved_path)
merge_df

Saved 494 tweets in: downloaded_tweets_test\1465402275401277445_1465470516379021313_494_data.csv
Downloaded 494 tweets in total.
Saved 494 tweets in: downloaded_tweets_test\1465341785635442688_1465402214323834900_494_data.csv
Downloaded 988 tweets in total.
Saved 497 tweets in: downloaded_tweets_test\1465263326967988231_1465341770817015811_497_data.csv
Downloaded 1485 tweets in total.
Saved 500 tweets in: downloaded_tweets_test\1465100693338050564_1465263217530519552_500_data.csv
Downloaded 1985 tweets in total.
Saved 491 tweets in: downloaded_tweets_test\1464940915978424329_1465100521946050562_491_data.csv
Downloaded 2476 tweets in total.
Saved 471 tweets in: downloaded_tweets_test\1464685575072632838_1464940893937487877_471_data.csv
Downloaded 2947 tweets in total.
Start to merge 6 filles.


No next page! Exit.


 17%|██████████████                                                                      | 1/6 [00:00<00:00,  9.09it/s]

downloaded_tweets_test\1464685575072632838_1464940893937487877_471_data.csv
downloaded_tweets_test\1464940915978424329_1465100521946050562_491_data.csv


 50%|██████████████████████████████████████████                                          | 3/6 [00:00<00:00,  8.19it/s]

downloaded_tweets_test\1465100693338050564_1465263217530519552_500_data.csv
downloaded_tweets_test\1465263326967988231_1465341770817015811_497_data.csv


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [00:00<00:00,  7.90it/s]

downloaded_tweets_test\1465341785635442688_1465402214323834900_494_data.csv
downloaded_tweets_test\1465402275401277445_1465470516379021313_494_data.csv


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00,  8.21it/s]



Generating final CSV file, including 6 small CSV files.

Please wait...



Saved merged tweets in downloaded_tweets_test\merged.csv .


Unnamed: 0,index,created_at,conversation_id,possibly_sensitive,text,entities,id,source,author_id,lang,...,users_table_created_at,users_table_entities,users_table_url,users_table_username,users_table_location,users_table_pinned_tweet_id,media_table_rows,tweets_table_geo,lon,lat
0,0,2021-11-28T12:55:01.000Z,1464940893937487877,False,#PremiumDomain #ForSale @dotcdomains - a Bloc...,"{'mentions': [{'start': 25, 'end': 37, 'userna...",1464940893937487877,Sendible,1393145384835354625,en,...,2021-05-14T10:05:30.000Z,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",https://t.co/dkQEhQZtZh,dotcdomains,,,"[{""media_table_media_key"":""3_14649408907582955...",,,
1,1,2021-11-28T12:54:15.000Z,1464940703176265728,False,反ワクチン活動組織と結託？ オンライン医療サービス企業がイベルメクチン、ヒドロキシクロロキン...,"{'urls': [{'start': 92, 'end': 115, 'url': 'ht...",1464940703176265728,Twitter for iPhone,793662313773400064,ja,...,2016-11-02T03:53:35.000Z,,,chat_noir33,公国,1430382986516516864.0,,,,
2,2,2021-11-28T12:53:39.000Z,1464940552302985216,False,RT @EARL_COVID19_tw: 反ワクチン活動組織と結託？オンライン医療サービス企...,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",1464940552302985216,Twitter for Android,879317430014574592,ja,...,2017-06-26T12:36:27.000Z,,,nyako283,,,,,,
3,3,2021-11-28T12:52:01.000Z,1464940139843518467,False,RT @EARL_COVID19_tw: 反ワクチン活動組織と結託？オンライン医療サービス企...,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",1464940139843518467,Twitter for iPad,1268868554142519298,ja,...,2020-06-05T11:33:58.000Z,"{'description': {'hashtags': [{'start': 33, 'e...",,hiromimatsumo15,,1459012485860585472.0,,,,
4,4,2021-11-28T12:50:40.000Z,1464939802994700288,False,RT @EARL_COVID19_tw: 反ワクチン活動組織と結託？オンライン医療サービス企...,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",1464939802994700288,Twitter for Android,868834032280256512,ja,...,2017-05-28T14:19:10.000Z,,,hobbysoccer,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2942,494,2021-11-29T19:30:03.000Z,1465402696429625344,False,Insights on @KLASresearch report by @healthcar...,"{'mentions': [{'start': 12, 'end': 25, 'userna...",1465402696429625344,Buffer,1144223939742187520,en,...,2019-06-27T12:40:20.000Z,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",https://t.co/essm0vSmu3,Medigy1,,,"[{""media_table_url"":""https:\/\/pbs.twimg.com\/...",,,
2943,495,2021-11-29T19:30:00.000Z,1465402683175706634,False,Complex Medicaid Rules Limit FQHC; Safety-Net ...,"{'urls': [{'start': 62, 'end': 85, 'url': 'htt...",1465402683175706634,Twitter Web App,1244543174955032576,en,...,2020-03-30T08:34:10.000Z,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",https://t.co/IekyfoubWG,truetelehealth,,,,,,
2944,496,2021-11-29T19:28:54.000Z,1465402406833991682,False,🎶fell asleep at 10 o clock snapped awake yeah ...,,1465402406833991682,Twitter for Android,1019025351094210560,en,...,2018-07-17T01:05:51.000Z,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",https://t.co/udOWTtztny,megagarbage,trash island,1041285486650765312.0,,,,
2945,497,2021-11-29T19:28:46.000Z,1465402373984202754,False,Whether you are the owner of a startup or the ...,"{'urls': [{'start': 257, 'end': 280, 'url': 'h...",1465402373984202754,Twitter Web App,556987567,en,...,2012-04-18T15:56:10.000Z,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",https://t.co/OHi5TS3mP3,ORBAAccounting,"Chicago, IL",,"[{""media_table_url"":""https:\/\/pbs.twimg.com\/...",,,


In [80]:
# execute_download(saved_path=saved_path)
merge_df = merge_results(saved_path)
merge_df

Start to merge 51 filles.
  4%|███▎                                                                               | 2/51 [00:00<00:03, 13.03it/s]

downloaded_tweets_test\1463960801681809419_1463989054475804676_99_data.csv
downloaded_tweets_test\1463990571022884864_1464032833106681857_98_data.csv
downloaded_tweets_test\1464034059231891459_1464077005150990337_98_data.csv


  8%|██████▌                                                                            | 4/51 [00:00<00:03, 12.71it/s]

downloaded_tweets_test\1464078459723218944_1464136725303803906_100_data.csv
downloaded_tweets_test\1464139903135068161_1464178450256871428_100_data.csv
downloaded_tweets_test\1464179091356016642_1464217779255582748_99_data.csv


 16%|█████████████                                                                      | 8/51 [00:00<00:03, 13.78it/s]

downloaded_tweets_test\1464218008998584347_1464246818955567114_100_data.csv
downloaded_tweets_test\1464247482859229184_1464266999954083849_99_data.csv
downloaded_tweets_test\1464267311762837512_1464283223395946498_98_data.csv


 24%|███████████████████▎                                                              | 12/51 [00:00<00:02, 15.25it/s]

downloaded_tweets_test\1464284281207857157_1464299902452867081_98_data.csv
downloaded_tweets_test\1464300420214501382_1464320944789331968_99_data.csv
downloaded_tweets_test\1464321080718462981_1464342756625690626_100_data.csv
downloaded_tweets_test\1464343988232544257_1464373947533828104_98_data.csv


 31%|█████████████████████████▋                                                        | 16/51 [00:01<00:02, 15.67it/s]

downloaded_tweets_test\1464374200207122449_1464428174473900038_98_data.csv
downloaded_tweets_test\1464428878030536710_1464510529070931970_100_data.csv
downloaded_tweets_test\1464510538478542848_1464571555237797895_99_data.csv
downloaded_tweets_test\1464572213009342472_1464602365416386561_94_data.csv


 39%|████████████████████████████████▏                                                 | 20/51 [00:01<00:01, 15.84it/s]

downloaded_tweets_test\1464602413361348611_1464636392693248007_97_data.csv
downloaded_tweets_test\1464636660352798725_1464673469782704135_99_data.csv
downloaded_tweets_test\1464674350947352579_1464718216983314435_99_data.csv


 43%|███████████████████████████████████▎                                              | 22/51 [00:01<00:01, 15.93it/s]

downloaded_tweets_test\1464718525600260103_1464777194215915520_99_data.csv
downloaded_tweets_test\1464777200536723465_1464874873474490370_99_data.csv
downloaded_tweets_test\1464874892042584066_1464930232146616320_100_data.csv


 51%|█████████████████████████████████████████▊                                        | 26/51 [00:01<00:01, 15.99it/s]

downloaded_tweets_test\1464930237163220995_1464940915978424329_99_data.csv
downloaded_tweets_test\1464940917316460546_1464967240957128712_99_data.csv
downloaded_tweets_test\1464967532612366340_1464996652964536328_98_data.csv
downloaded_tweets_test\1464997379380322312_1465032876072833028_96_data.csv


 59%|████████████████████████████████████████████████▏                                 | 30/51 [00:01<00:01, 16.64it/s]

downloaded_tweets_test\1465033387488469002_1465067440354873344_99_data.csv
downloaded_tweets_test\1465067624971259915_1465100521946050562_99_data.csv
downloaded_tweets_test\1465100693338050564_1465148409841606660_100_data.csv
downloaded_tweets_test\1465148467760623620_1465183771565514759_100_data.csv


 63%|███████████████████████████████████████████████████▍                              | 32/51 [00:02<00:01, 16.48it/s]

downloaded_tweets_test\1465183813651218440_1465223297218588673_100_data.csv
downloaded_tweets_test\1465223336967946240_1465241950135619592_100_data.csv
downloaded_tweets_test\1465242113210085376_1465263217530519552_100_data.csv


 71%|█████████████████████████████████████████████████████████▉                        | 36/51 [00:02<00:00, 15.43it/s]

downloaded_tweets_test\1465263326967988231_1465289475018395652_100_data.csv
downloaded_tweets_test\1465289505666121733_1465305110242906119_100_data.csv
downloaded_tweets_test\1465305254841532422_1465319786683019266_98_data.csv
downloaded_tweets_test\1465319850440540167_1465330963261673475_99_data.csv


 78%|████████████████████████████████████████████████████████████████▎                 | 40/51 [00:02<00:00, 15.50it/s]

downloaded_tweets_test\1465330963920084996_1465341770817015811_100_data.csv
downloaded_tweets_test\1465341785635442688_1465353047287250947_100_data.csv
downloaded_tweets_test\1465353604445917192_1465365249029287943_98_data.csv


 86%|██████████████████████████████████████████████████████████████████████▋           | 44/51 [00:02<00:00, 14.92it/s]

downloaded_tweets_test\1465365251805925376_1465377454395211779_98_data.csv
downloaded_tweets_test\1465377528978325505_1465390185567428612_99_data.csv
downloaded_tweets_test\1465390219314839561_1465402214323834900_99_data.csv


 90%|█████████████████████████████████████████████████████████████████████████▉        | 46/51 [00:02<00:00, 15.88it/s]

downloaded_tweets_test\1465402275401277445_1465419376308359170_99_data.csv
downloaded_tweets_test\1465410276778553347_1465419287460519941_40_data.csv
downloaded_tweets_test\1465419376308359170_1465428976810213382_99_data.csv
downloaded_tweets_test\1465419387230433289_1465428976810213382_99_data.csv


100%|██████████████████████████████████████████████████████████████████████████████████| 51/51 [00:03<00:00, 15.66it/s]


downloaded_tweets_test\1465429190065463312_1465443617347039232_99_data.csv
downloaded_tweets_test\1465443799593734146_1465458273432182785_97_data.csv
downloaded_tweets_test\1465458465405648903_1465470516379021313_100_data.csv

Generating final CSV file, including 51 small CSV files.

Please wait...



Saved merged tweets in downloaded_tweets_test\merged.csv.gz .


Unnamed: 0,index,source,entities,text,possibly_sensitive,conversation_id,lang,id,reply_settings,created_at,...,users_table_verified,users_table_username,users_table_description,users_table_entities,users_table_location,users_table_pinned_tweet_id,media_table_rows,tweets_table_geo,lon,lat
0,0,Twitter for iPhone,"{'mentions': [{'start': 3, 'end': 15, 'usernam...",RT @ReliqHealth: InvestmentPitch Media Video D...,False,1463989054475804676,en,1463989054475804676,everyone,2021-11-25T21:52:44.000Z,...,False,AronO03357219,,,,,,,,
1,1,Twitter for iPhone,"{'mentions': [{'start': 3, 'end': 14, 'usernam...",RT @MicahsTeam: An Telehealth page; please sha...,False,1463988796874235905,en,1463988796874235905,everyone,2021-11-25T21:51:43.000Z,...,False,Legendary_wale,"Design & Hip-Hop, Customer Service Expert, UI/...","{'description': {'mentions': [{'start': 67, 'e...",,1418999919340367872.0,"[{""media_table_url"":""https:\/\/pbs.twimg.com\/...",,,
2,2,Twitter for Android,"{'mentions': [{'start': 0, 'end': 7, 'username...",@chigrl @DeItaone Theyre gonna link them to CB...,False,1463977803729231875,en,1463988566321737736,everyone,2021-11-25T21:50:48.000Z,...,False,DaveRubin15,Optimistic realist | futurist | lawnchair geo ...,,,1443054414030270464.0,,,,
3,3,AdvanceML,"{'mentions': [{'start': 3, 'end': 14, 'usernam...",RT @ai_jobsNET: HIRING: Data Engineer / Remote...,False,1463988316731281409,tl,1463988316731281409,everyone,2021-11-25T21:49:48.000Z,...,False,AdvanceML,made by an enthusiast @Enthusiastic97 For spre...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",Ghaziabad,,,,,
4,4,Twitter for Android,"{'mentions': [{'start': 3, 'end': 14, 'usernam...",RT @MicahsTeam: An Telehealth page; please sha...,False,1463987555074916360,en,1463987555074916360,everyone,2021-11-25T21:46:47.000Z,...,False,mrpeller_,Product Designer with formal Degree in Economi...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",,1455829579294134272.0,"[{""media_table_url"":""https:\/\/pbs.twimg.com\/...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4979,95,IFTTT,"{'urls': [{'start': 121, 'end': 144, 'url': 'h...",9 out of 10 Healthcare Organizations Provide T...,False,1465458929383751683,en,1465458929383751683,everyone,2021-11-29T23:13:30.000Z,...,False,MrsYisWhy,Security Bene Gesserit and professional nerd s...,"{'url': {'urls': [{'start': 0, 'end': 22, 'url...",In witness protection,,,,,
4980,96,Twitter for iPhone,"{'annotations': [{'start': 0, 'end': 8, 'proba...",Gov. Noem pushes back on narrative GOP is anti...,False,1465458743806676995,en,1465458743806676995,everyone,2021-11-29T23:12:46.000Z,...,False,MelanieAlex62,"God, family, country, capitalism",,"Burleson, TX",1429779055184916480.0,,,,
4981,97,Healthymes Retweeter,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",RT @4fitnesshealthy: Comparing Video-Based; Te...,False,1465458518425751557,en,1465458518425751557,everyone,2021-11-29T23:11:52.000Z,...,False,weightlossasset,Refocus on weight loss and fitness. https://t....,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",New York,,,,,
4982,98,Healthymes Retweeter,"{'mentions': [{'start': 3, 'end': 19, 'usernam...",RT @4fitnesshealthy: Telehealth-delivered diet...,False,1465458467141996547,en,1465458467141996547,everyone,2021-11-29T23:11:40.000Z,...,False,weightlossasset,Refocus on weight loss and fitness. https://t....,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",New York,,,,,


In [74]:
next_token = 'start'
search_url = "https://api.twitter.com/2/tweets/search/all"
headers = create_headers(bearer_token)
total = 0
query_params = {'query': query, \
                "max_results": str(max_results), \
                    'expansions': 'attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id', \
                    'tweet.fields': 'attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,promoted_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld', \
                    'place.fields': 'contained_within,country,country_code,full_name,geo,id,name,place_type', \
                    "user.fields": 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',\
                    "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,organic_metrics,promoted_metrics,alt_text", \
                    "poll.fields": "duration_minutes,end_datetime,id,options,voting_status", \
                "start_time": start_time, \
                "end_time": end_time, \
                # "since_id":since_id, \  # cannot used with start/end_time!
                }


In [70]:
json_response = connect_to_endpoint(search_url, headers, query_params)

In [71]:
#json_response.keys()   # ['data', 'includes', 'meta']
json_response['meta']

{'result_count': 0,
 'next_token': 'b26v89c19zqg8o3fpdy7o6jo5ebu3ya8iq2kaptu7erul'}

In [72]:
json_response['includes'].keys()  # dict_keys(['users', 'tweets', 'media', 'places', 'polls'])

KeyError: 'includes'

In [61]:
json_response['includes'].keys()

dict_keys(['users', 'tweets', 'media', 'places', 'polls'])

In [60]:
json_response['data'][0]

{'reply_settings': 'everyone',
 'conversation_id': '1465183027043069959',
 'entities': {'mentions': [{'start': 0,
    'end': 16,
    'username': 'jessica_leigh75',
    'id': '2862477860'},
   {'start': 17,
    'end': 29,
    'username': 'EmergencyBK',
    'id': '1153682741780598785'}]},
 'source': 'Twitter for iPad',
 'created_at': '2021-11-29T23:59:32.000Z',
 'id': '1465470516379021313',
 'public_metrics': {'retweet_count': 0,
  'reply_count': 0,
  'like_count': 2,
  'quote_count': 0},
 'author_id': '470651164',
 'context_annotations': [{'domain': {'id': '123',
    'name': 'Ongoing News Story',
    'description': "Ongoing News Stories like 'Brexit'"},
   'entity': {'id': '1220701888179359745', 'name': 'COVID-19'}}],
 'in_reply_to_user_id': '2862477860',
 'referenced_tweets': [{'type': 'replied_to', 'id': '1465362278488834049'}],
 'possibly_sensitive': False,
 'text': '@jessica_leigh75 @EmergencyBK 🤣😂 The new COVID safe medical appointments, Telehealth etc., have been a boon, but can b