# 05_tweepy_search_strategy

Assuming we might be in a situation where we don't want to stream tweets, but instead want to search (within a timeframe of the past 7 days), we have to follow a different pathway in the tweepy module -- the `Client` rather than `StreamingClient`. Let us see here if this means we can achieve similar results as through streaming, but after the fact.

NL, 05/12/22

## IMPORTS

In [49]:
import os
from dotenv import load_dotenv
import json
import re
import datetime
from dateutil import parser as date_parser
import tweepy

## FUNCTIONS

(just recycling existing functions from `01_`)

In [2]:
def check_path_exists(filepath:str):
    '''
    validates that `filepath` can be used as a path to a file.

    the path is accepted if it already is a file, or if it does
    not exist yet but its parent directory does. nothing is
    created here. a path without a directory part is resolved
    against the current working directory.

    args:
        - filepath: str, full file path (dir + filename)

    returns:
        - filepath, unchanged

    raises:
        - TypeError: filepath is None or not a str
        - ValueError: filepath points to a directory
        - NotADirectoryError: the parent directory does not exist
    '''
    if filepath is None:
        raise TypeError('specified path is None.')

    if not isinstance(filepath, str):
        raise TypeError('filepath object must be str. Please re-specify.')

    if os.path.isdir(filepath):
        raise ValueError('Need to provide a full path to a file, not a dir.')

    if os.path.isfile(filepath):
        return filepath

    # os.path.split honours the platform's separators; an empty parent
    # means "relative to the current directory", not the filesystem root
    parent_dir, filename = os.path.split(filepath)
    parent_dir = parent_dir or os.curdir

    if not os.path.isdir(parent_dir):
        raise NotADirectoryError(
            f'Directory path {parent_dir} does not exist. '
            'Please re-specify `out_path`.')

    print(f'{parent_dir}, the dir for the specified filepath, is a '
          f'directory, but file {filename} does not exist. '
          'Thats fine for us.')
    return filepath

In [3]:
def extract_count_domains_entities(context_field:list):
    '''
    tallies how often each domain name and each entity name
    occurs in the context annotations of one tweet.

    args:
        - context_field: list of annotation dicts; each one is
          read via ['domain']['name'] and ['entity']['name']

    returns:
        - domains, dict (domain name -> count)
        - entities, dict (entity name -> count)
    '''
    domains, entities = {}, {}

    for annotation in context_field:
        domain_name = annotation['domain']['name']
        entity_name = annotation['entity']['name']

        # .get(..., 0) covers the first sighting of a name
        domains[domain_name] = domains.get(domain_name, 0) + 1
        entities[entity_name] = entities.get(entity_name, 0) + 1

    return domains, entities

In [4]:
def total_domain_entity_counts(domains_tweet:dict,
                               domains_session:dict,
                               entities_tweet:dict,
                               entities_session:dict):
    '''
    folds one tweet's domains and entities into the running
    session tallies.

    every name that is a key of the per-tweet dict raises the
    matching session counter by exactly one; the per-tweet
    count values themselves are not read. both session dicts
    are updated in place and also returned.

    returns:
        - domains_session, dict
        - entities_session, dict
    '''
    pairs = (
        (domains_tweet, domains_session),
        (entities_tweet, entities_session),
    )

    for per_tweet, running in pairs:
        for name in per_tweet:
            running[name] = running.get(name, 0) + 1

    return domains_session, entities_session

In [5]:
def extract_urls(tweet_text:str) -> list:
    '''
    extracts urls from tweet text

    args:
        - tweet_text: str, the text of a tweet

    returns:
        - list of all http/https urls found in the text,
          in order of appearance (empty if there are none)
    '''
    # raw string: `\(` and `\)` are regex escapes, not string escapes
    url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

    return re.findall(url_pattern, tweet_text)

## INIT

In [6]:
# load variables from a .env file (python-dotenv) so that os.getenv can see them
load_dotenv()

True

## PATHS & CONSTANTS

In [37]:
# text file the search query is built from; it is read line by line.
# NOTE(review): absolute path into one user's home directory -- needs adjusting on other machines.
SEARCH_TERMS = '/home/nikloynes/projects/world_cup_misinfo_tracking/data_collection/twitter_search_terms.txt'

In [38]:
# build the query string: take the first five lines of the search-terms
# file, wrap each in parentheses and follow each with ' OR '.
# the resulting string therefore ends with a dangling ' OR '.
with open(SEARCH_TERMS, 'r') as infile:
    all_terms = [line.rstrip() for line in infile]

search_terms = "".join(f'({term}) OR ' for term in all_terms[:5])

In [45]:
# drop the final ' OR ' left over from building the query.
# guarded, so re-running this cell cannot eat into the last search term.
if search_terms.endswith(' OR '):
    search_terms = search_terms[:-4]

In [47]:
# bearer token for the Twitter API, taken from the environment.
# os.getenv returns None when the variable is not set.
bearer_token = os.getenv('TWITTER_BEARER_TOKEN')

In [225]:
# parameters passed to the search calls to control which objects and fields come back
expansions = ['author_id', 'referenced_tweets.id']
tweet_fields = ['created_at', 'public_metrics', 'source', 'context_annotations']
# NOTE(review): presumably only takes effect together with a media expansion
# (e.g. attachments.media_keys), which `expansions` does not include -- confirm
media_fields = ['media_key', 'type', 'url', 'duration_ms']
user_fields = ['id', 'name', 'username', 'created_at', 'description', 'location', 'public_metrics', 'protected']

In [56]:
# time stamps
# build a list of 8 dicts, one per hourly chunk. each dict has the
# keys start_time, end_time and filename (named after the start time).
# NOTE(review): chunks start one hour apart but span only 59 minutes, so
# the last minute before the next start is not covered -- confirm intended.
earliest = date_parser.parse('2022-12-04 16:00:01')
delta = datetime.timedelta(minutes=59)
time_chunks = []

for i in range(8):
    chunk_filename = f"tweets_{earliest:%Y_%m_%d-%H_%M_%S}.json"
    tmp = dict(start_time=earliest,
               end_time=earliest+delta,
               filename=chunk_filename)
    time_chunks.append(tmp)

    # move on to the start of the next hourly chunk
    earliest += datetime.timedelta(hours=1)

## THE THING!

we will now test this and see what the object it returns looks like. we want to create an outfile that is exactly the same as with the streaming approach

In [160]:
# API client authenticated with the bearer token. with wait_on_rate_limit=True
# the client sleeps and carries on when the rate limit is hit (see the
# "Rate limit exceeded. Sleeping for ..." lines in the recorded output).
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

In [157]:
# single (un-paginated) recent-search request for the first time chunk,
# asking for at most 100 results
tweets = client.search_recent_tweets(
    query=search_terms,
    start_time=time_chunks[0]['start_time'],
    end_time=time_chunks[0]['end_time'],
    expansions=expansions, 
    tweet_fields=tweet_fields,
    media_fields=media_fields,
    user_fields=user_fields,
    max_results=100
)

In [158]:
# keys copied from each tweet object into the output record
core_fields = ['author_id', 'created_at', 'id', 'public_metrics', 'referenced_tweets', 'source', 'text'] 
# keys copied from the author's user object into the output record.
# NOTE: this rebinds `user_fields`, the same name that is passed as
# `user_fields=` to the search calls.
user_fields = ['created_at', 'description', 'id', 'location', 'name', 'public_metrics', 'username']
# directory the per-chunk outfiles are written to (relative path)
out_dir = '../data/test/'

In [159]:
# write the tweets of the single search response to the first chunk's
# outfile, one JSON object per line.
#
# the users in the response's includes are not guaranteed to line up
# index-by-index with the tweets, so each author is looked up by id
# instead of by position.
page_tweets = tweets.data or []  # data is None when nothing matched
users_by_id = {u.data['id']: u.data
               for u in (tweets.includes or {}).get('users', [])}

if page_tweets:
    # open the outfile once for the whole page rather than once per tweet
    with open(out_dir+time_chunks[0]['filename'], 'a') as o:
        for tweet in page_tweets:
            tmp_dict = tweet.data
            # empty dict if the author is missing from the includes
            tmp_user = users_by_id.get(tmp_dict.get('author_id'), {})

            out = {field: tmp_dict[field]
                   for field in core_fields if field in tmp_dict}

            urls = extract_urls(tmp_dict['text'])
            if urls:
                out['urls'] = urls

            # extract domain and entity counts
            if 'context_annotations' in tmp_dict:
                domains, entities = extract_count_domains_entities(tmp_dict['context_annotations'])
                out['domains'] = domains
                out['entities'] = entities

            # add user stuff
            out['user'] = {field: tmp_user[field]
                           for field in user_fields if field in tmp_user}

            o.write(json.dumps(out)+'\n')

OK - that works. we were able to retrieve 10 (100) tweets using this approach, and then write them out to json in the standard format.  
Now, let's expand this to include pagination, and run it.

In [287]:
it_counter = 0
n_tweets = 0

# page through the recent-search results for the first time chunk and
# append every tweet to that chunk's outfile, one JSON object per line.
#
# `end_time` is `tmp_start_time` (assigned in a separate cell, In [226])
# rather than the chunk's own end_time, and `limit` is 970 pages.
# NOTE(review): the progress message still says "out of 1000" -- confirm
# which page limit is meant.
for tweets in tweepy.Paginator(client.search_recent_tweets,
                               query=search_terms,
                               start_time=time_chunks[0]['start_time'],
                               end_time=tmp_start_time,
                               expansions=expansions,
                               tweet_fields=tweet_fields,
                               media_fields=media_fields,
                               user_fields=user_fields,
                               max_results=100,
                               limit=970):
    it_counter += 1
    print(f'on page {it_counter} out of 1000. n tweets collected so far: {n_tweets}')

    # each page is a response object like the one from the single
    # request: tweets in `data`, expanded objects in `includes`
    page_tweets = tweets.data or []  # data is None for an empty page
    if not page_tweets:
        continue

    # the users in the includes are not index-aligned with the tweets,
    # so index them by id once per page and look every author up directly
    users_by_id = {u.data['id']: u.data
                   for u in (tweets.includes or {}).get('users', [])}

    # one open per page: cheaper than one per tweet, and the page is
    # flushed to disk before the next request is made
    with open(out_dir+time_chunks[0]['filename'], 'a') as o:
        for tweet in page_tweets:
            tmp_dict = tweet.data
            # an author missing from the includes yields an empty dict;
            # a set marker such as {'not available'} would make
            # json.dumps raise TypeError
            tmp_user = users_by_id.get(tmp_dict['author_id'], {})

            out = {field: tmp_dict[field]
                   for field in core_fields if field in tmp_dict}

            urls = extract_urls(tmp_dict['text'])
            if urls:
                out['urls'] = urls

            # extract domain and entity counts
            if 'context_annotations' in tmp_dict:
                domains, entities = extract_count_domains_entities(tmp_dict['context_annotations'])
                out['domains'] = domains
                out['entities'] = entities

            # add user stuff
            out['user'] = {field: tmp_user[field]
                           for field in user_fields if field in tmp_user}

            o.write(json.dumps(out)+'\n')
            n_tweets += 1


on page 1 out of 1000. n tweets collected so far: 0
on page 2 out of 1000. n tweets collected so far: 98
on page 3 out of 1000. n tweets collected so far: 197
on page 4 out of 1000. n tweets collected so far: 297
on page 5 out of 1000. n tweets collected so far: 397
on page 6 out of 1000. n tweets collected so far: 497
on page 7 out of 1000. n tweets collected so far: 597
on page 8 out of 1000. n tweets collected so far: 697
on page 9 out of 1000. n tweets collected so far: 797
on page 10 out of 1000. n tweets collected so far: 896
on page 11 out of 1000. n tweets collected so far: 995
on page 12 out of 1000. n tweets collected so far: 1094
on page 13 out of 1000. n tweets collected so far: 1194
on page 14 out of 1000. n tweets collected so far: 1293
on page 15 out of 1000. n tweets collected so far: 1393
on page 16 out of 1000. n tweets collected so far: 1493
on page 17 out of 1000. n tweets collected so far: 1593
on page 18 out of 1000. n tweets collected so far: 1693
on page 19 out 

Rate limit exceeded. Sleeping for 251 seconds.


on page 450 out of 1000. n tweets collected so far: 44778
on page 451 out of 1000. n tweets collected so far: 44878
on page 452 out of 1000. n tweets collected so far: 44977
on page 453 out of 1000. n tweets collected so far: 45077
on page 454 out of 1000. n tweets collected so far: 45177
on page 455 out of 1000. n tweets collected so far: 45276
on page 456 out of 1000. n tweets collected so far: 45376
on page 457 out of 1000. n tweets collected so far: 45475
on page 458 out of 1000. n tweets collected so far: 45574
on page 459 out of 1000. n tweets collected so far: 45674
on page 460 out of 1000. n tweets collected so far: 45774
on page 461 out of 1000. n tweets collected so far: 45874
on page 462 out of 1000. n tweets collected so far: 45974
on page 463 out of 1000. n tweets collected so far: 46074
on page 464 out of 1000. n tweets collected so far: 46174
on page 465 out of 1000. n tweets collected so far: 46273
on page 466 out of 1000. n tweets collected so far: 46373
on page 467 ou

Rate limit exceeded. Sleeping for 285 seconds.


on page 900 out of 1000. n tweets collected so far: 89592
on page 901 out of 1000. n tweets collected so far: 89691
on page 902 out of 1000. n tweets collected so far: 89790
on page 903 out of 1000. n tweets collected so far: 89889
on page 904 out of 1000. n tweets collected so far: 89989
on page 905 out of 1000. n tweets collected so far: 90089
on page 906 out of 1000. n tweets collected so far: 90189
on page 907 out of 1000. n tweets collected so far: 90289
on page 908 out of 1000. n tweets collected so far: 90389
on page 909 out of 1000. n tweets collected so far: 90489
on page 910 out of 1000. n tweets collected so far: 90589
on page 911 out of 1000. n tweets collected so far: 90689
on page 912 out of 1000. n tweets collected so far: 90789
on page 913 out of 1000. n tweets collected so far: 90888
on page 914 out of 1000. n tweets collected so far: 90988
on page 915 out of 1000. n tweets collected so far: 91088
on page 916 out of 1000. n tweets collected so far: 91188
on page 917 ou

In [226]:
# cut-off timestamp; it is passed as `end_time` to the paginated search
# and to the `tweets2` request.
# NOTE(review): despite the name, it is used as an end bound, not a start.
tmp_start_time = date_parser.parse("2022-12-04T16:58:16.000Z")

In [227]:
tmp_start_time

datetime.datetime(2022, 12, 4, 16, 58, 16, tzinfo=tzlocal())

In [283]:
# scratch cell: `bob` is not assigned anywhere in view, so this raises
# NameError (as the recorded output shows)
if bob:
    print('yes')

NameError: name 'bob' is not defined

In [228]:
# single request over the same start as the first chunk, but ending at
# `tmp_start_time` instead of the chunk's own end_time
tweets2 = client.search_recent_tweets(
    query=search_terms,
    start_time=time_chunks[0]['start_time'],
    end_time=tmp_start_time,
    expansions=expansions, 
    tweet_fields=tweet_fields,
    media_fields=media_fields,
    user_fields=user_fields,
    max_results=100
)

In [288]:
# read the first chunk's outfile back in: one JSON object per line,
# parsed into a list of dicts
path = '../data/test/tweets_2022_12_04-16_00_01.json'
tmp0 = []

with open(path, 'r') as infile:
    tmp0.extend(map(json.loads, infile))

In [289]:
# collect rows whose embedded user object does not belong to the
# tweet's author. rows written without user details carry an empty
# `user` dict with no 'id' key, so .get is used: such rows count as
# mismatched instead of raising KeyError.
mismatched = [
    row for row in tmp0
    if row['author_id'] != (row.get('user') or {}).get('id')
]

In [291]:
# report tweet ids that occur more than once in the outfile.
# membership is tested against a set (constant time per row) instead
# of scanning the list; the list still keeps ids in first-seen order.
unique_ids = []
seen_ids = set()

for row in tmp0:
    tweet_id = row['id']
    if tweet_id in seen_ids:
        print(f'{tweet_id} is duplicated')
    else:
        seen_ids.add(tweet_id)
        unique_ids.append(tweet_id)

KeyboardInterrupt: 

In [294]:
# rebuild the chunk list, this time with 7 hourly chunks starting at
# 17:00:01. each dict has the keys start_time, end_time and filename.
# NOTE(review): chunks start one hour apart but span only 59 minutes, so
# the last minute before the next start is not covered -- confirm intended.
earliest = date_parser.parse('2022-12-04 17:00:01')
delta = datetime.timedelta(minutes=59)
time_chunks = []

for i in range(7):
    chunk_filename = f"tweets_{earliest:%Y_%m_%d-%H_%M_%S}.json"
    tmp = dict(start_time=earliest,
               end_time=earliest+delta,
               filename=chunk_filename)
    time_chunks.append(tmp)

    # move on to the start of the next hourly chunk
    earliest += datetime.timedelta(hours=1)

In [295]:
# eyeball the generated chunks: one dict per line
for chunk in time_chunks:
    print(chunk)

{'start_time': datetime.datetime(2022, 12, 4, 17, 0, 1), 'end_time': datetime.datetime(2022, 12, 4, 17, 59, 1), 'filename': 'tweets_2022_12_04-17_00_01.json'}
{'start_time': datetime.datetime(2022, 12, 4, 18, 0, 1), 'end_time': datetime.datetime(2022, 12, 4, 18, 59, 1), 'filename': 'tweets_2022_12_04-18_00_01.json'}
{'start_time': datetime.datetime(2022, 12, 4, 19, 0, 1), 'end_time': datetime.datetime(2022, 12, 4, 19, 59, 1), 'filename': 'tweets_2022_12_04-19_00_01.json'}
{'start_time': datetime.datetime(2022, 12, 4, 20, 0, 1), 'end_time': datetime.datetime(2022, 12, 4, 20, 59, 1), 'filename': 'tweets_2022_12_04-20_00_01.json'}
{'start_time': datetime.datetime(2022, 12, 4, 21, 0, 1), 'end_time': datetime.datetime(2022, 12, 4, 21, 59, 1), 'filename': 'tweets_2022_12_04-21_00_01.json'}
{'start_time': datetime.datetime(2022, 12, 4, 22, 0, 1), 'end_time': datetime.datetime(2022, 12, 4, 22, 59, 1), 'filename': 'tweets_2022_12_04-22_00_01.json'}
{'start_time': datetime.datetime(2022, 12, 4, 