In [1]:
import logging

# Libraries

In [45]:
import pandas as pd
from collections import defaultdict
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time

# Load Credentials

In [3]:
from dotenv import dotenv_values

# Parse the local .env file into a mapping of setting name -> value.
config = dotenv_values(".env")
try:
    BEARER_TOKEN = config['BEARER_TOKEN']
except KeyError:
    # Same exception type as before, but with a message that says what to fix.
    raise KeyError("BEARER_TOKEN is not defined in .env; add it before running this notebook") from None

In [38]:
# Log at INFO so that the logging.info()/logging.warning() calls made elsewhere
# in this notebook are actually written to the file; with level=logging.ERROR
# they were silently discarded. force=True is kept so that re-running this cell
# reconfigures the root logger.
logging.basicConfig(filename='App_LikedTweets_ServerStatus.log', encoding='utf-8', level=logging.INFO, force=True)

# Liked Tweets

In [13]:
import requests
import os
import json

In [14]:
def bearer_oauth(r):
    """
    Attach bearer-token credentials to an outgoing request.

    Sets the ``Authorization`` and ``User-Agent`` headers on ``r`` and returns
    the same object, so it can be used as the ``auth`` callable of
    ``requests.get``.
    """
    r.headers.update({
        "Authorization": f"Bearer {BEARER_TOKEN}",
        "User-Agent": "v2RecentSearchPython",
    })
    return r

def connect_to_endpoint(url, params, timeout=30):
    """Send an authenticated GET request and return the decoded JSON body.

    Args:
        url: Endpoint URL to query.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang the
            notebook on a stalled connection.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        Exception: With ``args == (status_code, response_text)`` when the
            response status is not 200.
        requests.exceptions.Timeout: When the server does not answer in time.
    """
    response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
    if response.status_code != 200:
        # Keep the (status_code, text) layout: the retry logic reads args[0].
        raise Exception(response.status_code, response.text)
    return response.json()

def main(url, params):
    """Query ``url`` with ``params`` through ``connect_to_endpoint`` and return the JSON response."""
    # Debug aid: print(json.dumps(<response>, indent=4, sort_keys=True))
    return connect_to_endpoint(url, params)

In [15]:
def expand_dict_object(object_dict: dict, column_prefix: str, key_names_list: list = None) -> dict:
    """Flatten a dict into ``{column_prefix}_{key}`` -> value pairs.

    Args:
        object_dict (dict): Source mapping. ``None`` is treated as empty.
        column_prefix (str): Prefix joined to every key with an underscore.
        key_names_list (list): Keys to keep. When ``None`` or empty, every key
            of ``object_dict`` is kept.
    Returns:
        dict: A ``defaultdict(list)`` holding the prefixed keys and their
        original values.
    """
    # A None default replaces the shared mutable [] default of the old signature;
    # both mean "no filter".
    result_dict = defaultdict(list)
    for key_name, value_name in (object_dict or {}).items():
        if not key_names_list or key_name in key_names_list:
            result_dict[f"{column_prefix}_{key_name}"] = value_name
    return result_dict

In [16]:
def get_list_of_items(list_of_dict: list, key_name: str) -> list:
    """Collect the value stored under ``key_name`` from every dict in ``list_of_dict``.

    A ``None`` or empty ``list_of_dict`` yields an empty list.
    """
    items = []
    for entity in list_of_dict or []:
        items.append(entity[key_name])
    return items

In [18]:
def parse_entities(tweet_entities: dict, object_dict: dict, prefix: str = '') -> dict:
    """Build one list per requested entity type from a tweet's entities mapping.

    Args:
        tweet_entities (dict): Entities mapping of a tweet; entity types that
            are absent produce an empty list.
        object_dict (dict): Maps an entity name to the key pulled from each of
            its items, e.g. ``{'hashtags': 'tag', 'urls': 'expanded_url'}``.
        prefix (str): Optional prefix for the generated column names.
    Returns:
        dict: A ``defaultdict(list)`` keyed by ``f"{prefix}{object_name}_list"``,
        e.g. ``hashtags_list`` and ``urls_list`` for the example above.
    """
    parsed = defaultdict(list)
    parsed.update(
        (f"{prefix}{object_name}_list", get_list_of_items(tweet_entities.get(object_name), key_name))
        for object_name, key_name in object_dict.items()
    )
    return parsed

In [None]:
# Single request against the liked_tweets endpoint for one hard-coded user id
# (no pagination token is sent).
# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
query_params = {
    'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
    'tweet.fields': 'created_at,public_metrics,entities,lang,possibly_sensitive,reply_settings,source,in_reply_to_user_id,geo,referenced_tweets',
    'expansions': 'author_id',
}
# Alternative: send no optional parameters at all.
# query_params = {}
search_url = "https://api.twitter.com/2/users/3031071234/liked_tweets"
# Alternative endpoint (user lookup by username):
# search_url="https://api.twitter.com/2/users/by/username/TeamYouTube"
# Snowstorm = 3001806479
# Team Youtube
# The decoded JSON response is kept in output_tweets for inspection.
output_tweets = main(search_url, query_params)


# Retrieve Liked Tweets from AVAX users

In [22]:
path = './avax_data'
# os.listdir returns entries in arbitrary order; sort them so that dir_list[0]
# refers to the same file on every run.
dir_list = sorted(os.listdir(path))

In [25]:
# Load the first entry of dir_list as the working batch of users.
df = pd.read_csv(f'{path}/{dir_list[0]}')

  df = pd.read_csv(f'{path}/{dir_list[0]}')


In [28]:
# Unique user ids of the batch. Despite the name this is a set, not a list.
users_id_list = set(df['userid'])

In [30]:
# Top-level tweet fields copied unchanged into each output row; the collection
# loop drops every other top-level key of the tweet object.
json_tweets_columns = [
    'id', 'author_id', 'possibly_sensitive', 'edit_history_tweet_ids', 'lang',
    'source', 'reply_settings', 'text', 'created_at'
]

In [43]:
sleep = 300 # Seconds to wait after an HTTP 429 (Too Many Requests) response
N = 5 # Number of retries. Maximum retry time = N*sleep (s)

In [44]:
# Timestamp taken once; its date and hour-minute parts label the output file.
current_time = datetime.now()

date_str = current_time.strftime('%Y-%m-%d')
time_str = current_time.strftime('%H-%M')

In [53]:
# Collect every page of liked tweets for each user id in users_id_list and
# append one flat record per tweet to tweets_list_for_dataframe.
#
# Behaviour notes:
#   * Pagination state is reset for every user, so all users are processed
#     (not only the first one) and a token never leaks between users.
#   * A response without 'data' ends the pagination for that user instead of
#     re-querying forever.
#   * HTTP 429 is retried up to N times with `sleep` seconds in between; any
#     other error, or a 429 on the last attempt, is re-raised.
#
# for batch_of_users, batch_csv in enumerate(dir_list):
#     df = pd.read_csv(batch_csv)
#     logging.info(f"Read {batch_csv}")
batch_of_users = 0 # 0.csv
tweets_list_for_dataframe = []
users_list_for_dataframe = []

# Request template shared by all users; it is copied per user and never mutated.
query_params = {
    'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
    'tweet.fields': 'created_at,public_metrics,entities,lang,possibly_sensitive,reply_settings,source,in_reply_to_user_id,geo,referenced_tweets',
    'expansions': 'author_id',
}

# Entity name -> key extracted from each item of that entity list.
entity_keys = {'hashtags': 'tag', 'urls': 'expanded_url'}

for userid in list(users_id_list): # There are 200 users id
    search_url = f"https://api.twitter.com/2/users/{userid}/liked_tweets"
    user_query_params = dict(query_params)
    hast_next_token = True

    while hast_next_token:
        # Query Data from API, retrying only on HTTP 429 (Too Many Requests).
        for attempt in range(1, N + 1):
            try:
                output_tweets = main(search_url, user_query_params)
                break
            except Exception as err:
                # Guard against exceptions raised without arguments.
                rate_limited = bool(err.args) and err.args[0] == 429
                if not rate_limited or attempt == N:
                    raise
                logging.info(f'Error 429. Sleep: {sleep}')
                time.sleep(sleep)
        else:
            # Only reachable when N < 1, i.e. no request was attempted at all.
            raise RuntimeError(f'N must be at least 1 to query the API (got {N})')

        logging.info(f'Twitter URL: {search_url}')
        logging.info(f'Params: {user_query_params}')

        # If no data found: nothing (more) to collect for this user.
        if not output_tweets.get('data'):
            logging.warning(f'No data found for userId {userid}')
            break

        # Token of the *next* page; stored on every row of the current page.
        next_token = {'pagination_token': output_tweets.get('meta', {}).get('next_token')}

        for tweet in output_tweets['data']:
            # Get new columns (missing sections are treated as empty).
            entities_dict = parse_entities(tweet.get('entities') or {}, entity_keys)
            public_metrics_dict = expand_dict_object(tweet.get('public_metrics') or {}, 'public_metrics')

            # Filter out unwanted columns
            kept_fields = {key: value for key, value in tweet.items() if key in json_tweets_columns}

            # Combine dicts
            # NOTE(review): the record does not store `userid`, so a row cannot
            # be traced back to the user who liked it - confirm this is intended.
            tweets_list_for_dataframe.append({
                **kept_fields,
                **entities_dict,
                **public_metrics_dict,
                **next_token,
                'current_time': datetime.now(),
            })

        if next_token['pagination_token']:
            user_query_params['pagination_token'] = next_token['pagination_token']
        else:
            hast_next_token = False

1188283809826869248
{'pagination_token': '7140dibdnow9c7btw4539ky98xkteilwrcj3etufe2ckx'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw4803qh0y60v44vlfxy97j0k2addk'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw4803k2jgno90eq1scih76won6tdf'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw4803dp9cefr7ofnms8qi5sbzzbjt'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw4544d7d2hp0y1dfuqowfjccvyfjn'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw4544b1fjuh77xwxj6gf0scrubo6t'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw45444nuoprplk3bfc8ijzy2noojd'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw4543yakkgjgfb6bv57wksw39ewqd'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw4543tzbg8ekcqtnm9m4xxv30e69k'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw453p3b23ttu6s2h8rcz6hrwu9po4'}
1188283809826869248
{'pagination_token': '7140dibdnow9c7btw453oz2ifjcaznzl8dbrg83jhcuno'}


KeyboardInterrupt: 

In [48]:
# Turn the collected tweet records (a list of dicts) into a DataFrame.
tweets =  pd.DataFrame.from_records(tweets_list_for_dataframe)

In [50]:
# Create the output directory first so the save cannot fail (and lose the
# collected tweets) just because 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
tweets.to_csv(f'data/liked-tweets-{batch_of_users}-{date_str}-{time_str}.csv', index=False)