In [1]:
import logging

# Libraries

In [62]:
import pandas as pd
from collections import defaultdict
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [3]:
# Send log records (DEBUG and above) to AppServerStatus.log, written as UTF-8.
# force=True replaces any handlers already installed on the root logger, so
# re-running this cell reconfigures logging instead of being ignored.
logging.basicConfig(filename='AppServerStatus.log', encoding='utf-8', level=logging.DEBUG, force=True)

In [4]:
# None removes the column cap, so wide frames are displayed with every column.
pd.options.display.max_columns = None

# Load Credentials

In [186]:
from dotenv import dotenv_values

# Read the key/value pairs stored in the local .env file.
config = dotenv_values(".env")
# Plain indexing (not .get) raises KeyError right here if an entry is missing from .env.
# NOTE(review): only BEARER_TOKEN is read by the request code in this notebook;
# CONSUMER_KEY / CONSUMER_SECRET appear unused here — confirm before removing.
CONSUMER_KEY = config['CONSUMER_KEY']
CONSUMER_SECRET = config['CONSUMER_SECRET']
BEARER_TOKEN = config['BEARER_TOKEN']

# Full Archive Search

In [24]:
import requests
import os
import json

In [89]:
def bearer_oauth(r):
    """Attach the bearer-token headers to an outgoing request.

    Used as the ``auth`` callable of ``requests.get``: sets the
    ``Authorization`` and ``User-Agent`` headers on ``r`` and returns ``r``.
    """
    auth_headers = (
        ("Authorization", f"Bearer {BEARER_TOKEN}"),
        ("User-Agent", "v2RecentSearchPython"),
    )
    for header_name, header_value in auth_headers:
        r.headers[header_name] = header_value
    return r

def connect_to_endpoint(url, params, max_retries=3, timeout=30):
    """GET ``url`` with ``params`` using bearer auth and return the decoded JSON body.

    Args:
        url: Endpoint URL.
        params: Query-string parameters forwarded to ``requests.get``.
        max_retries: How many times to retry after an HTTP 429 response before
            giving up. ``0`` disables retrying (the original behaviour).
        timeout: Seconds to wait on the server before ``requests`` raises,
            so a stalled connection cannot hang the notebook forever.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: ``(status_code, response_text)`` for any non-200 response
            that was not recovered by a retry.
    """
    # Imported locally under an alias: a later cell rebinds the global name
    # ``time`` to a formatted string, which would break ``time.sleep``.
    import time as _time

    attempt = 0
    while True:
        response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
        if response.status_code == 200:
            return response.json()

        if response.status_code != 429 or attempt >= max_retries:
            raise Exception(response.status_code, response.text)

        attempt += 1
        # Default: short exponential backoff (2s, 4s, 8s, ...).
        wait_seconds = float(2 ** attempt)
        # NOTE(review): assumes the x-rate-limit-* headers are present on 429
        # responses, with x-rate-limit-reset as epoch seconds — confirm against the API docs.
        remaining = response.headers.get("x-rate-limit-remaining")
        reset_at = response.headers.get("x-rate-limit-reset")
        if remaining == "0" and reset_at and reset_at.isdigit():
            # The window is exhausted: wait until it reopens (+1s safety margin).
            wait_seconds = max(int(reset_at) - _time.time(), 0) + 1

        logging.warning(
            f"Rate limited (429); retry {attempt}/{max_retries} in {wait_seconds:.0f}s"
        )
        _time.sleep(wait_seconds)

def main(url, params):
    """Query ``url`` with ``params`` and hand back the parsed JSON response."""
    return connect_to_endpoint(url, params)

In [None]:
# Sample full-archive search request.
# Parameters the endpoint accepts besides `query`: start_time, end_time, since_id,
# until_id, max_results, next_token, expansions, tweet.fields, media.fields,
# poll.fields, place.fields, user.fields
query_params = {
    'query': '(from:twitterdev) has:links has:hashtags lang:en',
    'user.fields': (
        'created_at,description,entities,id,location,name,'
        'pinned_tweet_id,profile_image_url,protected,public_metrics,'
        'url,username,verified,withheld'
    ),
    'tweet.fields': (
        'created_at,public_metrics,entities,lang,possibly_sensitive,'
        'reply_settings,source,in_reply_to_user_id,geo'
    ),
    'expansions': 'author_id',
    'start_time': '2021-01-01T02:07:14Z',
    'end_time': '2021-12-01T02:07:14Z',
}
search_url = "https://api.twitter.com/2/tweets/search/all"

# Single request; the response JSON is kept for the parsing cells below.
output_tweets = main(search_url, query_params)

In [187]:
# NOTE(review): within this notebook `userid` is only bound by the per-user loop
# further down, so this cell appears to rely on out-of-order execution and would
# raise NameError on a fresh Restart & Run All — confirm or remove.
userid

469946466

## Get Tweets & Users

In [85]:
# Request used by the tweet/user parsing cells that follow.
# Parameters the endpoint accepts besides `query`: start_time, end_time, since_id,
# until_id, max_results, next_token, expansions, tweet.fields, media.fields,
# poll.fields, place.fields, user.fields
query_params = {
    'query': '(from:twitterdev) has:links has:hashtags lang:en',
    'user.fields': ','.join([
        'created_at', 'description', 'entities', 'id', 'location', 'name',
        'pinned_tweet_id', 'profile_image_url', 'protected', 'public_metrics',
        'url', 'username', 'verified', 'withheld',
    ]),
    'tweet.fields': ','.join([
        'created_at', 'public_metrics', 'entities', 'lang',
        'possibly_sensitive', 'reply_settings', 'source',
        'in_reply_to_user_id', 'geo',
    ]),
    'expansions': 'author_id',
    'start_time': '2021-01-01T02:07:14Z',
    'end_time': '2021-12-01T02:07:14Z',
}
search_url = "https://api.twitter.com/2/tweets/search/all"

# Single request; the response JSON is consumed by the flattening cell below.
output_tweets = main(search_url, query_params)

In [159]:
def get_list_of_items(list_of_dict: list, key_name: str) -> list:
    """Collect the ``key_name`` value of every dict in ``list_of_dict``.

    A falsy ``list_of_dict`` (``None`` or an empty list) yields ``[]``.
    A dict that lacks ``key_name`` raises ``KeyError``.
    """
    collected = []
    for entry in list_of_dict or ():
        collected.append(entry[key_name])
    return collected

In [160]:
def parse_entities(tweet_entities: dict, object_dict: dict, prefix: str = '') -> dict:
    """Flatten selected entity objects into ``<prefix><object>_list`` columns.

    Args:
        tweet_entities: Mapping of entity type (e.g. ``'hashtags'``) to a list
            of entity dicts. ``None`` or an empty mapping is accepted and
            yields an empty list for every requested column.
        object_dict: Maps each entity type to read to the key pulled from each
            of its dicts, e.g. ``{'hashtags': 'tag', 'urls': 'expanded_url'}``.
        prefix: Text prepended to every generated column name.

    Returns:
        A ``defaultdict(list)`` with one ``"{prefix}{object_name}_list"`` entry
        per key of ``object_dict``.
    """
    # Treat a missing entities object like an empty one so callers need no guard.
    tweet_entities = tweet_entities or {}
    entities_dict = defaultdict(list)

    for object_name, key_name in object_dict.items():
        column_name = f"{prefix}{object_name}_list"
        # .get() returns None for an absent entity type; the helper maps that to [].
        entities_dict[column_name] = get_list_of_items(tweet_entities.get(object_name), key_name)
    return entities_dict



In [161]:
def expand_dict_object(object_dict: dict, column_prefix: str, key_names_list: list = None) -> dict:
    """Copy entries of ``object_dict`` under ``"{column_prefix}_{key}"`` names.

    Args:
        object_dict (dict): Source mapping. ``None`` or empty yields an empty result.
        column_prefix (str): Prefix joined to each key with an underscore.
        key_names_list (list): Keys to keep. ``None`` or ``[]`` keeps every key.
    Returns:
        dict: A ``defaultdict(list)`` holding the selected, renamed entries.
    """
    result_dict = defaultdict(list)
    if not object_dict:
        return result_dict

    for key_name, value in object_dict.items():
        # An empty/None filter means "keep everything".
        if key_names_list and key_name not in key_names_list:
            continue
        result_dict[f"{column_prefix}_{key_name}"] = value
    return result_dict

In [162]:
# Top-level tweet fields copied as-is into the tweets table.
json_tweets_columns = [
    'id', 'author_id', 'possibly_sensitive', 'edit_history_tweet_ids', 'lang',
    'source', 'reply_settings', 'text', 'created_at'
]

# Top-level user fields copied as-is into the users table (each name listed once).
json_users_columns = [
    'id', 'name', 'username', 'location', 'url', 'created_at',
    'profile_image_url', 'verified', 'description', 'protected'
]

In [242]:
# Flatten the tweets and users of the single `output_tweets` response fetched
# above into lists of row dicts, ready for DataFrame construction.
tweets_list_for_dataframe = []
users_list_for_dataframe = []
# `next_token` is read with .get(): the collection loop later in this notebook
# treats it as optional, so a response without it must not raise here.
next_token = {'next_token': output_tweets['meta'].get('next_token')}
logging.info(f'Twitter URL: {search_url}')
logging.info(f'Params: {query_params}')

for tweet in output_tweets.get('data') or []:
    # Get new columns; a missing sub-object becomes an empty one.
    entities_dict = parse_entities(tweet.get('entities') or {}, {'hashtags': 'tag', 'urls': 'expanded_url'})
    public_metrics_dict = expand_dict_object(tweet.get('public_metrics') or {}, 'public_metrics')

    # Filter out unwanted columns
    tweet_row = {key: value for key, value in tweet.items() if key in json_tweets_columns}

    # Combine dicts
    tweet_row = {**tweet_row, **entities_dict, **public_metrics_dict, **next_token}
    tweets_list_for_dataframe.append(tweet_row)

for user in (output_tweets.get('includes') or {}).get('users') or []:
    user_entities = user.get('entities') or {}

    # Get new columns
    url_dict = parse_entities(user_entities.get('url') or {}, {'urls': 'expanded_url'}, prefix='url_')

    # Mentions are read via 'username', matching the collection loop later in
    # this notebook (this cell previously used 'tag' for mentions).
    description_dict = parse_entities(
        user_entities.get('description') or {},
        {'hashtags': 'tag', 'urls': 'expanded_url', 'mentions': 'username', 'cashtags': 'tag'}, prefix='description_')
    user_public_metrics_dict = expand_dict_object(user.get('public_metrics') or {}, 'public_metrics')

    # Filter out unwanted columns
    user_row = {key: value for key, value in user.items() if key in json_users_columns}

    # Combine dicts
    user_row = {**user_row, **url_dict, **description_dict, **user_public_metrics_dict}
    users_list_for_dataframe.append(user_row)

logging.info(f"Next token: {next_token['next_token']}")

# Retrieve data from users AVAX

In [67]:
# Load user batch file 0.csv; its `userid` column drives the per-user collection below.
df = pd.read_csv('0.csv')

  df = pd.read_csv('0.csv')


In [69]:
# De-duplicate the user ids. Despite the name, this is a set, not a list.
# NOTE(review): the collection loop slices list(users_id_list)[30:]; set iteration
# order is not a documented guarantee, so that offset may not be reproducible — confirm.
users_id_list = set(df['userid'])

In [174]:
# Compute Start and End Time with 6 month interval

TIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

END_TIME = '2022-10-01T00:00:00Z'
# Parse END_TIME (not START_TIME, which is only derived two lines below).
END_TIME_DATETIME = datetime.strptime(END_TIME, TIME_FORMAT)
START_TIME_DATETIME = END_TIME_DATETIME - relativedelta(months=6) # 6 Months Window
START_TIME = START_TIME_DATETIME.strftime(TIME_FORMAT)

# Timestamp parts used in the exported CSV file names.
current_time = datetime.now()

# NOTE: `date` and `time` are plain strings here; `time` shadows the stdlib
# module name, so do not rely on `time.sleep` etc. via this global.
date = current_time.strftime('%Y-%m-%d')
time = current_time.strftime('%H-%M')

In [170]:
batch_of_users = 0 # 0.csv
tweets_list_for_dataframe = []
users_list_for_dataframe = []

# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
# Parameters shared by every request. The per-user `query` and the per-page
# `next_token` are added to a fresh copy inside the loop, so a token returned
# for one user's search is never sent along with another user's query.
base_query_params = {
    'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
    'tweet.fields': 'created_at,public_metrics,entities,lang,possibly_sensitive,reply_settings,source,in_reply_to_user_id,geo',
    'expansions': 'author_id',
    'start_time': START_TIME,
    'end_time': END_TIME,
    'max_results': 500
}
search_url = "https://api.twitter.com/2/tweets/search/all"

# Entity type -> key extracted from each entity dict.
tweet_entity_keys = {'hashtags': 'tag', 'urls': 'expanded_url'}
description_entity_keys = {'hashtags': 'tag', 'urls': 'expanded_url', 'mentions': 'username', 'cashtags': 'tag'}


for userid in list(users_id_list)[30:]: # There are 200 users id

    query_params = {**base_query_params, 'query': f'(from:{userid})'}

    # Follow pagination until the response carries no further next_token.
    while True:
        # Query Data from API (log first, so the logged params are the ones sent)
        logging.info(f'Twitter URL: {search_url}')
        logging.info(f'Params: {query_params}')
        output_tweets = main(search_url, query_params)
        next_token = {'next_token': (output_tweets.get('meta') or {}).get('next_token')}

        # If no data found:
        if not output_tweets.get('data'):
            logging.warning(f'No data found for userId {userid}')
            break

        for tweet in output_tweets['data']:
            # Get new columns; a missing sub-object becomes an empty one.
            entities_dict = parse_entities(tweet.get('entities') or {}, tweet_entity_keys)
            public_metrics_dict = expand_dict_object(tweet.get('public_metrics') or {}, 'public_metrics')

            # Filter out unwanted columns
            tweet_row = {key: value for key, value in tweet.items() if key in json_tweets_columns}

            # Combine dicts; next_token records which page the tweet came from.
            tweet_row = {**tweet_row, **entities_dict, **public_metrics_dict, **next_token}
            tweets_list_for_dataframe.append(tweet_row)

        # A user row is appended once per page it appears on; de-duplicate on
        # `id` downstream if one row per user is needed.
        for user in (output_tweets.get('includes') or {}).get('users') or []:
            user_entities = user.get('entities') or {}

            # Get new columns
            url_dict = parse_entities(user_entities.get('url') or {}, {'urls': 'expanded_url'}, prefix='url_')
            description_dict = parse_entities(
                user_entities.get('description') or {},
                description_entity_keys, prefix='description_')
            user_public_metrics_dict = expand_dict_object(user.get('public_metrics') or {}, 'public_metrics')

            # Filter out unwanted columns
            user_row = {key: value for key, value in user.items() if key in json_users_columns}

            # Combine dicts
            user_row = {**user_row, **url_dict, **description_dict, **user_public_metrics_dict}
            users_list_for_dataframe.append(user_row)

        if not next_token['next_token']:
            break
        query_params['next_token'] = next_token['next_token']


Exception: (429, '{"title":"Too Many Requests","detail":"Too Many Requests","type":"about:blank","status":429}')

In [172]:
# One row per collected tweet dict; columns come from the dict keys.
tweets =  pd.DataFrame.from_records(tweets_list_for_dataframe)

In [191]:
# One row per collected user dict; columns come from the dict keys.
users = pd.DataFrame.from_records(users_list_for_dataframe)

In [176]:
# Persist both tables, tagged with the user batch number and the run timestamp.
# to_csv does not create missing parent directories, so make sure `data/` exists.
os.makedirs('data', exist_ok=True)
tweets.to_csv(f'data/tweets-{batch_of_users}-{date}-{time}.csv', index=False)
users.to_csv(f'data/users-{batch_of_users}-{date}-{time}.csv', index=False)

In [185]:
# tweets[tweets['next_token'] == 'b26v89c19zqg8o3fpytl0ibacdbf7wx9x2ebhdl7rmqgt']

In [189]:
# Distinct author ids present in the collected tweets.
set(tweets['author_id'])

{'1047544362',
 '118232657',
 '1301434620777390080',
 '1362335834',
 '184352307',
 '214811214',
 '22882864',
 '250104916',
 '2829202522',
 '318489159',
 '607053909',
 '718944819',
 '919185985132466176'}