In [238]:
import logging

# Libraries

In [239]:
import pandas as pd
from collections import defaultdict

In [241]:
# Route all log records (DEBUG and above) to a UTF-8 log file; force=True
# replaces any handlers already attached to the root logger.
logging.basicConfig(
    filename='AppServerStatus.log',
    encoding='utf-8',
    level=logging.DEBUG,
    force=True,
)

In [167]:
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load Credentials

In [168]:
from dotenv import dotenv_values

# Read the API credentials from the local `.env` file so they are not
# hard-coded in the notebook. Each lookup below raises KeyError when the
# corresponding entry is missing from `config`.
config = dotenv_values(".env")
CONSUMER_KEY = config['CONSUMER_KEY']
CONSUMER_SECRET = config['CONSUMER_SECRET']
# Interpolated into the Authorization header by bearer_oauth.
BEARER_TOKEN = config['BEARER_TOKEN']

# Full Archive Search

In [169]:
# Load the local CSV export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='0.csv')

  df = pd.read_csv('0.csv')


In [3]:
import requests
import os
import json

In [180]:
def bearer_oauth(r):
    """Attach the bearer-token credentials to an outgoing request.

    Used as the ``auth`` callable by connect_to_endpoint: sets the
    ``Authorization`` and ``User-Agent`` headers on `r` and returns `r`.
    """
    auth_headers = {
        "Authorization": f"Bearer {BEARER_TOKEN}",
        "User-Agent": "v2RecentSearchPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r

def connect_to_endpoint(url, params, timeout=30):
    """Send an authenticated GET request and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook on a stalled connection.

    Returns:
        The response body parsed by ``response.json()``.

    Raises:
        Exception: With ``(status_code, response_text)`` as arguments when the
            server answers with anything other than HTTP 200.
    """
    response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def main(url, params):
    """Query `url` with `params` through connect_to_endpoint and return its JSON result."""
    return connect_to_endpoint(url, params)

## Get Tweets & Users

In [194]:
# Parameters the endpoint accepts besides `query` (all optional):
# start_time, end_time, since_id, until_id, max_results, next_token,
# expansions, tweet.fields, media.fields, poll.fields, place.fields, user.fields
search_url = "https://api.twitter.com/2/tweets/search/all"

# Which attributes to return for each tweet and its expanded author.
requested_fields = {
    'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
    'tweet.fields': 'created_at,public_metrics,entities,lang,possibly_sensitive,reply_settings,source,in_reply_to_user_id,geo',
    'expansions': 'author_id',
}

# Time span the search is restricted to.
time_window = {
    'start_time': '2021-01-01T02:07:14Z',
    'end_time': '2021-12-01T02:07:14Z',
}

query_params = {
    'query': '(from:twitterdev) has:links has:hashtags lang:en',
    **requested_fields,
    **time_window,
}

output_tweets = main(search_url, query_params)

In [105]:
def get_list_of_items(list_of_dict: list, key_name: str) -> list:
    """Collect the `key_name` value of every dict in `list_of_dict`.

    A falsy `list_of_dict` (``None`` or an empty list) yields an empty list.
    """
    collected = []
    for record in list_of_dict or ():
        collected.append(record[key_name])
    return collected

In [222]:
def parse_entities(tweet_entities: dict, object_dict: dict, prefix: str = '') -> dict:
    """Flatten selected entity lists into ``<prefix><object>_list`` columns.

    Args:
        tweet_entities (dict): Entities payload keyed by object name (for
            example ``hashtags`` or ``urls``). ``None`` is treated as an empty
            payload so every requested column comes back as an empty list.
        object_dict (dict): Maps each object name to the key whose value is
            extracted from every item of that object's list.
        prefix (str): Text prepended to each generated column name.

    Returns:
        dict: ``defaultdict(list)`` with one ``"{prefix}{object_name}_list"``
        entry per key of `object_dict`.
    """
    entities_dict = defaultdict(list)
    # Guard against a missing payload: calling .get on None would raise.
    available_entities = tweet_entities or {}

    for object_name, key_name in object_dict.items():
        column_name = f"{prefix}{object_name}_list"
        entities_dict[column_name] = get_list_of_items(available_entities.get(object_name), key_name)
    return entities_dict



In [136]:
def expand_dict_object(object_dict: dict, column_prefix: str, key_names_list: list = None) -> dict:
    """Flatten a dict into entries whose keys carry a common prefix.

    Args:
        object_dict (dict): Mapping to flatten. ``None`` or an empty dict
            yields an empty result.
        column_prefix (str): Prefix joined to every key with an underscore.
        key_names_list (list, optional): Keys to keep. ``None`` or an empty
            list keeps every key of `object_dict`.

    Returns:
        dict: ``defaultdict(list)`` mapping ``"{column_prefix}_{key}"`` to the
        value stored under ``key`` in `object_dict`.
    """
    result_dict = defaultdict(list)
    for key_name, value in (object_dict or {}).items():
        # A non-empty selection acts as a filter; an empty one keeps everything.
        if key_names_list and key_name not in key_names_list:
            continue
        result_dict[f"{column_prefix}_{key_name}"] = value
    return result_dict

In [199]:
# Top-level tweet attributes copied unchanged into the tweets table.
json_tweets_columns = [
    'id', 'author_id', 'possibly_sensitive', 'edit_history_tweet_ids', 'lang',
    'source', 'reply_settings', 'text', 'created_at'
]

# Top-level user attributes copied unchanged into the users table
# (each name listed once).
json_users_columns = [
    'id', 'name', 'username', 'location', 'url', 'created_at',
    'profile_image_url', 'verified', 'description', 'protected'
]

In [242]:
tweets_list_for_dataframe = []
users_list_for_dataframe = []

# Read the pagination token from the response fetched above (`output_tweets`).
# Per the Twitter API v2 reference the token is omitted on the last page, so
# fall back to None instead of raising KeyError.
next_token = {'next_token': output_tweets.get('meta', {}).get('next_token')}
logging.info(f'Twitter URL: {search_url}')
logging.info(f'Params: {query_params}')

# `data` and `includes` are looked up defensively so an empty result set
# produces empty tables rather than a KeyError.
for tweet in output_tweets.get('data', []):
    # Get new columns; optional payloads default to empty dicts
    entities_dict = parse_entities(tweet.get('entities') or {}, {'hashtags': 'tag', 'urls': 'expanded_url'})
    public_metrics_dict = expand_dict_object(tweet.get('public_metrics') or {}, 'public_metrics')

    # Filter out unwanted columns
    tweet_row = {key: value for key, value in tweet.items() if key in json_tweets_columns}

    # Combine dicts
    tweets_list_for_dataframe.append({**tweet_row, **entities_dict, **public_metrics_dict, **next_token})

for user in output_tweets.get('includes', {}).get('users', []):
    # Get new columns; a user without a profile URL or description entities
    # has no corresponding payload, so default each level to an empty dict
    user_entities = user.get('entities') or {}
    url_dict = parse_entities(user_entities.get('url') or {}, {'urls': 'expanded_url'}, prefix='url_')

    # Mention entities are identified by `username` (not `tag`) in the
    # Twitter API v2 user object.
    description_dict = parse_entities(
        user_entities.get('description') or {},
        {'hashtags': 'tag', 'urls': 'expanded_url', 'mentions': 'username', 'cashtags': 'tag'},
        prefix='description_')
    user_public_metrics_dict = expand_dict_object(user.get('public_metrics') or {}, 'public_metrics')

    # Filter out unwanted columns
    user_row = {key: value for key, value in user.items() if key in json_users_columns}

    # Combine dicts
    users_list_for_dataframe.append({**user_row, **url_dict, **description_dict, **user_public_metrics_dict})

logging.info(f"Next token: {next_token['next_token']}")

In [229]:
# One row per flattened user record.
users = pd.DataFrame.from_records(data=users_list_for_dataframe)

In [230]:
# One row per flattened tweet record.
tweets = pd.DataFrame.from_records(data=tweets_list_for_dataframe)

In [231]:
# to_csv does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('data', exist_ok=True)
users.to_csv('data/users.csv', index=False)

In [232]:
# to_csv does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('data', exist_ok=True)
tweets.to_csv('data/tweets.csv', index=False)