In [1]:
import logging

# Libraries

In [12]:
import pandas as pd
from collections import defaultdict
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time
import os

In [80]:
logging.basicConfig(filename='AppServerStatus.log', encoding='utf-8', level=logging.DEBUG, force=True)

In [81]:
pd.options.display.max_columns = None

# Load Crentials

In [5]:
from dotenv import dotenv_values

config = dotenv_values(".env")
CONSUMER_KEY = config['CONSUMER_KEY']
CONSUMER_SECRET = config['CONSUMER_SECRET']
BEARER_TOKEN = config['BEARER_TOKEN']

# Full Archive Search

In [6]:
import requests
import os
import json

In [2]:
def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {BEARER_TOKEN}"
    r.headers["User-Agent"] = "v2RecentSearchPython"
    return r

def connect_to_endpoint(url, params):
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def main(url, params):
    json_response = connect_to_endpoint(url, params)
    # print(json.dumps(json_response, indent=4, sort_keys=True))
    return json_response

In [5]:
# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
query_params = {
    'query': '(from:twitterdev) has:links has:hashtags lang:en',
    'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
    'tweet.fields': 'created_at,public_metrics,entities,lang,possibly_sensitive,reply_settings,source,in_reply_to_user_id,geo',
    'expansions': 'author_id',
    'start_time': '2021-01-01T02:07:14Z',
    'end_time': '2021-12-01T02:07:14Z'
}
search_url = "https://api.twitter.com/2/tweets/search/all"

output_tweets = main(search_url, query_params)

## Get Tweet & Users

In [9]:
# # Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# # expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
# query_params = {
#     'query': '(from:twitterdev) has:links has:hashtags lang:en',
#     'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
#     'tweet.fields': 'created_at,public_metrics,entities,lang,possibly_sensitive,reply_settings,source,in_reply_to_user_id,geo',
#     'expansions': 'author_id',
#     'start_time': '2021-01-01T02:07:14Z',
#     'end_time': '2021-12-01T02:07:14Z'
# }
search_url = "https://api.twitter.com/2/tweets/search/all"
search_url = "https://api.twitter.com/2/users/by/username/_atanas_"

output_tweets = main(search_url, None)
# output_tweets = main(search_url, query_params)

In [7]:
def get_list_of_items(list_of_dict: list, key_name: str) -> list:
    if not list_of_dict: # If list_of_dict is None
        return []
    return [entity[key_name] for entity in list_of_dict]

In [8]:
def parse_entities(tweet_entities: dict, object_dict: dict, prefix: str = '') -> dict:
    entities_dict = defaultdict(list)

    # Retrieve Entities Objects
    for object_name, key_name in object_dict.items():
        column_name = f"{prefix}{object_name}_list"
        # print(object_name)
        entities_dict[column_name] = get_list_of_items(tweet_entities.get(object_name), key_name)
    return entities_dict
    # entities_dict['hashtags_list'] = get_list_of_items(tweet_entities['hashtags'], 'tag')
    # entities_dict['urls_list'] = get_list_of_items(tweet_entities['urls'], 'expanded_url')



In [9]:
def expand_dict_object(object_dict: dict, column_prefix: str,key_names_list: list = []) -> dict:
    """Return key,value pairs from dict. Returns selected keys in key_names_list or all if key_names_list is []

    Args:
        object_dict (dict)
        key_names_list (list)
    Returns:
        dict
    """
    result_dict = defaultdict(list)
    for key_name, value_name in object_dict.items():
        column_name = f"{column_prefix}_{key_name}"
        if not key_names_list:
            result_dict[column_name] = value_name
        elif key_name in key_names_list:
            result_dict[column_name] = value_name
    return result_dict

In [10]:
json_tweets_columns = [
    'id', 'author_id', 'possibly_sensitive', 'edit_history_tweet_ids', 'lang',
    'source', 'reply_settings', 'text', 'created_at'
]

json_users_columns = [
    'id', 'name', 'username', 'location', 'url', 'created_at', 'username',
    'profile_image_url', 'profile_image_url', 'verified', 'description',
    'protected'
]

In [None]:
tweets_list_for_dataframe = []
users_list_for_dataframe = []
next_token = {'next_token': output['meta']['next_token']}
logging.info(f'Twitter URL: {search_url}')
logging.info(f'Params: {query_params}')

for tweet in output_tweets['data']:
    # Get new columns
    entities_dict = parse_entities(tweet['entities'], {'hashtags': 'tag', 'urls': 'expanded_url'})
    public_metrics_dict = expand_dict_object(tweet['public_metrics'], 'public_metrics')

    # Filter out unwanted columns
    tweet = {key: tweet[key] for key in tweet.keys() if key in json_tweets_columns}

    # Combine dicts
    tweet = {**tweet, **entities_dict, **public_metrics_dict, **next_token}
    tweets_list_for_dataframe.append(tweet)

for user in output_tweets['includes']['users']:
    # Get new columns
    url_dict = parse_entities(user['entities']['url'], {'urls': 'expanded_url'}, prefix='url_')

    description_dict = parse_entities(
        user['entities']['description'],
        {'hashtags': 'tag', 'urls': 'expanded_url', 'mentions': 'tag', 'cashtags': 'tag'}, prefix='description_')
    user_public_metrics_dict = expand_dict_object(user['public_metrics'], 'public_metrics')
    # Filter out unwanted columns
    user = {key: user[key] for key in user.keys() if key in json_users_columns}

    # Combine dicts
    user = {**user, **url_dict, **description_dict, **user_public_metrics_dict}
    users_list_for_dataframe.append(user)

logging.info(f"Next token: {output_tweets['meta']['next_token']}")

NameError: name 'output' is not defined

# Retrieve data from users AVAX

In [3]:
import pandas as pd

In [1]:

path = './avax_data/'
dir_list = os.listdir(path)

In [4]:
df = pd.read_csv(f'{path}/0.csv')

  df = pd.read_csv(f'{path}/0.csv')


In [None]:
1208459959509356545

In [6]:
users_id_list = set(df['userid'])

In [7]:
[ i for i, j in enumerate(users_id_list) if j == 1208459959509356545]

[59]

In [24]:
list(users_id_list)[24:]

[1239472259959382016,
 1339933891100958720,
 4244790820,
 117867046,
 270490153,
 1203108041165541376,
 919185985132466176,
 1047544362,
 22882864,
 184352307,
 718944819,
 1301434620777390080,
 1212963806537637888,
 318489159,
 214811214,
 118232657,
 250104916,
 607053909,
 1362335834,
 2829202522,
 469946466,
 384005220,
 882495084,
 3145585778,
 858788472,
 1135196992559243264,
 1430530687,
 1191684412389429248,
 1324296340008964096,
 1333635933699149824,
 1291191126905376768,
 947049360688320512,
 1258126238188572673,
 971888814359760897,
 1229390877883273216,
 1208459959509356545,
 1272358699533651968,
 1202417750276214786,
 704869445227974657,
 303143556,
 826377784759029763,
 819378092200382467,
 3170426002,
 16661651,
 44728980,
 19766933,
 1312347898940518400,
 1082509988,
 260719781,
 264692904,
 1034366124,
 17186989,
 1280400605769539584,
 499945135,
 758126858072272896,
 1687036603,
 182926530,
 2273620676,
 1104259424959504385,
 736447232174661632,
 927142610,
 144604693

In [110]:
# Compute Start and End Time with 6 month interval

END_TIME = '2022-10-01T00:00:00Z'
END_TIME_DATETIME = datetime.strptime(END_TIME, '%Y-%m-%dT%H:%M:%SZ')
START_TIME_DATETIME = END_TIME_DATETIME - relativedelta(months=6) # 6 Months Window
START_TIME = START_TIME_DATETIME.strftime('%Y-%m-%dT%H:%M:%SZ')

current_time = datetime.now()

date_str = f'{current_time.year}-{current_time.month:02d}-{current_time.day:02d}'
time_str = f'{current_time.hour:02d}-{current_time.minute:02d}'

In [111]:
sleep = 300 # in seconds
N = 5 # Number of retries. Maxium retry time = N*sleep (s)

In [128]:
with open(f"data/tweets-{batch_of_users}.csv", "a+", encoding='utf-8') as f:
    f.write(", ".join(tweet_columns) + "\n")

In [132]:
# import os.path
os.path.exists(f"data/tweets-{batch_of_users}.csv")

True

In [None]:
batch_of_users

In [127]:
# for batch_of_users, batch_csv in enumerate(dir_list):
#     df = pd.read_csv(batch_csv)
#     logging.info(f"Read {batch_csv}")
# batch_of_users = 0 # 0.csv
tweets_list_for_dataframe = []
users_list_for_dataframe = []
user_columns = [
    'profile_image_url', 'username', 'protected', 'name', 'id', 'description', 
    'created_at', 'verified', 'location', 'url_urls_list', 'description_hashtags_list', 'description_urls_list', 'description_mentions_list',
    'description_cashtags_list', 'public_metrics_followers_count', 'public_metrics_following_count', 'public_metrics_tweet_count', 
    'public_metrics_listed_count', 'current_time'
]

tweet_columns = [
    'possibly_sensitive', 'text', 'source', 'id', 'created_at', 'lang', 'reply_settings', 'author_id', 'edit_history_tweet_ids', 
    'hashtags_list', 'urls_list', 'public_metrics_retweet_count', 'public_metrics_reply_count', 'public_metrics_like_count', 
    'public_metrics_quote_count', 'pagination_token', 'current_time'
]


# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
query_params = {
    # 'query': '(from:twitterdev) has:links has:hashtags lang:en',
    'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
    'tweet.fields': 'created_at,public_metrics,entities,lang,possibly_sensitive,reply_settings,source,in_reply_to_user_id,geo',
    'expansions': 'author_id',
    'start_time': START_TIME,
    'end_time': END_TIME,
    'max_results': 500
}
search_url = "https://api.twitter.com/2/tweets/search/all"

hast_next_token = True
users_id_list = set(df['userid'])
# Create CSV file
if not os.path.exists(f"data/tweets-{batch_of_users}.csv"):
    with open(f"data/tweets-{batch_of_users}.csv", "a+", encoding='utf-8') as f:
        f.write(", ".join(tweet_columns) + "\n")
if not os.path.exists(f"data/users-{batch_of_users}.csv"):
    with open(f"data/users-{batch_of_users}.csv", "a+", encoding='utf-8') as f:
        f.write(", ".join(user_columns) + "\n")
for userid in list(users_id_list): # There are 200 users id
    print(userid)
    while hast_next_token:
        print('request')
        tweets_list_for_dataframe = []
        users_list_for_dataframe = []
        for attempt in range(N):
            print(attempt)
            try:
                # Query Data from API
                query_params['query'] = f'(from:{userid})'
                output_tweets = main(search_url, query_params)
                logging.info(f'Twitter URL: {search_url}')
                logging.info(f'Params: {query_params}')

                # If no data found:
                if not output_tweets.get('data'):
                    logging.warning(f'No data found for userId {userid}')
                    # continue
                    hast_next_token = False
                    break
                next_token = {}
                next_token['pagination_token'] = output_tweets['meta'].get('next_token')
                if next_token['pagination_token']:
                    query_params['pagination_token'] = next_token['pagination_token']
                    hast_next_token = True
                else:
                    hast_next_token = False

                for tweet in output_tweets['data']:
                    # Get new columns
                    if not tweet.get('entities'):
                        entities_dict = parse_entities({}, {'hashtags': 'tag', 'urls': 'expanded_url'})
                    else:
                        entities_dict = parse_entities(tweet['entities'], {'hashtags': 'tag', 'urls': 'expanded_url'})

                    if not tweet.get('public_metrics'):
                        public_metrics_dict = expand_dict_object({}, 'public_metrics')
                    else:
                        public_metrics_dict = expand_dict_object(tweet['public_metrics'], 'public_metrics')

                    # Filter out unwanted columns
                    tweet = {key: tweet[key] for key in tweet.keys() if key in json_tweets_columns}

                    # Combine dicts
                    tweet = {**tweet, **entities_dict, **public_metrics_dict, **next_token, 'current_time': datetime.now()}
                    # tweet = sorted(tweet.items(), key=lambda kv: kv[1])
                    tweets_list_for_dataframe.append(tweet)

                for user in output_tweets['includes']['users']:

                    # Get new columns
                    if (not user.get('entities')) or (not user.get('entities').get('url')):
                        url_dict = parse_entities({}, {'urls': 'expanded_url'}, prefix='url_')
                    else:
                        url_dict = parse_entities(user['entities']['url'], {'urls': 'expanded_url'}, prefix='url_')

                    if (not user.get('entities')) or (not user.get('entities').get('description')):
                        description_dict = parse_entities(
                            {},
                            {'hashtags': 'tag', 'urls': 'expanded_url', 'mentions': 'username', 'cashtags': 'tag'}, prefix='description_')
                    else:
                        description_dict = parse_entities(
                            user['entities']['description'],
                            {'hashtags': 'tag', 'urls': 'expanded_url', 'mentions': 'username', 'cashtags': 'tag'}, prefix='description_')

                    user_public_metrics_dict = expand_dict_object(user['public_metrics'], 'public_metrics')
                    # Filter out unwanted columns
                    user = {key: user[key] for key in user.keys() if key in json_users_columns}

                    # Combine dicts
                    user = {**user, **url_dict, **description_dict, **user_public_metrics_dict, 'current_time': datetime.now()}
                    # user = sorted(user.items(), key=lambda kv: kv[1])
                    users_list_for_dataframe.append(user)
            except Exception as err:
                if err.args[0] == 429: # If ERROR = 429 (Too Many Requests, wait for retry) 
                    logging.info(f'Error 429. Sleep: {sleep}')
                    time.sleep(sleep)
                    continue
                else:
                    raise
            break
        tweets =  pd.DataFrame.from_records(tweets_list_for_dataframe, columns=tweet_columns)
        users = pd.DataFrame.from_records(users_list_for_dataframe, columns=user_columns)
        tweets.to_csv(f'data/tweets-{batch_of_users}.csv', mode='a', index=False, header=None)
        users.to_csv(f'data/users-{batch_of_users}.csv', mode='a', index=False, header=None)


1430530687
request
0
{'lang': 'en', 'id': '1575524103959740417', 'created_at': '2022-09-29T16:33:12.000Z', 'edit_history_tweet_ids': ['1575524103959740417'], 'author_id': '1430530687', 'possibly_sensitive': False, 'text': 'RT @frenchmarlboro: Adornments by Nusi https://t.co/jvwHEeC1v9', 'source': 'Twitter for iPhone', 'reply_settings': 'everyone'}
{'lang': 'zxx', 'id': '1575490451754213385', 'created_at': '2022-09-29T14:19:29.000Z', 'edit_history_tweet_ids': ['1575490451754213385'], 'author_id': '1430530687', 'possibly_sensitive': False, 'text': 'RT @joonstudio: https://t.co/jof9O6F7u5', 'source': 'Twitter for iPhone', 'reply_settings': 'everyone'}
{'lang': 'en', 'id': '1575089716310904832', 'created_at': '2022-09-28T11:47:06.000Z', 'edit_history_tweet_ids': ['1575089716310904832'], 'author_id': '1430530687', 'possibly_sensitive': False, 'text': 'RT @JINK997: people died. https://t.co/fu45YodB63', 'source': 'Twitter for iPhone', 'reply_settings': 'everyone'}
{'lang': 'en', 'id': '15749

KeyboardInterrupt: 

In [115]:
tweets =  pd.DataFrame.from_records(tweets_list_for_dataframe, columns=tweet_columns)

In [133]:
tweets_list_for_dataframe

[]

In [121]:
with open(f"data/tweets-{batch_of_users}.csv", "w", encoding='utf-8') as f:
    f.write(", ".join(tweet_columns) + "\n")

In [35]:
# with open("out.csv", "w", encoding='utf-8') as f:
#     f.write("date,user,is_retweet,is_quote,text,quoted_text\n")

In [36]:
import csv
with open("out.csv", "a", encoding='utf-8') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow([1,2,1,1, ['dasd'],'a'])

In [77]:
# tweet
sorted(tweet.items(), key=lambda kv: kv[1])

TypeError: '<' not supported between instances of 'list' and 'str'

In [76]:
tweet.items()

dict_items([('created_at', '2021-09-13T04:47:01.000Z'), ('text', 'RT @DngBch11: Đi chơi vùng cao có một bản làng khi sinh là nữ là cha mẹ bôi một loại thuốc gia truyền là ko bao giờ mọc lông mu, cô giáo tâ…'), ('edit_history_tweet_ids', ['1437276608507703300']), ('reply_settings', 'everyone'), ('lang', 'vi'), ('id', '1437276608507703300'), ('author_id', '1188283809826869248'), ('source', 'Twitter for Android'), ('possibly_sensitive', False), ('hashtags_list', []), ('urls_list', []), ('public_metrics_retweet_count', 344), ('public_metrics_reply_count', 0), ('public_metrics_like_count', 0), ('public_metrics_quote_count', 0), ('pagination_token', 'b26v89c19zqg8o3fpdp8ryq98td34r3qljtuhtvympuv1'), ('current_time', datetime.datetime(2022, 11, 5, 14, 53, 35, 799548))])

In [37]:
df_2 = pd.read_csv("out.csv")

In [42]:
df_2['text'].iloc[0]

"['dasd']"

In [60]:
tweets =  pd.DataFrame.from_records(tweets_list_for_dataframe)

In [62]:
users = pd.DataFrame.from_records(users_list_for_dataframe)

In [63]:
tweets.shape

(482, 16)

In [None]:
set(tweets['author_id'])

{'1047544362',
 '118232657',
 '1301434620777390080',
 '1362335834',
 '184352307',
 '214811214',
 '22882864',
 '250104916',
 '2829202522',
 '318489159',
 '607053909',
 '718944819',
 '919185985132466176'}

In [139]:
import logging

import pandas as pd
from collections import defaultdict
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time
import os
import requests
import os
import json

logging.basicConfig(filename='AppServerStatus_script.log', encoding='utf-8', level=logging.DEBUG, force=True)
CONSUMER_KEY='7MUj9kId1GJB74UZjoP5qQou3'
CONSUMER_SECRET='soYSWChh8q68CJqEffjom1rJZTDcpW70gOMtxaljP5tTPNSc0h'
BEARER_TOKEN='AAAAAAAAAAAAAAAAAAAAAEld8wAAAAAA1WJyh36i7Dcy%2FjL0jRQaW6%2BoVT4%3DT7AzkxcV4BeJScjMWq8KBmSQ3bpQ6ivORsegWOebktx39oUYid'

def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {BEARER_TOKEN}"
    r.headers["User-Agent"] = "v2RecentSearchPython"
    return r

def connect_to_endpoint(url, params):
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def main(url, params):
    json_response = connect_to_endpoint(url, params)
    # print(json.dumps(json_response, indent=4, sort_keys=True))
    return json_response

def get_list_of_items(list_of_dict: list, key_name: str) -> list:
    if not list_of_dict: # If list_of_dict is None
        return []
    return [entity[key_name] for entity in list_of_dict]

def parse_entities(tweet_entities: dict, object_dict: dict, prefix: str = '') -> dict:
    entities_dict = defaultdict(list)

    # Retrieve Entities Objects
    for object_name, key_name in object_dict.items():
        column_name = f"{prefix}{object_name}_list"
        # print(object_name)
        entities_dict[column_name] = get_list_of_items(tweet_entities.get(object_name), key_name)
    return entities_dict
    # entities_dict['hashtags_list'] = get_list_of_items(tweet_entities['hashtags'], 'tag')
    # entities_dict['urls_list'] = get_list_of_items(tweet_entities['urls'], 'expanded_url')

def expand_dict_object(object_dict: dict, column_prefix: str,key_names_list: list = []) -> dict:
    """Return key,value pairs from dict. Returns selected keys in key_names_list or all if key_names_list is []

    Args:
        object_dict (dict)
        key_names_list (list)
    Returns:
        dict
    """
    result_dict = defaultdict(list)
    for key_name, value_name in object_dict.items():
        column_name = f"{column_prefix}_{key_name}"
        if not key_names_list:
            result_dict[column_name] = value_name
        elif key_name in key_names_list:
            result_dict[column_name] = value_name
    return result_dict

if __name__ == '__main__':
    json_tweets_columns = [
        'id', 'author_id', 'possibly_sensitive', 'edit_history_tweet_ids', 'lang',
        'source', 'reply_settings', 'text', 'created_at'
    ]

    json_users_columns = [
        'id', 'name', 'username', 'location', 'url', 'created_at', 'username',
        'profile_image_url', 'profile_image_url', 'verified', 'description',
        'protected'
    ]
    user_columns = [
        'profile_image_url', 'username', 'protected', 'name', 'id', 'description', 
        'created_at', 'verified', 'location', 'url_urls_list', 'description_hashtags_list', 'description_urls_list', 'description_mentions_list',
        'description_cashtags_list', 'public_metrics_followers_count', 'public_metrics_following_count', 'public_metrics_tweet_count', 
        'public_metrics_listed_count', 'current_time'
    ]

    tweet_columns = [
        'possibly_sensitive', 'text', 'source', 'id', 'created_at', 'lang', 'reply_settings', 'author_id', 'edit_history_tweet_ids', 
        'hashtags_list', 'urls_list', 'public_metrics_retweet_count', 'public_metrics_reply_count', 'public_metrics_like_count', 
        'public_metrics_quote_count', 'pagination_token', 'current_time'
    ]

    # Current folder
    pwd = os.getcwd()
    # Compute Start and End Time with 6 month interval

    END_TIME = '2022-10-01T00:00:00Z'
    END_TIME_DATETIME = datetime.strptime(END_TIME, '%Y-%m-%dT%H:%M:%SZ')
    START_TIME_DATETIME = END_TIME_DATETIME - relativedelta(months=6) # 6 Months Window
    START_TIME = START_TIME_DATETIME.strftime('%Y-%m-%dT%H:%M:%SZ')

    current_time = datetime.now()

    date_str = f'{current_time.year}-{current_time.month:02d}-{current_time.day:02d}'
    time_str = f'{current_time.hour:02d}-{current_time.minute:02d}'

    sleep = 10 # in seconds
    N = 5 # Number of retries. Maxium retry time = N*sleep (s)

    # path = '/project/ll_774_951/SMHabits/avax_project_data/random_user_tweets'
    path = 'avax_data'
    dir_list = os.listdir(path)

    for batch_of_users, batch_csv in enumerate(dir_list):
        df = pd.read_csv(f"{path}/{batch_csv}")
        logging.info(f"Read {batch_csv}")

        # Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
        # expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
        query_params = {
            # 'query': '(from:twitterdev) has:links has:hashtags lang:en',
            'user.fields': 'created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld',
            'tweet.fields': 'created_at,public_metrics,entities,lang,possibly_sensitive,reply_settings,source,in_reply_to_user_id,geo',
            'expansions': 'author_id',
            'start_time': START_TIME,
            'end_time': END_TIME,
            'max_results': 500
        }
        search_url = "https://api.twitter.com/2/tweets/search/all"

        hast_next_token = True
        users_id_list = set(df['userid'])
        # Create CSV file
        if not os.path.exists(f"data/tweets-{batch_of_users}.csv"):
            with open(f"{pwd}/data/tweets-{batch_of_users}.csv", "a+", encoding='utf-8') as f:
                f.write(", ".join(tweet_columns) + "\n")
        if not os.path.exists(f"data/users-{batch_of_users}.csv"):
            with open(f"{pwd}/data/users-{batch_of_users}.csv", "a+", encoding='utf-8') as f:
                f.write(", ".join(user_columns) + "\n")
        for userid in list(users_id_list): # There are 200 users id
            print(userid)
            hast_next_token = True
            while hast_next_token:
                print(hast_next_token)
                tweets_list_for_dataframe = []
                users_list_for_dataframe = []
                for attempt in range(N):
                    print(attempt)
                    try:
                        # Query Data from API
                        query_params['query'] = f'(from:{userid})'
                        output_tweets = main(search_url, query_params)
                        logging.info(f'Twitter URL: {search_url}')
                        logging.info(f'Params: {query_params}')

                        # If no data found:
                        if not output_tweets.get('data'):
                            logging.warning(f'No data found for userId {userid}')
                            # continue
                            hast_next_token = False
                            break
                        next_token = {}
                        next_token['pagination_token'] = output_tweets['meta'].get('next_token')
                        if next_token['pagination_token']:
                            query_params['pagination_token'] = next_token['pagination_token']
                            hast_next_token = True
                        else:
                            hast_next_token = False

                        for tweet in output_tweets['data']:
                            # Get new columns
                            if not tweet.get('entities'):
                                entities_dict = parse_entities({}, {'hashtags': 'tag', 'urls': 'expanded_url'})
                            else:
                                entities_dict = parse_entities(tweet['entities'], {'hashtags': 'tag', 'urls': 'expanded_url'})

                            if not tweet.get('public_metrics'):
                                public_metrics_dict = expand_dict_object({}, 'public_metrics')
                            else:
                                public_metrics_dict = expand_dict_object(tweet['public_metrics'], 'public_metrics')

                            # Filter out unwanted columns
                            tweet = {key: tweet[key] for key in tweet.keys() if key in json_tweets_columns}

                            # Combine dicts
                            tweet = {**tweet, **entities_dict, **public_metrics_dict, **next_token, 'current_time': datetime.now()}
                            # tweet = sorted(tweet.items(), key=lambda kv: kv[1])
                            tweets_list_for_dataframe.append(tweet)

                        for user in output_tweets['includes']['users']:

                            # Get new columns
                            if (not user.get('entities')) or (not user.get('entities').get('url')):
                                url_dict = parse_entities({}, {'urls': 'expanded_url'}, prefix='url_')
                            else:
                                url_dict = parse_entities(user['entities']['url'], {'urls': 'expanded_url'}, prefix='url_')

                            if (not user.get('entities')) or (not user.get('entities').get('description')):
                                description_dict = parse_entities(
                                    {},
                                    {'hashtags': 'tag', 'urls': 'expanded_url', 'mentions': 'username', 'cashtags': 'tag'}, prefix='description_')
                            else:
                                description_dict = parse_entities(
                                    user['entities']['description'],
                                    {'hashtags': 'tag', 'urls': 'expanded_url', 'mentions': 'username', 'cashtags': 'tag'}, prefix='description_')

                            user_public_metrics_dict = expand_dict_object(user['public_metrics'], 'public_metrics')
                            # Filter out unwanted columns
                            user = {key: user[key] for key in user.keys() if key in json_users_columns}

                            # Combine dicts
                            user = {**user, **url_dict, **description_dict, **user_public_metrics_dict, 'current_time': datetime.now()}
                            # user = sorted(user.items(), key=lambda kv: kv[1])
                            users_list_for_dataframe.append(user)
                    except Exception as err:
                        if err.args[0] == 429: # If ERROR = 429 (Too Many Requests, wait for retry) 
                            logging.info(f'Error 429. Sleep: {sleep}')
                            hast_next_token = False
                            time.sleep(sleep)
                            continue
                        else:
                            raise
                    break
                tweets =  pd.DataFrame.from_records(tweets_list_for_dataframe, columns=tweet_columns)
                users = pd.DataFrame.from_records(users_list_for_dataframe, columns=user_columns)
                tweets.to_csv(f'{pwd}/data/tweets-{batch_of_users}.csv', mode='a', index=False, header=None)
                users.to_csv(f'{pwd}/data/users-{batch_of_users}.csv', mode='a', index=False, header=None)

  df = pd.read_csv(f"{path}/{batch_csv}")


1188283809826869248
True
0
1
2
3
4
914779140225675265
True
0
969598808186212353
True
0
1
True
0
True
0
True
0
True
0
True
0
True
0
True
0


KeyboardInterrupt: 

In [137]:
tweet

{'reply_settings': 'everyone',
 'possibly_sensitive': False,
 'id': '1536419092655128576',
 'created_at': '2022-06-13T18:43:51.000Z',
 'edit_history_tweet_ids': ['1536419092655128576'],
 'author_id': '1430530687',
 'text': 'RT @Iovekil: apo is literally the most beautiful man alive',
 'lang': 'en',
 'source': 'Twitter for iPhone',
 'hashtags_list': [],
 'urls_list': [],
 'public_metrics_retweet_count': 359,
 'public_metrics_reply_count': 0,
 'public_metrics_like_count': 0,
 'public_metrics_quote_count': 0,
 'pagination_token': 'b26v89c19zqg8o3fpyzn3gj1jajzu6am8xqx75egonipp',
 'current_time': datetime.datetime(2022, 11, 6, 16, 21, 43, 174611)}

NameError: name 'pd' is not defined

In [2]:
import pandas as pd

In [6]:
df_users = pd.read_csv('data/users-0.csv')

In [7]:
len(set(df_users['id']))

52