In [1]:
import datetime
import json

def _read_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at `path`."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Inputs: the accounts dump and the lookup that is indexed by profile URL
# to obtain each account's creation date.
account_data = _read_json('../output/accounts.json')
creation_dates = _read_json('../output/creation_dates.json')

# Reference date used for the age computations: the `date` field of
# accounts.json parsed as YYYY-MM-DD, or the current local time when that
# field is missing or empty.
date_str = account_data.get('date')
date = (
    datetime.datetime.strptime(date_str, '%Y-%m-%d')
    if date_str
    else datetime.datetime.now()
)

# The account records live under 'accounts' (an empty list when the key is absent).
accounts = account_data.get('accounts', [])

# Give every account an `ad_status` dict carrying a `has_run` flag (False
# unless already set) and a `creation_date` looked up by profile URL.
for account in accounts:
    status = account.get('ad_status', {})
    status.setdefault('has_run', False)
    account['ad_status'] = status

    url = account.get('profile_url')
    # Accounts whose URL is not in the lookup get an explicit None.
    account['creation_date'] = creation_dates[url] if url in creation_dates else None

# Compute the number of days between each account's creation date and the
# reference `date` of the accounts dump. Accounts without a creation date
# get `days_since_creation = None`.
# (`datetime` is already imported at the top of this cell, so it is not
# re-imported here.)
for account in accounts:
    creation_date = account.get('creation_date')
    if creation_date:
        # Parsed as 'YYYY-MM-DDTHH:MM:SS'; strptime raises ValueError for any
        # other layout. A separate name keeps the raw string and the parsed
        # datetime apart.
        created_at = datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S')
        # `.days` is the whole-day component of the difference; it is zero or
        # negative when the creation timestamp is not before `date`.
        account['days_since_creation'] = (date - created_at).days
    else:
        account['days_since_creation'] = None

# Compute follower growth rate (followers divided by days since creation) and
# store it as `follower_growth_rate` in each account's `stats` dictionary.
# The rate is None when the follower count or the account age is unknown, or
# when the age is not a positive number of days (which also avoids dividing
# by zero).
for account in accounts:
    stats = account.get('stats')
    if stats is None:
        # Attach a fresh dict to the account so the computed field is actually
        # kept; a detached default dict would be discarded after this
        # iteration and never reach the saved output.
        stats = account['stats'] = {}
    followers = stats.get('followers')
    days_since_creation = account.get('days_since_creation')

    if followers is not None and days_since_creation is not None and days_since_creation > 0:
        stats['follower_growth_rate'] = followers / days_since_creation
    else:
        stats['follower_growth_rate'] = None

Add account name history and other parsed profile details (ad status, account type, page managers)

In [2]:
import os
import hashlib

# Merge fields from each account's parsed profile file, when one exists.
# Profile files are named after the upper-case SHA-256 hex digest of the
# account's profile URL. Accounts without a profile URL or without a
# matching file are left untouched.
for account in accounts:
    profile_url = account.get('profile_url')
    if not profile_url:
        continue

    # Hash the profile_url using sha256 and convert to uppercase
    sha256_hash = hashlib.sha256(profile_url.encode('utf-8')).hexdigest().upper()
    file_path = f"../output/profiles/parsed_{sha256_hash}.json"
    if not os.path.exists(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as profile_file:
        parsed_data = json.load(profile_file)

    # Only replace `ad_status` when the parsed profile actually provides one;
    # otherwise keep whatever the account already carries instead of
    # overwriting it with None. When the parsed value is a dict, make sure it
    # still has the `has_run` flag (default False).
    parsed_ad_status = parsed_data.get('ad_status')
    if parsed_ad_status is not None:
        if isinstance(parsed_ad_status, dict):
            parsed_ad_status.setdefault('has_run', False)
        account['ad_status'] = parsed_ad_status

    # These fields are copied as-is; a key missing from the file yields None.
    account['account_type'] = parsed_data.get('account_type')
    account['history'] = parsed_data.get('history')
    account['page_managers'] = parsed_data.get('page_managers')

Add scraped addresses

In [3]:
# Load addresses.json and re-key it by profile URL without the "about" suffix.

addresses_data = {}

with open('../output/addresses.json', 'r', encoding='utf-8') as addresses_file:
    addresses_data = json.load(addresses_file)

# Iterate over a snapshot of the keys because entries are re-keyed in place.
for raw_key in list(addresses_data):
    # Entries whose value is the literal string "null" become None.
    if addresses_data[raw_key] == "null":
        addresses_data[raw_key] = None

    # Keys are profile URLs that include either /about or &sk=about; keep only
    # the part before the first marker found ('/about' is checked first).
    # NOTE(review): the markers are matched anywhere in the key, so a URL whose
    # path itself contains "/about..." is cut at that point, and re-keying can
    # overwrite an existing entry that has the same normalized URL -- confirm
    # this cannot happen with the real data.
    for marker in ('/about', '&sk=about'):
        if marker in raw_key:
            addresses_data[raw_key.split(marker)[0]] = addresses_data.pop(raw_key)
            break

# Attach the scraped address to each account; accounts with no profile URL
# or no matching entry get an explicit None.
for account in accounts:
    url = account.get('profile_url')
    account['address'] = addresses_data[url] if url and url in addresses_data else None

# Write the enriched account data to the master file as indented JSON,
# keeping non-ASCII characters unescaped.
output_file_path = '../output/accounts_master.json'
with open(output_file_path, 'w', encoding='utf-8') as master_file:
    json.dump(account_data, master_file, ensure_ascii=False, indent=4)