In [3]:
import os
import json
import re

# Directory containing the raw "profile_*.json" transparency files
directory = "../profile_transparency"
# Directory that receives one "parsed_*.json" file per input profile
output_dir = "../output/profiles"

# URLs of profiles whose "adsStatus" entries yielded no account-type capture
possibly_deleted = []

# Create the output directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Define an enum for ad statuses
class AdStatus:
    """String constants naming the ad activity reported for a profile.

    This is a plain class used as a namespace (not an ``enum.Enum``); the
    attributes are ordinary strings. They are the values returned by
    ``get_ad_status``.
    """

    # Returned when the status text contains "is not"
    NOT_RUNNING = "NotRunning"
    # Returned when the status text contains "has run"
    HAS_RUN = "HasRun"
    # Returned when the status text contains "is currently running"
    CURRENTLY_RUNNING = "CurrentlyRunning"

class EventType:
    """String constants naming the kind of a page-history event.

    This is a plain class used as a namespace (not an ``enum.Enum``); the
    attributes are ordinary strings. ``parse_event`` stores one of them under
    the "event_type" key of the dictionary it returns.
    """

    # Any history text that matches none of the known prefixes
    UNKNOWN = "Unknown"
    # Text starting with "Merged with the Page"
    MERGE = "Merge"
    # Text starting with "Changed name to"
    NAME_CHANGE = "NameChange"
    # Text starting with "Created -"
    CREATION = "Creation"


def get_ad_status(ad_entry):
    """Classify an ad-status sentence by the first known phrase it contains.

    The phrases are tested in a fixed order: "is not", then "has run", then
    "is currently running". The first phrase found decides the result.

    Args:
        ad_entry: Text of a single ad-status entry.

    Returns:
        The matching ``AdStatus`` constant, or ``None`` when the text
        contains none of the known phrases.
    """
    if "is not" in ad_entry:
        return AdStatus.NOT_RUNNING

    if "has run" in ad_entry:
        return AdStatus.HAS_RUN

    if "is currently running" in ad_entry:
        return AdStatus.CURRENTLY_RUNNING

    # No recognizable phrase in the entry.
    return None

def parse_date(date_str):
    """Convert a date such as "November 20, 2022" to "YYYY-MM-DD" format.

    Leading and trailing whitespace in ``date_str`` is ignored.

    Args:
        date_str: Date text in "<full month name> <day>, <year>" form.

    Returns:
        The date formatted as "YYYY-MM-DD", or ``None`` when ``date_str`` is
        not a string or does not match the expected format. In that case an
        "Error parsing date" message is printed.
    """
    from datetime import datetime

    # NOTE: "%B" matches full month names of the active locale.
    date_format = "%B %d, %Y"

    try:
        date_obj = datetime.strptime(date_str.strip(), date_format)
    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError/TypeError: input is not a string (e.g. None);
        # ValueError: the text does not match the expected format.
        print(f"Error parsing date: {e}")
        return None

    return date_obj.strftime("%Y-%m-%d")

def parse_event(event_str):
    """Classify a page-history description and extract the name it mentions.

    Recognized forms:
        "Merged with the Page Timpul rezolva tot"
        "Changed name to Timpul rezolva tot"
        "Created - Masca Adevarului"

    Args:
        event_str: Description text of a single history entry.

    Returns:
        A dictionary with two keys: "event_type" (an ``EventType`` constant)
        and "name" (the text after the recognized prefix, stripped). When no
        prefix matches, "event_type" is ``EventType.UNKNOWN`` and "name" is
        the whole stripped input.
    """
    known_prefixes = (
        ("Merged with the Page", EventType.MERGE),
        ("Changed name to", EventType.NAME_CHANGE),
        ("Created -", EventType.CREATION),
    )

    for prefix, kind in known_prefixes:
        if event_str.startswith(prefix):
            remainder = event_str[len(prefix):]
            return {"event_type": kind, "name": remainder.strip()}

    return {"event_type": EventType.UNKNOWN, "name": event_str.strip()}

    

# Phrase that produces each status returned by get_ad_status(). Used to cut
# the account-type capture at the phrase that was actually matched.
_AD_STATUS_PHRASES = {
    AdStatus.NOT_RUNNING: "is not",
    AdStatus.HAS_RUN: "has run",
    AdStatus.CURRENTLY_RUNNING: "is currently running",
}

# Header line of the "pageManagers" list, e.g.
# "Primary country/region location for people who manage this Page includes:"
_MANAGERS_HEADER_RE = re.compile(r"Primary .* location .* manage this .*")

# "<country> (<count>)", e.g. "Romania (2)"
_MANAGER_COUNT_RE = re.compile(r"(.+?)\s*\((\d+)\)")


def _parse_ads_status(statuses):
    """Return (flags, captures) extracted from the "adsStatus" entries.

    ``flags`` may hold the keys "running" (bool) and "has_run" (True).
    ``captures`` holds, for every classified entry, the text between the
    leading "This " and the matched status phrase.
    """
    flags = {}
    captures = []
    for status in statuses:
        if not status.startswith("This "):
            continue

        ad_status = get_ad_status(status)
        if not ad_status:
            continue

        # One capture per entry, cut at the phrase get_ad_status() matched.
        # Cutting at any other phrase that happens to occur in the entry
        # would produce bogus captures and false "inconsistent" reports.
        phrase = _AD_STATUS_PHRASES[ad_status]
        captures.append(status[5:status.find(phrase)].strip())

        if ad_status == AdStatus.NOT_RUNNING:
            flags["running"] = False
        elif ad_status == AdStatus.CURRENTLY_RUNNING:
            flags["running"] = True
        elif ad_status == AdStatus.HAS_RUN:
            flags["has_run"] = True
    return flags, captures


def _parse_history(entries):
    """Return the list of event dictionaries for the "historyEntries" list.

    Each entry is expected to be "<description>\\n<date>". Entries without a
    newline are reported and kept with a ``None`` date.
    """
    events = []
    for entry in entries:
        if "\n" in entry:
            description, date_text = entry.split("\n", 1)
            event_date = parse_date(date_text.strip())
            event = parse_event(description.strip())
        else:
            print(f"Unexpected format in history entry: {entry.strip()}")
            event = parse_event(entry.strip())
            event_date = None
        event["date"] = event_date
        events.append(event)
    return events


def _parse_page_managers(managers):
    """Return [{"country": ..., "count": ...}] for the "pageManagers" list.

    Sample input:
        ["Primary country/region location for people who manage this Page includes:",
         "Romania (2)\\nUnited Kingdom (1)"]
    """
    result = []
    for manager in managers:
        # Skip the header line
        if _MANAGERS_HEADER_RE.match(manager):
            continue

        for part in manager.split("\n"):
            text = part.strip()
            match = _MANAGER_COUNT_RE.match(text)
            if match:
                result.append({
                    "country": match.group(1).strip(),
                    "count": int(match.group(2)),
                })
            elif text == "See less":
                # Ignore the "See less" line
                continue
            else:
                # If the format is not as expected, keep the text as is
                print(f"Unexpected format in page managers: {text}")
                result.append({"country": text, "count": 0})
    return result


def _parse_profile(data, filename):
    """Build the output dictionary for one loaded profile file.

    Appends the profile URL to ``possibly_deleted`` when "adsStatus" is
    present but yields no capture. ``filename`` is used only in messages.
    """
    output = {}
    output["url"] = data.get("url", "")
    output["date"] = data.get("date", "")

    if "adsStatus" in data:
        flags, captures = _parse_ads_status(data["adsStatus"])
        output["ad_status"] = flags

        if len(set(captures)) > 1:
            print(f"Inconsistent captures in file {filename}: {captures}")
        elif not captures:
            # Use the already-defaulted URL: data["url"] would raise KeyError
            # for a file without a "url" key and abort the whole file.
            possibly_deleted.append(output["url"])
            print(f"No captures found for url {output['url']} in file {filename}")
        else:
            output["account_type"] = captures[0]

    if "historyEntries" in data:
        output["history"] = _parse_history(data["historyEntries"])

    if "pageManagers" in data:
        output["page_managers"] = _parse_page_managers(data["pageManagers"])

    return output


# Iterate over all files in the directory
for filename in os.listdir(directory):
    if not (filename.startswith("profile_") and filename.endswith(".json")):
        continue

    file_path = os.path.join(directory, filename)
    try:
        # Close the input before parsing and writing the output
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        output = _parse_profile(data, filename)

        # Write the output to a new JSON file in output/profiles. Only the
        # leading "profile_" prefix is renamed (count=1).
        output_filename = os.path.join(
            output_dir, filename.replace("profile_", "parsed_", 1)
        )
        with open(output_filename, "w", encoding="utf-8") as output_file:
            json.dump(output, output_file, ensure_ascii=False, indent=4)

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {filename}: {e}")
    except Exception as e:
        # Per-file boundary: report the problem and continue with the next file
        print(f"Error processing file {filename}: {e}")


print("Possibly deleted profiles:")
for url in possibly_deleted:
    print(url)

No captures found for url https://www.facebook.com/catalin.tarau in file profile_1F6BAFE4774036F8474905B15B60F5E4989D5E28615E1F479C0DD7FC893238EB.json
No captures found for url https://www.facebook.com/placutasuedeza in file profile_2B3E75FBA5562E6F55F69F90D7010DC16196FFB1BCF0BDDFAA7B909B8191898D.json
No captures found for url https://www.facebook.com/aur.arad.9 in file profile_34A806629F7FC06896C25DB2549326355289E5A2E9C1DED8430886352C6F258A.json
No captures found for url https://www.facebook.com/adina.iriza.2024 in file profile_4A3A4D7FB4FBFF66758E87BE08BAB030CCF1E9F6F99CE9C35FC368D5026FFA7C.json
No captures found for url https://www.facebook.com/profile.php?id=61570587221304 in file profile_555DE8DF6209F5204299CB25F8673753554B5F0CE9E136EE15320A94B98B4071.json
No captures found for url https://www.facebook.com/badea.elena.71 in file profile_570D1F28C9BC0EAB7A1432C497D01EDFA8DDBF55431A0EC364F9A66E077C149A.json
No captures found for url https://www.facebook.com/noapteamintii in file pro