In [None]:
import json 
import pandas as pd 
import numpy as np

In [None]:
# Read the JSON export (the zip archive must already be extracted next to
# this notebook) and parse it into plain Python objects
with open("./data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The app records sit under hits -> hits; keep only each hit's "_source"
# payload and discard the rest of the envelope
apps = [record["_source"] for record in data["hits"]["hits"]]


In [2]:
# Function to process one app
def _collect_category_data(categories, data_categories, data_types):
    """Add the category names and data types found in ``categories`` to the given sets.

    ``categories`` may be ``None`` or empty, in which case nothing is added.
    """
    for cat in categories or []:
        # A missing or null category name is recorded as "" so the entry is not lost
        data_categories.add(cat.get("dataCategory") or "")
        data_types.update(cat.get("dataTypes") or [])


def process_app(entry):
    """Flatten one raw app record into a dict of features.

    Parameters
    ----------
    entry : dict
        One app record. The keys read are ``app_id``, ``app_name``,
        ``country_code``, ``metadata`` and ``privacylabels``. Any of them,
        and any nested container below them, may be missing or ``None``.

    Returns
    -------
    dict
        Keys, in insertion order: ``app_id``, ``app_name``, ``country_code``,
        ``content_rating``, ``has_in_app_purchases``, ``price``, ``currency``,
        ``genres`` (list of genre names), ``privacy_types`` (list, in input
        order), ``data_categories`` and ``data_types`` (sorted, de-duplicated
        lists).
    """
    app = {}

    # `or {}` also covers an explicit JSON null, which `.get(key, {})` does not
    metadata = entry.get("metadata") or {}

    # Basic app metadata (not including any rating values, as new devs wouldn't have this info)
    app["app_id"] = entry.get("app_id")
    app["app_name"] = entry.get("app_name")
    app["country_code"] = entry.get("country_code")
    app["content_rating"] = metadata.get("content_rating", "")
    app["has_in_app_purchases"] = metadata.get("has_in_app_purchases", False)
    app["price"] = metadata.get("price", 0)
    app["currency"] = metadata.get("currency_code", "")

    # All genres as a list of strings ("" when a genre carries no name)
    genres = metadata.get("genres") or []
    app["genres"] = [(g.get("attributes") or {}).get("name", "") for g in genres]

    # Privacy labels
    privacy_labels = entry.get("privacylabels") or {}
    privacy_details = privacy_labels.get("privacyDetails") or {}
    labels = privacy_details.get("privacyTypes") or []

    privacy_types = []
    data_categories = set()
    data_types = set()

    for label in labels:
        pt = label.get("privacyType")
        if pt:
            privacy_types.append(pt)

        # Categories can sit directly on the label ...
        _collect_category_data(label.get("dataCategories"), data_categories, data_types)

        # ... or be nested one level deeper under each purpose
        for purpose in label.get("purposes") or []:
            _collect_category_data(purpose.get("dataCategories"), data_categories, data_types)

    app["privacy_types"] = privacy_types
    # Sort so the output does not depend on set iteration order, which can
    # differ between interpreter runs; key=str keeps sorting from failing on
    # a stray non-string value
    app["data_categories"] = sorted(data_categories, key=str)
    app["data_types"] = sorted(data_types, key=str)

    return app

# Flatten every raw record into its feature dict, preserving input order
processed_apps = list(map(process_app, apps))

In [None]:
# Build one row per processed app and write the table to disk for model
# training; the positional index is left out of the file
df = pd.DataFrame(data=processed_apps)
df.to_csv(path_or_buf="data.csv", index=False)

NameError: name 'df' is not defined