In [1]:
import json
from typing import List, Dict, Any

# --- CONFIGURATION ---
# Input JSON file that the execution step opens for reading.
# Relative path, so it is resolved against the current working directory.
RAW_DATA_PATH = "../data/raw_data.json"
# Output JSON file that the execution step opens for writing the cleaned data.
CLEAN_DATA_PATH = "../data/cleaned_data.json"

def clean_user_data(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Cleans raw user data by removing duplicates, empty names, and inactive users.

    The input dictionary is modified in place and also returned.

    Args:
        data: Mapping with a required ``"users"`` list of user dicts and an
            optional ``"pages"`` list of page dicts. Each user must have an
            ``"id"``; ``"name"``, ``"friends"`` and ``"liked_pages"`` may be
            missing or ``None`` and are then treated as empty.

    Returns:
        The same ``data`` object, with ``data["users"]`` reduced to the valid,
        unique, active users and ``data["pages"]`` deduplicated by page id.

    Raises:
        KeyError: If ``"users"`` is missing, or a user/page has no ``"id"``.
    """
    users = data["users"]
    print(f"Starting ETL Process on {len(users)} users...")

    cleaned_users = []
    seen_ids = set()

    for user in users:
        # 1. Validation: Skip users with empty (or non-string) names
        name = user.get("name")
        if not isinstance(name, str) or not name.strip():
            continue

        # 2. Validation: Skip duplicate IDs (the first occurrence wins, even
        #    if it is later dropped as inactive)
        if user["id"] in seen_ids:
            continue
        seen_ids.add(user["id"])

        # 3. Transformation: Deduplicate friends list.
        #    dict.fromkeys keeps first-seen order, so the output is stable
        #    across runs (a plain set() would scramble the order).
        user["friends"] = list(dict.fromkeys(user.get("friends") or []))

        # 4. Filter: Keep only active users (must have friends or liked pages)
        if user["friends"] or user.get("liked_pages"):
            cleaned_users.append(user)

    # 5. Transformation: Deduplicate Pages (for a repeated id the last page
    #    record is kept, at the position of its first occurrence)
    unique_pages = {page["id"]: page for page in data.get("pages") or []}

    # Update data structure
    data["users"] = cleaned_users
    data["pages"] = list(unique_pages.values())

    print(f"Data Cleaning Complete. Active Users: {len(data['users'])}")
    return data

# --- EXECUTION ---
# Load the raw JSON. Only the read is inside the try, so that a failure while
# writing the output is not misreported as a missing input file.
try:
    with open(RAW_DATA_PATH, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
except FileNotFoundError:
    print("Error: Raw data file not found. Check the path.")
except json.JSONDecodeError as err:
    print(f"Error: Raw data file is not valid JSON ({err}).")
else:
    # Runs only when the input was read and parsed successfully.
    cleaned_data = clean_user_data(raw_data)

    # Errors opening the output path (e.g. missing directory) propagate
    # with their real traceback instead of being swallowed.
    with open(CLEAN_DATA_PATH, "w", encoding="utf-8") as f:
        json.dump(cleaned_data, f, indent=4)
    print(f"Saved cleaned data to {CLEAN_DATA_PATH}")

Starting ETL Process on 5 users...
Data Cleaning Complete. Active Users: 3
Saved cleaned data to ../data/cleaned_data.json
