# Extract Users to Parquet
Normalize `users.json` into analytics-ready Parquet datasets.

This notebook will:
- Read `data/raw/users.json` (fallback: `data/backup/users.json`)
- Build a Users table (exclude nested `projects` list)
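- Build a User Badges table with one row per badge per user (`user_id`, `badge_name`, `badge_text`, `badge_icon`)
- Parse `created_at` and `updated_at` as UTC timestamps and add a `scraped_at` column to both tables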
- Save to `data/users.parquet` and `data/user_badges.parquet`

In [1]:
# Imports
import json
from pathlib import Path
import pandas as pd

# Root of the data directory, relative to this notebook's location.
OUTPUT = Path('../../data')

# Primary input, plus the backup copy used when the primary file is absent.
user_path = '../../data/raw/users.json'
backup_path = '../../data/backup/users.json'
if not Path(user_path).exists() and Path(backup_path).exists():
    # Fall back only when the backup is really there; otherwise keep the
    # primary path so the FileNotFoundError below names the expected file.
    user_path = backup_path
parquet_out = OUTPUT / 'users.parquet'

# Decode explicitly as UTF-8 rather than relying on the platform default
# encoding, which varies between systems.
with open(user_path, 'r', encoding='utf-8') as f:
    raw = json.load(f)

In [None]:
# Pull the user list and the scrape timestamp out of the loaded payload.
users = raw.get('users', [])
scraped_at = raw.get('scraped_at')


def parse_dt(s):
    """Parse a timestamp-like value into a UTC-aware pandas Timestamp.

    Missing values (None/NaN) and values that cannot be parsed yield ``pd.NaT``.
    """
    return pd.to_datetime(s, errors='coerce', utc=True) if pd.notna(s) else pd.NaT


# Resolve the scrape timestamp once: it is the same for every row, and when the
# payload has no `scraped_at` all rows should share a single "now" value
# instead of each getting a slightly different one.
users_scraped_ts = parse_dt(scraped_at) if scraped_at else pd.Timestamp.now(tz='UTC')

# Scalar user fields to keep; nested lists on the user object are left out.
user_cols = [
    'id','slack_id','display_name','bio','projects_count','devlogs_count','votes_count','ships_count',
    'coding_time_seconds','coding_time_seconds_today','balance','created_at','updated_at','avatar','custom_css'
]
user_records = []
for u in users:
    # Fields absent from a user come through as None.
    rec = {k: u.get(k) for k in user_cols}
    rec['created_at'] = parse_dt(rec['created_at'])
    rec['updated_at'] = parse_dt(rec['updated_at'])
    rec['scraped_at'] = users_scraped_ts
    user_records.append(rec)

# Explicit columns keep the schema stable even when `users` is empty.
df_users = pd.DataFrame(user_records, columns=user_cols + ['scraped_at'])

In [3]:
# Resolve the scrape timestamp once: it is the same for every badge row, and
# when the payload has no `scraped_at` all rows should share a single "now"
# value instead of each getting a slightly different one.
badges_scraped_ts = parse_dt(scraped_at) if scraped_at else pd.Timestamp.now(tz='UTC')

# Flatten each user's `badges` list into one record per badge entry.
badge_records = []
for u in users:
    uid = u.get('id')
    # `or []` also covers users whose `badges` key is present but null.
    for b in u.get('badges', []) or []:
        badge_records.append({
            'user_id': uid,
            'badge_name': b.get('name'),
            'badge_text': b.get('text'),
            'badge_icon': b.get('icon'),
            'scraped_at': badges_scraped_ts,
        })

# Explicit columns keep the schema stable even when no user has any badges.
df_badges = pd.DataFrame(
    badge_records,
    columns=['user_id', 'badge_name', 'badge_text', 'badge_icon', 'scraped_at'],
)

In [4]:
# Destination files for the two tables.
p_users = OUTPUT / 'users.parquet'
p_badges = OUTPUT / 'user_badges.parquet'

# Write each non-empty table and remember what was written in this run, so the
# report below cannot list a stale file left over from an earlier run.
written = {}
for name, frame, path in (
    ('users', df_users, p_users),
    ('user_badges', df_badges, p_badges),
):
    if not frame.empty:
        frame.to_parquet(path, engine="pyarrow", index=False)
        written[name] = path

print('Saved:')
for name, p in written.items():
    print(f' - {name}: {p}')

print('\nRow counts:', {'users': len(df_users), 'user_badges': len(df_badges)})

Saved:
 - users: ../../data/users.parquet
 - user_badges: ../../data/user_badges.parquet

Row counts: {'users': 21143, 'user_badges': 2157}
