# Extract Projects
Cleans up the data from `data/raw/projects.json`.

This notebook will:
- Read `data/raw/projects.json`
- Flatten projects into tabular form
- Extract nested arrays into separate tables:
  - Projects
  - Project Followers (with `project_id`)
- Parse timestamps and add `scraped_at` to each row
- Save to `data/projects.parquet` and `data/project_followers.parquet`

In [13]:
import json
from pathlib import Path
import pandas as pd

# Root of the data directory (resolved relative to the current working directory).
OUTPUT = Path('../../data')

# Raw scrape to read, and the path of the projects parquet output.
projects_path = '../../data/raw/projects.json'
parquet_out = OUTPUT / 'projects.parquet'

# JSON is UTF-8 by specification; pin the encoding so that reading does not
# depend on the platform's locale default (e.g. cp1252 on Windows).
with open(projects_path, 'r', encoding='utf-8') as f:
    raw = json.load(f)

In [14]:
# `or []` also covers an explicit JSON null under the 'projects' key.
projects = raw.get('projects') or []
scraped_at = raw.get('scraped_at')


def parse_dt(s):
    """Parse a timestamp value into a UTC-aware ``pd.Timestamp``.

    Missing values (``None``/NaN) and unparseable input both yield ``pd.NaT``.
    """
    return pd.to_datetime(s, errors='coerce', utc=True) if pd.notna(s) else pd.NaT


# Resolve the scrape time once so every row in both tables carries the same
# value. When the file has no `scraped_at`, fall back to the current UTC time
# (a single reading, rather than a slightly different "now" for each row).
scraped_ts = parse_dt(scraped_at) if scraped_at else pd.Timestamp.now(tz='UTC')

# Scalar project fields copied into the projects table; keys absent from a
# project become None.
proj_cols = [
    'id', 'title', 'description', 'category', 'devlogs_count', 'total_seconds_coded',
    'is_shipped','readme_link','demo_link','repo_link','user_id','slack_id','x','y',
    'created_at','updated_at','banner'
]
follower_cols = ['project_id', 'follower_id', 'follower_name', 'scraped_at']

proj_records = []
for p in projects:
    rec = {k: p.get(k) for k in proj_cols}
    rec['created_at'] = parse_dt(rec['created_at'])
    rec['updated_at'] = parse_dt(rec['updated_at'])
    rec['scraped_at'] = scraped_ts
    proj_records.append(rec)

# Passing `columns` keeps the schema stable even when there are no projects.
df_projects = pd.DataFrame(proj_records, columns=proj_cols + ['scraped_at'])

# One row per (project, follower) pair; `followers` may be absent or null.
follower_records = [
    {
        'project_id': p.get('id'),
        'follower_id': follower.get('id'),
        'follower_name': follower.get('name'),
        'scraped_at': scraped_ts,
    }
    for p in projects
    for follower in (p.get('followers') or [])
]

df_followers = pd.DataFrame(follower_records, columns=follower_cols)
df_projects

Unnamed: 0,id,title,description,category,devlogs_count,total_seconds_coded,is_shipped,readme_link,demo_link,repo_link,user_id,slack_id,x,y,created_at,updated_at,banner,scraped_at
0,3,dreamland.js,Utilitarian web framework smaller than preact....,Something else,16,106565,False,https://raw.githubusercontent.com/MercuryWorks...,https://mercuryworkshop.github.io/dreamland2-p...,https://github.com/MercuryWorkshop/dreamlandjs...,14,U07UY5CR7U5,,,2025-06-16 00:00:06.951000+00:00,2025-07-30 20:03:23.829000+00:00,https://summer.hackclub.com/rails/active_stora...,2025-08-13 01:42:24.702000+00:00
1,4,Reactive,First site in react!,Web App,0,0,False,,,,12,U0824G9PTFE,,,2025-06-16 00:02:07.258000+00:00,2025-06-16 00:02:07.258000+00:00,,2025-08-13 01:42:24.702000+00:00
2,5,MusicMap,"How we feel for Music, currently has the playl...",Mobile App,9,359516,False,https://raw.githubusercontent.com/N1k0s1/Music...,https://github.com/N1k0s1/MusicMap/releases/ta...,https://github.com/N1k0s1/MusicMap,22,U07960MD940,,,2025-06-16 00:22:22.251000+00:00,2025-08-13 01:05:50.288000+00:00,https://summer.hackclub.com/rails/active_stora...,2025-08-13 01:42:24.702000+00:00
3,6,Summer of making Share votes,did it with high seas.. we doing it now,Something else,5,17946,False,https://raw.githubusercontent.com/NeonGamerBot...,https://chromewebstore.google.com/detail/som-v...,https://github.com/NeonGamerBot-QK/som-votes,5,U07L45W79E1,34.986536,65.917251,2025-06-16 00:47:20.524000+00:00,2025-07-18 20:55:05.945000+00:00,https://summer.hackclub.com/rails/active_stora...,2025-08-13 01:42:24.702000+00:00
4,7,Nomen,A web viewer and parser for every name listed ...,Web App,5,14619,False,https://raw.githubusercontent.com/aramshiva/no...,https://nomen.sh,https://github.com/aramshiva/nomen,39,U0616280E6P,91.518325,81.661442,2025-06-16 00:53:49.627000+00:00,2025-08-12 06:36:40.944000+00:00,https://summer.hackclub.com/rails/active_stora...,2025-08-13 01:42:24.702000+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,11054,Cats are Liquid (alits) Speedrun Timer (frontend),The frontend for my CaL speedrun timer.\r\n\r\...,,0,0,False,https://raw.githubusercontent.com/fily-gif/ali...,,https://github.com/fily-gif/alitsfe,3478,U082U8N45B5,,,2025-08-12 22:37:06.977000+00:00,2025-08-12 22:37:06.977000+00:00,,2025-08-13 01:42:24.702000+00:00
8470,11055,SimpleRPG,My rpg game,,0,0,False,,,,3490,U0878K0L352,,,2025-08-12 22:42:50.510000+00:00,2025-08-12 22:42:50.510000+00:00,,2025-08-13 01:42:24.702000+00:00
8471,11056,Overclock API,An API mod for Balatro that intends to keep i...,,0,0,False,https://raw.githubusercontent.com/basil-square...,,https://github.com/basil-squared/Overclock-API,2979,U07MJ75J4PK,,,2025-08-12 23:46:11.867000+00:00,2025-08-12 23:46:11.867000+00:00,,2025-08-13 01:42:24.702000+00:00
8472,11057,portfoilo,guam's portfolio (also known as mine)'s portfo...,,1,381,False,https://raw.githubusercontent.com/guamothy/por...,https://guamothy.github.io/portfoilo/,https://github.com/guamothy/portfoilo,20612,U099K0T2JM6,,,2025-08-13 00:09:01.884000+00:00,2025-08-13 00:14:01.968000+00:00,https://summer.hackclub.com/rails/active_stora...,2025-08-13 01:42:24.702000+00:00


In [15]:
# Destination file for each table.
paths = {
    'projects': OUTPUT / 'projects.parquet',
    'followers': OUTPUT / 'project_followers.parquet',
}
frames = {
    'projects': df_projects,
    'followers': df_followers,
}

# Empty tables are skipped. Record what was actually written in this run so
# the report below cannot mistake a stale file from an earlier run for a
# fresh save.
saved = []
for name, frame in frames.items():
    if frame.empty:
        continue
    frame.to_parquet(paths[name], engine="pyarrow", index=False)
    saved.append(name)

print('Saved:')
for name in saved:
    print(f' - {name}: {paths[name]}')

print('\nRow counts:')
print({name: len(frame) for name, frame in frames.items()})

Saved:
 - projects: ../../data/projects.parquet
 - followers: ../../data/project_followers.parquet

Row counts:
{'projects': 8474, 'followers': 2821}


In [16]:
# Display the followers table (one row per project/follower pair).
df_followers

Unnamed: 0,project_id,follower_id,follower_name,scraped_at
0,3,31,3rr0rc0d3z3r0,2025-08-13 01:42:24.702000+00:00
1,3,126,dave9123,2025-08-13 01:42:24.702000+00:00
2,3,5,Neon,2025-08-13 01:42:24.702000+00:00
3,3,3,nora,2025-08-13 01:42:24.702000+00:00
4,3,7077,Ghost of Gay of Ghost of Zombie,2025-08-13 01:42:24.702000+00:00
...,...,...,...,...
2816,10967,21427,Kristian Baraka,2025-08-13 01:42:24.702000+00:00
2817,10972,14057,Alex,2025-08-13 01:42:24.702000+00:00
2818,10998,2191,Sabio,2025-08-13 01:42:24.702000+00:00
2819,11011,14057,Alex,2025-08-13 01:42:24.702000+00:00
