# Extract Comments
Cleans the raw comment data in `comments.json` and writes it out as a tabular Parquet file.

- Reads `data/raw/comments.json` 
- Normalizes the `comments` array into tabular columns
- Parses timestamps and adds `scraped_at`
- Saves to `data/comments.parquet`

In [1]:
import json
from pathlib import Path
import pandas as pd

# Input/output locations; relative paths resolve against the current
# working directory of the running kernel.
OUTPUT = Path('../../data')

comments_path = '../../data/raw/comments.json'
parquet_out = OUTPUT / 'comments.parquet'

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's locale default, which can mis-decode or reject
# non-ASCII text on systems whose default is not UTF-8.
with open(comments_path, 'r', encoding='utf-8') as f:
    raw = json.load(f)

In [2]:

# Turn the loaded payload into a DataFrame with a stable leading column order.
# NOTE(review): assumes `raw` is a dict holding a 'comments' list of records
# and an optional 'scraped_at' value -- confirm against the scraper output.
comments = raw.get('comments', [])
scraped_at = raw.get('scraped_at')

df = pd.DataFrame(comments)

# Guarantee the core columns exist even when no record supplied them
# (e.g. an empty 'comments' list), so the reordering below cannot fail.
for col in ['text', 'devlog_id', 'slack_id', 'created_at']:
    if col not in df.columns:
        df[col] = pd.NA

# 'created_at' always exists at this point (see loop above), so no guard is
# needed. Unparseable values are coerced to NaT; results are tz-aware UTC.
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)

if scraped_at:
    # Broadcast the payload-level scrape time to every row.
    df['scraped_at'] = pd.to_datetime(scraped_at, errors='coerce', utc=True)
else:
    # No scrape time recorded: fall back to the current UTC time.
    # Timestamp.now(tz='UTC') replaces the deprecated Timestamp.utcnow().
    df['scraped_at'] = pd.Timestamp.now(tz='UTC')

# Core columns first, followed by any extra columns in their original order.
base_cols = ['text', 'devlog_id', 'slack_id', 'created_at', 'scraped_at']
other_cols = [c for c in df.columns if c not in base_cols]
df = df[base_cols + other_cols]

In [3]:
# Write the cleaned frame to Parquet without the index, then report how many
# rows were written and where.
df.to_parquet(
    path=parquet_out,
    index=False,
    engine="pyarrow",
)
print(f'Saved {len(df)} rows to {parquet_out}')

Saved 1323 rows to ../../data/comments.parquet
