# Extract devlogs

Extracts devlogs from the scraped project data (`data/raw/projects.json`) into a flat table with one row per devlog.

devlogs_df schema
- devlog_id: int
- project_id: int
- project_title: string
- project_user_id: int
- project_slack_id: string
- category: string
- text: string
- text_length: int
- has_attachment: bool
- attachment_url: string | NaN
- attachment_host: string | NaN
- attachment_ext: string | NaN
- time_seconds: int
- time_minutes: float
- likes_count: int
- comments_count: int
- engagement: int (likes_count + comments_count)
- created_at: datetime64[ns, UTC]
- updated_at: datetime64[ns, UTC]
- created_date: date
- created_hour_utc: int [0..23]
- created_dow: int [0=Mon..6=Sun]
- created_week: int [1..53] (ISO)
- created_month: string (YYYY-MM)
- devlog_idx_in_project: int (1-based order by created_at within project)


Output artifacts:
- devlogs.parquet

In [None]:
%pip install pandas numpy pyarrow

In [None]:
import json
from pathlib import Path
from urllib.parse import urlparse
import pandas as pd
import numpy as np
from IPython.display import display

# Directory that receives the generated artifacts (relative to this notebook).
OUTPUT = Path('../../data')

# Scraped project data that this notebook flattens into devlog rows.
projects_path = '../../data/raw/projects.json'

# JSON is UTF-8 by specification; pin the encoding so decoding does not depend
# on the platform's locale default (e.g. cp1252 on Windows).
with open(projects_path, 'r', encoding='utf-8') as f:
    raw = json.load(f)

# Treat a missing *or* null "projects" key as "no projects" so the loops below
# can always iterate over a list.
projects = raw.get('projects') or []

In [3]:
def parse_dt(s):
    """Parse ``s`` into a UTC-aware pandas Timestamp.

    Missing or unparseable values are coerced to ``NaT`` instead of raising.
    """
    return pd.to_datetime(s, utc=True, errors='coerce')


def _created_order(stamped):
    """Sort key for ``(created_at, devlog)`` pairs: chronological, NaT last.

    ``NaT`` compares False against everything, so using it directly as a sort
    key leaves the resulting order undefined; map it to an explicit bucket.
    """
    created_at = stamped[0]
    if pd.isna(created_at):
        return (1, 0)
    return (0, created_at.value)


# One dict per devlog / per comment; turned into DataFrames further down.
rows = []
comment_rows = []
for p in projects:
    pid = p.get('id')
    ptitle = p.get('title')
    puser = p.get('user_id')
    pslack = p.get('slack_id')
    pcat = p.get('category')
    devlogs = p.get('devlogs') or []

    # Parse each created_at once and keep it next to its devlog, then order
    # the project's devlogs chronologically (the sort is stable, so ties and
    # undated entries keep their original relative order).
    stamped = sorted(
        ((parse_dt(d.get('created_at')), d) for d in devlogs),
        key=_created_order,
    )

    for i, (created_at, d) in enumerate(stamped, start=1):
        text = d.get('text') or ''
        att = d.get('attachment') or None
        parsed = urlparse(att) if att else None

        # Take the extension from the last path segment only, so a dot inside
        # a directory name is not mistaken for a file extension.
        ext = None
        if parsed is not None:
            filename = (parsed.path or '').rsplit('/', 1)[-1]
            if '.' in filename:
                ext = filename.rsplit('.', 1)[-1].lower() or None

        time_seconds = d.get('time_seconds')
        likes = d.get('likes_count') or 0
        comments = d.get('comments_count') or 0

        rows.append({
            'devlog_id': d.get('id'),
            'project_id': pid,
            'project_title': ptitle,
            'project_user_id': puser,
            'project_slack_id': pslack,
            'category': pcat,
            'text': text,
            'text_length': len(text),
            'has_attachment': bool(att),
            'attachment_url': att,
            'attachment_host': parsed.hostname if parsed else None,
            'attachment_ext': ext,
            'time_seconds': time_seconds,
            'time_minutes': (time_seconds or 0) / 60.0,
            'likes_count': likes,
            'comments_count': comments,
            'engagement': likes + comments,
            'created_at': created_at,
            'updated_at': parse_dt(d.get('updated_at')),
            # 1-based position of this devlog within its project.
            'devlog_idx_in_project': i,
        })

        for c in d.get('comments') or []:
            content = c.get('content') or ''
            comment_rows.append({
                'comment_id': c.get('id'),
                'devlog_id': d.get('id'),
                'project_id': pid,
                'content': content,
                'content_length': len(content),
                'word_count': len(content.split()),
                'created_at': parse_dt(c.get('created_at')),
            })


Build the DataFrame, normalize column dtypes, and derive calendar columns from `created_at`

In [4]:
# Build the devlog table. With no rows, fall back to an empty frame that still
# carries every base column so the steps below (and the written file) keep the
# same shape.
if rows:
    devlogs_df = pd.DataFrame(rows)
else:
    devlogs_df = pd.DataFrame(columns=[
        'devlog_id', 'project_id', 'project_title', 'project_user_id',
        'project_slack_id', 'category', 'text', 'text_length',
        'has_attachment', 'attachment_url', 'attachment_host',
        'attachment_ext', 'time_seconds', 'time_minutes', 'likes_count',
        'comments_count', 'engagement', 'created_at', 'updated_at',
        'devlog_idx_in_project',
    ])

# Normalize dtypes unconditionally: these casts are no-ops on a populated
# frame and give an empty frame real dtypes instead of all-object columns.
devlogs_df = devlogs_df.astype({
    'text_length': 'int64',
    'has_attachment': 'bool',
    'time_minutes': 'float64',
})
for col in ['project_title', 'project_slack_id', 'category',
            'attachment_host', 'attachment_ext']:
    devlogs_df[col] = devlogs_df[col].astype('category')
# Nullable Int64 keeps missing values as <NA> rather than forcing floats.
for col in ['devlog_id', 'project_id', 'project_user_id', 'time_seconds',
            'likes_count', 'comments_count', 'engagement',
            'devlog_idx_in_project']:
    devlogs_df[col] = pd.to_numeric(
        devlogs_df[col], errors='coerce').astype('Int64')
# Guarantee tz-aware UTC timestamps even when the column is empty or all-NaT,
# so the `.dt` accessor below is always available.
for col in ['created_at', 'updated_at']:
    devlogs_df[col] = pd.to_datetime(devlogs_df[col], utc=True)

# Calendar features derived from the creation timestamp (all in UTC).
created = devlogs_df['created_at']
devlogs_df['created_date'] = created.dt.date
# Int64 so rows with a missing timestamp stay integer-typed (<NA>) instead of
# silently turning the whole column into floats.
devlogs_df['created_hour_utc'] = created.dt.hour.astype('Int64')
devlogs_df['created_dow'] = created.dt.dayofweek.astype('Int64')
devlogs_df['created_week'] = created.dt.isocalendar().week.astype('Int64')
# strftime avoids the "timezone information will be dropped" warning that
# to_period() emits on tz-aware data, and leaves missing timestamps as NaN
# rather than the literal string 'NaT'.
devlogs_df['created_month'] = created.dt.strftime('%Y-%m')


In [5]:
# Create the output directory (and any missing parents) if it does not exist yet.
OUTPUT.mkdir(parents=True, exist_ok=True)


def safe_write(df: pd.DataFrame, path: Path) -> None:
    """Write ``df`` to ``path`` as Parquet (index omitted), best-effort.

    Any exception raised by the write is caught and reported on stdout
    rather than propagated, so a failed export does not abort the run.
    """
    try:
        df.to_parquet(path, index=False)
    except Exception as exc:
        print(f'Failed to write {path}:', exc)


# Persist the devlog table, then report its size and preview the first rows.
devlogs_out = OUTPUT / 'devlogs.parquet'
safe_write(devlogs_df, devlogs_out)

n_devlogs = len(devlogs_df)
n_projects = devlogs_df['project_id'].nunique()
print('devlogs:', n_devlogs, 'rows across', n_projects, 'projects')
devlogs_df.head(3)

devlogs: 27737 rows across 6290 projects


Unnamed: 0,devlog_id,project_id,project_title,project_user_id,project_slack_id,category,text,text_length,has_attachment,attachment_url,...,comments_count,engagement,created_at,updated_at,devlog_idx_in_project,created_date,created_hour_utc,created_dow,created_week,created_month
0,72,3,dreamland.js,14,U07UY5CR7U5,Something else,Apparently `.class::before:where(...)` is inva...,210,True,https://summer.hackclub.com/rails/active_stora...,...,0,0,2025-06-16 19:28:04.969000+00:00,2025-08-12 21:02:23.728000+00:00,1,2025-06-16,19,0,25,2025-06
1,12556,3,dreamland.js,14,U07UY5CR7U5,Something else,Refactored the state system to make it cleaner...,279,True,https://summer.hackclub.com/rails/active_stora...,...,0,1,2025-07-04 14:03:39.379000+00:00,2025-08-12 21:01:36.933000+00:00,2,2025-07-04,14,4,27,2025-07
2,12680,3,dreamland.js,14,U07UY5CR7U5,Something else,"Attempted class components again, but it turns...",197,True,https://summer.hackclub.com/rails/active_stora...,...,0,0,2025-07-04 17:26:32.619000+00:00,2025-08-12 21:01:14.558000+00:00,3,2025-07-04,17,4,27,2025-07
