In [1]:
from cdp_data import CDPInstances, datasets

# Pull the Seattle session dataset for the 2023-01-01 .. 2023-05-01 window.
# The two store_* flags request that each transcript, plus a CSV rendering of
# it, is kept alongside the dataset; the returned frame points at those files
# through its `transcript_path` / `transcript_as_csv_path` columns.
seattle_df = datasets.get_session_dataset(
    CDPInstances.Seattle,
    start_datetime="2023-01-01",
    end_datetime="2023-05-01",
    store_transcript=True,
    store_transcript_as_csv=True,
)

# Preview up to three rows. The sample size is clamped because
# DataFrame.sample raises when asked for more rows than exist (e.g. a narrow
# date window), and the fixed random_state keeps the displayed rows stable
# across "Restart & Run All".
seattle_df.sample(min(3, len(seattle_df)), random_state=0)

Fetching each model attached to event_ref:   0%|          | 0/32 [00:00<?, ?it/s]

Fetching transcripts:   0%|          | 0/32 [00:00<?, ?it/s]

Converting transcripts:   0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,session_datetime,session_index,session_content_hash,video_uri,video_start_time,video_end_time,caption_uri,external_source_id,id,key,event,transcript,transcript_path,transcript_as_csv_path
20,2023-03-28 16:30:00+00:00,0,9d39534c4aff2d6950e7a279a4f91217fe6419547200d0...,https://video.seattle.gov/media/council/safe_0...,,,https://www.seattlechannel.org/documents/seatt...,,cf402ba17c19,session/cf402ba17c19,<cdp_backend.database.models.Event object at 0...,<cdp_backend.database.models.Transcript object...,/home/eva/active/cdp/minutes-item-seg/cdp-data...,/home/eva/active/cdp/minutes-item-seg/cdp-data...
25,2023-04-12 16:30:00+00:00,0,527dffa11193464222d35dc7949005c3475c0c09b4e006...,https://video.seattle.gov/media/council/econ_0...,,,https://www.seattlechannel.org/documents/seatt...,,c460a8868bbf,session/c460a8868bbf,<cdp_backend.database.models.Event object at 0...,<cdp_backend.database.models.Transcript object...,/home/eva/active/cdp/minutes-item-seg/cdp-data...,/home/eva/active/cdp/minutes-item-seg/cdp-data...
1,2023-02-06 22:00:00+00:00,0,90a971243f7ffba7aa29a6ddba812b19de0bdb3aba2859...,https://video.seattle.gov/media/council/brief_...,,,https://www.seattlechannel.org/documents/seatt...,,2f8bd5a37370,session/2f8bd5a37370,<cdp_backend.database.models.Event object at 0...,<cdp_backend.database.models.Transcript object...,/home/eva/active/cdp/minutes-item-seg/cdp-data...,/home/eva/active/cdp/minutes-item-seg/cdp-data...


In [2]:
from functools import partial
import pandas as pd

def _process_session(row: pd.Series, extra_meta: dict) -> dict:
    """Collapse one session's transcript CSV into a single text record.

    Parameters
    ----------
    row:
        One row of the session dataset. The attributes read are
        ``transcript_as_csv_path``, ``event`` (for its ``id``) and ``id``.
    extra_meta:
        Extra key/value pairs merged into the record's ``meta`` dict. They
        are unpacked after ``event_id`` / ``session_id`` and therefore win
        on a key collision.

    Returns
    -------
    dict
        ``{"text": <transcript text joined by single spaces>,
        "meta": {"event_id": ..., "session_id": ..., **extra_meta}}``
    """
    # Read the text column strictly as strings. With pandas' default NA
    # handling, empty cells and sentences such as "NA" or "null" are parsed as
    # float NaN, which makes str.join raise TypeError.
    transcript = pd.read_csv(
        row.transcript_as_csv_path,
        dtype={"text": str},
        keep_default_na=False,
    )

    # Skip empty cells so they do not introduce doubled spaces in the output.
    sentences = [sentence for sentence in transcript["text"] if sentence]

    return {
        "text": " ".join(sentences),
        "meta": {
            "event_id": row.event.id,
            "session_id": row.id,
            **extra_meta,
        },
    }

# Create partial with muni name
process_session_seattle = partial(_process_session, extra_meta={"muni": "seattle"})

# Build one {"text", "meta"} record per session. An explicit loop is used
# instead of DataFrame.apply(axis=1): on an empty frame, apply does not yield
# per-row results, and wrapping its return value in list() would collect
# column labels rather than records.
session_records = [
    process_session_seattle(session_row)
    for _, session_row in seattle_df.iterrows()
]

# Pinning the columns keeps the schema intact even when there are no sessions.
processed_sessions = pd.DataFrame(session_records, columns=["text", "meta"])

# Preview one row; clamp the size so an empty result does not raise, and seed
# the draw so the displayed row is reproducible.
processed_sessions.sample(min(1, len(processed_sessions)), random_state=0)

Unnamed: 0,text,meta
9,"Thank you, Madam Clerk. Thank you, son. Good m...","{'event_id': '292bd698ea59', 'session_id': 'd1..."


In [3]:
# Persist the processed sessions as JSON Lines: one record (row) per line.
jsonl_path = "seattle.jsonl"
processed_sessions.to_json(
    jsonl_path,
    orient="records",
    lines=True,
)