In [1]:
from cdp_data import CDPInstances, datasets

# Fetch the Seattle session dataset for the requested date range.
# store_transcript=True is requested so that each row carries a local
# `transcript_path` (visible in the preview below), which later cells read.
seattle_df = datasets.get_session_dataset(
    CDPInstances.Seattle,
    start_datetime="2023-01-01",
    end_datetime="2023-05-01",
    store_transcript=True,
)
# Seeded preview so that Restart & Run All displays the same rows every time.
seattle_df.sample(3, random_state=0)

  from .autonotebook import tqdm as notebook_tqdm
Fetching each model attached to event_ref: 100%|██████████| 32/32 [00:00<00:00, 33.14it/s]
Fetching transcripts: 100%|██████████| 32/32 [00:01<00:00, 21.12it/s]


Unnamed: 0,session_datetime,session_index,session_content_hash,video_uri,video_start_time,video_end_time,caption_uri,external_source_id,id,key,event,transcript,transcript_path
3,2023-02-08 17:30:00+00:00,0,82240d413981d8162550eade2d18d1666e4aea04f57742...,https://video.seattle.gov/media/council/econ_0...,,,https://www.seattlechannel.org/documents/seatt...,,9c6836e9fcba,session/9c6836e9fcba,<cdp_backend.database.models.Event object at 0...,<cdp_backend.database.models.Transcript object...,/Users/eva/active/cdp/minutes-item-seg/cdp-dat...
28,2023-04-25 16:30:00+00:00,0,16bc21250c310933a4aff09291601498e59f0db6d21716...,https://video.seattle.gov/media/council/safe_0...,,,https://www.seattlechannel.org/documents/seatt...,,09cead5e909c,session/09cead5e909c,<cdp_backend.database.models.Event object at 0...,<cdp_backend.database.models.Transcript object...,/Users/eva/active/cdp/minutes-item-seg/cdp-dat...
4,2023-02-08 22:00:00+00:00,0,1d01974c56aefdc076f4ed6501a808931ac969c8d9152e...,https://video.seattle.gov/media/council/land_0...,,,https://www.seattlechannel.org/documents/seatt...,,254a8beedc40,session/254a8beedc40,<cdp_backend.database.models.Event object at 0...,<cdp_backend.database.models.Transcript object...,/Users/eva/active/cdp/minutes-item-seg/cdp-dat...


In [2]:
from functools import partial
import pandas as pd
from cdp_backend.pipeline.transcript_model import Transcript

def _process_session(
        row: pd.Series,
        extra_meta: dict,
        window_size: int = 4,
    ) -> list[dict]:
    """Split one session's transcript into overlapping sentence windows.

    A window of ``window_size`` consecutive sentences is slid over the
    transcript one sentence at a time; the sentence texts of each window are
    joined with single spaces to form one portion.

    Parameters
    ----------
    row:
        One row of the session dataset. Must provide ``transcript_path``
        (path to the transcript JSON file), ``event`` (an object with an
        ``id`` attribute) and ``id`` (the session id).
    extra_meta:
        Additional key/value pairs merged into every portion's ``meta``.
        They are merged last, so they override the built-in keys on clash.
    window_size:
        Number of consecutive sentences per portion. Must be at least 1.

    Returns
    -------
    list[dict]
        One ``{"text": ..., "meta": {...}}`` dict per window, where ``meta``
        holds ``event_id``, ``session_id``, ``sentence_start_index`` and the
        entries of ``extra_meta``. The list is empty when the transcript has
        fewer than ``window_size`` sentences.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    # A zero or negative window would yield empty or oddly sliced portions
    # instead of failing, so reject it up front.
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    # Read transcript. The encoding is pinned so the result does not depend on
    # the platform's locale default (JSON is conventionally UTF-8).
    with open(row.transcript_path, "r", encoding="utf-8") as open_f:
        transcript = Transcript.from_json(open_f.read())

    sentences = transcript.sentences

    # Process and create windowed portions of transcript sentences
    portions = []
    for s_i in range(len(sentences) - window_size + 1):
        content = " ".join(s.text for s in sentences[s_i: s_i + window_size])

        portions.append({
            "text": content,
            "meta": {
                "event_id": row.event.id,
                "session_id": row.id,
                "sentence_start_index": s_i,
                **extra_meta,
            },
        })

    return portions

# Bind the municipality name so the processor only needs a dataset row
process_session_seattle = partial(_process_session, extra_meta={"muni": "seattle"})

# Window every session's transcript (one list of portions per session).
# Rows are iterated explicitly rather than via DataFrame.apply(axis=1):
# apply infers its return shape from the results, and for an empty frame it
# hands back a DataFrame, whose list() is the column names, not row results.
windowed_portions = [
    process_session_seattle(row) for _, row in seattle_df.iterrows()
]
flattened_windowed_portions = [
    item for sublist in windowed_portions for item in sublist
]

# Build one row per windowed portion, with "text" and "meta" columns
processed_sessions = pd.DataFrame(flattened_windowed_portions)
# Seeded preview (reproducible on re-run), capped at the rows available
processed_sessions.sample(min(3, len(processed_sessions)), random_state=0)

Unnamed: 0,text,meta
24085,"Chris call. Woody welcome. Thank you. Hi, my N...","{'event_id': '38aeb914cde7', 'session_id': 'c0..."
2965,My pleasure. I remember that came up last time...,"{'event_id': '66d084f9f0e5', 'session_id': '9c..."
19883,"I do want to know, I worked on legislation and...","{'event_id': 'b4e8a57e226d', 'session_id': '06..."


In [3]:
# Write the windowed portions as JSON Lines: one record (row) per line
processed_sessions.to_json(
    "seattle.jsonl",
    lines=True,
    orient="records",
)