In [1]:
from functools import partial

from cdp_data import CDPInstances, datasets
import pandas as pd

# How to open and convert each transcript
def _process_session(row: pd.Series, extra_meta: dict) -> dict:
    # Read transcript CSV
    transcript = pd.read_csv(row.transcript_as_csv_path)
    
    return {
        "text": " ".join(transcript.text.astype(str)),
        "meta": {
            "event_id": row.event.id,
            "session_id": row.id,
            **extra_meta,
        },
    }

# Get sessions
# Collect one {"text", "meta"} record per sampled session across all councils.
all_sessions = []
for city in [
    CDPInstances.Seattle,
    CDPInstances.Oakland,
    CDPInstances.Asheville,
    CDPInstances.Boston,
    CDPInstances.Milwaukee,
]:
    print(f"Working on council: {city}")
    # Get all sessions for city
    df = datasets.get_session_dataset(
        city,
        start_datetime="2020-01-01",
        end_datetime="2023-09-01",
        sample=10,
        store_transcript=True,
        store_transcript_as_csv=True,
        raise_on_error=False,
    )

    # Create partial that tags every record with this council's identifier
    process_session = partial(_process_session, extra_meta={"infrastructure": city})

    # Process every row to get the full transcript text as a single field.
    # Rows are iterated explicitly instead of via `df.apply(..., axis=1)`:
    # on an empty dataset `apply` returns an empty DataFrame, and `list()` of
    # that yields column labels, which would be mixed in with the records.
    all_sessions.extend(
        [process_session(row) for _, row in df.iterrows()]
    )

# Store to single dataframe
processed_sessions = pd.DataFrame(all_sessions)

# Store processed dataset to JSONL
processed_sessions.to_json("public-comment-seg-multi-city.jsonl", orient="records", lines=True)

  from .autonotebook import tqdm as notebook_tqdm


Working on council: cdp-seattle-21723dcf


Fetching each model attached to event_ref: 100%|██████████| 10/10 [00:00<00:00, 93.71it/s]
Fetching transcripts: 100%|██████████| 10/10 [00:00<00:00, 17.98it/s]
Converting transcripts: 100%|██████████| 10/10 [00:01<00:00,  5.03it/s]


Working on council: cdp-oakland-ba81c097


Fetching each model attached to event_ref: 100%|██████████| 10/10 [00:00<00:00, 29.99it/s]
Fetching transcripts: 100%|██████████| 10/10 [00:01<00:00,  8.60it/s]
Converting transcripts: 100%|██████████| 10/10 [00:02<00:00,  4.01it/s]


Working on council: cdp-asheville-ektqmrjs


Fetching each model attached to event_ref: 100%|██████████| 10/10 [00:00<00:00, 26.07it/s]
Fetching transcripts: 100%|██████████| 10/10 [00:01<00:00,  8.75it/s]
Converting transcripts: 100%|██████████| 10/10 [00:02<00:00,  4.15it/s]


Working on council: cdp-boston-c384047b


Fetching each model attached to event_ref: 100%|██████████| 10/10 [00:00<00:00, 29.89it/s]
Fetching transcripts: 100%|██████████| 10/10 [00:00<00:00, 10.23it/s]
Converting transcripts: 100%|██████████| 10/10 [00:02<00:00,  3.70it/s]


Working on council: cdp-milwaukee-9f60e352


Fetching each model attached to event_ref: 100%|██████████| 10/10 [00:00<00:00, 97.58it/s]
Fetching transcripts: 100%|██████████| 10/10 [00:00<00:00, 12.51it/s]
Converting transcripts: 100%|██████████| 10/10 [00:03<00:00,  3.27it/s]
