# Minutes Item Segmentation via GPT-3.5-Turbo

## Dataset Setup

In [1]:
from cdp_data import CDPInstances, datasets
from cdp_data.utils import connect_to_infrastructure
import pandas as pd

# Connect to the Seattle CDP instance's infrastructure
connect_to_infrastructure(CDPInstances.Seattle)

# Get the Seattle session dataset for 2023-01-01 through 2023-02-15.
# `store_transcript=True` is requested because the segmentation code below
# reads each session's transcript from its `transcript_path` value
# (presumably populated by this flag -- TODO confirm).
seattle_df = datasets.get_session_dataset(
    CDPInstances.Seattle,
    start_datetime="2023-01-01",
    end_datetime="2023-02-15",
    store_transcript=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Fetching each model attached to event_ref: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 67.33it/s]
Fetching transcripts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 20.53it/s]


## Prompt Setup

In [2]:
from enum import Enum

from cdp_backend.pipeline.transcript_model import Transcript
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback 
from langchain.output_parsers import PydanticOutputParser
from langchain import PromptTemplate
from pydantic import BaseModel, Field
from langchain.schema import HumanMessage

###############################################################################

# Load environment variables from a local .env file
# (presumably where the OpenAI API key is kept -- TODO confirm)
load_dotenv()
# Chat model shared by every segmentation call below.
# NOTE(review): the notebook title says GPT-3.5-Turbo but "gpt-4" is
# configured here -- confirm which model is intended.
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

###############################################################################

# Closed set of labels a meeting section can be classified as.
# Each member's value is the human-readable label text; those values are
# collected into ALL_SECTION_LABELS below.
# NOTE(review): documented with comments rather than a docstring because
# pydantic appears to surface class docstrings in the generated JSON schema,
# which would alter the format instructions sent to the model -- confirm.
class SectionLabel(Enum):
    roll_call = "Roll Call"
    public_comment = "Public Comment"
    discussion_on_bill_or_resolution = "Discussion on Bill or Resolution"
    voting_on_bill_or_resolution = "Voting on Bill or Resolution"
    presentation_report_or_general_update = "Presentation, Report, or General Update"
    meeting_administration = "Meeting Administration"

# Human-readable label strings, in enum declaration order
ALL_SECTION_LABELS = [label.value for label in SectionLabel]
# The same labels rendered as a "- <label>" bullet list (one per line),
# which is the form handed to the prompt template as `section_labels`
ALL_SECTION_LABELS_STR = "- " + "\n- ".join(ALL_SECTION_LABELS)

# A single section of a meeting as reported by the model: its label plus the
# first and last sentence of the section. The sentence texts are requested as
# exact copies because _process_transcript matches them by string equality
# against the transcript's sentences.
# NOTE(review): kept as comments rather than a docstring because pydantic
# appears to surface class docstrings in the generated JSON schema, which
# would alter the format instructions sent to the model -- confirm.
class MeetingSection(BaseModel):
    classification: SectionLabel = Field(description="the section classification")
    first_sentence_text: str = Field(
        description="the exact copied text of the first sentence of the section",
    )
    last_sentence_text: str = Field(
        description="the exact copied text of the last sentence of the section",
    )

# Top-level model the LLM output is parsed into: the list of sections found
# in a transcript. This is the object returned by _process_transcript.
# NOTE(review): kept as comments rather than a docstring because pydantic
# appears to surface class docstrings in the generated JSON schema -- confirm.
class MeetingSegmentation(BaseModel):
    sections: list[MeetingSection]

# Parser that supplies the format instructions embedded in the prompt and
# converts the model's raw text reply into a MeetingSegmentation
MEETING_SEG_PARSER = PydanticOutputParser(pydantic_object=MeetingSegmentation)

###############################################################################

# Prompt template loaded from a jinja2 file on disk.
# `section_labels` and `format_instructions` are filled in once here;
# `transcript` is the only variable left to supply per call
# (see _process_transcript).
# NOTE(review): the template path is relative, so this presumably needs to
# run from the directory that contains `prompts/` -- confirm.
MEETING_SEG_PROMPT = PromptTemplate.from_file(
    "prompts/v0-section-seg.jinja",
    input_variables=["transcript"],
    partial_variables={
        "section_labels": ALL_SECTION_LABELS_STR,
        "format_instructions": MEETING_SEG_PARSER.get_format_instructions(),
    },
    template_format="jinja2",
)

def _find_section_bounds(
    sentence_texts: list[str],
    first_text: str,
    last_text: str,
    search_from: int = 0,
) -> tuple[int, int]:
    """
    Locate a section inside an ordered list of sentence texts.

    The search for the first sentence starts at ``search_from`` so that
    repeated sentences (e.g. "Thank you so much.") bind to the occurrence
    after the previous section. If nothing is found from there, the search is
    retried from the beginning of the list.

    Parameters
    ----------
    sentence_texts: list[str]
        The sentence texts to search, in transcript order.
    first_text: str
        The exact text of the section's first sentence.
    last_text: str
        The exact text of the section's last sentence.
    search_from: int
        The index at which to start looking for ``first_text``.

    Returns
    -------
    tuple[int, int]
        The ``(start, end)`` indices of the section, ``end`` being exclusive.

    Raises
    ------
    ValueError
        If no first sentence followed by a last sentence can be found.
    """
    for offset in (search_from, 0):
        try:
            start = sentence_texts.index(first_text, offset)
            # A one-sentence section has identical first and last sentences,
            # so the last sentence is searched for starting at `start` itself
            end = sentence_texts.index(last_text, start) + 1
        except ValueError:
            continue
        return start, end

    raise ValueError("Could not find matching content")


def _process_transcript(
    df: pd.DataFrame,
    index: int,
    max_sentences: int = 100,
) -> MeetingSegmentation:
    """
    Segment one session's transcript with the LLM and print the result.

    For every section returned by the model, the label, first sentence, last
    sentence, and (when both sentences can be located in the transcript) the
    full text of the section are printed. The OpenAI usage reported by the
    callback is printed last, whether or not parsing succeeded.

    Parameters
    ----------
    df: pd.DataFrame
        The session dataset. The selected row must provide ``transcript_path``.
    index: int
        The index label (used with ``df.loc``) of the session to process.
    max_sentences: int
        How many leading transcript sentences to send to the model.
        Section boundaries are only searched for within these sentences,
        because they are the only ones the model was shown.

    Returns
    -------
    MeetingSegmentation
        The parsed model output.

    Raises
    ------
    Exception
        Whatever the output parser raises when the model reply cannot be
        parsed; the raw reply is printed before the exception is re-raised.
    """
    # Get the meeting transcript
    session_details = df.loc[index]

    # Load transcript
    with open(session_details.transcript_path, encoding="utf-8") as open_f:
        transcript = Transcript.from_json(open_f.read())

    # Convert the leading sentences to the string shown to the model
    prompt_sentences = transcript.sentences[:max_sentences]
    transcript_str = "\n\n".join(s.text for s in prompt_sentences)

    # Whitespace-trimmed texts used only for matching the model's answers
    sentence_texts = [s.text.strip() for s in prompt_sentences]

    # Fill the prompt
    input_ = MEETING_SEG_PROMPT.format_prompt(transcript=transcript_str)

    # Generate and log token usage
    with get_openai_callback() as api_usage:
        output = llm([HumanMessage(content=input_.to_string())])

        try:
            # Only the parse is guarded: the error banner below points at the
            # raw model reply, which is only relevant to parse failures
            parsed_output = MEETING_SEG_PARSER.parse(output.content)

        except Exception:
            # Print output
            print("!!!! ERROR OCCURRED !!!!")
            print()
            print(output)
            print()
            print("-" * 80)
            print()
            raise

        else:
            # Sections are expected in transcript order, so each lookup
            # resumes where the previous section ended
            search_from = 0
            for section in parsed_output.sections:
                print(f"SECTION LABEL: {section.classification}")
                print(f"FIRST SENTENCE: '{section.first_sentence_text}'")
                print(f"LAST SENTENCE: '{section.last_sentence_text}'")

                try:
                    start_sentence_index, end_sentence_index = _find_section_bounds(
                        sentence_texts,
                        section.first_sentence_text.strip(),
                        section.last_sentence_text.strip(),
                        search_from,
                    )
                except ValueError:
                    print("SECTION FULL CONTENT: Could not find matching content")
                else:
                    section_text = " ".join(
                        s.text
                        for s in prompt_sentences[start_sentence_index:end_sentence_index]
                    )
                    print(f"SECTION FULL CONTENT: {section_text}")
                    search_from = end_sentence_index

                print()
                print()
            print()
            print("-" * 80)
            print()

            return parsed_output

        finally:
            # Print api usage
            print(api_usage)

## Outputs

In [3]:
# Segment the session stored under index label 0 of the dataset
seg = _process_transcript(seattle_df, 0)

SECTION LABEL: SectionLabel.roll_call
FIRST SENTENCE: 'Turn the meeting into a Councilmember Herbold?'
LAST SENTENCE: 'Five present.'
SECTION FULL CONTENT: Turn the meeting into a Councilmember Herbold? Here. Councilmember Peterson? Here. Councilmember Lewis? Present. Councilmember Nelson? Present. Mayor Mosqueda? Present. Five present.


SECTION LABEL: SectionLabel.meeting_administration
FIRST SENTENCE: 'Thank you very much Madam Clerk and again welcome to Melanie Cray who is our in-person clerk for the upcoming meetings here and thanks again to Farideh Cuevas who is our clerk from afar running things electronically and as always Emilia who is helping us coordinate as the official clerk for the city's proceedings.'
LAST SENTENCE: 'Hearing no objection today's agenda is adopted and reordered.'
SECTION FULL CONTENT: Thank you very much Madam Clerk and again welcome to Melanie Cray who is our in-person clerk for the upcoming meetings here and thanks again to Farideh Cuevas who is our cle

In [4]:
# Segment the session stored under index label 5 of the dataset
seg = _process_transcript(seattle_df, 5)

SECTION LABEL: SectionLabel.roll_call
FIRST SENTENCE: 'Good morning.'
LAST SENTENCE: 'Four present, one absent, one excused.'
SECTION FULL CONTENT: Good morning. We're getting a little bit of a late start, but this is the February 10th 2023 regularly scheduled meeting of the Neighborhoods, Education, and Planning Committee, and we're excited to call the roll. Council Member Lewis. Present. Council Member Nelson. Present. Vice Chair Sawant. Chair Morales. Here. Four present, one absent, one excused.


SECTION LABEL: SectionLabel.meeting_administration
FIRST SENTENCE: 'If there's no objection, today's agenda would be adopted.'
LAST SENTENCE: 'Hearing no objection, today's agenda is adopted.'
SECTION FULL CONTENT: If there's no objection, today's agenda would be adopted. Hearing no objection, today's agenda is adopted.


SECTION LABEL: SectionLabel.presentation_report_or_general_update
FIRST SENTENCE: 'So we have several folks on today's committee agenda, and then we'll have a community p

In [5]:
# Segment the session stored under index label 7 of the dataset
seg = _process_transcript(seattle_df, 7)

SECTION LABEL: SectionLabel.roll_call
FIRST SENTENCE: 'The February 14, 2023 meeting of the Public Safety and Human Services Committee will come to order.'
LAST SENTENCE: 'Five present.'
SECTION FULL CONTENT: The February 14, 2023 meeting of the Public Safety and Human Services Committee will come to order. It is 9.35 a.m. I'm Lisa Herbold, chair of the committee. Would the clerk please call the roll. Councilmember Mosqueda. Present. Councilmember Nelson. Present. Councilmember Pedersen. Present. Vice Chair Lewis. Present. Chair Herbold. Here. Five present.


SECTION LABEL: SectionLabel.meeting_administration
FIRST SENTENCE: 'Thank you so much.'
LAST SENTENCE: 'Seeing and hearing no objection, the agenda is adopted.'
SECTION FULL CONTENT: Thank you so much. So just a quick rundown on today's agenda. We have two items on the agenda after public comment. The first is a presentation on the Seattle Police Department 2022 Crime Report released last week. And the second is a presentation on 