# Minutes Item Segmentation via GPT-3.5-Turbo

## Dataset Setup

In [1]:
from cdp_data import CDPInstances, datasets
from cdp_data.utils import connect_to_infrastructure
import pandas as pd

# Point cdp_data at the Seattle instance before requesting any data
connect_to_infrastructure(CDPInstances.Seattle)

# Fetch the Seattle session dataset for the 2023-01-01 / 2023-02-15 window.
# store_transcript=True is what makes transcripts available for the
# segmentation step later in the notebook.
seattle_df = datasets.get_session_dataset(
    CDPInstances.Seattle,
    store_transcript=True,
    start_datetime="2023-01-01",
    end_datetime="2023-02-15",
)

  from .autonotebook import tqdm as notebook_tqdm
Fetching each model attached to event_ref: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 56.88it/s]
Fetching transcripts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 47.67it/s]


## Prompt Setup

In [2]:
from enum import Enum

from cdp_backend.pipeline.transcript_model import Transcript
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback 
from langchain.output_parsers import PydanticOutputParser
from langchain import PromptTemplate
from pydantic import BaseModel, Field
from langchain.schema import HumanMessage

###############################################################################

# Pull settings from a local .env file into the process environment
# (presumably the OpenAI credentials — confirm).
load_dotenv()

# Chat model shared by every segmentation call; temperature is pinned to 0.
# NOTE(review): the notebook title mentions GPT-3.5-Turbo, but the model
# configured here is gpt-4 — confirm which one is intended.
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

###############################################################################

# Closed set of section labels the model may assign to a span of a meeting.
# Each member's value is the human-readable label text that is listed in the
# prompt (via ALL_SECTION_LABELS_STR).
# NOTE(review): documented with `#` comments rather than a docstring because
# this enum is a field type of a pydantic model whose schema is turned into
# prompt format instructions; a docstring may be surfaced in that schema and
# change the prompt — confirm before converting to a docstring.
class SectionLabels(Enum):
    # Only label currently defined: the public comment portion of a meeting
    public_comment = "Public Comment"

# Human-readable label values, in enum declaration order
ALL_SECTION_LABELS = [label.value for label in SectionLabels]

# The same labels rendered as a "- "-prefixed bullet list, one label per line,
# ready to be dropped into the prompt template
ALL_SECTION_LABELS_STR = "- " + "\n- ".join(ALL_SECTION_LABELS)

# One labelled section of a meeting, located by the text of its boundary
# sentences rather than by sentence indices. _process_transcript later looks
# these texts up among the transcript sentences to recover the section span.
# NOTE(review): `#` comments are used instead of a docstring because the
# model's schema is turned into prompt format instructions and a class
# docstring may be surfaced there — confirm before converting.
class MeetingSection(BaseModel):
    # Which of the SectionLabels this section belongs to
    label: SectionLabels = Field(description="the section label")
    # Text of the sentence that opens the section; the description asks the
    # model to copy it verbatim so it can be matched against the transcript
    first_sentence_text: str = Field(
        description="the exact copied text of the first sentence of the section",
    )
    # Text of the sentence that closes the section (same verbatim requirement)
    last_sentence_text: str = Field(
        description="the exact copied text of the last sentence of the section",
    )

# Top-level structure the model output is parsed into: the list of sections
# found in one transcript. This is the type returned by _process_transcript.
# NOTE(review): `#` comments are used instead of a docstring because the
# model's schema is turned into prompt format instructions and a class
# docstring may be surfaced there — confirm before converting.
class MeetingSegmentation(BaseModel):
    # Every section the model identified
    sections: list[MeetingSection]

# Parser that turns the raw model reply into a MeetingSegmentation and also
# supplies the format instructions embedded in the prompt
MEETING_SEG_PARSER = PydanticOutputParser(pydantic_object=MeetingSegmentation)

###############################################################################

# Template values that are fixed when the prompt is built; only `transcript`
# is supplied per call
_MEETING_SEG_PROMPT_PARTIALS = {
    "section_labels": ALL_SECTION_LABELS_STR,
    "format_instructions": MEETING_SEG_PARSER.get_format_instructions(),
}

# Jinja2 prompt template loaded from disk (path is relative to the working
# directory the notebook runs in)
MEETING_SEG_PROMPT = PromptTemplate.from_file(
    "prompts/v0.jinja",
    template_format="jinja2",
    input_variables=["transcript"],
    partial_variables=_MEETING_SEG_PROMPT_PARTIALS,
)

def _normalize_whitespace(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim both ends."""
    return " ".join(text.split())


def _find_section_bounds(
    sentence_texts: list[str],
    first_sentence_text: str,
    last_sentence_text: str,
) -> tuple[int, int]:
    """
    Locate a section's first and last sentence among the transcript sentences.

    Sentences are compared after whitespace normalization so that stray
    spaces or newlines in the model's copy do not prevent a match.

    Parameters
    ----------
    sentence_texts: list[str]
        Text of every transcript sentence, in transcript order.
    first_sentence_text: str
        Text the model reported as the section's first sentence.
    last_sentence_text: str
        Text the model reported as the section's last sentence.

    Returns
    -------
    tuple[int, int]
        ``(start_index, end_index)``, both inclusive. ``start_index`` is the
        first sentence equal to ``first_sentence_text``; ``end_index`` is the
        first sentence at or after it equal to ``last_sentence_text``.
        Either value is ``-1`` when no match was found.
    """
    first_target = _normalize_whitespace(first_sentence_text)
    last_target = _normalize_whitespace(last_sentence_text)

    start_index = -1
    for i, text in enumerate(sentence_texts):
        candidate = _normalize_whitespace(text)
        if start_index == -1 and candidate == first_target:
            start_index = i
        # The end is only searched for once the start is known; the same
        # sentence may serve as both start and end.
        if start_index != -1 and candidate == last_target:
            return start_index, i

    return start_index, -1


def _process_transcript(
    df: pd.DataFrame,
    index: int,
    max_sentences: int = 200,
) -> MeetingSegmentation:
    """
    Segment one session's transcript with the LLM and print what was found.

    The transcript is loaded from the row's ``transcript_path``, its leading
    sentences are sent to the model through ``MEETING_SEG_PROMPT``, and the
    reply is parsed with ``MEETING_SEG_PARSER``. For each returned section the
    label, boundary sentences and (when both boundaries can be located in the
    transcript) the full section text are printed. API usage is printed
    whether or not parsing succeeds.

    Parameters
    ----------
    df: pd.DataFrame
        Session dataset; each row must provide ``transcript_path``.
    index: int
        Index label (as used by ``df.loc``) of the session to process.
    max_sentences: int
        How many leading transcript sentences are included in the prompt.
        Default: 200 (presumably chosen to keep the prompt within the model's
        token limits — confirm)

    Returns
    -------
    MeetingSegmentation
        The parsed model output.

    Raises
    ------
    Exception
        Any error raised while parsing or printing the model output is
        re-raised after the raw output has been printed.
    """
    # Get the meeting transcript
    session_details = df.loc[index]

    # Load transcript
    with open(session_details.transcript_path) as open_f:
        transcript = Transcript.from_json(open_f.read())

    # Sentence texts are needed both for the prompt and for locating sections
    sentence_texts = [s.text for s in transcript.sentences]

    # Only the leading sentences go into the prompt
    transcript_str = "\n\n".join(sentence_texts[:max_sentences])

    # Fill the prompt
    input_ = MEETING_SEG_PROMPT.format_prompt(transcript=transcript_str)

    # Generate and log token usage
    with get_openai_callback() as api_usage:
        output = llm([HumanMessage(content=input_.to_string())])

        # Parse and print parsed
        try:
            parsed_output = MEETING_SEG_PARSER.parse(output.content)
            for section in parsed_output.sections:
                print(f"SECTION LABEL: {section.label}")
                print(f"FIRST SENTENCE: '{section.first_sentence_text}'")
                print(f"LAST SENTENCE: '{section.last_sentence_text}'")

                start_sentence_index, end_sentence_index = _find_section_bounds(
                    sentence_texts,
                    section.first_sentence_text,
                    section.last_sentence_text,
                )

                # Only print if both start and end were found
                if start_sentence_index != -1 and end_sentence_index != -1:
                    # end index is inclusive, so extend the slice by one to
                    # keep the section's last sentence in the output
                    section_text = " ".join(
                        sentence_texts[start_sentence_index : end_sentence_index + 1]
                    )
                    print(f"SECTION FULL CONTENT: {section_text}")
                else:
                    print("SECTION FULL CONTENT: Could not find matching content")

                print()
                print()
            print()
            print("-" * 80)
            print()

            return parsed_output

        except Exception:
            # Print output
            print("!!!! ERROR OCCURRED !!!!")
            print()
            print(output)
            print()
            print("-" * 80)
            print()
            # Bare raise keeps the original traceback intact
            raise

        finally:
            # Print api usage
            print(api_usage)

## Outputs

In [3]:
# Segment the session at index label 0 of seattle_df; the parsed result is
# kept in `seg` (a name the later output cells reassign).
seg = _process_transcript(seattle_df, 0)

SECTION LABEL: SectionLabels.public_comment
FIRST SENTENCE: 'Let's go on to public comment.'
LAST SENTENCE: 'The public comment period is now closed.'
SECTION FULL CONTENT: Could not find matching content



--------------------------------------------------------------------------------

Tokens Used: 4938
	Prompt Tokens: 4898
	Completion Tokens: 40
Successful Requests: 1
Total Cost (USD): $0.14934


In [4]:
# Segment the session at index label 5 of seattle_df; this overwrites the
# `seg` result from the previous cell.
seg = _process_transcript(seattle_df, 5)

SECTION LABEL: SectionLabels.public_comment
FIRST SENTENCE: 'So at this time, we will open the general in- person and hybrid public comment.'
LAST SENTENCE: 'So I request you to vote no on such an ordinance simply because they don't want to get into caste-based discrimination issues as they might lose their license.'
SECTION FULL CONTENT: So at this time, we will open the general in- person and hybrid public comment. Do we have anybody in person? Okay. We've got several folks who are signed up... Sorry. Who are signed up virtually for public comment. So I will remind folks that you need to address things that are on today's agenda. So we do... It does remain the strong intent of Council to have public comment during our regularly scheduled meetings, and the Council reserves the right to modify the public comment period at any point if we deem that the system is being abused. I will moderate the public comment period in the following manner. This comment period will be for up to 20 minu

In [5]:
# Segment the session at index label 7 of seattle_df; this overwrites the
# `seg` result from the previous cell.
seg = _process_transcript(seattle_df, 7)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-kzj3voQusmCX8xQilGiEbeXI on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-kzj3voQusmCX8xQilGiEbeXI on tokens per min. Limit: 10000 / min. Please try again in 6ms. Contact us through our help center at help.openai.com if you continue to have issues..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for 10KTPM-200RPM in organization org-kzj3voQusmCX8xQilGiEbeXI on tokens p

SECTION LABEL: SectionLabels.public_comment
FIRST SENTENCE: 'At this time, we're going to move into public comment.'
LAST SENTENCE: 'And I please ask you to do something about this. Thank you.'
SECTION FULL CONTENT: Could not find matching content



--------------------------------------------------------------------------------

Tokens Used: 3498
	Prompt Tokens: 3447
	Completion Tokens: 51
Successful Requests: 1
Total Cost (USD): $0.10647
