# Minutes Item Segmentation via GPT-3.5-Turbo

## Dataset Setup

In [1]:
from cdp_data import CDPInstances, datasets
from cdp_data.utils import connect_to_infrastructure
import pandas as pd

# Point cdp_data at the Seattle CDP instance before requesting any data.
connect_to_infrastructure(CDPInstances.Seattle)

# Fetch the Seattle session dataset for the 2023-01-01 .. 2023-02-15 window.
# NOTE(review): `store_transcript=True` presumably saves each transcript locally
# and records its location in a `transcript_path` column, which
# `_process_transcript` opens later in this notebook -- confirm against cdp_data.
seattle_df = datasets.get_session_dataset(
    CDPInstances.Seattle,
    start_datetime="2023-01-01",
    end_datetime="2023-02-15",
    store_transcript=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Fetching each model attached to event_ref: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 29.37it/s]
Fetching transcripts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 27.98it/s]


## Prompt Setup

In [2]:
from enum import Enum

from cdp_backend.pipeline.transcript_model import Transcript
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback 
from langchain.output_parsers import PydanticOutputParser
from langchain import PromptTemplate
from pydantic import BaseModel, Field
from langchain.schema import HumanMessage

###############################################################################

# Load environment variables from a local `.env` file.
# NOTE(review): presumably this is where the OpenAI credentials come from -- confirm.
load_dotenv()
# Single chat model instance shared by every `_process_transcript` call.
# `temperature=0` is requested to keep completions as repeatable as possible.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

###############################################################################

# Schema for one public comment as the model is asked to report it: the comment
# is identified by the text of its first and last sentences rather than by
# sentence indices.
# NOTE(review): documentation is kept in `#` comments rather than a class
# docstring because pydantic appears to copy class docstrings into the generated
# JSON schema, which would alter the format instructions sent to the model --
# confirm before converting these comments to a docstring.
class PublicComment(BaseModel):
    # Text of the comment's opening sentence. `_process_transcript` compares it
    # to transcript sentence text with `==`, so it must match exactly.
    first_sentence_text: str = Field(
        description=(
            "The text of the first sentence from the public commenters comment."
        ),
    )
    # Text of the comment's closing sentence, or None when the comment is a
    # single sentence (see the description passed to the model below).
    last_sentence_text: str | None = Field(
        description=(
            "The text of the last sentence from the public commenters comment. "
            "`None` if the comment is a single sentence long."
        ),
    )

# Top-level object the model's reply is parsed into: the list of public comments
# found in one transcript. This is the type `COMMENT_PARSER` validates against
# and the return type of `_process_transcript`.
class PublicComments(BaseModel):
    # One entry per comment reported by the model; may be empty.
    comments: list[PublicComment]

# Parser that validates the model's reply against the `PublicComments` schema
# and also supplies the format instructions embedded in the prompt below.
COMMENT_PARSER = PydanticOutputParser(pydantic_object=PublicComments)

###############################################################################

# Prompt template loaded from a Jinja2 file on disk (path is relative to the
# notebook's working directory). `transcript` is filled in per call by
# `_process_transcript`; `format_instructions` is fixed once here from the parser.
COMMENT_SEG_PROMPT = PromptTemplate.from_file(
    "prompts/v0-comment-seg.jinja",
    input_variables=["transcript"],
    partial_variables={
        "format_instructions": COMMENT_PARSER.get_format_instructions(),
    },
    template_format="jinja2",
)

def _find_comment_span(
    sentence_texts: list[str],
    first_text: str,
    last_text: str | None,
    search_from: int = 0,
) -> tuple[int, int] | None:
    """Locate a comment as a half-open ``(start, end)`` range of sentence indices.

    Parameters
    ----------
    sentence_texts:
        Text of every transcript sentence, in transcript order.
    first_text:
        Exact text of the comment's first sentence.
    last_text:
        Exact text of the comment's last sentence, or ``None`` when the
        comment consists of a single sentence.
    search_from:
        Index at which to start looking for ``first_text``.

    Returns
    -------
    tuple[int, int] | None
        ``(start, end)`` such that ``sentence_texts[start:end]`` is the comment,
        or ``None`` when either sentence cannot be found by exact match.
    """
    try:
        start = sentence_texts.index(first_text, search_from)
    except ValueError:
        return None

    # A single-sentence comment has no last sentence to look for; it ends
    # right after its first sentence.
    if last_text is None:
        return start, start + 1

    # The last sentence may be the first sentence itself, so the search
    # starts at `start` rather than `start + 1`.
    try:
        end = sentence_texts.index(last_text, start)
    except ValueError:
        return None

    return start, end + 1


def _process_transcript(
    df: pd.DataFrame,
    index: int,
    max_sentences: int | None = 200,
) -> PublicComments:
    """Ask the LLM to segment public comments in one session and print them.

    The transcript for ``df.loc[index]`` is loaded from its ``transcript_path``,
    its leading sentences are sent to the model, and each reported comment is
    mapped back onto the transcript by exact sentence-text matching and printed.
    OpenAI usage statistics are always printed once the request has returned.

    Parameters
    ----------
    df:
        Session dataset; the selected row must expose ``transcript_path``.
    index:
        Label passed to ``df.loc`` to select the session.
    max_sentences:
        Number of leading transcript sentences included in the prompt.
        ``None`` sends the whole transcript. Defaults to 200.

    Returns
    -------
    PublicComments
        The parsed model output.

    Raises
    ------
    Exception
        Any error raised while parsing or printing the model output (for
        example a parser failure when the reply does not match the schema)
        is re-raised after the raw model output has been printed.
    """
    # Get the meeting transcript
    session_details = df.loc[index]

    # Load transcript
    with open(session_details.transcript_path) as open_f:
        transcript = Transcript.from_json(open_f.read())

    # Sentence texts are reused for the prompt, for matching, and for printing
    sentence_texts = [s.text for s in transcript.sentences]
    transcript_str = "\n\n".join(sentence_texts[:max_sentences])

    # Fill the prompt
    input_ = COMMENT_SEG_PROMPT.format_prompt(transcript=transcript_str)

    # Generate and log token usage
    with get_openai_callback() as api_usage:
        output = llm([HumanMessage(content=input_.to_string())])

        # Parse and print parsed
        try:
            parsed_output = COMMENT_PARSER.parse(output.content)

            # Where the previous comment ended. Generic sentences such as
            # "Good morning." or "Thank you." repeat across comments, so each
            # comment is searched for after the previous one first.
            search_from = 0
            for comment in parsed_output.comments:
                print(f"FIRST SENTENCE: '{comment.first_sentence_text}'")
                print(f"LAST SENTENCE: '{comment.last_sentence_text}'")

                span = _find_comment_span(
                    sentence_texts,
                    comment.first_sentence_text,
                    comment.last_sentence_text,
                    search_from,
                )
                # Fall back to a whole-transcript search in case the model
                # listed the comments out of order.
                if span is None and search_from > 0:
                    span = _find_comment_span(
                        sentence_texts,
                        comment.first_sentence_text,
                        comment.last_sentence_text,
                    )

                # Only print the text if both start and end were found
                if span is not None:
                    start_sentence_index, end_sentence_index = span
                    comment_text = " ".join(
                        sentence_texts[start_sentence_index:end_sentence_index]
                    )
                    print(f"FULL COMMENT: {comment_text}")
                    # Never move the cursor backwards after a fallback match
                    search_from = max(search_from, end_sentence_index)
                else:
                    print("FULL COMMENT: Could not find matching content")

                print()
                print()
            print()
            print("-" * 80)
            print()

            return parsed_output

        except Exception:
            # Print the raw model output to help diagnose the failure
            print("!!!! ERROR OCCURRED !!!!")
            print()
            print(output)
            print()
            print("-" * 80)
            print()
            # Bare raise keeps the original traceback intact
            raise

        finally:
            # Print api usage
            print(api_usage)

## Outputs

In [3]:
# Segment public comments for the session labelled 0; `seg` holds the parsed result.
seg = _process_transcript(seattle_df, 0)

FIRST SENTENCE: 'Turn the meeting into a Councilmember Herbold?'
LAST SENTENCE: 'Five present.'
FULL COMMENT: Turn the meeting into a Councilmember Herbold? Here. Councilmember Peterson? Here. Councilmember Lewis? Present. Councilmember Nelson? Present. Mayor Mosqueda? Present. Five present.


FIRST SENTENCE: 'Thank you for helping getting us up and running today as well.'
LAST SENTENCE: 'I appreciate those who are dialing in remotely.'
FULL COMMENT: Thank you for helping getting us up and running today as well. Colleagues thanks for joining us. We do have our efforts will continue to be remote as much as possible so I appreciate my colleagues for continuing to dial in remotely and for the folks who want to provide public comment for your thoughtful approach to deciding whether or not to come in. I appreciate those who are dialing in remotely.


FIRST SENTENCE: 'We have an opportunity to hear from folks both in person and remotely but I do encourage folks to continue to remain vigilant

In [None]:
# Segment public comments for the session labelled 1 (overwrites `seg`).
seg = _process_transcript(seattle_df, 1)

In [None]:
# Segment public comments for the session labelled 2 (overwrites `seg`).
seg = _process_transcript(seattle_df, 2)

In [4]:
# Segment public comments for the session labelled 3 (overwrites `seg`).
seg = _process_transcript(seattle_df, 3)

FIRST SENTENCE: 'Thank you, Seattle.'
LAST SENTENCE: 'Thank you for your consideration.'
FULL COMMENT: Thank you, Seattle. Okay, so we have no in-person commenters signed up, so we'll go straight to remote speakers and everyone will have two minutes to speak and we'll start with Emma Weil. Please correct my mispronunciation. Thank you. Go ahead. Thank you. Thank you for allowing me to testify today on item D3. My name is Emma Weil and I am a senior policy analyst at Upturn, a nonprofit organization in Washington, D.C. that works to advance justice and use of technology. I'm here to testify about Seattle Police's use of mobile device forensics tools, or MDFTs, based on our report, Mass Extraction, which is the most comprehensive report of how police use MDFTs. These tools allow police to search your cell phone for years' worth of texts, photos, location data, online search history, and more. Crucially, at no point in human history have we casually stored so much information about oursel

In [None]:
# Segment public comments for the session labelled 4 (overwrites `seg`).
seg = _process_transcript(seattle_df, 4)

In [6]:
# Segment public comments for the session labelled 5.
# NOTE: in the saved run this call raised OutputParserException because the
# model replied with `first_sentence` / `last_sentence` keys instead of the
# `first_sentence_text` / `last_sentence_text` fields the schema requires.
seg = _process_transcript(seattle_df, 5)

!!!! ERROR OCCURRED !!!!

content='{\n    "comments": [\n        {\n            "first_sentence": "Good morning.",\n            "last_sentence": "Thank you."\n        },\n        {\n            "first_sentence": "Hi.",\n            "last_sentence": "Thank you."\n        },\n        {\n            "first_sentence": "Good morning.",\n            "last_sentence": "Thank you."\n        },\n        {\n            "first_sentence": "Hello.",\n            "last_sentence": "Thank you."\n        },\n        {\n            "first_sentence": "Hi.",\n            "last_sentence": "Thank you."\n        },\n        {\n            "first_sentence": "Hi.",\n            "last_sentence": "Thank you."\n        },\n        {\n            "first_sentence": "Good morning.",\n            "last_sentence": "Thank you."\n        },\n        {\n            "first_sentence": "Hi.",\n            "last_sentence": "Thank you."\n        },\n        {\n            "first_sentence": "Hi.",\n            "last_sentence": 

OutputParserException: Failed to parse PublicComments from completion {
    "comments": [
        {
            "first_sentence": "Good morning.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Hi.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Good morning.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Hello.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Hi.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Hi.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Good morning.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Hi.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Hi.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Good morning.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Good evening, council members, Seattle City Council members.",
            "last_sentence": "Thank you."
        },
        {
            "first_sentence": "Good morning.",
            "last_sentence": "Thank you."
        }
    ]
}. Got: 12 validation errors for PublicComments
comments -> 0 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 1 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 2 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 3 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 4 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 5 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 6 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 7 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 8 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 9 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 10 -> first_sentence_text
  field required (type=value_error.missing)
comments -> 11 -> first_sentence_text
  field required (type=value_error.missing)

In [7]:
# Segment public comments for the session labelled 6 (overwrites `seg`).
# NOTE: in the saved run no comments were printed for this session; only the
# separator and the API usage summary appear in the output.
seg = _process_transcript(seattle_df, 6)


--------------------------------------------------------------------------------

Tokens Used: 4071
	Prompt Tokens: 4064
	Completion Tokens: 7
Successful Requests: 1
Total Cost (USD): $0.01222


In [8]:
# Segment public comments for the session labelled 7 (overwrites `seg`).
seg = _process_transcript(seattle_df, 7)

FIRST SENTENCE: 'Thank you.'
LAST SENTENCE: 'How we can put in jail people who steal candy for five bucks, for example, when you make a constitutional crime and nobody touch you.'
FULL COMMENT: Thank you. The February 14, 2023 meeting of the Public Safety and Human Services Committee will come to order. It is 9.35 a.m. I'm Lisa Herbold, chair of the committee. Would the clerk please call the roll. Councilmember Mosqueda. Present. Councilmember Nelson. Present. Councilmember Pedersen. Present. Vice Chair Lewis. Present. Chair Herbold. Here. Five present. Thank you so much. So just a quick rundown on today's agenda. We have two items on the agenda after public comment. The first is a presentation on the Seattle Police Department 2022 Crime Report released last week. And the second is a presentation on all of the upcoming 2023 Human Services Department Notice of Funding Availability. So those are just the opportunities to fund services to the community arising from the major lines of busi