# Public Comment Period Segmentation via GPT-3.5-Turbo-Instruct

## Dataset Setup

In [1]:
import pandas as pd

# Load annotated dataset
df = pd.read_json("trial-datasets/seattle-public-comment-period-seg-v0.jsonl", lines=True)

# Prep dataset for eval
prepped_eval_rows = []
for _, row in df.iterrows():
    # Get text
    text = row["text"]
    
    # Get meta
    meta = row["meta"]

    # Construct masses
    masses = []
    prev_index = 0
    if isinstance(row["spans"], list):
        for span in row["spans"]:
            # Choose what index to get based off label
            if span["label"] == "FIRST-SENTENCE":
                # Get start index
                mass_calc_index = span["start"]
            else:
                # Get end index
                mass_calc_index = span["end"]

            # Add masses to list
            masses.append(mass_calc_index - prev_index)

            # Update prev index
            prev_index = mass_calc_index
        
        # Add final mass
        masses.append(len(text) - prev_index)
    else:
        # Add mass for full text
        masses.append(len(text))

    # Add to list
    prepped_eval_rows.append({
        "text": row["text"],
        "meta": row["meta"],
        "true_masses": masses,
    })

# Convert to dataframe
prepped_eval_df = pd.DataFrame(prepped_eval_rows)
prepped_eval_df


Unnamed: 0,text,meta,true_masses
0,We are recording. Wonderful. Okay. Good aftern...,"{'event_id': '84bfb428c005', 'session_id': 'c4...","[1046, 563, 41062]"
1,"Good afternoon, everybody. Today is Tuesday, M...","{'event_id': 'c86c94ed1db7', 'session_id': 'c6...","[8847, 11850, 7373]"
2,"Good afternoon, everyone, the September 14th, ...","{'event_id': 'fa3fd088de8e', 'session_id': '7c...","[4282, 12860, 21504]"
3,"Discussion. 228, council meeting will come her...","{'event_id': '7cc7c93e7a63', 'session_id': 'f7...",[141623]
4,"Thank you. Have a great day. Good morning, eve...","{'event_id': 'c511fea02999', 'session_id': '93...","[447, 3749, 72862]"
5,Thank you very much. Thank you. The December 6...,"{'event_id': '2adc154d91b0', 'session_id': '6d...","[1537, 7617, 3517, 1938, 18775]"
6,"Thank you, son. Good afternoon, everybody. It'...","{'event_id': '4477546b534d', 'session_id': '72...","[1087, 8777, 2406]"
7,"Good afternoon, the September 21st, 2020 meeti...","{'event_id': '6562c700d929', 'session_id': '97...","[12166, 34099, 59278]"
8,Director Sawyer is ready to ready to go. After...,"{'event_id': 'fc5983dfdc1f', 'session_id': 'cd...","[3744, 11233, 11469, 7982, 40536]"
9,Budget meeting starting in a moment. I will ca...,"{'event_id': '23333c839436', 'session_id': '3b...","[10222, 12327, 102269]"


## Prompt Setup

In [2]:
from dotenv import load_dotenv
from langchain.chat_models import ChatAnthropic
from langchain.output_parsers import PydanticOutputParser
from langchain import PromptTemplate
from langchain.schema import HumanMessage
from pydantic import BaseModel, Field
import spacy

###############################################################################

load_dotenv()
llm = ChatAnthropic(model="claude-2.0", temperature=0)
nlp = spacy.load("en_core_web_trf")

###############################################################################

class PublicCommentPeriod(BaseModel):
    first_sentence_text: str = Field(
        description="the text of the sentence which introduces the public comment period",
    )
    last_sentence_text: str = Field(
        description="the text of the sentence which concludes the public comment period",
    )

class MultiPublicCommentPeriod(BaseModel):
    periods: list[PublicCommentPeriod] = Field(
        description="the list of public comment periods (sometimes also called public hearings)",
    )

PUBLIC_COMMENT_PERIOD_SEG_PARSER = PydanticOutputParser(pydantic_object=MultiPublicCommentPeriod)

###############################################################################

PUBLIC_COMMENT_PERIOD_SEG_PROMPT = PromptTemplate.from_file(
    "prompts/v0-public-comment-period-seg.jinja",
    input_variables=["transcript"],
    partial_variables={
        "format_instructions": PUBLIC_COMMENT_PERIOD_SEG_PARSER.get_format_instructions(),
    },
    template_format="jinja2",
)

def _process_transcript(text: str) -> list[int]:
    # Convert text to sentences
    sentences = list(nlp(text).sents)

    # Convert to prompt ready string
    transcript_str = "\n\n".join([sent.text for sent in sentences])

    # Fill the prompt
    input_ = PUBLIC_COMMENT_PERIOD_SEG_PROMPT.format_prompt(transcript=transcript_str)

    # Generate
    output = llm([HumanMessage(content=input_.to_string())])

    # Parse output
    try:
        pc_periods = PUBLIC_COMMENT_PERIOD_SEG_PARSER.parse(output.content)
        print(pc_periods)
    except:
        print(output.content)

    # Take batches of the sentences
    prev_index = 0
    predicted_masses = []
    for pc_period in pc_periods.periods:
        # Get first sentence
        if pc_period.first_sentence_text is not None:
            first_sentence_index = text.find(pc_period.first_sentence_text)
            predicted_masses.append(first_sentence_index - prev_index)
            prev_index = first_sentence_index

        # Get last sentence
        if pc_period.last_sentence_text is not None:
            last_sentence_index = text.find(pc_period.last_sentence_text)
            predicted_masses.append(last_sentence_index - prev_index)
            prev_index = last_sentence_index

    # Add final mass (or full text as mass)
    if len(predicted_masses) == 0:
        predicted_masses.append(len(text))
    else:
        predicted_masses.append(len(text) - prev_index)

    return predicted_masses

  from .autonotebook import tqdm as notebook_tqdm


## Outputs

In [3]:
_process_transcript(prepped_eval_df.iloc[0]["text"])

periods=[PublicCommentPeriod(first_sentence_text='So without further ado, my understanding is that we do not have any folks signed up for public comment or anyone in the weight room.', last_sentence_text='So that being said, will the clerk please read item one into the record?'), PublicCommentPeriod(first_sentence_text="If folks do sign up before the end of the meeting, we'll make sure to give them an opportunity to address the council.", last_sentence_text='If there is no further business to come before the committee, the committee will be adjourned.')]


[1046, 564, -119, 40821, 359]