# Public Comment Period Segmentation via Claude 2

## Dataset Setup

In [4]:
import pandas as pd

# Load annotated dataset
df = pd.read_json("trial-datasets/seattle-public-comment-period-seg-v0.jsonl", lines=True)

# Evaluation subsample configuration. The seed pins which transcripts are
# evaluated so that metrics are comparable across "Restart & Run All" runs.
EVAL_SAMPLE_SIZE = 3
EVAL_SAMPLE_SEED = 0


def _spans_to_masses(text, spans):
    """Convert annotated spans into segment masses (segment lengths in characters).

    Parameters
    ----------
    text:
        The full transcript text.
    spans:
        The annotation spans for the transcript. Each span is a dict with
        "label", "start" and "end" keys. Anything that is not a list (for
        example a missing value) is treated as "no annotated boundaries".

    Returns
    -------
    list[int]
        One mass per segment; the masses always sum to ``len(text)``.
    """
    # No annotations: the whole transcript is a single segment
    if not isinstance(spans, list):
        return [len(text)]

    # A "FIRST-SENTENCE" span contributes its start offset as a boundary, any
    # other label contributes its end offset (presumably the sentence closing
    # the period -- confirm against the annotation scheme).
    # Sorting guards against spans stored out of document order, which would
    # otherwise produce negative masses.
    boundaries = sorted(
        span["start"] if span["label"] == "FIRST-SENTENCE" else span["end"]
        for span in spans
    )

    # Masses are the distances between consecutive boundaries
    masses = []
    prev_index = 0
    for boundary in boundaries:
        masses.append(boundary - prev_index)
        prev_index = boundary

    # Add final mass
    masses.append(len(text) - prev_index)
    return masses


# Prep dataset for eval
prepped_eval_rows = [
    {
        "text": row["text"],
        "meta": row["meta"],
        "true_masses": _spans_to_masses(row["text"], row["spans"]),
    }
    for _, row in df.iterrows()
]

# Convert to dataframe and keep a small, reproducible sample for evaluation
# (never ask for more rows than the dataset holds)
prepped_eval_df = pd.DataFrame(prepped_eval_rows)
prepped_eval_df = prepped_eval_df.sample(
    n=min(EVAL_SAMPLE_SIZE, len(prepped_eval_df)),
    random_state=EVAL_SAMPLE_SEED,
)
prepped_eval_df


Unnamed: 0,text,meta,true_masses
11,"The May 11th, 2022 meeting of the Seattle City...","{'event_id': 'b5e3673a68ff', 'session_id': 'c9...","[3133, 3043, 59419]"
0,We are recording. Wonderful. Okay. Good aftern...,"{'event_id': '84bfb428c005', 'session_id': 'c4...","[1046, 563, 41062]"
4,"Thank you. Have a great day. Good morning, eve...","{'event_id': 'c511fea02999', 'session_id': '93...","[447, 3749, 72862]"


## Prompt Setup

In [5]:
import json

import backoff
from dotenv import load_dotenv
from langchain.chat_models import ChatAnthropic
from langchain.output_parsers import PydanticOutputParser
from langchain import PromptTemplate
from langchain.schema import HumanMessage
from pydantic import BaseModel, Field
import spacy

###############################################################################

# Load environment variables from a .env file (presumably where the Anthropic
# API key is kept -- confirm against your local setup).
load_dotenv()
# Chat model that performs the segmentation. temperature=0 asks for the least
# random sampling; max_tokens_to_sample bounds the length of the reply.
llm = ChatAnthropic(model="claude-2.0", temperature=0, max_tokens_to_sample=4096)
# spaCy pipeline; _process_transcript uses it to split transcripts into sentences.
nlp = spacy.load("en_core_web_trf")

###############################################################################

class PublicCommentPeriod(BaseModel):
    # Schema for a single public comment period, identified by the sentences
    # that open and close it. Both fields may be null when no period was found.
    #
    # NOTE: documented with comments instead of a docstring on purpose. Pydantic
    # copies class docstrings into the generated JSON schema, which is embedded
    # in the prompt through the parser's format instructions.
    first_sentence_text: str | None = Field(
        description="the text of the sentence which introduces the public comment period, or null if no public comment period was found",
    )
    # Description fixed to match the wording of `first_sentence_text`; the
    # previous text ("or if null no ...") was garbled and is shown to the model.
    last_sentence_text: str | None = Field(
        description="the text of the sentence which concludes the public comment period, or null if no public comment period was found",
    )

class MultiPublicCommentPeriod(BaseModel):
    # Top-level schema handed to the output parser: every public comment period
    # reported for one transcript, each described by a PublicCommentPeriod.
    # (Comments rather than a docstring: a docstring may end up in the JSON
    # schema that is rendered into the prompt.)
    periods: list[PublicCommentPeriod] = Field(
        description="the list of public comment periods (sometimes also called public hearings)",
    )

# Parser built around MultiPublicCommentPeriod. It supplies the format
# instructions injected into the prompt below and is later used by
# _process_transcript to parse the model's reply.
PUBLIC_COMMENT_PERIOD_SEG_PARSER = PydanticOutputParser(pydantic_object=MultiPublicCommentPeriod)

###############################################################################

# Jinja2 prompt template read from disk. "transcript" is the only variable the
# caller has to provide at format time.
PUBLIC_COMMENT_PERIOD_SEG_PROMPT = PromptTemplate.from_file(
    "prompts/v0-public-comment-period-seg.jinja",
    input_variables=["transcript"],
    partial_variables={
        # Filled in once here so every formatted prompt carries the same
        # output-format instructions.
        "format_instructions": PUBLIC_COMMENT_PERIOD_SEG_PARSER.get_format_instructions(),
    },
    template_format="jinja2",
)

class _PublicCommentParseError(ValueError):
    """Raised when the model reply cannot be parsed into public comment periods."""


def _periods_to_masses(text: str, periods: list) -> list[int]:
    """Turn predicted public comment periods into segment masses.

    Parameters
    ----------
    text:
        The full transcript text the periods refer to.
    periods:
        Objects exposing ``first_sentence_text`` and ``last_sentence_text``
        (the parsed ``PublicCommentPeriod`` items).

    Returns
    -------
    list[int]
        Segment lengths in characters. They are always positive for non-empty
        text and always sum to ``len(text)``.
    """
    masses = []
    cursor = 0
    for period in periods:
        first_text = period.first_sentence_text
        last_text = period.last_sentence_text

        # A period needs both of its sentences to define a segment
        if not first_text or not last_text:
            continue

        # Search forward from the previous boundary only: sentences such as
        # "Thank you." repeat throughout a transcript, and matching an earlier
        # occurrence would yield negative masses.
        start = text.find(first_text, cursor)
        if start == -1:
            # The model did not quote the transcript verbatim; skip the period
            # instead of treating -1 as a real offset.
            continue

        last_start = text.find(last_text, start)
        if last_start == -1:
            continue

        # The period closes at the END of its last sentence, mirroring how the
        # annotated masses are built (start of the first-sentence span, end of
        # the closing span).
        end = last_start + len(last_text)

        for boundary in (start, end):
            # Ignore boundaries that would create an empty segment
            if cursor < boundary < len(text):
                masses.append(boundary - cursor)
                cursor = boundary

    # Add final mass (or the full text as a single mass when nothing was found)
    masses.append(len(text) - cursor)
    return masses


@backoff.on_exception(
    backoff.expo,
    (json.JSONDecodeError, _PublicCommentParseError),
    max_tries=3,
)
def _process_transcript(text: str) -> list[int]:
    """Predict public comment period boundaries for a transcript.

    The transcript is split into sentences, sent to the LLM with the
    segmentation prompt, and the reply is converted into segment masses
    (segment lengths in characters) suitable for ``segeval``.

    Parameters
    ----------
    text:
        The full transcript text.

    Returns
    -------
    list[int]
        The predicted segment masses; ``[len(text)]`` when no period is found.

    Raises
    ------
    _PublicCommentParseError
        When the model reply cannot be parsed, after the retries configured in
        the decorator are exhausted. It subclasses ``ValueError`` (and so
        ``Exception``), so callers catching ``Exception`` keep working.
    """
    # Convert text to sentences
    sentences = list(nlp(text).sents)

    # Convert to prompt ready string
    transcript_str = "\n\n".join(sent.text for sent in sentences)

    # Fill the prompt
    input_ = PUBLIC_COMMENT_PERIOD_SEG_PROMPT.format_prompt(transcript=transcript_str)

    # Generate
    output = llm([HumanMessage(content=input_.to_string())])

    # Parse output. Failures are re-raised as a dedicated error type so the
    # retry decorator actually sees them; a bare `except` would also have
    # swallowed KeyboardInterrupt.
    try:
        pc_periods = PUBLIC_COMMENT_PERIOD_SEG_PARSER.parse(output.content)
    except Exception as exc:
        print(output.content)
        raise _PublicCommentParseError("Failed to parse output") from exc

    # Convert all periods found into masses
    return _periods_to_masses(text, pc_periods.periods)

## Outputs

In [6]:
import segeval
from tqdm import tqdm

# Evaluate every sampled transcript: predict masses, then score them against
# the annotated masses with segeval.
results = []
progress = tqdm(prepped_eval_df.iterrows(), total=len(prepped_eval_df))
for _, example in progress:
    transcript = example["text"]
    reference_masses = example["true_masses"]

    # Predicted segmentation for this transcript
    hypothesis_masses = _process_transcript(transcript)

    # Boundaries within 7% of the transcript length count as near misses
    max_transposition = int(len(transcript) * 0.07)

    # Boundary similarity between the two segmentations
    similarity = segeval.boundary_similarity(
        reference_masses,
        hypothesis_masses,
        n_t=max_transposition,
    )

    # Confusion matrix, from which precision / recall / F1 are derived
    confusion = segeval.boundary_confusion_matrix(
        reference_masses,
        hypothesis_masses,
        n_t=max_transposition,
    )
    precision_score = segeval.precision(confusion)
    recall_score = segeval.recall(confusion)
    f1_score = segeval.fmeasure(confusion)

    # One record per transcript
    record = {
        "text": transcript,
        "meta": example["meta"],
        "true_masses": reference_masses,
        "predicted_masses": hypothesis_masses,
        "similarity": similarity,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    results.append(record)

# Collect the per-transcript records into a dataframe
results_df = pd.DataFrame(results)

# Report the mean of each metric across the evaluated transcripts
for label, column in (
    ("Mean Similarity:", "similarity"),
    ("Mean Precision:", "precision"),
    ("Mean Recall:", "recall"),
    ("Mean F1:", "f1"),
):
    print(label, results_df[column].mean())
print()
results_df

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Re-display the aggregate metrics and the per-transcript results without
# re-running the (slow) evaluation loop.
for label, column in (
    ("Mean Similarity:", "similarity"),
    ("Mean Precision:", "precision"),
    ("Mean Recall:", "recall"),
    ("Mean F1:", "f1"),
):
    print(label, results_df[column].mean())
print()
results_df