# Classify comments

In this notebook we analyze the survey comments. We first identify categories with the help of an LLM, then refine the categories found, and finally classify all comments of the survey into the defined categories.


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import math
import pickle
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

import data
import prompt

In [3]:
plt.style.use("ggplot")

In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key used by `OpenAI()` below — confirm.
load_dotenv()

In [None]:
# Optional: uncomment the two lines below to show every column and
# untruncated cell contents (handy when reading full comment texts).
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_colwidth", None)

In [5]:
# OpenAI API client shared by all requests in this notebook.
# NOTE(review): no key is passed explicitly, so credentials presumably come from the environment — confirm.
client = OpenAI()

## Load data


The preprocessing and the mapping to the city neighborhoods are defined in [data.py](./data.py).


In [None]:
# Load the survey export through the project-local `data` module (see data.py).
df = data.load_data("Umfrage DE submissions 2024-07-31 00_25.csv")

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.Stadtteil.str.title().value_counts().nlargest(10)

In [None]:
df.Region.value_counts()

In [None]:
# Bar chart of the number of participants per region, with the count on each bar
region_counts = df.Region.value_counts()
ax = region_counts.plot(
    kind="bar",
    title="Teilnehmende pro Region",
    width=0.8,
    color="#0160ad",
    alpha=1,
)

plt.xticks(rotation=0)
ax.set_xlabel("")
ax.set_ylim((0, 310))
ax.set_yticks(range(0, 301, 50))
# ax.set_ylabel("Anzahl")
ax.bar_label(ax.containers[0])

## EDA comments


In [None]:
# Share of respondents whose positive comment has fewer than 5 characters
short_positive = df.Positive[df.Positive.str.len() < 5]
len(short_positive) / len(df)

In [None]:
df.Positive[df.Positive.str.len() < 5]

In [None]:
# Share of respondents whose negative comment has fewer than 5 characters
short_negative = df.Negative[df.Negative.str.len() < 5]
len(short_negative) / len(df)

In [None]:
# Print every negative comment that has fewer than 5 characters, one per line
is_short_negative = df.Negative.str.len() < 5
for comment in df.Negative[is_short_negative]:
    print(comment)

In [None]:
df[(df.Negative.str.len() < 5) & (df.Positive.str.len() < 5)]

## Analyze spam


Anything that we could not map to a city neighborhood is treated as SPAM


In [None]:
# Share of submissions that could not be mapped to a city neighborhood
spam_rows = df[df.Stadtteil == "SPAM"]
len(spam_rows) / len(df)

In [None]:
df[df.Stadtteil == "SPAM"].shape

In [7]:
# Keep only the submissions that were mapped to a city neighborhood
is_spam = df.Stadtteil == "SPAM"
df = df[~is_spam]

In [None]:
df.shape

In [None]:
df.sample(20)[["Positive", "Negative"]]

In [None]:
df[(df["Positive"].str.len() <= 7) & (df["Negative"].str.len() <= 7)]

In [8]:
# Keep rows where at least one of the two comments is longer than 7 characters
positive_length = df["Positive"].str.len()
negative_length = df["Negative"].str.len()
df = df[(positive_length > 7) | (negative_length > 7)]

In [None]:
df.shape

## Detect categories


In [None]:
df.Positive.sample(10).to_list()

In [None]:
df.Negative.sample(10).to_list()

### Prompt Engineering for Categories


The first thing we do is ask GPT for a list of categories the comments belong to. In a first attempt we tried to create two lists, one for positive comments and one for negative comments, as well as a hierarchy of main and sub-categories, but all these attempts worked less well than the simplest approach: a single list of categories that is independent of the sentiment of the comment. We give the model a few examples of good categories and N samples from the data set.


In [None]:
# Ask the model for a first list of categories based on a sample of comments.
candidate_models = ("gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o-mini")
model = candidate_models[-1]
temperature = 0
n_samples = 20

# Only use rows where both comments are longer than 15 characters
df_samples = df[(df["Positive"].str.len() > 15) & (df["Negative"].str.len() > 15)][
    ["Positive", "Negative"]
]

# A fixed seed keeps the prompt (and with temperature 0 the category draft)
# reproducible; the cap avoids a ValueError when fewer rows are available.
sampled_comments = (
    df_samples.sample(min(n_samples, len(df_samples)), random_state=42)
    .values.flatten()
    .tolist()
)

response = client.chat.completions.create(
    model=model,
    # response_format={"type": "json_object"},
    messages=[
        {
            "role": "system",
            "content": prompt.SYSTEM_PROMPT_CAT_v2,
        },
        {
            "role": "user",
            "content": prompt.USER_PROMPT_CAT_v2.format(
                comments="\n".join(
                    f"{i+1}. {c}" for i, c in enumerate(sampled_comments)
                )
            ),
        },
    ],
    temperature=temperature,
)

print(response.choices[0].finish_reason)

In [None]:
print(response.choices[0].message.content)

In [15]:
# Persist the raw category draft so later cells can run without a new API call
cat_draft_path = Path("artifacts/response_cat.pkl")
with cat_draft_path.open("wb") as fh:
    pickle.dump(response.choices[0].message.content, fh)

## Improve categories


After generating the first category list, we now ask GPT to refine this list step by step by giving it another set of examples and allowing it to modify the categories but not to delete any.


In [16]:
# Read the stored category draft back from disk
cat_draft_path = Path("artifacts/response_cat.pkl")
with cat_draft_path.open("rb") as fh:
    categories_str = pickle.load(fh)

In [None]:
# One category per line: strip the leading "- " bullet and skip blank lines,
# which would otherwise end up as empty-string categories.
categories = [
    stripped for line in categories_str.splitlines() if (stripped := line.strip("- "))
]
categories

In [None]:
# Refine the category list over several rounds, each with a fresh sample of comments.
candidate_models = ("gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o-mini")
model = candidate_models[-1]
temperature = 0
n_rounds = 4
n_samples = 15

# Only use rows where both comments are longer than 15 characters
df_samples = df[(df["Positive"].str.len() > 15) & (df["Negative"].str.len() > 15)][
    ["Positive", "Negative"]
]

for batch in range(n_rounds):
    print("round ", batch)
    # Seed by round number: reproducible, yet every round sees different examples.
    # The cap avoids a ValueError when fewer rows are available.
    round_comments = (
        df_samples.sample(min(n_samples, len(df_samples)), random_state=batch)
        .values.flatten()
        .tolist()
    )
    response = client.chat.completions.create(
        model=model,
        # response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": prompt.SYSTEM_PROMPT_REFINED_v2,
            },
            {
                "role": "user",
                "content": prompt.USER_PROMPT_REFINED_v2.format(
                    comments="\n".join(
                        f"{i+1}. {c}" for i, c in enumerate(round_comments)
                    ),
                    categories="\n".join(f"- {c}" for c in categories),
                ),
            },
        ],
        temperature=temperature,
    )

    print(response.choices[0].finish_reason)
    categories_str = response.choices[0].message.content
    # Skip blank lines so they are not fed back to the model as empty "- " entries
    categories = [
        stripped
        for line in categories_str.splitlines()
        if (stripped := line.strip("- "))
    ]
    print(categories)

In [None]:
print(response.choices[0].message.content)

In [20]:
# Persist the refined category list returned by the last request
refined_path = Path("artifacts/response_refined.pkl")
with refined_path.open("wb") as fh:
    pickle.dump(response.choices[0].message.content, fh)

In [22]:
# Read the stored refined category list back from disk
refined_path = Path("artifacts/response_refined.pkl")
with refined_path.open("rb") as fh:
    categories_refined_str = pickle.load(fh)

In [None]:
# One category per line: strip the leading "- " bullet and skip blank lines,
# which would otherwise end up as empty-string categories.
categories_refined = [
    stripped
    for line in categories_refined_str.splitlines()
    if (stripped := line.strip("- "))
]
categories_refined

### Merge categories


In [None]:
# Model settings for the merge request; the last candidate is the one in use
candidate_models = ("gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o-mini")
model = candidate_models[-1]
temperature = 0

# Four hard-coded draft category sets that are passed to the merge prompt below.
# NOTE(review): presumably pasted from separate runs of the refinement step above —
# they are not regenerated when the notebook is re-run; confirm their origin.
draft_categories = """
Set 1:
- Baby-Friendly Activities
- Accessibility and Transportation
- Childcare Availability
- Community Support and Resources
- Green Spaces and Nature
- Safety and Cleanliness
- Family-Friendly Facilities
- Healthcare and Support Services
- Social Interaction Opportunities
- Educational and Developmental Programs
- Public Amenities and Facilities
- Availability of Playgrounds and Recreational Areas
- Information and Transparency about Services
Set 2:
- Public Spaces and Playgrounds
- Family Support Services
- Healthcare and Medical Facilities
- Transportation Accessibility
- Community Activities and Events
- Cleanliness and Safety
- Childcare Availability
- Indoor Play Areas
- Parent Networking Opportunities
- Dining Options for Families
- Accessibility and Inclusivity
- Green Spaces and Nature Access
- Family-Friendly Facilities and Amenities
- Childcare and Educational Resources
- Housing Affordability and Availability
Set 3:
- Family-Friendly Activities and Events
- Healthcare Access and Services
- Childcare and Kindergarten Availability
- Public Transportation Accessibility
- Parks and Green Spaces
- Community Support and Resources
- Safety and Mobility
- Cafés and Social Spaces for Families
- Cleanliness and Maintenance of Facilities
- Educational and Developmental Programs for Children
- Availability of Childcare Resources
- Support for Parents and Families
- Accessibility and Inclusivity for Families
- Bureaucratic Processes and Delays
- Infrastructure and Facilities for Families
Set 4:
- Family Support Services
- Playgrounds and Recreational Areas
- Accessibility and Transportation
- Healthcare and Medical Services
- Community Events and Activities
- Safety and Cleanliness
- Childcare and Early Education
- Social Interaction Opportunities
- Information and Resource Availability
- Environmental Quality and Green Spaces
- Availability of Family-Friendly Programs and Services
- Family-Friendly Infrastructure and Amenities
- Public Transportation and Mobility
- Availability of Childcare Facilities
"""

# Send the draft category sets to the model using the "refined final" prompts
merge_messages = [
    {"role": "system", "content": prompt.SYSTEM_PROMPT_REFINED_FINAL},
    {
        "role": "user",
        "content": prompt.USER_PROMPT_REFINED_FINAL.format(
            categories=draft_categories,
        ),
    },
]

response = client.chat.completions.create(
    model=model,
    # response_format={"type": "json_object"},
    messages=merge_messages,
    temperature=temperature,
)

print(response.choices[0].finish_reason)

In [None]:
print(response.choices[0].message.content)

In [27]:
# Persist the merged category list returned by the model
final_path = Path("artifacts/response_final.pkl")
with final_path.open("wb") as fh:
    pickle.dump(response.choices[0].message.content, fh)

In [28]:
# Read the stored merged category list back from disk
final_path = Path("artifacts/response_final.pkl")
with final_path.open("rb") as fh:
    final_categories_str = pickle.load(fh)

In [None]:
# One category per line: strip the leading "- " bullet and skip blank lines,
# which would otherwise end up as empty-string categories.
final_categories = [
    stripped
    for line in final_categories_str.splitlines()
    if (stripped := line.strip("- "))
]
final_categories

## Classify comments


After discussing the categories with Babylotse, we decided on the final list of categories


In [None]:
data.final_categories

Because we have a lot of jobs and don't want to run into API limits, we use the batch API endpoint: https://cookbook.openai.com/examples/batch_processing


In [11]:
def create_job(
    custom_id: int,
    sentiment: str,
    comment: str,
    categories: list[str],
    model: str = "gpt-4o-mini",
    temperature: float = 0,
) -> dict:
    """Build one chat-completion request for the OpenAI batch API.

    Args:
        custom_id: Identifier of the comment (the DataFrame index of its row).
        sentiment: Label appended to the id, e.g. "Positive" or "Negative".
        comment: The comment text to classify.
        categories: Category names inserted, comma-separated, into the system prompt.
        model: Chat model that should process the request.
        temperature: Sampling temperature for the request.

    Returns:
        A JSON-serializable dict describing a POST to /v1/chat/completions whose
        "custom_id" is "<custom_id>-<sentiment>".
    """
    system_prompt = prompt.SYSTEM_PROMPT_CLASSIFY.format(
        categories=",".join(categories)
    )
    return {
        "custom_id": f"{custom_id}-{sentiment}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Comment: {comment}"},
            ],
            "temperature": temperature,
        },
    }


def create_files(df: pd.DataFrame, filename: str, batch_size: int = 50) -> list[str]:
    """Write the classification requests for all comments into batch input files.

    The rows of ``df`` are split into ``ceil(len(df) / batch_size)`` roughly equal
    chunks. For every row two requests are written (one for the "Positive" and one
    for the "Negative" column), one JSON document per line.

    Args:
        df: Frame with the columns "Positive" and "Negative"; its index is used
            as the request id.
        filename: Prefix of the generated files (``batch/<filename>_batch_<n>.json``).
        batch_size: Upper bound for the number of rows per file.

    Returns:
        The paths of the files that were written, in batch order. Empty if ``df``
        has no rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    indices = df.index
    if len(indices) == 0:
        # np.array_split raises for zero sections, so there is nothing to write
        return []

    # Make sure the output directory exists on a fresh checkout
    Path("batch").mkdir(exist_ok=True)

    n_batches = math.ceil(len(indices) / batch_size)
    files = []
    for i, batch in enumerate(np.array_split(indices, n_batches)):
        filename_ = f"batch/{filename}_batch_{i+1}.json"
        files.append(filename_)
        with open(filename_, "w") as f:
            for row in df.loc[batch, ["Positive", "Negative"]].itertuples(index=True):
                for sentiment in ("Positive", "Negative"):
                    job = create_job(
                        row.Index,
                        sentiment,
                        getattr(row, sentiment),
                        data.final_categories,
                    )
                    f.write(json.dumps(job))
                    f.write("\n")

    return files

In [12]:
files = create_files(df, "batchinput")

In [None]:
files[0]

Now we can use the `client.files.create` endpoint to upload our batch files, which is necessary to start jobs.


In [15]:
# Upload every batch input file; the returned file objects carry the ids
# needed to start the batch jobs.
batch_files = []
for file in files:
    # Close each handle right after the upload instead of leaking it
    with open(file, "rb") as fh:
        batch_input_file = client.files.create(
            file=fh,
            purpose="batch",
        )
    batch_files.append(batch_input_file)

In [None]:
batch_files[0]

With each uploaded file, we have a file_id that can be used to start a job with the `client.batches.create` endpoint.


In [19]:
# Start one batch job per uploaded input file and collect the job objects
batch_jobs = []
for i, batch_file in enumerate(batch_files):
    job_metadata = {
        "description": f"Processing batch {i+1} of {len(batch_files)} with {50} comments for Babylotse"
    }
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata=job_metadata,
    )
    batch_jobs.append(batch_job)

In [23]:
# Persist the job objects so their status can be checked in a later session
jobs_path = Path("batch/batch_jobs.pkl")
with jobs_path.open("wb") as fh:
    pickle.dump(batch_jobs, fh)

In [24]:
# Read the stored job objects back from disk
jobs_path = Path("batch/batch_jobs.pkl")
with jobs_path.open("rb") as fh:
    batch_jobs = pickle.load(fh)

In [None]:
# Print the position and the current status of every batch job.
# NOTE: `i` and `batch_job` stay bound to the last job after this loop.
for i, batch_job in enumerate(batch_jobs):
    print(i, client.batches.retrieve(batch_job.id).status)

In [None]:
# only necessary if you want to wait "live", otherwise you can just use the code above to check the status on bulk as it can take up to 24 hours
while True:
    batch_info = client.batches.retrieve(batch_job.id)
    print(batch_info.status)
    if batch_info.status == "completed":
        print(f"Batch {i+1} finished.")
        break
    if batch_info.status == "failed":
        print(f"Batch {i+1} failed.")
        print(batch_info.errors)
        break
    time.sleep(5)

In [37]:
# Download the output file of every batch job into the batch/ directory
for job in batch_jobs:
    batch_info = client.batches.retrieve(job.id)
    if batch_info.output_file_id is None:
        # No output exists yet (job not finished) or no request succeeded;
        # requesting the content without an id would raise.
        print(f"Skipping batch {job.id}: status {batch_info.status}, no output file.")
        continue
    file_response = client.files.content(batch_info.output_file_id)

    with open(f"batch/{batch_info.output_file_id}.jsonl", "w") as fhandle:
        fhandle.write(file_response.text)

In [38]:
def parse_result_line(line: str) -> tuple[int, str, str]:
    """Extract (comment id, sentiment, model answer) from one batch output line.

    Args:
        line: One JSON document from a batch output file.

    Returns:
        The integer id and the sentiment encoded in "custom_id"
        ("<id>-<sentiment>") plus the content of the first choice's message.
    """
    item = json.loads(line)
    # Split from the right so a hyphen inside the id part (e.g. a minus sign)
    # cannot break the unpacking
    comment_id, sentiment = item["custom_id"].rsplit("-", 1)
    message = item["response"]["body"]["choices"][0]["message"]["content"]
    return int(comment_id), sentiment, message


# Collect the answers from all downloaded output files, separated by sentiment
files = list(Path("batch").glob("file-*"))

pos_comments = []
neg_comments = []
for file in files:
    with open(file) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank or trailing empty lines
                continue
            comment_id, sentiment, message = parse_result_line(line)
            if sentiment == "Positive":
                pos_comments.append((comment_id, message))
            elif sentiment == "Negative":
                neg_comments.append((comment_id, message))
            else:
                print(f"Error! Don't support {sentiment}")

# Order both result lists by comment id
pos_comments = sorted(pos_comments, key=lambda x: x[0])
neg_comments = sorted(neg_comments, key=lambda x: x[0])

In [None]:
len(pos_comments), len(neg_comments)

In [None]:
pos_comments[:20]

In [42]:
# Persist the classification results of both sentiments
results_by_path = {
    "artifacts/pos_results.pkl": pos_comments,
    "artifacts/neg_results.pkl": neg_comments,
}
for target, results in results_by_path.items():
    with open(target, "wb") as fh:
        pickle.dump(results, fh)