In [None]:
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv

import pipeline  # our local pipeline.py

# Load .env so OPENAI_API_KEY is available
load_dotenv()

# Detect the project root: the nearest directory, starting at the current
# working directory and climbing upwards, that contains a data/ folder.
# Only the working directory and its five closest ancestors are considered.
_start_dir = Path.cwd()
_candidates = [_start_dir, *_start_dir.parents][:6]
ROOT_DIR = next((d for d in _candidates if (d / "data").is_dir()), None)

# Fail loudly instead of continuing with an arbitrary ancestor directory:
# every path built below would otherwise point somewhere meaningless.
if ROOT_DIR is None:
    raise FileNotFoundError(
        f"No 'data' directory found in {_start_dir} or its nearest parents; "
        "run this notebook from inside the project tree."
    )

print("Detected project root:", ROOT_DIR)

# Input file (JSONL) and the directory handed to the pipeline as output_dir,
# both expressed relative to the detected project root.
INPUT_JSONL = ROOT_DIR.joinpath(
    "data",
    "raw",
    "synthetic",
    "issue_tracking",
    "synthetic_whatsapp_client_messages.jsonl",
)
OUTPUT_DIR = ROOT_DIR.joinpath("notebooks", "issue_tracking", "cel", "results")

# Echo both paths as the cell's output for a quick visual check
(INPUT_JSONL, OUTPUT_DIR)

In [None]:
from importlib import reload

# Re-import pipeline.py so edits made since the kernel started are picked up
reload(pipeline)

# Arguments for this run, gathered in one place.
# Set test_mode to False for a full run.
# NOTE(review): max_rows presumably caps the rows processed in test mode --
# confirm against pipeline.py.
run_settings = dict(
    input_path=INPUT_JSONL,
    output_dir=OUTPUT_DIR,
    model="gpt-4.1-mini",
    test_mode=True,
    max_rows=30,
    sleep_sec=0.0,
)

results_df = pipeline.run_issue_tracking_pipeline(**run_settings)

# Preview the first rows of the result
results_df.head()

In [None]:
# Count the rows overall and those whose "error" column is populated
# (i.e. rows that hit a parsing/LLM error).
total_rows = len(results_df)
has_error = results_df["error"].notna()
print("Total rows:", total_rows)
print("Errors:", has_error.sum())

# Distribution of issue categories, keeping missing values as their own bucket
issue_counts = results_df["issue"].value_counts(dropna=False)
issue_counts

In [None]:
# See rows with errors (if any).
# Jupyter only renders a cell's *final* expression automatically, so this
# intermediate table has to be shown explicitly with display() (a builtin
# provided by the IPython kernel); otherwise it is silently discarded.
error_cols = [
    "chat_id",
    "message_id",
    "message_text",
    "raw_model_output",
    "error",
]
# Select rows and columns in a single .loc call rather than chained indexing
display(results_df.loc[results_df["error"].notna(), error_cols].head())

# See a sample mapping from raw WhatsApp message to issue row
cols = [
    "chat_id",
    "message_id",
    "timestamp",
    "sender_name",
    "message_text",
    "issue",
    "assigned_to",
    "to_inform",
    "resolution_score",
    "comments",
]
results_df[cols].head(10)