# ProblemFinder Pipeline Demo
This notebook demonstrates how to run the deduplication and classification pipeline from Python.

In [None]:
from pathlib import Path

import pandas as pd

from problemfinder.core.cache import ResponseCache
from problemfinder.core.configuration import build_run_config
from problemfinder.core.pipeline import run_pipeline
from problemfinder.utils.io import load_dataframe
from problemfinder.utils.rate_limit import RateLimiter

# Build an argparse-style Namespace that stands in for the parsed CLI arguments
from argparse import Namespace

# Hand-built stand-in for the parsed command-line arguments. Every option is
# listed explicitly in a mapping (insertion order matches the original
# keyword order) and then wrapped in a Namespace for attribute access.
cli_options = {
    # Config file plus input/output locations
    "config": Path("config.yaml"),
    "input": Path("data/raw_data.csv"),
    "output": Path("data/labeled_sample.csv"),
    # Deduplication options
    "dedupe": 'on',
    "similarity_threshold": 0.5,
    "soft_similarity_threshold": 0.35,
    "canonical_policy": 'earliest',
    "dedupe_report": None,
    # Dataset split options
    "no_split": False,
    "train_ratio": 0.8,
    "val_ratio": 0.1,
    "test_ratio": 0.1,
    # Ensemble options
    "ensemble": 'off',
    "ensemble_members": 'direct,reasoning,rules',
    "ensemble_disagreement_threshold": 0.3,
    # Model options
    "model": 'gpt-4o',
    "temperature": None,
    "seed": None,
    # Remaining options are passed as None / False here.
    # NOTE(review): presumably build_run_config fills these from config.yaml
    # or its own defaults -- confirm against problemfinder.core.configuration.
    "cache": None,
    "cache_ttl": None,
    "cache_path": None,
    "max_workers": None,
    "rate_limit": None,
    "chunk_size": None,
    "evaluation": None,
    "evaluation_gold_set": None,
    "report_path": None,
    "resume": False,
    "resume_from": None,
    "log_level": 'INFO',
}
args = Namespace(**cli_options)

# Read the input file first, then derive the run configuration from the arguments.
df = load_dataframe(args.input)
run_config = build_run_config(args)

# Collaborators handed to the pipeline, each built from its part of the run config.
cache = ResponseCache(run_config.cache)
rate_limiter = RateLimiter(run_config.parallel.rate_limit)

results = run_pipeline(df=df, run_config=run_config, cache=cache, rate_limiter=rate_limiter)

# Final expression of the cell: the notebook displays the first rows of the
# 'canonical_df' entry of the pipeline results.
results['canonical_df'].head()