# Analyst Toolkit — Template

This notebook runs the toolkit step-by-step using YAML configs in `config/`.
Use `config/run_toolkit_config.yaml` to toggle modules on/off.

> TODO: Select your VS Code/Jupyter kernel.

- Optionally set `python.defaultInterpreterPath` and `jupyter.kernelsFilter` in `.vscode/settings.json`.
- Or comment those settings out to use your workspace defaults.

In [1]:
# Imports
from analyst_toolkit.m00_utils.config_loader import load_config
from analyst_toolkit.m00_utils.load_data import load_csv

# Module runners (one import per pipeline stage, all loaded up front in this cell)
# Diagnostics
from analyst_toolkit.m01_diagnostics import run_diag_pipeline
# Validation
from analyst_toolkit.m02_validation import run_validation_pipeline
# Normalization
from analyst_toolkit.m03_normalization import run_normalization_pipeline
# Duplicates
from analyst_toolkit.m04_duplicates import run_duplicates_pipeline
# Outlier Detection
from analyst_toolkit.m05_detect_outliers.run_detection_pipeline import run_outlier_detection_pipeline
# Outlier Handling
from analyst_toolkit.m06_outlier_handling import run_outlier_handling_pipeline
# Imputation
from analyst_toolkit.m07_imputation import run_imputation_pipeline
# Final Audit
from analyst_toolkit.m10_final_audit import run_final_audit_pipeline

In [2]:
# Paths and config loader helpers
from pathlib import Path
import os
def find_project_root(markers=("config","notebooks")):
    """Locate the nearest directory that contains every entry in ``markers``.

    The search starts at the current working directory and then walks up
    through each of its parents, nearest first.

    Args:
        markers: Names of files or directories that must all exist directly
            inside a candidate directory for it to count as the root.

    Returns:
        The first matching directory as a ``Path``; the current working
        directory when no candidate matches.
    """
    start = Path.cwd()
    # Lazily test the start directory followed by its ancestors.
    matching = (
        folder
        for folder in (start, *start.parents)
        if all((folder / name).exists() for name in markers)
    )
    return next(matching, start)

# Safe path formatting to avoid leaking local absolute paths
def format_path_safe(p, base=None):
    """Render a path for display without exposing machine-specific prefixes.

    Args:
        p: Path-like value to render.
        base: Directory treated as the repository root. When omitted, the
            module-level ``ROOT`` is used if it is already defined; otherwise
            ``find_project_root()`` is called.

    Returns:
        ``"<repo>"`` or ``"<repo>/<relative path>"`` when ``p`` is ``base`` or
        lies under it; ``"~"`` or ``"~/<relative path>"`` when ``p`` is the
        home directory or lies under it; otherwise ``str(p)`` unchanged.
        Relative parts are always joined with forward slashes.
    """
    def _tagged(prefix, rel):
        # relative_to() yields "." (no parts) for the anchor itself; show the
        # bare tag rather than "<repo>/." or "~/.".
        return f"{prefix}/{rel.as_posix()}" if rel.parts else prefix

    p = Path(p)
    if base is None:
        try:
            base = ROOT
        except NameError:
            # ROOT is assigned after this function is defined, so it may not exist yet.
            base = find_project_root()
    # Prefer path relative to project root
    try:
        return _tagged("<repo>", p.relative_to(base))
    except (TypeError, ValueError):
        # relative_to() raises ValueError when p does not lie under base.
        pass
    # Otherwise redact the home directory
    try:
        return _tagged("~", p.relative_to(Path.home()))
    except (RuntimeError, TypeError, ValueError):
        # Not under home, or home could not be determined: show the path as given.
        return str(p)
# Project root, computed once when this cell runs; format_path_safe and resolve read this name.
ROOT = find_project_root()
def resolve(p):
    """Return ``p`` as a string, anchoring relative paths at ``ROOT``.

    Args:
        p: Path-like value, absolute or relative.

    Returns:
        ``str(p)`` when ``p`` is already absolute, otherwise the string form
        of ``ROOT / p``.
    """
    candidate = Path(p)
    if candidate.is_absolute():
        return str(candidate)
    return str(ROOT / candidate)
# Show where the notebook is running, using redacted paths only.
print("CWD:", format_path_safe(Path.cwd()))
print("ROOT:", format_path_safe(ROOT))

# Load the master run config; the defaults below apply when a key is absent.
RUN_CONFIG_PATH = resolve("config/run_toolkit_config.yaml")
cfg = load_config(RUN_CONFIG_PATH)
run_id = cfg.get("run_id", "demo_run")
notebook_mode = cfg.get("notebook", True)
modules_cfg = cfg.get("modules", {})
print(f"run_id: {run_id} | notebook: {notebook_mode}")

# Anchor the optional entry file at the project root; a falsy value is kept as-is.
entry_path = cfg.get("pipeline_entry_path")
if entry_path:
    entry_path = resolve(entry_path)
print("pipeline_entry_path:", format_path_safe(entry_path) if entry_path else None)

CWD: <repo>/notebooks
ROOT: <repo>/.
run_id: demo_01_ | notebook: True
pipeline_entry_path: <repo>/data/raw/synthetic_penguins_v3.5.csv


In [3]:
# Optional eager load of the raw entry file (for module configs that do not
# specify their own input). df stays None when no entry path is configured
# or when the read fails.
df = None
if entry_path:
    try:
        df = load_csv(entry_path)
        print("Loaded raw data from:", format_path_safe(entry_path))
    except Exception as load_error:
        # Report only the redacted path and the exception class name.
        print("Could not load entry file:", format_path_safe(entry_path))
        print("Error:", type(load_error).__name__)

Loaded raw data from: <repo>/data/raw/synthetic_penguins_v3.5.csv


## M01 — Diagnostics

In [4]:
# Run the diagnostics stage only when the master run config switches it on.
if not modules_cfg.get('diagnostics', {}).get('run', False):
    print("Skipping diagnostics.")
else:
    # The stage's own YAML path falls back to the bundled template.
    diag_cfg_path = modules_cfg['diagnostics'].get(
        'config_path',
        'config/diag_config_template.yaml',
    )
    diag_all = load_config(resolve(diag_cfg_path))
    # Stage settings are read from the file's 'diagnostics' key.
    diag_cfg = diag_all.get('diagnostics', {})
    print("Running diagnostics...")
    df = run_diag_pipeline(
        df=df,
        config=diag_cfg,
        notebook=notebook_mode,
        run_id=run_id,
    )

Running diagnostics...


Rows,Columns
5540,15

Memory Usage
3.65 MB

Duplicate Rows,Duplicate %
0,0.0


Column,Unique Values
tag_id,2678
capture_date,1917
date_egg,1656
colony_id,19
study_name,12
island,11

Column,Dtype,Unique Values,Audit Remarks,Missing Count,Missing %
tag_id,object,2678,✅ OK,2241,40.45
species,object,5,✅ OK,166,3.0
bill_length_mm,float64,1984,✅ OK,429,7.74
bill_depth_mm,float64,862,✅ OK,417,7.53
flipper_length_mm,float64,1466,✅ OK,451,8.14
body_mass_g,float64,3328,✅ OK,406,7.33
age_group,object,7,✅ OK,121,2.18
sex,object,6,✅ OK,2739,49.44
colony_id,object,19,✅ OK,405,7.31
island,object,11,✅ OK,584,10.54


Metric,count,mean,std,min,25%,50%,75%,max,skew,kurtosis
bill_length_mm,5111.0,45.165934,5.666712,30.63,40.51,45.95,49.36,62.64,-0.145621,-0.606988
bill_depth_mm,5123.0,17.306,2.231266,12.37,15.495,17.49,19.03,23.01,-0.111777,-0.896914
flipper_length_mm,5089.0,202.234132,14.341644,162.79,191.1,199.31,214.1,252.4,0.329565,-0.615484
body_mass_g,5134.0,3853.248624,897.870628,2376.56,3219.25,3742.0,4376.2025,7378.33,0.616697,0.087967


tag_id,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,age_group,sex,colony_id,island,capture_date,health_status,study_name,clutch_completion,date_egg
,Gentoo,48.99,14.11,220.9,5890.0,Adult,Male,Torgersen North,Torgersen,2023-11-17,,PAPRI2023,Yes,2023-11-09
ADE-0001,Adelie,39.55,19.92,186.2,2500.0,Chick,Male,Biscoe West,Biscoe,2024-13-03,Underweight,PAPRI2022,Yes,2022-07-20
,Gentoo,48.23,13.0,,4536.0,Adult,Female,Biscoe West,,2024-04-14,Healthy,,Yes,2024-04-12
GEN-0001,Gentoo,46.22,13.91,212.8,2500.0,Juvenile,Female,Dream South,Dream,,Underweight,PAPRI2020,Yes,2020-04-14
,Chinstrap,49.02,16.22,192.2,3735.0,Adult,,Biscoe West,Biscoe,2022-10-03,Healthy,PAPRI2022,Yes,2022-10-02


Accordion(children=(VBox(children=(HTML(value="<h3 style='margin-top:10px'>Visual Profile</h3>"), HBox(childre…

## M02 — Validation

In [5]:
# Run the validation stage only when the master run config switches it on.
if not modules_cfg.get('validation', {}).get('run', False):
    print("Skipping validation.")
else:
    # The stage's own YAML path falls back to the bundled template.
    val_cfg_path = modules_cfg['validation'].get(
        'config_path',
        'config/validation_config_template.yaml',
    )
    val_all = load_config(resolve(val_cfg_path))
    # Stage settings are read from the file's 'validation' key.
    val_cfg = val_all.get('validation', {})
    print("Running validation...")
    df = run_validation_pipeline(
        df=df,
        config=val_cfg,
        notebook=notebook_mode,
        run_id=run_id,
    )

Running validation...


Validation Rule,Description,Status
Schema Conformity,Verify column names match the expected schema.,⚠️ Fail (2 issues)
Dtype Enforcement,Verify column data types match expectations.,✅ Pass
Categorical Values,Verify values in categorical columns are within an allowed set.,✅ Pass
Numeric Ranges,Verify values in numeric columns are within a defined range.,✅ Pass


Issue Type,Columns
Missing,example
Unexpected,"species, island, flipper_length_mm, body_mass_g, bill_depth_mm, tag_id, health_status, clutch_completion, date_egg, capture_date, colony_id, sex, study_name, bill_length_mm, age_group"


## M03 — Normalization

In [6]:
# Run the normalization stage only when the master run config switches it on.
# TODO(review): the last saved run of this cell stopped with KeyError: 'column';
# check the normalization config before relying on the cells that follow.
if not modules_cfg.get('normalization', {}).get('run', False):
    print("Skipping normalization.")
else:
    # The stage's own YAML path falls back to the bundled template.
    norm_cfg_path = modules_cfg['normalization'].get(
        'config_path',
        'config/normalization_config_template.yaml',
    )
    norm_all = load_config(resolve(norm_cfg_path))
    # Stage settings are read from the file's 'normalization' key.
    norm_cfg = norm_all.get('normalization', {})
    print("Running normalization...")
    df = run_normalization_pipeline(
        df=df,
        config=norm_cfg,
        notebook=notebook_mode,
        run_id=run_id,
    )

Running normalization...


KeyError: 'column'

## M04 — Duplicates

In [None]:
# Run the duplicates stage only when the master run config switches it on.
if not modules_cfg.get('duplicates', {}).get('run', False):
    print("Skipping duplicates.")
else:
    # The stage's own YAML path falls back to the bundled template.
    dup_cfg_path = modules_cfg['duplicates'].get(
        'config_path',
        'config/dups_config_template.yaml',
    )
    dup_all = load_config(resolve(dup_cfg_path))
    # Stage settings are read from the file's 'duplicates' key.
    dup_cfg = dup_all.get('duplicates', {})
    print("Running duplicates...")
    df = run_duplicates_pipeline(
        df=df,
        config=dup_cfg,
        notebook=notebook_mode,
        run_id=run_id,
    )

## M05 — Outlier Detection

In [None]:
# Run the outlier-detection stage only when the master run config switches it on.
# NOTE(review): detection_results is bound only when this stage runs, and no
# later cell visible in this notebook reads it; confirm whether the handling
# stage needs it passed in.
if not modules_cfg.get('outlier_detection', {}).get('run', False):
    print("Skipping outlier detection.")
else:
    # The stage's own YAML path falls back to the bundled template.
    det_cfg_path = modules_cfg['outlier_detection'].get(
        'config_path',
        'config/outlier_config_template.yaml',
    )
    det_all = load_config(resolve(det_cfg_path))
    # Stage settings are read from the file's 'outlier_detection' key.
    det_cfg = det_all.get('outlier_detection', {})
    print("Running outlier detection...")
    # This runner returns a pair: the dataframe and the detection results.
    df, detection_results = run_outlier_detection_pipeline(
        df=df,
        config=det_cfg,
        notebook=notebook_mode,
        run_id=run_id,
    )

## M06 — Outlier Handling

In [None]:
# Run the outlier-handling stage only when the master run config switches it on.
if not modules_cfg.get('outlier_handling', {}).get('run', False):
    print("Skipping outlier handling.")
else:
    # The stage's own YAML path falls back to the bundled template.
    handle_cfg_path = modules_cfg['outlier_handling'].get(
        'config_path',
        'config/handling_config_template.yaml',
    )
    handle_all = load_config(resolve(handle_cfg_path))
    # Stage settings are read from the file's 'outlier_handling' key.
    handle_cfg = handle_all.get('outlier_handling', {})
    print("Running outlier handling...")
    df = run_outlier_handling_pipeline(
        df=df,
        config=handle_cfg,
        notebook=notebook_mode,
        run_id=run_id,
    )

## M07 — Imputation

In [None]:
# Run the imputation stage only when the master run config switches it on.
if not modules_cfg.get('imputation', {}).get('run', False):
    print("Skipping imputation.")
else:
    # The stage's own YAML path falls back to the bundled template.
    imp_cfg_path = modules_cfg['imputation'].get(
        'config_path',
        'config/imputation_config_template.yaml',
    )
    imp_all = load_config(resolve(imp_cfg_path))
    # Stage settings are read from the file's 'imputation' key.
    imp_cfg = imp_all.get('imputation', {})
    print("Running imputation...")
    df = run_imputation_pipeline(
        df=df,
        config=imp_cfg,
        notebook=notebook_mode,
        run_id=run_id,
    )

## M10 — Final Audit

In [None]:
# Run the final-audit stage only when the master run config switches it on.
if not modules_cfg.get('final_audit', {}).get('run', False):
    print("Skipping final audit.")
else:
    # The stage's own YAML path falls back to the bundled template.
    audit_cfg_path = modules_cfg['final_audit'].get(
        'config_path',
        'config/final_audit_config_template.yaml',
    )
    audit_all = load_config(resolve(audit_cfg_path))
    # Stage settings are read from the file's 'final_audit' key.
    audit_cfg = audit_all.get('final_audit', {})
    print("Running final audit...")
    df = run_final_audit_pipeline(
        df=df,
        config=audit_cfg,
        notebook=notebook_mode,
        run_id=run_id,
    )

In [None]:
# Show the first rows of the final dataframe. Any failure (for example df
# being None) is reported as a message instead of raising.
try:
    display(df.head())
except Exception as preview_error:
    print("No dataframe to display or display failed:", preview_error)