In [2]:
import pandas as pd
import numpy as np
import json
from openai import OpenAI
from typing import Dict, List, Any
import os
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import precision_recall_curve, f1_score, precision_score, recall_score
import warnings
import time, traceback
from typing import Any, Dict, List, Optional
import math


# Suppress deprecation warnings from Jupyter
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Read CSV and get headers - replace data.csv with some given csv
# df = pd.read_csv('/content/diabetes.csv')
# headers = df.columns.tolist()

# Example metadata - description of how the data was obtained.
# Expected to include the problem to solve, patient statistics, and previous work if applicable.

# To add: category of data (e.g. time-series, numerical, categorical, text)
# and the specified tools.

import re

In [3]:
# ---- put near your imports ----
import json, time, traceback
from typing import Any, Dict, Optional, List

class ResearchFailedError(RuntimeError):
    """Signals that a research request could not produce a usable result."""

try:
    from pydantic import BaseModel, Field, ValidationError
    _HAVE_PYDANTIC = True
except Exception:
    _HAVE_PYDANTIC = False


# ---- minimal strict schema (optional but helpful) ----
if _HAVE_PYDANTIC:
    # The schema classes only exist when the pydantic import succeeded.
    class InterpretationGuide(BaseModel):
        """Meaning of high vs. low values of a feature; both sides are optional."""
        high_values: Optional[str] = None
        low_values: Optional[str] = None

    class ExplainableFeature(BaseModel):
        """One suggested feature; only `feature_concept` is required."""
        feature_concept: str
        formula_pattern: Optional[str] = None
        physiological_explanation: Optional[str] = None
        clinical_relevance: Optional[str] = None
        # Validation rejects scores outside 1..5; an absent score stays None.
        explainability_score: Optional[int] = Field(default=None, ge=1, le=5)
        used_by: Optional[str] = None
        interpretation_guide: Optional[InterpretationGuide] = None
        source: Optional[str] = None

    class PhysiologicalResearchResponse(BaseModel):
        """Top-level research response; every key falls back to an empty value when absent."""
        explainable_features: List[ExplainableFeature] = Field(default_factory=list)
        domain_transformations: List[Dict[str, Any]] = Field(default_factory=list)
        explainable_interactions: List[Any] = Field(default_factory=list)
        features_to_avoid: List[str] = Field(default_factory=list)
        # Optional, so an explicit null in the response is accepted and kept as None.
        insights_summary: Optional[str] = ""


def _strip_markdown_fences(s: str) -> str:
    s = s.strip()
    if s.startswith("```"):
        # remove backticks and take substring from the first '{'
        s = s.strip("`")
        i = s.find("{")
        if i >= 0:
            s = s[i:]
    return s


class ResearchAgent:
    """Web research for physiologically explainable features. Hard-fail on errors (no fallback)."""

    def __init__(
        self,
        client,
        model: str = "gpt-4o-2024-08-06",
        retries: int = 0,              # 0 = no retry (minimize token use)
        backoff_sec: float = 0.6,
        require_min_features: int = 1, # fail if fewer than this survive
        min_explainability: int = 3    # filter threshold
        ):
        self.client = client
        self.model = model
        self.retries = max(0, retries)
        self.backoff_sec = backoff_sec
        self.require_min_features = require_min_features
        self.min_explainability = min_explainability

    def search(self, target: str, metadata: Dict[str, Any], feedback: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Attempt research once (or with small retry). If it fails -> raise ResearchFailedError."""
        print("  🔍 Performing physiologically-focused research with OpenAI...")

        if self.client is None:
            raise ResearchFailedError("OpenAI client is not configured.")

        # Keep prompt compact to reduce tokens and reduce failure modes.
        domain = metadata.get("domain", "general")
        problem_description = metadata.get("problem", metadata.get("description", ""))
        cols = metadata.get("column_names", [])
        cols_disp = cols[:30]  # cap column list length

        research_prompt = self._build_prompt(
            domain=domain,
            problem=problem_description,
            target=target,
            cols_disp=cols_disp,
            feedback=feedback
        )

        last_exc = None
        attempts = self.retries + 1
        for attempt in range(attempts):
            try:
                completion = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "You are a medical/biological research analyst specializing in INTERPRETABLE ML. "
                                "Return ONLY valid JSON (no markdown)."
                            )
                        },
                        {"role": "user", "content": research_prompt}
                    ],
                    response_format={"type": "json_object"},  # <-- valid for Chat Completions
                    temperature=0.0,  # deterministic; reduces junk
                    top_p=1.0
                    # (Set a request timeout at client level if your SDK supports it)
                )

                raw = completion.choices[0].message.content or "{}"
                raw = _strip_markdown_fences(raw)

                # Parse/validate
                result = self._parse_and_filter(raw)

                n = len(result.get("explainable_features", []))
                if n < self.require_min_features:
                    raise ResearchFailedError(
                        f"Research returned {n} explainable feature(s), fewer than required minimum {self.require_min_features}."
                    )

                print(f"  ✓ Extracted {n} physiologically meaningful features")
                if result.get("insights_summary"):
                    summary = result["insights_summary"][:100]
                    print(f"  💡 {summary}{'...' if len(result['insights_summary']) > 100 else ''}")

                return result

            except Exception as e:
                last_exc = e
                print(f"  ⚠ Research attempt {attempt+1}/{attempts} failed: {e}")
                traceback.print_exc()
                if attempt < attempts - 1:
                    time.sleep(self.backoff_sec)

        # All attempts failed -> raise, so caller can abort without burning more tokens
        raise ResearchFailedError(f"Research failed after {attempts} attempt(s): {last_exc}")

    # ---------- helpers ----------

    def _parse_and_filter(self, raw_json: str) -> Dict[str, Any]:
        """Parse JSON (optionally with Pydantic), enforce explainability filter, and required keys."""
        if _HAVE_PYDANTIC:
            try:
                parsed = PhysiologicalResearchResponse.model_validate_json(raw_json)
                result = parsed.model_dump()
            except ValidationError as ve:
                # If schema fails, treat as failure (no fallback)
                raise ResearchFailedError(f"Invalid research JSON schema: {ve}") from ve
        else:
            try:
                result = json.loads(raw_json)
                if not isinstance(result, dict):
                    raise ResearchFailedError("Research response was not a JSON object.")
            except Exception as je:
                raise ResearchFailedError(f"Research JSON could not be parsed: {je}") from je

        # ensure keys
        for k, default in [
            ("explainable_features", []),
            ("domain_transformations", []),
            ("explainable_interactions", []),
            ("features_to_avoid", []),
            ("insights_summary", "")
        ]:
            result.setdefault(k, default)

        # filter by explainability threshold
        feats = result.get("explainable_features", [])
        filtered = []
        for f in feats:
            if not isinstance(f, dict):
                continue
            score = f.get("explainability_score", 0) or 0
            if score >= self.min_explainability:
                filtered.append(f)
        result["explainable_features"] = filtered
        return result

    def _build_prompt(self, domain, problem, target, cols_disp, feedback) -> str:
        lines = [
            f"Domain: {domain}",
            f"Problem: {problem}",
            f"Target variable: {target}",
            f"Available columns: {cols_disp}"
        ]
        if feedback:
            lines.append("Previously successful features (if any):")
            lines.append(f"  - {feedback.get('top_features', [])[:5]}")
            lines.append(f"  - Best Score: {feedback.get('best_score', 'N/A')}")

        ctx = "\n".join(lines)

        # Keep the schema instruction tight to reduce hallucination + token use.
        return f"""
You are assisting clinical feature engineering for an interpretable stroke prediction model.

{ctx}

Return STRICT JSON with keys:
- "explainable_features": list of objects with fields:
  "feature_concept": string,
  "formula_pattern": string (Pythonic expression using only available columns),
  "physiological_explanation": string,
  "clinical_relevance": string,
  "explainability_score": integer in [1,5],
  "used_by": string,
  "interpretation_guide": {{"high_values": string, "low_values": string}},
  "source": string
- "domain_transformations": list of objects: {{"name": string, "expr": string}}
- "explainable_interactions": list (each an array of 2 column names or an object)
- "features_to_avoid": list of strings
- "insights_summary": string

Rules:
- Focus on physiologically explainable features used in cardiovascular/neurology risk (stroke).
- Prefer standard clinical indicators and clear mechanisms over black-box constructs.
- Use only the provided columns in formula_pattern.
- Output MUST be valid JSON. No markdown, no commentary outside JSON.
""".strip()

    def generate_explanation_report(self, research_results: Dict[str, Any], domain: str) -> str:
        feats = research_results.get("explainable_features", [])
        if not feats:
            return "No explainable features returned."

        lines = [
            "\n" + "="*70,
            "PHYSIOLOGICALLY EXPLAINABLE FEATURES REPORT",
            f"Domain: {domain}",
            "="*70 + "\n"
        ]
        for i, f in enumerate(feats, 1):
            lines.append(f"\n{i}. {f.get('feature_concept','').upper()}")
            lines.append("   " + "─"*66)
            lines.append(f"   Formula: {f.get('formula_pattern','N/A')}")
            lines.append("\n   Physiological Explanation:")
            lines.append(f"   {f.get('physiological_explanation','N/A')}")
            lines.append(f"\n   Clinical Relevance: {f.get('clinical_relevance','UNKNOWN')}")
            lines.append(f"   Explainability Score: {f.get('explainability_score', 0)}/5")
            lines.append(f"   Used By: {f.get('used_by','Unknown')}")
            if 'interpretation_guide' in f and isinstance(f['interpretation_guide'], dict):
                g = f['interpretation_guide']
                lines.append("\n   Interpretation:")
                lines.append(f"   • High values → {g.get('high_values','N/A')}")
                lines.append(f"   • Low values → {g.get('low_values','N/A')}")
            lines.append(f"\n   Source: {f.get('source','Not specified')}\n")

        lines.append("\n" + "="*70)
        lines.append("SUMMARY")
        lines.append("="*70)
        lines.append(research_results.get("insights_summary", ""))
        lines.append("\n" + "="*70 + "\n")
        return "\n".join(lines)


In [4]:
class FeatureStrategyAgent:
    """Generates feature engineering strategies using LLM (THE DECISION MAKER)"""

    def __init__(self, client, validator=None):
        self.client = client
        self.validator = validator

    def design_strategy(self, X, target, feedback=None, research_context=None):
        """
        Design a feature engineering strategy with KEEP/CREATE/REMOVE actions.
        Now enforces physiological explainability requirements.
        """
        prompt = self._build_strategy_prompt(X, target, feedback, research_context)

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": """You are a feature engineering decision maker specializing in INTERPRETABLE features.

CRITICAL: Every feature you create must be PHYSIOLOGICALLY/BIOLOGICALLY EXPLAINABLE.

Priorities (in order):
1. Clinical indicators and standard medical metrics
2. Features with clear biological interpretation
3. Explainable transformations that preserve meaning
4. Research-backed feature interactions

Always respond with valid JSON including physiological_rationale for each CREATE action."""
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            response_format={"type": "json_object"},
            temperature=0.4
        )

        result = json.loads(response.choices[0].message.content)

        print(f"\n  📋 Agent Strategy:")
        print(f"    Domain: {result.get('domain', 'unknown')}")
        print(f"    Reasoning: {result.get('reasoning', 'N/A')[:100]}...")

        # Count explainability scores
        self._print_explainability_summary(result)

        if self.validator:
            validated_result = self.validator.validate_strategy(result, X, feedback)
            return validated_result

        return result

    def _build_strategy_prompt(self, X, target, feedback, research_context):
        """
        Build the prompt for strategy generation with EXPLAINABILITY focus.
        Enhanced to require physiological rationale for all features.
        """

        # Feedback section
        feedback_section = ""
        if feedback:
            feedback_section = f"""PREVIOUS ITERATION FEEDBACK:
- Score: {feedback['best_score']:.3f}
- Top 5 performing features: {feedback['top_features'][:5]}
- What worked: {feedback.get('what_worked', 'N/A')}
- What didn't work: {feedback.get('what_failed', 'N/A')}
- Strategy: Build on what's working, avoid what failed

🔍 Explainability Analysis:
{self._format_previous_explainability(feedback)}
"""
        else:
            feedback_section = """FIRST ITERATION: Explore PHYSIOLOGICALLY MEANINGFUL feature relationships

Focus on:
- Standard clinical indicators
- Known biomarkers
- Established medical calculations
- Explainable interactions"""

        # Research section - ENHANCED for physiological features
        research_section = ""
        if research_context:
            research_section = self._format_physiological_research(research_context)

        # Get column info with types
        column_info = X.columns.tolist()
        if hasattr(X, 'dtypes'):
            column_info = [f"{col} ({dtype})" for col, dtype in zip(X.columns, X.dtypes)]

        # BUILD THE COMPLETE PROMPT WITH EXPLAINABILITY REQUIREMENTS
        return f"""You are the DECISION MAKER for PHYSIOLOGICALLY EXPLAINABLE feature engineering.

DATASET INFO:
- Target: {target}
- Current Columns ({len(X.columns)}): {column_info}
- Shape: {X.shape}
- Available columns list: {X.columns.tolist()}

{feedback_section}

{research_section}

YOUR TASK: Design a feature engineering strategy prioritizing EXPLAINABILITY.

🎯 EXPLAINABILITY REQUIREMENTS:

Every CREATE action MUST include:
1. **physiological_rationale** - WHY this feature is biologically meaningful
2. **explainability_score** (1-5):
   - 5 = Standard clinical metric (e.g., BMI, eGFR, cholesterol/HDL)
   - 4 = Clear biological meaning (e.g., age × kidney_function - aging affects kidneys)
   - 3 = Interpretable transformation (e.g., log(glucose) - normalizes skewed data)
   - 2 = Weak biological basis (e.g., age^2 - captures curve but unclear WHY)
   - 1 = "Black box" (e.g., complex polynomial - AVOID)
3. **clinical_relevance** - HIGH/MEDIUM/LOW
4. **interpretation** - What does high/low value mean?

ACTIONS YOU MUST TAKE:

1. **KEEP** - Preserve existing features that are valuable
   - MANDATORY: Keep all top-performing features from previous iteration
   - Keep features that align with research insights
   - Keep features with high explainability scores

2. **CREATE** - Generate new EXPLAINABLE features

   ✅ HIGH PRIORITY (Explainability 4-5):
   - Clinical ratios (e.g., cholesterol/HDL - standard medical metric)
   - Standard biomarkers (e.g., BMI calculation if height/weight available)
   - Known medical formulas (e.g., eGFR, ASCVD risk score components)
   - Physiological interactions (e.g., age × kidney_function - biological aging)

   ⚠️ MEDIUM PRIORITY (Explainability 3):
   - Interpretable transformations (log for skewed data, sqrt for variance)
   - Clinical categories/binning (e.g., blood pressure stages)
   - Simple polynomials IF explainable (age^2 for accelerating risk)

   ❌ AVOID (Explainability 1-2):
   - Complex polynomials without biological meaning
   - Arbitrary mathematical transformations
   - Features that can't be explained to a clinician

   CRITICAL RULES:
   ✓ Use ONLY EXISTING column names from: {X.columns.tolist()}
   ✓ DO NOT reference features being created in this same iteration
   ✓ Each CREATE formula can ONLY use columns that currently exist
   ✓ If previous iteration had good features, KEEP them first
   ✓ Formulas must be executable Python (use np.log, np.sqrt, np.abs, etc.)
   ✓ Aim for 8-12 total actions (balance keep + create, quality > quantity)

3. **REMOVE** - Drop features that aren't helping (OPTIONAL)
   - Remove low explainability features that don't perform well
   - Remove redundant features

OUTPUT FORMAT (JSON only):
{{
  "domain": "identified domain (e.g., healthcare - cardiology)",
  "reasoning": "Your strategy explanation with focus on explainability",
  "explainability_focus": "Why these features are physiologically meaningful",
  "actions": [
    {{
      "action": "keep",
      "name": "bmi",
      "why": "Top performing feature from previous iteration",
      "explainability_score": 5,
      "physiological_rationale": "Body Mass Index - standard clinical obesity metric"
    }},
    {{
      "action": "create",
      "name": "insulin_glucose_ratio",
      "formula": "df['insulin'] / (df['glucose'] + 1)",
      "why": "Standard endocrinology metric - measures pancreatic function",
      "physiological_rationale": "Ratio of insulin to glucose indicates pancreatic beta cell function. Low ratio suggests insulin resistance, a key Type 2 diabetes mechanism. Used by endocrinologists for diagnosis.",
      "explainability_score": 5,
      "clinical_relevance": "HIGH",
      "interpretation": {{
        "high": "Good insulin production (healthy)",
        "low": "Insulin resistance or pancreatic dysfunction (diabetes risk)"
      }},
      "source": "American Diabetes Association guidelines"
    }},
    {{
      "action": "create",
      "name": "age_kidney_interaction",
      "formula": "df['age'] * df['kidney_function_score']",
      "why": "Aging compounds kidney disease effects",
      "physiological_rationale": "Kidney function naturally declines with age (~1 ml/min/year after age 30). This interaction captures compounded risk from both aging and reduced kidney function.",
      "explainability_score": 4,
      "clinical_relevance": "MEDIUM",
      "interpretation": {{
        "high": "Older patient with kidney issues - compound risk",
        "low": "Young or good kidney function - lower risk"
      }},
      "source": "Nephrology literature on age-related GFR decline"
    }},
    {{
      "action": "create",
      "name": "glucose_log",
      "formula": "np.log1p(df['glucose'])",
      "why": "Normalizes right-skewed glucose distribution",
      "physiological_rationale": "Glucose values are right-skewed. Log transform normalizes distribution while preserving relative differences. Common preprocessing for skewed medical measurements.",
      "explainability_score": 3,
      "clinical_relevance": "MEDIUM",
      "interpretation": {{
        "high": "High glucose (preserves clinical meaning)",
        "low": "Normal glucose (compressed very low values)"
      }},
      "source": "Standard statistical preprocessing for biomedical data"
    }}
  ]
}}

CRITICAL:
- Target 8-12 actions with explainability_score >= 3
- Every CREATE action MUST have physiological_rationale
- Prioritize features clinicians can understand and trust
- ONLY use existing columns: {X.columns.tolist()}"""

    def _format_physiological_research(self, research_context):
        """Format research context with emphasis on explainability"""

        # Handle both old structure and new physiological structure
        if 'explainable_features' in research_context:
            # New physiological structure
            section = """RESEARCH-BACKED PHYSIOLOGICALLY EXPLAINABLE INSIGHTS:

🔬 Explainable Features Found:
"""
            for feature in research_context.get('explainable_features', [])[:4]:
                section += f"""
  • {feature.get('feature_concept', 'Unknown')} (Score: {feature.get('explainability_score', 'N/A')}/5)
    Formula: {feature.get('formula_pattern', 'N/A')}
    Why: {feature.get('physiological_explanation', 'N/A')[:120]}...
    Clinical Relevance: {feature.get('clinical_relevance', 'UNKNOWN')}
"""

            if research_context.get('domain_transformations'):
                section += "\n⚙️ Domain-Specific Transformations:\n"
                for trans in research_context.get('domain_transformations', [])[:3]:
                    section += f"  • {trans.get('transformation', trans) if isinstance(trans, dict) else trans}\n"

            if research_context.get('explainable_interactions'):
                section += "\n🔗 Explainable Interactions:\n"
                for inter in research_context.get('explainable_interactions', [])[:3]:
                    section += f"  • {inter.get('interaction', inter) if isinstance(inter, dict) else inter}\n"

            if research_context.get('features_to_avoid'):
                section += "\n⚠️ Features to AVOID (Low Explainability):\n"
                for avoid in research_context.get('features_to_avoid', [])[:3]:
                    section += f"  • {avoid.get('feature', avoid) if isinstance(avoid, dict) else avoid}\n"

        else:
            # Old structure - still usable
            section = """RESEARCH-BACKED DOMAIN INSIGHTS:

📊 Domain Patterns Found:
"""
            section += "\n".join(f"  • {p}" for p in research_context.get('domain_patterns', [])[:4])

            section += "\n\n🔧 Recommended Feature Types:\n"
            section += "\n".join(f"  • {t}" for t in research_context.get('recommended_feature_types', [])[:4])

            if research_context.get('domain_transformations'):
                section += "\n\n⚙️ Domain-Specific Transformations:\n"
                section += "\n".join(f"  • {t}" for t in research_context.get('domain_transformations', [])[:3])

            if research_context.get('key_interactions'):
                section += "\n\n🔗 Key Interactions to Consider:\n"
                section += "\n".join(f"  • {i}" for i in research_context.get('key_interactions', [])[:3])

            if research_context.get('pitfalls_to_avoid'):
                section += "\n\n⚠️ Pitfalls to Avoid:\n"
                section += "\n".join(f"  • {p}" for p in research_context.get('pitfalls_to_avoid', [])[:3])

        section += f"\n\n💡 Research Summary: {research_context.get('insights_summary', '')}\n"
        section += "\nUSE these insights to create EXPLAINABLE features.\n"

        return section

    def _format_previous_explainability(self, feedback):
        """Format explainability information from previous iteration"""

        if 'feature_explainability' in feedback:
            lines = []
            for feature, info in list(feedback['feature_explainability'].items())[:5]:
                score = info.get('score', 'N/A')
                lines.append(f"  - {feature}: Score {score}/5")
            return "\n".join(lines) if lines else "  No explainability data from previous iteration"

        return "  Explainability tracking not available from previous iteration"

    def _print_explainability_summary(self, result):
        """Print summary of explainability scores for created features"""

        create_actions = [a for a in result.get('actions', []) if a.get('action') == 'create']

        if not create_actions:
            return

        scores = [a.get('explainability_score', 0) for a in create_actions]

        if scores:
            avg_score = sum(scores) / len(scores)
            high_quality = sum(1 for s in scores if s >= 4)
            medium_quality = sum(1 for s in scores if 3 <= s < 4)
            low_quality = sum(1 for s in scores if s < 3)

            print(f"    Explainability: Avg {avg_score:.1f}/5")
            print(f"    Quality: {high_quality} high (4-5), {medium_quality} medium (3), {low_quality} low (<3)")

    def execute_actions(self, df, actions, verbose=True):
        """
        Execute validated actions on the dataframe.
        Enhanced with explainability tracking.

        ASSUMES: Actions have already been validated by FeatureValidator
        """
        results = {
            'kept': [],
            'created': [],
            'removed': [],
            'failed': [],
            'explainability_scores': {}  # NEW: Track explainability
        }

        print("\n  🔧 Executing actions...")

        for action in actions:
            action_type = action.get('action')
            name = action.get('name')

            try:
                if action_type == 'keep':
                    results['kept'].append(name)
                    if verbose:
                        exp_score = action.get('explainability_score', 'N/A')
                        print(f"  ✓ Kept: {name} (Explainability: {exp_score}/5)")

                    # Track explainability
                    if 'explainability_score' in action:
                        results['explainability_scores'][name] = {
                            'score': action['explainability_score'],
                            'rationale': action.get('physiological_rationale', 'N/A')
                        }

                elif action_type == 'create':
                    formula = action.get('formula')

                    # Execute the formula
                    df[name] = eval(formula)
                    results['created'].append(name)

                    if verbose:
                        exp_score = action.get('explainability_score', '?')
                        clinical = action.get('clinical_relevance', '?')
                        print(f"  ✓ Created: {name} (Explainability: {exp_score}/5, Clinical: {clinical})")

                    # Track explainability
                    results['explainability_scores'][name] = {
                        'score': action.get('explainability_score', 0),
                        'rationale': action.get('physiological_rationale', 'Not provided'),
                        'clinical_relevance': action.get('clinical_relevance', 'UNKNOWN'),
                        'interpretation': action.get('interpretation', {})
                    }

                elif action_type == 'remove':
                    df.drop(columns=[name], inplace=True)
                    results['removed'].append(name)
                    if verbose:
                        print(f"  ✓ Removed: {name}")

            except Exception as e:
                if verbose:
                    print(f"  ❌ Failed {action_type} for '{name}': {e}")
                results['failed'].append({
                    'action': action,
                    'error': str(e)
                })

        # Print explainability summary
        if results['explainability_scores']:
            avg_score = sum(s['score'] for s in results['explainability_scores'].values()) / len(results['explainability_scores'])
            high_quality = sum(1 for s in results['explainability_scores'].values() if s['score'] >= 4)

            print(f"\n  📊 Execution complete: {len(results['created'])} created, "
                  f"{len(results['kept'])} kept, {len(results['removed'])} removed, "
                  f"{len(results['failed'])} failed")
            print(f"  🔍 Explainability: Avg {avg_score:.1f}/5, {high_quality} highly explainable (≥4)")
        else:
            print(f"\n  📊 Execution complete: {len(results['created'])} created, "
                  f"{len(results['kept'])} kept, {len(results['removed'])} removed, "
                  f"{len(results['failed'])} failed")

        return results

    def generate_feature_documentation(self, actions, domain):
        """
        Generate human-readable documentation for all features.
        Explains what each feature means physiologically.
        """

        doc_lines = [
            f"\n{'='*70}",
            f"FEATURE DOCUMENTATION - {domain.upper()}",
            f"{'='*70}\n"
        ]

        create_actions = [a for a in actions if a.get('action') == 'create']
        keep_actions = [a for a in actions if a.get('action') == 'keep']

        if create_actions:
            doc_lines.append("NEWLY CREATED FEATURES:")
            doc_lines.append("─" * 70)

            for idx, action in enumerate(create_actions, 1):
                doc_lines.append(f"\n{idx}. {action['name']}")
                doc_lines.append(f"   Formula: {action.get('formula', 'N/A')}")
                doc_lines.append(f"   Explainability: {action.get('explainability_score', '?')}/5")
                doc_lines.append(f"   Clinical Relevance: {action.get('clinical_relevance', 'UNKNOWN')}")
                doc_lines.append(f"\n   Physiological Meaning:")
                doc_lines.append(f"   {action.get('physiological_rationale', 'Not provided')}")

                if 'interpretation' in action and action['interpretation']:
                    interp = action['interpretation']
                    doc_lines.append(f"\n   Interpretation:")
                    doc_lines.append(f"   • High values: {interp.get('high', 'N/A')}")
                    doc_lines.append(f"   • Low values: {interp.get('low', 'N/A')}")

                if 'source' in action:
                    doc_lines.append(f"\n   Source: {action['source']}")

                doc_lines.append("")

        if keep_actions:
            doc_lines.append(f"\n{'─' * 70}")
            doc_lines.append("KEPT FEATURES (from previous iteration):")
            doc_lines.append("─" * 70)

            for action in keep_actions:
                exp_score = action.get('explainability_score', 'N/A')
                doc_lines.append(f"  • {action['name']} (Explainability: {exp_score}/5)")
                if 'physiological_rationale' in action:
                    doc_lines.append(f"    {action['physiological_rationale'][:80]}...")

        doc_lines.append(f"\n{'='*70}\n")

        return "\n".join(doc_lines)

In [5]:
class FeatureValidator:
    """Validates feature engineering actions and formulas with comprehensive checks"""

    def __init__(self, client, min_explainability_score=3):
        """
        Store the collaborators used by the validation methods.

        Args:
            client: OpenAI client for LLM-based self-correction. It is only used by
                `_request_llm_correction`, through `client.chat.completions.create(...)`.
            min_explainability_score: Minimum explainability score (1-5) to accept
                CREATE features. `validate_strategy` skips the explainability filter
                entirely when this value is not greater than 0.
        """
        self.client = client
        self.min_explainability_score = min_explainability_score

    def validate_strategy(self, result, X, feedback=None, max_retry_attempts=2):
        """
        Validate and sanitize agent suggestions with explainability checks.
        If validation fails, pass errors back to LLM for self-correction.

        Args:
            result: Strategy result from FeatureStrategyAgent
            X: DataFrame with current features
            feedback: Feedback from previous iteration
            max_retry_attempts: How many times to let LLM retry fixing errors

        Returns:
            Validated result with valid actions
        """
        print("  🔍 Validating strategy (technical + explainability)...")

        # Try validation with potential retries
        for attempt in range(max_retry_attempts + 1):
            valid_actions = []
            validation_errors = []
            existing_cols = set(X.columns)

            # Step 1: Auto-keep top features (even if LLM forgot)
            auto_keeps = self._auto_keep_top_features(result, X, feedback)
            valid_actions.extend(auto_keeps)

            # Step 2: Validate each action from LLM
            for action in result.get('actions', []):
                validated, error = self._validate_action_with_error(
                    action, existing_cols, feedback, valid_actions, X
                )
                if validated:
                    valid_actions.append(validated)
                else:
                    validation_errors.append({
                        'action': action,
                        'error': error
                    })

            # Step 3: Filter by explainability
            if self.min_explainability_score > 0:
                valid_actions, low_explainability_errors = self._filter_by_explainability(
                    valid_actions,
                    X,
                    min_score=self.min_explainability_score
                )
                validation_errors.extend(low_explainability_errors)

            # Step 4: Check if we have enough valid actions
            if len(valid_actions) >= 3:  # Success threshold
                result['actions'] = valid_actions
                result['validation_errors'] = validation_errors
                self._print_validation_summary(valid_actions, validation_errors)
                return result

            # Step 5: If failed and have retries left, ask LLM to fix
            if attempt < max_retry_attempts and validation_errors:
                print(f"\n  ⚠ Validation issues found. Attempt {attempt + 1}/{max_retry_attempts} - asking LLM to fix...")
                result = self._request_llm_correction(result, validation_errors, X, feedback)
                # Loop will retry with corrected result
            else:
                break

        # Step 6: Fallback if everything failed
        if len(valid_actions) < 3:
            print("  ⚠ WARNING: Insufficient valid actions after retries! Using fallback...")
            valid_actions.extend(self._create_fallback_actions(X, feedback))

        result['actions'] = valid_actions
        result['validation_errors'] = validation_errors
        self._print_validation_summary(valid_actions, validation_errors)

        return result

    def _validate_action_with_error(self, action, existing_cols, feedback, valid_actions, X):
        """
        Validate a single action and return both result and error message.

        Returns:
            (validated_action, error_message)
            - If valid: (action_dict, None)
            - If invalid: (None, error_string)
        """
        action_type = action.get('action', '').lower()
        name = action.get('name', '')

        if not name:
            return None, "Action has no name"

        if action_type == 'keep':
            return self._validate_keep_with_error(action, name, existing_cols, valid_actions)

        elif action_type == 'create':
            return self._validate_create_with_error(action, name, X, existing_cols, valid_actions)

        elif action_type == 'remove':
            return self._validate_remove_with_error(action, name, existing_cols, feedback)

        else:
            return None, f"Unknown action type: '{action_type}'"

    def _validate_keep_with_error(self, action, name, existing_cols, valid_actions):
        """Validate keep action, return error if invalid"""
        if name not in existing_cols:
            return None, f"Cannot keep '{name}' - not in dataset (available: {list(existing_cols)[:10]})"

        # Check for duplicate keeps
        if any(a['action'] == 'keep' and a['name'] == name for a in valid_actions):
            return None, f"Duplicate keep for '{name}'"

        exp_score = action.get('explainability_score', 'N/A')
        print(f"  ✓ Keep: {name} (Explainability: {exp_score})")
        return action, None

    def _validate_create_with_error(self, action, name, X, existing_cols, valid_actions=None):
        """Validate create action with comprehensive error reporting"""
        formula = action.get('formula', '')

        if not formula:
            return None, f"Cannot create '{name}' - no formula provided"

        # Check for explainability metadata first (so LLM knows to add it)
        exp_error = self._check_explainability_metadata(action, name)
        if exp_error:
            return None, exp_error

        available_set = set(existing_cols)
        if valid_actions:
            for a in valid_actions:
                try:
                    if a.get('action','').lower() == 'create' and a.get('name'):
                        available_set.add(a['name'])
                except Exception:
                    pass

        # Technical validation (formula correctness)
        validation_result = self.validate_formula(
        formula, X, name,
        available_columns_override=available_set
          )

        if not validation_result['valid']:
            error_msg = f"Invalid formula for '{name}': {validation_result['error']}"
            if validation_result.get('details'):
                error_msg += f". Details: {validation_result['details']}"
            return None, error_msg

        exp_score = action.get('explainability_score', '?')
        clinical = action.get('clinical_relevance', '?')
        print(f"  ✓ Create: {name} (Explainability: {exp_score}/5, Clinical: {clinical})")

        if validation_result.get('warning'):
            print(f"    ⚠ Warning: {validation_result['warning']}")

        return action, None

    def _validate_remove_with_error(self, action, name, existing_cols, feedback):
        """Validate remove action with error reporting"""
        if name not in existing_cols:
            return None, f"Cannot remove '{name}' - not in dataset"

        # Block removal of top features
        if feedback and name in feedback.get('top_features', [])[:5]:
            return None, f"Cannot remove '{name}' - it's a top-5 performing feature"

        print(f"  ✓ Remove: {name}")
        return action, None

    def _check_explainability_metadata(self, action, name):
        """
        Check if action has required explainability metadata.
        Returns error string if invalid, None if valid.
        """
        required_fields = ['physiological_rationale', 'explainability_score']
        missing_fields = [f for f in required_fields if f not in action]

        if missing_fields:
            return f"Missing required explainability fields for '{name}': {missing_fields}. Every CREATE action must include physiological_rationale and explainability_score (1-5)."

        # Validate explainability_score is in valid range
        score = action.get('explainability_score')
        if not isinstance(score, (int, float)) or score < 1 or score > 5:
            return f"Invalid explainability_score for '{name}': {score}. Must be integer 1-5."

        return None  # Valid

    def _filter_by_explainability(self, actions, X, min_score=3):
        """
        Filter actions based on explainability score.

        Returns:
            (valid_actions, error_list)
        """
        valid = []
        errors = []

        for action in actions:
            # KEEP and REMOVE actions always pass
            if action['action'] in ['keep', 'remove']:
                valid.append(action)
                continue

            # For CREATE actions, check explainability score
            exp_score = action.get('explainability_score')

            # If no score provided (shouldn't happen after metadata check, but just in case)
            if exp_score is None:
                errors.append({
                    'action': action,
                    'error': f"No explainability_score for '{action['name']}'"
                })
                continue

            if exp_score >= min_score:
                valid.append(action)
            else:
                error_msg = (f"Feature '{action['name']}' has explainability_score {exp_score}, "
                           f"which is below minimum threshold of {min_score}. "
                           f"Feature rationale: '{action.get('physiological_rationale', 'N/A')}'. "
                           f"Either improve the explainability or increase the score if the rationale is actually sound.")
                print(f"    ❌ Rejecting '{action['name']}' - explainability too low ({exp_score}/{min_score})")
                errors.append({
                    'action': action,
                    'error': error_msg
                })

        return valid, errors

    def _request_llm_correction(self, result, validation_errors, X, feedback):
        """
        Pass validation errors back to LLM and request corrected strategy.

        This is the key feature: LLM sees what went wrong and fixes it.
        """
        # Build error summary
        error_summary = self._format_errors_for_llm(validation_errors, X)

        correction_prompt = f"""Your previous feature engineering strategy had validation errors. Please fix them.

ORIGINAL STRATEGY:
Domain: {result.get('domain', 'unknown')}
Reasoning: {result.get('reasoning', 'N/A')}

VALIDATION ERRORS ({len(validation_errors)} issues):
{error_summary}

CURRENT DATASET COLUMNS:
{X.columns.tolist()}

YOUR TASK: Generate a CORRECTED strategy that fixes all validation errors.

Common fixes needed:
1. Missing explainability metadata? → Add physiological_rationale and explainability_score (1-5)
2. Referencing non-existent columns? → Use only columns from the current dataset list above
3. Low explainability score? → Either improve the rationale or create a more explainable feature
4. Invalid formula? → Fix syntax, avoid dangerous operations, ensure columns exist

CRITICAL REQUIREMENTS:
✓ Every CREATE action must have: physiological_rationale, explainability_score (1-5), clinical_relevance
✓ Only reference columns that exist in dataset: {X.columns.tolist()}
✓ Explainability scores must be >= {self.min_explainability_score} (prioritize clinically meaningful features)
✓ Formulas must be valid Python/pandas

OUTPUT FORMAT (JSON only):
{{
  "domain": "domain",
  "reasoning": "Why this corrected strategy will work",
  "actions": [
    {{
      "action": "create",
      "name": "feature_name",
      "formula": "df['col1'] * df['col2']",
      "why": "Brief explanation",
      "physiological_rationale": "Detailed biological/medical explanation of why this feature is meaningful",
      "explainability_score": 4,
      "clinical_relevance": "HIGH/MEDIUM/LOW",
      "interpretation": {{
        "high": "What high values mean",
        "low": "What low values mean"
      }}
    }}
  ]
}}

Generate 8-12 actions. Fix all errors listed above."""

        try:
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a feature engineering expert fixing validation errors. Always respond with valid JSON."
                    },
                    {
                        "role": "user",
                        "content": correction_prompt
                    }
                ],
                response_format={"type": "json_object"},
                temperature=0.4,
                max_tokens=2000
            )

            corrected_result = json.loads(response.choices[0].message.content)
            print(f"  ✓ LLM generated corrected strategy with {len(corrected_result.get('actions', []))} actions")

            return corrected_result

        except Exception as e:
            print(f"  ⚠ LLM correction failed: {e}")
            return result  # Return original if correction fails

    def _format_errors_for_llm(self, validation_errors, X):
        """Format validation errors in a clear way for LLM to understand and fix"""
        error_lines = []

        for idx, error_item in enumerate(validation_errors, 1):
            action = error_item['action']
            error = error_item['error']

            error_lines.append(f"\n{idx}. Feature: {action.get('name', 'unknown')}")
            error_lines.append(f"   Action: {action.get('action', 'unknown')}")
            error_lines.append(f"   ERROR: {error}")

            if action.get('formula'):
                error_lines.append(f"   Formula: {action['formula']}")
            if action.get('physiological_rationale'):
                error_lines.append(f"   Rationale: {action['physiological_rationale'][:100]}...")

        return "\n".join(error_lines)

    def _auto_keep_top_features(self, result, X, feedback):
        """
        Automatically keep top-performing features from previous iteration.
        Includes explainability metadata if available.
        """
        if not feedback or not feedback.get('top_features'):
            return []

        top_features = feedback['top_features'][:3]
        suggested_keeps = {
            a['name'] for a in result.get('actions', [])
            if a.get('action') == 'keep'
        }
        existing_cols = set(X.columns)

        auto_keeps = []
        for feature in top_features:
            if feature in existing_cols and feature not in suggested_keeps:
                keep_action = {
                    "action": "keep",
                    "name": feature,
                    "why": f"Auto-kept: top feature from previous iteration (score: {feedback.get('best_score', 'N/A')})"
                }

                # Preserve explainability metadata if available
                if 'feature_explainability' in feedback and feature in feedback['feature_explainability']:
                    exp_data = feedback['feature_explainability'][feature]
                    keep_action['explainability_score'] = exp_data.get('score', None)
                    keep_action['physiological_rationale'] = exp_data.get('rationale', None)

                auto_keeps.append(keep_action)
                print(f"  ⚠ Auto-keeping top feature: {feature}")

        return auto_keeps

    def validate_formula(self, formula, X, feature_name=None, available_columns_override=None):
        """Comprehensively validate a feature formula with detailed error reporting

        Checks, in order:
          1. the formula is a non-empty string;
          2. the code part of the formula (column references such as df['col']
             are masked first) contains no blocked word and no double underscore;
          3. every df['col'] / df["col"] reference names an available column;
          4. the formula evaluates on the first five rows of `X`, with `df` bound
             to that sample;
          5. the result is numeric-like, not all-NaN and free of infinities.

        Args:
            formula: Expression text that refers to the data as `df`
            X: DataFrame with current features
            feature_name: Name of the feature being created (not read here)
            available_columns_override: Optional iterable of column names to treat
                as available in check 3 instead of `X.columns`. Columns that are
                allowed here but absent from `X` cannot be test-executed; such a
                formula is accepted with a warning and check 4/5 are skipped.

        Returns:
            dict with keys 'valid' (bool), 'error' (str or None),
            'details' (str or None) and, for valid formulas, 'warning'.
        """

        # Check 1: Basic validation
        if not formula or not isinstance(formula, str):
            return {
                'valid': False,
                'error': 'Formula is empty or not a string',
                'details': None
            }

        # Check 2: Dangerous operations
        # Column references are data, not code: mask them so a column called
        # 'open' or 'file' is not mistaken for a call to open()/file().
        code_only = re.sub(r"""df\[(?:'[^']+'|"[^"]+")\]""", 'df_col', formula)

        # Underscores are word characters, so r'\b__\b' never matches inside
        # '__import__' or '__class__'. Dunder access needs a plain substring test.
        if '__' in code_only:
            return {
                'valid': False,
                'error': "Contains dangerous operation: '__'",
                'details': 'Remove dangerous operations from formula'
            }

        dangerous = [
            'eval', 'exec', 'import', 'open', 'file',
            'subprocess', 'compile', 'globals', 'locals', 'vars', 'del',
            'input', 'raw_input'
        ]

        for word in dangerous:
            pattern = r'\b' + re.escape(word) + r'\b'
            if re.search(pattern, code_only):
                return {
                    'valid': False,
                    'error': f"Contains dangerous operation: '{word}'",
                    'details': 'Remove dangerous operations from formula'
                }

        # Special check for 'os' and 'sys' modules
        if re.search(r'\bos\s*\.', code_only) or re.search(r'\bsys\s*\.', code_only):
            return {
                'valid': False,
                'error': 'Cannot use os or sys modules',
                'details': 'System modules are not allowed'
            }

        # Check 3: Validate column references exist
        referenced_cols = re.findall(r"df\['([^']+)'\]", formula)
        referenced_cols += re.findall(r'df\["([^"]+)"\]', formula)

        if available_columns_override is not None:
            available_columns = set(available_columns_override)
        else:
            available_columns = set(X.columns)

        missing_cols = [col for col in referenced_cols if col not in available_columns]
        if missing_cols:
            return {
                'valid': False,
                'error': f"References missing columns: {missing_cols}",
                'details': f"Available columns: {list(available_columns)[:15]}"
            }

        # Columns permitted by the override but not present in X are features that
        # will only exist once earlier CREATE actions have run. Executing the
        # formula now would fail with KeyError, so accept it with a warning.
        frame_columns = set(X.columns)
        deferred_cols = list(dict.fromkeys(
            col for col in referenced_cols if col not in frame_columns
        ))
        if deferred_cols:
            return {
                'valid': True,
                'error': None,
                'warning': f"Depends on features created earlier in this strategy {deferred_cols}; execution test skipped",
                'details': None
            }

        # Check 4: Test execution on small subset
        try:
            test_df = X.head(5).copy()
            # SECURITY: eval() runs LLM-generated text. The screening above is a
            # heuristic blocklist, not a sandbox.
            # `df` is bound to the sample directly (no text substitution), so
            # `df['a']`, `df.a` and `df.loc[...]` all resolve to the sample.
            namespace = dict(globals())
            namespace['df'] = test_df
            test_result = eval(formula, namespace)

            # Validate result type (np.number covers numpy scalars such as the
            # np.int64 returned by Series.sum())
            scalar_types = (int, float, np.number)
            if not isinstance(test_result, (pd.Series, np.ndarray) + scalar_types):
                return {
                    'valid': False,
                    'error': f"Formula returns invalid type: {type(test_result)}",
                    'details': 'Formula must return a numeric Series, array, or scalar'
                }

            # Convert to Series if needed
            if isinstance(test_result, scalar_types):
                test_result = pd.Series([test_result] * len(test_df))
            elif isinstance(test_result, np.ndarray):
                test_result = pd.Series(test_result)

            # Check 5: Validate output quality
            if pd.isna(test_result).all():
                return {
                    'valid': False,
                    'error': 'Formula produces all NaN values',
                    'details': 'Check for division by zero or invalid operations'
                }

            if np.isinf(test_result).any():
                return {
                    'valid': False,
                    'error': 'Formula produces infinite values',
                    'details': 'Add small constant to denominator to avoid division by zero'
                }

            # Collect every warning instead of letting the last one win
            warnings_found = []

            nan_rate = pd.isna(test_result).mean()
            if nan_rate > 0.5:
                warnings_found.append(f"Formula produces {nan_rate*100:.1f}% NaN values")

            if test_result.nunique() == 1:
                warnings_found.append("Formula produces constant values (zero variance)")

            return {
                'valid': True,
                'error': None,
                'warning': "; ".join(warnings_found) if warnings_found else None,
                'details': None
            }

        except ZeroDivisionError:
            return {
                'valid': False,
                'error': 'Division by zero in formula',
                'details': 'Add small constant to denominator: e.g., df["col"] / (df["other"] + 1e-6)'
            }

        except TypeError as e:
            return {
                'valid': False,
                'error': f'Type error: {str(e)}',
                'details': 'Check that all columns are numeric and operations are compatible'
            }

        except Exception as e:
            return {
                'valid': False,
                'error': f'Execution error: {str(e)}',
                'details': 'Check formula syntax and ensure all operations are valid'
            }

    def _create_fallback_actions(self, X, feedback):
        """Create safe fallback actions with explainability metadata"""
        print("  📋 Creating fallback strategy...")

        fallback_actions = []

        # Strategy 1: Keep top features from feedback
        if feedback and feedback.get('top_features'):
            for feature in feedback['top_features'][:5]:
                if feature in X.columns:
                    action = {
                        "action": "keep",
                        "name": feature,
                        "why": "Fallback: previous top feature",
                        "explainability_score": 3
                    }

                    # Preserve explainability if available
                    if 'feature_explainability' in feedback and feature in feedback['feature_explainability']:
                        exp_data = feedback['feature_explainability'][feature]
                        action['explainability_score'] = exp_data.get('score', 3)
                        action['physiological_rationale'] = exp_data.get('rationale', 'Previous top feature')

                    fallback_actions.append(action)

        # Strategy 2: Keep first few numeric columns
        if len(fallback_actions) < 3:
            numeric_cols = X.select_dtypes(include=[np.number]).columns[:5]
            for col in numeric_cols:
                if not any(a['name'] == col for a in fallback_actions):
                    fallback_actions.append({
                        "action": "keep",
                        "name": col,
                        "why": "Fallback: numeric column preservation",
                        "explainability_score": 2,
                        "physiological_rationale": "Fallback feature - needs review"
                    })

        return fallback_actions[:10]

    def _print_validation_summary(self, valid_actions, validation_errors):
        """Print enhanced validation summary with explainability stats"""

        action_counts = self._count_actions(valid_actions)

        # Calculate explainability stats
        create_actions = [a for a in valid_actions if a['action'] == 'create']

        if create_actions:
            scores = [a.get('explainability_score', 0) for a in create_actions]
            avg_score = sum(scores) / len(scores) if scores else 0
            high_quality = sum(1 for s in scores if s >= 4)
            medium_quality = sum(1 for s in scores if 3 <= s < 4)
            low_quality = sum(1 for s in scores if s < 3)

            print(f"  ✓ Validated {len(valid_actions)} actions: "
                  f"Keep={action_counts['keep']}, Create={action_counts['create']}, "
                  f"Remove={action_counts['remove']}")
            print(f"  📊 Explainability: Avg {avg_score:.1f}/5 | "
                  f"High (≥4): {high_quality}, Medium (3): {medium_quality}, Low (<3): {low_quality}")
        else:
            print(f"  ✓ Validated {len(valid_actions)} actions: "
                  f"Keep={action_counts['keep']}, Create={action_counts['create']}, "
                  f"Remove={action_counts['remove']}")

        if validation_errors:
            print(f"  ⚠ Had {len(validation_errors)} validation errors (corrected via LLM retry)")

    def _count_actions(self, actions):
        """Count actions by type for reporting"""
        counts = {'keep': 0, 'create': 0, 'remove': 0}
        for action in actions:
            action_type = action.get('action', '')
            if action_type in counts:
                counts[action_type] += 1
        return counts

In [6]:
class FeatureGenerator:
    """Generates features based on validated strategies"""

    def generate(self, X, agent_result):
        """Generate features with dependency resolution and multi-pass creation"""
        X_new = X.copy()

        # Track what we're doing (including explainability)
        kept_features = set()
        created_features = set()
        removed_features = set()
        failed_features = []
        feature_metadata = {}  # NEW: Track explainability metadata

        print("\n  🔧 Executing feature actions:")

        # Extract validation errors if present
        validation_errors = agent_result.get('validation_errors', [])
        if validation_errors:
            print(f"  ℹ️  Note: {len(validation_errors)} actions were filtered during validation")

        # Step 1: Process KEEP actions first
        keep_actions = [a for a in agent_result.get('actions', []) if a.get('action') == 'keep']
        for action in keep_actions:
            name = action.get('name')
            if not name:
                continue

            if name in kept_features:
                print(f"  ⚠ Duplicate keep for '{name}' - skipping")
                continue

            if name in X_new.columns:
                kept_features.add(name)
                # NEW: Preserve explainability metadata
                if 'explainability_score' in action:
                    feature_metadata[name] = {
                        'explainability_score': action.get('explainability_score'),
                        'physiological_rationale': action.get('physiological_rationale'),
                        'action_type': 'keep'
                    }
                print(f"  ✓ Keeping: {name}")
            else:
                print(f"  ⚠ Cannot keep '{name}' - doesn't exist")

        # Step 2: Process CREATE actions with dependency resolution
        create_actions = [a for a in agent_result.get('actions', []) if a.get('action') == 'create']
        max_passes = 5

        for pass_num in range(max_passes):
            if not create_actions:
                break

            if pass_num > 0:
                print(f"\n  🔄 Pass {pass_num + 1}: Resolving dependencies for {len(create_actions)} features...")

            remaining_actions = []
            created_this_pass = 0

            for action in create_actions:
                name = action.get('name')
                formula = action.get('formula', '')

                if name in created_features:
                    continue

                # Extract dependencies
                import re
                referenced_cols = re.findall(r"df\['([^']+)'\]", formula)
                referenced_cols += re.findall(r'df\["([^"]+)"\]', formula)

                missing = [col for col in referenced_cols if col not in X_new.columns]

                if missing:
                    remaining_actions.append(action)
                    if pass_num == max_passes - 1:
                        print(f"  ❌ Cannot create '{name}' - missing columns: {missing}")
                        failed_features.append({
                            'name': name,
                            'missing': missing,
                            'formula': formula
                        })
                    continue

                # Try to create
                try:
                    X_new[name] = eval(formula)
                    created_features.add(name)
                    created_this_pass += 1

                    # NEW: Store explainability metadata
                    feature_metadata[name] = {
                        'explainability_score': action.get('explainability_score'),
                        'physiological_rationale': action.get('physiological_rationale'),
                        'clinical_relevance': action.get('clinical_relevance'),
                        'interpretation': action.get('interpretation'),
                        'action_type': 'create',
                        'formula': formula
                    }

                    exp_score = action.get('explainability_score', '?')
                    print(f"  ✓ Created: {name} (Explainability: {exp_score}/5)")

                except Exception as e:
                    print(f"  ❌ Failed to create '{name}': {e}")
                    failed_features.append({
                        'name': name,
                        'error': str(e),
                        'formula': formula
                    })

            create_actions = remaining_actions

            if created_this_pass == 0 and create_actions:
                print(f"  ⚠ No progress in pass {pass_num + 1}, stopping early")
                for action in create_actions:
                    name = action.get('name')
                    formula = action.get('formula', '')
                    import re
                    referenced_cols = re.findall(r"df\['([^']+)'\]", formula)
                    referenced_cols += re.findall(r'df\["([^"]+)"\]', formula)
                    missing = [col for col in referenced_cols if col not in X_new.columns]
                    print(f"  ❌ Circular dependency or missing: '{name}' needs {missing}")
                break

        # Step 3: Process REMOVE actions
        remove_actions = [a for a in agent_result.get('actions', []) if a.get('action') == 'remove']
        for action in remove_actions:
            name = action.get('name')
            if not name:
                continue

            if name in kept_features:
                print(f"  ⚠ Skipping removal of '{name}' - explicitly kept")
                continue
            if name in created_features:
                print(f"  ⚠ Skipping removal of '{name}' - just created")
                continue

            removed_features.add(name)

        for name in removed_features:
            if name in X_new.columns:
                X_new.drop(columns=[name], inplace=True)
                print(f"  ✓ Removed: {name}")

        # Step 4: Determine final feature set
        if kept_features or created_features:
            final_features = kept_features.union(created_features)
            available_finals = [f for f in final_features if f in X_new.columns]
            X_new = X_new[available_finals]
        else:
            print("  ℹ️ No features explicitly kept - preserving all original features")

        # Step 5: Quality checks
        print("\n  🔍 Final data quality check:")
        X_new = self._quality_check(X_new, len(X.columns))

        # Step 6: Add metadata to result
        result = {
            'X_new': X_new,
            'feature_metadata': feature_metadata,  # NEW
            'summary': {
                'input_features': len(X.columns),
                'output_features': len(X_new.columns),
                'kept': len(kept_features),
                'created': len(created_features),
                'removed': len(removed_features),
                'failed': len(failed_features)
            }
        }

        # Summary
        print(f"\n  📊 Feature Generation Summary:")
        print(f"     Input features:   {len(X.columns)}")
        print(f"     Output features:  {len(X_new.columns)}")
        print(f"     Kept:             {len(kept_features)}")
        print(f"     Created:          {len(created_features)}")
        print(f"     Removed:          {len(removed_features)}")
        print(f"     Failed:           {len(failed_features)}")

        if failed_features and len(failed_features) <= 5:
            print(f"\n  ⚠ Failed features:")
            for fail in failed_features[:5]:
                if 'missing' in fail:
                    print(f"     - {fail['name']}: missing {fail['missing']}")
                else:
                    print(f"     - {fail['name']}: {fail.get('error', 'unknown error')}")

        return result  # NEW: Return dict with metadata instead of just X_new

    def _quality_check(self, X_new, original_count):
        """Ensure the new feature set is valid"""
        import numpy as np

        # Check 1: Not empty
        if X_new.empty or len(X_new.columns) == 0:
            raise ValueError("Feature generation resulted in empty dataframe!")

        # Check 2: No duplicate columns
        if len(X_new.columns) != len(set(X_new.columns)):
            duplicates = [col for col in X_new.columns if list(X_new.columns).count(col) > 1]
            print(f"    ⚠ Removing duplicate columns: {set(duplicates)}")
            X_new = X_new.loc[:, ~X_new.columns.duplicated()]

        # Check 3: Remove all-NaN columns
        all_nan_cols = X_new.columns[X_new.isna().all()].tolist()
        if all_nan_cols:
            print(f"    ⚠ Removing all-NaN columns: {all_nan_cols}")
            X_new = X_new.drop(columns=all_nan_cols)

        # Check 4: Remove constant columns (zero variance)
        constant_cols = []
        for col in X_new.columns:
            if X_new[col].dtype in [np.number]:
                if X_new[col].nunique() == 1:
                    constant_cols.append(col)

        if constant_cols:
            print(f"    ⚠ Removing constant columns: {constant_cols}")
            X_new = X_new.drop(columns=constant_cols)

        # Check 5: Reasonable feature count
        if len(X_new.columns) > original_count * 5:
            print(f"    ⚠ WARNING: Feature count exploded: {original_count} → {len(X_new.columns)}")
            print(f"       This might cause overfitting!")

        print(f"    ✓ Passed quality checks")

        return X_new

In [7]:
def best_threshold_for_f1(y_true, y_prob):
    """
    Find the decision threshold that maximizes the F1-score.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted scores/probabilities for the positive class.

    Returns:
        (best_threshold, best_f1, precision, recall) evaluated at that threshold.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)

    # precision/recall carry one extra trailing point that has no matching
    # threshold; drop it so every candidate index is valid for `thresholds`.
    precision, recall = precision[:-1], recall[:-1]

    f1s = 2 * precision * recall / (precision + recall + 1e-12)
    # Undefined F1 values (e.g. NaN recall when no positives exist) count as 0
    # so the arg-max is always well defined.
    f1s = np.nan_to_num(f1s, nan=0.0)

    idx = int(np.argmax(f1s))
    return thresholds[idx], f1s[idx], precision[idx], recall[idx]

In [8]:
class FeatureEvaluator:
    """Evaluates feature sets using explainable models with comprehensive metrics

    `evaluate` cleans the frame, imputes/scales it, cross-validates every
    configured model and ranks the features by model-derived importance.
    """

    def __init__(self, task_type='classification', models=None, n_cv_folds=5,
                 preprocessing='auto', random_state=42):
        """
        Initialize evaluator with explainable models and detailed metrics

        Args:
            task_type: 'classification' or 'regression'
            models: List of model names. If None, uses explainable defaults.
                   Options: ['random_forest', 'xgboost', 'gradient_boosting', 'svm',
                            'decision_tree', 'extra_trees']
            n_cv_folds: Number of cross-validation folds
            preprocessing: 'auto', 'standard', 'minmax', 'robust', or None
            random_state: Random seed for reproducibility
        """
        self.task_type = task_type
        self.n_cv_folds = n_cv_folds
        self.preprocessing = preprocessing
        self.random_state = random_state

        # Set default to EXPLAINABLE models only
        if models is None:
            self.models = ['random_forest', 'xgboost', 'gradient_boosting', 'svm']
        else:
            self.models = models

        self.scaler = None
        self.imputer = None
        # Negative/positive class ratio for binary targets; set by evaluate().
        self._spw = None

    def evaluate(self, X, y, feature_metadata=None, return_all_scores=False):
        """
        Evaluate features with multiple explainable models and detailed metrics

        Args:
            X: Feature dataframe
            y: Target variable
            feature_metadata: Dict with explainability scores from FeatureGenerator
            return_all_scores: If True, return detailed metrics per model

        Returns:
            (avg_score, top_features) or, when return_all_scores=True,
            (avg_score, top_features, details) where details holds
            'model_scores', 'detailed_metrics' and 'feature_importances'
            (an empty dict if every model failed).

        NOTE(review): imputation and scaling are fitted on the full X before
        cross-validation, so fold scores may be slightly optimistic.
        """
        from sklearn.model_selection import cross_validate
        import numpy as np

        # Clean and preprocess data
        X_cleaned = self._clean_data(X)
        X_processed = self._preprocess_data(X_cleaned)

        # Class-imbalance ratio used by the XGBoost classifier (binary only)
        self._spw = None
        if self.task_type == 'classification':
            classes = np.unique(y)
            if len(classes) == 2:
                n_pos = np.sum(y == classes[1])
                n_neg = np.sum(y == classes[0])
                if n_pos > 0:
                    self._spw = float(n_neg) / float(n_pos)

        # Track results
        model_scores = {}
        all_importances = {}
        detailed_metrics = {}
        # Failures are tracked explicitly: a score of 0 (or a negative R²) is a
        # legitimate result and must not be mistaken for a crashed model.
        failed_models = set()

        print(f"\n  📊 Evaluating with {len(self.models)} explainable models ({self.n_cv_folds}-fold CV):")

        scoring_metrics = self._get_scoring_metrics()
        primary_metric = self._get_primary_metric_name()

        for model_name in self.models:
            try:
                model = self._get_model(model_name)

                # Get comprehensive metrics using cross_validate
                cv_results = cross_validate(
                    model, X_processed, y,
                    cv=self.n_cv_folds,
                    scoring=scoring_metrics,
                    return_train_score=False
                )

                # Extract primary metric
                primary_scores = cv_results[f'test_{primary_metric}']
                avg_score = primary_scores.mean()
                std_score = primary_scores.std()
                if not np.isfinite(avg_score):
                    raise ValueError(f"non-finite {primary_metric} score")

                model_scores[model_name] = {
                    'mean': avg_score,
                    'std': std_score,
                    'scores': primary_scores
                }

                # Store detailed metrics
                detailed_metrics[model_name] = self._extract_detailed_metrics(cv_results)

                # Print summary with explainability note
                print(f"    {model_name:20s}: {avg_score:.4f} (±{std_score:.4f}) "
                      f"[Explainable ✓]")

            except Exception as e:
                print(f"    ⚠ {model_name} failed: {e}")
                model_scores[model_name] = {'mean': 0.0, 'std': 0.0, 'scores': []}
                detailed_metrics.pop(model_name, None)
                failed_models.add(model_name)
                continue

            # Importance extraction is best-effort; a failure here must not
            # erase the cross-validation score obtained above.
            try:
                importances = self._get_feature_importance(model, X_processed, y)
            except Exception as e:
                print(f"    ⚠ {model_name} feature importance failed: {e}")
                importances = None
            if importances is not None:
                all_importances[model_name] = importances

        # Aggregate results
        valid_scores = [
            stats['mean'] for name, stats in model_scores.items()
            if name not in failed_models
        ]
        if not valid_scores:
            print("  ⚠ All models failed! Returning fallback values.")
            fallback_features = list(X_cleaned.columns[:5])
            # Keep the tuple arity consistent with the success path.
            if return_all_scores:
                return 0.0, fallback_features, {}
            return 0.0, fallback_features

        # Calculate average score across the models that completed
        avg_score = np.mean(valid_scores)

        # Aggregate feature importances with explainability weighting
        top_features = self._aggregate_feature_importance(
            all_importances,
            X_processed.columns,
            feature_metadata
        )

        # Print summary with detailed metrics
        print(f"\n  ✓ Average {self._get_primary_metric_name()}: {avg_score:.4f}")
        self._print_detailed_metrics_summary(detailed_metrics)
        print(f"  ✓ Top features: {top_features[:5]}")

        if return_all_scores:
            return avg_score, top_features, {
                'model_scores': model_scores,
                'detailed_metrics': detailed_metrics,
                'feature_importances': all_importances
            }
        return avg_score, top_features

    def _get_model(self, model_name):
        """Get explainable model instance by name

        Raises:
            ImportError: 'xgboost' was requested but the package is missing.
            ValueError: ``model_name`` is not one of the supported models.
        """
        from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                                       GradientBoostingClassifier, GradientBoostingRegressor,
                                       ExtraTreesClassifier, ExtraTreesRegressor)
        from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
        from sklearn.svm import SVC, SVR

        is_classification = self.task_type == 'classification'

        # EXPLAINABLE MODELS ONLY
        if model_name == 'random_forest':
            if is_classification:
                return RandomForestClassifier(
                    n_estimators=400,
                    max_depth=12,
                    min_samples_split=5,
                    min_samples_leaf=2,
                    class_weight='balanced',  # compensate for class imbalance
                    random_state=self.random_state,
                    n_jobs=-1
                )
            return RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=self.random_state,
                n_jobs=-1
            )

        if model_name == 'xgboost':
            # Optional dependency: only imported when actually requested
            try:
                from xgboost import XGBClassifier, XGBRegressor
            except ImportError:
                raise ImportError("XGBoost not installed. Install with: pip install xgboost")

            if is_classification:
                return XGBClassifier(
                    n_estimators=600,
                    max_depth=4,
                    learning_rate=0.05,
                    subsample=0.9,
                    colsample_bytree=0.8,
                    random_state=self.random_state,
                    # neg/pos ratio computed in evaluate(); 1.0 when unavailable
                    scale_pos_weight=(self._spw if getattr(self, "_spw", None) else 1.0),
                    eval_metric='aucpr'  # focus on precision-recall
                )
            return XGBRegressor(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=self.random_state,
                verbosity=0
            )

        if model_name == 'gradient_boosting':
            # Scikit-learn's GradientBoosting - highly explainable
            if is_classification:
                return GradientBoostingClassifier(
                    n_estimators=100,
                    max_depth=5,
                    learning_rate=0.1,
                    random_state=self.random_state
                )
            return GradientBoostingRegressor(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=self.random_state
            )

        if model_name == 'extra_trees':
            # Extra Trees - similar to RF but different splitting strategy
            if is_classification:
                return ExtraTreesClassifier(
                    n_estimators=100,
                    max_depth=10,
                    min_samples_split=5,
                    random_state=self.random_state,
                    n_jobs=-1
                )
            return ExtraTreesRegressor(
                n_estimators=100,
                max_depth=10,
                min_samples_split=5,
                random_state=self.random_state,
                n_jobs=-1
            )

        if model_name == 'decision_tree':
            # Single decision tree - maximally explainable
            if is_classification:
                return DecisionTreeClassifier(
                    max_depth=8,
                    min_samples_split=10,
                    min_samples_leaf=5,
                    random_state=self.random_state
                )
            return DecisionTreeRegressor(
                max_depth=8,
                min_samples_split=10,
                min_samples_leaf=5,
                random_state=self.random_state
            )

        if model_name == 'svm':
            # SVM requires scaling
            if is_classification:
                return SVC(
                    kernel='rbf',
                    C=1.0,
                    random_state=self.random_state,
                    max_iter=1000,
                    probability=True  # For ROC-AUC
                )
            return SVR(
                kernel='rbf',
                C=1.0,
                max_iter=1000
            )

        raise ValueError(f"Unknown model: {model_name}. Use explainable models only.")

    def _get_scoring_metrics(self):
        """Return the scorer mapping passed to cross_validate for the task type.

        NOTE(review): 'average_precision' and 'roc_auc' look binary-only;
        confirm behaviour for multi-class targets.
        """
        if self.task_type == 'classification':
            return {
                'average_precision': 'average_precision',  # PRIMARY (AUCPR)
                'roc_auc': 'roc_auc',
                'accuracy': 'accuracy',
                'precision': 'precision_macro',
                'recall': 'recall_macro',
                'f1': 'f1_macro',
            }
        return {
            'r2': 'r2',
            'neg_mse': 'neg_mean_squared_error',
            'neg_mae': 'neg_mean_absolute_error',
            'neg_rmse': 'neg_root_mean_squared_error'
        }

    def _get_primary_metric_name(self):
        """Name of the metric used as the headline score for the task type."""
        return 'average_precision' if self.task_type == 'classification' else 'r2'

    def _extract_detailed_metrics(self, cv_results):
        """Extract and organize detailed metrics from cross-validation

        Only keys starting with 'test_' are kept; the prefix is stripped and
        each metric is summarized as {'mean', 'std', 'values'}.
        """
        import numpy as np

        prefix = 'test_'
        metrics = {}

        for key, values in cv_results.items():
            if key.startswith(prefix):
                metrics[key[len(prefix):]] = {
                    'mean': np.mean(values),
                    'std': np.std(values),
                    'values': values
                }

        return metrics

    def _print_detailed_metrics_summary(self, detailed_metrics):
        """Print summary of detailed metrics across all models

        Each metric's per-model means are averaged; the printed ± value is the
        standard deviation of those per-model means.
        """
        import numpy as np

        if not detailed_metrics:
            return

        print("\n  📈 Detailed Performance Metrics (averaged across models):")

        # Aggregate metrics across models
        all_metric_names = set()
        for model_metrics in detailed_metrics.values():
            all_metric_names.update(model_metrics.keys())

        aggregated = {}
        for metric_name in all_metric_names:
            values = [
                model_metrics[metric_name]['mean']
                for model_metrics in detailed_metrics.values()
                if metric_name in model_metrics
            ]
            if values:
                aggregated[metric_name] = {
                    'mean': np.mean(values),
                    'std': np.std(values)
                }

        # Print classification metrics
        if self.task_type == 'classification':
            if 'accuracy' in aggregated:
                print(f"    Accuracy:  {aggregated['accuracy']['mean']:.4f} (±{aggregated['accuracy']['std']:.4f})")
            if 'precision' in aggregated:
                print(f"    Precision: {aggregated['precision']['mean']:.4f} (±{aggregated['precision']['std']:.4f}) "
                      "[True Pos / (True Pos + False Pos)]")
            if 'recall' in aggregated:
                print(f"    Recall:    {aggregated['recall']['mean']:.4f} (±{aggregated['recall']['std']:.4f}) "
                      "[True Pos / (True Pos + False Neg)]")
            if 'f1' in aggregated:
                print(f"    F1 Score:  {aggregated['f1']['mean']:.4f} (±{aggregated['f1']['std']:.4f}) "
                      "[Harmonic mean of precision & recall]")
            if 'roc_auc' in aggregated:
                print(f"    ROC-AUC:   {aggregated['roc_auc']['mean']:.4f} (±{aggregated['roc_auc']['std']:.4f})")

        # Print regression metrics
        else:
            if 'r2' in aggregated:
                print(f"    R² Score:  {aggregated['r2']['mean']:.4f} (±{aggregated['r2']['std']:.4f})")
            if 'neg_mae' in aggregated:
                mae = -aggregated['neg_mae']['mean']  # Convert back to positive
                print(f"    MAE:       {mae:.4f} (±{aggregated['neg_mae']['std']:.4f})")
            if 'neg_rmse' in aggregated:
                rmse = -aggregated['neg_rmse']['mean']
                print(f"    RMSE:      {rmse:.4f} (±{aggregated['neg_rmse']['std']:.4f})")

    def _clean_data(self, X):
        """Clean data: handle inf, NaN, duplicates, constants

        Works on a copy of X. Steps: replace ±inf with NaN, drop columns with
        at most one distinct value, drop columns identical to an earlier one,
        and integer-encode object/category columns (codes follow the sorted
        order of the string labels).
        """
        import numpy as np
        import pandas as pd

        X_cleaned = X.copy()

        print("  🧹 Cleaning data...")

        # Step 1: Handle infinite values.
        # np.isinf raises TypeError on object/category data, so only numeric
        # columns are inspected; non-numeric columns cannot hold inf anyway.
        numeric_part = X_cleaned.select_dtypes(include=[np.number])
        inf_cols = []
        if numeric_part.shape[1] > 0:
            inf_cols = numeric_part.columns[np.isinf(numeric_part).any()].tolist()
        if inf_cols:
            print(f"    ⚠ Replacing inf values in {len(inf_cols)} columns")
            X_cleaned = X_cleaned.replace([np.inf, -np.inf], np.nan)

        # Step 2: Check for constant columns (zero variance)
        constant_cols = [
            col for col in X_cleaned.columns
            if X_cleaned[col].nunique() <= 1
        ]

        if constant_cols:
            print(f"    ⚠ Dropping {len(constant_cols)} constant columns: {constant_cols[:5]}")
            X_cleaned = X_cleaned.drop(columns=constant_cols)

        # Step 3: Check for duplicate columns (each duplicate recorded once)
        duplicate_cols = []
        column_list = list(X_cleaned.columns)
        for i, col1 in enumerate(column_list):
            if col1 in duplicate_cols:
                continue
            for col2 in column_list[i + 1:]:
                if col2 not in duplicate_cols and X_cleaned[col1].equals(X_cleaned[col2]):
                    duplicate_cols.append(col2)

        if duplicate_cols:
            print(f"    ⚠ Dropping {len(duplicate_cols)} duplicate columns")
            X_cleaned = X_cleaned.drop(columns=duplicate_cols)

        # Step 4: Handle categorical columns
        categorical_cols = X_cleaned.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            print(f"    ⚠ Converting {len(categorical_cols)} categorical columns to numeric")
            for col in categorical_cols:
                try:
                    # sort=True numbers the labels in sorted order
                    codes, _ = pd.factorize(X_cleaned[col].astype(str), sort=True)
                    X_cleaned[col] = codes
                except Exception:
                    print(f"      ⚠ Failed to encode {col}, dropping it")
                    X_cleaned = X_cleaned.drop(columns=[col])

        print(f"    ✓ Cleaned: {X.shape[1]} → {X_cleaned.shape[1]} features")

        return X_cleaned

    def _preprocess_data(self, X):
        """Preprocess data with imputation and scaling

        Median-imputes NaN values when any are present, then scales according
        to ``self.preprocessing``. 'auto' picks the robust scaler when any
        column's range exceeds 10x its IQR, otherwise the standard scaler.
        The fitted objects are stored on ``self.imputer`` / ``self.scaler``.
        """
        from sklearn.impute import SimpleImputer
        from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
        import pandas as pd

        print("  ⚙️ Preprocessing data...")

        # Step 1: Imputation
        if X.isna().any().any():
            nan_counts = X.isna().sum()
            nan_cols = nan_counts[nan_counts > 0]
            print(f"    Imputing NaN in {len(nan_cols)} columns")

            self.imputer = SimpleImputer(strategy='median')
            X_imputed = pd.DataFrame(
                self.imputer.fit_transform(X),
                columns=X.columns,
                index=X.index
            )
        else:
            X_imputed = X.copy()

        # Step 2: Scaling (important for SVM)
        if self.preprocessing is None:
            print("    No scaling applied")
            return X_imputed

        # Auto-detect best scaler
        if self.preprocessing == 'auto':
            has_outliers = False
            for col in X_imputed.columns:
                q1, q3 = X_imputed[col].quantile([0.25, 0.75])
                iqr = q3 - q1
                if iqr > 0:
                    outlier_range = (X_imputed[col].max() - X_imputed[col].min()) / iqr
                    if outlier_range > 10:
                        has_outliers = True
                        break

            scaler_type = 'robust' if has_outliers else 'standard'
            print(f"    Auto-selected {scaler_type} scaling")
        else:
            scaler_type = self.preprocessing

        # Apply scaling
        if scaler_type == 'standard':
            self.scaler = StandardScaler()
        elif scaler_type == 'minmax':
            self.scaler = MinMaxScaler()
        elif scaler_type == 'robust':
            self.scaler = RobustScaler()
        else:
            print(f"    ⚠ Unknown scaler: {scaler_type}, using standard")
            scaler_type = 'standard'  # so the summary line names the scaler actually used
            self.scaler = StandardScaler()

        X_scaled = pd.DataFrame(
            self.scaler.fit_transform(X_imputed),
            columns=X_imputed.columns,
            index=X_imputed.index
        )

        print(f"    ✓ Applied {scaler_type} scaling")

        return X_scaled

    def _get_feature_importance(self, model, X, y):
        """Extract feature importance from model

        Fits ``model`` on the full (X, y) when it exposes neither
        ``feature_importances_`` nor ``coef_`` yet.

        Returns:
            Dict mapping column name to importance, or None when the fit
            fails or the model exposes no importance attribute.
        """
        import numpy as np

        # Fit model if not already fitted
        if not hasattr(model, 'feature_importances_') and not hasattr(model, 'coef_'):
            try:
                model.fit(X, y)
            except Exception:
                return None

        # Get importances
        if hasattr(model, 'feature_importances_'):
            return dict(zip(X.columns, model.feature_importances_))
        if hasattr(model, 'coef_'):
            # For SVM with linear kernel or other linear models
            coef = model.coef_
            if len(coef.shape) > 1:  # Multi-class
                coef = np.abs(coef).mean(axis=0)
            else:
                coef = np.abs(coef)
            return dict(zip(X.columns, coef))
        return None

    def _aggregate_feature_importance(self, all_importances, feature_names, feature_metadata=None):
        """
        Aggregate feature importance across models with explainability weighting

        Features are ranked per model (1 = most important) and the ranks are
        averaged. When metadata is supplied, a feature's average rank is
        worsened by 2% per point its explainability score falls below 5
        (at most 8% for a score of 1), so more explainable features win ties.

        Returns:
            Up to ten feature names, best first. Without any importances the
            first ten ``feature_names`` are returned unchanged.
        """
        import numpy as np

        if not all_importances:
            return list(feature_names[:10])

        default_score = 3

        # Get explainability scores if available
        explainability_scores = {}
        if feature_metadata:
            for feat in feature_names:
                score = default_score
                entry = feature_metadata[feat] if feat in feature_metadata else None
                if isinstance(entry, dict):
                    raw = entry.get('explainability_score', default_score)
                    # Missing/None/non-numeric scores fall back to the default
                    if isinstance(raw, (int, float)) and np.isfinite(raw):
                        score = raw
                explainability_scores[feat] = score

        # Rank features for each model
        feature_ranks = {feat: [] for feat in feature_names}

        for importances in all_importances.values():
            # Sort by importance
            sorted_features = sorted(importances.items(), key=lambda x: -x[1])

            # Assign ranks (setdefault tolerates names absent from feature_names)
            for rank, (feat, _) in enumerate(sorted_features, 1):
                feature_ranks.setdefault(feat, []).append(rank)

        # Average ranks with optional explainability weighting
        avg_ranks = {}
        for feat, ranks in feature_ranks.items():
            if not ranks:
                # Never ranked by any model: place it last
                avg_ranks[feat] = len(feature_names)
                continue

            base_rank = np.mean(ranks)
            if feat in explainability_scores:
                # Lower explainability => larger (worse) rank
                penalty = (5 - explainability_scores[feat]) * 0.02 * base_rank
                avg_ranks[feat] = base_rank + penalty
            else:
                avg_ranks[feat] = base_rank

        # Sort by average rank (lower is better)
        top_features = sorted(avg_ranks.items(), key=lambda x: x[1])

        return [f[0] for f in top_features[:10]]

In [9]:
#patch class

def _extract_explainability_scores(feature_metadata):
    """
    Return a list of numeric 'score' values from feature_metadata in a tolerant way.
    Accepts dicts of dicts, dicts of numbers, or lists of either.
    """
    out = []
    if feature_metadata is None:
        return out

    def _maybe_add(x):
        try:
            val = float(x)
            if math.isfinite(val):
                out.append(val)
        except Exception:
            pass

    # Case: dict-like
    if isinstance(feature_metadata, dict):
        for v in feature_metadata.values():
            if isinstance(v, dict):
                s = v.get('score')
                if isinstance(s, (int, float)):
                    _maybe_add(s)
            elif isinstance(v, (int, float)):
                _maybe_add(v)
            # else: ignore strings/None/etc.

    # Case: list-like
    elif isinstance(feature_metadata, (list, tuple)):
        for v in feature_metadata:
            if isinstance(v, dict):
                s = v.get('score')
                if isinstance(s, (int, float)):
                    _maybe_add(s)
            elif isinstance(v, (int, float)):
                _maybe_add(v)

    # Anything else: ignore
    return out



In [10]:
class FeatureEngineeringPipeline:
    """Main pipeline that orchestrates all components with explainability tracking"""

    def __init__(
        self,
        api_key,
        task_type='classification',
        models=None,
        preprocessing='auto',
        n_cv_folds=5,
        min_explainability_score=3,
        random_state=42,
    ):
        """
        Build the pipeline and wire up its collaborating components.

        Args:
            api_key: OpenAI API key used to create the shared client
            task_type: 'classification' or 'regression'
            models: Model names forwarded to the evaluator
            preprocessing: Preprocessing strategy forwarded to the evaluator
            n_cv_folds: Cross-validation folds forwarded to the evaluator
            min_explainability_score: Minimum explainability score (1-5),
                forwarded to the validator
            random_state: Random seed
        """
        # One client instance is shared by every LLM-backed component.
        self.client = OpenAI(api_key=api_key)

        self.task_type = task_type
        self.random_state = random_state
        self.history = []

        # Best-result bookkeeping
        self.best_model = None
        self.best_X = None
        self.best_y = None
        self.feature_formulas = {}
        self.feature_metadata = {}

        # Components, created in dependency order (the strategist needs the validator)
        self.researcher = ResearchAgent(self.client)  # No serper key needed
        self.validator = FeatureValidator(
            client=self.client,
            min_explainability_score=min_explainability_score,
        )
        self.strategist = FeatureStrategyAgent(self.client, validator=self.validator)
        self.generator = FeatureGenerator()

        evaluator_options = {
            'task_type': task_type,
            'models': models,
            'preprocessing': preprocessing,
            'n_cv_folds': n_cv_folds,
            'random_state': random_state,
        }
        self.evaluator = FeatureEvaluator(**evaluator_options)
    def _safe_scores_from_metadata(self, feature_metadata):
      vals = {}
      if not feature_metadata:
          return vals
      if isinstance(feature_metadata, dict):
          for k, v in feature_metadata.items():
              if isinstance(v, dict) and 'score' in v and isinstance(v['score'], (int, float)):
                  vals[k] = float(v['score'])
              elif isinstance(v, (int, float)):
                  vals[k] = float(v)
      elif isinstance(feature_metadata, (list, tuple)):
          # optional: support list forms
          for item in feature_metadata:
              if isinstance(item, dict):
                  name = item.get('name')
                  s = item.get('score')
                  if name and isinstance(s, (int, float)):
                      vals[name] = float(s)
      return vals
    def run(self, df, target_col, max_iterations=30, min_improvement=0.01,
            patience=3, metadata=None, verbose=True):
        """
        Run the feature engineering pipeline

        Args:
            df: Input dataframe
            target_col: Name of target column
            max_iterations: Maximum number of iterations
            min_improvement: Minimum score improvement to reset patience
            patience: Number of iterations without improvement before stopping
            metadata: Dict with 'domain' and 'problem' description
            verbose: Print detailed progress

        Returns:
            result: Dict with:
                - X_best: Best feature dataframe
                - y: Target variable
                - best_score: Best score achieved
                - model: Trained model on best features
                - feature_documentation: Human-readable explanations
                - history: Complete iteration history
        """
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Enrich metadata
        if metadata is None:
            metadata = {}
        metadata = self._enrich_metadata(metadata, X, y, target_col)

        # Track best results
        best_score = 0
        best_X = X.copy()
        best_iteration_data = None
        no_improvement_count = 0

        print(f"\n{'='*70}")
        print(f"🚀 STARTING EXPLAINABLE FEATURE ENGINEERING PIPELINE")
        print(f"{'='*70}")
        print(f"  Dataset: {X.shape[0]} rows × {X.shape[1]} features")
        print(f"  Target: {target_col} ({self.task_type})")
        print(f"  Domain: {metadata.get('domain', 'unknown')}")
        print(f"  Models: {self.evaluator.models}")
        print(f"  Min Explainability: {self.validator.min_explainability_score}/5")
        print(f"  Max iterations: {max_iterations} (patience: {patience})")
        print(f"{'='*70}\n")

        for iteration in range(1, max_iterations + 1):
            print(f"\n{'='*70}")
            print(f"🔄 ITERATION {iteration}/{max_iterations}")
            print(f"{'='*70}")

            # Get feedback from previous iteration
            feedback = self._build_feedback(iteration)

            # Phase 1: Research (optional)
            research_context = None
            print("\n📚 Phase 1: Research")
            try:
                research_context = self.researcher.search(
                target_col,
                metadata,
                feedback
            )
            except Exception as e:
                print(f"  ⚠ Research failed: {e}")
                print("  Continuing without research insights...")

            # Phase 2: Strategy Design
            print("\n🎯 Phase 2: Strategy Design & Validation")
            try:
                strategy = self.strategist.design_strategy(
                    X,
                    target_col,
                    feedback,
                    research_context
                )
            except Exception as e:
                print(f"  ❌ Strategy design failed: {e}")
                break

            # Phase 3: Feature Generation
            print("\n🔧 Phase 3: Feature Generation")
            try:
                generation_result = self.generator.generate(X, strategy)

                # Handle both return formats
                X_augmented = generation_result['X_new'] if isinstance(generation_result, dict) else generation_result
                last_X_augmented = X_augmented
                feature_metadata = {}

                metadata = self._enrich_metadata(metadata, X_augmented, y, target_col)

            except Exception as e:
                print(f"  ❌ Feature generation failed: {e}")
                import traceback
                traceback.print_exc()
                break

            # Phase 4: Evaluation
            print("\n📊 Phase 4: Evaluation")
            try:
                score, top_features, eval_details = self.evaluator.evaluate(
                    X_augmented,
                    y,
                    feature_metadata=feature_metadata,
                    return_all_scores=True
                )
            except Exception as e:
                print(f"  ❌ Evaluation failed: {e}")
                import traceback
                traceback.print_exc()
                break

            # Record history
            iteration_result = {
                'iteration': iteration,
                'score': score,
                'num_features': X_augmented.shape[1],
                'top_features': top_features,
                'model_scores': eval_details.get('model_scores', {}),
                'detailed_metrics': eval_details.get('detailed_metrics', {}),
                'strategy': strategy,
                'feature_names': list(X_augmented.columns),
                'feature_metadata': feature_metadata,
                'feature_formulas': self._extract_formulas(strategy),
            }
            self.history.append(iteration_result)

            # Print iteration summary
            self._print_iteration_summary(iteration, score, X, X_augmented,
                                         top_features, feature_metadata)

            # Check improvement
            improvement = score - best_score

            if improvement > min_improvement:
                print(f"  ✅ Improvement: +{improvement:.4f}")
                best_score = score
                best_X = X_augmented.copy()
                best_iteration_data = iteration_result
                no_improvement_count = 0
                X = X_augmented
            else:
                print(f"  ⚠️  No significant improvement: +{improvement:.4f}")
                no_improvement_count += 1

                if no_improvement_count >= patience:
                    print(f"\n{'='*70}")
                    print(f"🛑 EARLY STOPPING: No improvement for {patience} iterations")
                    print(f"{'='*70}")
                    break

                X = X_augmented

        # Check if we have results
        if not self.history:
            print("\n⚠️ No iterations completed successfully!")
            return self._create_fallback_result(X, y)

        # Get best iteration
        best_iteration_data = max(self.history, key=lambda x: x['score'])

        # Keep the metadata for docs
        self.feature_formulas = best_iteration_data['feature_formulas']
        self.feature_metadata = best_iteration_data['feature_metadata']

        # Use the best_X you tracked during the loop; fallback to the last augmented X if needed
        if best_X is None:
            # Fallback: if nothing ever cleared min_improvement, use the final X from the last iteration
            # (see the small change below to capture `last_X_augmented` inside the loop)
            best_X = last_X_augmented if 'last_X_augmented' in locals() and last_X_augmented is not None else X

        self.best_X = best_X
        self.best_y = y

        # Train final model
        print("\n🎓 Training final model on best features...")
        self.best_model = self._train_final_model(best_X, y)

        # Generate documentation
        print("\n📝 Generating feature documentation...")
        feature_docs = self._generate_documentation(best_iteration_data)

        # Print final results
        self._print_final_results(best_iteration_data)

        # Return results
        return {
            'X_best': best_X,
            'y': y,
            'best_score': best_score,
            'best_iteration': best_iteration_data['iteration'],
            'model': self.best_model,
            'feature_formulas': self.feature_formulas,
            'feature_metadata': self.feature_metadata,
            'feature_documentation': feature_docs,
            'top_features': best_iteration_data['top_features'],
            'history': self.history
        }

    def _extract_formulas(self, strategy):
        """Extract formulas from strategy actions"""
        formulas = {}

        for action in strategy.get('actions', []):
            if action.get('action') == 'create':
                name = action.get('name')
                formula = action.get('formula')
                if name and formula:
                    formulas[name] = formula

        return formulas

    def _train_final_model(self, X, y):
        """Fit the final random-forest model on the chosen feature set.

        A classifier is built when ``self.task_type == 'classification'``,
        otherwise a regressor; both share the same hyper-parameters. ``X`` is
        passed through the evaluator's cleaning and preprocessing steps before
        fitting, and the five largest feature importances are printed.

        Args:
            X: feature table to train on.
            y: target values.

        Returns:
            The fitted scikit-learn forest model.
        """
        from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

        # Random forests are used because their importances are easy to inspect.
        forest_cls = (
            RandomForestClassifier if self.task_type == 'classification'
            else RandomForestRegressor
        )
        model = forest_cls(
            n_estimators=200,
            max_depth=12,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=self.random_state,
            n_jobs=-1,
        )

        # Clean first, then preprocess, using the evaluator's own routines.
        X_processed = self.evaluator._preprocess_data(self.evaluator._clean_data(X))

        model.fit(X_processed, y)

        # Rank features by importance, largest first.
        importance_by_feature = dict(zip(X_processed.columns, model.feature_importances_))
        ranked = sorted(importance_by_feature.items(), key=lambda pair: -pair[1])

        print(f"  ✓ Model trained: {model.__class__.__name__}")
        print(f"  Top 5 important features:")
        for feature_name, weight in ranked[:5]:
            print(f"    - {feature_name}: {weight:.4f}")

        return model

    def _generate_documentation(self, best_iteration_data):
        """Generate comprehensive human-readable documentation"""

        strategy = best_iteration_data['strategy']
        domain = strategy.get('domain', 'Unknown')

        # Use strategist to generate documentation
        docs = self.strategist.generate_feature_documentation(
            strategy.get('actions', []),
            domain
        )

        # Add explainability summary
        metadata = best_iteration_data.get('feature_metadata', {})
        scores_map = self._safe_scores_from_metadata(metadata)
        scores = list(scores_map.values())
        if scores:
            avg_score = sum(scores) / len(scores)
            high_quality = sum(1 for s in scores if s >= 4)
            summary = f"\n{'='*70}\nEXPLAINABILITY SUMMARY\n{'='*70}\n"
            summary += f"Average Explainability Score: {avg_score:.2f}/5\n"
            summary += f"Highly Explainable Features (≥4): {high_quality}/{len(scores)}\n"
            summary += f"Total Features: {len(scores)}\n"
            docs = summary + "\n" + docs

        return docs

    def _print_iteration_summary(self, iteration, score, X_old, X_new,
                             top_features, feature_metadata):
      """Print detailed iteration summary (safe version)"""

      def _extract_explainability_scores(feature_metadata):
        """Extract numeric 'score' values robustly."""
        out = []
        if feature_metadata is None:
            return out

        def _add(v):
            try:
                val = float(v)
                if math.isfinite(val):
                    out.append(val)
            except Exception:
                pass

        # Dict of dicts or dict of numbers
        if isinstance(feature_metadata, dict):
            for v in feature_metadata.values():
                if isinstance(v, dict):
                    s = v.get("score")
                    if isinstance(s, (int, float)):
                        _add(s)
                elif isinstance(v, (int, float)):
                    _add(v)
        # List/tuple of dicts or numbers
        elif isinstance(feature_metadata, (list, tuple)):
            for v in feature_metadata:
                if isinstance(v, dict):
                    s = v.get("score")
                    if isinstance(s, (int, float)):
                        _add(s)
                elif isinstance(v, (int, float)):
                    _add(v)
        return out

      print(f"\n{'─'*70}")
      print(f"📈 ITERATION {iteration} SUMMARY")
      print(f"{'─'*70}")
      print(f"  Score: {score:.4f}")
      print(f"  Features: {X_old.shape[1]} → {X_new.shape[1]}")

      # Explainability stats (robust)
      try:
          scores = _extract_explainability_scores(feature_metadata)
          if scores:
              avg_score = sum(scores) / len(scores)
              high_quality = sum(1 for s in scores if s >= 4)
              medium_quality = sum(1 for s in scores if s == 3)
              low_quality = sum(1 for s in scores if s < 3)
              print(f"  Explainability: {avg_score:.1f}/5 avg | "
                    f"High (≥4): {high_quality}, Medium (3): {medium_quality}, Low (<3): {low_quality}")
      except Exception as e:
          print(f"  (Explainability stats unavailable: {e})")

      print(f"  Top 3 features: {top_features[:3]}")
    def _enrich_metadata(self, metadata, X, y, target_col):
        """Enrich metadata with dataset statistics"""
        import numpy as np

        enriched = metadata.copy()

        if 'domain' not in enriched:
            enriched['domain'] = 'general'

        enriched['target'] = target_col
        enriched['n_rows'] = len(X)
        enriched['n_features'] = len(X.columns)
        enriched['headers'] = list(X.columns)
        enriched['column_types'] = [str(dtype) for dtype in X.dtypes]
        enriched['column_names'] = list(X.columns)

        if self.task_type == 'classification':
            enriched['n_classes'] = y.nunique()
            enriched['class_balance'] = dict(y.value_counts(normalize=True))
        else:
            enriched['target_mean'] = float(y.mean())
            enriched['target_std'] = float(y.std())

        enriched['data_stats'] = {
            'n_rows': len(X),
            'missing_rate': float(X.isna().mean().mean()),
            'missing_summary': dict(X.isna().sum()[X.isna().sum() > 0])
        }

        return enriched

    def _build_feedback(self, iteration):
        """Build feedback from previous iterations"""
        if not self.history:
            return None

        prev = self.history[-1]

        feedback = {
            'iteration': iteration - 1,
            'best_score': prev['score'],
            'top_features': prev['top_features'],
            'num_features': prev['num_features'],
            'feature_explainability': prev.get('feature_metadata', {})
        }

        if len(self.history) >= 2:
            prev_prev = self.history[-2]
            improvement = prev['score'] - prev_prev['score']

            if improvement > 0.01:
                feedback['what_worked'] = f"Score improved by {improvement:.4f}"
            elif improvement < -0.01:
                feedback['what_failed'] = f"Score decreased by {abs(improvement):.4f}"
            else:
                feedback['what_failed'] = "Marginal improvement, try different approach"

        if 'model_scores' in prev:
            best_model = max(prev['model_scores'].items(), key=lambda x: x[1]['mean'])
            feedback['best_model'] = best_model[0]
            feedback['best_model_score'] = best_model[1]['mean']

        return feedback

    def _print_final_results(self, best_iteration_data):
        """Print comprehensive final results"""

        print(f"\n\n{'='*70}")
        print(f"🎉 FINAL RESULTS")
        print(f"{'='*70}")
        print(f"  Best iteration: {best_iteration_data['iteration']}/{len(self.history)}")
        print(f"  Best score: {best_iteration_data['score']:.4f}")
        print(f"  Features: {best_iteration_data['num_features']}")

        # Explainability summary
        metadata = best_iteration_data.get('feature_metadata', {})
        scores_map = self._safe_scores_from_metadata(metadata)
        if scores_map:
            scores = list(scores_map.values())
            avg_score = sum(scores) / len(scores)
            high_quality = sum(1 for s in scores if s >= 4)
            print(f"  Explainability: {avg_score:.1f}/5 avg, {high_quality}/{len(scores)} highly explainable")

        print(f"\n  Top 10 features:")
        for i, feat in enumerate(best_iteration_data['top_features'][:10], 1):
            exp_info = ""
            if feat in scores_map:
                exp_info = f" [Explainability: {scores_map[feat]:.1f}/5]"
            print(f"    {i:2d}. {feat}{exp_info}")
        if 'model_scores' in best_iteration_data:
            print(f"\n  Model Performance:")
            for model, scores in best_iteration_data['model_scores'].items():
                print(f"    {model:20s}: {scores['mean']:.4f} (±{scores['std']:.4f})")

        print(f"\n  Score Progression:")
        for i, result in enumerate(self.history, 1):
            improvement = ""
            if i > 1:
                prev_score = self.history[i-2]['score']
                diff = result['score'] - prev_score
                improvement = f" ({diff:+.4f})"
            star = " ⭐" if result['iteration'] == best_iteration_data['iteration'] else ""
            print(f"    Iteration {i:2d}: {result['score']:.4f}{improvement}{star}")

        print(f"{'='*70}\n")

    def _create_fallback_result(self, X, y):
        """Create fallback result when no iterations complete"""
        print("\n⚠️ No iterations completed successfully!")

        return {
            'X_best': X.copy(),
            'y': y,
            'best_score': 0.0,
            'best_iteration': 0,
            'model': None,
            'feature_formulas': {},
            'feature_metadata': {},
            'feature_documentation': "No documentation available",
            'top_features': [],
            'history': []
        }

    def save_results(self, result, base_filename='pipeline_results'):
        """
        Save all results to files

        Writes ``<base>_data.csv`` (best features plus a ``target`` column),
        ``<base>_model.pkl`` (only when a model is present),
        ``<base>_features.txt`` (only when documentation is non-empty),
        ``<base>_metadata.json`` and ``<base>_history.json``.

        Args:
            result: Result dict from run()
            base_filename: Base name for output files

        Returns:
            dict with keys ``data``, ``model``, ``documentation``, ``metadata``
            and ``history`` mapping to the written paths; ``model`` and
            ``documentation`` are ``None`` when that file was not written.
        """
        import json
        import pickle

        def _json_default(obj):
            """Fallback for values the json encoder cannot handle natively."""
            # numpy scalars/arrays and pandas objects all expose tolist()
            if hasattr(obj, 'tolist'):
                return obj.tolist()
            return str(obj)

        print(f"\n💾 Saving results...")

        # 1. Save best dataset (X + y)
        df_best = result['X_best'].copy()
        df_best['target'] = result['y']
        csv_path = f"{base_filename}_data.csv"
        df_best.to_csv(csv_path, index=False)
        print(f"  ✓ Best dataset saved to {csv_path}")

        # 2. Save trained model. Compare against None explicitly: truthiness of
        #    an estimator object is not a reliable "is there a model" test.
        model_path = None
        if result['model'] is not None:
            model_path = f"{base_filename}_model.pkl"
            with open(model_path, 'wb') as f:
                pickle.dump(result['model'], f)
            print(f"  ✓ Trained model saved to {model_path}")

        # 3. Save feature documentation. The text contains non-ASCII characters
        #    (e.g. '≥'), so the encoding must not depend on the platform default.
        doc_path = None
        if result['feature_documentation']:
            doc_path = f"{base_filename}_features.txt"
            with open(doc_path, 'w', encoding='utf-8') as f:
                f.write(result['feature_documentation'])
            print(f"  ✓ Feature documentation saved to {doc_path}")

        # 4. Save feature formulas and metadata
        metadata_path = f"{base_filename}_metadata.json"
        metadata_export = {
            'feature_formulas': result['feature_formulas'],
            'feature_metadata': result['feature_metadata'],
            'best_score': result['best_score'],
            'best_iteration': result['best_iteration'],
            'top_features': result['top_features'],
            'num_features': len(result['X_best'].columns),
            'explainability_summary': self._get_explainability_summary(result)
        }
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata_export, f, indent=2, default=_json_default)
        print(f"  ✓ Feature metadata saved to {metadata_path}")

        # 5. Save complete history, leaving out the non-serializable
        #    'detailed_metrics'; the entries held by the caller are not modified.
        history_path = f"{base_filename}_history.json"
        history_export = [
            {key: value for key, value in entry.items() if key != 'detailed_metrics'}
            for entry in result['history']
        ]
        with open(history_path, 'w', encoding='utf-8') as f:
            json.dump(history_export, f, indent=2, default=_json_default)
        print(f"  ✓ Complete history saved to {history_path}")

        print(f"\n✅ All results saved with base name: {base_filename}")

        return {
            'data': csv_path,
            'model': model_path,
            'documentation': doc_path,
            'metadata': metadata_path,
            'history': history_path
        }

    def _get_explainability_summary(self, result):
        """Get explainability summary statistics"""
        metadata = result.get('feature_metadata', {})

        if not metadata:
            return {}

        scores = [m['score'] for m in metadata.values() if 'score' in m]

        if not scores:
            return {}

        return {
            'average_score': sum(scores) / len(scores),
            'high_quality_count': sum(1 for s in scores if s >= 4),
            'medium_quality_count': sum(1 for s in scores if 3 <= s < 4),
            'low_quality_count': sum(1 for s in scores if s < 3),
            'total_features': len(scores)
        }

    def plot_progression(self):
        """Plot score and explainability progression over iterations.

        Draws a two-panel figure from ``self.history``: the score per iteration
        on the left and the average explainability score per iteration on the
        right. The right panel is only filled when at least one iteration has a
        non-zero average. Prints an install hint if matplotlib is missing.
        """
        try:
            import matplotlib.pyplot as plt

            iterations = [entry['iteration'] for entry in self.history]
            scores = [entry['score'] for entry in self.history]

            plt.figure(figsize=(12, 6))

            # Left panel: model score
            plt.subplot(1, 2, 1)
            plt.plot(iterations, scores, marker='o', linewidth=2, markersize=8)
            plt.xlabel('Iteration', fontsize=12)
            plt.ylabel('Score', fontsize=12)
            plt.title('Score Progression', fontsize=14, fontweight='bold')
            plt.grid(True, alpha=0.3)

            # Right panel: explainability
            plt.subplot(1, 2, 2)

            def _mean_explainability(entry):
                """Average 'score' over an entry's feature metadata, 0 if none."""
                meta = entry.get('feature_metadata', {})
                if not meta:
                    return 0
                found = [m['score'] for m in meta.values() if 'score' in m]
                return sum(found) / len(found) if found else 0

            explainability_avgs = [_mean_explainability(entry) for entry in self.history]

            if any(explainability_avgs):
                plt.plot(iterations, explainability_avgs, marker='s',
                         linewidth=2, markersize=8, color='green')
                plt.xlabel('Iteration', fontsize=12)
                plt.ylabel('Avg Explainability Score', fontsize=12)
                plt.title('Explainability Progression', fontsize=14, fontweight='bold')
                plt.ylim(0, 5)
                plt.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

        except ImportError:
            print("matplotlib not installed. Install with: pip install matplotlib")

In [12]:
if __name__ == "__main__":
    # End-to-end demo on the stroke dataset: load and preprocess the CSV, run
    # the feature-engineering pipeline, save its artefacts and print a report.
    import math

    import pandas as pd
    from google.colab import userdata
    from sklearn.preprocessing import LabelEncoder

    def _score_map(feature_metadata):
        """Return {feature: score} for metadata entries with a finite numeric score."""
        found = {}
        if not isinstance(feature_metadata, dict):
            return found
        for name, entry in feature_metadata.items():
            raw = entry.get('score') if isinstance(entry, dict) else entry
            if isinstance(raw, (int, float)) and math.isfinite(raw):
                found[name] = raw
        return found

    print("="*70)
    print("🏥 STROKE PREDICTION - FEATURE ENGINEERING PIPELINE")
    print("="*70 + "\n")

    # ==================== 1. LOAD DATA ====================
    print("📁 Loading dataset...")
    df = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

    print(f"  Shape: {df.shape}")
    print(f"  Target distribution:\n{df['stroke'].value_counts()}")
    print(f"  Class imbalance: {(df['stroke']==0).sum()}/{(df['stroke']==1).sum()} (no stroke/stroke)")

    # ==================== 2. PREPROCESS DATA ====================
    print("\n🔧 Preprocessing data...")

    # Drop ID column
    if 'id' in df.columns:
        df = df.drop(columns=['id'])
        print("  ✓ Dropped 'id' column")

    # Handle missing BMI values
    if df['bmi'].isnull().any():
        n_missing = df['bmi'].isnull().sum()
        df['bmi'] = df['bmi'].fillna(df['bmi'].median())
        print(f"  ✓ Imputed {n_missing} missing BMI values with median")

    # Encode categorical variables
    categorical_cols = ['gender', 'ever_married', 'work_type',
                        'Residence_type', 'smoking_status']

    # Only columns actually present are encoded, and only those are counted.
    encoded_cols = [col for col in categorical_cols if col in df.columns]
    for col in encoded_cols:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    print(f"  ✓ Encoded {len(encoded_cols)} categorical columns")
    print(f"  ✓ Final shape: {df.shape}")

    # ==================== 3. SETUP METADATA ====================
    metadata = {
        'domain': 'healthcare - cardiovascular/neurology',
        'problem': 'stroke risk prediction',
        'description': '''Predicting stroke occurrence based on patient demographics,
        health metrics (hypertension, heart disease, BMI, glucose), and lifestyle factors
        (smoking, work type). Dataset is highly imbalanced (~5% stroke rate).'''
    }

    # ==================== 4. INITIALIZE PIPELINE ====================
    print("\n🚀 Initializing pipeline...")

    pipeline = FeatureEngineeringPipeline(
        api_key=userdata.get('openaiapi'),
        task_type='classification',
        models=['random_forest', 'xgboost', 'gradient_boosting'],
        preprocessing='robust',  # Good for medical data with outliers
        n_cv_folds=5,
        min_explainability_score=3  # Require explainable features
    )

    # ==================== 5. RUN PIPELINE ====================
    print("\n" + "="*70)
    print("▶️  RUNNING FEATURE ENGINEERING PIPELINE")
    print("="*70 + "\n")

    result = pipeline.run(
        df=df,
        target_col='stroke',
        max_iterations=15,
        min_improvement=0.005,  # Small improvements matter for imbalanced data
        patience=3,
        metadata=metadata,
        verbose=True
    )

    # ==================== 6. SAVE RESULTS ====================
    print("\n💾 Saving results...")

    saved_files = pipeline.save_results(
        result,
        base_filename='stroke_prediction'
    )

    # ==================== 7. PRINT SUMMARY ====================
    print("\n" + "="*70)
    print("📊 FINAL SUMMARY")
    print("="*70)

    print(f"\n🎯 Performance:")
    print(f"  Best Score: {result['best_score']:.4f}")
    print(f"  Best Iteration: {result['best_iteration']}")
    print(f"  Input Features: {df.shape[1]-1}")
    print(f"  Output Features: {len(result['X_best'].columns)}")

    # Model breakdown. The result dict carries no top-level 'model_scores';
    # per-model scores are recorded in the history entry of each iteration,
    # so fall back to the best iteration's entry.
    best_entry = next(
        (h for h in result['history'] if h['iteration'] == result['best_iteration']),
        {},
    )
    model_scores = result.get('model_scores') or best_entry.get('model_scores')
    if model_scores:
        print(f"\n🤖 Model Performance:")
        for model, scores in model_scores.items():
            print(f"  {model:20s}: {scores['mean']:.4f} ± {scores['std']:.4f}")

    feature_metadata = result['feature_metadata']
    score_by_feature = _score_map(feature_metadata)

    # Top features
    print(f"\n⭐ Top 10 Features:")
    for i, feat in enumerate(result['top_features'][:10], 1):
        exp_info = ""
        if feat in score_by_feature:
            exp_info = f" [Explainability: {score_by_feature[feat]}/5]"
        print(f"  {i:2d}. {feat}{exp_info}")

    # Explainability summary
    metadata_scores = list(score_by_feature.values())
    if metadata_scores:
        avg_exp = sum(metadata_scores) / len(metadata_scores)
        high_exp = sum(1 for s in metadata_scores if s >= 4)
        print(f"\n🔍 Explainability:")
        print(f"  Average Score: {avg_exp:.1f}/5")
        print(f"  Highly Explainable (≥4): {high_exp}/{len(metadata_scores)}")

    # Progression
    print(f"\n📈 Score Progression:")
    for i, h in enumerate(result['history'], 1):
        change = ""
        if i > 1:
            diff = h['score'] - result['history'][i-2]['score']
            change = f" ({diff:+.4f})"
        star = " ⭐" if h['iteration'] == result['best_iteration'] else ""
        print(f"  Iteration {i:2d}: {h['score']:.4f}{change}{star}")

    # Saved files
    print(f"\n📂 Saved Files:")
    for file_type, filepath in saved_files.items():
        if filepath:
            print(f"  • {file_type}: {filepath}")

    # ==================== 8. DOMAIN-SPECIFIC INSIGHTS ====================
    print("\n" + "="*70)
    print("🏥 STROKE-SPECIFIC INSIGHTS")
    print("="*70)

    # Identify new features created
    original_features = set(df.columns) - {'stroke'}
    new_features = [f for f in result['X_best'].columns if f not in original_features]

    print(f"\n✨ Created {len(new_features)} new features")

    # Check for clinically relevant features in the top 15
    clinical_keywords = ['age', 'hypertension', 'heart', 'glucose', 'bmi',
                         'smoking', 'cardiovascular', 'risk']

    clinical_features = [
        f for f in result['top_features'][:15]
        if any(kw in f.lower() for kw in clinical_keywords)
    ]

    print(f"\n⚕️  Clinical Risk Features in Top 15: {len(clinical_features)}")
    for feat in clinical_features[:8]:
        print(f"  • {feat}")
        # Show the rationale when the metadata has a usable one
        meta = feature_metadata.get(feat) if isinstance(feature_metadata, dict) else None
        rationale = meta.get('rationale') if isinstance(meta, dict) else None
        if rationale and rationale != 'N/A':
            print(f"    └─ {str(rationale)[:80]}...")

    # Check for interaction features
    interaction_features = [
        f for f in new_features
        if any(op in f for op in ['*', '/', '_x_', '_ratio', '_interaction'])
    ]

    if interaction_features:
        print(f"\n🔗 Interaction Features: {len(interaction_features)}")
        for feat in interaction_features[:5]:
            print(f"  • {feat}")

    # ==================== 9. OPTIONAL VISUALIZATION ====================
    try:
        print("\n📊 Generating progression plots...")
        pipeline.plot_progression()
    except Exception as e:
        print(f"  ⚠️  Could not generate plots: {e}")

    print("\n" + "="*70)
    print("✅ PIPELINE COMPLETE!")
    print("="*70)
    print(f"\nNext steps:")
    print(f"  1. Review feature documentation: {saved_files.get('documentation')}")
    print(f"  2. Examine feature formulas: {saved_files.get('metadata')}")
    print(f"  3. Use trained model: result['model'] for predictions")
    print(f"  4. Load best data: pd.read_csv('{saved_files.get('data')}')")

🏥 STROKE PREDICTION - FEATURE ENGINEERING PIPELINE

📁 Loading dataset...
  Shape: (5110, 12)
  Target distribution:
stroke
0    4861
1     249
Name: count, dtype: int64
  Class imbalance: 4861/249 (no stroke/stroke)

🔧 Preprocessing data...
  ✓ Dropped 'id' column
  ✓ Imputed 201 missing BMI values with median
  ✓ Encoded 5 categorical columns
  ✓ Final shape: (5110, 11)

🚀 Initializing pipeline...

▶️  RUNNING FEATURE ENGINEERING PIPELINE


🚀 STARTING EXPLAINABLE FEATURE ENGINEERING PIPELINE
  Dataset: 5110 rows × 10 features
  Target: stroke (classification)
  Domain: healthcare - cardiovascular/neurology
  Models: ['random_forest', 'xgboost', 'gradient_boosting']
  Min Explainability: 3/5
  Max iterations: 15 (patience: 3)


🔄 ITERATION 1/15

📚 Phase 1: Research
  🔍 Performing physiologically-focused research with OpenAI...
  ✓ Extracted 3 physiologically meaningful features
  💡 The model focuses on well-established clinical risk factors for stroke, such as age, hypertension, B...



AttributeError: 'FeatureEngineeringPipeline' object has no attribute '_safe_scores_from_metadata'

In [None]:
!pip install wfdb

Collecting wfdb
  Downloading wfdb-4.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pandas>=2.2.3 (from wfdb)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading wfdb-4.3.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, wfdb
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into acco

In [None]:
import pandas as pd
import numpy as np
import wfdb
import os
from pathlib import Path

def load_chapman_shaoxing_ecg(data_dir='chapman_ecg_data',
                               max_samples=1000,
                               extract_features=True):
    """
    Load Chapman-Shaoxing 12-lead ECG Database

    Args:
        data_dir: Directory containing the dataset (searched recursively for
            ``*.hea`` header files)
        max_samples: Maximum number of records to load (dataset is large);
            ``None`` loads every record found
        extract_features: If True, extract statistical features from signals;
            otherwise each record's raw signal is flattened into one row

    Returns:
        df: DataFrame with features and a 'diagnosis' column, or ``None`` when
            ``data_dir`` does not exist. Records that fail to load are
            reported and skipped.
    """

    print(f"Loading Chapman-Shaoxing ECG Database...")

    # If not downloaded, provide instructions
    if not os.path.exists(data_dir):
        print(f"\n⚠️  Dataset not found at {data_dir}")
        print("\nTo download the Chapman-Shaoxing database:")
        print("1. Install wfdb: pip install wfdb")
        print("2. Download using:")
        print("   import wfdb")
        # Database name matches the PhysioNet URL printed below.
        print(f"   wfdb.dl_database('ecg-arrhythmia', dl_dir='{data_dir}')")
        print("\nOr download from: https://physionet.org/content/ecg-arrhythmia/1.0.0/")
        return None

    # Load the database records
    records = []
    labels = []

    # Header comments containing one of these markers are taken as the label.
    # NOTE(review): 'Dx:' is how PhysioNet challenge-style headers appear to
    # tag the diagnosis — confirm against the downloaded files.
    diagnosis_markers = ('Reason for admission', 'Diagnosis', 'Dx:')

    # Search sub-folders too (a download may nest the records) and sort so the
    # max_samples subset is the same on every run.
    record_files = sorted(Path(data_dir).rglob('*.hea'))
    selected_files = record_files[:max_samples]

    print(f"Found {len(record_files)} records")
    print(f"Loading up to {max_samples} samples...")

    for i, record_file in enumerate(selected_files):
        if i % 100 == 0:
            print(f"  Processed {i}/{len(selected_files)} records...")

        try:
            # wfdb expects the record path without its extension. with_suffix('')
            # strips only the trailing '.hea'; str.replace would also alter any
            # '.hea' occurring earlier in the path.
            record_name = str(record_file.with_suffix(''))

            # Read the record
            record = wfdb.rdrecord(record_name)

            # Get the signals (12 leads)
            signals = record.p_signal  # Shape: (n_samples, 12)

            # Get diagnosis from comments (first matching comment wins)
            diagnosis = None
            for comment in getattr(record, 'comments', None) or []:
                if any(marker in comment for marker in diagnosis_markers):
                    diagnosis = comment
                    break

            # Extract features if requested
            if extract_features:
                features = extract_ecg_features(signals, record.sig_name)
            else:
                # Just use raw signals (will be very large)
                features = signals.flatten()
            records.append(features)

            labels.append(diagnosis if diagnosis else 'Unknown')

        except Exception as e:
            print(f"    ⚠️  Error loading {record_file}: {e}")
            continue

    print(f"\n✓ Loaded {len(records)} records successfully")

    # Create DataFrame
    if extract_features or not records:
        # Also covers "nothing loaded": records[0] below would raise IndexError
        df = pd.DataFrame(records)
    else:
        # For raw signals, create column names
        n_features = len(records[0])
        columns = [f'signal_{i}' for i in range(n_features)]
        df = pd.DataFrame(records, columns=columns)

    df['diagnosis'] = labels

    return df


def extract_ecg_features(signals, lead_names):
    """
    Extract statistical and clinical features from 12-lead ECG signals

    Args:
        signals: numpy array of shape (n_samples, n_leads) - the ECG signals
        lead_names: list of lead names (e.g., ['I', 'II', 'III', 'aVR', ...]);
            names beyond the number of signal columns are ignored

    Returns:
        features: dict of extracted features. Per lead: mean, std, min, max,
            range, p25/p50/p75, skewness, kurtosis, energy, rms and zero
            crossing rate. Across leads: limb-lead correlations (needs at
            least 3 names and 3 columns), mean precordial correlation (needs
            at least 12 columns) and overall amplitude/variance.
    """
    features = {}
    n_leads = signals.shape[1]

    # Ensure we have 12 leads (warn only; whatever is available is processed)
    if n_leads != 12:
        print(f"    ⚠️  Expected 12 leads, got {n_leads}")

    # zip stops at the shorter input, so surplus names or columns are skipped
    for lead_name, signal in zip(lead_names, signals.T):
        lowest, highest = np.min(signal), np.max(signal)
        q25, q50, q75 = np.percentile(signal, [25, 50, 75])

        # Basic statistical features
        features[f'{lead_name}_mean'] = np.mean(signal)
        features[f'{lead_name}_std'] = np.std(signal)
        features[f'{lead_name}_min'] = lowest
        features[f'{lead_name}_max'] = highest
        features[f'{lead_name}_range'] = highest - lowest

        # Percentiles
        features[f'{lead_name}_p25'] = q25
        features[f'{lead_name}_p50'] = q50
        features[f'{lead_name}_p75'] = q75

        # Advanced features
        features[f'{lead_name}_skewness'] = calculate_skewness(signal)
        features[f'{lead_name}_kurtosis'] = calculate_kurtosis(signal)
        features[f'{lead_name}_energy'] = np.sum(signal ** 2)
        features[f'{lead_name}_rms'] = np.sqrt(np.mean(signal ** 2))

        # Zero crossing rate: fraction of samples where the sign changes
        features[f'{lead_name}_zcr'] = np.sum(np.diff(np.sign(signal)) != 0) / len(signal)

    # Cross-lead features
    # Standard limb leads: I, II, III — assumed to be columns 0-2 (TODO confirm
    # for other lead orderings). The column count is checked as well as the
    # names, because the columns are what gets indexed.
    if len(lead_names) >= 3 and n_leads >= 3:
        features['limb_correlation_I_II'] = np.corrcoef(signals[:, 0], signals[:, 1])[0, 1]
        features['limb_correlation_I_III'] = np.corrcoef(signals[:, 0], signals[:, 2])[0, 1]
        features['limb_correlation_II_III'] = np.corrcoef(signals[:, 1], signals[:, 2])[0, 1]

    # Precordial leads correlation (V1-V6, typically leads 6-11)
    if n_leads >= 12:
        precordial_signals = signals[:, 6:12]
        # Mean over every unordered pair of the six precordial columns
        features['precordial_mean_correlation'] = np.mean([
            np.corrcoef(precordial_signals[:, i], precordial_signals[:, j])[0, 1]
            for i in range(6) for j in range(i+1, 6)
        ])

    # Overall heart activity
    features['overall_mean_amplitude'] = np.mean(np.abs(signals))
    features['overall_max_amplitude'] = np.max(np.abs(signals))
    features['overall_variance'] = np.var(signals)

    return features


def calculate_skewness(signal):
    """Return the population skewness of ``signal``.

    This is the mean of the cubed z-scores, where the z-scores use
    ``np.std`` with its default (population) normalisation. When the
    standard deviation is exactly zero the z-scores are undefined and
    0 is returned instead.
    """
    mu = np.mean(signal)
    sigma = np.std(signal)
    if sigma != 0:
        z_scores = (signal - mu) / sigma
        return np.mean(z_scores ** 3)
    # Zero spread: no asymmetry to measure.
    return 0


def calculate_kurtosis(signal):
    """Return the excess kurtosis of ``signal``.

    Computed as the mean of the fourth power of the z-scores minus 3.
    ``np.std`` is used with its default (population) normalisation.
    When the standard deviation is exactly zero, 0 is returned.
    """
    centre = np.mean(signal)
    spread = np.std(signal)
    if spread == 0:
        # Zero spread: the standardised moments are undefined.
        return 0
    standardized = (signal - centre) / spread
    fourth_moment = np.mean(standardized ** 4)
    return fourth_moment - 3


def prepare_ecg_for_pipeline(df, target_type='binary'):
    """
    Prepare ECG data for the feature engineering pipeline

    Builds a numeric 'target' column from the 'diagnosis' column, drops
    'diagnosis', and then removes every row that still contains a missing
    value. The input frame is left untouched; the work is done on a copy.

    Args:
        df: DataFrame from load_chapman_shaoxing_ecg(); must contain a
            'diagnosis' column.
        target_type: 'binary' (normal vs abnormal) or 'multiclass' (specific diagnoses)

    Returns:
        df_prepared: DataFrame ready for pipeline
        target_col: Name of target column

    Raises:
        ValueError: If target_type is neither 'binary' nor 'multiclass'.
        KeyError: If df has no 'diagnosis' column.
    """

    # Fail fast: previously an unsupported target_type skipped both branches
    # and only surfaced as an UnboundLocalError at the final return.
    if target_type not in ('binary', 'multiclass'):
        raise ValueError(
            f"target_type must be 'binary' or 'multiclass', got {target_type!r}"
        )
    if 'diagnosis' not in df.columns:
        raise KeyError("prepare_ecg_for_pipeline requires a 'diagnosis' column")

    print("\n📋 Preparing ECG data for pipeline...")

    df_prepared = df.copy()
    target_col = 'target'

    # Process diagnosis column
    if target_type == 'binary':
        # Create binary target: Normal vs Abnormal
        # You'll need to define what "Normal" means based on the dataset.
        #
        # "normal" must not be preceded by a letter, otherwise the plain
        # substring test also fires on "abnormal" and labels those
        # recordings as normal (0).
        normal_pattern = re.compile(r'(?<![a-z])normal', re.IGNORECASE)

        def _binary_label(diagnosis):
            text = str(diagnosis)
            # 'Unknown' diagnoses are grouped with the normal class, as before.
            if normal_pattern.search(text) or 'Unknown' in text:
                return 0
            return 1

        df_prepared['target'] = df_prepared['diagnosis'].apply(_binary_label)
        print(f"  Created binary target: {df_prepared['target'].value_counts().to_dict()}")

    else:  # target_type == 'multiclass'
        # Use diagnosis categories directly
        # Clean up diagnosis labels
        df_prepared['target'] = df_prepared['diagnosis'].fillna('Unknown')

        # Encode to numeric
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        df_prepared['target'] = le.fit_transform(df_prepared['target'])

        print(f"  Created multiclass target with {df_prepared['target'].nunique()} classes")
        print(f"  Class distribution:\n{df_prepared['target'].value_counts()}")

    # Drop the original diagnosis column
    df_prepared = df_prepared.drop(columns=['diagnosis'])

    # Remove any rows with missing values
    initial_rows = len(df_prepared)
    df_prepared = df_prepared.dropna()
    if len(df_prepared) < initial_rows:
        print(f"  ⚠️  Dropped {initial_rows - len(df_prepared)} rows with missing values")

    # shape[1] - 1 excludes the target column from the feature count.
    print(f"  ✓ Final dataset: {df_prepared.shape[0]} rows × {df_prepared.shape[1]-1} features")

    return df_prepared, target_col

In [None]:


# 1. Load and prepare the ECG data
df_ecg = load_chapman_shaoxing_ecg(
    data_dir='chapman_ecg_data',
    max_samples=500,  # Start with subset for testing
    extract_features=True
)

if df_ecg is not None:
    # 2. Prepare for pipeline
    df_prepared, target_col = prepare_ecg_for_pipeline(
        df_ecg,
        target_type='binary'  # or 'multiclass'
    )

    # 3. Initialize pipeline
    # Credentials are read from the environment so that real keys never have
    # to be pasted into (and saved with) the notebook. The placeholder is
    # kept as the fallback so behaviour is unchanged when nothing is set.
    openai_api_key = os.environ.get("OPENAI_API_KEY", "your-openai-key")
    if openai_api_key == "your-openai-key":
        print("⚠️  OPENAI_API_KEY is not set; using the placeholder key.")

    pipeline = FeatureEngineeringPipeline(
        api_key=openai_api_key,
        serper_api_key=os.environ.get("SERPER_API_KEY"),  # Optional; None when unset
        task_type='classification',
        models=['random_forest', 'xgboost', 'gradient_boosting'],
        min_explainability_score=3,
        max_iterations=10,
        patience=3
    )

    # 4. Add domain metadata for better results
    metadata = {
        'domain': 'healthcare - cardiology',
        'problem': 'ECG arrhythmia detection from 12-lead signals',
        'description': 'Predicting cardiac abnormalities from Chapman-Shaoxing ECG database with statistical features extracted from 12 leads (I, II, III, aVR, aVL, aVF, V1-V6)'
    }

    # 5. Run pipeline
    # NOTE(review): run() is given max_iterations=15 while the constructor
    # above received max_iterations=10 — confirm which value takes effect.
    result = pipeline.run(
        df_prepared,
        target_col=target_col,
        max_iterations=15,
        min_improvement=0.005,
        patience=3,
        metadata=metadata,
        verbose=True
    )

    # 6. Save results
    saved_files = pipeline.save_results(result, base_filename='ecg_arrhythmia')

    # 7. Optional: Plot progression
    pipeline.plot_progression()

    print("\n" + "="*70)
    print("🎉 ECG PIPELINE COMPLETE!")
    print("="*70)
    print(f"Best Score: {result['best_score']:.4f}")
    print(f"Best Features: {len(result['X_best'].columns)}")
    print("\nSaved files:")
    for key, path in saved_files.items():
        print(f"  - {key}: {path}")
else:
    # Make the skipped run visible instead of finishing the cell silently.
    print("⚠️  load_chapman_shaoxing_ecg returned None; skipping the ECG pipeline run.")