In [4]:
import pandas as pd
import numpy as np
import json
from openai import OpenAI
from typing import Dict, List, Any
import os
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


# Load the dataset and capture its column names.
# Replace the path below with whichever CSV is being analysed.
df = pd.read_csv('/content/diabetes.csv')
headers = df.columns.tolist()

# Example metadata describing how the data was collected.
# Expected contents: the problem to solve, patient statistics, and previous
# work where applicable.
#
# TODO: add the category of data (e.g. time-series, numerical, categorical, text)
# TODO: add the specified tools
output = {}
output['metadata'] = {
    'source': 'diabetes.csv (Pima Indians Diabetes Database)',
    'description': (
        'Medical diagnostic dataset containing health measurements from 768 Pima '
        'Indian women aged 21 and older, collected to predict the onset of '
        'diabetes mellitus within 5 years. Originally from the National Institute '
        'of Diabetes and Digestive and Kidney Diseases (NIDDK), part of a '
        'long-term epidemiological study begun in 1965 near Phoenix, Arizona.'
    ),
    'sample description': (
        'Binary classification problem predicting diabetes diagnosis '
        '(0=no diabetes, 1=diabetes) based on 8 diagnostic measurements '
        'including glucose levels, blood pressure, BMI, insulin, and other '
        'health indicators. The Pima population was studied due to unusually '
        'high incidence rates of Type 2 diabetes.'
    ),
    'notable_previous_work': (
        'Benchmark dataset widely used in machine learning research since 1988 '
        'when included in UCI Machine Learning Repository. Notable research '
        'includes: Smith et al. (1988) "Using the ADAP learning algorithm to '
        'forecast the onset of diabetes mellitus"; hundreds of subsequent '
        'studies achieving 75-98% accuracy using various ML algorithms '
        'including Random Forest, SVM, Neural Networks, and Deep Learning '
        'approaches. Research has explored interpretable models, IoMT '
        'applications, and ensemble methods.'
    ),
}
# Only the list of column names is stored, not the data itself.
output['headers'] = headers

print(output)


{'metadata': {'source': 'diabetes.csv (Pima Indians Diabetes Database)', 'description': 'Medical diagnostic dataset containing health measurements from 768 Pima Indian women aged 21 and older, collected to predict the onset of diabetes mellitus within 5 years. Originally from the National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK), part of a long-term epidemiological study begun in 1965 near Phoenix, Arizona.', 'sample description': 'Binary classification problem predicting diabetes diagnosis (0=no diabetes, 1=diabetes) based on 8 diagnostic measurements including glucose levels, blood pressure, BMI, insulin, and other health indicators. The Pima population was studied due to unusually high incidence rates of Type 2 diabetes.', 'notable_previous_work': 'Benchmark dataset widely used in machine learning research since 1988 when included in UCI Machine Learning Repository. Notable research includes: Smith et al. (1988) "Using the ADAP learning algorithm to forecast t

In [21]:
from google.colab import userdata


In [23]:


class SimpleFeaturePipeline:
    def __init__(self, api_key, serper_api_key=None, task_type='classification'):
        """Set up the pipeline.

        Args:
            api_key: Key passed to the ``OpenAI`` client constructor.
            serper_api_key: Optional key sent as ``X-API-KEY`` by
                ``_web_search``; when falsy, web search is skipped.
            task_type: ``'classification'`` selects a random-forest classifier
                in ``_evaluate``; any other value selects the regressor.
        """
        # One record per iteration is appended here by ``run``.
        self.history = []
        self.task_type = task_type
        self.serper_api_key = serper_api_key
        self.client = OpenAI(api_key=api_key)

    def run(self, df, target_col, max_iterations=30, min_improvement=0.01, patience=2, metadata=None):
        # ^^^ ADD metadata=None here
        X = df.drop(columns=[target_col])
        y = df[target_col]
        best_score = 0
        no_improvement_count = 0

        print(f"Starting pipeline: {X.shape[0]} rows, {X.shape[1]} features")

        for iteration in range(1, max_iterations + 1):
            print(f"\n{'='*60}")
            print(f"ITERATION {iteration}")
            print(f"{'='*60}")

            print("Agent: Researching features...")
            feedback = self.history[-1]['feedback'] if self.history else None
            new_features = self._agent_research(X, target_col, feedback, metadata)

            print("Agent: Generating features...")
            X_augmented = self._generate_features(X, new_features)

            print("Evaluator: Testing features...")
            score, top_features = self._evaluate(X_augmented, y)

            self.history.append({
                'iteration': iteration,
                'score': score,
                'num_features': X_augmented.shape[1],
                'top_features': top_features,
                'feedback': {'best_score': score, 'top_features': top_features}
            })

            print(f"Score: {score:.4f} | Features: {X_augmented.shape[1]}")

            improvement = score - best_score

            if improvement > min_improvement:
                print(f"✓ Improvement: +{improvement:.4f}")
                best_score = score
                no_improvement_count = 0
                X = X_augmented
            else:
                print(f"✗ No significant improvement (+{improvement:.4f})")
                no_improvement_count += 1

                if no_improvement_count >= patience:
                    print(f"\n⚠ EARLY STOPPING: No improvement for {patience} iterations")
                    break

                X = X_augmented

        best = max(self.history, key=lambda x: x['score'])
        print(f"\n{'='*60}")
        print(f"BEST: Iteration {best['iteration']} - Score: {best['score']:.4f}")
        print(f"Total iterations: {len(self.history)}")
        print(f"{'='*60}")

        return X, best

    def _evaluate(self, X, y):
        """Score a feature matrix with a small random forest.

        Only numeric and boolean columns are evaluated; other columns are
        reported and skipped so that a stray text column degrades the score
        instead of crashing the pipeline during cleaning.

        Args:
            X: Candidate feature matrix (not modified).
            y: Target values aligned with ``X``.

        Returns:
            ``(score, top_features)``: the mean 3-fold cross-validation score
            and up to ten column names ordered by descending forest
            importance. If evaluation fails, ``(0.0, first five columns)``.
        """
        from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
        from sklearn.model_selection import cross_val_score
        import numpy as np

        # The forest (and the mean/isfinite cleaning below) can only consume
        # numeric data.
        X_numeric = X.select_dtypes(include=['number', 'bool'])
        skipped = [col for col in X.columns if col not in X_numeric.columns]
        if skipped:
            print(f"  ⚠ Skipping non-numeric columns during evaluation: {skipped}")

        # inf -> NaN -> column mean -> 0 (for columns that were entirely NaN)
        X_cleaned = X_numeric.replace([np.inf, -np.inf], np.nan)
        X_cleaned = X_cleaned.fillna(X_cleaned.mean()).fillna(0)

        # Last line of defence, e.g. a column mean that itself overflowed.
        finite_by_column = np.isfinite(X_cleaned.astype(float)).all()
        if not finite_by_column.all():
            print("  ⚠ Warning: Non-finite values detected after cleaning")
            bad_cols = X_cleaned.columns[~finite_by_column].tolist()
            print(f"  ⚠ Dropping problematic columns: {bad_cols}")
            X_cleaned = X_cleaned.drop(columns=bad_cols)

        # Choose model
        if self.task_type == 'classification':
            model = RandomForestClassifier(n_estimators=50, random_state=42)
        else:
            model = RandomForestRegressor(n_estimators=50, random_state=42)

        try:
            # Cross-validation score (estimator default metric)
            scores = cross_val_score(model, X_cleaned, y, cv=3)
            avg_score = scores.mean()

            # Feature importance comes from a fit on the full data
            model.fit(X_cleaned, y)
            importances = dict(zip(X_cleaned.columns, model.feature_importances_))
            ranked = sorted(importances.items(), key=lambda item: -item[1])[:10]
            top_features = [name for name, _ in ranked]

            return avg_score, top_features

        except Exception as e:
            # Deliberate best-effort: a failed evaluation counts as a poor
            # score so the outer loop can continue.
            print(f"  ⚠ Evaluation failed: {e}")
            return 0.0, list(X_cleaned.columns[:5])


    def _generate_features(self, X, agent_result):
      """Apply the agent's keep/create/remove actions to a copy of ``X``.

      NOTE(review): this class defines ``_generate_features`` a second time
      further down; in Python the later definition replaces this one, so this
      copy never runs. It should be deleted once the later copy is confirmed
      to be the intended one.

      Args:
          X: Current feature matrix; it is copied, not modified.
          agent_result: Dict whose ``'actions'`` list holds dicts with an
              ``'action'`` key (``'keep'``, ``'create'`` or ``'remove'``), a
              ``'name'`` key and, for create actions, a ``'formula'`` key.

      Returns:
          The new feature matrix.
      """
      import numpy as np

      X_new = X.copy()
      # Names the agent asked to keep; these are protected from removal below.
      kept_features = set()

      print("\n  🔧 Executing feature actions:")

      # First pass: Handle KEEP and CREATE
      for action in agent_result.get('actions', []):
          action_type = action['action']
          name = action['name']

          if action_type == 'keep':
              if name in X_new.columns:
                  kept_features.add(name)
                  print(f"    ✓ Kept: {name}")
              else:
                  print(f"    ✗ Cannot keep '{name}' - not found")

          elif action_type == 'create':
              formula = action['formula']
              try:
                  # Columns are exposed to the formula as variables, next to `np`.
                  # NOTE(review): emptying __builtins__ does not make eval() a
                  # sandbox; the formula text comes from the language model.
                  namespace = {"__builtins__": {}, "np": np}
                  namespace.update(X_new.to_dict('series'))

                  result = eval(formula, namespace)

                  # Safety checks
                  if isinstance(result, pd.Series):
                      # Check for infinity (values are replaced with NaN, not clipped)
                      if np.isinf(result).any():
                          print(f"    ⚠ '{name}' contains infinity - clipping values")
                          result = result.replace([np.inf, -np.inf], np.nan)

                      # Check for too many NaN
                      nan_pct = result.isna().sum() / len(result)
                      if nan_pct > 0.5:
                          print(f"    ⚠ '{name}' has {nan_pct:.1%} NaN - skipping")
                          continue

                      # Check for constant values
                      if result.nunique() == 1:
                          print(f"    ⚠ '{name}' is constant - skipping")
                          continue

                      X_new[name] = result
                      print(f"    ✓ Created: {name} = {formula}")
                  else:
                      print(f"    ✗ '{name}' didn't produce a Series")

              except ZeroDivisionError:
                  print(f"    ✗ Failed '{name}': Division by zero")
              except Exception as e:
                  print(f"    ✗ Failed '{name}': {str(e)[:50]}")

      # Second pass: Handle REMOVE (a name that was also kept is not removed)
      for action in agent_result.get('actions', []):
          if action['action'] == 'remove':
              name = action['name']
              if name in X_new.columns and name not in kept_features:
                  X_new = X_new.drop(columns=[name])
                  print(f"    ✓ Removed: {name}")

      # Keep all features that weren't explicitly removed
      # NOTE(review): nothing above drops a column other than an explicit
      # remove, so this loop appears to never add anything - confirm.
      original_kept = set(X.columns) - {a['name'] for a in agent_result.get('actions', []) if a['action'] == 'remove'}
      for col in original_kept:
          if col not in X_new.columns:
              X_new[col] = X[col]

      # Final safety check on all columns
      print("\n  🔍 Final data quality check:")
      problematic_cols = []
      for col in X_new.columns:
          if np.isinf(X_new[col]).any():
              problematic_cols.append(col)
              print(f"    ⚠ '{col}' has infinity values - will be cleaned in evaluation")
          elif X_new[col].isna().all():
              problematic_cols.append(col)
              print(f"    ⚠ '{col}' is all NaN - dropping")
              X_new = X_new.drop(columns=[col])

      print(f"\n  📊 Result: {len(X.columns)} → {len(X_new.columns)} features")
      if problematic_cols:
          print(f"  ⚠ {len(problematic_cols)} features need cleaning")

      return X_new

    def _web_search(self, query):
        if not self.serper_api_key:
            return None

        try:
            import requests
            url = "https://google.serper.dev/search"
            payload = {"q": query, "num": 5}
            headers = {
                "X-API-KEY": self.serper_api_key,
                "Content-Type": "application/json"
            }

            response = requests.post(url, json=payload, headers=headers, timeout=10)
            response.raise_for_status()
            results = response.json()

            snippets = []
            for item in results.get('organic', [])[:3]:
                title = item.get('title', '')
                snippet = item.get('snippet', '')
                snippets.append(f"**{title}**\n{snippet}")

            return "\n\n".join(snippets) if snippets else None

        except Exception as e:
            print(f"  ⚠ Search error: {e}")
            return None

    def _validate_formula(self, formula, X):



      if not formula or not isinstance(formula, str):
          return False

      # Check for dangerous operations
      dangerous = ['eval', 'exec', 'import', '__', 'open', 'file', 'os', 'sys']
      if any(word in formula.lower() for word in dangerous):
          return False

      # Try to evaluate formula safely
      try:
          # Create a small test dataframe
          test_df = X.head(10).copy()  # Use more rows for better testing

          # Attempt to evaluate
          namespace = {"__builtins__": {}, "np": np}
          namespace.update(test_df.to_dict('series'))
          result = eval(formula, namespace)

          # Check result is valid
          if not (hasattr(result, '__len__') or isinstance(result, (int, float))):
              return False

          # Check for infinity or all NaN
          if isinstance(result, pd.Series):
              if np.isinf(result).any():
                  return False  # Reject formulas that produce infinity
              if result.isna().all():
                  return False  # Reject formulas that produce all NaN
              if result.nunique() == 1:
                  return False  # Reject constant features

          return True

      except ZeroDivisionError:
          return False  # Reject division by zero
      except Exception as e:
          return False

    def _perform_research(self, target, metadata, feedback):
      print("  🔍 Performing web research...")

      domain = metadata.get('domain', 'general')
      description = metadata.get('description', '')[:100]

      # Extract the actual prediction goal from metadata
      # For diabetes: "predicting diabetes onset"
      # For churn: "predicting customer churn"
      problem_description = metadata.get('problem', description)

      # Better queries that focus on the domain problem, not column names
      queries = [
          f"feature engineering {domain} {problem_description}",
          f"best features for {domain} machine learning prediction",
          f"{domain} feature interactions data science"
      ]

      # Add feedback-informed query
      if feedback and feedback.get('top_features'):
          top_features_str = ', '.join(feedback['top_features'][:3])
          queries.append(f"{domain} {top_features_str} feature engineering")

      all_findings = []
      for query in queries[:3]:  # Limit to 3 queries to save API calls
          print(f"  → Searching: '{query[:60]}...'")
          results = self._web_search(query)

          if results:
              all_findings.append(f"Query: {query}\n{results}")
              print(f"  ✓ Found {len(results)} chars of results")
          else:
              print(f"  ✗ No results")

          import time
          time.sleep(0.5)

      if not all_findings:
          print("  ⚠ No search results - using general knowledge")
          return None

      # Synthesize findings with LLM
      synthesis_prompt = f"""Based on these web search results about feature engineering:

{chr(10).join(all_findings)}

Dataset context:
- Domain: {domain}
- Problem: {problem_description}
- Target variable: {target}
- Available columns: {metadata.get('headers', [])}
{f"- Previous best features: {feedback['top_features'][:5]}" if feedback else ""}

Extract 3-5 SPECIFIC, ACTIONABLE feature engineering recommendations:
1. Use the EXACT column names from 'Available columns' list above
2. Suggest concrete formulas (e.g., "BMI * Age", "Glucose / Insulin")
3. Explain WHY based on domain knowledge from search results
4. Focus on {domain} domain-specific transformations"""

      response = self.client.chat.completions.create(
          model="gpt-4o",
          messages=[{"role": "user", "content": synthesis_prompt}]
      )

      findings = response.choices[0].message.content
      print(f"  ✓ Synthesized research: {len(findings)} chars")
      return findings


    def _agent_research(self, X, target, feedback, metadata=None):



        research_context = ""
        if metadata:
            research_context = self._perform_research(target, metadata, feedback)

        current_features_summary = {
            'total': len(X.columns),
            'names': X.columns.tolist(),
            'sample_stats': {
                col: {'mean': X[col].mean(), 'std': X[col].std()}
                for col in X.columns[:5]
            }
        }

        prompt = f"""You're a feature engineering expert with research capabilities.

DATASET INFO:
- Target: {target}
- Current Columns ({len(X.columns)}): {X.columns.tolist()}
- Shape: {X.shape}

{f'''PREVIOUS ITERATION FEEDBACK:
- Score: {feedback['best_score']:.3f}
- Top 5 performing features: {feedback['top_features'][:5]}
- Strategy: Build on what's working or try complementary approaches
''' if feedback else "FIRST ITERATION: Explore fundamental feature relationships"}

{f"RESEARCH INSIGHTS:\n{research_context}\n" if research_context else ""}

TASK: Design a feature engineering strategy. You MUST specify actions for features:

ACTIONS YOU CAN TAKE:
1. **KEEP** - Preserve existing features that are valuable
   - Always keep top-performing features from previous iteration
   - Keep features mentioned in research as important

2. **CREATE** - Generate new features via transformations
   - Interactions: col1 * col2, col1 / col2
   - Polynomials: col1 ** 2, np.sqrt(col1)
   - Aggregations: (col1 + col2 + col3) / 3
   - Domain-specific: Use research insights

3. **REMOVE** - Drop features that aren't helping (optional, use sparingly)
   - Only remove if clearly redundant or harmful

IMPORTANT RULES:
- Use EXACT column names from: {X.columns.tolist()}
- If previous iteration had good features, include KEEP actions for them
- Formulas must be valid Python/pandas (use np.log, np.sqrt, etc.)
- Suggest 5-10 total actions
- Balance: keep proven features + create promising new ones

OUTPUT (JSON only):
{{
  "domain": "identified domain",
  "reasoning": "strategy explanation based on research and feedback",
  "actions": [
    {{
      "action": "keep",
      "name": "Glucose",
      "why": "Top feature from previous iteration with 0.25 importance"
    }},
    {{
      "action": "create",
      "name": "BMI_Glucose_interaction",
      "formula": "BMI * Glucose",
      "why": "Research suggests metabolic syndrome indicators interact"
    }},
    {{
      "action": "create",
      "name": "Age_squared",
      "formula": "Age ** 2",
      "why": "Capture non-linear age effects on diabetes risk"
    }}
  ]
}}"""

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        result = json.loads(response.choices[0].message.content)

        validated_result = self._validate_agent_response(result, X, feedback)

        print(f"\n  📋 Agent Strategy:")
        print(f"    Domain: {validated_result.get('domain', 'unknown')}")
        print(f"    Reasoning: {validated_result.get('reasoning', 'N/A')[:100]}...")

        action_counts = {'keep': 0, 'create': 0, 'remove': 0}
        for action in validated_result['actions']:
            action_type = action['action']
            action_counts[action_type] = action_counts.get(action_type, 0) + 1

        print(f"    Actions: {sum(action_counts.values())} total → Keep: {action_counts['keep']}, Create: {action_counts['create']}, Remove: {action_counts['remove']}")

        return validated_result

    def _validate_agent_response(self, result, X, feedback=None):
        # ^^^ INDENT THIS TOO!
        """Validate and sanitize agent suggestions"""
        print("  ✓ Validating suggestions...")

        valid_actions = []
        existing_cols = set(X.columns)

        if feedback and feedback.get('top_features'):
            top_features = feedback['top_features'][:3]
            suggested_keeps = {a['name'] for a in result.get('actions', []) if a.get('action') == 'keep'}

            for feature in top_features:
                if feature in existing_cols and feature not in suggested_keeps:
                    valid_actions.append({
                        "action": "keep",
                        "name": feature,
                        "why": f"Auto-kept: top feature from previous iteration"
                    })
                    print(f"  ✓ Auto-keeping top feature: {feature}")

        for action in result.get('actions', []):
            action_type = action.get('action', '').lower()
            name = action.get('name', '')

            if action_type == 'keep':
                if name in existing_cols:
                    if not any(a['action'] == 'keep' and a['name'] == name for a in valid_actions):
                        valid_actions.append(action)
                        print(f"  ✓ Keep: {name}")
                else:
                    print(f"  ⚠ Cannot keep '{name}' - not in dataset")

            elif action_type == 'create':
                formula = action.get('formula', '')
                if self._validate_formula(formula, X):
                    valid_actions.append(action)
                    print(f"  ✓ Create: {name} = {formula}")
                else:
                    print(f"  ⚠ Invalid formula for '{name}': {formula}")

            elif action_type == 'remove':
                if name in existing_cols:
                    if feedback and name in feedback.get('top_features', [])[:5]:
                        print(f"  ⚠ Blocking remove of '{name}' - it's a top feature")
                    else:
                        valid_actions.append(action)
                        print(f"  ✓ Remove: {name}")
                else:
                    print(f"  ⚠ Cannot remove '{name}' - not in dataset")

        result['actions'] = valid_actions

        if not valid_actions:
            print("  ⚠ WARNING: No valid actions! Using fallback...")
            result['actions'] = [
                {"action": "keep", "name": col, "why": "fallback preservation"}
                for col in X.columns[:5]
            ]

        return result

    def _generate_features(self, X, agent_result):

      import numpy as np

      X_new = X.copy()
      kept_features = set()

      print("\n  🔧 Executing feature actions:")

      # First pass: Handle KEEP and CREATE
      for action in agent_result.get('actions', []):
          action_type = action['action']
          name = action['name']

          if action_type == 'keep':
              if name in X_new.columns:
                  kept_features.add(name)
                  print(f"    ✓ Kept: {name}")
              else:
                  print(f"    ✗ Cannot keep '{name}' - not found")

          elif action_type == 'create':
              formula = action['formula']
              try:
                  namespace = {"__builtins__": {}, "np": np}
                  namespace.update(X_new.to_dict('series'))

                  result = eval(formula, namespace)

                  # Safety checks
                  if isinstance(result, pd.Series):
                      # Check for infinity
                      if np.isinf(result).any():
                          print(f"    ⚠ '{name}' contains infinity - clipping values")
                          result = result.replace([np.inf, -np.inf], np.nan)

                      # Check for too many NaN
                      nan_pct = result.isna().sum() / len(result)
                      if nan_pct > 0.5:
                          print(f"    ⚠ '{name}' has {nan_pct:.1%} NaN - skipping")
                          continue

                      # Check for constant values
                      if result.nunique() == 1:
                          print(f"    ⚠ '{name}' is constant - skipping")
                          continue

                      X_new[name] = result
                      print(f"    ✓ Created: {name} = {formula}")
                  else:
                      print(f"    ✗ '{name}' didn't produce a Series")

              except ZeroDivisionError:
                  print(f"    ✗ Failed '{name}': Division by zero")
              except Exception as e:
                  print(f"    ✗ Failed '{name}': {str(e)[:50]}")

      # Second pass: Handle REMOVE
      for action in agent_result.get('actions', []):
          if action['action'] == 'remove':
              name = action['name']
              if name in X_new.columns and name not in kept_features:
                  X_new = X_new.drop(columns=[name])
                  print(f"    ✓ Removed: {name}")

      # Keep all features that weren't explicitly removed
      original_kept = set(X.columns) - {a['name'] for a in agent_result.get('actions', []) if a['action'] == 'remove'}
      for col in original_kept:
          if col not in X_new.columns:
              X_new[col] = X[col]

      # Final safety check on all columns
      print("\n  🔍 Final data quality check:")
      problematic_cols = []
      for col in X_new.columns:
          if np.isinf(X_new[col]).any():
              problematic_cols.append(col)
              print(f"    ⚠ '{col}' has infinity values - will be cleaned in evaluation")
          elif X_new[col].isna().all():
              problematic_cols.append(col)
              print(f"    ⚠ '{col}' is all NaN - dropping")
              X_new = X_new.drop(columns=[col])

      print(f"\n  📊 Result: {len(X.columns)} → {len(X_new.columns)} features")
      if problematic_cols:
          print(f"  ⚠ {len(problematic_cols)} features need cleaning")

      return X_new


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    # Load data
    df = pd.read_csv('/content/diabetes.csv')

    # Build the pipeline; both keys come from the Colab secrets store.
    pipeline = SimpleFeaturePipeline(
        userdata.get('openaiapi'),
        serper_api_key=userdata.get('serper'),
        task_type='classification',  # or 'regression'
    )

    # Context handed to the research step; 'problem' states what is predicted.
    metadata = dict(
        domain='medical diagnostics',
        problem='diabetes prediction',
        description='Pima Indians Diabetes Dataset for predicting diabetes onset in women',
        headers=df.columns.tolist(),
        notable_previous_work='Random Forest achieves 75-80% accuracy',
    )

    # Run the pipeline against the dataset's target column.
    X_best, results = pipeline.run(
        df=df,
        target_col='Outcome',
        max_iterations=30,
        metadata=metadata,
    )

    # Show results
    print("\nTop 10 Features:")
    for rank, feature_name in enumerate(results['top_features'], start=1):
        print(f"{rank}. {feature_name}")

    # Save augmented data together with the target column
    X_best['Outcome'] = df['Outcome']
    X_best.to_csv('augmented_data.csv', index=False)
    print("\nSaved to: augmented_data.csv")

Starting pipeline: 768 rows, 8 features

ITERATION 1
Agent: Researching features...
  🔍 Performing web research...
  → Searching: 'feature engineering medical diagnostics diabetes prediction...'
  ✓ Found 670 chars of results
  → Searching: 'best features for medical diagnostics machine learning predi...'
  ✓ Found 672 chars of results
  → Searching: 'medical diagnostics feature interactions data science...'
  ✓ Found 690 chars of results


KeyboardInterrupt: 