In [None]:

import io
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.utils.data_utils import drop_id
from src.utils.feature_eng_utils import (
    min_max_scale,
    simple_label_encoding,
    one_hot_encoding,
    get_binary_features,
    get_categorical_features,
    k_highest_features,
    random_forest_feature_selection
)
from src.utils.graph_utils import display_correlation
from src.utils.notebook_setup import setup_notebook_environment
from src.utils.validators import test_pipeline_artifacts, generate_sample_requests

# Quick setup
dbs, logger = await setup_notebook_environment()

# Now ready to work
logger.info("=== COMPREHENSIVE FEATURE SELECTION ===")

# Checking Silver Layer
silver_data = await dbs.get_silver_data()
silver_data_df = pd.DataFrame(silver_data)
silver_data_df = drop_id(silver_data_df)

silver_data_df.head(10)

In [None]:
# Categorical encoding of the Silver frame
logger.info("Categorical Encoding using Label Encoder")

# Two-valued features (booleans and 2-value categoricals) get label encoded
binary_cols = get_binary_features(silver_data_df)
logger.info(f"Binary Columns to encode: {binary_cols}")
silver_data_df = simple_label_encoding(silver_data_df, binary_cols)

# Whatever categorical features remain (binary ones excluded) are one-hot
# encoded, but only when the helper actually reports some.
categorical_cols = get_categorical_features(silver_data_df, exclude_binary=True)
logger.info(f"Multi-class Categorical Columns: {categorical_cols}")
if categorical_cols:
    silver_data_df = one_hot_encoding(silver_data_df, categorical_cols)

# Report the state of the frame after encoding
encoded_shape = silver_data_df.shape
encoded_columns = list(silver_data_df.columns)
dtype_counts = silver_data_df.dtypes.value_counts()
logger.info(f"Final dataset shape: {encoded_shape}")
logger.info(f"Final columns: {encoded_columns}")
logger.info(f"Data types: {dtype_counts}")

silver_data_df.head(n=20)

In [None]:
# Rescale the encoded frame with the project's min-max helper
# (presumably maps numeric columns into [0, 1] — confirm in feature_eng_utils).
silver_data_df = min_max_scale(silver_data_df)

scaled_shape = silver_data_df.shape
logger.info(f"shape: {scaled_shape}")
silver_data_df.head(n=20)

In [None]:
# Show the correlation view of the encoded, scaled Silver frame.
# Rendering is delegated to the project's graph_utils helper.
display_correlation(silver_data_df)

In [None]:
# The one-hot encoded dataset-origin indicators are irrelevant for modelling,
# so drop them. Columns are matched by the substring 'dataset' in their name.
dataset_origin_col = [col for col in silver_data_df.columns if 'dataset' in col]
silver_data_df = silver_data_df.drop(columns=dataset_origin_col)
logger.info(f"shape: {silver_data_df.shape}")
silver_data_df.head(10)

In [None]:
# STEP 1: split the target from the features BEFORE any feature selection runs
logger.info("=== EXTRACTING TARGET VARIABLE ===")

target_name = 'target'
X = silver_data_df.drop(columns=[target_name])  # feature matrix, target removed
y = silver_data_df[target_name].copy()          # independent copy of the label column

logger.info(f"Target variable shape: {y.shape}")
logger.info(f"Features matrix shape: {X.shape}")
logger.info(f"Target value counts:\n{y.value_counts()}")


In [None]:
logger.info("=== FEATURE SELECTION ===")

# Number of top-ranked features each method is asked to keep; shared so both
# selectors are always run with the same budget.
N_TOP_FEATURES = 15

# Both helpers receive the complete frame (target column included) plus the
# target's column name, and their results are unpacked as
# (selected, rejected, scores).
# NOTE(review): confirm random_forest_feature_selection pins its random_state,
# otherwise reruns may select different features.
kbest_selected_features, kbest_rejected_features, kbest_scored_df = k_highest_features(
    silver_data_df,
    target_col='target',
    k=N_TOP_FEATURES,
)

rf_selected_features, rf_rejected_features, rf_scored_df = random_forest_feature_selection(
    silver_data_df,
    target_col='target',
    k=N_TOP_FEATURES,
)

logger.info(f"SelectKBest selected: {len(kbest_selected_features)} features")
logger.info(f"RandomForest selected: {len(rf_selected_features)} features")

logger.info("\nFEATURES FROM SELECT K BEST\n")
logger.info("\nSELECTED FEATURES:")
logger.info(kbest_selected_features)
logger.info("\nREJECTED FEATURES:")
logger.info(kbest_rejected_features)
logger.info("\nSCORED FEATURES:")
# A bare `.head()` in the middle of a cell is never rendered (only the last
# expression is), so send the SelectKBest scores through the logger instead.
logger.info(f"\n{kbest_scored_df.head(10)}")

logger.info("\nFEATURES FROM RANDOM FOREST\n")
logger.info("\nSELECTED FEATURES:")
logger.info(rf_selected_features)
logger.info("\nREJECTED FEATURES:")
logger.info(rf_rejected_features)
logger.info("\nSCORED FEATURES:")
# Last expression of the cell: rendered as the cell output
rf_scored_df.head(10)

In [None]:
# Convergent selection: a feature is dropped only when BOTH selectors rejected it
logger.info("=== CONVERGENT FEATURE SELECTION ===")

clinical_baseline = ['age', 'sex']  # protected features, never dropped

# Walk X.columns (not silver_data_df.columns) so the target is already out of
# the candidate set; the explicit 'target' test is kept as an extra safety net.
convergent_rejected = [
    feature
    for feature in X.columns
    if feature in rf_rejected_features
    and feature in kbest_rejected_features
    and feature not in clinical_baseline
    and feature != 'target'
]

logger.info(f"Features rejected by BOTH methods: {convergent_rejected}")

# Everything that was not rejected by both methods survives, in X's column order
features_to_keep = [name for name in X.columns if name not in convergent_rejected]
logger.info(f"Final features to keep: {len(features_to_keep)} features")
logger.info(f"Features: {features_to_keep}")

logger.info("=== EXTRACTING TARGET VARIABLE AFTER FEATURE SELECTION ===")

# Rebuild y and X from the full frame. X still holds every non-target column
# here; the retained subset is only named in features_to_keep.
X = silver_data_df.drop(columns=['target'])
y = silver_data_df['target'].copy()

logger.info(f"Target variable shape: {y.shape}")
logger.info(f"Features matrix shape: {X.shape}")
logger.info(f"Target value counts:\n{y.value_counts()}")



In [None]:
logger.info("=== CREATING GOLD LAYER ===")

# Restrict the feature matrix to the retained columns
X_selected = X.loc[:, features_to_keep].copy()

# Gold layer = selected features with the target re-attached as the last column
# (assign returns a new frame, so X_selected itself is left untouched)
df_gold = X_selected.assign(target=y)

gold_columns = list(df_gold.columns)
logger.info(f"Gold layer shape: {df_gold.shape}")
logger.info(f"Gold layer columns: {gold_columns}")

# Confirm the target column made it into the Gold frame
has_target = 'target' in df_gold.columns
logger.info(f"Target in Gold layer: {has_target}")
logger.info(f"Gold layer target distribution:\n{df_gold['target'].value_counts()}")

# Preview of the Gold layer
logger.info("Gold layer sample:")
df_gold.head()

In [None]:
logger.info("=== SAVING TO GOLD LAYER MONGODB ===")

# Convert to records for MongoDB
gold_data = df_gold.to_dict('records')

logger.info(f"Converting {len(gold_data)} records to dict format")
logger.info(f"Sample record keys: {list(gold_data[0].keys())}")

# Insert into Gold layer
success = await dbs.insert_gold_data(gold_data)

if success:
    logger.info("Successfully saved Gold layer to MongoDB!")
    
    # Verify by reading back
    verification = await dbs.get_gold_data(limit=5)
    logger.info(f"Verification: Retrieved {len(verification)} records from Gold layer")
    
    if verification:
        logger.info(f"Sample Gold record: {verification[0]}")
        verification_df = pd.DataFrame(verification)
        logger.info("Verification sample:")
        verification_df.head(20)
        logger.info(verification_df.info())
        logger.info(verification_df.describe())
        logger.info(f"Verification shape: {verification_df.shape}")
else:
    logger.error("Failed to save Gold layer to MongoDB")

In [None]:
logger.info("=== SAVING FEATURE ENGINEERING METADATA ===")

import joblib
import json

# Record of what feature selection kept and dropped, written next to the
# other model artifacts.
feature_metadata = {
    'original_features': list(X.columns),
    'selected_features': features_to_keep,
    'convergent_rejected': convergent_rejected,
    # list(...) normalises the selector outputs in case they come back as
    # arrays/Index objects, which json cannot serialise directly.
    'selectkbest_selected': list(kbest_selected_features),
    'rf_selected': list(rf_selected_features),
    'clinical_protected': clinical_baseline,
    'feature_counts': {
        'original': len(X.columns),
        'selected': len(features_to_keep),
        'removed': len(convergent_rejected)
    },
    'gold_layer_shape': list(df_gold.shape),
    'target_distribution': df_gold['target'].value_counts().to_dict()
}

# Save metadata; create the target directory first so a fresh checkout
# without ../models does not fail on open().
metadata_path = Path('../models/feature_engineering_metadata.json')
metadata_path.parent.mkdir(parents=True, exist_ok=True)
with metadata_path.open('w') as f:
    json.dump(feature_metadata, f, indent=2)
logger.info("Saved feature engineering metadata")

In [None]:
# Final Summary
banner = "=" * 60
logger.info(banner)
logger.info("FEATURE ENGINEERING COMPLETE - SUMMARY")
logger.info(banner)

logger.info(f"Original features: {len(X.columns)}")
logger.info(f"Selected features: {len(features_to_keep)}")
logger.info(f"Removed features: {len(convergent_rejected)}")
logger.info(f"Gold layer shape: {df_gold.shape}")
logger.info(f"Target preserved: {df_gold['target'].notna().all()}")
# Report the real outcome of the Gold-layer insert (the `success` flag set by
# the save step) instead of unconditionally claiming the data was saved.
if success:
    logger.info("Data saved to MongoDB Gold collection")
else:
    logger.error("Data was NOT saved to MongoDB Gold collection")
logger.info("Metadata saved for model training")

logger.info(f"\nRemoved features: {convergent_rejected}")
logger.info(f"Final features: {features_to_keep}")

logger.info("\nREADY FOR MODEL TRAINING!")
logger.info("Next: Run 04_model_training.ipynb")

# %%
# OPTIONAL: quick sanity check of the Gold layer contents
logger.info("=== GOLD LAYER VERIFICATION ===")

logger.info("Data types in Gold layer:")
logger.info(df_gold.dtypes)

logger.info("\nNumerical feature ranges (should be [0,1] from min-max scaling):")
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# Only report the columns that survived feature selection
present_numerical = [col for col in numerical_features if col in df_gold.columns]
for col in present_numerical:
    low, high = df_gold[col].min(), df_gold[col].max()
    logger.info(f"{col}: {low:.3f} to {high:.3f}")

logger.info("\nTarget distribution:")
logger.info(df_gold['target'].value_counts())

logger.info("Feature Engineering Complete!")

In [None]:
"""
Save COMPLETE preprocessing pipeline that transforms:
Raw API Input → Gold Layer Format (ready for model)

This captures the ENTIRE Bronze → Silver → Gold transformation
"""

import joblib
import json
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

logger.info("=== SAVING COMPLETE PREPROCESSING PIPELINE ===")

# IMPORTANT: We need to create and fit the transformers that the API will use
# These should match exactly what was done in preprocessing steps

# 1. CREATE AND SAVE SCALER (based on min_max_scale function)
logger.info("Creating and saving scaler...")

# Get numerical columns (same as in min_max_scale function)
numerical_cols = df_gold.select_dtypes(include=['int64', 'float64']).columns
numerical_cols = numerical_cols.drop('target')  # don't scale target

# Create and fit scaler on the final gold data (without target)
final_scaler = MinMaxScaler()
X_gold_for_scaler = df_gold.drop('target', axis=1)  # Features only for fitting scaler

# Fit scaler only on numerical columns
scaler_data = X_gold_for_scaler[numerical_cols]
final_scaler.fit(scaler_data)

# Save the fitted scaler
joblib.dump(final_scaler, '../models/preprocessing_scaler.pkl')
logger.info("Saved fitted MinMaxScaler")

# 2. CREATE AND SAVE CATEGORICAL ENCODERS
logger.info("Creating and saving categorical encoders...")

# Define categorical features based on preprocessing
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# Create encoders dictionary (these would have been fitted during Silver→Gold processing)
categorical_encoders_final = {}

# Note: Since the data is already encoded, we'll create a mapping based on typical encodings
# In a real scenario, save the actual fitted encoders from the Silver layer processing

# Create manual encoders that match data transformations
manual_categorical_mappings = {
    'sex': {'male': 1, 'female': 0, 'Male': 1, 'Female': 0, 'M': 1, 'F': 0},
    'cp': {'typical_angina': 0, 'atypical_angina': 1, 'non_anginal_pain': 2, 'asymptomatic': 3},
    'fbs': {'false': 0, 'true': 1, 'no': 0, 'yes': 1},
    'restecg': {'normal': 0, 'ST-T abnormality': 1, 'LV hypertrophy': 2},
    'exang': {'no': 0, 'yes': 1, 'false': 0, 'true': 1},
    'slope': {'upsloping': 0, 'flat': 1, 'downsloping': 2},
    'thal': {'normal': 0, 'fixed_defect': 1, 'reversible_defect': 2, 'unknown': 3}
}

# Save categorical encoders info
joblib.dump(manual_categorical_mappings, '../models/categorical_encoders.pkl')
logger.info("Saved categorical encoders mappings")

# 3. SAVE FINAL FEATURE INFORMATION
logger.info("Saving feature information...")

final_feature_columns = features_to_keep  # From feature selection
final_feature_info = {
    'feature_columns': final_feature_columns,
    'selected_features': features_to_keep,
    'original_features': list(X.columns),  # Before feature selection
    'convergent_rejected': convergent_rejected,
    'categorical_features': [f for f in categorical_features if f in final_feature_columns],
    'numerical_features': [f for f in numerical_cols if f in final_feature_columns],
    'selectkbest_features': kbest_selected_features,
    'random_forest_features': rf_selected_features
}

with open('../models/feature_columns.json', 'w') as f:
    json.dump(final_feature_info, f, indent=2)
logger.info("Saved feature column information")

# 4. SAVE COMPLETE PIPELINE METADATA
logger.info("Saving complete pipeline metadata...")

complete_pipeline_metadata = {
    'pipeline_version': '1.0.0',
    'created_at': datetime.now().isoformat(),
    'gold_layer_shape': df_gold.shape,
    'final_feature_count': len(features_to_keep),
    'complete_transformation_steps': [
        # Bronze → Silver transformations (from notebook 02)
        'Step 1: Handle missing values (median for numeric, mode for categorical)',
        'Step 2: Handle impossible zeros (convert to NaN then impute)', 
        'Step 3: Convert categorical strings to numeric (label encoding)',
        
        # Silver → Gold transformations (from this notebook)
        'Step 4: Apply binary label encoding',
        'Step 5: Apply one-hot encoding for multi-class categories',
        'Step 6: Apply MinMax scaling to numerical features',
        'Step 7: Apply convergent feature selection (SelectKBest + RandomForest)',
        'Step 8: Final column ordering for model input'
    ],
    'transformation_details': {
        'missing_value_imputation': {
            'numerical_strategy': 'median',
            'categorical_strategy': 'mode',
            'impossible_zero_handling': 'convert_to_nan_then_impute'
        },
        'categorical_encoding': {
            'binary_method': 'label_encoding',
            'multiclass_method': 'one_hot_encoding',
            'categorical_features': [f for f in categorical_features if f in final_feature_columns]
        },
        'scaling': {
            'method': 'MinMaxScaler',
            'numerical_features': [f for f in numerical_cols if f in final_feature_columns],
            'fitted_on_gold_data': True
        },
        'feature_selection': {
            'method': 'convergent_selection',
            'selectkbest_k': 15,
            'random_forest_k': 15,
            'protected_features': ['age', 'sex'],
            'original_count': len(X.columns),
            'selected_count': len(features_to_keep),
            'removed_count': len(convergent_rejected)
        }
    },
    'api_input_format': {
        'accepts': ['numeric_encoded', 'string_categories', 'mixed'],
        'expected_raw_features': ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal'],
        'final_features': final_feature_columns,
        'output_shape': len(features_to_keep)
    },
    'validation_ranges': {
        'age': {'min': 1, 'max': 120},
        'trestbps': {'min': 80, 'max': 250}, 
        'chol': {'min': 100, 'max': 600},
        'thalach': {'min': 60, 'max': 220},
        'oldpeak': {'min': 0.0, 'max': 10.0},
        'ca': {'min': 0, 'max': 4}
    },
    'data_summary': {
        'training_samples': len(df_gold),
        'target_distribution': df_gold['target'].value_counts().to_dict(),
        'feature_types': {
            'numerical': len([f for f in numerical_cols if f in final_feature_columns]),
            'categorical': len([f for f in categorical_features if f in final_feature_columns])
        }
    }
}

with open('../models/preprocessing_metadata.json', 'w') as f:
    json.dump(complete_pipeline_metadata, f, indent=2)
logger.info("Saved complete pipeline metadata")

# 5. SAVE SAMPLE API REQUESTS FOR TESTING
logger.info("Creating sample API requests...")
generate_sample_requests()
logger.info(" Sample API requests created")
