In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import warnings

# Silence all Python warnings (convergence notices included) so the tuning
# log printed below stays readable.
warnings.filterwarnings('ignore')

print("Starting")

# --- 1. Load Data ---
# Everything downstream depends on this CSV, so any load failure aborts the
# run. SystemExit is raised directly instead of calling the site-provided
# exit() helper: that helper is intended for interactive sessions, is not
# guaranteed to be available, and when called without an argument reports
# success (status 0). Passing the message to SystemExit sends it to stderr
# and terminates with status 1.
file_path = '20231225_dfall_obs_data_and_spectral_features_revision1_n469.csv'
try:
    # Only the read itself is guarded; the success message lives in `else`.
    df = pd.read_csv(file_path)
except FileNotFoundError:
    raise SystemExit(f"Error: The file '{file_path}' was not found.") from None
except Exception as e:
    # Top-level boundary: any other read/parse failure is reported and fatal.
    raise SystemExit(f"An error occurred loading the file: {e}") from e
else:
    print(f"Successfully loaded {file_path}")

# --- 2. Define Features and Targets ---

# Feature columns are picked by name: first every column that is 'V'
# followed only by digits (V1, V2, ...), then every column starting with
# 'sprs'. Within each group the file's column order is preserved.
all_columns = df.columns.tolist()
feature_columns = [col for col in all_columns if col.startswith('V') and col[1:].isdigit()]
feature_columns.extend(col for col in all_columns if col.startswith('sprs'))

if not feature_columns:
    # Raise SystemExit (status 1) rather than calling the interactive exit()
    # helper, which would report success to the calling process.
    raise SystemExit("Error: No feature columns (like 'V1', 'sprs...') were identified.")

print(f"Identified {len(feature_columns)} feature columns.")

# Define *usable* target variables
# 'Cert_ID_Elic1_Bin' is removed because it has only one class (1.0)
# as seen in the inspection, making it impossible to train a classifier.
targets = ['Cert_ID_Bin', 'Cert_Con_Bin']

# Fail fast with a readable message if a target column is absent, instead of
# surfacing later as a KeyError when the column is selected from df.
missing_targets = [name for name in targets if name not in all_columns]
if missing_targets:
    raise SystemExit(f"Error: Target column(s) not found in the data: {missing_targets}")

print(f"Using target variables: {targets}")
print("Note: 'Cert_ID_Elic1_Bin' was dropped because it contains only one class and cannot be used for classification.")

# --- 3. Define Models, Pipelines, and Hyperparameter Grids ---

# Registry of candidate classifiers. Each entry holds:
#   'model'  - an unfitted estimator
#   'params' - the grid to search; keys carry the 'model__' prefix because
#              the estimator is later placed in a Pipeline step named 'model'.
#
# class_weight='balanced' is set on the estimators constructed with it below
# (LogisticRegression, RandomForest, SVC) to counter the class imbalance;
# GradientBoosting and KNN are created without it.
# SVC is built with probability=True, which makes probability estimates
# available on the fitted model.
# NOTE(review): the 'roc_auc' scorer may be able to rank with
# decision_function alone, in which case probability=True is extra cost -
# confirm before changing.
models = dict(
    LogisticRegression=dict(
        model=LogisticRegression(class_weight='balanced', max_iter=2000, random_state=42),
        params={
            'model__C': [0.01, 0.1, 1, 10],
        },
    ),
    RandomForest=dict(
        model=RandomForestClassifier(class_weight='balanced', random_state=42),
        params={
            'model__n_estimators': [100, 200],
            'model__max_depth': [10, 20, None],
            'model__min_samples_leaf': [1, 2],
        },
    ),
    SVM=dict(
        model=SVC(probability=True, class_weight='balanced', random_state=42),
        params={
            'model__C': [0.1, 1, 10],
            'model__gamma': ['scale', 'auto'],
        },
    ),
    GradientBoosting=dict(
        model=GradientBoostingClassifier(random_state=42),
        params={
            'model__n_estimators': [100, 200],
            'model__learning_rate': [0.01, 0.1],
            'model__max_depth': [3, 5],
        },
    ),
    KNN=dict(
        model=KNeighborsClassifier(),
        params={
            'model__n_neighbors': [3, 5, 7, 9],
        },
    ),
)

# --- 4. Main Training and Evaluation Loop ---

# Candidates are ranked by ROC AUC rather than accuracy: with classes this
# imbalanced, always predicting the majority class already scores a high
# accuracy while being useless as a classifier.
scoring_metric = 'roc_auc'

# Shuffled, seeded, stratified 5-fold splitter shared by every grid search,
# so all models are compared on the same kind of class-preserving folds.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"Using Stratified 5-Fold Cross-Validation. Scoring metric: '{scoring_metric}'")

# Container for everything written to the JSON report:
#   'targets'               - per-target, per-model scores and parameters
#   'best_model_per_target' - the winning model for each target
#   'notes'                 - free-text remarks about the methodology
results_summary = dict(
    targets={},
    best_model_per_target={},
    notes=dict(
        target_Cert_ID_Elic1_Bin="Removed from analysis as it contains only one class.",
        scoring_metric=f"Using '{scoring_metric}' due to severe class imbalance. Accuracy is not a reliable metric here.",
        data_leakage_fix="StandardScaler is applied *within* a cross-validation pipeline to prevent data leakage.",
        hyperparameter_tuning="GridSearchCV used to find best parameters for each model, improving performance.",
    ),
)

# For every target: grid-search each candidate model, record its best mean
# cross-validated score and parameters, keep the overall winner, and (for the
# tree ensembles) store the winner's feature importances.
for target in targets:
    print(f"\n--- Processing Target: {target} ---")
    results_summary['targets'][target] = {}

    # Prepare data for this specific target.
    # Only rows whose *target* is NaN are dropped; feature values are passed
    # through unchanged.
    data_for_target = df[feature_columns + [target]].dropna(subset=[target])
    X = data_for_target[feature_columns]
    y = data_for_target[target]

    print(f"Data shape for this target (after dropping NaNs): {X.shape}")
    print("Class distribution:")
    print(y.value_counts(normalize=True))

    # None (not a numeric sentinel) means "no model has succeeded yet", so a
    # fake score can never be reported as a result.
    best_target_score = None
    best_target_model_name = None
    best_target_estimator = None
    best_target_params = {}

    for model_name, config in models.items():
        # Best-effort per model: a failure is recorded and the loop moves on
        # to the next candidate instead of aborting the whole run.
        try:
            print(f"Tuning {model_name}...")

            # Full pipeline: (1) scale, (2) model. Because scaling is a
            # pipeline step, it is part of what GridSearchCV fits per split.
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('model', config['model'])
            ])

            grid_search = GridSearchCV(
                estimator=pipeline,
                param_grid=config['params'],
                cv=cv_strategy,
                scoring=scoring_metric,
                n_jobs=-1  # Use all available cores
            )

            grid_search.fit(X, y)

            best_score = grid_search.best_score_
            best_params = grid_search.best_params_

            # A NaN score cannot be ranked and would be written to the JSON
            # report as a non-standard NaN token; route it through the same
            # error path as any other failed fit.
            if not np.isfinite(best_score):
                raise ValueError(f"best {scoring_metric} is not finite ({best_score})")

            print(f"Best {scoring_metric} for {model_name}: {best_score:.4f}")

            results_summary['targets'][target][model_name] = {
                'best_mean_cv_roc_auc': best_score,
                'best_params': best_params
            }

            # Track the best model *for this target*.
            if best_target_score is None or best_score > best_target_score:
                best_target_score = best_score
                best_target_model_name = model_name
                best_target_estimator = grid_search.best_estimator_
                best_target_params = best_params

        except Exception as e:
            print(f"Error training {model_name}: {e}")
            results_summary['targets'][target][model_name] = {'error': str(e)}

    print(f"\n--- Best Model for Target '{target}' ---")

    if best_target_estimator is None:
        # Every candidate failed: say so explicitly rather than reporting an
        # empty model name with a placeholder score.
        print("No model could be trained successfully for this target.")
        results_summary['best_model_per_target'][target] = {
            'model_name': None,
            'best_mean_cv_roc_auc': None,
            'best_params': {},
            'feature_importance': None
        }
        continue

    print(f"Model: {best_target_model_name}")
    print(f"Best Mean CV {scoring_metric}: {best_target_score:.4f}")
    print(f"Best Parameters: {best_target_params}")

    results_summary['best_model_per_target'][target] = {
        'model_name': best_target_model_name,
        'best_mean_cv_roc_auc': best_target_score,
        'best_params': best_target_params,
        'feature_importance': None
    }

    # --- 5. Extract Feature Importance (for best model) ---
    # Only the two tree ensembles are read for feature_importances_ here; for
    # any other winner 'feature_importance' stays None.
    try:
        if best_target_model_name in ('RandomForest', 'GradientBoosting'):
            # Get the fitted model step out of the winning pipeline.
            best_model_step = best_target_estimator.named_steps['model']
            importances = best_model_step.feature_importances_

            # Label each importance with its feature name, largest first.
            feature_importance_series = pd.Series(importances, index=feature_columns)
            feature_importance_series = feature_importance_series.sort_values(ascending=False)

            print("\nTop 10 Feature Importances:")
            print(feature_importance_series.head(10))

            # Store in results as a plain dictionary.
            results_summary['best_model_per_target'][target]['feature_importance'] = feature_importance_series.to_dict()

    except Exception as e:
        print(f"Error extracting feature importance: {e}")


# --- 6. Save and Print Final Summary ---
results_file = 'improved_model_results.json'
try:
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization error after that point would leave an empty or partial
    # (invalid) JSON file behind.
    serialized_summary = json.dumps(results_summary, indent=4)
    with open(results_file, 'w', encoding='utf-8') as f:
        f.write(serialized_summary)
except (OSError, TypeError, ValueError) as e:
    # OSError: the file could not be opened or written.
    # TypeError / ValueError: the summary could not be encoded as JSON.
    print(f"Error saving results to JSON: {e}")
else:
    print(f"\n--- Results summary saved to {results_file} ---")

print("\n--- Final Results Summary (Best Model Per Target) ---")
print(json.dumps(results_summary['best_model_per_target'], indent=2))
print("\nPipeline execution finished.")

Starting
Successfully loaded 20231225_dfall_obs_data_and_spectral_features_revision1_n469.csv
Identified 78 feature columns.
Using target variables: ['Cert_ID_Bin', 'Cert_Con_Bin']
Note: 'Cert_ID_Elic1_Bin' was dropped because it contains only one class and cannot be used for classification.
Using Stratified 5-Fold Cross-Validation. Scoring metric: 'roc_auc'

--- Processing Target: Cert_ID_Bin ---
Data shape for this target (after dropping NaNs): (469, 78)
Class distribution:
Cert_ID_Bin
1    0.93177
0    0.06823
Name: proportion, dtype: float64
Tuning LogisticRegression...


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to di