In [None]:
import time
import warnings

import catboost as cb
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import shap
import umap
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    StackingClassifier, VotingClassifier, BaggingClassifier
)
from sklearn.feature_selection import RFECV, SelectFromModel, mutual_info_classif, chi2, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, log_loss, roc_curve, auc
)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVC

# Fixed seed so NumPy's global random generator is repeatable across runs
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Plot styling and wide, untruncated DataFrame printing
plt.style.use('ggplot')
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

class AdultIncomeClassifier:
    """
    A comprehensive machine learning pipeline for the Adult Income dataset.
    
    This class implements a complete ML solution, including:
    - Data loading and preprocessing
    - Exploratory data analysis
    - Feature engineering and selection
    - Model training and evaluation
    - Ensemble methods
    - Model interpretation
    """
    
    def __init__(self, target_column='income', random_state=42):
        """
        Initialize the classifier.
        
        Parameters:
        -----------
        data_path : str
            Path to the Adult dataset CSV file
        target_column : str
            Name of the target column
        random_state : int
            Random seed for reproducibility
        """
        self.target_column = target_column
        self.random_state = random_state
        self.models = {}
        self.feature_importances = {}
        self.best_model = None
        self.explainer = None
        
        # Define columns based on the Adult dataset
        self.column_names = [
            'age', 'workclass', 'fnlwgt', 'education', 'education_num', 
            'marital_status', 'occupation', 'relationship', 'race', 'sex',
            'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
        ]
        
        # Categorical and numerical column indices
        self.categorical_features = [
            'workclass', 'education', 'marital_status', 'occupation', 
            'relationship', 'race', 'sex', 'native_country'
        ]
        self.numerical_features = [
            'age', 'fnlwgt', 'education_num', 'capital_gain', 
            'capital_loss', 'hours_per_week'
        ]
        
    def load_data(self):
        """
        Load and prepare the Adult dataset.
        
        Returns:
        --------
        X : pandas.DataFrame
            Feature matrix
        y : pandas.Series
            Target variable
        """
        print("Loading and preparing the dataset...")
        
        # Load the dataset
        print("1. DATA LOADING AND INITIAL EXPLORATION")
        print("-" * 80)
        
        # Define column names (since the dataset in UCI format often lacks headers)
        column_names = [
            'age', 'workclass', 'fnlwgt', 'education', 'education_num', 
            'marital_status', 'occupation', 'relationship', 'race', 'sex',
            'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
        ]
        
        # Data loading (normally would load from a file, here we'll simulate that)
        # In a real-world scenario, you'd use something like:
        # df = pd.read_csv('adult.csv', names=column_names)
        
        # For the purpose of this demonstration, I'll create a placeholder for
        # accessing the dataset through a URL
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
        df = pd.read_csv(url, names=column_names, sep=r'\s*,\s*', engine='python', na_values='?')
        
        # Also loading the test data (in real application)
        test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
        test_df = pd.read_csv(test_url, names=column_names, sep=r'\s*,\s*', engine='python', na_values='?', skiprows=1)
        
        # Fix income column in test data (it has a period at the end)
        test_df['income'] = test_df['income'].str.rstrip('.')
        
        # Combine for initial exploration
        data = pd.concat([df, test_df])
        
        print(f"Dataset loaded with shape: {data.shape}")
        
        # Clean up target column - strip whitespace
        data[self.target_column] = data[self.target_column].str.strip()
        
        # Process the target variable
        # Map income categories to binary: 0 for <=50K, 1 for >50K
        if data[self.target_column].dtype == 'object':
            data[self.target_column] = data[self.target_column].map({
                '<=50K': 0, 
                '>50K': 1,
                '<=50K.': 0,  # Handle variations in the test set
                '>50K.': 1
            })
        
        # Split features and target
        X = data.drop(columns=[self.target_column])
        y = data[self.target_column]
        
        # Store original data for later use
        self.X_original = X
        self.y_original = y
        
        return X, y
    
    def split_data(self, X, y, train_size=0.7, val_size=0.1, test_size=0.2):
        """
        Produce stratified training, validation, and test partitions.

        The test partition is removed first using ``test_size``. The
        validation partition is then taken from what is left, with its
        share computed as ``val_size / (train_size + val_size)``.

        Parameters:
        -----------
        X : pandas.DataFrame
            Feature matrix
        y : pandas.Series
            Target variable
        train_size : float
            Proportion of data for training
        val_size : float
            Proportion of data for validation
        test_size : float
            Proportion of data for testing

        Returns:
        --------
        splits : tuple
            (X_train, X_val, X_test, y_train, y_val, y_test)
        """
        print("Splitting the dataset into train, validation, and test sets...")

        seed = self.random_state

        # Stage 1: hold out the test partition
        X_rest, X_test, y_rest, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed, stratify=y
        )

        # Stage 2: divide the remainder into training and validation
        X_train, X_val, y_train, y_val = train_test_split(
            X_rest,
            y_rest,
            test_size=val_size / (train_size + val_size),
            random_state=seed,
            stratify=y_rest,
        )

        print(f"Training set: {X_train.shape[0]} samples")
        print(f"Validation set: {X_val.shape[0]} samples")
        print(f"Test set: {X_test.shape[0]} samples")

        # Keep the partitions on the instance for later stages
        self.X_train, self.y_train = X_train, y_train
        self.X_val, self.y_val = X_val, y_val
        self.X_test, self.y_test = X_test, y_test

        # Report per-partition class counts and the share of class 1
        print("\nClass distribution:")
        partitions = (
            ("Training set", y_train),
            ("Validation set", y_val),
            ("Test set", y_test),
        )
        for label, target in partitions:
            counts = np.bincount(target)
            print(f"{label}: {counts}, ratio = {counts[1]/len(target):.2f}")

        return X_train, X_val, X_test, y_train, y_val, y_test
    
    def exploratory_data_analysis(self, X, y):
        """
        Perform exploratory data analysis on the dataset.
        
        Parameters:
        -----------
        X : pandas.DataFrame
            Feature matrix
        y : pandas.Series
            Target variable
        """
        print("\nPerforming exploratory data analysis...")
        
        # Combine features and target for analysis
        data = X.copy()
        data[self.target_column] = y
        
        # Basic statistics
        print("\nBasic statistics for numerical features:")
        num_stats = data[self.numerical_features].describe().T
        num_stats['skew'] = data[self.numerical_features].skew()
        num_stats['kurtosis'] = data[self.numerical_features].kurtosis()
        print(num_stats)
        
        # Missing values analysis
        missing_data = data.isnull().sum()
        missing_pct = (missing_data / len(data)) * 100
        missing_df = pd.DataFrame({
            'Missing Values': missing_data,
            'Percentage': missing_pct
        }).sort_values('Missing Values', ascending=False)
        
        print("\nMissing values analysis:")
        print(missing_df[missing_df['Missing Values'] > 0])
        
        # Categorical features analysis
        print("\nCategorical features analysis:")
        for col in self.categorical_features:
            value_counts = data[col].value_counts()
            print(f"\n{col} (unique values: {len(value_counts)}):")
            print(value_counts.head(5))
            
            # Create crosstab with target
            contingency = pd.crosstab(data[col], data[self.target_column])
            print(f"\nCrosstab with target ({col}):")
            print(contingency.head(5))
            
            # Chi-square test for independence
            if data[col].isnull().sum() == 0:  # Skip if column has null values
                chi2_stat, p_val = stats.chi2_contingency(contingency)[0:2]
                print(f"Chi-square test: chi2 = {chi2_stat:.2f}, p-value = {p_val:.4f}")
        
        # Numerical features analysis
        print("\nNumerical features analysis:")
        for col in self.numerical_features:
            # Correlation with target
            correlation = data[col].corr(data[self.target_column])
            print(f"\n{col} correlation with target: {correlation:.4f}")
            
            # T-test between different target groups
            group0 = data[data[self.target_column] == 0][col]
            group1 = data[data[self.target_column] == 1][col]
            t_stat, p_val = stats.ttest_ind(group0.dropna(), group1.dropna(), equal_var=False)
            print(f"T-test: t = {t_stat:.2f}, p-value = {p_val:.4f}")
        
        # Correlation matrix for numerical features
        corr_matrix = data[self.numerical_features + [self.target_column]].corr()
        print("\nCorrelation matrix:")
        print(corr_matrix)
        
        # Save EDA results for later
        self.eda_results = {
            'missing_data': missing_df,
            'correlation_matrix': corr_matrix
        }
        
        print("EDA completed. Key insights saved to self.eda_results")
        
        # Return some useful information for feature engineering
        return {
            'missing_columns': missing_df[missing_df['Missing Values'] > 0].index.tolist(),
            'corr_matrix': corr_matrix
        }
    
    def _create_preprocessor(self, categorical_strategy='most_frequent', numerical_strategy='median'):
        """
        Build an unfitted ColumnTransformer for the configured feature lists.

        Numerical columns are imputed and scaled with RobustScaler;
        categorical columns are imputed and one-hot encoded into a dense
        array, with categories unseen during fitting ignored. Columns not
        named in either feature list are dropped.

        Parameters:
        -----------
        categorical_strategy : str
            Strategy for imputing missing categorical values
        numerical_strategy : str
            Strategy for imputing missing numerical values

        Returns:
        --------
        preprocessor : ColumnTransformer
            Scikit-learn preprocessor
        """
        # Numerical branch: impute, then scale (RobustScaler for robustness
        # against outliers)
        numeric_branch = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=numerical_strategy)),
            ('scaler', RobustScaler()),
        ])

        # Categorical branch: impute, then one-hot encode
        nominal_branch = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=categorical_strategy)),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ])

        # Output column order is numerical first, then the encoded categories
        return ColumnTransformer(
            transformers=[
                ('num', numeric_branch, self.numerical_features),
                ('cat', nominal_branch, self.categorical_features),
            ],
            remainder='drop',
        )
    
    def feature_selection(self, X_train, y_train, X_val, y_val, n_methods=3):
        """
        Perform feature selection using multiple methods.

        The data is first passed through the preprocessor from
        ``_create_preprocessor`` (fitted on the training set only), so the
        selected names refer to preprocessed columns: the numerical features
        followed by the one-hot encoded categorical columns.

        Methods are applied cumulatively, in this order:

        1. RFECV around a logistic regression (5-fold, ROC AUC)
        2. Permutation importance of a random forest, measured on the
           validation set
        3. Mutual information with the target
        4. Mean absolute SHAP value of a LightGBM model

        Methods 2-4 each keep the top 20% of features, but never fewer
        than 5.

        Parameters:
        -----------
        X_train : pandas.DataFrame
            Training features
        y_train : pandas.Series
            Training target
        X_val : pandas.DataFrame
            Validation features
        y_val : pandas.Series
            Validation target
        n_methods : int
            Number of feature selection methods to apply (the first
            ``n_methods`` of the list above)

        Returns:
        --------
        selected_features : dict
            Dictionary with selected features from each method

        Side effects:
        -------------
        Stores the per-method dictionary in ``self.feature_selection_results``
        and the de-duplicated union of all selections, in first-seen order,
        in ``self.selected_features``.
        """
        from functools import partial

        print("\nPerforming feature selection using multiple methods...")

        # Dictionary to store selected features for each method
        selected_features = {}

        # Create a preprocessor
        preprocessor = self._create_preprocessor()

        # Fit on the training data only, then apply to validation
        X_train_processed = preprocessor.fit_transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        # Get feature names after preprocessing (same order as the
        # transformer list: numerical first, then encoded categorical)
        ohe = preprocessor.named_transformers_['cat'].named_steps['encoder']
        cat_feature_names = ohe.get_feature_names_out(self.categorical_features).tolist()
        all_feature_names = self.numerical_features + cat_feature_names

        def top_fraction(ranking, fraction=0.2, minimum=5):
            """Return the best `fraction` of a ranked frame, at least `minimum` names."""
            keep = max(int(len(all_feature_names) * fraction), minimum)
            return ranking.head(keep)['Feature'].tolist()

        # Method 1: Recursive Feature Elimination with Cross-Validation
        if n_methods >= 1:
            print("\n1. Recursive Feature Elimination with Cross-Validation:")

            # Create a base model
            base_model = LogisticRegression(random_state=self.random_state, max_iter=1000, class_weight='balanced')

            # Create RFECV
            rfecv = RFECV(
                estimator=base_model,
                step=1,
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state),
                scoring='roc_auc',
                min_features_to_select=5
            )

            # Fit RFECV
            rfecv.fit(X_train_processed, y_train)

            # Get selected features
            rfecv_features = [
                name for name, kept in zip(all_feature_names, rfecv.support_) if kept
            ]

            print(f"RFECV selected {len(rfecv_features)} features")
            print(f"Optimal number of features: {rfecv.n_features_}")
            print(f"Best score: {rfecv.cv_results_['mean_test_score'].max():.4f}")

            selected_features['rfecv'] = rfecv_features

        # Method 2: Permutation importance
        if n_methods >= 2:
            print("\n2. Permutation Importance:")

            from sklearn.inspection import permutation_importance

            # Train a Random Forest for permutation importance
            rf = RandomForestClassifier(
                n_estimators=100,
                random_state=self.random_state,
                class_weight='balanced'
            )
            rf.fit(X_train_processed, y_train)

            # Importance is measured on held-out validation data
            result = permutation_importance(
                rf, X_val_processed, y_val,
                n_repeats=10,
                random_state=self.random_state,
                n_jobs=-1
            )

            # Sort features by importance
            perm_importance = pd.DataFrame({
                'Feature': all_feature_names,
                'Importance': result.importances_mean
            }).sort_values('Importance', ascending=False)

            # Select top 20% features
            perm_features = top_fraction(perm_importance)

            print(f"Permutation importance selected {len(perm_features)} features")
            print("Top 5 features:")
            print(perm_importance.head(5))

            selected_features['permutation'] = perm_features

        # Method 3: Statistical tests
        if n_methods >= 3:
            print("\n3. Statistical Tests (Mutual Information):")

            # For classification, we use mutual information
            from sklearn.feature_selection import SelectKBest, mutual_info_classif

            # The estimator is randomised; seeding it keeps the scores, and
            # therefore the selection, identical between runs
            seeded_mutual_info = partial(mutual_info_classif, random_state=self.random_state)
            selector = SelectKBest(seeded_mutual_info, k='all')
            selector.fit(X_train_processed, y_train)

            # Get scores
            mi_scores = pd.DataFrame({
                'Feature': all_feature_names,
                'Score': selector.scores_
            }).sort_values('Score', ascending=False)

            # Select top 20% features
            mi_features = top_fraction(mi_scores)

            print(f"Mutual Information selected {len(mi_features)} features")
            print("Top 5 features:")
            print(mi_scores.head(5))

            selected_features['mutual_info'] = mi_features

        # Method 4: SHAP-based feature selection
        if n_methods >= 4:
            print("\n4. SHAP-based Feature Selection:")

            # Train a LightGBM model for SHAP values
            model = lgb.LGBMClassifier(
                n_estimators=100,
                random_state=self.random_state,
                class_weight='balanced'
            )
            model.fit(X_train_processed, y_train)

            # Calculate SHAP values
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_train_processed)

            # Reduce to one (samples, features) matrix for the positive
            # class. A per-class list is handled as before; a 3-D array is
            # assumed to be (samples, features, classes) -- NOTE(review):
            # confirm against the installed SHAP version.
            if isinstance(shap_values, list):
                shap_values = shap_values[1]
            shap_values = np.asarray(shap_values)
            if shap_values.ndim == 3:
                shap_values = shap_values[:, :, 1]

            # Calculate mean absolute SHAP values per feature
            shap_importance = pd.DataFrame({
                'Feature': all_feature_names,
                'Importance': np.abs(shap_values).mean(axis=0)
            }).sort_values('Importance', ascending=False)

            # Select top 20% features
            shap_features = top_fraction(shap_importance)

            print(f"SHAP-based selection selected {len(shap_features)} features")
            print("Top 5 features:")
            print(shap_importance.head(5))

            selected_features['shap'] = shap_features

        # Union of all selections. dict.fromkeys removes duplicates while
        # keeping first-seen order, so the stored list is the same on every
        # run (a set would order the names arbitrarily).
        all_selected = list(dict.fromkeys(
            name for features in selected_features.values() for name in features
        ))

        # Compare methods
        print("\nFeature selection methods comparison:")
        for method, features in selected_features.items():
            print(f"{method}: {len(features)} features")

        print(f"\nUnion of all methods: {len(all_selected)} features")

        # Store results for later use
        self.feature_selection_results = selected_features
        self.selected_features = all_selected

        return selected_features
    
    def feature_engineering(self, X_train, X_val, X_test):
        """
        Perform feature engineering on the dataset.
        
        Parameters:
        -----------
        X_train : pandas.DataFrame
            Training features
        X_val : pandas.DataFrame
            Validation features
        X_test : pandas.DataFrame
            Test features
        
        Returns:
        --------
        X_train_fe, X_val_fe, X_test_fe : pandas.DataFrame
            Feature-engineered datasets
        """
        print("\nPerforming feature engineering...")
        
        # Create copies to avoid modifying the original data
        X_train_fe = X_train.copy()
        X_val_fe = X_val.copy()
        X_test_fe = X_test.copy()
        
        # Engineering 1: Age bins
        for df in [X_train_fe, X_val_fe, X_test_fe]:
            df['age_group'] = pd.cut(
                df['age'],
                bins=[0, 25, 35, 45, 55, 65, 100],
                labels=['<25', '25-35', '35-45', '45-55', '55-65', '65+']
            )
        
        # Engineering 2: Education level simplification
        education_map = {
            'Preschool': 'Low',
            '1st-4th': 'Low',
            '5th-6th': 'Low',
            '7th-8th': 'Low',
            '9th': 'Low',
            '10th': 'Medium-Low',
            '11th': 'Medium-Low',
            '12th': 'Medium-Low',
            'HS-grad': 'Medium',
            'Some-college': 'Medium',
            'Assoc-voc': 'Medium-High',
            'Assoc-acdm': 'Medium-High',
            'Bachelors': 'High',
            'Masters': 'Very-High',
            'Prof-school': 'Very-High',
            'Doctorate': 'Very-High'
        }
        
        for df in [X_train_fe, X_val_fe, X_test_fe]:
            df['education_level'] = df['education'].map(education_map)
        
        # Engineering 3: Capital features
        for df in [X_train_fe, X_val_fe, X_test_fe]:
            # Total capital
            df['total_capital'] = df['capital_gain'] - df['capital_loss']
            
            # Has capital gain/loss
            df['has_capital_gain'] = (df['capital_gain'] > 0).astype(int)
            df['has_capital_loss'] = (df['capital_loss'] > 0).astype(int)
            
            # Log transform of capital (add small constant to avoid log(0))
            df['log_capital_gain'] = np.log1p(df['capital_gain'])
            df['log_capital_loss'] = np.log1p(df['capital_loss'])
        
        # Engineering 4: Work hours features
        for df in [X_train_fe, X_val_fe, X_test_fe]:
            # Work hour categories
            df['work_hours_category'] = pd.cut(
                df['hours_per_week'],
                bins=[0, 20, 40, 60, 100],
                labels=['Part-time', 'Full-time', 'Over-time', 'Workaholic']
            )
            
            # Standard work week
            df['standard_work_week'] = (df['hours_per_week'] == 40).astype(int)
            
            # Work intensity (compared to standard work week)
            df['work_intensity'] = df['hours_per_week'] / 40
        
        # Engineering 5: Interaction features
        for df in [X_train_fe, X_val_fe, X_test_fe]:
            # Age * Education
            df['age_education'] = df['age'] * df['education_num']
            
            # Hours worked * Education
            df['hours_education'] = df['hours_per_week'] * df['education_num']
        
        # Domain-specific feature: Marrital and relationship status
        marital_relationship_map = {
            ('Married-civ-spouse', 'Husband'): 'Married-Male-Provider',
            ('Married-civ-spouse', 'Wife'): 'Married-Female-Provider',
            ('Married-AF-spouse', 'Husband'): 'Military-Spouse-Male',
            ('Married-AF-spouse', 'Wife'): 'Military-Spouse-Female',
            ('Divorced', 'Not-in-family'): 'Divorced-Single',
            ('Divorced', 'Own-child'): 'Divorced-with-Parent',
            ('Divorced', 'Unmarried'): 'Divorced-Unmarried',
            ('Separated', 'Not-in-family'): 'Separated-Single',
            ('Separated', 'Own-child'): 'Separated-with-Parent',
            ('Separated', 'Unmarried'): 'Separated-Unmarried',
            ('Widowed', 'Not-in-family'): 'Widowed-Single',
            ('Widowed', 'Other-relative'): 'Widowed-with-Relative',
            ('Widowed', 'Unmarried'): 'Widowed-Unmarried',
            ('Never-married', 'Not-in-family'): 'Single',
            ('Never-married', 'Own-child'): 'Child-at-Home',
            ('Never-married', 'Other-relative'): 'Single-with-Relative',
            ('Never-married', 'Unmarried'): 'Unmarried-Partner'
        }
        
        for df in [X_train_fe, X_val_fe, X_test_fe]:
            # Create combinations
            df['marital_relationship'] = df.apply(
                lambda x: marital_relationship_map.get(
                    (x['marital_status'], x['relationship']), 'Other'
                ),
                axis=1
            )
        
        print("\nFeature engineering completed.")
        print(f"Original features: {X_train.shape[1]}")
        print(f"Engineered features: {X_train_fe.shape[1]}")
        print(f"New features added: {X_train_fe.shape[1] - X_train.shape[1]}")
        
        # Store engineered data
        self.X_train_fe, self.X_val_fe, self.X_test_fe = X_train_fe, X_val_fe, X_test_fe
        
        return X_train_fe, X_val_fe, X_test_fe
    
    def handle_class_imbalance(self, X_train, y_train, techniques=None):
        """
        Build resampled versions of the training data.

        A preprocessor is fitted on ``X_train`` and each requested
        resampler is run on the preprocessed array. Every entry of the
        result is a ``(X, y, preprocessor)`` tuple sharing that one fitted
        preprocessor. Note that the ``'original'`` entry holds the raw,
        unprocessed ``X_train``, whereas the resampled entries hold
        preprocessed arrays.

        Parameters:
        -----------
        X_train : pandas.DataFrame
            Training features
        y_train : pandas.Series
            Training target
        techniques : list
            List of techniques to apply, default is ['smote', 'smote_tomek', 'adasyn'].
            'class_weights' is also recognised; it adds no result entry but
            stores inverse-frequency weights in ``self.class_weights``.

        Returns:
        --------
        results : dict
            Dictionary with resampled datasets for each technique
        """
        print("\nHandling class imbalance...")

        if techniques is None:
            techniques = ['smote', 'smote_tomek', 'adasyn']

        # One preprocessor, fitted once, shared by every result entry
        preprocessor = self._create_preprocessor()
        X_train_processed = preprocessor.fit_transform(X_train)

        results = {}

        # Baseline distribution before any resampling
        original_counts = np.bincount(y_train)
        n_samples = len(y_train)
        print(f"Original class distribution: {original_counts}, ratio = {original_counts[1] / n_samples:.2f}")

        results['original'] = (X_train, y_train, preprocessor)

        # Resamplers are tried in this fixed order, whatever the order of
        # `techniques`: (result key, display name, sampler class)
        resamplers = (
            ('smote', 'SMOTE', SMOTE),
            ('smote_tomek', 'SMOTETomek', SMOTETomek),
            ('adasyn', 'ADASYN', ADASYN),
        )
        for key, label, sampler_cls in resamplers:
            if key not in techniques:
                continue

            print(f"\nApplying {label}...")
            sampler = sampler_cls(random_state=self.random_state)
            X_resampled, y_resampled = sampler.fit_resample(X_train_processed, y_train)

            counts = np.bincount(y_resampled)
            print(f"{label} class distribution: {counts}, ratio = {counts[1] / len(y_resampled):.2f}")

            results[key] = (X_resampled, y_resampled, preprocessor)

        # Class weights
        if 'class_weights' in techniques:
            print("\nCalculating class weights...")
            # Weight of class c is n_samples / (2 * count of c)
            class_weights = {
                label: n_samples / (2 * original_counts[label]) for label in (0, 1)
            }
            print(f"Class weights: {class_weights}")

            # Kept for later use with models
            self.class_weights = class_weights

        self.imbalance_handling_results = results

        return results
    
    def _create_lightgbm_model(self, params=None):
        """
        Create a LightGBM classifier with the given parameters.

        Parameters:
        -----------
        params : dict
            Model parameters; entries override the defaults defined below

        Returns:
        --------
        model : LGBMClassifier
            LightGBM classifier (unfitted)
        """
        default_params = {
            'objective': 'binary',
            'metric': 'auc',
            'n_estimators': 200,
            'learning_rate': 0.05,
            'num_leaves': 31,
            'max_depth': -1,
            'min_child_samples': 20,
            'subsample': 0.8,
            # LightGBM only applies row subsampling when the bagging
            # frequency is non-zero; without this entry `subsample` above
            # is silently ignored
            'subsample_freq': 1,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            'random_state': self.random_state,
            'class_weight': 'balanced',
            'n_jobs': -1
        }

        # Caller-supplied values take precedence over the defaults
        if params is not None:
            default_params.update(params)

        return lgb.LGBMClassifier(**default_params)
    
    def _create_xgboost_model(self, params=None):
        """
        Build an unfitted XGBoost classifier.

        Parameters:
        -----------
        params : dict
            Model parameters; entries override the defaults defined below

        Returns:
        --------
        model : XGBClassifier
            XGBoost classifier
        """
        # Baseline configuration for binary classification scored by AUC
        settings = dict(
            objective='binary:logistic',
            eval_metric='auc',
            n_estimators=200,
            learning_rate=0.05,
            max_depth=6,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            gamma=0.1,
            reg_alpha=0.1,
            reg_lambda=0.1,
            scale_pos_weight=1,
            random_state=self.random_state,
            n_jobs=-1,
        )

        # Caller-supplied values replace the baseline ones
        overrides = {} if params is None else params
        settings.update(overrides)

        return xgb.XGBClassifier(**settings)
    
    def _create_catboost_model(self, params=None):
        """
        Build an unfitted CatBoost classifier.

        Parameters:
        -----------
        params : dict
            Model parameters; entries override the defaults defined below

        Returns:
        --------
        model : CatBoostClassifier
            CatBoost classifier
        """
        # Baseline configuration: silent CPU training with Logloss, AUC as
        # the evaluation metric
        settings = dict(
            iterations=200,
            learning_rate=0.05,
            depth=6,
            l2_leaf_reg=3,
            bagging_temperature=1,
            random_strength=1,
            od_type='Iter',
            od_wait=50,
            random_seed=self.random_state,
            verbose=0,
            task_type='CPU',
            loss_function='Logloss',
            eval_metric='AUC',
            class_weights=[1, 2],  # Adjust for class imbalance
        )

        # Caller-supplied values replace the baseline ones
        overrides = {} if params is None else params
        settings.update(overrides)

        return cb.CatBoostClassifier(**settings)

    def train_models(self, X_train, y_train, X_val, y_val, techniques=None):
        """
        Train five classifiers per balancing technique and score them on validation data.
        
        For every technique, Logistic Regression, Random Forest, XGBoost,
        LightGBM and CatBoost are fitted and scored by ROC AUC on
        (X_val, y_val). The entry with the highest validation AUC across all
        techniques is recorded as the best model.
        
        Parameters:
        -----------
        X_train : numpy.ndarray or pandas.DataFrame
            Training features (used only when a technique has no stored data)
        y_train : numpy.ndarray or pandas.Series
            Training target (used only when a technique has no stored data)
        X_val : numpy.ndarray or pandas.DataFrame
            Raw (not yet preprocessed) validation features
        y_val : numpy.ndarray or pandas.Series
            Validation target
        techniques : list
            Resampling techniques to use; defaults to
            ['original', 'smote', 'smote_tomek', 'adasyn']. A technique that is
            missing from self.imbalance_handling_results falls back to
            X_train / y_train with a freshly created preprocessor.
        
        Returns:
        --------
        model_results : dict
            Nested dictionary: technique -> model key -> dict with the keys
            'model', 'val_auc', 'preprocessor' and 'is_preprocessed'
        
        Side effects:
        -------------
        Sets self.model_results and self.best_model_info.
        """
        print("\nTraining multiple models...")
        
        if techniques is None:
            techniques = ['original', 'smote', 'smote_tomek', 'adasyn']
        
        # Tolerate the imbalance-handling step not having been run: every
        # technique then takes the "not found" fallback below.
        imbalance_results = getattr(self, 'imbalance_handling_results', {})
        
        def build_logistic_regression():
            return LogisticRegression(
                C=1.0,
                penalty='l2',
                solver='liblinear',
                class_weight='balanced',
                random_state=self.random_state,
                max_iter=1000
            )
        
        def build_random_forest():
            return RandomForestClassifier(
                n_estimators=200,
                max_depth=10,
                min_samples_split=10,
                min_samples_leaf=4,
                max_features='sqrt',
                bootstrap=True,
                class_weight='balanced',
                random_state=self.random_state,
                n_jobs=-1
            )
        
        # (result key, display name, factory, eval_set style)
        # eval_set style: None  -> plain fit without early stopping
        #                 'list' -> eval_set=[(X, y)]  (XGBoost / LightGBM)
        #                 'pair' -> eval_set=(X, y)    (CatBoost)
        # Factories are wrapped in lambdas so nothing is looked up on `self`
        # until a model is actually built.
        candidates = [
            ('logistic_regression', 'Logistic Regression', build_logistic_regression, None),
            ('random_forest', 'Random Forest', build_random_forest, None),
            ('xgboost', 'XGBoost', lambda: self._create_xgboost_model(), 'list'),
            ('lightgbm', 'LightGBM', lambda: self._create_lightgbm_model(), 'list'),
            ('catboost', 'CatBoost', lambda: self._create_catboost_model(), 'pair'),
        ]
        
        # Store results
        model_results = {}
        
        # Loop over each balancing technique
        for technique in techniques:
            print(f"\nTraining models with {technique} balancing...")
            
            # Get the appropriate data
            if technique in imbalance_results:
                X_technique, y_technique, preprocessor = imbalance_results[technique]
                
                # If not original, the stored training data is already preprocessed
                is_preprocessed = technique != 'original'
            else:
                print(f"Technique {technique} not found in imbalance handling results. Using original data.")
                X_technique, y_technique = X_train, y_train
                preprocessor = self._create_preprocessor()
                is_preprocessed = False
            
            # Fit the preprocessor on the training data BEFORE using it on the
            # validation data; transforming first would either fail on an
            # unfitted preprocessor or use a stale fit.
            if not is_preprocessed:
                X_technique = preprocessor.fit_transform(X_technique)
            
            # The validation features always go through the preprocessor so
            # they live in the same feature space as the training data. For
            # resampled techniques this assumes the stored preprocessor is the
            # fitted one that produced the stored training data, which
            # create_ensemble also relies on.
            X_val_processed = preprocessor.transform(X_val)
            
            # Store models for this technique
            technique_models = {}
            
            for model_key, display_name, build_model, eval_style in candidates:
                print(f"\nTraining {display_name}...")
                model = build_model()
                
                if eval_style is None:
                    model.fit(X_technique, y_technique)
                else:
                    if eval_style == 'list':
                        eval_set = [(X_val_processed, y_val)]
                    else:
                        eval_set = (X_val_processed, y_val)
                    
                    # NOTE(review): newer XGBoost (2.x) and LightGBM (4.x)
                    # releases no longer accept early_stopping_rounds / verbose
                    # in fit() - confirm against the pinned library versions.
                    model.fit(
                        X_technique, y_technique,
                        eval_set=eval_set,
                        early_stopping_rounds=50,
                        verbose=False
                    )
                
                # Evaluate on the validation set (probability of the positive class)
                val_pred = model.predict_proba(X_val_processed)[:, 1]
                val_auc = roc_auc_score(y_val, val_pred)
                print(f"{display_name} Validation AUC: {val_auc:.4f}")
                
                technique_models[model_key] = {
                    'model': model,
                    'val_auc': val_auc,
                    'preprocessor': preprocessor,
                    'is_preprocessed': is_preprocessed
                }
            
            # Store all models for this technique
            model_results[technique] = technique_models
        
        # Find the best model (strictly greater AUC wins, so the first of tied entries is kept)
        best_model_info = {'val_auc': 0, 'model_name': None, 'technique': None}
        
        for technique, models in model_results.items():
            for model_name, model_info in models.items():
                if model_info['val_auc'] > best_model_info['val_auc']:
                    best_model_info['val_auc'] = model_info['val_auc']
                    best_model_info['model_name'] = model_name
                    best_model_info['technique'] = technique
        
        print(f"\nBest model: {best_model_info['model_name']} with {best_model_info['technique']} balancing")
        print(f"Validation AUC: {best_model_info['val_auc']:.4f}")
        
        # Store the best model information
        self.best_model_info = best_model_info
        self.model_results = model_results
        
        return model_results
    
    def hyperparameter_tuning(self, X_train, y_train, X_val, y_val, model_name, technique, n_trials=50):
        """
        Tune one model's hyperparameters with Optuna, maximizing validation ROC AUC.
        
        Parameters:
        -----------
        X_train : numpy.ndarray or pandas.DataFrame
            Training features (used only when the technique has no stored data)
        y_train : numpy.ndarray or pandas.Series
            Training target (used only when the technique has no stored data)
        X_val : numpy.ndarray or pandas.DataFrame
            Raw (not yet preprocessed) validation features
        y_val : numpy.ndarray or pandas.Series
            Validation target
        model_name : str
            Model to tune: 'xgboost', 'lightgbm', 'catboost', 'random_forest'
            or 'logistic_regression'
        technique : str
            Resampling technique to use; when it is missing from
            self.imbalance_handling_results, X_train / y_train are used with a
            freshly created preprocessor
        n_trials : int
            Number of trials for hyperparameter optimization
        
        Returns:
        --------
        best_params : dict
            Best hyperparameters found
        best_model : object
            Model refitted on the training data with the best hyperparameters
        
        Raises:
        -------
        ValueError
            If model_name is not one of the supported models
        
        Side effects:
        -------------
        Sets self.best_tuned_model.
        """
        print(f"\nPerforming hyperparameter tuning for {model_name} with {technique} balancing...")
        
        # Reject unknown models before doing any preprocessing or starting a study
        boosted_models = ('xgboost', 'lightgbm', 'catboost')
        supported_models = boosted_models + ('random_forest', 'logistic_regression')
        if model_name not in supported_models:
            raise ValueError(f"Unsupported model: {model_name}")
        
        # Tolerate the imbalance-handling step not having been run
        imbalance_results = getattr(self, 'imbalance_handling_results', {})
        
        # Get the appropriate data
        if technique in imbalance_results:
            X_technique, y_technique, preprocessor = imbalance_results[technique]
            
            # If not original, the stored training data is already preprocessed
            is_preprocessed = technique != 'original'
        else:
            print(f"Technique {technique} not found in imbalance handling results. Using original data.")
            X_technique, y_technique = X_train, y_train
            preprocessor = self._create_preprocessor()
            is_preprocessed = False
        
        # Fit the preprocessor on the training data BEFORE transforming the
        # validation data; the reverse order fails on an unfitted preprocessor
        # or silently uses a stale fit.
        if not is_preprocessed:
            X_technique = preprocessor.fit_transform(X_technique)
        
        # Validation features always go through the preprocessor so they match
        # the feature space the model is trained on. For resampled techniques
        # this assumes the stored preprocessor is the fitted one that produced
        # the stored training data, which create_ensemble also relies on.
        X_val_processed = preprocessor.transform(X_val)
        
        def suggest_params(trial):
            """Sample one hyperparameter configuration for `model_name`."""
            if model_name == 'xgboost':
                return {
                    'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                    'max_depth': trial.suggest_int('max_depth', 3, 12),
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                    'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                    'gamma': trial.suggest_float('gamma', 0.0, 1.0),
                    # A log-scaled range needs a strictly positive lower bound
                    'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
                    'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
                    'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0)
                }
            
            if model_name == 'lightgbm':
                return {
                    'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                    'num_leaves': trial.suggest_int('num_leaves', 20, 150),
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                    'max_depth': trial.suggest_int('max_depth', -1, 12),
                    'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
                    'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                    # A log-scaled range needs a strictly positive lower bound
                    'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
                    'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)
                }
            
            if model_name == 'catboost':
                return {
                    'iterations': trial.suggest_int('iterations', 100, 500),
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                    'depth': trial.suggest_int('depth', 4, 10),
                    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
                    'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 10.0),
                    'random_strength': trial.suggest_float('random_strength', 0.0, 10.0)
                }
            
            if model_name == 'random_forest':
                return {
                    'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                    'max_depth': trial.suggest_int('max_depth', 5, 30),
                    'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                    'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2'])
                }
            
            # Only 'logistic_regression' remains (model_name was validated above)
            return {
                'C': trial.suggest_float('C', 0.001, 10.0, log=True),
                'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
                'solver': trial.suggest_categorical('solver', ['liblinear', 'saga'])
            }
        
        def build_model(params):
            """Create an unfitted `model_name` estimator from a parameter dict."""
            if model_name == 'xgboost':
                return self._create_xgboost_model(params)
            if model_name == 'lightgbm':
                return self._create_lightgbm_model(params)
            if model_name == 'catboost':
                return self._create_catboost_model(params)
            if model_name == 'random_forest':
                return RandomForestClassifier(
                    **params,
                    class_weight='balanced',
                    random_state=self.random_state,
                    n_jobs=-1
                )
            return LogisticRegression(
                **params,
                class_weight='balanced',
                random_state=self.random_state,
                max_iter=1000
            )
        
        def fit_model(model):
            """Fit on the training data; boosted models early-stop on the validation set."""
            if model_name in boosted_models:
                # NOTE(review): newer XGBoost (2.x) and LightGBM (4.x) releases
                # no longer accept early_stopping_rounds / verbose in fit() -
                # confirm against the pinned library versions.
                model.fit(
                    X_technique, y_technique,
                    eval_set=[(X_val_processed, y_val)],
                    early_stopping_rounds=50,
                    verbose=False
                )
            else:
                model.fit(X_technique, y_technique)
            return model
        
        # Objective for Optuna: validation ROC AUC of one sampled configuration.
        # Every supported model exposes predict_proba, so AUC is always available.
        def objective(trial):
            model = fit_model(build_model(suggest_params(trial)))
            y_val_pred = model.predict_proba(X_val_processed)[:, 1]
            return roc_auc_score(y_val, y_val_pred)
        
        # Create Optuna study; seed the sampler so the search follows the
        # pipeline's random_state like every other component does
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=self.random_state)
        )
        study.optimize(objective, n_trials=n_trials)
        
        print(f"Best parameters: {study.best_params}")
        print(f"Best validation AUC: {study.best_value:.4f}")
        
        # Refit a fresh model with the best configuration
        best_model = fit_model(build_model(study.best_params))
        
        # Store the best model
        self.best_tuned_model = {
            'model': best_model,
            'params': study.best_params,
            'val_auc': study.best_value,
            'preprocessor': preprocessor,
            'is_preprocessed': is_preprocessed
        }
        
        return study.best_params, best_model
    
    def create_ensemble(self, X_train, y_train, X_val, y_val, top_n=3):
        """
        Build voting, stacking and weighted-average ensembles from the top models.
        
        The models in self.model_results are ranked by validation AUC and the
        best `top_n` are combined in three ways. The voting and stacking
        ensembles are fitted on the training data of the best-ranked model's
        technique; the weighted average reuses the already fitted models, each
        with its own preprocessor, weighted by validation AUC.
        
        Parameters:
        -----------
        X_train : numpy.ndarray or pandas.DataFrame
            Training features (used only when the best technique has no stored data)
        y_train : numpy.ndarray or pandas.Series
            Training target (used only when the best technique has no stored data)
        X_val : numpy.ndarray or pandas.DataFrame
            Raw (not yet preprocessed) validation features
        y_val : numpy.ndarray or pandas.Series
            Validation target
        top_n : int
            Number of top models to include in the ensemble (must be >= 1)
        
        Returns:
        --------
        ensemble_results : dict
            Dictionary keyed by 'voting', 'stacking' and 'weighted'; each entry
            holds the ensemble and its validation AUC
        
        Raises:
        -------
        ValueError
            If top_n is smaller than 1 or self.model_results holds no models
        
        Side effects:
        -------------
        Sets self.ensemble_results and self.best_ensemble, the latter as a
        (name, info) tuple.
        """
        print(f"\nCreating an ensemble of top {top_n} models...")
        
        # Collect all models and their validation AUCs
        all_models = [
            {
                'technique': technique,
                'model_name': model_name,
                'val_auc': model_info['val_auc'],
                'model': model_info['model'],
                'preprocessor': model_info['preprocessor'],
                'is_preprocessed': model_info['is_preprocessed']
            }
            for technique, models in self.model_results.items()
            for model_name, model_info in models.items()
        ]
        
        # Fail early with a clear message instead of an IndexError further down
        if top_n < 1 or not all_models:
            raise ValueError(
                "Cannot build an ensemble: run train_models first and use top_n >= 1 "
                f"(got top_n={top_n}, {len(all_models)} trained models)."
            )
        
        # Sort by validation AUC and select the top N models
        all_models.sort(key=lambda x: x['val_auc'], reverse=True)
        top_models = all_models[:top_n]
        
        print("\nTop models selected for ensemble:")
        for i, model_info in enumerate(top_models, 1):
            print(f"{i}. {model_info['model_name']} with {model_info['technique']} balancing, AUC = {model_info['val_auc']:.4f}")
        
        # Named base estimators shared by the voting and stacking ensembles;
        # the index suffix keeps names unique when a model type appears twice
        estimators = [
            (f"{model_info['model_name']}_{i}", model_info['model'])
            for i, model_info in enumerate(top_models)
        ]
        
        # 1. Voting Ensemble
        print("\nCreating Voting Classifier...")
        
        voting_ensemble = VotingClassifier(
            estimators=estimators,
            voting='soft',
            n_jobs=-1
        )
        
        # An sklearn ensemble can only be fitted on a single data set, so the
        # data of the best-ranked model's technique is used.
        best_model_info = top_models[0]
        best_technique = best_model_info['technique']
        best_preprocessor = best_model_info['preprocessor']
        
        if best_technique in self.imbalance_handling_results:
            X_train_processed, y_train_processed, _ = self.imbalance_handling_results[best_technique]
            
            # Stored data of resampled techniques is already preprocessed
            if not best_model_info['is_preprocessed']:
                X_train_processed = best_preprocessor.transform(X_train_processed)
        else:
            X_train_processed = best_preprocessor.transform(X_train)
            y_train_processed = y_train
        
        X_val_processed = best_preprocessor.transform(X_val)
        
        # Fit and evaluate the voting ensemble
        voting_ensemble.fit(X_train_processed, y_train_processed)
        voting_val_pred = voting_ensemble.predict_proba(X_val_processed)[:, 1]
        voting_val_auc = roc_auc_score(y_val, voting_val_pred)
        
        print(f"Voting Ensemble Validation AUC: {voting_val_auc:.4f}")
        
        # 2. Stacking Ensemble
        print("\nCreating Stacking Classifier...")
        
        # Define meta-learner
        meta_learner = LogisticRegression(max_iter=1000)
        
        # Create stacking ensemble (own copy of the list so the two ensembles
        # do not share one mutable estimator list)
        stacking_ensemble = StackingClassifier(
            estimators=list(estimators),
            final_estimator=meta_learner,
            cv=5,
            n_jobs=-1
        )
        
        # Fit and evaluate the stacking ensemble
        stacking_ensemble.fit(X_train_processed, y_train_processed)
        stacking_val_pred = stacking_ensemble.predict_proba(X_val_processed)[:, 1]
        stacking_val_auc = roc_auc_score(y_val, stacking_val_pred)
        
        print(f"Stacking Ensemble Validation AUC: {stacking_val_auc:.4f}")
        
        # 3. Weighted Average Ensemble
        print("\nCreating Weighted Average Ensemble...")
        
        def weighted_average_ensemble(X, models, weights, preprocessors):
            """
            Return the weight-normalized average of the base models'
            positive-class probabilities; X is raw data and is transformed
            with each model's own preprocessor.
            """
            predictions = []
            
            for model, weight, preprocessor in zip(models, weights, preprocessors):
                # Preprocess the data for this model
                X_processed = preprocessor.transform(X)
                
                # Get predictions
                pred = model.predict_proba(X_processed)[:, 1]
                predictions.append(pred * weight)
            
            # Weighted average
            return sum(predictions) / sum(weights)
        
        # Extract models, weights (based on validation AUC), and preprocessors
        models = [model_info['model'] for model_info in top_models]
        weights = [model_info['val_auc'] for model_info in top_models]
        preprocessors = [model_info['preprocessor'] for model_info in top_models]
        
        # Make predictions
        weighted_val_pred = weighted_average_ensemble(X_val, models, weights, preprocessors)
        weighted_val_auc = roc_auc_score(y_val, weighted_val_pred)
        
        print(f"Weighted Average Ensemble Validation AUC: {weighted_val_auc:.4f}")
        
        # Compare ensemble methods
        ensemble_results = {
            'voting': {
                'ensemble': voting_ensemble,
                'val_auc': voting_val_auc,
                'preprocessor': best_preprocessor
            },
            'stacking': {
                'ensemble': stacking_ensemble,
                'val_auc': stacking_val_auc,
                'preprocessor': best_preprocessor
            },
            'weighted': {
                'ensemble': {
                    'models': models,
                    'weights': weights,
                    'preprocessors': preprocessors
                },
                'val_auc': weighted_val_auc,
                'prediction_function': weighted_average_ensemble
            }
        }
        
        # Find the best ensemble
        best_ensemble = max(ensemble_results.items(), key=lambda x: x[1]['val_auc'])
        best_ensemble_name, best_ensemble_info = best_ensemble
        
        print(f"\nBest ensemble method: {best_ensemble_name}")
        print(f"Best ensemble validation AUC: {best_ensemble_info['val_auc']:.4f}")
        
        # Store ensemble results
        self.ensemble_results = ensemble_results
        self.best_ensemble = best_ensemble
        
        return ensemble_results
    
    def evaluate_model(self, model, X_test, y_test, preprocessor=None, is_ensemble=False, ensemble_type=None):
        """
        Score a model or ensemble on the test set and print a report.
        
        Parameters:
        -----------
        model : object
            Fitted estimator, or for the weighted ensemble a dict with the
            keys 'models', 'weights' and 'preprocessors'
        X_test : pandas.DataFrame
            Test features
        y_test : pandas.Series
            Test target
        preprocessor : object
            Optional preprocessor applied to X_test before prediction
        is_ensemble : bool
            Whether the model is an ensemble
        ensemble_type : str
            Type of ensemble ('voting', 'stacking', 'weighted')
        
        Returns:
        --------
        metrics : dict
            Accuracy, precision, recall, F1, AUC-ROC, log loss, confusion
            matrix and classification report
        """
        print("\nEvaluating model on test set...")
        
        # Transform the test features when a preprocessor was supplied
        X_test_processed = X_test if preprocessor is None else preprocessor.transform(X_test)
        
        if not (is_ensemble and ensemble_type == 'weighted'):
            # Regular estimators and sklearn ensembles predict directly
            y_pred_proba = model.predict_proba(X_test_processed)[:, 1]
            y_pred = model.predict(X_test_processed)
        else:
            # The weighted ensemble preprocesses the raw features per base
            # model inside its stored prediction function
            base_models = model['models']
            base_weights = model['weights']
            base_preprocessors = model['preprocessors']
            
            predict_fn = self.ensemble_results['weighted']['prediction_function']
            y_pred_proba = predict_fn(X_test, base_models, base_weights, base_preprocessors)
            y_pred = (y_pred_proba > 0.5).astype(int)
        
        # Compute every metric up front, in report order
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc_roc': roc_auc_score(y_test, y_pred_proba),
            'log_loss': log_loss(y_test, y_pred_proba),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred)
        }
        
        # Print the scalar metrics
        scalar_report = (
            ('Accuracy', 'accuracy'),
            ('Precision', 'precision'),
            ('Recall', 'recall'),
            ('F1 Score', 'f1'),
            ('AUC-ROC', 'auc_roc'),
            ('Log Loss', 'log_loss'),
        )
        for label, key in scalar_report:
            print(f"{label}: {metrics[key]:.4f}")
        
        print("\nConfusion Matrix:")
        print(metrics['confusion_matrix'])
        
        print("\nClassification Report:")
        print(metrics['classification_report'])
        
        return metrics
    
    def interpret_model(self, model, X_test, preprocessor=None, is_ensemble=False, n_features=20, y_test=None):
        """
        Rank features by importance for a model or ensemble.
        
        Single models exposing `feature_importances_` are explained with SHAP's
        TreeExplainer. Other single models use permutation importance, which
        needs `y_test`. Ensembles average the `feature_importances_` of their
        fitted base estimators. When no importance can be computed, a
        one-row 'N/A' placeholder frame is returned.
        
        Parameters:
        -----------
        model : object
            Model to interpret
        X_test : pandas.DataFrame
            Test features
        preprocessor : object
            Optional preprocessor applied to X_test; also used for feature
            names when it provides get_feature_names_out
        is_ensemble : bool
            Whether the model is an ensemble
        n_features : int
            Number of top features to display
        y_test : pandas.Series, optional
            Test target; required only for permutation importance
        
        Returns:
        --------
        feature_importance : pandas.DataFrame
            Columns 'Feature' and 'Importance', sorted by descending importance
        
        Side effects:
        -------------
        Sets self.feature_importance.
        """
        print("\nInterpreting model using SHAP values...")
        
        # Preprocess the test data
        if preprocessor is not None:
            X_test_processed = preprocessor.transform(X_test)
        else:
            X_test_processed = X_test
        
        # Get feature names from the preprocessor when it can provide them
        feature_names = None
        if hasattr(preprocessor, 'get_feature_names_out'):
            try:
                feature_names = preprocessor.get_feature_names_out()
            except Exception:
                # Best effort: fall back to generic names below
                feature_names = None
        if feature_names is None:
            feature_names = [f"feature_{i}" for i in range(X_test_processed.shape[1])]
        
        def placeholder_importance():
            """Frame returned when no importance can be computed."""
            return pd.DataFrame({
                'Feature': ['N/A'],
                'Importance': [0]
            })
        
        def ranked_importance(values):
            """Top `n_features` features sorted by descending importance."""
            return pd.DataFrame({
                'Feature': feature_names,
                'Importance': values
            }).sort_values('Importance', ascending=False).head(n_features)
        
        # For tree-based models, use TreeExplainer
        if not is_ensemble and hasattr(model, 'feature_importances_'):
            print("Using TreeExplainer for SHAP values...")
            
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_test_processed)
            
            # If shap_values is a list, get the values for the positive class
            if isinstance(shap_values, list):
                shap_values = shap_values[1]
            
            # NOTE(review): some SHAP versions appear to return a single
            # (samples, features, classes) array instead of a list; keep the
            # positive class in that case too - confirm against the pinned version.
            shap_values = np.asarray(shap_values)
            if shap_values.ndim == 3:
                shap_values = shap_values[:, :, 1]
            
            # Create SHAP summary plot
            print("\nSHAP Summary Plot:")
            print("(This would display a visual summary of feature impacts)")
            
            # Mean absolute SHAP value per feature
            feature_importance = ranked_importance(np.abs(shap_values).mean(axis=0))
        
        # For other models, use a different approach
        else:
            print("Using permutation importance for feature importance...")
            
            if not is_ensemble:
                if y_test is None:
                    # Permutation importance scores the model against the
                    # true labels, so it cannot run without them
                    print("Permutation importance requires y_test; no feature importance computed.")
                    feature_importance = placeholder_importance()
                else:
                    from sklearn.inspection import permutation_importance
                    
                    perm_importance = permutation_importance(
                        model, X_test_processed, y_test,
                        n_repeats=10,
                        random_state=self.random_state,
                        n_jobs=-1
                    )
                    
                    feature_importance = ranked_importance(perm_importance.importances_mean)
            else:
                # For ensembles, average feature importance from base models
                importances = [
                    estimator.feature_importances_
                    for estimator in getattr(model, 'estimators_', [])
                    if hasattr(estimator, 'feature_importances_')
                ]
                
                if importances:
                    feature_importance = ranked_importance(np.mean(importances, axis=0))
                else:
                    feature_importance = placeholder_importance()
        
        # Print top features
        print("\nTop features by importance:")
        print(feature_importance)
        
        # Store feature importance
        self.feature_importance = feature_importance
        
        return feature_importance
    
    def save_model(self, model, filename, preprocessor=None):
        """
        Persist a model bundle to disk with joblib.

        The bundle is a dictionary holding the model, the optional
        preprocessor, the feature importance stored on this instance (None
        when no such attribute exists) and a small metadata record with the
        creation timestamp and the instance's random state.

        Parameters:
        -----------
        model : object
            Model to save
        filename : str
            Filename to save to
        preprocessor : object, optional
            Preprocessor for the data; stored as-is (default: None)

        Returns:
        --------
        None
        """
        print(f"\nSaving model to {filename}...")

        # Bookkeeping stored next to the model itself
        metadata = {
            'creation_date': time.strftime('%Y-%m-%d %H:%M:%S'),
            'random_state': self.random_state
        }

        # feature_importance is only present on the instance once it has
        # been assigned elsewhere, so fall back to None when it is missing
        bundle = {
            'model': model,
            'preprocessor': preprocessor,
            'feature_importance': getattr(self, 'feature_importance', None),
            'metadata': metadata
        }

        # Write the whole bundle in a single dump
        joblib.dump(bundle, filename)

        print(f"Model saved successfully to {filename}")
    
    def run_pipeline(self):
        """
        Execute every stage of the workflow in order and collect the outputs.

        Stages: load data, split, exploratory analysis, feature engineering,
        feature selection, class-imbalance handling, model training,
        hyperparameter tuning of the best model, ensembling, test-set
        evaluation of the tuned model and the best ensemble, interpretation
        of the tuned model, and saving whichever of the two has the higher
        test AUC-ROC to 'adult_income_model.joblib'.

        Returns:
        --------
        results : dict
            Dictionary with pipeline results, keyed by 'eda_results',
            'feature_selection_results', 'imbalance_results',
            'model_results', 'best_model', 'ensemble_results' and
            'final_model'
        """
        print("\nRunning complete machine learning pipeline...")

        # Stages 1-2: load the data and split it into train/val/test
        X, y = self.load_data()
        X_train, X_val, X_test, y_train, y_val, y_test = self.split_data(X, y)

        # Stage 3: exploratory analysis on the training portion only
        eda_results = self.exploratory_data_analysis(X_train, y_train)

        # Stage 4: feature engineering applied to all three splits
        X_train_fe, X_val_fe, X_test_fe = self.feature_engineering(
            X_train, X_val, X_test
        )

        # Stages 5-7: feature selection, imbalance handling, model training
        feature_selection_results = self.feature_selection(
            X_train_fe, y_train, X_val_fe, y_val
        )
        imbalance_results = self.handle_class_imbalance(X_train_fe, y_train)
        model_results = self.train_models(X_train_fe, y_train, X_val_fe, y_val)

        # Stage 8: tune the winner recorded in self.best_model_info
        winner = self.best_model_info
        best_model_name = winner['model_name']
        best_technique = winner['technique']

        # Only the parameters are used below; the tuned model itself is read
        # back from self.best_tuned_model in stage 10
        best_params, _ = self.hyperparameter_tuning(
            X_train_fe, y_train, X_val_fe, y_val,
            best_model_name, best_technique
        )

        # Stage 9: build the ensembles
        ensemble_results = self.create_ensemble(X_train_fe, y_train, X_val_fe, y_val)

        # Stage 10: score the tuned model on the test split
        tuned = self.best_tuned_model
        best_tuned_model = tuned['model']
        best_preprocessor = tuned['preprocessor']

        best_metrics = self.evaluate_model(
            best_tuned_model, X_test_fe, y_test,
            preprocessor=best_preprocessor
        )

        # Stage 11: score the best ensemble on the test split. The 'weighted'
        # ensemble is evaluated without a preprocessor argument; every other
        # ensemble type passes its own preprocessor.
        best_ensemble_name, best_ensemble_info = self.best_ensemble
        best_ensemble_model = best_ensemble_info['ensemble']

        ensemble_eval_kwargs = {
            'is_ensemble': True,
            'ensemble_type': best_ensemble_name
        }
        if best_ensemble_name != 'weighted':
            ensemble_eval_kwargs['preprocessor'] = best_ensemble_info['preprocessor']

        ensemble_metrics = self.evaluate_model(
            best_ensemble_model, X_test_fe, y_test,
            **ensemble_eval_kwargs
        )

        # Stage 12: interpret the tuned model
        # NOTE(review): importance is always computed for the tuned model,
        # even when the ensemble is selected as the final model below.
        feature_importance = self.interpret_model(
            best_tuned_model, X_test_fe,
            preprocessor=best_preprocessor
        )

        # Stage 13: pick the final model by test AUC-ROC (ties go to the
        # ensemble because the comparison is strict)
        # NOTE(review): selecting on test-set AUC means the reported final
        # metrics are optimistically biased; consider selecting on validation.
        tuned_is_better = best_metrics['auc_roc'] > ensemble_metrics['auc_roc']
        if tuned_is_better:
            print("\nBest tuned model outperforms ensemble. Selecting as final model.")
            final_model, final_preprocessor, final_metrics = (
                best_tuned_model, best_preprocessor, best_metrics
            )
        else:
            print("\nEnsemble outperforms best tuned model. Selecting as final model.")
            final_model = best_ensemble_model
            # The 'weighted' ensemble may carry no preprocessor entry
            final_preprocessor = best_ensemble_info.get('preprocessor')
            final_metrics = ensemble_metrics

        # Persist whichever model was selected
        self.save_model(final_model, 'adult_income_model.joblib', final_preprocessor)

        # Assemble the result dictionary
        best_model_summary = {
            'name': best_model_name,
            'technique': best_technique,
            'params': best_params,
            'metrics': best_metrics
        }
        final_model_summary = {
            'model': final_model,
            'preprocessor': final_preprocessor,
            'metrics': final_metrics,
            'feature_importance': feature_importance
        }
        results = {
            'eda_results': eda_results,
            'feature_selection_results': feature_selection_results,
            'imbalance_results': imbalance_results,
            'model_results': model_results,
            'best_model': best_model_summary,
            'ensemble_results': ensemble_results,
            'final_model': final_model_summary
        }

        # Closing banner and headline numbers
        banner = "=" * 50
        print("\n" + banner)
        print("PIPELINE COMPLETED SUCCESSFULLY")
        print(banner)

        print(f"\nFinal model AUC-ROC: {final_metrics['auc_roc']:.4f}")
        print(f"Final model accuracy: {final_metrics['accuracy']:.4f}")
        print(f"Final model F1 score: {final_metrics['f1']:.4f}")

        print("\nTop 10 important features:")
        print(feature_importance.head(10))

        return results

# Script entry point: the pipeline runs only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    # Build the classifier with its default constructor arguments
    classifier = AdultIncomeClassifier()
    
    # Run every stage end to end. `results` is the dictionary returned by
    # run_pipeline; the run also writes 'adult_income_model.joblib' to disk.
    results = classifier.run_pipeline()

ModuleNotFoundError: No module named 'catboost'