In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, r2_score
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

# Dataset location. When running on Kaggle, point INPUT_PATH at
# '/kaggle/input/customer-churn-classification' instead.
INPUT_PATH = '../Datasets'
train_path, test_path = f"{INPUT_PATH}/train.csv", f"{INPUT_PATH}/test.csv"

# Read the train split first, then the test split, into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

class SafeLabelEncoder:
    """Label encoder that tolerates categories unseen during fitting.

    Wraps scikit-learn's ``LabelEncoder`` and always reserves an
    ``'UNKNOWN'`` class at fit time. At transform time every value that was
    not seen while fitting (missing values included) is encoded as the
    ``'UNKNOWN'`` class instead of raising.

    Values are looked up in a set, so they must be hashable (strings read
    from a CSV are).
    """

    UNKNOWN_TOKEN = 'UNKNOWN'

    def __init__(self, unknown_value=-1):
        # NOTE: ``unknown_value`` is kept for backward compatibility only.
        # Unseen values receive the integer code of the 'UNKNOWN' class,
        # not this value.
        self.unknown_value = unknown_value
        self.label_encoder = LabelEncoder()
        self.classes_ = None
        # Set view of ``classes_`` for O(1) membership tests in transform().
        self._known = frozenset()

    def fit(self, series):
        """Learn the classes present in ``series`` plus ``'UNKNOWN'``.

        Missing values are dropped before fitting so the underlying encoder
        never has to sort a mix of strings and NaN; they are encoded as
        ``'UNKNOWN'`` by :meth:`transform`.

        Returns:
            self, to allow chaining.
        """
        series = pd.Series(series)
        unique_values = series.dropna().unique().tolist()
        if self.UNKNOWN_TOKEN not in unique_values:
            unique_values.append(self.UNKNOWN_TOKEN)
        self.label_encoder.fit(unique_values)
        self.classes_ = self.label_encoder.classes_
        self._known = frozenset(self.classes_)
        return self

    def transform(self, series):
        """Encode ``series``, mapping anything not seen in fit() to 'UNKNOWN'.

        Returns:
            The integer codes produced by the wrapped ``LabelEncoder``.
        """
        series = pd.Series(series)
        known = self._known
        unknown = self.UNKNOWN_TOKEN
        # Set lookup instead of scanning the classes array once per row.
        series = series.map(lambda x: x if x in known else unknown)
        return self.label_encoder.transform(series)

    def fit_transform(self, series):
        """Fit on ``series`` and return its encoding in one call."""
        return self.fit(series).transform(series)

def prepare_data(df, is_training=True):
    """Encode categorical columns and add engineered numeric features.

    The input frame is copied; the copy is returned with object-dtype
    columns label-encoded and log / interaction / squared features added
    for whichever of ``Balance``, ``EstimatedSalary``, ``CreditScore`` and
    ``Age`` are present.

    Encoders live in the module-level ``label_encoders`` dict, which must
    exist before this function is called on a frame with object columns.

    Args:
        df: Raw input frame.
        is_training: When True, a fresh encoder is fitted for every object
            column and stored in ``label_encoders``. When False, columns are
            transformed with the stored encoder; object columns without a
            stored encoder are left unchanged.

    Returns:
        The processed copy of ``df``.
    """
    df_processed = df.copy()
    categorical_columns = df_processed.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        if is_training:
            # Always (re)fit in training mode. Previously a column that
            # already had an encoder was skipped and stayed as raw strings.
            label_encoders[column] = SafeLabelEncoder()
            df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
        elif column in label_encoders:
            df_processed[column] = label_encoders[column].transform(df_processed[column])

    # Snapshot of the source columns; engineered columns are added below.
    available = set(df_processed.columns)

    # Log transformations (negative values are clipped to 0 first).
    for base in ('Balance', 'EstimatedSalary'):
        if base in available:
            df_processed[f'{base}_log'] = np.log1p(df_processed[base].clip(lower=0))

    # Interaction terms with Age.
    if 'Age' in available:
        for base in ('Balance', 'CreditScore'):
            if base in available:
                df_processed[f'{base}_Age'] = df_processed[base] * df_processed['Age']

    # Polynomial terms, guarded like the features above so a missing source
    # column is skipped instead of raising KeyError.
    for base in ('CreditScore', 'Balance', 'Age'):
        if base in available:
            df_processed[f'{base}_sq'] = df_processed[base] ** 2

    return df_processed

# End-to-end run for the first experiment: feature preparation, scaling, PCA,
# random-forest tuning, stacking, validation, and a submission file.
try:
    # Registry of fitted encoders keyed by column name. prepare_data() reads
    # and writes this module-level dict, so it must exist before the first call.
    label_encoders = {}

    # Process training data
    print("Processing training data...")
    train_df_processed = prepare_data(train_df, is_training=True)

    # Columns that are standardised and fed to PCA: raw numeric columns plus
    # the log / interaction / squared terms added by prepare_data().
    numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
                        'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
                        'CreditScore_Age', 'CreditScore_sq', 'Balance_sq', 'Age_sq']

    # Scaling numeric features
    # NOTE(review): the scaler (and the PCA below) are fitted on the whole
    # training frame before the train/validation split further down, so the
    # validation rows contribute to these statistics.
    scaler = StandardScaler()
    train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])

    # PCA on the 11 scaled numeric columns, keeping 10 components. The
    # components are appended as pca_1..pca_10 next to the original columns.
    pca = PCA(n_components=10)
    pca_features = pca.fit_transform(train_df_processed[numeric_features])
    pca_df = pd.DataFrame(pca_features, columns=[f'pca_{i+1}' for i in range(pca_features.shape[1])])
    train_df_processed = pd.concat([train_df_processed, pca_df], axis=1)

    # Prepare features and target: identifiers and the label are removed
    # from X; 20% of the rows are held out for validation.
    X = train_df_processed.drop(['id', 'Exited', 'CustomerId'], axis=1)
    y = train_df_processed['Exited']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Search space for the random forest (sampled, not exhaustively searched).
    param_grid = {
        'n_estimators': [100, 120, 150, 200, 300],
        'max_depth': [3, 4, 5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # 30 sampled configurations, each scored by 5-fold cross-validated ROC AUC.
    print("Training Random Forest with RandomizedSearch...")
    rf = RandomForestClassifier(random_state=42)
    rand_search_rf = RandomizedSearchCV(rf, param_grid, n_iter=30, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)
    rand_search_rf.fit(X_train, y_train)
    best_rf = rand_search_rf.best_estimator_
    print(f"Best Random Forest parameters: {rand_search_rf.best_params_}")

    # Stacking model with LogisticRegression as final estimator for predict_proba
    # Base learners: the tuned random forest plus gradient boosting and
    # XGBoost with fixed hyperparameters.
    print("Training Stacking Model...")
    stacking_model = StackingClassifier(
        estimators=[
            ('rf', best_rf),
            ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)),
            ('xgb', XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss'))
        ],
        final_estimator=LogisticRegression(),
        cv=5
    )
    stacking_model.fit(X_train, y_train)

    # Model evaluation: ROC AUC is computed from the positive-class probability.
    print("Evaluating model...")
    y_val_pred_proba = stacking_model.predict_proba(X_val)[:, 1]
    roc_auc = roc_auc_score(y_val, y_val_pred_proba)
    print(f"Validation ROC AUC Score: {roc_auc}")

    # NOTE(review): r2_score is applied to predicted class labels here; R² is
    # a regression metric, so confirm this number is actually wanted.
    y_val_pred = stacking_model.predict(X_val)
    r2 = r2_score(y_val, y_val_pred)
    print(f"R² Score on Validation Set: {r2}")

    # Process test data with the encoders, scaler and PCA fitted on the
    # training data (transform only, no refitting).
    print("Processing test data...")
    test_df_processed = prepare_data(test_df, is_training=False)
    test_df_processed[numeric_features] = scaler.transform(test_df_processed[numeric_features])
    test_pca_features = pca.transform(test_df_processed[numeric_features])
    test_pca_df = pd.DataFrame(test_pca_features, columns=[f'pca_{i+1}' for i in range(test_pca_features.shape[1])])
    test_df_processed = pd.concat([test_df_processed, test_pca_df], axis=1)

    # Make predictions. The same identifier columns are dropped as for X
    # ('Exited' is not dropped here, unlike for the training frame).
    print("Making predictions...")
    test_predictions = stacking_model.predict_proba(test_df_processed.drop(['id', 'CustomerId'], axis=1))[:, 1]

    # Create submission: one positive-class probability per test id.
    submission = pd.DataFrame({
        'id': test_df['id'],
        'Exited': test_predictions
    })
    submission.to_csv("Predictions.csv", index=False)
    print("Predictions saved successfully.")

# Top-level boundary: report the error and its traceback instead of letting
# the exception propagate out of the cell.
except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    print(traceback.format_exc())


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, r2_score, accuracy_score
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
from sklearn.preprocessing import PolynomialFeatures

# Dataset location and the two CSV files read from it.
INPUT_PATH = '../Datasets'
train_path, test_path = f"{INPUT_PATH}/train.csv", f"{INPUT_PATH}/test.csv"

# Read the train split first, then the test split, into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

class SafeLabelEncoder:
    """Label encoder that maps unseen categories to the most frequent class.

    Wraps scikit-learn's ``LabelEncoder``. Values not seen during fit()
    (missing values included) are replaced by the most frequent training
    value before encoding, so transform() does not raise on new categories.

    Values are looked up in a set, so they must be hashable (strings read
    from a CSV are).
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.classes_ = None
        # Replacement for unseen values; set by fit().
        self.fallback_ = None

    def fit(self, series):
        """Learn the classes of ``series`` and its most frequent value.

        Returns:
            self, to allow chaining.
        """
        series = pd.Series(series)
        self.label_encoder.fit(series)
        self.classes_ = self.label_encoder.classes_
        # Series.mode() ignores missing values and returns the modes sorted;
        # on ties the first one is used.
        modes = series.mode()
        if not modes.empty:
            self.fallback_ = modes.iloc[0]
        elif len(self.classes_):
            self.fallback_ = self.classes_[0]
        else:
            self.fallback_ = None
        return self

    def transform(self, series):
        """Encode ``series``, replacing unseen values with the fitted mode.

        Returns:
            The integer codes produced by the wrapped ``LabelEncoder``.
        """
        # Missing values are kept out of the lookup set so they are always
        # replaced by the fallback.
        known = {value for value in self.classes_ if not pd.isna(value)}
        fallback = self.fallback_
        series = pd.Series(series).map(lambda x: x if x in known else fallback)
        return self.label_encoder.transform(series)

    def fit_transform(self, series):
        """Fit on ``series`` and return its encoding in one call."""
        return self.fit(series).transform(series)
def create_advanced_features(df):
    """Add engineered churn features to ``df`` and return it.

    New columns are written onto the frame that is passed in (no copy is
    made). Medians, quantiles, percentile ranks, quantile bins and the
    standardised values used below are all computed from ``df`` itself.

    NOTE(review): because those statistics come from whichever frame is
    passed, train and test frames are presumably featurised against
    different thresholds - confirm this is intended.
    """
    # Source columns, read once; none of them is modified below.
    balance = df['Balance']
    salary = df['EstimatedSalary']
    credit = df['CreditScore']
    age = df['Age']
    products = df['NumOfProducts']
    active = df['IsActiveMember']

    # Flags and ratios; rows with a non-positive denominator get 0.
    df['HasBalance'] = (balance > 0).astype(int)
    df['BalanceToSalary'] = np.where(salary > 0, balance / salary, 0)
    df['CreditScoreToAge'] = credit / age.clip(lower=18)
    df['BalancePerProduct'] = np.where(products > 0, balance / products, 0)

    # Composite financial scores.
    df['WealthScore'] = (balance * credit) / (age + 1)
    df['ActivityScore'] = products * active
    df['RiskScore'] = (balance * (1 - active)) / (credit + 1)

    # Squares and products are taken on standardised values to keep the
    # magnitudes small.
    scaler = StandardScaler()
    standardised = scaler.fit_transform(df[['CreditScore', 'Balance', 'Age']])
    z_credit = standardised[:, 0]
    z_balance = standardised[:, 1]
    z_age = standardised[:, 2]
    df['CreditScore_sq'] = z_credit ** 2
    df['Balance_sq'] = z_balance ** 2
    df['Age_sq'] = z_age ** 2

    df['AgeBalance'] = z_age * z_balance
    # The same scaler object is refitted on salary alone for this product.
    df['AgeSalary'] = z_age * scaler.fit_transform(df[['EstimatedSalary']])[:, 0]
    df['CreditScoreBalance'] = z_credit * z_balance

    # Thresholds shared by the segment and risk flags.
    balance_median = balance.median()
    balance_q75 = balance.quantile(0.75)
    credit_median = credit.median()
    credit_q25 = credit.quantile(0.25)

    # Customer segments.
    young_low_balance = (age < 30) & (balance < balance_median)
    senior_high_value = (age > 60) & (balance > balance_q75)
    risky = (credit < credit_q25) & (balance > balance_q75)
    df['IsYoungLowBalance'] = young_low_balance.astype(int)
    df['IsSeniorHighValue'] = senior_high_value.astype(int)
    df['IsRisky'] = risky.astype(int)

    # Percentile ranks within this frame.
    df['BalanceRank'] = balance.rank(pct=True)
    df['SalaryRank'] = salary.rank(pct=True)
    df['CreditScoreRank'] = credit.rank(pct=True)

    # Further interactions.
    df['ProductBalanceInteraction'] = np.log1p(balance) * products
    df['SalaryScoreRatio'] = np.where(credit > 0, salary / credit, 0)

    # Rule-based risk indicators.
    high_risk = (balance > salary) & (credit < credit_median) & (active == 0)
    low_risk = (balance < balance_median) & (credit > credit_median) & (active == 1)
    df['HighRisk'] = high_risk.astype(int)
    df['LowRisk'] = low_risk.astype(int)

    # Quintile bin index of Balance; duplicate bin edges are dropped, so
    # fewer than five bins can result.
    df['BalanceBin'] = pd.qcut(balance, q=5, labels=False, duplicates='drop')

    return df

def prepare_data(df, label_encoders=None, is_training=True):
    """Label-encode object columns and add the advanced features.

    Args:
        df: Raw input frame; it is copied, not modified.
        label_encoders: Mapping of column name to a fitted encoder. Ignored
            (and rebuilt) when ``is_training`` is True; required when
            ``is_training`` is False and the frame has object columns.
        is_training: When True, fit a fresh ``SafeLabelEncoder`` per object
            column. When False, transform with the supplied encoders; object
            columns without an encoder are left unchanged.

    Returns:
        A ``(processed_frame, label_encoders)`` tuple.

    Raises:
        ValueError: If ``is_training`` is False, ``label_encoders`` is None
            and the frame contains object columns.
    """
    df_processed = df.copy()

    # Handle categorical variables
    categorical_columns = df_processed.select_dtypes(include=['object']).columns
    if is_training:
        label_encoders = {}
        for column in categorical_columns:
            encoder = SafeLabelEncoder()
            df_processed[column] = encoder.fit_transform(df_processed[column])
            label_encoders[column] = encoder
    elif label_encoders is None:
        # Fail with a clear message instead of the TypeError that
        # ``column in None`` would raise.
        if len(categorical_columns) > 0:
            raise ValueError(
                "label_encoders must be provided when is_training=False "
                "and the frame contains categorical columns"
            )
    else:
        for column in categorical_columns:
            if column in label_encoders:
                df_processed[column] = label_encoders[column].transform(df_processed[column])

    # Create advanced features
    df_processed = create_advanced_features(df_processed)

    return df_processed, label_encoders

# End-to-end run for the second experiment: advanced features, scaling, PCA,
# a four-model stacking ensemble, validation, and a submission file.
try:
    # Load and process data (the CSVs are read again here).
    print("Processing training data...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    # Prepare training data; the fitted encoders are reused for the test set.
    train_df_processed, label_encoders = prepare_data(train_df, is_training=True)
    
    # Define features and target: every processed column except the label
    # and 'CustomerId'.
    # NOTE(review): any other identifier column (e.g. the 'id' column read
    # for the submission below, if the training CSV has one too) stays in
    # `features` and is used as a model input - confirm that is intended.
    target = 'Exited'
    features = [col for col in train_df_processed.columns if col not in [target, 'CustomerId']]
    
    # Scale features
    # NOTE(review): the scaler and the PCA below are fitted on the whole
    # training frame before the train/validation split, so validation rows
    # contribute to these statistics.
    scaler = StandardScaler()
    train_df_processed[features] = scaler.fit_transform(train_df_processed[features])
    
    # Apply PCA; components are appended next to the scaled features rather
    # than replacing them.
    pca = PCA(n_components=0.95)  # Keep 95% of variance
    pca_features = pca.fit_transform(train_df_processed[features])
    pca_cols = [f'pca_{i+1}' for i in range(pca_features.shape[1])]
    train_df_processed = pd.concat([
        train_df_processed,
        pd.DataFrame(pca_features, columns=pca_cols)
    ], axis=1)
    
    # Split data: 20% validation, stratified on the target.
    X = train_df_processed.drop([target, 'CustomerId'], axis=1)
    y = train_df_processed[target]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Define optimized models (fixed hyperparameters; no search in this cell)
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42
    )
    
    # NOTE(review): enable_categorical=True is set although object columns
    # were label-encoded in prepare_data(); presumably no categorical-dtype
    # columns reach the model - confirm whether the flag is needed.
    xgb = XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.01,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=2,
        enable_categorical=True,
        random_state=42
    )
    
    lgb = LGBMClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.01,
        num_leaves=31,
        class_weight='balanced',
        random_state=42
    )
    
    gb = GradientBoostingClassifier(
        n_estimators=500,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        random_state=42
    )
    
    # Create and train stacking model: four base learners combined by a
    # class-weighted logistic regression, with 5-fold internal CV.
    print("Training Stacking Model...")
    stacking_model = StackingClassifier(
        estimators=[
            ('rf', rf),
            ('xgb', xgb),
            ('lgb', lgb),
            ('gb', gb)
        ],
        final_estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
        cv=5,
        n_jobs=-1
    )
    
    # Fit stacking model
    stacking_model.fit(X_train, y_train)
    
    # Evaluate model: probabilities feed ROC AUC, hard labels feed the rest.
    y_val_pred_proba = stacking_model.predict_proba(X_val)[:, 1]
    y_val_pred = stacking_model.predict(X_val)
    
    roc_auc = roc_auc_score(y_val, y_val_pred_proba)
    accuracy = accuracy_score(y_val, y_val_pred)
    # NOTE(review): r2_score is applied to predicted class labels; R² is a
    # regression metric, so confirm this number is actually wanted.
    r2 = r2_score(y_val, y_val_pred)
    
    print(f"Validation ROC AUC Score: {roc_auc:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"R² Score on Validation Set: {r2:.4f}")
    
    # Process test data with the training encoders and the fitted scaler
    # (transform only, no refitting).
    print("Processing test data...")
    test_df_processed, _ = prepare_data(test_df, label_encoders=label_encoders, is_training=False)
    test_df_processed[features] = scaler.transform(test_df_processed[features])
    
    # Apply PCA to test data, reusing the training component names.
    test_pca_features = pca.transform(test_df_processed[features])
    test_df_processed = pd.concat([
        test_df_processed,
        pd.DataFrame(test_pca_features, columns=pca_cols)
    ], axis=1)
    
    # Generate predictions (positive-class probability). Only 'CustomerId'
    # is dropped here; the test frame is not expected to carry the target.
    test_predictions = stacking_model.predict_proba(test_df_processed.drop(['CustomerId'], axis=1))[:, 1]
    
    # Create submission: one probability per test id.
    submission = pd.DataFrame({
        'id': test_df['id'],
        'Exited': test_predictions
    })
    submission.to_csv("Predictions.csv", index=False)
    print("Predictions saved successfully.")

# Top-level boundary: report the error and its traceback instead of letting
# the exception propagate out of the cell.
except Exception as e:
    print(f"An error occurred: {str(e)}")
    import traceback
    print(traceback.format_exc())

In [None]:
import optuna
from catboost import CatBoostClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, r2_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Feature selection with RFECV
def select_features(X, y, model):
    """Return the columns of ``X`` kept by cross-validated recursive elimination.

    Runs ``RFECV`` with ``model`` as the estimator, removing one feature per
    step and scoring each subset by 5-fold ROC AUC.
    """
    rfecv = RFECV(estimator=model, step=1, cv=5, scoring='roc_auc', n_jobs=-1)
    keep_mask = rfecv.fit(X, y).support_
    return X.columns[keep_mask]

# Optuna-based hyperparameter tuning
def optimize_model(trial, model_type):
    """Build an unfitted classifier whose hyperparameters are drawn from ``trial``.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float`` (an
            Optuna trial in this notebook).
        model_type: One of ``'xgb'``, ``'lgb'`` or ``'catboost'``.

    Returns:
        A new, unfitted classifier of the requested type.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names
            (previously this returned ``None`` and failed later on ``.fit``).
    """
    if model_type == 'xgb':
        return XGBClassifier(
            n_estimators=trial.suggest_int('n_estimators', 100, 1000),
            max_depth=trial.suggest_int('max_depth', 3, 10),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            subsample=trial.suggest_float('subsample', 0.6, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
            scale_pos_weight=trial.suggest_float('scale_pos_weight', 1, 5),
            random_state=42,
        )
    if model_type == 'lgb':
        return LGBMClassifier(
            n_estimators=trial.suggest_int('n_estimators', 100, 1000),
            max_depth=trial.suggest_int('max_depth', 3, 10),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            num_leaves=trial.suggest_int('num_leaves', 10, 50),
            random_state=42,
            class_weight='balanced',
        )
    if model_type == 'catboost':
        return CatBoostClassifier(
            iterations=trial.suggest_int('iterations', 100, 1000),
            depth=trial.suggest_int('depth', 3, 10),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
            random_strength=trial.suggest_float('random_strength', 0, 1),
            class_weights=[1, 2],  # Adjust for imbalance
            verbose=0,
            random_state=42,
        )
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'xgb', 'lgb' or 'catboost'"
    )

def optimize_pipeline(X_train, y_train, model_type, n_trials=50, cv=3):
    """Tune one base model with Optuna and return its best hyperparameters.

    Each trial is scored by cross-validated ROC AUC on the training data.
    Scoring on the same rows the model was fitted on (the previous
    behaviour) rewards overfitting, so held-out folds are used instead.

    Args:
        X_train: Training features.
        y_train: Training labels.
        model_type: Model name understood by ``optimize_model``.
        n_trials: Number of Optuna trials to run.
        cv: Cross-validation setting passed to ``cross_val_score``. Each
            trial fits the model once per fold, so larger values are slower.

    Returns:
        ``study.best_params``: the parameter dict of the best trial.
    """
    # Local import: this cell's import block does not bring in cross_val_score.
    from sklearn.model_selection import cross_val_score

    def objective(trial):
        model = optimize_model(trial, model_type)
        fold_scores = cross_val_score(
            model, X_train, y_train, cv=cv, scoring='roc_auc'
        )
        return fold_scores.mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

# Training pipeline
def train_stacking_model(X_train, y_train, X_val, y_val):
    """Tune three boosted models, stack them, and report validation metrics.

    XGBoost, LightGBM and CatBoost are each tuned with ``optimize_pipeline``,
    rebuilt with their best parameters, and combined in a
    ``StackingClassifier`` whose final estimator is a class-weighted
    logistic regression.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.

    Returns:
        The fitted stacking model. ROC AUC, accuracy and R² on the
        validation set are printed.
    """
    # Optimize each base model
    print("Optimizing XGBoost...")
    xgb_params = optimize_pipeline(X_train, y_train, 'xgb')
    print("Optimizing LightGBM...")
    lgb_params = optimize_pipeline(X_train, y_train, 'lgb')
    print("Optimizing CatBoost...")
    cat_params = optimize_pipeline(X_train, y_train, 'catboost')

    # Rebuild the models with the best parameters. The class weighting that
    # optimize_model() fixes during tuning is not part of best_params, so it
    # is repeated here; otherwise the final models would differ from the
    # configurations that were actually tuned.
    xgb = XGBClassifier(**xgb_params, random_state=42)
    lgb = LGBMClassifier(**lgb_params, class_weight='balanced', random_state=42)
    cat = CatBoostClassifier(
        **cat_params, class_weights=[1, 2], random_state=42, verbose=0
    )

    # Stacking model
    stacking_model = StackingClassifier(
        estimators=[
            ('xgb', xgb),
            ('lgb', lgb),
            ('cat', cat),
        ],
        final_estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
        cv=5,
        n_jobs=-1
    )

    # Train stacking model
    print("Training Stacking Model...")
    stacking_model.fit(X_train, y_train)

    # Validation performance: probabilities feed ROC AUC, hard labels the rest.
    y_val_pred_proba = stacking_model.predict_proba(X_val)[:, 1]
    y_val_pred = stacking_model.predict(X_val)

    roc_auc = roc_auc_score(y_val, y_val_pred_proba)
    accuracy = accuracy_score(y_val, y_val_pred)
    # NOTE(review): R² is a regression metric applied to class labels here;
    # kept so the printed output is unchanged.
    r2 = r2_score(y_val, y_val_pred)

    print(f"Validation ROC AUC Score: {roc_auc:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"R² Score on Validation Set: {r2:.4f}")

    return stacking_model

# Hold out 20% of the data for validation, stratified on the target.
# NOTE(review): X and y are not defined in this cell; presumably they are the
# feature matrix and target left over from an earlier cell - replace this
# with your own data loading logic when running the cell on its own.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Select important features with a default random forest as the ranking
# model, then restrict both splits to the selected columns.
# NOTE(review): RandomForestClassifier is not imported in this cell's import
# block; the call relies on an import made in an earlier cell.
selected_features = select_features(X_train, y_train, RandomForestClassifier(random_state=42))
X_train = X_train[selected_features]
X_val = X_val[selected_features]

# Train and evaluate stacking model
stacking_model = train_stacking_model(X_train, y_train, X_val, y_val)
