In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, r2_score
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

class ChurnPredictor:
    def __init__(self, random_state=42):
        self.random_state = random_state
        self.model = None
        self.preprocessor = None
        
    def create_features(self, df):
        df = df.copy()
        
        # Basic features
        df['Balance_per_Product'] = df['Balance'] / (df['NumOfProducts'] + 1)
        df['Balance_per_Age'] = df['Balance'] / (df['Age'] + 1)
        df['Balance_per_Tenure'] = df['Balance'] / (df['Tenure'] + 1)
        df['Products_per_Tenure'] = df['NumOfProducts'] / (df['Tenure'] + 1)
        
        # Interaction features
        df['Credit_Age_Ratio'] = df['CreditScore'] / (df['Age'] + 1)
        
        # Advanced features
        df['IsHighValue'] = ((df['Balance'] > df['Balance'].mean()) & 
                           (df['EstimatedSalary'] > df['EstimatedSalary'].mean())).astype(int)
        df['IsLongTerm'] = (df['Tenure'] > df['Tenure'].median()).astype(int)
        
        # Risk score feature
        df['RiskScore'] = (
            (df['CreditScore'] < 600).astype(int) * 2 +
            (df['Balance'] > 100000).astype(int) * 1.5 +
            (df['Age'] < 30).astype(int) * 1.2 +
            (df['IsActiveMember'] == 0).astype(int) * 1.8 +
            (df['NumOfProducts'] > 2).astype(int) * 1.3
        )
        
        # Additional behavioral features
        df['HasZeroBalance'] = (df['Balance'] == 0).astype(int)
        df['IsNewCustomer'] = (df['Tenure'] <= 1).astype(int)
        
        return df
    
    def prepare_pipelines(self):
        numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 
                          'EstimatedSalary', 'Balance_per_Product', 'Balance_per_Age',
                          'Balance_per_Tenure', 'Products_per_Tenure', 'Credit_Age_Ratio',
                          'RiskScore', 'HasCrCard', 'IsActiveMember', 'IsHighValue', 'IsLongTerm', 
                          'HasZeroBalance', 'IsNewCustomer']
        
        categorical_features = ['Geography', 'Gender']
        
        self.preprocessor = ColumnTransformer([
            ('numeric', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), numeric_features),
            ('categorical', Pipeline([
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
            ]), categorical_features)
        ])
        
        return self.preprocessor
    
    def create_model(self):
        model = LGBMClassifier(
            objective='binary',
            boosting_type='goss',
            random_state=self.random_state,
            n_jobs=-1  # Use all available cores
        )
        
        param_distributions = {
            'n_estimators': [100, 200, 300],
            'max_depth': [ 5, 10, 15],
            'learning_rate': [0.01, 0.05, 0.1],
            'num_leaves': [31, 63, 127],
            'min_child_samples': [5, 10, 20],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'reg_alpha': [0, 0.1, 0.5],
            'reg_lambda': [0, 0.1, 0.5]
        }
        
        self.model = RandomizedSearchCV(
            model,
            param_distributions=param_distributions,
            n_iter=30,  # Reduced iterations for faster tuning
            cv=3,  # Reduced cross-validation folds
            scoring='roc_auc',
            n_jobs=-1,
            random_state=self.random_state
        )
        
        return self.model
    
    def fit(self, X, y):
        # Create features
        X_enhanced = self.create_features(X)
        
        # Prepare and fit preprocessing pipeline
        preprocessor = self.prepare_pipelines()
        X_processed = preprocessor.fit_transform(X_enhanced)
        
        # Handle class imbalance
        smote = SMOTE(random_state=self.random_state)
        X_resampled, y_resampled = smote.fit_resample(X_processed, y)
        
        # Create and fit model
        model = self.create_model()
        model.fit(X_resampled, y_resampled)
        
        return self
    
    def predict(self, X):
        X_enhanced = self.create_features(X)
        X_processed = self.preprocessor.transform(X_enhanced)
        return self.model.predict(X_processed)
    
    def predict_proba(self, X):
        X_enhanced = self.create_features(X)
        X_processed = self.preprocessor.transform(X_enhanced)
        return self.model.predict_proba(X_processed)

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
train_df = pd.read_csv('../Datasets/train.csv')
test_df = pd.read_csv('../Datasets/test.csv')

# Identifier-like columns are excluded from the features; 'Exited' is the target.
drop_cols = ['id', 'CustomerId', 'Surname']
y = train_df['Exited']
X = train_df.drop(columns=drop_cols + ['Exited'])

# ---------------------------------------------------------------------------
# Stratified 80/20 train/validation split
# ---------------------------------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ---------------------------------------------------------------------------
# Training (fit() returns the predictor itself)
# ---------------------------------------------------------------------------
churn_predictor = ChurnPredictor(random_state=42).fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Validation predictions: hard labels and positive-class probabilities
# ---------------------------------------------------------------------------
y_val_pred = churn_predictor.predict(X_val)
y_val_pred_proba = churn_predictor.predict_proba(X_val)[:, 1]

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
val_roc_auc = roc_auc_score(y_val, y_val_pred_proba)
val_accuracy = accuracy_score(y_val, y_val_pred)
# r2_score is a regression metric; here it is evaluated on the hard labels.
val_r2 = r2_score(y_val, y_val_pred)

print("Model Performance:")
print(f"ROC AUC: {val_roc_auc:.4f}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"R² Score: {val_r2:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

# ---------------------------------------------------------------------------
# Submission file: positive-class probability per test id
# ---------------------------------------------------------------------------
test_features = test_df.drop(columns=drop_cols)
test_predictions = churn_predictor.predict_proba(test_features)[:, 1]
submission_df = pd.DataFrame({'id': test_df['id'], 'Exited': test_predictions})
submission_df.to_csv('optimized_predictions.csv', index=False)

[LightGBM] [Info] Number of positive: 9554, number of negative: 9554
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3965
[LightGBM] [Info] Number of data points in the train set: 19108, number of used features: 21
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Model Performance:
ROC AUC: 0.9167
Accuracy: 0.8903
R² Score: 0.3238

Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.93      2389
         1.0       0.74      0.70      0.72       611

    accuracy                           0.89      3000
   macro avg       0.83      0.82      0.83      3000
weighted avg       0.89      0.89      0.89      3000



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, r2_score
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

class ChurnPredictor:
    def __init__(self, random_state=42):
        self.random_state = random_state
        self.model = None
        self.preprocessor = None
        
    def create_features(self, df):
        df = df.copy()
        
        # Basic features
        df['Balance_per_Product'] = df['Balance'] / (df['NumOfProducts'] + 1)
        df['Balance_per_Age'] = df['Balance'] / (df['Age'] + 1)
        df['Balance_per_Tenure'] = df['Balance'] / (df['Tenure'] + 1)
        
        # Advanced features
        df['IsHighValue'] = ((df['Balance'] > df['Balance'].mean()) & 
                           (df['EstimatedSalary'] > df['EstimatedSalary'].mean())).astype(int)
        df['IsLongTerm'] = (df['Tenure'] > df['Tenure'].median()).astype(int)
        
        return df
    
    def prepare_pipelines(self):
        numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 
                          'EstimatedSalary', 'Balance_per_Product', 'Balance_per_Age',
                          'Balance_per_Tenure']
        
        categorical_features = ['Geography', 'Gender']
        
        self.numeric_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=0.95))
        ])
        
        self.categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ])
        
        self.preprocessor = ColumnTransformer([
            ('numeric', self.numeric_pipeline, numeric_features),
            ('categorical', self.categorical_pipeline, categorical_features)
        ])
        
        return self.preprocessor
    
    def create_model(self):
        model = LGBMClassifier(
            objective='binary',
            boosting_type='goss',
            random_state=self.random_state,
            class_weight='balanced',  # Handle class imbalance
            n_jobs=-1  # Use all available cores
        )
        
        param_distributions = {
            'n_estimators': [100, 200, 300],
            'max_depth': [5, 10, 15],
            'learning_rate': [0.01, 0.05, 0.1],
            'num_leaves': [31, 50, 70],
        }
        
        self.model = RandomizedSearchCV(
            model,
            param_distributions=param_distributions,
            n_iter=20,  # Reduced iterations for faster execution
            cv=3,  # Reduced cross-validation folds
            scoring='roc_auc',
            n_jobs=-1,
            random_state=self.random_state
        )
        
        return self.model
    
    def fit(self, X, y):
        # Create features
        X_enhanced = self.create_features(X)
        
        # Prepare and fit preprocessing pipeline
        preprocessor = self.prepare_pipelines()
        X_processed = preprocessor.fit_transform(X_enhanced)
        
        # Handle class imbalance
        smote = SMOTE(random_state=self.random_state)
        X_resampled, y_resampled = smote.fit_resample (X_processed, y)
        
        # Create and fit model
        model = self.create_model()
        model.fit(X_resampled, y_resampled)
        
        return self
    
    def predict(self, X):
        X_enhanced = self.create_features(X)
        X_processed = self.preprocessor.transform(X_enhanced)
        return self.model.predict(X_processed)
    
    def predict_proba(self, X):
        X_enhanced = self.create_features(X)
        X_processed = self.preprocessor.transform(X_enhanced)
        return self.model.predict_proba(X_processed)

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
train_df = pd.read_csv('../Datasets/train.csv')
test_df = pd.read_csv('../Datasets/test.csv')

# Identifier-like columns are excluded from the features; 'Exited' is the target.
drop_cols = ['id', 'CustomerId', 'Surname']
y = train_df['Exited']
X = train_df.drop(columns=drop_cols + ['Exited'])

# ---------------------------------------------------------------------------
# Stratified 80/20 train/validation split
# ---------------------------------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ---------------------------------------------------------------------------
# Training (fit() returns the predictor itself)
# ---------------------------------------------------------------------------
churn_predictor = ChurnPredictor(random_state=42).fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Validation predictions: hard labels and positive-class probabilities
# ---------------------------------------------------------------------------
y_val_pred = churn_predictor.predict(X_val)
y_val_pred_proba = churn_predictor.predict_proba(X_val)[:, 1]

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
val_roc_auc = roc_auc_score(y_val, y_val_pred_proba)
val_accuracy = accuracy_score(y_val, y_val_pred)
# r2_score is a regression metric; here it is evaluated on the hard labels.
val_r2 = r2_score(y_val, y_val_pred)

print("Model Performance:")
print(f"ROC AUC: {val_roc_auc:.4f}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"R² Score: {val_r2:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

# ---------------------------------------------------------------------------
# Submission file: positive-class probability per test id
# ---------------------------------------------------------------------------
test_features = test_df.drop(columns=drop_cols)
test_predictions = churn_predictor.predict_proba(test_features)[:, 1]
submission_df = pd.DataFrame({'id': test_df['id'], 'Exited': test_predictions})
submission_df.to_csv('optimized_predictions.csv', index=False)

In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, StratifiedKFold
# from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report, r2_score
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# from sklearn.feature_selection import RFECV, VarianceThreshold
# from sklearn.cluster import KMeans
# from imblearn.over_sampling import SMOTE
# from optuna import create_study, Trial
# from optuna.samplers import TPESampler

# # =============================================================================================================
# # =============================================================================================================

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# # =============================================================================================================
# # =============================================================================================================

# class SafeLabelEncoder:
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)

# # =============================================================================================================
# # =============================================================================================================

# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Additional Interaction Terms
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['Balance_EstimatedSalary'] = df_processed['Balance'] * df_processed['EstimatedSalary']
#         df_processed['CreditScore_EstimatedSalary'] = df_processed['CreditScore'] * df_processed['EstimatedSalary']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
#     df_processed['EstimatedSalary_sq'] = df_processed['EstimatedSalary'] ** 2
    
#     # Ratios
#     if 'Balance' in df_processed.columns and 'EstimatedSalary' in df_processed.columns:
#         df_processed['Balance_to_EstimatedSalary'] = df_processed['Balance'] / (df_processed['EstimatedSalary'] + 1e-5)  # Avoid division by zero
    
#     # Clustering features
#     kmeans = KMeans(n_clusters=5, random_state= 2)
#     df_processed['cluster'] = kmeans.fit_predict(df_processed[['CreditScore', 'Age', 'Balance', 'EstimatedSalary']])
    
#     return df_processed

# # Initialize label encoders dictionary
# label_encoders = {}

# # Process training data
# print("Processing training data...")
# train_df_processed = prepare_data(train_df, is_training=True)
# test_df_processed = prepare_data(test_df, is_training=False)

# # Additional Feature Engineering
# def additional_features(df):
#     df['Age_Balance_Ratio'] = df['Age'] / (df['Balance'] + 1e-5)
#     df['Age_EstimatedSalary_Ratio'] = df['Age'] / (df['EstimatedSalary'] + 1e-5)
#     df['Balance_EstimatedSalary_Difference'] = df['Balance'] - df['EstimatedSalary']
#     df['CreditScore_Balance_Ratio'] = df['CreditScore'] / (df['Balance'] + 1e-5)
#     return df

# # Process training data with additional features
# train_df_processed = additional_features(train_df_processed)
# test_df_processed = additional_features(test_df_processed)

# # Define numeric features
# numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                     'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                     'CreditScore_Age', 'Balance_EstimatedSalary', 
#                     'CreditScore_EstimatedSalary', 'CreditScore_sq', 
#                     'Balance_sq', 'Age_sq', 'EstimatedSalary_sq', 
#                     'Balance_to_EstimatedSalary', 'Age_Balance_Ratio', 
#                     'Age_EstimatedSalary_Ratio', 'Balance_EstimatedSalary_Difference', 
#                     'CreditScore_Balance_Ratio']

# # Scaling numeric features with RobustScaler (handles outliers better)
# scaler = RobustScaler()
# train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])
# test_df_processed[numeric_features] = scaler.transform(test_df_processed[numeric_features])

# # PCA for dimensionality reduction on the expanded feature set
# pca = PCA(n_components=19)  # Decrease the number of components to 19
# pca_features_train = pca.fit_transform(train_df_processed[numeric_features])
# pca_df_train = pd.DataFrame(pca_features_train, columns=[f'pca_{i+1}' for i in range(pca_features_train.shape[1])])
# train_df_processed = pd.concat([train_df_processed, pca_df_train], axis=1)

# pca_features_test = pca.transform(test_df_processed[numeric_features])
# pca_df_test = pd.DataFrame(pca_features_test, columns=[f'pca_{i+1}' for i in range(pca_features_test.shape[1])])
# test_df_processed = pd.concat([test_df_processed, pca_df_test], axis=1)

# # Prepare features and target
# X = train_df_processed.drop(['id', 'Exited', 'CustomerId'], axis=1)
# y = train_df_processed['Exited']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

# # Handle class imbalance using SMOTE
# smote = SMOTE(random_state=2)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# # Feature selection using RFECV with a stronger base estimator (e.g., GradientBoostingClassifier)
# rfecv = RFECV(estimator=GradientBoostingClassifier(), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
# X_train_selected = rfecv.fit_transform(X_train_res, y_train_res)
# X_val_selected = rfecv.transform(X_val)

# # Convert the numpy array back to DataFrame with correct column names
# X_train_selected_df = pd.DataFrame(X_train_selected, columns=X_train.columns[rfecv.get_support()])
# X_val_selected_df = pd.DataFrame(X_val_selected, columns=X_val.columns[rfecv.get_support()])
# X_test_selected_df = pd.DataFrame(rfecv.transform(test_df_processed.drop(['CustomerId'], axis=1)), columns=test_df_processed.columns[rfecv.get_support()])

# # Hyperparameter optimization with Optuna for multiple models
# def objective(trial: Trial):
#     model_type = trial.suggest_categorical('model_type', ['lgbm', 'xgb', 'rf'])
    
#     if model_type == 'lgbm':
#         param = {
#             'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#             'max_depth': trial.suggest_int('max_depth', 3, 15),
#             'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
#         }
#         model = LGBMClassifier(**param, random_state=2)
    
#     elif model_type == 'xgb':
#         param = {
#             'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#             'max_depth': trial.suggest_int('max_depth', 3, 15),
#             'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
#         }
#         model = XGBClassifier(**param, random_state=2, use_label_encoder=False, eval_metric='logloss')
    
#     else:  # RandomForestClassifier
#         param = {
#             'n_estimators': trial.suggest_int('n_estimators', 100, 700),
#             'max_depth': trial.suggest_int('max_depth', 3, 15)
#         }
#         model = RandomForestClassifier(**param, random_state=2)
    
#     model.fit(X_train_selected_df, y_train_res)
#     y_pred = model.predict(X_val_selected_df)
#     return accuracy_score(y_val, y_pred)

# # Create an Optuna study and optimize
# study = create_study(direction='maximize', sampler=TPESampler())
# study.optimize(objective, n_trials=20)

# # Train the best model based on the best hyperparameters
# best_model_type = study.best_params['model_type']
# if best_model_type == 'lgbm':
#     best_model = LGBMClassifier(n_estimators=study.best_params['n_estimators'],
#                                 max_depth=study.best_params['max_depth'],
#                                 learning_rate=study.best_params['learning_rate'], 
#                                 random_state=2)
# elif best_model_type == 'xgb':
#     best_model = XGBClassifier(n_estimators=study.best_params['n_estimators'],
#                                max_depth=study.best_params['max_depth'],
#                                learning_rate=study.best_params['learning_rate'],
#                                random_state=2, use_label_encoder=False, eval_metric='logloss')
# else:
#     best_model = RandomForestClassifier(n_estimators=study.best_params['n_estimators'],
#                                         max_depth=study.best_params['max_depth'],
#                                         random_state=2)

# best_model.fit(X_train_selected_df, y_train_res)

# # Evaluate model
# y_pred = best_model.predict(X_val_selected_df)
# y_pred_prob = best_model.predict_proba(X_val_selected_df)[:, 1]

# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("ROC AUC Score:", roc_auc_score(y_val, y_pred_prob))
# print("R² Score:", r2_score(y_val, y_pred_prob))  # Add R² score for evaluation
# print("Classification Report:\n", classification_report(y_val, y_pred))

# # Plot feature importance of the best model
# plt.barh(X_train_selected_df.columns, best_model.feature_importances_)
# plt.title(f'Feature Importance for {best_model_type.upper()} Model')
# plt.show()


Processing training data...


In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, r2_score
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from category_encoders import TargetEncoder
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint, uniform

# class ChurnPredictor:
#     def __init__(self, random_state=42):
#         self.random_state = random_state
#         self.target_encoders = {}
#         self.numeric_pipeline = None
#         self.categorical_pipeline = None
#         self.feature_names = None
#         self.model = None
        
#     def create_features(self, df):
#         df = df.copy()
        
#         # Basic features
#         df['Balance_per_Product'] = df['Balance'] / (df['NumOfProducts'] + 1)
#         df['Balance_per_Age'] = df['Balance'] / (df['Age'] + 1)
#         df['Balance_per_Tenure'] = df['Balance'] / (df['Tenure'] + 1)
#         df['Products_per_Tenure'] = df['NumOfProducts'] / (df['Tenure'] + 1)
        
#         # Interaction features
#         df['Credit_Age_Ratio'] = df['CreditScore'] / (df['Age'] + 1)
#         df['Tenure_Age_Ratio'] = df['Tenure'] / (df['Age'] + 1)
#         df['Balance_Salary_Ratio'] = df['Balance'] / (df['EstimatedSalary'] + 1)
        
#         # Advanced features
#         df['IsHighValue'] = (df['Balance'] > df['Balance'].mean()) & (df['EstimatedSalary'] > df['EstimatedSalary'].mean())
#         df['IsLongTerm'] = df['Tenure'] > df['Tenure'].median()
#         df['IsYoungActive'] = (df['Age'] < 40) & (df['IsActiveMember'] == 1)
        
#         # Polynomial features
#         df['CreditScore_Squared'] = df['CreditScore'] ** 2
#         df['Age_Squared'] = df['Age'] ** 2
#         df['Balance_Squared'] = df['Balance'] ** 2
#         df['Tenure_Squared'] = df['Tenure'] ** 2
        
#         # Risk score feature
#         df['RiskScore'] = (
#             (df['CreditScore'] < 600) * 2 +
#             (df['Balance'] > 100000) * 1.5 +
#             (df['Age'] < 30) * 1.2 +
#             (df['IsActiveMember'] == 0) * 1.8 +
#             (df['NumOfProducts'] > 2) * 1.3
#         )
        
#         return df
    
#     def prepare_pipelines(self):
#         numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 
#                           'EstimatedSalary', 'Balance_per_Product', 'Balance_per_Age',
#                           'Balance_per_Tenure', 'Products_per_Tenure', 'Credit_Age_Ratio',
#                           'Tenure_Age_Ratio', 'Balance_Salary_Ratio', 'CreditScore_Squared',
#                           'Age_Squared', 'Balance_Squared', 'Tenure_Squared', 'RiskScore']
        
#         categorical_features = ['Geography', 'Gender']
        
#         self.numeric_pipeline = Pipeline([
#             ('imputer', SimpleImputer(strategy='median')),
#             ('scaler', StandardScaler()),
#             ('pca', PCA(n_components=0.95))
#         ])
        
#         self.categorical_pipeline = Pipeline([
#             ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#             ('target_encoder', TargetEncoder(random_state=self.random_state))
#         ])
        
#         self.feature_names = numeric_features + categorical_features
        
#         return ColumnTransformer([
#             ('numeric', self.numeric_pipeline, numeric_features),
#             ('categorical', self.categorical_pipeline, categorical_features)
#         ])
    
#     def create_model(self):
#         base_models = [
#             ('rf', RandomForestClassifier(random_state=self.random_state)),
#             ('xgb', XGBClassifier(random_state=self.random_state)),
#             ('lgbm', LGBMClassifier(random_state=self.random_state)),
#             ('gb', GradientBoostingClassifier(random_state=self.random_state))
#         ]
        
#         final_model = LGBMClassifier(random_state=self.random_state)
        
#         # Hyperparameter space
#         param_distributions = {
#             'n_estimators': randint(100, 1000),
#             'max_depth': randint(3, 15),
#             'learning_rate': uniform(0.01, 0.29),
#             'num_leaves': randint(20, 100),
#             'min_child_samples': randint(10, 50),
#             'subsample': uniform(0.6, 0.4),
#             'colsample_bytree': uniform(0.6, 0.4)
#         }
        
#         self.model = RandomizedSearchCV(
#             final_model,
#             param_distributions=param_distributions,
#             n_iter=50,
#             cv=5,
#             scoring='roc_auc',
#             n_jobs=-1,
#             random_state=self.random_state
#         )
        
#         return self.model
    
#     def fit(self, X, y):
#         # Create features
#         X_enhanced = self.create_features(X)
        
#         # Prepare and fit preprocessing pipeline
#         preprocessor = self.prepare_pipelines()
#         X_processed = preprocessor.fit_transform(X_enhanced[self.feature_names], y)
        
#         # Create and fit model
#         model = self.create_model()
#         model.fit(X_processed, y)
        
#         return self
    
#     def predict(self, X):
#         X_enhanced = self.create_features(X)
#         X_processed = self.preprocessor.transform(X_enhanced[self.feature_names])
#         return self.model.predict(X_processed)
    
#     def predict_proba(self, X):
#         X_enhanced = self.create_features(X)
#         X_processed = self.preprocessor.transform(X_enhanced[self.feature_names])
#         return self.model.predict_proba(X_processed)

# # Load and prepare data
# train_df = pd.read_csv('../Datasets/train.csv')
# test_df = pd.read_csv('../Datasets/test.csv')

# # Drop unnecessary columns and handle missing values
# drop_cols = ['id', 'CustomerId', 'Surname']
# X = train_df.drop(drop_cols + ['Exited'], axis=1)
# y = train_df['Exited']

# # Split data
# X_train, X_val, y_train, y_val = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# # Create and train model
# churn_predictor = ChurnPredictor(random_state=42)
# churn_predictor.fit(X_train, y_train)

# # Make predictions
# y_val_pred = churn_predictor.predict(X_val)
# y_val_pred_proba = churn_predictor.predict_proba(X_val)[:, 1]

# # Evaluate model
# print("Model Performance:")
# print(f"ROC AUC: {roc_auc_score(y_val, y_val_pred_proba):.4f}")
# print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
# print(f"R² Score: {r2_score(y_val, y_val_pred):.4f}")
# print("\nClassification Report:")
# print(classification_report(y_val, y_val_pred))

# # Generate predictions for test set
# test_predictions = churn_predictor.predict_proba(test_df.drop(drop_cols, axis=1))[:, 1]
# submission_df = pd.DataFrame({
#     'id': test_df['id'],
#     'Exited': test_predictions
# })
# submission_df.to_csv('improved_predictions.csv', index=False)

In [9]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, StratifiedKFold
# from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report, r2_score
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# from sklearn.feature_selection import RFECV
# from sklearn.cluster import KMeans
# from imblearn.over_sampling import SMOTE
# from optuna import create_study, Trial
# from optuna.samplers import TPESampler

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# class SafeLabelEncoder:
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)

# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Additional Interaction Terms
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['Balance_EstimatedSalary'] = df_processed['Balance'] * df_processed['EstimatedSalary']
#         df_processed['CreditScore_EstimatedSalary'] = df_processed['CreditScore'] * df_processed['EstimatedSalary']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
#     df_processed['EstimatedSalary_sq'] = df_processed['EstimatedSalary'] ** 2
    
#     # Ratios
#     if 'Balance' in df_processed.columns and 'EstimatedSalary' in df_processed.columns:
#         df_processed['Balance_to_EstimatedSalary'] = df_processed['Balance'] / (df_processed['EstimatedSalary'] + 1e-5)  # Avoid division by zero
    
#     # Clustering features
#     kmeans = KMeans(n_clusters=5, random_state=2)
#     df_processed['cluster'] = kmeans.fit_predict(df_processed[['CreditScore', 'Age', 'Balance', 'EstimatedSalary']])
    
#     return df_processed

# # Initialize label encoders dictionary
# label_encoders = {}

# # Process training and test data
# print("Processing training data...")
# train_df_processed = prepare_data(train_df, is_training=True)
# test_df_processed = prepare_data(test_df, is_training=False)

# # Define numeric features
# numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                     'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                     'CreditScore_Age', 'Balance_EstimatedSalary', 
#                     'CreditScore_EstimatedSalary', 'CreditScore_sq', 
#                     'Balance_sq', 'Age_sq', 'EstimatedSalary_sq', 
#                     'Balance_to_EstimatedSalary']

# # Scaling numeric features with RobustScaler (handles outliers better)
# scaler = RobustScaler()
# train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])
# test_df_processed[numeric_features] = scaler.transform(test_df_processed[numeric_features])

# # PCA for dimensionality reduction
# # (fitted on the scaled training features; the same fit later transforms the test set)
# pca = PCA(n_components=15)  # n_components may not exceed the 15 numeric features
# pca_features_train = pca.fit_transform(train_df_processed[numeric_features])
# pca_df_train = pd.DataFrame(pca_features_train, columns=[f'pca_{i+1}' for i in range(pca_features_train.shape[1])])
# train_df_processed = pd.concat([train_df_processed, pca_df_train], axis=1)


# pca_features_test = pca.transform(test_df_processed[numeric_features])
# pca_df_test = pd.DataFrame(pca_features_test, columns=[f'pca_{i+1}' for i in range(pca_features_test.shape[1])])
# test_df_processed = pd.concat([test_df_processed, pca_df_test], axis=1)

# # Prepare features and target
# X = train_df_processed.drop(['Exited', 'CustomerId'], axis=1)
# y = train_df_processed['Exited']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

# # Handle class imbalance using SMOTE
# smote = SMOTE(random_state=2)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# # Feature selection using RFECV with a stronger base estimator (e.g., GradientBoostingClassifier)
# rfecv = RFECV(estimator=GradientBoostingClassifier(), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
# X_train_selected = rfecv.fit_transform(X_train_res, y_train_res)
# X_val_selected = rfecv.transform(X_val)

# # Transform the test set using the same RFECV mask
# X_test_selected = rfecv.transform(test_df_processed.drop(['CustomerId'], axis=1))

# # Hyperparameter optimization with Optuna (Bayesian Optimization) for LightGBM
# def objective(trial: Trial):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
#     }
#     model = LGBMClassifier(**param, random_state=2)
#     model.fit(X_train_selected, y_train_res)
#     preds = model.predict_proba(X_val_selected)[:, 1]
#     return roc_auc_score(y_val, preds)

# study = create_study(direction='maximize', sampler=TPESampler())
# study.optimize(objective, n_trials=50)

# best_lgbm = LGBMClassifier(**study.best_params, random_state=2)
# best_lgbm.fit(X_train_selected, y_train_res)

# # Stacking model with GradientBoostingClassifier as final estimator for better predictive power
# print("Training Stacking Model...")
# stacking_model = StackingClassifier(
#     estimators=[
#         ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2, class_weight='balanced')),
#         ('gb', GradientBoostingClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2)),
#         ('xgb', XGBClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2, use_label_encoder=False, eval_metric='logloss')),
#         ('catboost', CatBoostClassifier(iterations=200, depth=6, learning_rate=0.05, silent=True)),
#         ('lgbm', best_lgbm),
#         ('extra', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=2))
#     ],
#     final_estimator=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3),
#     cv=5
# )
# stacking_model.fit(X_train_selected, y_train_res)

# # Model evaluation
# print("Evaluating model...")
# y_val_pred_proba = stacking_model.predict_proba(X_val_selected)[:, 1]
# roc_auc = roc_auc_score(y_val, y_val_pred_proba)
# accuracy = accuracy_score(y_val, stacking_model.predict(X_val_selected))
# r2 = r2_score(y_val, stacking_model.predict(X_val_selected))

# print(f"ROC AUC: {roc_auc:.4f}")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"R² Score: {r2:.4f}")
# print(classification_report(y_val, stacking_model.predict(X_val_selected)))

# # Make predictions on the test set
# print("Making predictions on test set...")
# test_preds = stacking_model.predict_proba(X_test_selected)[:, 1]
# submission_df = pd.DataFrame({'id': test_df['id'], 'Exited': test_preds})
# submission_df.to_csv('customer_exit_predictions.csv', index=False)

# print("Submission file created: customer_exit_predictions.csv")


In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, StratifiedKFold
# from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report, r2_score
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# from sklearn.feature_selection import RFECV, VarianceThreshold
# from sklearn.cluster import KMeans
# from imblearn.over_sampling import SMOTE
# from optuna import create_study, Trial
# from optuna.samplers import TPESampler

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# class SafeLabelEncoder:
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)

# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Additional Interaction Terms
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['Balance_EstimatedSalary'] = df_processed['Balance'] * df_processed['EstimatedSalary']
#         df_processed['CreditScore_EstimatedSalary'] = df_processed['CreditScore'] * df_processed['EstimatedSalary']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
#     df_processed['EstimatedSalary_sq'] = df_processed['EstimatedSalary'] ** 2
    
#     # Ratios
#     if 'Balance' in df_processed.columns and 'EstimatedSalary' in df_processed.columns:
#         df_processed['Balance_to_EstimatedSalary'] = df_processed['Balance'] / (df_processed['EstimatedSalary'] + 1e-5)  # Avoid division by zero
    
#     # Clustering features
#     kmeans = KMeans(n_clusters=5, random_state=2)
#     df_processed['cluster'] = kmeans.fit_predict(df_processed[['CreditScore', 'Age', 'Balance', 'EstimatedSalary']])
    
#     return df_processed

# # Initialize label encoders dictionary
# label_encoders = {}

# # Process training and test data
# print("Processing training data...")
# train_df_processed = prepare_data(train_df, is_training=True)
# test_df_processed = prepare_data(test_df, is_training=False)

# # Define numeric features
# numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                     'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                     'CreditScore_Age', 'Balance_EstimatedSalary', 
#                     'CreditScore_EstimatedSalary', 'CreditScore_sq', 
#                     'Balance_sq', 'Age_sq', 'EstimatedSalary_sq', 
#                     'Balance_to_EstimatedSalary']

# # Scaling numeric features with RobustScaler (handles outliers better)
# scaler = RobustScaler()
# train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])
# test_df_processed[numeric_features] = scaler.transform(test_df_processed[numeric_features])

# # PCA for dimensionality reduction
# pca = PCA(n_components=15)  # Set n_components to 15 or less
# pca_features_train = pca.fit_transform(train_df_processed[numeric_features])
# pca_df_train = pd.DataFrame(pca_features_train, columns=[f'pca_{i+1}' for i in range(pca_features_train.shape[1])])
# train_df_processed = pd.concat([train_df_processed, pca_df_train], axis=1)

# pca_features_test = pca.transform(test_df_processed[numeric_features])
# pca_df_test = pd.DataFrame(pca_features_test, columns=[f'pca_{i+1}' for i in range(pca_features_test.shape[1])])
# test_df_processed = pd.concat([test_df_processed, pca_df_test], axis=1)

# # Prepare features and target
# X = train_df_processed.drop(['Exited', 'CustomerId'], axis=1)
# y = train_df_processed['Exited']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

# # Handle class imbalance using SMOTE
# smote = SMOTE(random_state=2)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# # Feature selection using RFECV with a stronger base estimator (e.g., GradientBoostingClassifier)
# rfecv = RFECV(estimator=GradientBoostingClassifier(), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
# X_train_selected = rfecv.fit_transform(X_train_res, y_train_res)
# X_val_selected = rfecv.transform(X_val)

# # Convert the numpy array back to DataFrame with correct column names
# X_train_selected_df = pd.DataFrame(X_train_selected, columns=X_train.columns[rfecv.get_support()])
# X_val_selected_df = pd.DataFrame(X_val_selected, columns=X_val.columns[rfecv.get_support()])
# X_test_selected_df = pd.DataFrame(rfecv.transform(test_df_processed.drop(['CustomerId'], axis=1)), columns=test_df_processed.columns[rfecv.get_support()])

# # Hyperparameter optimization with Optuna (Bayesian Optimization) for LightGBM
# def objective(trial: Trial):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
#     }
#     model = LGBMClassifier(**param, random_state=2)
#     model.fit(X_train_selected_df, y_train_res)
#     preds = model.predict_proba(X_val_selected_df)[:, 1]
#     return roc_auc_score(y_val, preds)

# study = create_study(direction='maximize', sampler=TPESampler())
# study.optimize(objective, n_trials=50)

# best_lgbm = LGBMClassifier(**study.best_params, random_state=2)
# best_lgbm.fit(X_train_selected_df, y_train_res)

# # Stacking model with GradientBoostingClassifier as final estimator for better predictive power
# print("Training Stacking Model...")
# stacking_model = StackingClassifier(
#     estimators=[ 
#         ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2, class_weight='balanced')),
#         ('gb', GradientBoostingClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2)),
#         ('xgb', XGBClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2, use_label_encoder=False, eval_metric='logloss')),
#         ('catboost', CatBoostClassifier(iterations=200, depth=6, learning_rate=0.05, silent=True)),
#         ('lgbm', best_lgbm),
#         ('extra', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=2))
#     ],
#     final_estimator=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3),
#     cv=5
# )
# stacking_model.fit(X_train_selected_df, y_train_res)

# # Evaluate model
# y_pred = stacking_model.predict(X_val_selected_df)
# y_pred_prob = stacking_model.predict_proba(X_val_selected_df)[:, 1]

# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("ROC AUC Score:", roc_auc_score(y_val, y_pred_prob))
# print("Classification Report:\n", classification_report(y_val, y_pred))

# # Feature Importance Plot for LightGBM model
# importance = stacking_model.named_estimators_['lgbm'].feature_importances_
# plt.barh(X_train_selected_df.columns, importance)
# plt.title('Feature Importance')
# plt.show()


# 3


In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.preprocessing import StandardScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score, r2_score
# from xgboost import XGBClassifier
# from sklearn.decomposition import PCA

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# class SafeLabelEncoder:
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)

# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
    
#     return df_processed

# try:
#     # Initialize label encoders dictionary
#     label_encoders = {}

#     # Process training data
#     print("Processing training data...")
#     train_df_processed = prepare_data(train_df, is_training=True)

#     # Define numeric features
#     numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                         'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                         'CreditScore_Age', 'CreditScore_sq', 'Balance_sq', 'Age_sq']

#     # Scaling numeric features
#     scaler = StandardScaler()
#     train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])

#     # PCA for dimensionality reduction
#     pca = PCA(n_components=5)
#     pca_features = pca.fit_transform(train_df_processed[numeric_features])
#     pca_df = pd.DataFrame(pca_features, columns=[f'pca_{i+1}' for i in range(pca_features.shape[1])])
#     train_df_processed = pd.concat([train_df_processed, pca_df], axis=1)

#     # Prepare features and target
#     X = train_df_processed.drop(['Exited', 'CustomerId'], axis=1)
#     y = train_df_processed['Exited']
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#     # Hyperparameter tuning
#     param_grid = {
#         'n_estimators': [100, 200, 300],
#         'max_depth': [10, 15, None],
#         'min_samples_split': [2, 5, 10],
#         'min_samples_leaf': [1, 2, 4],
#     }

#     print("Training Random Forest with RandomizedSearch...")
#     rf = RandomForestClassifier(random_state=42)
#     rand_search_rf = RandomizedSearchCV(rf, param_grid, n_iter=30, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)
#     rand_search_rf.fit(X_train, y_train)
#     best_rf = rand_search_rf.best_estimator_
#     print(f"Best Random Forest parameters: {rand_search_rf.best_params_}")

#     # Stacking model with LogisticRegression as final estimator for predict_proba
#     print("Training Stacking Model...")
#     stacking_model = StackingClassifier(
#         estimators=[
#             ('rf', best_rf),
#             ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)),
#             ('xgb', XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss'))
#         ],
#         final_estimator=LogisticRegression(),
#         cv=5
#     )
#     stacking_model.fit(X_train, y_train)

#     # Model evaluation
#     print("Evaluating model...")
#     y_val_pred_proba = stacking_model.predict_proba(X_val)[:, 1]
#     roc_auc = roc_auc_score(y_val, y_val_pred_proba)
#     print(f"Validation ROC AUC Score: {roc_auc}")

#     y_val_pred = stacking_model.predict(X_val)
#     r2 = r2_score(y_val, y_val_pred)
#     print(f"R² Score on Validation Set: {r2}")

#     # Process test data
#     print("Processing test data...")
#     test_df_processed = prepare_data(test_df, is_training=False)
#     test_df_processed[numeric_features] = scaler.transform(test_df_processed[numeric_features])
#     test_pca_features = pca.transform(test_df_processed[numeric_features])
#     test_pca_df = pd.DataFrame(test_pca_features, columns=[f'pca_{i+1}' for i in range(test_pca_features.shape[1])])
#     test_df_processed = pd.concat([test_df_processed, test_pca_df], axis=1)

#     # Make predictions
#     print("Making predictions...")
#     test_predictions = stacking_model.predict_proba(test_df_processed.drop(['CustomerId'], axis=1))[:, 1]

#     # Create submission
#     submission = pd.DataFrame({
#         'id': test_df['id'],
#         'Exited': test_predictions
#     })
#     submission.to_csv("submission.csv", index=False)
#     print("Submission saved successfully.")

# except Exception as e:
#     print(f"An error occurred: {str(e)}")
#     import traceback
#     print(traceback.format_exc())


In [None]:
# NOTE(review): a lone "/" is not a valid Python statement, so this cell
# presumably errors under Restart & Run All. Looks like a stray keystroke —
# confirm and remove the cell.
/

In [None]:
# %pip install scikit-optimize

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor, RandomForestRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import KFold
# from sklearn.metrics import r2_score
# from skopt import BayesSearchCV
# import xgboost as xgb
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer


# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# class SafeLabelEncoder:
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)
    
# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
    
#     return df_processed

# # Create a sample dataset for demonstration
# # (You should replace this with your own dataset)
# # from sklearn.datasets import make_regression
# # X_train, y_train = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)
# # X_val, y_val = make_regression(n_samples=200, n_features=20, noise=0.1, random_state=42)

# try:
#     # Initialize label encoders dictionary
#     label_encoders = {}

#     # Process training and test data
#     print("Processing training data...")
#     train_df_processed = prepare_data(train_df, is_training=True)
#     test_df_processed = prepare_data(test_df, is_training=True)

#     # Define numeric features
#     numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                         'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                         'CreditScore_Age', 'CreditScore_sq', 'Balance_sq', 'Age_sq']

#     # Scaling numeric features
#     scaler = StandardScaler()
#     train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])

#     # PCA for dimensionality reduction
#     pca = PCA(n_components=5)
#     pca_features = pca.fit_transform(train_df_processed[numeric_features])
#     pca_df = pd.DataFrame(pca_features, columns=[f'pca_{i+1}' for i in range(pca_features.shape[1])])
#     train_df_processed = pd.concat([train_df_processed, pca_df], axis=1)

#     pca_features = pca.fit_transform(test_df_processed[numeric_features])
#     pca_df = pd.DataFrame(pca_features, columns=[f'pca_{i+1}' for i in range(pca_features.shape[1])])
#     test_df_processed = pd.concat([test_df_processed, pca_df], axis=1)

#     # Prepare features and target
#     X = train_df_processed.drop(['Exited', 'CustomerId'], axis=1)
#     y = train_df_processed['Exited']
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#     # Define a KFold for cross-validation
#     cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

#     # Bayesian Optimization for XGBoost
#     param_search_xgb = {
#         'n_estimators': [50, 100, 200, 500],
#         'max_depth': [3, 4, 5, 6, 7, 10],
#         'learning_rate': [0.01, 0.05, 0.1, 0.2],
#         'subsample': [0.7, 0.8, 0.9, 1.0],
#         'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
#         'reg_alpha': [0, 0.01, 0.1, 1],
#         'reg_lambda': [0, 0.01, 0.1, 1]
#     }

#     # Using BayesSearchCV for XGBoost optimization
#     xgb_bayes_search = BayesSearchCV(
#         estimator=xgb.XGBRegressor(eval_metric='rmse', random_state=42),
#         search_spaces=param_search_xgb,
#         n_iter=30,
#         cv=cv_strategy,
#         scoring='r2',
#         n_jobs=-1,
#         verbose=1,
#         random_state=42
#     )

#     # Fit the Bayesian search model
#     xgb_bayes_search.fit(X_train, y_train)
#     best_xgb = xgb_bayes_search.best_estimator_

#     # Define the base models for stacking
#     base_models = [
#         ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
#         ('gb', GradientBoostingRegressor(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2)),
#         ('xgb', best_xgb)  # Use the best XGBoost model from optimization
#     ]

#     # Stack the random forest, gradient boosting, and tuned XGBoost regressors
#     stacking_model = StackingRegressor(
#         estimators=base_models,
#         final_estimator=LinearRegression(),
#         cv=cv_strategy,
#         n_jobs=-1
#     )

#     # Create a pipeline with scaling and imputation
#     pipeline = Pipeline([
#         ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
#         ('scaler', StandardScaler()),  # Scale features
#         ('stacking', stacking_model)
#     ])

#     # Fit the stacking model
#     pipeline.fit(X_train, y_train)

#     # Model evaluation
#     y_val_pred = pipeline.predict(X_val)
#     r2 = r2_score(y_val, y_val_pred)
#     print(f"R² Score on Validation Set: {r2}")


# except Exception as e:
#     print(f"An error occurred: {str(e)}")
#     import traceback
#     print(traceback.format_exc())

In [None]:
# Display the `pipeline` object as this cell's output.
# NOTE(review): in the visible part of this notebook `pipeline` is only assigned
# inside commented-out cells — confirm an earlier live cell defines it, otherwise
# this cell raises NameError on Restart Kernel -> Run All.
pipeline

In [None]:
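# # Create submission
# # NOTE(review): 'id' is dropped from the test frame here, whereas the visible
# # training code builds X by dropping only 'Exited' and 'CustomerId' — confirm the
# # feature columns match what `pipeline` was fitted on before reusing this cell.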
# submission = pd.DataFrame({
#     'id': test_df['id'],
#     'Exited': pipeline.predict(test_df_processed.drop(['id', 'CustomerId'], axis=1))
# })
# submission.to_csv("submission.csv", index=False)
# print("Submission saved successfully.")

In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# from sklearn.preprocessing import StandardScaler, RobustScaler
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# from sklearn.feature_selection import RFECV
# from sklearn.cluster import KMeans
# from optuna import create_study, Trial
# from optuna.samplers import TPESampler

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

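# # NOTE(review): LabelEncoder is not imported in this cell's import block (later
# # copies of this cell add it to the sklearn.preprocessing import), so instantiating
# # SafeLabelEncoder raises NameError unless the name is already in the kernel namespace.
# class SafeLabelEncoder: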
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)

# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
    
#     return df_processed

# try:
#     # Initialize label encoders dictionary
#     label_encoders = {}

#     # Process training data
#     print("Processing training data...")
#     train_df_processed = prepare_data(train_df, is_training=True)

#     # Define numeric features
#     numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                         'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                         'CreditScore_Age', 'CreditScore_sq', 'Balance_sq', 'Age_sq']

#     # Scaling numeric features with RobustScaler (handles outliers better)
#     scaler = RobustScaler()
#     train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])

#     # PCA on the scaled numeric features; the components are appended as extra columns (the original features are kept)
#     pca = PCA(n_components=10)  # Increase number of components
#     pca_features = pca.fit_transform(train_df_processed[numeric_features])
#     pca_df = pd.DataFrame(pca_features, columns=[f'pca_{i+1}' for i in range(pca_features.shape[1])])
#     train_df_processed = pd.concat([train_df_processed, pca_df], axis=1)

#     # Clustering features
#     kmeans = KMeans(n_clusters=6, random_state=2)  # Increase number of clusters
#     train_df_processed['Cluster'] = kmeans.fit_predict(train_df_processed[numeric_features])

#     # Prepare features and target
#     X = train_df_processed.drop(['Exited', 'CustomerId'], axis=1)
#     y = train_df_processed['Exited']
#     X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

#     # Feature selection using RFECV with a stronger base estimator (e.g., XGBClassifier)
#     rfecv = RFECV(estimator=XGBClassifier(), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
#     X_train_selected = rfecv.fit_transform(X_train, y_train)
#     X_val_selected = rfecv.transform(X_val)

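#     # Hyperparameter optimization with Optuna (Bayesian Optimization via TPESampler).
#     # NOTE(review): trials are scored on X_val_selected / y_val, the same split used for
#     # the final evaluation below, so the reported validation metrics are optimistic.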
#     def objective(trial: Trial):
#         param = {
#             'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#             'max_depth': trial.suggest_int('max_depth', 3, 15),
#             'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
#         }
#         model = LGBMClassifier(**param, random_state=2)
#         model.fit(X_train_selected, y_train)
#         preds = model.predict_proba(X_val_selected)[:, 1]
#         return roc_auc_score(y_val, preds)

#     study = create_study(direction='maximize', sampler=TPESampler())
#     study.optimize(objective, n_trials=50)

#     best_lgbm = LGBMClassifier(**study.best_params, random_state=2)
#     best_lgbm.fit(X_train_selected, y_train)

#     # Stacking model with LogisticRegression as final estimator for predict_proba
#     print("Training Stacking Model...")
#     stacking_model = StackingClassifier(
#         estimators=[
#             ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2)),
#             ('gb', GradientBoostingClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2)),
#             ('xgb', XGBClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2, use_label_encoder=False, eval_metric='logloss')),
#             ('catboost', CatBoostClassifier(iterations=200, depth=6, learning_rate=0.05, silent=True)),
#             ('lgbm', best_lgbm),
#             ('extra', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=2))
#         ],
#         final_estimator=LogisticRegression(),
#         cv=5
#     )
#     stacking_model.fit(X_train_selected, y_train)

#     # Model evaluation
#     print("Evaluating model...")
#     y_val_pred_proba = stacking_model.predict_proba(X_val_selected)[:, 1]
#     roc_auc = roc_auc_score(y_val, y_val_pred_proba)
#     print(f"ROC AUC: {roc_auc:.4f}")
#     print(classification_report(y_val, stacking_model.predict(X_val_selected)))

# except Exception as e:
#     print(f"An error occurred: {e}")


In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split, StratifiedKFold
# from sklearn.feature_selection import RFECV
# from xgboost import XGBClassifier
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.ensemble import StackingClassifier

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# X = train_df.drop(['id', 'Exited', 'CustomerId'], axis=1)
# y = train_df['Exited']

# # Splitting the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Preprocessing: Standardizing the data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)

# # PCA for dimensionality reduction (Optional, based on your need)
# pca = PCA(n_components=10)  # Adjust the number of components based on your dataset
# X_train_pca = pca.fit_transform(X_train_scaled)
# X_val_pca = pca.transform(X_val_scaled)

# # Feature selection using RFECV with a reduced number of cross-validation folds and step size
# rfecv = RFECV(estimator=XGBClassifier(), step=5, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)

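# # Keep a 20% subsample of the training split for feature selection (test_size=0.8
# # sends the other 80% to the discarded outputs). The models below are also fitted
# # on this subsample via y_train_reduced.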
# X_train_reduced, _, y_train_reduced, _ = train_test_split(X_train_pca, y_train, test_size=0.8, random_state=42)

# # Apply feature selection on a reduced dataset to speed up the process
# X_train_selected = rfecv.fit_transform(X_train_reduced, y_train_reduced)
# X_val_selected = rfecv.transform(X_val_pca)

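# # Define a function to display evaluation metrics.
# # NOTE(review): every call below passes hard class labels from .predict(), so the
# # ROC AUC printed here is computed from labels rather than predicted probabilities.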
# def display_metrics(y_true, y_pred):
#     print("Confusion Matrix:")
#     print(confusion_matrix(y_true, y_pred))
#     print("\nClassification Report:")
#     print(classification_report(y_true, y_pred))
#     print("\nROC AUC Score:")
#     print(roc_auc_score(y_true, y_pred))

# # Hyperparameter optimization with RandomizedSearchCV for Random Forest
# param_grid_rf = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [5, 10, 15],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# rf = RandomForestClassifier(random_state=42)
# random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid_rf, n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)

# # Fitting the random search model for Random Forest
# random_search_rf.fit(X_train_selected, y_train_reduced)

# # Predicting and evaluating the Random Forest model
# y_pred_rf = random_search_rf.predict(X_val_selected)
# display_metrics(y_val, y_pred_rf)

# # Hyperparameter optimization with RandomizedSearchCV for Gradient Boosting
# param_grid_gb = {
#     'n_estimators': [100, 200, 500],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 8],
#     'subsample': [0.7, 0.8, 1.0]
# }

# gb = GradientBoostingClassifier(random_state=42)
# random_search_gb = RandomizedSearchCV(estimator=gb, param_distributions=param_grid_gb, n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)

# # Fitting the random search model for Gradient Boosting
# random_search_gb.fit(X_train_selected, y_train_reduced)

# # Predicting and evaluating the Gradient Boosting model
# y_pred_gb = random_search_gb.predict(X_val_selected)
# display_metrics(y_val, y_pred_gb)

# # Stacking Classifier
# estimators = [
#     ('rf', random_search_rf.best_estimator_),
#     ('gb', random_search_gb.best_estimator_),
#     ('xgb', XGBClassifier(random_state=42))
# ]

# stacking_clf = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier(random_state=42))

# # Fitting the stacking classifier
# stacking_clf.fit(X_train_selected, y_train_reduced)

# # Predicting and evaluating the Stacking Classifier
# y_pred_stack = stacking_clf.predict(X_val_selected)
# display_metrics(y_val, y_pred_stack)


In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report, r2_score
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# from sklearn.feature_selection import RFECV
# from sklearn.cluster import KMeans
# from optuna import create_study, Trial
# from optuna.samplers import TPESampler

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# class SafeLabelEncoder:
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)

# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
    
#     return df_processed

# # Initialize label encoders dictionary
# label_encoders = {}

# # Process training and test data
# print("Processing training data...")
# train_df_processed = prepare_data(train_df, is_training=True)
# test_df_processed = prepare_data(test_df, is_training=False)

# # Define numeric features
# numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                     'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                     'CreditScore_Age', 'CreditScore_sq', 'Balance_sq', 'Age_sq']

# # Scaling numeric features with RobustScaler (handles outliers better)
# scaler = RobustScaler()
# train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])
# test_df_processed[numeric_features] = scaler.transform(test_df_processed[numeric_features])

# # PCA on the scaled numeric features; the components are appended as extra columns (the original features are kept)
# pca = PCA(n_components=10)  # Increase number of components
# pca_features_train = pca.fit_transform(train_df_processed[numeric_features])
# pca_df_train = pd.DataFrame(pca_features_train, columns=[f'pca_{i+1}' for i in range(pca_features_train.shape[1])])
# train_df_processed = pd.concat([train_df_processed, pca_df_train], axis=1)

# pca_features_test = pca.transform(test_df_processed[numeric_features])
# pca_df_test = pd.DataFrame(pca_features_test, columns=[f'pca_{i+1}' for i in range(pca_features_test.shape[1])])
# test_df_processed = pd.concat([test_df_processed, pca_df_test], axis=1)

# # Clustering features
# kmeans = KMeans(n_clusters=6, random_state=2)  # Increase number of clusters
# train_df_processed['Cluster'] = kmeans.fit_predict(train_df_processed[numeric_features])
# test_df_processed['Cluster'] = kmeans.predict(test_df_processed[numeric_features])

# # Prepare features and target
# X = train_df_processed.drop(['Exited', 'CustomerId'], axis=1)
# y = train_df_processed['Exited']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

# # Feature selection using RFECV with a stronger base estimator (e.g., XGBClassifier)
# rfecv = RFECV(estimator=XGBClassifier(), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
# X_train_selected = rfecv.fit_transform(X_train, y_train)
# X_val_selected = rfecv.transform(X_val)

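# # Hyperparameter optimization with Optuna (Bayesian Optimization via TPESampler).
# # NOTE(review): trials are scored on X_val_selected / y_val, the same split used for
# # the final evaluation below, so the reported validation metrics are optimistic.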
# def objective(trial: Trial):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
#     }
#     model = LGBMClassifier(**param, random_state=2)
#     model.fit(X_train_selected, y_train)
#     preds = model.predict_proba(X_val_selected)[:, 1]
#     return roc_auc_score(y_val, preds)

# study = create_study(direction='maximize', sampler=TPESampler())
# study.optimize(objective, n_trials=50)

# best_lgbm = LGBMClassifier(**study.best_params, random_state=2)
# best_lgbm.fit(X_train_selected, y_train)

# # Stacking model with LogisticRegression as final estimator for predict_proba
# print("Training Stacking Model...")
# stacking_model = StackingClassifier(
#     estimators=[
#         ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2)),
#         ('gb', GradientBoostingClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2)),
#         ('xgb', XGBClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2, use_label_encoder=False, eval_metric='logloss')),
#         ('catboost', CatBoostClassifier(iterations=200, depth=6, learning_rate=0.05, silent=True)),
#         ('lgbm', best_lgbm),
#         ('extra', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=2))
#     ],
#     final_estimator=LogisticRegression(),
#     cv=5
# )
# stacking_model.fit(X_train_selected, y_train)

# # Model evaluation
# print("Evaluating model...")
# y_val_pred_proba = stacking_model.predict_proba(X_val_selected)[:, 1]
# roc_auc = roc_auc_score(y_val, y_val_pred_proba)
# accuracy = accuracy_score(y_val, stacking_model.predict(X_val_selected))
# r2 = r2_score(y_val, y_val_pred_proba)

# print(f"ROC AUC: {roc_auc:.4f}")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"R² Score: {r2:.4f}")
# print(classification_report(y_val, stacking_model.predict(X_val_selected)))

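# # NOTE(review): stacking_model was fitted on the RFECV-selected features, but this line
# # predicts on the test frame without rfecv.transform, and it writes CustomerId into the
# # 'id' column. The next cell's version applies rfecv.transform and uses test_df['id'];
# # confirm which is intended before reusing this line.
# pd.DataFrame({'id': test_df['CustomerId'], 'Exited': stacking_model.predict_proba(test_df_processed.drop(['CustomerId'], axis=1))[:, 1]}).to_csv('Predictions.csv', index=False)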

In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report, r2_score
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# from sklearn.feature_selection import RFECV
# from sklearn.cluster import KMeans
# from optuna import create_study, Trial
# from optuna.samplers import TPESampler

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# class SafeLabelEncoder:
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)

# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
    
#     return df_processed

# # Initialize label encoders dictionary
# label_encoders = {}

# # Process training and test data
# print("Processing training data...")
# train_df_processed = prepare_data(train_df, is_training=True)
# test_df_processed = prepare_data(test_df, is_training=False)

# # Define numeric features
# numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                     'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                     'CreditScore_Age', 'CreditScore_sq', 'Balance_sq', 'Age_sq']

# # Scaling numeric features with RobustScaler (handles outliers better)
# scaler = RobustScaler()
# train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])
# test_df_processed[numeric_features] = scaler.transform(test_df_processed[numeric_features])

# # PCA on the scaled numeric features; the components are appended as extra columns (the original features are kept)
# pca = PCA(n_components=10)  # Increase number of components
# pca_features_train = pca.fit_transform(train_df_processed[numeric_features])
# pca_df_train = pd.DataFrame(pca_features_train, columns=[f'pca_{i+1}' for i in range(pca_features_train.shape[1])])
# train_df_processed = pd.concat([train_df_processed, pca_df_train], axis=1)

# pca_features_test = pca.transform(test_df_processed[numeric_features])
# pca_df_test = pd.DataFrame(pca_features_test, columns=[f'pca_{i+1}' for i in range(pca_features_test.shape[1])])
# test_df_processed = pd.concat([test_df_processed, pca_df_test], axis=1)

# # Prepare features and target
# X = train_df_processed.drop(['Exited', 'CustomerId'], axis=1)
# y = train_df_processed['Exited']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

# # Feature selection using RFECV with a stronger base estimator (e.g., XGBClassifier)
# rfecv = RFECV(estimator=XGBClassifier(), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
# X_train_selected = rfecv.fit_transform(X_train, y_train)
# X_val_selected = rfecv.transform(X_val)

# # Transform the test set using the same RFECV mask
# X_test_selected = rfecv.transform(test_df_processed.drop(['CustomerId'], axis=1))

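# # Hyperparameter optimization with Optuna (Bayesian Optimization via TPESampler).
# # NOTE(review): trials are scored on X_val_selected / y_val, the same split used for
# # the final evaluation below, so the reported validation metrics are optimistic.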
# def objective(trial: Trial):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
#     }
#     model = LGBMClassifier(**param, random_state=2)
#     model.fit(X_train_selected, y_train)
#     preds = model.predict_proba(X_val_selected)[:, 1]
#     return roc_auc_score(y_val, preds)

# study = create_study(direction='maximize', sampler=TPESampler())
# study.optimize(objective, n_trials=50)

# best_lgbm = LGBMClassifier(**study.best_params, random_state=2)
# best_lgbm.fit(X_train_selected, y_train)

# # Stacking model with LogisticRegression as final estimator for predict_proba
# print("Training Stacking Model...")
# stacking_model = StackingClassifier(
#     estimators=[
#         ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2)),
#         ('gb', GradientBoostingClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2)),
#         ('xgb', XGBClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2, use_label_encoder=False, eval_metric='logloss')),
#         ('catboost', CatBoostClassifier(iterations=200, depth=6, learning_rate=0.05, silent=True)),
#         ('lgbm', best_lgbm),
#         ('extra', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=2))
#     ],
#     final_estimator=LogisticRegression(),
#     cv=5
# )
# stacking_model.fit(X_train_selected, y_train)

# # Model evaluation
# print("Evaluating model...")
# y_val_pred_proba = stacking_model.predict_proba(X_val_selected)[:, 1]
# roc_auc = roc_auc_score(y_val, y_val_pred_proba)
# accuracy = accuracy_score(y_val, stacking_model.predict(X_val_selected))
# r2 = r2_score(y_val, y_val_pred_proba)

# print(f"ROC AUC: {roc_auc:.4f}")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"R² Score: {r2:.4f}")
# print(classification_report(y_val, stacking_model.predict(X_val_selected)))

# # Make predictions on the test set
# test_predictions = stacking_model.predict_proba(X_test_selected)[:, 1]
# pd.DataFrame({'id': test_df['id'], 'Exited': test_predictions}).to_csv('Predictions.csv', index=False)

In [None]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report, r2_score
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier
# from sklearn.decomposition import PCA
# from sklearn.feature_selection import RFECV
# from sklearn.cluster import KMeans
# from optuna import create_study, Trial
# from optuna.samplers import TPESampler

# # File paths
# INPUT_PATH = '../Datasets'
# train_path = f"{INPUT_PATH}/train.csv"
# test_path = f"{INPUT_PATH}/test.csv"

# # Load datasets
# train_df = pd.read_csv(train_path)
# test_df = pd.read_csv(test_path)

# class SafeLabelEncoder:
#     def __init__(self, unknown_value=-1):
#         self.unknown_value = unknown_value
#         self.label_encoder = LabelEncoder()
#         self.classes_ = None
        
#     def fit(self, series):
#         series = pd.Series(series)
#         unique_values = series.unique().tolist()
#         if 'UNKNOWN' not in unique_values:
#             unique_values.append('UNKNOWN')
#         self.label_encoder.fit(unique_values)
#         self.classes_ = self.label_encoder.classes_
#         return self
    
#     def transform(self, series):
#         series = pd.Series(series)
#         series = series.map(lambda x: 'UNKNOWN' if x not in self.classes_ else x)
#         return self.label_encoder.transform(series)
    
#     def fit_transform(self, series):
#         return self.fit(series).transform(series)

# def prepare_data(df, is_training=True):
#     df_processed = df.copy()
#     categorical_columns = df_processed.select_dtypes(include=['object']).columns
#     for column in categorical_columns:
#         if is_training:
#             if column not in label_encoders:
#                 label_encoders[column] = SafeLabelEncoder()
#                 df_processed[column] = label_encoders[column].fit_transform(df_processed[column])
#         else:
#             if column in label_encoders:
#                 df_processed[column] = label_encoders[column].transform(df_processed[column])

#     # Log transformations
#     if 'Balance' in df_processed.columns:
#         df_processed['Balance_log'] = np.log1p(df_processed['Balance'].clip(lower=0))
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['EstimatedSalary_log'] = np.log1p(df_processed['EstimatedSalary'].clip(lower=0))
    
#     # Interaction terms
#     if 'Balance' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['Balance_Age'] = df_processed['Balance'] * df_processed['Age']
#     if 'CreditScore' in df_processed.columns and 'Age' in df_processed.columns:
#         df_processed['CreditScore_Age'] = df_processed['CreditScore'] * df_processed['Age']
    
#     # Additional Interaction Terms
#     if 'EstimatedSalary' in df_processed.columns:
#         df_processed['Balance_EstimatedSalary'] = df_processed['Balance'] * df_processed['EstimatedSalary']
#         df_processed['CreditScore_EstimatedSalary'] = df_processed['CreditScore'] * df_processed['EstimatedSalary']
    
#     # Polynomial terms
#     df_processed['CreditScore_sq'] = df_processed['CreditScore'] ** 2
#     df_processed['Balance_sq'] = df_processed['Balance'] ** 2
#     df_processed['Age_sq'] = df_processed['Age'] ** 2
#     df_processed['EstimatedSalary_sq'] = df_processed['EstimatedSalary'] ** 2
    
#     # Ratios
#     if 'Balance' in df_processed.columns and 'EstimatedSalary' in df_processed.columns:
#         df_processed['Balance_to_EstimatedSalary'] = df_processed['Balance'] / (df_processed['EstimatedSalary'] + 1e-5)  # Avoid division by zero

#     return df_processed

# # Initialize label encoders dictionary
# label_encoders = {}

# # Process training and test data (encoders are fitted on train only, then reused for test)
# print("Processing training data...")
# train_df_processed = prepare_data(train_df, is_training=True)
# test_df_processed = prepare_data(test_df, is_training =False)

# # Define numeric features
# numeric_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 
#                     'Balance_log', 'EstimatedSalary_log', 'Balance_Age', 
#                     'CreditScore_Age', 'Balance_EstimatedSalary', 
#                     'CreditScore_EstimatedSalary', 'CreditScore_sq', 
#                     'Balance_sq', 'Age_sq', 'EstimatedSalary_sq', 
#                     'Balance_to_EstimatedSalary']

# # Scaling numeric features with RobustScaler (handles outliers better)
# scaler = RobustScaler()
# train_df_processed[numeric_features] = scaler.fit_transform(train_df_processed[numeric_features])
# test_df_processed[numeric_features] = scaler.transform(test_df_processed[numeric_features])

# # PCA over the scaled numeric features (n_components=15 equals the 15 inputs, so no dimensions are actually dropped)
# pca = PCA(n_components=15)  # Increase number of components
# pca_features_train = pca.fit_transform(train_df_processed[numeric_features])
# pca_df_train = pd.DataFrame(pca_features_train, columns=[f'pca_{i+1}' for i in range(pca_features_train.shape[1])])
# train_df_processed = pd.concat([train_df_processed, pca_df_train], axis=1)

# pca_features_test = pca.transform(test_df_processed[numeric_features])
# pca_df_test = pd.DataFrame(pca_features_test, columns=[f'pca_{i+1}' for i in range(pca_features_test.shape[1])])
# test_df_processed = pd.concat([test_df_processed, pca_df_test], axis=1)

# # Prepare features and target
# X = train_df_processed.drop(['Exited', 'CustomerId'], axis=1)
# y = train_df_processed['Exited']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

# # Feature selection using RFECV with a stronger base estimator (e.g., XGBClassifier)
# rfecv = RFECV(estimator=XGBClassifier(), step=1, cv=5, scoring='roc_auc', n_jobs=-1)
# X_train_selected = rfecv.fit_transform(X_train, y_train)
# X_val_selected = rfecv.transform(X_val)

# # Transform the test set using the same RFECV mask
# X_test_selected = rfecv.transform(test_df_processed.drop(['CustomerId'], axis=1))

# # Hyperparameter optimization with Optuna (Bayesian Optimization)
# def objective(trial: Trial):
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3)
#     }
#     model = LGBMClassifier(**param, random_state=2)
#     model.fit(X_train_selected, y_train)
#     preds = model.predict_proba(X_val_selected)[:, 1]
#     return roc_auc_score(y_val, preds)

# study = create_study(direction='maximize', sampler=TPESampler())
# study.optimize(objective, n_trials=50)

# best_lgbm = LGBMClassifier(**study.best_params, random_state=2)
# best_lgbm.fit(X_train_selected, y_train)

# # Stacking model with LogisticRegression as final estimator for predict_proba
# print("Training Stacking Model...")
# stacking_model = StackingClassifier(
#     estimators=[
#         ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=2)),
#         ('gb', GradientBoostingClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2)),
#         ('xgb', XGBClassifier(n_estimators=120, learning_rate=0.05, max_depth=4, random_state=2, use_label_encoder=False, eval_metric='logloss')),
#         ('catboost', CatBoostClassifier(iterations=200, depth=6, learning_rate=0.05, silent=True)),
#         ('lgbm', best_lgbm),
#         ('extra', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=2))
#     ],
#     final_estimator=LogisticRegression(),
#     cv=5
# )
# stacking_model.fit(X_train_selected, y_train)

# # Model evaluation
# print("Evaluating model...")
# y_val_pred_proba = stacking_model.predict_proba(X_val_selected)[:, 1]
# roc_auc = roc_auc_score(y_val, y_val_pred_proba)
# accuracy = accuracy_score(y_val, stacking_model.predict(X_val_selected))
# r2 = r2_score(y_val, y_val_pred_proba)

# print(f"ROC AUC: {roc_auc:.4f}")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"R² Score: {r2:.4f}")
# print(classification_report(y_val, stacking_model.predict(X_val_selected)))

# # Make predictions on the test set
# test_predictions = stacking_model.predict_proba(X_test_selected)[:, 1]
# pd.DataFrame({'id': test_df['id'], 'Exited': test_predictions}).to_csv('Predictions.csv', index=False)

# 2


In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0

In [None]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Dataset locations, resolved relative to the current working directory.
train_path, test_path = '../Datasets/train.csv', '../Datasets/test.csv'

# Read the training split first, then the test split.
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Sanity check: (rows, columns) of each split on its own line, then a preview
# of the first training rows.
print(train_data.shape, test_data.shape, sep='\n')
print(train_data.head())

In [None]:
# Count missing values per column of the training frame.
train_data.isna().sum()
# OUTPUT:
#     id                 0
#     CustomerId         0
#     Surname            0
#     CreditScore        0
#     Geography          0
#     Gender             0
#     Age                0
#     Tenure             0
#     Balance            0
#     NumOfProducts      0
#     HasCrCard          0
#     IsActiveMember     0
#     EstimatedSalary    0
#     Exited             0
#     dtype: int64


# 1


In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score, f1_score
    
# # Load your data
# # Assuming the DataFrame is already loaded as 'train_data'

# # Preprocessing
# X = train_data.drop(columns=['Exited', 'CustomerId', 'Surname'])
# y = train_data['Exited']

# # Preprocess categorical columns and scale numerical columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']),
#         ('cat', OneHotEncoder(), ['Geography', 'Gender'])
#     ])

# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # List of models to train
# models = {
#     'Logistic Regression': LogisticRegression(max_iter=1000),
#     'Random Forest': RandomForestClassifier(),
#     'XGBoost': XGBClassifier(),
#     'Support Vector Machine': SVC()
# }

# # Train and evaluate models
# for name, model in models.items():
#     # Create pipeline with preprocessor and classifier
#     clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    
#     # Fit model
#     clf.fit(X_train, y_train)
    
#     # Predict on test set
#     y_pred = clf.predict(X_test)
    
#     # Evaluate model
#     accuracy = accuracy_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
    
#     print(f"{name} - Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")



In [None]:
# from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# # Data Preprocessing (as before)
# X = train_data.drop(columns=['CustomerId', 'Surname', 'Exited'])
# y = train_data['Exited']

# # One-hot encode categorical variables
# X = pd.get_dummies(X, columns=['Geography', 'Gender'], drop_first=True)

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Scale the data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Define models
# models = {
#     'Logistic Regression': LogisticRegression(),
#     'Random Forest': RandomForestClassifier(),
#     'Gradient Boosting': GradientBoostingClassifier(),
#     'XGBoost': XGBClassifier(),
#     'LightGBM': LGBMClassifier(),
#     'SVM': SVC(probability=True),  # for AUC calculation
#     'KNN': KNeighborsClassifier()
# }

# # Train and evaluate models
# for name, model in models.items():
#     model.fit(X_train_scaled, y_train)
#     y_pred = model.predict(X_test_scaled)
#     print(f"Model: {name}")
#     print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
#     print(f"ROC AUC: {roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1])}")
#     print(classification_report(y_test, y_pred))
#     print("="*50)

# # Ensemble Voting Classifier (with best models)
# voting_clf = VotingClassifier(estimators=[
#     ('lr', LogisticRegression()),
#     ('rf', RandomForestClassifier()),
#     ('xgb', XGBClassifier()),
#     ('lgbm', LGBMClassifier())], voting='soft')

# voting_clf.fit(X_train_scaled, y_train)
# y_pred_voting = voting_clf.predict(X_test_scaled)

# print(f"Ensemble Voting Classifier Accuracy: {accuracy_score(y_test, y_pred_voting)}")
# print(f"Ensemble Voting Classifier ROC AUC: {roc_auc_score(y_test, voting_clf.predict_proba(X_test_scaled)[:, 1])}")
# print(classification_report(y_test, y_pred_voting))


In [None]:
# z = test_data.drop(['CustomerId', 'Surname'], axis=1)
# z = pd.get_dummies(z, columns=['Geography', 'Gender'], drop_first=True)
# pd.DataFrame({
#     'id': test_data['id'], 'Exited': voting_clf.predict(z)
# }).to_csv('Predictions.csv', index=False)

In [None]:
# from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, r2_score

# # Data Preprocessing (as before)
# X = train_data.drop(columns=['CustomerId', 'Surname', 'Exited'])
# y = train_data['Exited']

# # One-hot encode categorical variables
# X = pd.get_dummies(X, columns=['Geography', 'Gender'], drop_first=True)

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Scale the data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Regularized Logistic Regression (L2 Regularization)
# log_reg = LogisticRegression(C=0.1, penalty='l2')  # C is the inverse of regularization strength: smaller C = stronger penalty
# log_reg.fit(X_train_scaled, y_train)
# y_pred_lr = log_reg.predict(X_test_scaled)
# print(f"Logistic Regression R²: {r2_score(y_test, y_pred_lr)}")

# # Random Forest with hand-set hyperparameters (no search is run here)
# rf = RandomForestClassifier(max_depth=10, min_samples_split=10, min_samples_leaf=4, n_estimators=200)
# rf.fit(X_train_scaled, y_train)
# y_pred_rf = rf.predict(X_test_scaled)
# print(f"Random Forest R²: {r2_score(y_test, y_pred_rf)}")

# # XGBoost with L1/L2 regularization and hand-set hyperparameters
# xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=100, reg_alpha=0.01, reg_lambda=1)
# xgb.fit(X_train_scaled, y_train)
# y_pred_xgb = xgb.predict(X_test_scaled)
# print(f"XGBoost R²: {r2_score(y_test, y_pred_xgb)}")

# # LightGBM with hand-set hyperparameters
# lgbm = LGBMClassifier(max_depth=7, learning_rate=0.05, n_estimators=300, num_leaves=30, reg_alpha=0.1, reg_lambda=1)
# lgbm.fit(X_train_scaled, y_train)
# y_pred_lgbm = lgbm.predict(X_test_scaled)
# print(f"LightGBM R²: {r2_score(y_test, y_pred_lgbm)}")

# # Evaluation of Models
# models = {'Logistic Regression': y_pred_lr, 'Random Forest': y_pred_rf, 'XGBoost': y_pred_xgb, 'LightGBM': y_pred_lgbm}

# for name, preds in models.items():
#     print(f"\nModel: {name}")
#     print(f"R²: {r2_score(y_test, preds)}")
#     print(classification_report(y_test, preds))


In [None]:
# from sklearn.model_selection import GridSearchCV

# # Example for Random Forest
# param_grid_rf = {
#     'n_estimators': [100, 200],
#     'max_depth': [10, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2],
# }

# rf = RandomForestClassifier(random_state=42)
# grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='r2')
# grid_search_rf.fit(X_train_scaled, y_train)

# # Best model from the grid search
# best_rf = grid_search_rf.best_estimator_
# y_pred_best_rf = best_rf.predict(X_test_scaled)

# print(f"Tuned Random Forest R²: {r2_score(y_test, y_pred_best_rf)}")


In [None]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# # Initialize RandomForestRegressor
# rf = RandomForestRegressor(random_state=42)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
#                            cv=5, scoring='r2', n_jobs=-1, verbose=2)

# # Fit the model
# grid_search.fit(X_train_pca, y_train)

# # Get the best parameters
# print("Best Parameters:", grid_search.best_params_)

# # Evaluate on the test set
# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(X_test_pca)
# print(f"Test R² Score: {r2_score(y_test, y_pred)}")
