In [46]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import skew, kurtosis
from IPython.display import display

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from feature_engine.outliers import Winsorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer
from bayes_opt import BayesianOptimization
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier


In [47]:
df_train = pq.read_table('data/df_train.parquet').to_pandas()
df_test = pq.read_table('data/df_test.parquet').to_pandas()

In [48]:
numeric_columns = [
    'Cant_gr_flia', 
    'Cant_riesgos_flia_mean', 
    'cantidad_serv_flia', 
    'CANTIDAD_SERVICIOS', 
    'conteo_dx_diferentes', 
    'EDAD', 
    'psa_max_gr_flia', 
    'psa_min_gr_flia', 
    'Pendiente', 
    'Pendiente_flia', 
    'Promedio_costo', 
    'Promedio_costo_flia', 
    'psa_max_gr_flia', 
    'psa_min_gr_flia', 
    'MEDICAMENTOS', 
    'MEDICINA ESPECIALIZADA', 
    'MEDICINA GENERAL', 
    'TIEMPO_AFILIACION', 
    'TIEMPO_ULTIMA_CITA', 
    'PERDIDA_DE_PESO', 
    'Intercepto', 
    'Intercepto_flia', 
    'Cant_Fliar_CP', 
    'Cant_Fliar_riesgos'
]

categorical_columns = [
    'AGRUPACION_DIASTOLICA', 
    'AGRUPACION_SISTOLICA', 
    'CANCER_MAMA_FAMILIAR', 
    'CANCER_OTRO_SITIO', 
    'CORONARIOS', 
    'CANCER_OTRO_SITIO_FAMILIAR',
    'CORONARIOS_FAMILIAR', 
    'CEREBRAL', 
    'CEREBRAL_FAMILIAR', 
    'DIABETES', 
    'DIABETES_FAMILIAR', 
    'ENFERMEDAD_RENAL', 
    'ENFERMEDAD_RENAL_FAMILIAR', 
    'HIPERTENSION', 
    'HIPERTENSION_FAMILIAR', 
    'OTROS_ANTECEDENTES_VASCULARES', 
    'RIESGOS', 
    'ESTADO_CIVI', 
    'estrato', 
    'parentesco', 
    'PROGRAMA', 
]

nominal_columns = [
    'ESTADO_CIVI', 'PROGRAMA', 'parentesco', 'CANCER_MAMA_FAMILIAR', 'CANCER_OTRO_SITIO',
    'CANCER_OTRO_SITIO_FAMILIAR', 'HIPERTENSION', 'HIPERTENSION_FAMILIAR',
    'DIABETES', 'DIABETES_FAMILIAR', 'CORONARIOS', 'CORONARIOS_FAMILIAR',
    'CEREBRAL', 'CEREBRAL_FAMILIAR', 'ENFERMEDAD_RENAL', 'ENFERMEDAD_RENAL_FAMILIAR',
    'OTROS_ANTECEDENTES_VASCULARES'
]

ordinal_columns = ['estrato', 'AGRUPACION_SISTOLICA', 'AGRUPACION_DIASTOLICA', 'IMC']

In [49]:
# # lets use the 40% of the entire data
# df_train = df_train.sample(frac=0.4, random_state=42)

In [50]:
X = df_train.drop(columns=['Target'])
y = df_train['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Preprocessor pipeline

In [51]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('ord', ordinal_transformer, ordinal_columns),
        ('nom', nominal_transformer, nominal_columns)
    ]
)

xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(random_state=42))
])

xgb_pipeline.fit(X_train, y_train)

feature_names = xgb_pipeline.named_steps['preprocessor'].get_feature_names_out()
xgb_model = xgb_pipeline.named_steps['xgb']
feature_importances = xgb_model.feature_importances_

sorted_idx = np.argsort(feature_importances)[::-1]
top_20_idx = sorted_idx[:20]
top_20_features = feature_names[top_20_idx]

def get_original_columns(features, feature_names):
    original_columns = []
    for feature in features:
        original_col = feature.split('__')[1]
        if (original_col in feature_names):
            original_columns.append(original_col)
    return list(set(original_columns))


selected_numeric_columns = get_original_columns(top_20_features, numeric_columns)
selected_ordinal_columns = get_original_columns(top_20_features, ordinal_columns)
selected_nominal_columns = get_original_columns(top_20_features, nominal_columns)


reduced_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, selected_numeric_columns),
        ('ord', ordinal_transformer, selected_ordinal_columns),
        ('nom', nominal_transformer, selected_nominal_columns)
    ]
)

## Random Forest optimization
- Best parameters found: {'target': 0.82463421122639, 'params': {'max_depth': 25.0, 'max_features': 0.2, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_components_pca': 6.0, 'n_estimators': 562.6480841236934}}

In [52]:
# from sklearn.model_selection import StratifiedKFold
# # Define the Random Forest evaluation function using accuracy as the metric
# def rf_evaluate(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, n_components_pca):
#     # Create a complete pipeline: Preprocessing + RandomForest
#     model_pipeline = Pipeline([
#         # Include the preprocessing pipeline
#         ('preprocessor', reduced_preprocessor),
#         ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA after preprocessing
#         ('rf', RandomForestClassifier(n_estimators=int(n_estimators),
#                                       max_depth=int(max_depth),
#                                       min_samples_split=int(min_samples_split),
#                                       min_samples_leaf=int(min_samples_leaf),
#                                       max_features=max_features,
#                                       random_state=42))  # Random Forest with hyperparameters
#     ])

#     stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     # accuracy_scores = cross_val_score(
#     #     model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='accuracy', n_jobs=-1)
#     roc_auc_scores = cross_val_score(
#         model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1
#     )

#     return roc_auc_scores.mean()

# pbounds = {
#     'n_estimators': (400, 600),
#     'max_depth': (25, 35),
#     'min_samples_split': (2, 8),
#     'min_samples_leaf': (1, 8),
#     'max_features': (0.1, 0.2),
#     'n_components_pca': (1, 5)
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=rf_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2  # Verbose to see progress
# )

# # Run the optimization
# # 10 random points first, then 32 iterations of optimization
# optimizer.maximize(init_points=10, n_iter=30)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)

In [53]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# import pandas as pd

# # Best parameters from Bayesian Optimization
# best_params = {
#     'max_depth': int(25.452039720948832),
#     'max_features': 0.2,
#     'min_samples_leaf': int(1.0),
#     'min_samples_split': int(2.0),
#     'n_components_pca': int(5.0),
#     'n_estimators': int(552.5290750051523)
# }

# # Create the final model pipeline with the best parameters
# model_pipeline = Pipeline([
#     ('preprocessor', reduced_preprocessor),  # Use your preprocessor from before
#     ('pca', PCA(n_components=best_params['n_components_pca'])),  # PCA with the best component number
#     ('rf', RandomForestClassifier(
#         n_estimators=best_params['n_estimators'],
#         max_depth=best_params['max_depth'],
#         min_samples_split=best_params['min_samples_split'],
#         min_samples_leaf=best_params['min_samples_leaf'],
#         max_features=best_params['max_features'],
#         random_state=42
#     ))
# ])

# # Fit the model on the training data
# model_pipeline.fit(X_train, y_train)

# # Make predictions on the test set
# predictions = model_pipeline.predict(df_test)

# # Step 8: Create submission file
# final_df = pd.DataFrame({
#     'ID': df_test.index,  # Assuming df_test has the ID as index or column
#     'Target': predictions
# })

# # Save the submission
# submission_file = 'submission_rf.csv'
# final_df.to_csv(submission_file, index=False)
# print(f"Submission file {submission_file} created successfully.")

## KNN optimization
- so far: Best parameters found: {'target': 0.7156158552806597, 'params': {'n_components_pca': 5.0, 'n_neighbors': 50.0, 'p': 2.0}}

In [54]:
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from bayes_opt import BayesianOptimization
# from sklearn.preprocessing import StandardScaler


# # Define the KNN evaluation function using accuracy as the metric
# def knn_evaluate(n_neighbors, p, n_components_pca):
#     # Create a complete pipeline: Preprocessing + KNN
#     model_pipeline = Pipeline([
#         ('preprocessor', reduced_preprocessor),
#         ('pca', PCA(n_components=int(n_components_pca))),
#         ('knn', KNeighborsClassifier(n_neighbors=int(n_neighbors), p=int(p), n_jobs=-1))
#     ])

#     stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     # Cross-validation
#     accuracy_scores = cross_val_score(
#         model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='accuracy', n_jobs=-1)

#     return accuracy_scores.mean()

# # Define parameter bounds for Bayesian Optimization
# pbounds = {
#     'n_neighbors': (3, 50),  # KNN neighbors range
#     'p': (1, 2),  # Distance metric (1: Manhattan, 2: Euclidean)
#     'n_components_pca': (2, 5)  # PCA components range
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=knn_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2  # Verbose to see progress
# )

# # Run the optimization
# # 10 random points first, then 30 iterations of optimization
# optimizer.maximize(init_points=10, n_iter=30)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)


## SVM

In [55]:
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.svm import SVC
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from bayes_opt import BayesianOptimization

# # Define the SVM evaluation function using ROC AUC as the metric
# def svm_evaluate(C, gamma, kernel, n_components_pca):
#     # Convert kernel index to a valid kernel string
#     kernel_options = ['linear', 'rbf', 'poly']
#     kernel = kernel_options[int(kernel)]

#     # Create a complete pipeline: Preprocessing + PCA + SVM
#     model_pipeline = Pipeline([
#         ('preprocessor', reduced_preprocessor),
#         ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA after preprocessing
#         ('svm', SVC(C=C, gamma=gamma, kernel=kernel, random_state=42, probability=True))
#     ])

#     stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     # Use ROC AUC as the evaluation metric
#     roc_auc_scores = cross_val_score(
#         model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1
#     )

#     return roc_auc_scores.mean()

# # Parameter bounds for SVM optimization
# pbounds = {
#     'C': (0.1, 10),                # Regularization parameter
#     'gamma': (0.0001, 1),          # Kernel coefficient
#     'kernel': (0, 2),              # Kernel type (0: linear, 1: rbf, 2: poly)
#     'n_components_pca': (1, 5)     # PCA components
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=svm_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2  # Verbose to see progress
# )

# # Run the optimization
# # 10 random points first, then 30 iterations of optimization
# optimizer.maximize(init_points=10, n_iter=30)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)


In [56]:
# from sklearn.svm import SVC
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# import pandas as pd

# # Best parameters from Bayesian Optimization for SVM
# best_svm_params = {
#     'C': 9.69511928814146,
#     'gamma': 0.804355672916069,
#     'kernel': round(0.9545838701817976),
#     'n_components_pca': int(4.640990598805696)
# }

# # Map the kernel index to the actual kernel type
# kernel_options = ['linear', 'rbf', 'poly']
# kernel = kernel_options[best_svm_params['kernel']]

# # Create the SVM model pipeline with the best parameters
# model_pipeline = Pipeline([
#     ('preprocessor', reduced_preprocessor),  # Use your preprocessor from before
#     ('pca', PCA(n_components=best_svm_params['n_components_pca'])),  # PCA with the best component number
#     ('svm', SVC(
#         C=best_svm_params['C'],
#         gamma=best_svm_params['gamma'],
#         kernel=kernel,
#         random_state=42,
#         probability=True  # To enable probability estimates for ROC AUC
#     ))
# ])

# # Fit the model on the training data
# model_pipeline.fit(X_train, y_train)

# # Make predictions on the test set (probabilities for ROC AUC)
# predictions_proba = model_pipeline.predict_proba(df_test)[:, 1]  # Get the probability for the positive class

# # Prepare the submission dataframe (replace 'Id' with the actual ID column from your test set)
# # Step 8: Create submission file
# final_df = pd.DataFrame({
#     'ID': df_test.index,  # Assuming df_test has the ID as index or column
#     'Target': predictions_proba
# })

# # Save the submission
# submission_file = 'submission.csv'
# final_df.to_csv(submission_file, index=False)
# print(f"Submission file {submission_file} created successfully.")

## XG-Boost

In [57]:
# from xgboost import XGBClassifier
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer, KNNImputer
# from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.decomposition import PCA
# from bayes_opt import BayesianOptimization

# # Define the ultimate evaluation function for XGBoost using ROC AUC
# def xgb_evaluate(n_estimators, max_depth, learning_rate, subsample, colsample_bytree, gamma, min_child_weight, reg_alpha, reg_lambda,
#                  imputer_type, n_neighbors_knn, n_components_pca, k_folds):
    
#     # Preprocessing: Set imputation for numeric columns
#     if imputer_type < 0.5:  # SimpleImputer mean
#         numeric_imputer = SimpleImputer(strategy='mean')
#     else:  # KNN Imputer
#         numeric_imputer = KNNImputer(n_neighbors=int(n_neighbors_knn))
    
#     # Define the preprocessing pipeline for each type of data
#     numeric_transformer = Pipeline(steps=[
#         ('imputer', numeric_imputer),
#         ('scaler', StandardScaler())
#     ])

#     ordinal_transformer = Pipeline(steps=[
#         ('imputer', SimpleImputer(strategy='most_frequent')),
#         ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
#     ])

#     nominal_transformer = Pipeline(steps=[
#         ('imputer', SimpleImputer(strategy='most_frequent')),
#         ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
#     ])

#     # Define the complete ColumnTransformer
#     preprocessor = ColumnTransformer(
#         transformers=[
#             ('num', numeric_transformer, numeric_columns),
#             ('ord', ordinal_transformer, ordinal_columns),
#             ('nom', nominal_transformer, nominal_columns)
#         ]
#     )

#     # Create the model pipeline with PCA and XGBoost
#     model_pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('pca', PCA(n_components=int(n_components_pca))),
#         ('xgb', XGBClassifier(
#             n_estimators=int(n_estimators),
#             max_depth=int(max_depth),
#             learning_rate=learning_rate,
#             subsample=subsample,
#             colsample_bytree=colsample_bytree,
#             gamma=gamma,
#             min_child_weight=min_child_weight,
#             reg_alpha=reg_alpha,
#             reg_lambda=reg_lambda,
#             random_state=42,
#             eval_metric='logloss'
#         ))
#     ])

#     # Stratified K-Fold Cross Validation
#     stratified_kfold = StratifiedKFold(n_splits=int(k_folds), shuffle=True, random_state=42)

#     # Evaluate using ROC AUC
#     roc_auc_scores = cross_val_score(
#         model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1
#     )

#     return roc_auc_scores.mean()

# # Parameter bounds for Bayesian Optimization
# pbounds = {
#     'n_estimators': (50, 1000),          # Number of trees
#     'max_depth': (3, 12),                # Maximum depth of the tree
#     'learning_rate': (0.01, 0.3),        # Learning rate
#     'subsample': (0.5, 1.0),             # Subsample ratio
#     'colsample_bytree': (0.5, 1.0),      # Subsample ratio of columns
#     'gamma': (0, 5),                     # Minimum loss reduction
#     'min_child_weight': (1, 10),         # Minimum child weight
#     'reg_alpha': (0, 5),                 # L1 regularization
#     'reg_lambda': (0, 5),                # L2 regularization
    
#     # Preprocessing related parameters
#     'imputer_type': (0, 1),              # 0: Mean imputer, 1: KNN Imputer
#     'n_neighbors_knn': (3, 10),          # Number of neighbors for KNN Imputer
#     'n_components_pca': (1, 20),         # PCA components

#     # K-Fold cross validation parameter
#     'k_folds': (3, 10)                   # Number of K-Folds
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=xgb_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2  # Verbose to see progress
# )

# # Run the optimization
# optimizer.maximize(init_points=2, n_iter=3)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)


In [58]:
# best_params = {'target': 0.798312289413439,
#  'params': {'colsample_bytree': 1.0,
#   'gamma': 0.0,
#   'imputer_type': 1.0,
#   'k_folds': 10.0,
#   'learning_rate': 0.01,
#   'max_depth': 12.0,
#   'min_child_weight': 1.0,
#   'n_components_pca': 20.0,
#   'n_estimators': 262.8509615896416,
#   'n_neighbors_knn': 3.0,
#   'reg_alpha': 0.0,
#   'reg_lambda': 0.0,
#   'subsample': 0.5}}

In [59]:
# import pandas as pd
# from xgboost import XGBClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA

# # Define the best parameters found
# best_params_xg = best_params['params']

# # Define preprocessing based on the best imputer type (SimpleImputer mean in this case)
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('scaler', StandardScaler())
# ])

# ordinal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
# ])

# nominal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# # Create the final preprocessor with the best found settings
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_columns),
#         ('ord', ordinal_transformer, ordinal_columns),
#         ('nom', nominal_transformer, nominal_columns)
#     ]
# )

# # Create the final model pipeline with PCA and XGBoost
# final_model_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('pca', PCA(n_components=int(best_params_xg['n_components_pca']))),
#     ('xgb', XGBClassifier(
#         n_estimators=int(best_params_xg['n_estimators']),
#         max_depth=int(best_params_xg['max_depth']),
#         learning_rate=best_params_xg['learning_rate'],
#         subsample=best_params_xg['subsample'],
#         colsample_bytree=best_params_xg['colsample_bytree'],
#         gamma=best_params_xg['gamma'],
#         min_child_weight=best_params_xg['min_child_weight'],
#         reg_alpha=best_params_xg['reg_alpha'],
#         reg_lambda=best_params_xg['reg_lambda'],
#         random_state=42,
#         use_label_encoder=False,
#         eval_metric='logloss'
#     ))
# ])

# # Fit the model on the training data
# final_model_pipeline.fit(X_train, y_train)

# # Make predictions on the test data
# y_test_predictions = final_model_pipeline.predict(df_test)

# # Create the submission DataFrame
# final_df = pd.DataFrame({
#     'ID': df_test.index,  # Assuming df_test has the 'ID' as index
#     'Target': y_test_predictions
# })

# # Save the submission file
# submission_file = 'submission_xg.csv'
# final_df.to_csv(submission_file, index=False)
# print(f"Submission file {submission_file} created successfully.")


## Testing models

In [65]:
import os
os.environ['MallocStackLogging'] = '0'

import time
from sklearn.model_selection import StratifiedKFold, cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# General function to run optimization for any model
def run_optimization(model_eval_func, pbounds, model_name):
    start_time = time.time()

    # Set up the Bayesian optimizer
    optimizer = BayesianOptimization(
        f=model_eval_func,
        pbounds=pbounds,
        random_state=42,
        verbose=2  # Verbose to see progress
    )

    # Run the optimization with reduced random points and iterations
    optimizer.maximize(init_points=5, n_iter=15)

    best_params = optimizer.max
    print(f"Best parameters found for {model_name}: {best_params}")
    
    print(f"{model_name} optimization completed in {time.time() - start_time:.2f} seconds.\n")

# 1. XGBoost Optimization
def xgboost_evaluate(n_estimators, max_depth, learning_rate, subsample, colsample_bytree, gamma, min_child_weight, n_components_pca):
    from xgboost import XGBClassifier
    model_pipeline = Pipeline(steps=[
        ('preprocessor', reduced_preprocessor),   # Use the preprocessor you defined
        ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA after preprocessing
        ('xgb', XGBClassifier(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            gamma=gamma,
            min_child_weight=min_child_weight,
            random_state=42
        ))
    ])
    
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1)
    return auc_scores.mean()

xgboost_pbounds = {
    'n_estimators': (50, 200),
    'max_depth': (3, 12),
    'learning_rate': (0.01, 0.3),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'gamma': (0, 5),
    'min_child_weight': (1, 10),
    'n_components_pca': (1, 5)  # PCA added
}

# 2. LightGBM Optimization
def lightgbm_evaluate(num_leaves, max_depth, learning_rate, n_estimators, subsample, n_components_pca):
    from lightgbm import LGBMClassifier
    model_pipeline = Pipeline(steps=[
        ('preprocessor', reduced_preprocessor),   # Preprocessing
        ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA
        ('lgbm', LGBMClassifier(
            num_leaves=int(num_leaves),
            max_depth=int(max_depth),
            learning_rate=learning_rate,
            n_estimators=int(n_estimators),
            subsample=subsample,
            random_state=42
        ))
    ])
    
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1)
    return auc_scores.mean()

lightgbm_pbounds = {
    'num_leaves': (10, 50),
    'max_depth': (3, 12),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (50, 200),
    'subsample': (0.5, 1),
    'n_components_pca': (1, 5)  # PCA added
}

# 3. RandomForest Optimization
def randomforest_evaluate(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, n_components_pca):
    from sklearn.ensemble import RandomForestClassifier
    model_pipeline = Pipeline(steps=[
        ('preprocessor', reduced_preprocessor),   # Preprocessing
        ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA
        ('rf', RandomForestClassifier(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            min_samples_split=int(min_samples_split),
            min_samples_leaf=int(min_samples_leaf),
            max_features=max_features,
            random_state=42
        ))
    ])
    
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1)
    return auc_scores.mean()

randomforest_pbounds = {
    'n_estimators': (50, 200),
    'max_depth': (3, 12),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
    'max_features': (0.5, 1),
    'n_components_pca': (1, 5)  # PCA added
}

# 4. K-Nearest Neighbors Optimization
def knn_evaluate(n_neighbors, weights, p, n_components_pca):
    from sklearn.neighbors import KNeighborsClassifier
    weight_options = ['uniform', 'distance']
    model_pipeline = Pipeline(steps=[
        ('preprocessor', reduced_preprocessor),   # Preprocessing
        ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA
        ('knn', KNeighborsClassifier(
            n_neighbors=int(n_neighbors),
            weights=weight_options[int(weights)],
            p=int(p)
        ))
    ])
    
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1)
    return auc_scores.mean()

knn_pbounds = {
    'n_neighbors': (3, 15),
    'weights': (0, 1),  # 0 for 'uniform', 1 for 'distance'
    'p': (1, 2),
    'n_components_pca': (1, 5)  # PCA added
}

# 5. Support Vector Machine (SVM) Optimization
def svm_evaluate(C, gamma, kernel, n_components_pca):
    from sklearn.svm import SVC
    kernel_options = ['linear', 'rbf', 'poly']
    model_pipeline = Pipeline(steps=[
        ('preprocessor', reduced_preprocessor),   # Preprocessing
        ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA
        ('svm', SVC(
            C=C,
            gamma=gamma,
            kernel=kernel_options[int(kernel)],
            probability=True,
            random_state=42
        ))
    ])
    
    stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1)
    return auc_scores.mean()

svm_pbounds = {
    'C': (0.1, 10),
    'gamma': (0.001, 1),
    'kernel': (0, 2),  # 0: linear, 1: rbf, 2: poly
    'n_components_pca': (1, 5)  # PCA added
}

# Running the optimization for all models
run_optimization(xgboost_evaluate, xgboost_pbounds, "XGBoost")
run_optimization(lightgbm_evaluate, lightgbm_pbounds, "LightGBM")
run_optimization(randomforest_evaluate, randomforest_pbounds, "RandomForest")
run_optimization(knn_evaluate, knn_pbounds, "K-Nearest Neighbors")
run_optimization(svm_evaluate, svm_pbounds, "SVM")


|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_comp... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------


python(2057) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2058) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2059) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2060) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2061) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2062) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2063) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2064) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2065) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2066) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(2067) MallocStackLoggin

| [39m1        [39m | [39m0.5225   [39m | [39m0.6873   [39m | [39m4.754    [39m | [39m0.2223   [39m | [39m8.388    [39m | [39m2.404    [39m | [39m1.624    [39m | [39m58.71    [39m | [39m0.9331   [39m |
| [35m2        [39m | [35m0.5249   [39m | [35m0.8006   [39m | [35m3.54     [39m | [35m0.01597  [39m | [35m11.73    [39m | [35m8.492    [39m | [35m1.849    [39m | [35m77.27    [39m | [35m0.5917   [39m |
| [35m3        [39m | [35m0.5325   [39m | [35m0.6521   [39m | [35m2.624    [39m | [35m0.1353   [39m | [35m5.621    [39m | [35m6.507    [39m | [35m1.558    [39m | [35m93.82    [39m | [35m0.6832   [39m |
| [39m4        [39m | [39m0.5277   [39m | [39m0.728    [39m | [39m3.926    [39m | [39m0.06791  [39m | [39m7.628    [39m | [39m6.332    [39m | [39m1.186    [39m | [39m141.1    [39m | [39m0.5853   [39m |
| [39m5        [39m | [39m0.5233   [39m | [39m0.5325   [39m | [39m4.744    [39m | [39m0.29     [39m | 

python(4416) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(4417) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


| [39m5        [39m | [39m0.5444   [39m | [39m3.112    [39m | [39m0.5252   [39m | [39m0.8639   [39m | [39m2.165    [39m |
| [35m6        [39m | [35m0.5863   [39m | [35m4.765    [39m | [35m0.7734   [39m | [35m0.7256   [39m | [35m2.852    [39m |


KeyboardInterrupt: 

# Submission

In [29]:
# # Step 8: Create submission file
# final_df = pd.DataFrame({
#     'ID': df_test.index,  # Assuming df_test has the ID as index or column
#     'Target': y_test_predictions
# })

# # Save the submission
# submission_file = 'submission.csv'
# final_df.to_csv(submission_file, index=False)
# print(f"Submission file {submission_file} created successfully.")