In [77]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import skew, kurtosis
from IPython.display import display

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from feature_engine.outliers import Winsorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer
from bayes_opt import BayesianOptimization
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier


In [78]:
# Read the train and test parquet files (in that order) into pandas DataFrames.
df_train, df_test = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in ('data/df_train.parquet', 'data/df_test.parquet')
)

In [79]:
# Numeric feature columns routed to the 'num' branch of the preprocessor.
# Every name appears exactly once: 'psa_max_gr_flia' and 'psa_min_gr_flia'
# used to be listed twice, which would make the ColumnTransformer select the
# same column twice and hand duplicated features to the model.
numeric_columns = [
    'Cant_gr_flia',
    'Cant_riesgos_flia_mean',
    'cantidad_serv_flia',
    'CANTIDAD_SERVICIOS',
    'conteo_dx_diferentes',
    'EDAD',
    'psa_max_gr_flia',
    'psa_min_gr_flia',
    'Pendiente',
    'Pendiente_flia',
    'Promedio_costo',
    'Promedio_costo_flia',
    'MEDICAMENTOS',
    'MEDICINA ESPECIALIZADA',
    'MEDICINA GENERAL',
    'TIEMPO_AFILIACION',
    'TIEMPO_ULTIMA_CITA',
    'PERDIDA_DE_PESO',
    'Intercepto',
    'Intercepto_flia',
    'Cant_Fliar_CP',
    'Cant_Fliar_riesgos',
]

# All categorical columns.
# NOTE(review): 'RIESGOS' is listed here but in neither nominal_columns nor
# ordinal_columns below — confirm whether it is meant to be encoded.
categorical_columns = [
    'AGRUPACION_DIASTOLICA', 'AGRUPACION_SISTOLICA',
    'CANCER_MAMA_FAMILIAR', 'CANCER_OTRO_SITIO',
    'CORONARIOS', 'CANCER_OTRO_SITIO_FAMILIAR',
    'CORONARIOS_FAMILIAR',
    'CEREBRAL', 'CEREBRAL_FAMILIAR',
    'DIABETES', 'DIABETES_FAMILIAR',
    'ENFERMEDAD_RENAL', 'ENFERMEDAD_RENAL_FAMILIAR',
    'HIPERTENSION', 'HIPERTENSION_FAMILIAR',
    'OTROS_ANTECEDENTES_VASCULARES',
    'RIESGOS',
    'ESTADO_CIVI',
    'estrato',
    'parentesco',
    'PROGRAMA',
]

# Unordered categoricals (one-hot encoded by the preprocessor).
nominal_columns = [
    'ESTADO_CIVI',
    'PROGRAMA',
    'parentesco',
    'CANCER_MAMA_FAMILIAR',
    'CANCER_OTRO_SITIO',
    'CANCER_OTRO_SITIO_FAMILIAR',
    'HIPERTENSION',
    'HIPERTENSION_FAMILIAR',
    'DIABETES',
    'DIABETES_FAMILIAR',
    'CORONARIOS',
    'CORONARIOS_FAMILIAR',
    'CEREBRAL',
    'CEREBRAL_FAMILIAR',
    'ENFERMEDAD_RENAL',
    'ENFERMEDAD_RENAL_FAMILIAR',
    'OTROS_ANTECEDENTES_VASCULARES',
]

# Ordered categoricals (ordinal encoded by the preprocessor).
# NOTE(review): 'IMC' appears only in this list, not in categorical_columns.
ordinal_columns = [
    'estrato',
    'AGRUPACION_SISTOLICA',
    'AGRUPACION_DIASTOLICA',
    'IMC',
]

In [80]:
# Separate the 'Target' label from the predictors.
y = df_train['Target']
X = df_train.drop(columns=['Target'])

# Hold out 30% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Preprocessor pipeline

In [81]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Nominal branch: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Ordinal branch: fill gaps with the most frequent value, then integer-encode.
# Categories not seen during fit are encoded as -1.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Numeric branch: mean-impute, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group to its branch; output order is num, ord, nom.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_columns),
    ('ord', ordinal_transformer, ordinal_columns),
    ('nom', nominal_transformer, nominal_columns),
])

# Baseline XGBoost on the full feature set; fitted here so its feature
# importances can be read afterwards.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(random_state=42)),
])
xgb_pipeline.fit(X_train, y_train)

# Pull the fitted steps back out of the pipeline.
xgb_model = xgb_pipeline.named_steps['xgb']
feature_names = xgb_pipeline.named_steps['preprocessor'].get_feature_names_out()
feature_importances = xgb_model.feature_importances_

# Rank transformed features from most to least important and keep the first 20.
sorted_idx = np.flip(np.argsort(feature_importances))
top_20_idx = sorted_idx[:20]
top_20_features = feature_names[top_20_idx]

def get_original_columns(features, feature_names):
    """Map transformed feature names back to the source columns they came from.

    Each entry of ``features`` is expected to look like
    ``'<transformer>__<column>'`` or, for one-hot encoded columns,
    ``'<transformer>__<column>_<category>'``. The transformer prefix is
    stripped and the remainder is matched against the candidate columns,
    either exactly or as ``'<column>_'`` followed by a category suffix.

    When several candidates match (e.g. ``'HIPERTENSION'`` and
    ``'HIPERTENSION_FAMILIAR'``), the longest one wins. This is a heuristic:
    it assumes a category value does not itself reproduce the tail of a
    longer candidate name.

    Parameters
    ----------
    features : iterable of str
        Transformed feature names (e.g. the top-N names taken from
        ``get_feature_names_out()``).
    feature_names : iterable of str
        Candidate original column names to look for.

    Returns
    -------
    list of str
        The candidates that at least one transformed feature maps to, without
        duplicates, in the order they appear in ``feature_names``.
    """
    # Longest candidates first so a longer column name is preferred over a
    # shorter one that happens to be its prefix.
    candidates = sorted(set(feature_names), key=len, reverse=True)

    matched = set()
    for feature in features:
        # Only the first '__' separates the transformer prefix from the column
        # part; a name without any prefix is used as-is.
        transformed_name = str(feature).split('__', 1)[-1]
        for column in candidates:
            if transformed_name == column or transformed_name.startswith(column + '_'):
                matched.add(column)
                break

    # dict.fromkeys de-duplicates while keeping the caller's column order, so
    # the result is stable across runs (unlike iterating over a set).
    return [column for column in dict.fromkeys(feature_names) if column in matched]


# For each column group, keep only the source columns behind the top-20 features.
selected_numeric_columns, selected_ordinal_columns, selected_nominal_columns = (
    get_original_columns(top_20_features, column_group)
    for column_group in (numeric_columns, ordinal_columns, nominal_columns)
)

# Same three branches as the full preprocessor, restricted to the selected columns.
reduced_preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selected_numeric_columns),
    ('ord', ordinal_transformer, selected_ordinal_columns),
    ('nom', nominal_transformer, selected_nominal_columns),
])

## Random Forest optimization
- Best parameters found: {'target': 0.82463421122639, 'params': {'max_depth': 25.0, 'max_features': 0.2, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_components_pca': 6.0, 'n_estimators': 562.6480841236934}}

In [82]:
# from sklearn.model_selection import StratifiedKFold
# # Define the Random Forest evaluation function using ROC AUC as the metric
# def rf_evaluate(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, n_components_pca):
#     # Create a complete pipeline: Preprocessing + RandomForest
#     model_pipeline = Pipeline([
#         # Include the preprocessing pipeline
#         ('preprocessor', reduced_preprocessor),
#         ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA after preprocessing
#         ('rf', RandomForestClassifier(n_estimators=int(n_estimators),
#                                       max_depth=int(max_depth),
#                                       min_samples_split=int(min_samples_split),
#                                       min_samples_leaf=int(min_samples_leaf),
#                                       max_features=max_features,
#                                       random_state=42))  # Random Forest with hyperparameters
#     ])

#     stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     # accuracy_scores = cross_val_score(
#     #     model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='accuracy', n_jobs=-1)
#     roc_auc_scores = cross_val_score(
#         model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1
#     )

#     return roc_auc_scores.mean()

# pbounds = {
#     'n_estimators': (400, 600),
#     'max_depth': (25, 35),
#     'min_samples_split': (2, 8),
#     'min_samples_leaf': (1, 8),
#     'max_features': (0.1, 0.2),
#     'n_components_pca': (1, 5)
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=rf_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2  # Verbose to see progress
# )

# # Run the optimization
# # 10 random points first, then 30 iterations of optimization
# optimizer.maximize(init_points=10, n_iter=30)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)

In [83]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# import pandas as pd

# # Best parameters from Bayesian Optimization
# best_params = {
#     'max_depth': int(25.452039720948832),
#     'max_features': 0.2,
#     'min_samples_leaf': int(1.0),
#     'min_samples_split': int(2.0),
#     'n_components_pca': int(5.0),
#     'n_estimators': int(552.5290750051523)
# }

# # Create the final model pipeline with the best parameters
# model_pipeline = Pipeline([
#     ('preprocessor', reduced_preprocessor),  # Use your preprocessor from before
#     ('pca', PCA(n_components=best_params['n_components_pca'])),  # PCA with the best component number
#     ('rf', RandomForestClassifier(
#         n_estimators=best_params['n_estimators'],
#         max_depth=best_params['max_depth'],
#         min_samples_split=best_params['min_samples_split'],
#         min_samples_leaf=best_params['min_samples_leaf'],
#         max_features=best_params['max_features'],
#         random_state=42
#     ))
# ])

# # Fit the model on the training data
# model_pipeline.fit(X_train, y_train)

# # Make predictions on the test set
# predictions = model_pipeline.predict(df_test)

# # Step 8: Create submission file
# final_df = pd.DataFrame({
#     'ID': df_test.index,  # Assuming df_test has the ID as index or column
#     'Target': predictions
# })

# # Save the submission
# submission_file = 'submission_rf.csv'
# final_df.to_csv(submission_file, index=False)
# print(f"Submission file {submission_file} created successfully.")

## KNN optimization
- Best parameters found so far: {'target': 0.7156158552806597, 'params': {'n_components_pca': 5.0, 'n_neighbors': 50.0, 'p': 2.0}}

In [84]:
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from bayes_opt import BayesianOptimization
# from sklearn.preprocessing import StandardScaler


# # Define the KNN evaluation function using accuracy as the metric
# def knn_evaluate(n_neighbors, p, n_components_pca):
#     # Create a complete pipeline: Preprocessing + KNN
#     model_pipeline = Pipeline([
#         ('preprocessor', reduced_preprocessor),
#         ('pca', PCA(n_components=int(n_components_pca))),
#         ('knn', KNeighborsClassifier(n_neighbors=int(n_neighbors), p=int(p), n_jobs=-1))
#     ])

#     stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     # Cross-validation
#     accuracy_scores = cross_val_score(
#         model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='accuracy', n_jobs=-1)

#     return accuracy_scores.mean()

# # Define parameter bounds for Bayesian Optimization
# pbounds = {
#     'n_neighbors': (3, 50),  # KNN neighbors range
#     'p': (1, 2),  # Distance metric (1: Manhattan, 2: Euclidean)
#     'n_components_pca': (2, 5)  # PCA components range
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=knn_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2  # Verbose to see progress
# )

# # Run the optimization
# # 10 random points first, then 30 iterations of optimization
# optimizer.maximize(init_points=10, n_iter=30)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)


## SVM

In [None]:
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.svm import SVC
# from sklearn.decomposition import PCA
# from sklearn.pipeline import Pipeline
# from bayes_opt import BayesianOptimization

# # Define the SVM evaluation function using ROC AUC as the metric
# def svm_evaluate(C, gamma, kernel, n_components_pca):
#     # Convert kernel index to a valid kernel string
#     kernel_options = ['linear', 'rbf', 'poly']
#     kernel = kernel_options[int(kernel)]

#     # Create a complete pipeline: Preprocessing + PCA + SVM
#     model_pipeline = Pipeline([
#         ('preprocessor', reduced_preprocessor),
#         ('pca', PCA(n_components=int(n_components_pca))),  # Add PCA after preprocessing
#         ('svm', SVC(C=C, gamma=gamma, kernel=kernel, random_state=42, probability=True))
#     ])

#     stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     # Use ROC AUC as the evaluation metric
#     roc_auc_scores = cross_val_score(
#         model_pipeline, X_train, y_train, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1
#     )

#     return roc_auc_scores.mean()

# # Parameter bounds for SVM optimization
# pbounds = {
#     'C': (0.1, 10),                # Regularization parameter
#     'gamma': (0.0001, 1),          # Kernel coefficient
#     'kernel': (0, 2),              # Kernel type (0: linear, 1: rbf, 2: poly)
#     'n_components_pca': (1, 5)     # PCA components
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=svm_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2  # Verbose to see progress
# )

# # Run the optimization
# # 10 random points first, then 30 iterations of optimization
# optimizer.maximize(init_points=10, n_iter=30)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)


In [None]:
# from sklearn.svm import SVC
# from sklearn.pipeline import Pipeline
# from sklearn.decomposition import PCA
# import pandas as pd

# # Best parameters from Bayesian Optimization for SVM
# best_svm_params = {
#     'C': 9.69511928814146,
#     'gamma': 0.804355672916069,
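#     # NOTE(review): svm_evaluate picks the kernel with int(kernel) (truncation), so 0.95 was
#     # scored as 'linear' during optimization, while round() here selects 'rbf' — confirm which is intended.
#     'kernel': round(0.9545838701817976),  # Convert to nearest integer for kernel selection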
#     'n_components_pca': int(4.640990598805696)  # Convert to integer for PCA
# }

# # Map the kernel index to the actual kernel type
# kernel_options = ['linear', 'rbf', 'poly']
# kernel = kernel_options[best_svm_params['kernel']]

# # Create the SVM model pipeline with the best parameters
# model_pipeline = Pipeline([
#     ('preprocessor', reduced_preprocessor),  # Use your preprocessor from before
#     ('pca', PCA(n_components=best_svm_params['n_components_pca'])),  # PCA with the best component number
#     ('svm', SVC(
#         C=best_svm_params['C'],
#         gamma=best_svm_params['gamma'],
#         kernel=kernel,
#         random_state=42,
#         probability=True  # To enable probability estimates for ROC AUC
#     ))
# ])

# # Fit the model on the training data
# model_pipeline.fit(X_train, y_train)

# # Make predictions on the test set (probabilities for ROC AUC)
# predictions_proba = model_pipeline.predict_proba(df_test)[:, 1]  # Get the probability for the positive class

# # Prepare the submission dataframe (replace 'Id' with the actual ID column from your test set)
# # Step 8: Create submission file
# final_df = pd.DataFrame({
#     'ID': df_test.index,  # Assuming df_test has the ID as index or column
#     'Target': predictions_proba
# })

# # Save the submission
# submission_file = 'submission.csv'
# final_df.to_csv(submission_file, index=False)
# print(f"Submission file {submission_file} created successfully.")

## XG-Boost

# Submission

In [86]:
# # Step 8: Create submission file
# final_df = pd.DataFrame({
#     'ID': df_test.index,  # Assuming df_test has the ID as index or column
#     'Target': y_test_predictions
# })

# # Save the submission
# submission_file = 'submission.csv'
# final_df.to_csv(submission_file, index=False)
# print(f"Submission file {submission_file} created successfully.")