# Prostate Cancer Workshop

## Initial analysis

### Imports

In [29]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import skew, kurtosis
from IPython.display import display

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from feature_engine.outliers import Winsorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from bayes_opt import BayesianOptimization
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from xgboost import XGBClassifier


### Loading Data

In [None]:
# Read the train and test parquet files (in that order) into pandas frames.
df_train, df_test = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in ('data/df_train.parquet', 'data/df_test.parquet')
)

# Displayed as the cell output: (n_rows, n_columns) of the training frame.
df_train.shape

In [None]:
# Preview the first rows of the training frame loaded above.
df_train.head()

In [32]:
# Columns treated as numeric. Each name appears exactly once: the original list
# repeated 'psa_max_gr_flia' and 'psa_min_gr_flia', which produces duplicated
# columns whenever the list is used as a column selector.
# NOTE(review): 'Target' (the label) is listed here; the preprocessing cell only
# keeps it out of the model inputs because 'Target' is also in features_to_drop.
numeric_columns = [
    'Cant_gr_flia',
    'Cant_riesgos_flia_mean',
    'cantidad_serv_flia',
    'CANTIDAD_SERVICIOS',
    'conteo_dx_diferentes',
    'EDAD',
    'psa_max_gr_flia',
    'psa_min_gr_flia',
    'Pendiente',
    'Pendiente_flia',
    'Promedio_costo',
    'Promedio_costo_flia',
    'MEDICAMENTOS',
    'MEDICINA ESPECIALIZADA',
    'MEDICINA GENERAL',
    'TIEMPO_AFILIACION',
    'TIEMPO_ULTIMA_CITA',
    'PERDIDA_DE_PESO',
    'Intercepto',
    'Intercepto_flia',
    'Target',
    'Cant_Fliar_CP',
    'Cant_Fliar_riesgos',
]

# Columns treated as categorical (split into ordinal/nominal in the next cell).
categorical_columns = [
    'AGRUPACION_DIASTOLICA',
    'AGRUPACION_SISTOLICA',
    'CANCER_MAMA_FAMILIAR',
    'CANCER_OTRO_SITIO',
    'CORONARIOS',
    'CANCER_OTRO_SITIO_FAMILIAR',
    'CORONARIOS_FAMILIAR',
    'CEREBRAL',
    'CEREBRAL_FAMILIAR',
    'DIABETES',
    'DIABETES_FAMILIAR',
    'ENFERMEDAD_RENAL',
    'ENFERMEDAD_RENAL_FAMILIAR',
    'HIPERTENSION',
    'HIPERTENSION_FAMILIAR',
    'OTROS_ANTECEDENTES_VASCULARES',
    'RIESGOS',
    'ESTADO_CIVI',
    'estrato',
    'parentesco',
    'PROGRAMA',
]

In [33]:
# Categorical columns that are ordinal-encoded by the preprocessing pipeline.
ordinal_columns = [
    'AGRUPACION_DIASTOLICA', 'AGRUPACION_SISTOLICA',
    'HIPERTENSION', 'HIPERTENSION_FAMILIAR',
    'RIESGOS',
    'estrato',
]

# Categorical columns that are one-hot encoded by the preprocessing pipeline.
nominal_columns = [
    # personal / family clinical history flags
    'CANCER_MAMA_FAMILIAR',
    'CANCER_OTRO_SITIO', 'CORONARIOS',
    'CANCER_OTRO_SITIO_FAMILIAR', 'CORONARIOS_FAMILIAR',
    'CEREBRAL', 'CEREBRAL_FAMILIAR',
    'DIABETES', 'DIABETES_FAMILIAR',
    'ENFERMEDAD_RENAL', 'ENFERMEDAD_RENAL_FAMILIAR',
    'OTROS_ANTECEDENTES_VASCULARES',
    # remaining nominal attributes
    'ESTADO_CIVI', 'parentesco', 'PROGRAMA',
]



### Feature Importance

In [None]:
# Work on a re-typed copy of the training frame: every ordinal/nominal column
# plus 'IMC' becomes a pandas 'category', matching enable_categorical=True below.
df_encoded = df_train.astype(
    {name: 'category' for name in ordinal_columns + nominal_columns + ['IMC']}
)

# Separate the label from the features and hold out 20% of the rows.
y = df_encoded['Target']
X = df_encoded.drop(columns=['Target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an XGBoost classifier with otherwise default hyperparameters.
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', enable_categorical=True)
model.fit(X_train, y_train)

# One importance value per training column, most important first.
importances = model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart; the y axis is inverted so the top feature is drawn first.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='teal')
ax.set_xlabel('Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importance using XGBoost')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [35]:
# Reproducible 20% random sample of the training rows (fixed random_state).
X_train_small = X_train.sample(frac=0.2, random_state=42)
# Labels for exactly the sampled rows, looked up by index label so X and y stay aligned.
y_train_small = y_train.loc[X_train_small.index] 

In [36]:
# Columns excluded from the feature lists used by the preprocessing pipeline.
# 'Target' is part of this list, so the label is filtered out of the
# updated_* column lists together with the discarded features.
features_to_drop = [
    'Cant_Fliar_riesgos',
    'Cant_Fliar_CP',
    'min_Tiempo_CP_Fliar',
    'psa_min_gr_flia',
    'psa_max_gr_flia',
    'CANCER_MAMA_FAMILIAR',
    'Target',
]

### Validate dropping features
- To decide whether to drop the features identified above, we train a preliminary model with and without them and compare the results.

In [None]:
def train_and_evaluate(X, y, test_size=0.2, random_state=42):
    """Fit an XGBoost classifier on a hold-out split and return its F1 score.

    Parameters
    ----------
    X : feature frame; categorical columns are expected to already have the
        pandas 'category' dtype because the model is built with
        ``enable_categorical=True``.
    y : labels aligned with ``X``.
    test_size : fraction of rows held out for evaluation (default 0.2, the
        value that was previously hard-coded).
    random_state : seed for the split (default 42, the value that was
        previously hard-coded).

    Returns
    -------
    The value of ``f1_score(y_test, y_pred)`` (default arguments) on the
    held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', enable_categorical=True)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred)

# The label is shared by both experiments.
y = df_encoded['Target']

# Two candidate feature sets: everything except the label, and the same frame
# with the columns listed in features_to_drop removed as well.
X_all_features = df_encoded.drop(columns=['Target'])
X_reduced_features = df_encoded.drop(columns=['Target'] + features_to_drop)

# Model 1: all features
f1_all_features = train_and_evaluate(X_all_features, y)
print(f"F1 Score with all features: {f1_all_features}")

# Model 2: reduced feature set
f1_reduced_features = train_and_evaluate(X_reduced_features, y)
print(f"F1 Score after dropping zero-importance features: {f1_reduced_features}")

After dropping the additional features `'Cant_Fliar_riesgos'`, `'Cant_Fliar_CP'`, `'min_Tiempo_CP_Fliar'`, `'psa_min_gr_flia'`, `'psa_max_gr_flia'`, and `'CANCER_MAMA_FAMILIAR'`, the model's performance improved: the F1 score increased from **0.5207** (with all features) to **0.5384** (after removing these features). This suggests that excluding zero-importance features and those with minimal predictive power reduces noise and can help the model generalize better, showcasing the benefits of feature selection in machine learning. Note that both scores come from a single 80/20 hold-out split (`random_state=42`), so the difference should be confirmed with cross-validation before it is treated as conclusive.

## Preprocessing Pipeline

In [38]:
# Filter each column group against features_to_drop, keeping the original order.
updated_numeric_columns, updated_ordinal_columns, updated_nominal_columns = (
    [name for name in group if name not in features_to_drop]
    for group in (numeric_columns, ordinal_columns, nominal_columns)
)

In [39]:
def add_new_features(df):
    """Add feature-engineered columns to ``df`` and return it.

    The new columns are assigned directly on ``df`` (the frame is modified in
    place and the same object is returned); pass a copy when the original must
    stay untouched.

    Columns added: ``AgeGroup``, ``ServiceUsageGroup``, ``HealthRiskScore``,
    ``FamilyHistoryRisk``, ``BMI_Age_Interaction``, ``log_Pendiente``,
    ``log_Intercepto``, ``log_Promedio_costo``, ``RecentInteraction``,
    ``ServiceIntensity``, ``FamilyServiceRatio`` and ``FamilyRiskRatio``.

    Raises ``KeyError`` if any of the source columns read below is missing.
    """
    # Age binning. include_lowest keeps an exact 0 inside the first bin and the
    # open-ended last edge keeps values above 100 from turning into NaN.
    df['AgeGroup'] = pd.cut(
        df['EDAD'],
        bins=[0, 30, 50, 70, np.inf],
        labels=['Young', 'Middle-aged', 'Senior', 'Elderly'],
        include_lowest=True,
    )

    # Service usage grouping: 0 services counts as 'Low', more than 50 as 'High'.
    df['ServiceUsageGroup'] = pd.cut(
        df['CANTIDAD_SERVICIOS'],
        bins=[0, 5, 15, np.inf],
        labels=['Low', 'Medium', 'High'],
        include_lowest=True,
    )

    # Health risk score: row-wise sum of the personal-condition columns.
    # NOTE(review): assumes these columns hold numeric flags -- confirm.
    df['HealthRiskScore'] = df[['HIPERTENSION', 'CEREBRAL', 'DIABETES', 'ENFERMEDAD_RENAL']].sum(axis=1)

    # Family history score: row-wise sum of the family-condition columns.
    df['FamilyHistoryRisk'] = df[['CANCER_OTRO_SITIO_FAMILIAR', 'CORONARIOS_FAMILIAR', 'CEREBRAL_FAMILIAR', 'DIABETES_FAMILIAR']].sum(axis=1)

    # BMI and age interaction.
    # NOTE(review): assumes 'IMC' is numeric here; the feature-importance cell
    # casts 'IMC' to category on its own copy -- confirm the raw dtype.
    df['BMI_Age_Interaction'] = df['IMC'] * df['EDAD']

    # Log transformations for skewed features.
    # NOTE(review): log1p yields NaN/-inf for values <= -1; confirm these
    # columns cannot go that low.
    df['log_Pendiente'] = np.log1p(df['Pendiente'])
    df['log_Intercepto'] = np.log1p(df['Intercepto'])
    df['log_Promedio_costo'] = np.log1p(df['Promedio_costo'])

    # Recency of medical interactions: 0 counts as 'Very Recent', anything
    # beyond 365 as 'Old'.
    df['RecentInteraction'] = pd.cut(
        df['TIEMPO_ULTIMA_CITA'],
        bins=[0, 30, 90, np.inf],
        labels=['Very Recent', 'Recent', 'Old'],
        include_lowest=True,
    )

    # Service intensity. A zero denominator becomes NaN so the ratio is missing
    # rather than +/-inf.
    df['ServiceIntensity'] = df['CANTIDAD_SERVICIOS'] / df['TIEMPO_AFILIACION'].replace(0, np.nan)

    # Family service ratio (the +1 keeps the denominator away from 0 for
    # non-negative service counts).
    df['FamilyServiceRatio'] = df['cantidad_serv_flia'] / (df['CANTIDAD_SERVICIOS'] + 1)

    # Family risk ratio, with the same zero-denominator handling as above.
    df['FamilyRiskRatio'] = df['Cant_Fliar_riesgos'] / df['Cant_gr_flia'].replace(0, np.nan)

    return df

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin

In [41]:
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that runs ``add_new_features`` inside a pipeline."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added."""
        # Copy first so the caller's frame is not modified by add_new_features.
        working_copy = X.copy()
        return add_new_features(working_copy)

In [42]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PolynomialFeatures

# Numeric features: mean imputation, quantile-based capping of the right tail
# (fold=0.05), then standardization.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('winsorizer', Winsorizer(capping_method='quantiles', tail='right', fold=0.05)),
    ('scaler', StandardScaler()),
    # Disabled experiment:
    # ('poly', PolynomialFeatures(degree=2, interaction_only=True)),
])

# Ordinal features: most-frequent imputation, then integer codes; categories
# unseen during fit are encoded as -1.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Nominal features: most-frequent imputation, then one-hot encoding that
# ignores categories unseen during fit.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
# NOTE(review): only the updated_* groups are listed here, so the columns
# created by the feature-engineering step do not appear to be selected --
# confirm whether that is intended.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, updated_numeric_columns),
    ('ord', ordinal_transformer, updated_ordinal_columns),
    ('nom', nominal_transformer, updated_nominal_columns),
])

# Full preprocessing pipeline. 'drop_columns' is a 'passthrough' placeholder
# step that leaves the data unchanged.
pipeline = Pipeline([
    ('feature_engineering', FeatureEngineeringTransformer()),
    ('drop_columns', 'passthrough'),
    ('preprocessor', preprocessor),
])

#### Applying preprocessor pipeline
- Imputation and dropping

In [43]:
# X = df_train.drop(columns=features_to_drop)
# y = df_train['Target']

# pipeline.fit(X)
# X_train_transformed = pipeline.transform(X)


#### Converting the pipeline output into a readable data frame

In [44]:
# transformed_columns = (
#     updated_numeric_columns + 
#     updated_ordinal_columns + 
#     list(pipeline.named_steps['preprocessor'].transformers_[2][1]['onehot'].get_feature_names_out(updated_nominal_columns))
# )

# X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=transformed_columns)
# X_train_transformed_df

<span style="color:red">TODO: review the outlier count!</span>

In [45]:
# def calculate_iqr(df, numeric_columns):
#     """
#     This function takes a dataframe and returns a dataframe that contains 
#     the Interquartile Range (IQR) for each numeric column in the dataframe.
    
#     Parameters:
#     df (pd.DataFrame): Input dataframe
    
#     Returns:
#     pd.DataFrame: Dataframe containing IQR values for each numeric column
#     """
#     # Select numeric columns from the dataframe
#     df_numeric_columns = df[numeric_columns]
    
#     # Calculate Q1 (25th percentile) and Q3 (75th percentile) for each numeric column
#     Q1 = df_numeric_columns.quantile(0.25)
#     Q3 = df_numeric_columns.quantile(0.75)
    
#     # Calculate the Interquartile Range (IQR)
#     IQR = Q3 - Q1
    
#     # Create a dataframe to store the IQR values
#     iqr_df = pd.DataFrame({
#         'Column': IQR.index,
#         'IQR': IQR.values
#     }).sort_values(by='IQR', ascending=False)
    
#     return iqr_df


In [46]:
# iqr_result = calculate_iqr(df_train, numeric_columns)
# iqr_result

In [47]:
# iqr_result_after = calculate_iqr(X_train_transformed_df, updated_numeric_columns)
# print(iqr_result_after)

### PCA

## Bayesian Optimization

In [48]:
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from bayes_opt import BayesianOptimization
from bayes_opt import UtilityFunction


In [49]:
# X_train = X_train_small
# y_train = y_train_small

# Bayesian optimization again, but with accuracy

In [50]:
# # Updated preprocessing pipeline
# # Make sure you have the correct categorization of columns: numeric, ordinal, and nominal



# # Numeric columns: Winsorizing and scaling
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('winsorizer', Winsorizer(capping_method='quantiles', tail='right', fold=0.05)),
#     ('scaler', StandardScaler()),
# ])

# # Ordinal columns: Impute and encode
# ordinal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
# ])

# # Nominal columns: Impute and one-hot encode
# nominal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Added dense output for compatibility
# ])

# # Preprocessor to handle different types of columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, updated_numeric_columns),  # Process numeric columns
#         ('ord', ordinal_transformer, updated_ordinal_columns),  # Process ordinal columns
#         ('nom', nominal_transformer, updated_nominal_columns)   # Process nominal columns
#     ]
# )

# # Ensure that the preprocessing pipeline is used before PCA in your pipeline
# n_components_pca = 10  # Adjust based on your dataset

# # Define the SVM evaluation function using accuracy as the metric
# def svm_evaluate(C, gamma, kernel_choice):
#     kernel = 'linear' if kernel_choice < 0.5 else 'rbf'
    
#     # Create a complete pipeline: Preprocessing + PCA + SVM
#     model_pipeline = Pipeline([
#         ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#         ('pca', PCA(n_components=n_components_pca, random_state=42)),  # Add PCA after preprocessing
#         ('svm', SVC(C=C, gamma=gamma, kernel=kernel, probability=True))  # SVM with hyperparameters
#     ])
    
#     # Perform K-fold cross-validation and return mean accuracy score
#     accuracy_scores = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='accuracy', verbose=0)
    
#     return accuracy_scores.mean()

# # Define the parameter bounds for Bayesian Optimization
# pbounds = {
#     'C': (0.1, 10),        # Regularization parameter
#     'gamma': (0.5, 2),     # Kernel coefficient for 'rbf'
#     'kernel_choice': (0, 1)  # 0 for 'linear', 1 for 'rbf'
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=svm_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2
# )

# # Run the optimization without the progress bar
# optimizer.maximize(init_points=5, n_iter=10)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)

# # Train the final SVM model with the best parameters
# C_opt = best_params['params']['C']
# gamma_opt = best_params['params']['gamma']
# kernel_opt = 'linear' if best_params['params']['kernel_choice'] < 0.5 else 'rbf'

# # Final pipeline with best hyperparameters
# best_svm_model = Pipeline([
#     ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#     ('pca', PCA(n_components=n_components_pca, random_state=42)),  # Add PCA
#     ('svm', SVC(C=C_opt, gamma=gamma_opt, kernel=kernel_opt, probability=True))  # Best SVM model
# ])

# # Train the best model using the entire training dataset
# best_svm_model.fit(X_train, y_train)

# # Evaluate the model on the test set
# y_pred_test_proba = best_svm_model.predict(X_test)

# # Evaluate using accuracy score on the test set
# test_accuracy = accuracy_score(y_test, y_pred_test_proba)

# print(f"Test Accuracy: {test_accuracy:.4f}")


In [51]:
from sklearn.utils.validation import check_array
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [52]:
# class LogTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, columns):
#         self.columns = columns  # Can be column names for DataFrame or indices for NumPy arrays
    
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         X = X.copy()
#         if isinstance(X, pd.DataFrame):
#             # If X is a DataFrame, apply log transformation using column names
#             for col in self.columns:
#                 if col in X.columns:
#                     X[col] = np.log1p(X[col])  # log1p handles log(0) cases by doing log(1 + x)
#         elif isinstance(X, np.ndarray):
#             # If X is a NumPy array, apply log transformation using column indices
#             for col_idx in self.columns:
#                 if isinstance(col_idx, int) and col_idx < X.shape[1]:
#                     X[:, col_idx] = np.log1p(X[:, col_idx])  # log1p handles log(0) cases
#         else:
#             raise ValueError("Unsupported data format. Expected DataFrame or NumPy array.")
#         return X

In [53]:
# class HandleInfValues(BaseEstimator, TransformerMixin):
#     def __init__(self, replace_with=np.nan):
#         self.replace_with = replace_with
    
#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X):
#         # Convert to numpy array if not already
#         X = check_array(X, force_all_finite='allow-nan', dtype=np.float64)
#         # Replace infinity or too large values with np.nan or other specified values
#         X[np.isinf(X)] = self.replace_with
#         X[X > 1e10] = self.replace_with  # You can adjust this threshold
#         return X

In [54]:
# # Define the preprocessing pipeline components
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('winsorizer', Winsorizer(capping_method='quantiles', tail='right', fold=0.05)),
#     ('scaler', StandardScaler()),
# ])

# ordinal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
# ])

# nominal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Added dense output for compatibility
# ])

# # Preprocessor to handle different types of columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, updated_numeric_columns),  # Process numeric columns
#         ('ord', ordinal_transformer, updated_ordinal_columns),  # Process ordinal columns
#         ('nom', nominal_transformer, updated_nominal_columns)   # Process nominal columns
#     ]
# )

# # Define the SVM evaluation function using accuracy as the metric and adding PCA components as a hyperparameter
# def svm_evaluate(C, gamma, kernel_choice, pca_components):
#     # Map kernel_choice to the actual kernel
#     kernel_options = ['linear', 'rbf', 'sigmoid']
#     kernel = kernel_options[int(kernel_choice)]
    
#     # Create a complete pipeline: Preprocessing + PCA + SVM
#     model_pipeline = Pipeline([
#         ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#         ('pca', PCA(n_components=int(pca_components), random_state=42)),  # Add PCA with the given number of components
#         ('svm', SVC(C=C, gamma=gamma, kernel=kernel, probability=True))  # SVM with hyperparameters
#     ])
    
#     # Perform K-fold cross-validation and return mean accuracy score
#     accuracy_scores = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='accuracy', verbose=0)
    
#     return accuracy_scores.mean()

# # Define the parameter bounds for Bayesian Optimization, including more kernel options and PCA components
# pbounds = {
#     'C': (0.1, 10),        # Regularization parameter
#     'gamma': (0.5, 2),     # Kernel coefficient for 'rbf'
#     'kernel_choice': (0, 2),  # 0 for 'linear', 1 for 'rbf', 2 for 'sigmoid'
#     'pca_components': (2, min(len(X_train.columns), 20))  # PCA components: between 2 and 20 or total features
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=svm_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2
# )

# # Run the optimization without the progress bar
# optimizer.maximize(init_points=5, n_iter=10)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)

# # Train the final SVM model with the best parameters
# C_opt = best_params['params']['C']
# gamma_opt = best_params['params']['gamma']
# kernel_opt = ['linear', 'rbf', 'sigmoid'][int(best_params['params']['kernel_choice'])]  # same list as kernel_options in svm_evaluate
# pca_opt = int(best_params['params']['pca_components'])
# # Final pipeline with best hyperparameters
# best_svm_model = Pipeline([
#     ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#     ('pca', PCA(n_components=pca_opt, random_state=42)),  # Add PCA with optimal components
#     ('svm', SVC(C=C_opt, gamma=gamma_opt, kernel=kernel_opt, probability=True))  # Best SVM model
# ])

# # Train the best model using the entire training dataset
# best_svm_model.fit(X_train, y_train)

# # Evaluate the model on the test set
# y_pred_test = best_svm_model.predict(X_test)

# test_accuracy = accuracy_score(y_test, y_pred_test)

# print(f"Test Accuracy: {test_accuracy:.4f}")

In [55]:
# from sklearn.svm import SVC
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import accuracy_score
# from bayes_opt import BayesianOptimization
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
# from sklearn.decomposition import PCA
# from sklearn.impute import SimpleImputer
# import numpy as np

# # Define the preprocessing pipeline components
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('winsorizer', Winsorizer(capping_method='quantiles', tail='right', fold=0.05)),
#     ('scaler', StandardScaler()),
# ])

# ordinal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
# ])

# nominal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Added dense output for compatibility
# ])

# # Preprocessor to handle different types of columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, updated_numeric_columns),  # Process numeric columns
#         ('ord', ordinal_transformer, updated_ordinal_columns),  # Process ordinal columns
#         ('nom', nominal_transformer, updated_nominal_columns)   # Process nominal columns
#     ]
# )

# # Define the SVM evaluation function using accuracy as the metric and adding PCA components as a hyperparameter
# def svm_evaluate(C, gamma, kernel_choice, pca_components, apply_pca):
#     # Map kernel_choice to the actual kernel
#     kernel_options = ['linear', 'rbf', 'poly', 'sigmoid']
#     kernel = kernel_options[int(kernel_choice)]
    
#     # Create a complete pipeline: Preprocessing + Optional PCA + SVM
#     steps = [('preprocessor', preprocessor)]
    
#     if apply_pca > 0.5:  # Add PCA if apply_pca is "true"
#         steps.append(('pca', PCA(n_components=int(pca_components), random_state=42)))
    
#     steps.append(('svm', SVC(C=C, gamma=gamma, kernel=kernel, probability=True)))  # SVM with hyperparameters
    
#     model_pipeline = Pipeline(steps)
    
#     # Perform K-fold cross-validation and return mean accuracy score
#     accuracy_scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='accuracy', verbose=0)
    
#     return accuracy_scores.mean()

# # Define the parameter bounds for Bayesian Optimization, including more kernel options and PCA components
# pbounds = {
#     'C': (0.001, 50),       # Wider range for regularization parameter
#     'gamma': (1e-4, 3),       # Broaden gamma range for better exploration
#     'kernel_choice': (0, 3),  # 0 for 'linear', 1 for 'rbf', 2 for 'poly', 3 for 'sigmoid'
#     'pca_components': (2, min(len(X_train.columns), 20)),  # PCA components: between 2 and 20 or total features
#     'apply_pca': (0, 1)  # Binary choice to apply PCA or not
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=svm_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2
# )

# # Run the optimization without the progress bar
# optimizer.maximize(init_points=2, n_iter=3)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)

# # Train the final SVM model with the best parameters
# C_opt = best_params['params']['C']
# gamma_opt = best_params['params']['gamma']
# kernel_opt = ['linear', 'rbf', 'poly', 'sigmoid'][int(best_params['params']['kernel_choice'])]
# pca_opt = int(best_params['params']['pca_components'])
# apply_pca_opt = best_params['params']['apply_pca']

# # Final pipeline with best hyperparameters
# steps = [('preprocessor', preprocessor)]

# if apply_pca_opt > 0.5:
#     steps.append(('pca', PCA(n_components=pca_opt, random_state=42)))  # Add PCA if optimal

# steps.append(('svm', SVC(C=C_opt, gamma=gamma_opt, kernel=kernel_opt, probability=True)))  # Best SVM model

# best_svm_model = Pipeline(steps)

# # Train the best model using the entire training dataset
# best_svm_model.fit(X_train, y_train)

# # Evaluate the model on the test set
# y_pred_test = best_svm_model.predict(X_test)

# test_accuracy = accuracy_score(y_test, y_pred_test)

# print(f"Test Accuracy: {test_accuracy:.4f}")


## Testing models

In [56]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from sklearn.svm import SVC
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import accuracy_score
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.decomposition import PCA
# import numpy as np
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.linear_model import RidgeClassifier
# from sklearn.linear_model import SGDClassifier

# # Preprocessing pipeline remains the same
# # Define your preprocessing steps
# # Updated numeric transformer: Adding PowerTransformer after Winsorization
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('winsorizer', Winsorizer(capping_method='quantiles', tail='right', fold=0.05)),
#     ('scaler', StandardScaler()),
#     ('power_transform', PowerTransformer(method='yeo-johnson'))  # Transforms to stabilize variance
# ])

# # Ordinal transformer with option for target encoding
# ordinal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
# ])

# # Nominal transformer with OneHotEncoder (could use TargetEncoder if needed)
# nominal_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Can replace with TargetEncoder if high-dimension
# ])

# # Combine transformations using ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, updated_numeric_columns),
#         ('ord', ordinal_transformer, updated_ordinal_columns),
#         ('nom', nominal_transformer, updated_nominal_columns)
#     ]
# )

# # Function to evaluate a model using cross-validation
# def evaluate_model(model, X_train, y_train):
#     model_pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('pca', PCA(n_components=5, random_state=42)),  # Optional PCA step
#         ('classifier', model)  # Insert the classifier model here
#     ])
#     # Perform cross-validation and return accuracy
#     scores = cross_val_score(model_pipeline, X_train, y_train, cv=15, scoring='accuracy')
#     return scores.mean()

# # Models to evaluate
# models = {
#     'Logistic Regression': LogisticRegression(),
#     'Random Forest': RandomForestClassifier(),
#     'XGBoost': XGBClassifier(eval_metric='logloss'),
#     'SVM': SVC(),
#     'K-Nearest Neighbors': KNeighborsClassifier(),
#     'Decision Tree': DecisionTreeClassifier(),
#     'Gradient Boosting': GradientBoostingClassifier(),
#     'Ridge Classifier': RidgeClassifier(),
#     'SGD Classifier': SGDClassifier()
# }

# # Iterate through the models, evaluate each one
# best_score = 0
# best_model = None
# for model_name, model in models.items():
#     score = evaluate_model(model, X_train, y_train)
#     print(f"{model_name} Accuracy: {score:.4f}")
#     if score > best_score:
#         best_score = score
#         best_model = model_name

# # Output the best model
# print(f"\nBest model: {best_model} with accuracy: {best_score:.4f}")

# # After finding the best model, train it on the entire dataset and evaluate on test set
# best_model_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('pca', PCA(n_components=10, random_state=42)),
#     ('classifier', models[best_model])
# ])

# best_model_pipeline.fit(X_train, y_train)

# # Evaluate on test set
# y_pred_test = best_model_pipeline.predict(X_test)
# test_accuracy = accuracy_score(y_test, y_pred_test)

# print(f"Test Accuracy of {best_model}: {test_accuracy:.4f}")


## Optimize random forest

In [57]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score
# from bayes_opt import BayesianOptimization
# from sklearn.metrics import accuracy_score
# from sklearn.pipeline import Pipeline
# import numpy as np

# # Define the Random Forest evaluation function using accuracy as the metric
# def rf_evaluate(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
#     # Create the Random Forest model with hyperparameters passed from Bayesian optimization
#     model = RandomForestClassifier(
#         n_estimators=int(n_estimators),  # Number of trees
#         max_depth=int(max_depth),        # Maximum depth of the tree
#         min_samples_split=int(min_samples_split),  # Minimum samples required to split
#         min_samples_leaf=int(min_samples_leaf),    # Minimum samples required in a leaf
#         max_features=max_features,        # Number of features to consider for the best split
#         random_state=42,
#         n_jobs=-1  # Use all available cores
#     )
    
#     # Create a pipeline with preprocessor and RandomForestClassifier
#     model_pipeline = Pipeline([
#         ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#         ('rf', model)  # Random Forest with hyperparameters
#     ])
    
#     # Perform K-fold cross-validation and return mean accuracy score
#     accuracy_scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='accuracy', verbose=0)
    
#     return accuracy_scores.mean()

# # Define the parameter bounds for Bayesian Optimization
# pbounds = {
#     'n_estimators': (50, 500),            # Number of trees
#     'max_depth': (5, 50),                 # Maximum depth of the trees
#     'min_samples_split': (2, 20),         # Minimum number of samples to split a node
#     'min_samples_leaf': (1, 10),          # Minimum number of samples per leaf
#     'max_features': (0.1, 1.0)            # Number of features to consider for the best split
# }

# # Set up the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=rf_evaluate,
#     pbounds=pbounds,
#     random_state=42,
#     verbose=2
# )

# # Run the optimization
# optimizer.maximize(init_points=10, n_iter=25)

# # Output the best parameters
# best_params = optimizer.max
# print("Best parameters found:", best_params)

# # Step 3: Use the optimized parameters to create the best Random Forest model
# best_rf_model = Pipeline([
#     ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#     ('rf', RandomForestClassifier(
#         n_estimators=int(best_params['params']['n_estimators']),
#         max_depth=int(best_params['params']['max_depth']),
#         min_samples_split=int(best_params['params']['min_samples_split']),
#         min_samples_leaf=int(best_params['params']['min_samples_leaf']),
#         max_features=best_params['params']['max_features'],
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

# # Step 4: Train the best model using the entire training dataset
# best_rf_model.fit(X_train, y_train)

# # Step 5: Evaluate the optimized Random Forest model on the test set
# y_pred_test = best_rf_model.predict(X_test)

# # Calculate the accuracy on the test set
# test_accuracy = accuracy_score(y_test, y_pred_test)

# print(f"Test Accuracy of the optimized Random Forest model: {test_accuracy:.4f}")


## Using random forest to predict the test set

In [58]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import accuracy_score

# # Step 1: Best model obtained from the previous test is Random Forest
# # Use the pre-defined preprocessing pipeline (`preprocessor`)

# # Step 2: Recreate the pipeline using Random Forest with optimal hyperparameters (from your earlier Bayesian Optimization if applicable)
# best_rf_model = Pipeline([
#     ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#     ('rf', RandomForestClassifier(n_estimators=100, random_state=42))  # Adjust hyperparameters if you performed optimization
# ])

# # Step 3: Fit the Random Forest model on the full training data
# best_rf_model.fit(X_train, y_train)

# # Step 4: Transform the test data using the same preprocessor
# X_test_transformed = best_rf_model.named_steps['preprocessor'].transform(df_test)

# # Step 5: Make predictions on the df_test data
# y_test_predictions = best_rf_model.predict(df_test)


In [59]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import accuracy_score

# # Best parameters obtained from Bayesian Optimization
# best_params = {
#     'max_depth': 37,
#     'max_features': 0.1,
#     'min_samples_leaf': 1,
#     'min_samples_split': 2,
#     'n_estimators': 288
# }

# # Step 1: Use the optimized parameters to create the Random Forest model
# best_rf_model = Pipeline([
#     ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#     ('rf', RandomForestClassifier(
#         n_estimators=int(best_params['n_estimators']),
#         max_depth=int(best_params['max_depth']),
#         min_samples_split=int(best_params['min_samples_split']),
#         min_samples_leaf=int(best_params['min_samples_leaf']),
#         max_features=best_params['max_features'],
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

# # Step 2: Train the model on the training data
# best_rf_model.fit(X_train, y_train)

# # Step 3: Transform the df_test using the preprocessor
# X_test_transformed = best_rf_model.named_steps['preprocessor'].transform(df_test)

# # Step 4: Make predictions on df_test using the trained model
# y_test_predictions = best_rf_model.predict(df_test)

# # Step 5: Output the predictions
# print("Predictions on the df_test set:")
# print(y_test_predictions)



## Generate the submission format

In [60]:
# final_df = pd.DataFrame({
#     'ID': df_test.index,
# 	'Target': y_test_predictions
# })
# final_df

In [61]:
# # to csv
# final_df.to_csv('data/preditions_8.csv', index=False)

# XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from bayes_opt import BayesianOptimization
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# Preprocessing: one sub-pipeline per column family. The ColumnTransformer at
# the bottom routes each family of columns to its own sub-pipeline.

# Nominal columns: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen at fit time are ignored; the encoder emits a dense array.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Ordinal columns: fill gaps with the most frequent value, then map categories
# to integer codes. Categories unseen at fit time are encoded as -1.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Numeric columns: mean-impute, cap the right tail (quantile method, fold=0.05),
# then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('winsorizer', Winsorizer(capping_method='quantiles', tail='right', fold=0.05)),
    ('scaler', StandardScaler()),
])

# Output column order follows this list: numeric, then ordinal, then nominal.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, updated_numeric_columns),
    ('ord', ordinal_transformer, updated_ordinal_columns),
    ('nom', nominal_transformer, updated_nominal_columns),
])

# Objective function handed to the Bayesian optimiser (higher is better)
def xgb_evaluate(learning_rate, n_estimators, max_depth, reg_alpha, reg_lambda, subsample, colsample_bytree):
    """Score one XGBoost configuration by its mean 5-fold cross-validated accuracy.

    The optimiser proposes every hyperparameter as a float, so the two
    integer-valued ones (``n_estimators`` and ``max_depth``) are truncated
    with ``int`` before being passed to the classifier.

    Parameters
    ----------
    learning_rate : float
        Passed to ``XGBClassifier`` as ``learning_rate``.
    n_estimators : float
        Number of boosting rounds; truncated to an integer.
    max_depth : float
        Maximum tree depth; truncated to an integer.
    reg_alpha : float
        L1 regularisation strength.
    reg_lambda : float
        L2 regularisation strength.
    subsample : float
        Row subsample ratio.
    colsample_bytree : float
        Per-tree feature subsample ratio.

    Returns
    -------
    float
        Mean accuracy over the 5 cross-validation folds computed on the
        notebook-level ``X_train`` / ``y_train``.
    """
    classifier = XGBClassifier(
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    )

    # Preprocessing lives inside the pipeline, so cross-validation refits it on
    # each training fold.
    candidate = Pipeline([
        ('preprocessor', preprocessor),
        ('xgb', classifier),
    ])

    fold_scores = cross_val_score(candidate, X_train, y_train, scoring='accuracy', cv=5, verbose=0)
    return fold_scores.mean()

# Search space: one (lower, upper) pair per hyperparameter accepted by
# xgb_evaluate. PCA components are not part of this search.
pbounds = dict(
    learning_rate=(0.01, 0.1),      # small steps for gradual training
    n_estimators=(100, 1000),
    max_depth=(6, 12),              # kept shallow to limit overfitting
    reg_alpha=(0.01, 0.7),          # L1 range, chosen from earlier runs
    reg_lambda=(0.01, 0.7),         # L2 range
    subsample=(0.6, 0.9),           # row subsampling to limit overfitting
    colsample_bytree=(0.6, 0.9),    # per-tree feature subsampling
)

# Bayesian optimiser maximising the cross-validated accuracy returned by
# xgb_evaluate; seeded so the search is repeatable.
optimizer = BayesianOptimization(
    f=xgb_evaluate,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)

# 15 initial exploration points followed by 15 optimisation iterations
optimizer.maximize(n_iter=15, init_points=15)

# Best probe found by the optimiser; the code below reads its 'params' entry
best_params = optimizer.max
print("Best parameters found:", best_params)

# Unpack the winning hyperparameters. Float-valued ones are used as-is ...
learning_rate_opt, reg_alpha_opt, reg_lambda_opt, subsample_opt, colsample_bytree_opt = (
    best_params['params'][key]
    for key in ('learning_rate', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree')
)
# ... while the integer-valued ones are truncated, exactly as in xgb_evaluate.
n_estimators_opt, max_depth_opt = (
    int(best_params['params'][key])
    for key in ('n_estimators', 'max_depth')
)

# Final model: same preprocessing + XGBoost layout as the search objective,
# now configured with the best hyperparameters.
best_xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(
        learning_rate=learning_rate_opt,
        n_estimators=n_estimators_opt,
        max_depth=max_depth_opt,
        reg_alpha=reg_alpha_opt,                  # L1 regularisation
        reg_lambda=reg_lambda_opt,                # L2 regularisation
        subsample=subsample_opt,                  # row subsample ratio
        colsample_bytree=colsample_bytree_opt,    # feature subsample ratio
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    )),
])

# Fit the tuned pipeline on X_train / y_train, then predict X_test.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred_test = best_xgb_model.fit(X_train, y_train).predict(X_test)

# Accuracy of those predictions against y_test
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Display feature importances
# Pull the fitted XGBoost estimator out of the pipeline and rank its features
# from most to least important.
xgb_model = best_xgb_model.named_steps['xgb']  # Get the XGBoost model from the pipeline
importances = xgb_model.feature_importances_
sorted_indices = np.argsort(importances)[::-1]

# The ranking was previously computed but never shown. Build a small table and
# make it the cell's last expression so the notebook renders it.
# 'feature_index' is the column position in the preprocessor's output matrix
# (numeric block first, then ordinal, then the one-hot nominal columns).
feature_importance_ranking = pd.DataFrame({
    'feature_index': sorted_indices,
    'importance': importances[sorted_indices],
})
feature_importance_ranking.head(20)  # top 20 keeps the output readable


In [67]:
# best_params = optimizer.max
# print("Best parameters found:", best_params)

# # Extract the best parameters
# learning_rate_opt = best_params['params']['learning_rate']
# n_estimators_opt = int(best_params['params']['n_estimators'])
# max_depth_opt = int(best_params['params']['max_depth'])
# pca_opt = int(best_params['params']['pca_components'])
# reg_alpha_opt = best_params['params']['reg_alpha']
# reg_lambda_opt = best_params['params']['reg_lambda']
# subsample_opt = best_params['params']['subsample']
# colsample_bytree_opt = best_params['params']['colsample_bytree']

# # Final pipeline with best hyperparameters
# best_xgb_model = Pipeline([
#     ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#     ('pca', PCA(n_components=pca_opt, random_state=42)),  # Add PCA with optimal components
#     ('xgb', XGBClassifier(learning_rate=learning_rate_opt, 
#                           n_estimators=n_estimators_opt, 
#                           max_depth=max_depth_opt, 
#                           reg_alpha=reg_alpha_opt,  # L1 regularization
#                           reg_lambda=reg_lambda_opt,  # L2 regularization
#                           subsample=subsample_opt,  # Subsample ratio
#                           colsample_bytree=colsample_bytree_opt,  # Feature subsample ratio
#                           random_state=42, 
#                           use_label_encoder=False, 
#                           eval_metric='logloss'))  # Best XGBoost model
# ])

# # Train the best model using the entire training dataset
# best_xgb_model.fit(X_train, y_train)

# # Evaluate the model on the test set
# y_pred_test = best_xgb_model.predict(df_test)


In [68]:
# from sklearn.decomposition import PCA
# from xgboost import XGBClassifier
# from sklearn.pipeline import Pipeline

# # Use the best parameters obtained from Bayesian Optimization
# learning_rate_opt = 0.01
# max_depth_opt = int(20.0)
# n_estimators_opt = int(round(348.558))
# pca_components_opt = int(round(20.0))  # Round PCA components to the nearest integer

# # Recreate the pipeline with the best hyperparameters
# best_xgb_model = Pipeline([
#     ('preprocessor', preprocessor),  # Include the preprocessing pipeline
#     ('pca', PCA(n_components=pca_components_opt, random_state=42)),  # Add PCA with the optimal number of components
#     ('xgb', XGBClassifier(learning_rate=learning_rate_opt, 
#                           n_estimators=n_estimators_opt, 
#                           max_depth=max_depth_opt, 
#                           random_state=42, use_label_encoder=False, eval_metric='logloss'))  # XGBoost with best parameters
# ])

# # Fit the model on the training data
# best_xgb_model.fit(X_train, y_train)

# # Transform the test data using the preprocessor
# X_test_transformed = best_xgb_model.named_steps['preprocessor'].transform(df_test)

# y_test_predictions = best_xgb_model.predict(df_test)


In [69]:
# final_df = pd.DataFrame({
#     'ID': df_test.index,
# 	'Target': y_pred_test
# })
# final_df

In [70]:
# # to csv
# final_df.to_csv('data/preditions_3.csv', index=False)

In [71]:
# # Fit the final model
# best_svm_model.fit(X_train, y_train)

# # Evaluate the final model on the test set using ROC AUC
# y_pred_proba = best_svm_model.predict_proba(X_test)[:, 1]  # Probability estimates for the positive class
# test_roc_auc = roc_auc_score(y_test, y_pred_proba)
# print(f"Final ROC AUC Score on the test set: {test_roc_auc:.4f}")

Usar diferentes kernels