# Prostate Cancer Worshop

## Initial analysis

### Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import skew, kurtosis
from IPython.display import display

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from feature_engine.outliers import Winsorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from bayes_opt import BayesianOptimization
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
from xgboost import XGBClassifier


### LoadingData

In [2]:
df_train = pq.read_table('data/df_train.parquet').to_pandas()
df_test = pq.read_table('data/df_test.parquet').to_pandas()

df_train.shape

(23494, 46)

In [3]:
df_train.head()

Unnamed: 0_level_0,MEDICAMENTOS,MEDICINA ESPECIALIZADA,MEDICINA GENERAL,Cant_gr_flia,Cant_riesgos_flia_mean,min_Tiempo_CP_Fliar,Cant_Fliar_CP,psa_max_gr_flia,psa_min_gr_flia,Cant_Fliar_riesgos,...,CORONARIOS_FAMILIAR,CEREBRAL,CEREBRAL_FAMILIAR,ENFERMEDAD_RENAL,ENFERMEDAD_RENAL_FAMILIAR,OTROS_ANTECEDENTES_VASCULARES,Target,Pendiente,Intercepto,Promedio_costo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16484,12.0,3.0,3.0,2,0.0,,0.0,,,0.0,...,0,0,0,0,0,0,0,58373.057143,-110021.2,188569.0
11528,6.0,1.0,1.0,2,0.5,,0.0,,,1.0,...,0,0,0,0,0,0,0,19618.942857,-36176.133333,97470.5
26630,1.0,0.0,2.0,2,0.0,,0.0,,,0.0,...,0,0,0,0,0,0,1,398534.0,-626628.0,2304723.0
12586,6.0,0.0,2.0,2,0.0,,0.0,,,0.0,...,0,0,0,0,0,0,0,1292.6,13410.733333,53804.5
11225,3.0,0.0,2.0,1,1.0,,0.0,,,1.0,...,2,2,2,2,2,2,0,13728.714286,-27375.666667,41349.67


In [4]:
numeric_columns = [
    'Cant_gr_flia', 
    'Cant_riesgos_flia_mean', 
    'cantidad_serv_flia', 
    'CANTIDAD_SERVICIOS', 
    'conteo_dx_diferentes', 
    'EDAD', 
    'psa_max_gr_flia', 
    'psa_min_gr_flia', 
    'Pendiente', 
    'Pendiente_flia', 
    'Promedio_costo', 
    'Promedio_costo_flia', 
    'psa_max_gr_flia', 
    'psa_min_gr_flia', 
    'MEDICAMENTOS', 
    'MEDICINA ESPECIALIZADA', 
    'MEDICINA GENERAL', 
    'TIEMPO_AFILIACION', 
    'TIEMPO_ULTIMA_CITA', 
    'PERDIDA_DE_PESO', 
    'Intercepto', 
    'Intercepto_flia', 
    'Target',
    'Cant_Fliar_CP', 
    'Cant_Fliar_riesgos'
]

categorical_columns = [
    'AGRUPACION_DIASTOLICA', 
    'AGRUPACION_SISTOLICA', 
    'CANCER_MAMA_FAMILIAR', 
    'CANCER_OTRO_SITIO', 
    'CORONARIOS', 
    'CANCER_OTRO_SITIO_FAMILIAR',
    'CORONARIOS_FAMILIAR', 
    'CEREBRAL', 
    'CEREBRAL_FAMILIAR', 
    'DIABETES', 
    'DIABETES_FAMILIAR', 
    'ENFERMEDAD_RENAL', 
    'ENFERMEDAD_RENAL_FAMILIAR', 
    'HIPERTENSION', 
    'HIPERTENSION_FAMILIAR', 
    'OTROS_ANTECEDENTES_VASCULARES', 
    'RIESGOS', 
    'ESTADO_CIVI', 
    'IMC',
    'estrato', 
    'parentesco', 
    'PROGRAMA', 
]

In [5]:
ordinal_columns = [
    'AGRUPACION_DIASTOLICA',
    'AGRUPACION_SISTOLICA',
    'HIPERTENSION',
    'HIPERTENSION_FAMILIAR',
    'RIESGOS',
    'IMC',
    'estrato'
]

nominal_columns = [
    'CANCER_MAMA_FAMILIAR',
    'CANCER_OTRO_SITIO',
    'CORONARIOS',
    'CANCER_OTRO_SITIO_FAMILIAR',
    'CORONARIOS_FAMILIAR',
    'CEREBRAL',
    'CEREBRAL_FAMILIAR',
    'DIABETES',
    'DIABETES_FAMILIAR',
    'ENFERMEDAD_RENAL',
    'ENFERMEDAD_RENAL_FAMILIAR',
    'OTROS_ANTECEDENTES_VASCULARES',
    'ESTADO_CIVI',
    'parentesco',
    'PROGRAMA'                  
]



In [6]:
df_encoded = df_train.copy()
for column in ordinal_columns + nominal_columns:
    df_encoded[column] = df_encoded[column].astype('category')
X = df_encoded.drop(columns=['Target'])
y = df_encoded['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# X_train_small = X_train.sample(frac=0.1, random_state=42)
# y_train_small = y_train.loc[X_train_small.index] 

In [8]:
features_to_drop = ['Cant_Fliar_riesgos', 'Cant_Fliar_CP', 'min_Tiempo_CP_Fliar', 'psa_min_gr_flia', 'psa_max_gr_flia', 'CANCER_MAMA_FAMILIAR', 'PERDIDA_DE_PESO', 'Target']

## Preprocessing Pipeline_________________________________________________

In [9]:
updated_numeric_columns = [col for col in numeric_columns if col not in features_to_drop]
updated_ordinal_columns = [col for col in ordinal_columns if col not in features_to_drop]
updated_nominal_columns = [col for col in nominal_columns if col not in features_to_drop]

In [17]:
# Import necessary library for standardization
from sklearn.preprocessing import StandardScaler

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('winsorizer', Winsorizer(capping_method='quantiles', tail='right', fold=0.05)),
    ('scaler', StandardScaler()),
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, updated_numeric_columns),
        ('ord', ordinal_transformer, updated_ordinal_columns),
        ('nom', nominal_transformer, updated_nominal_columns)
    ]
)

pipeline = Pipeline(steps=[
    ('drop_columns', 'passthrough'),
    ('preprocessor', preprocessor)
])

#### Applying preprocessor pipeline
- Imputation and dropping

In [18]:

pipeline.fit(X_train)
X_train_transformed = pipeline.transform(X_train)


pipeline.fit(X_test)
X_test_transformed = pipeline.transform(X_test)


#### Converting the pipeline output into a readable data frame

In [19]:
transformed_columns = (
    updated_numeric_columns + 
    updated_ordinal_columns + 
    list(pipeline.named_steps['preprocessor'].transformers_[2][1]['onehot'].get_feature_names_out(updated_nominal_columns))
)

X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=transformed_columns)
X_train_transformed_df

Unnamed: 0,Cant_gr_flia,Cant_riesgos_flia_mean,cantidad_serv_flia,CANTIDAD_SERVICIOS,conteo_dx_diferentes,EDAD,Pendiente,Pendiente_flia,Promedio_costo,Promedio_costo_flia,...,parentesco_Conyuge,parentesco_Hijo,parentesco_None,parentesco_Otros,parentesco_Padres,parentesco_Sin Descripcion,parentesco_Trabajador,PROGRAMA_PAC,PROGRAMA_POS,PROGRAMA_RS
0,-0.383981,-0.566918,-0.687903,-0.249259,-0.450982,-1.558336,0.132262,0.095086,-0.179549,-0.430117,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.599251,0.422975,-0.123278,0.626970,1.816900,-0.976635,-0.047686,-0.040160,-0.210272,-0.423953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.367213,-0.566918,-0.861634,-0.374435,0.682959,-0.394934,0.241425,0.135020,0.164374,-0.317112,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.383981,-0.566918,-1.035364,-0.875137,-1.017953,-0.685785,0.154532,0.138980,-0.036332,-0.208606,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.599251,-0.566918,-0.601037,-0.124083,0.115988,1.641018,0.202639,0.086740,0.273043,-0.472232,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18790,1.582483,0.175502,1.136271,0.251444,-0.450982,1.350168,-0.017797,0.094155,-0.553730,1.646487,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
18791,0.599251,-0.566918,0.050453,-0.499610,0.115988,0.186766,0.458085,0.057662,1.266327,-0.528648,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
18792,0.599251,-0.566918,-1.035364,-0.875137,-1.017953,-1.558336,0.063896,0.066998,-0.619208,-0.571860,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
18793,0.599251,-0.566918,-0.644470,0.126268,0.682959,0.477617,-0.098780,0.027879,-0.195948,-0.374446,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
transformed_columns = (
    updated_numeric_columns + 
    updated_ordinal_columns + 
    list(pipeline.named_steps['preprocessor'].transformers_[2][1]['onehot'].get_feature_names_out(updated_nominal_columns))
)

X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=transformed_columns)
X_test_transformed_df

Unnamed: 0,Cant_gr_flia,Cant_riesgos_flia_mean,cantidad_serv_flia,CANTIDAD_SERVICIOS,conteo_dx_diferentes,EDAD,Pendiente,Pendiente_flia,Promedio_costo,Promedio_costo_flia,...,parentesco_Conyuge,parentesco_Hijo,parentesco_None,parentesco_Otros,parentesco_Padres,parentesco_Sin Descripcion,parentesco_Trabajador,PROGRAMA_PAC,PROGRAMA_POS,PROGRAMA_RS
0,-1.374203,-0.549978,-0.863798,-0.742689,-1.007078,1.328603,0.049522,0.056647,-0.523645,-0.559458,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.384869,-0.549978,-0.328153,-0.616108,-1.007078,0.456092,0.040545,0.060659,-0.449919,-0.531903,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.604464,-0.549978,-0.149604,-0.742689,-1.007078,0.746929,0.040104,0.056647,-0.611017,-0.559458,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.604464,-0.549978,-0.997709,-0.742689,-0.433610,0.746929,0.052130,0.056647,-0.557368,-0.559458,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-1.374203,-0.549978,0.118219,0.269957,0.713326,1.037766,0.096653,0.074453,-0.428885,-0.468590,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4694,-0.384869,-0.549978,-0.774524,-0.109785,-0.433610,-1.288929,-0.071148,-0.038211,-0.097020,-0.279602,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4695,0.604464,-0.549978,-1.042346,-0.869270,-1.007078,-0.707255,0.048854,0.056647,-0.596312,-0.559458,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4696,-0.384869,-0.549978,-0.551338,-0.869270,-1.007078,-0.998092,0.035711,0.050763,-0.618330,-0.519045,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4697,-0.384869,2.484142,0.698501,1.029442,-0.433610,-0.707255,0.009040,0.056647,-0.513417,-0.559458,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# SVM outside

In [22]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np



# Train the SVM model
svm_model = SVC(probability=True)  # Enable probability estimates for ROC AUC
svm_model.fit(X_train_transformed_df, y_train)

# Make predictions
y_pred_train_proba = svm_model.predict_proba(X_train_transformed_df)[:, 1]
y_pred_test_proba = svm_model.predict_proba(X_test_transformed_df)[:, 1]

# Evaluate using ROC AUC score
train_roc_auc = roc_auc_score(y_train, y_pred_train_proba)
test_roc_auc = roc_auc_score(y_test, y_pred_test_proba)

print(f"Train ROC AUC: {train_roc_auc:.4f}")
print(f"Test ROC AUC: {test_roc_auc:.4f}")


Train ROC AUC: 0.8086
Test ROC AUC: 0.6627


In [16]:
# # Import necessary libraries
# from sklearn.svm import SVC
# from sklearn.metrics import roc_auc_score
# from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# from sklearn.impute import SimpleImputer
# import numpy as np

# # Define the hyperparameters for the SVM model
# svm_model = SVC(probability=True, kernel='rbf', C=1.0, gamma='scale')  # Adjust kernel, C, and gamma as needed

# # Train the SVM model
# svm_model.fit(X_train_transformed_df, y_train)

# # Make predictions
# y_pred_train_proba = svm_model.predict_proba(X_train_transformed_df)[:, 1]
# y_pred_test_proba = svm_model.predict_proba(X_test_transformed_df)[:, 1]

# # Evaluate using ROC AUC score
# train_roc_auc = roc_auc_score(y_train, y_pred_train_proba)
# test_roc_auc = roc_auc_score(y_test, y_pred_test_proba)

# print(f"Train ROC AUC: {train_roc_auc:.4f}")
# print(f"Test ROC AUC: {test_roc_auc:.4f}")


In [23]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

# Hyperparameters to evaluate
kernels = ['linear', 'rbf', 'sigmoid']
Cs = [0.1, 1, 10]
gammas = ['scale', 'auto']

# Store the results
results = []

# Loop through each combination of kernel, C, and gamma
for kernel in kernels:
    for C in Cs:
        for gamma in gammas:
            print(f"Evaluating SVM with kernel={kernel}, C={C}, gamma={gamma}")
            
            # Define the SVM model with the current hyperparameters
            svm_model = SVC(probability=True, kernel=kernel, C=C, gamma=gamma)
            
            # Train the SVM model
            svm_model.fit(X_train_transformed_df, y_train)
            
            # Make predictions on both train and test sets
            y_pred_train_proba = svm_model.predict_proba(X_train_transformed_df)[:, 1]
            y_pred_test_proba = svm_model.predict_proba(X_test_transformed_df)[:, 1]
            
            # Evaluate using ROC AUC score
            train_roc_auc = roc_auc_score(y_train, y_pred_train_proba)
            test_roc_auc = roc_auc_score(y_test, y_pred_test_proba)
            
            # Store the results
            results.append({
                'kernel': kernel,
                'C': C,
                'gamma': gamma,
                'train_roc_auc': train_roc_auc,
                'test_roc_auc': test_roc_auc
            })
            
            print(f"Train ROC AUC: {train_roc_auc:.4f}")
            print(f"Test ROC AUC: {test_roc_auc:.4f}")
            print("="*50)

# Convert the results to a DataFrame for better readability
results_df = pd.DataFrame(results)
print(results_df)


Evaluating SVM with kernel=linear, C=0.1, gamma=scale
Train ROC AUC: 0.5345
Test ROC AUC: 0.5523
Evaluating SVM with kernel=linear, C=0.1, gamma=auto
Train ROC AUC: 0.5345
Test ROC AUC: 0.5523
Evaluating SVM with kernel=linear, C=1, gamma=scale
Train ROC AUC: 0.5258
Test ROC AUC: 0.5456
Evaluating SVM with kernel=linear, C=1, gamma=auto
Train ROC AUC: 0.5258
Test ROC AUC: 0.5456
Evaluating SVM with kernel=linear, C=10, gamma=scale
