### Proyecto Final Notebook 3
#### Doris Andrea Paz Garcia  	22005266
#### Franz Schubert Castillo Colocho 22003738
#### Estuardo Funes 20032042

In [1]:
#Importamos las librerías 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy import stats

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.transformation import LogTransformer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

In [2]:
#Cargamos data para aplicar analisis de datos
df = pd.read_csv("dataset_proyecto.csv")
df.head()

Unnamed: 0,bmi,Age,asa_status,baseline_cancer,baseline_charlson,baseline_cvd,baseline_dementia,baseline_diabetes,baseline_digestive,baseline_osteoart,...,complication_rsi,dow,gender,hour,month,moonphase,mort30,mortality_rsi,race,complication
0,19.31,59.2,1,yes,0,no,no,no,no,no,...,-0.57,3,0,7.63,6,1,0,-0.43,1,0
1,18.73,59.1,0,no,0,no,no,no,no,no,...,0.21,0,0,12.93,0,1,0,-0.41,1,0
2,21.85,59.0,0,no,0,no,no,no,no,no,...,0.0,2,0,7.68,5,3,0,0.08,1,0
3,18.49,59.0,1,no,1,no,no,yes,yes,no,...,-0.65,2,1,7.58,4,3,0,-0.32,1,0
4,19.7,59.0,1,no,0,no,no,no,no,no,...,0.0,0,0,7.88,11,0,0,0.0,1,0


### Tipos de variables 

In [3]:
def getDataColTypes(df):
    categoricas=[]
    continuas=[]
    discretas=[]
    for colName in df.columns:
        if(df[colName].dtype=='O'): 
            categoricas.append(colName)
        else: 
            if((df[colName].dtype=='int64') or (df[colName].dtypes=='float64')): 
                if(len(df[colName])<=30):
                    discretas.append(colName)
                else: 
                    continuas.append(colName)
    return discretas,continuas,categoricas

#####  Imputación de variables. 

In [4]:
#Realizamos la imputación en función de lo observado. 
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['bmi'] = df['bmi'].fillna(df['bmi'].mode()[0])
df['ccsMort30Rate'] = df['ccsMort30Rate'].fillna(df['ccsMort30Rate'].median())
df['hour'] = df['hour'].fillna(df['hour'].median())


##### Tratamiento de Outliers 

In [5]:
## Creamos función para calcular el IQR, Limite inferior y limite superior. 

dataset = df
col = 'bmi' 
def detectOutliersLimits(dataset,col):
    '''
    Descripción: Calcula los limites superiores e inferiores para detección de outliers. 
    Input: dataset->pandas dataframe, col->string nombre de columna
    Output: tupla de floats con los limites superior e inferior (LI,LS)
    '''
    IQR = dataset[col].quantile(0.75)-dataset[col].quantile(0.25)
    LI = dataset[col].quantile(0.25) - (IQR*1.75)
    LS = dataset[col].quantile(0.75) + (IQR*1.75)
    
    return LI,LS

In [6]:
def cappingContinuas(dataset):
    '''
    Descripción: Aplica capping a las variables continuas  y convierte los valores negativos a cero
    Input: dataset -> pandas dataframe
    Output: pandas dataframe con las variables continuas cappeadas y valores negativos convertidos a cero
    '''
    continuas, _, _ = getDataColTypes(dataset)
    for col in continuas:
        LI, LS = detectOutliersLimits(dataset, col)
        dataset[col] = np.where(dataset[col] > LS, LS,
                                np.where(dataset[col] < LI, LI, dataset[col]))
        dataset[col] = np.where(dataset[col] < 0, 0, dataset[col])  # Convertir valores negativos a cero. 
    return dataset

In [7]:
df_capped = cappingContinuas(df)


##### Codificación de variables categoricas. 

In [8]:
#One Hot Encoding. 
ohe_cancer=pd.get_dummies(df['baseline_cancer'],drop_first=True)
ohe_cancer.value_counts()
df['baseline_cancer']=ohe_cancer
ohe_cvd=pd.get_dummies(df['baseline_cvd'],drop_first=True)
ohe_cvd.value_counts()
df['baseline_cvd']=ohe_cvd
ohe_dementia=pd.get_dummies(df['baseline_dementia'],drop_first=True)
ohe_dementia.value_counts()
df['baseline_dementia']=ohe_dementia
ohe_diabetes=pd.get_dummies(df['baseline_diabetes'],drop_first=True)
ohe_diabetes.value_counts()
df['baseline_diabetes']=ohe_diabetes
ohe_digestive=pd.get_dummies(df['baseline_digestive'],drop_first=True)
ohe_digestive.value_counts()
df['baseline_digestive']=ohe_digestive
ohe_osteoart=pd.get_dummies(df['baseline_osteoart'],drop_first=True)
ohe_osteoart.value_counts()
df['baseline_osteoart']=ohe_osteoart
ohe_psych=pd.get_dummies(df['baseline_psych'],drop_first=True)
ohe_psych.value_counts()
df['baseline_psych']=ohe_psych
ohe_pulmonary=pd.get_dummies(df['baseline_pulmonary'],drop_first=True)
ohe_pulmonary.value_counts()
df['baseline_pulmonary']=ohe_pulmonary


In [9]:
from sklearn.model_selection import train_test_split

# Define tus características y la variable objetivo
X = df.drop(['complication'], axis=1)
y = df['complication']

# Divide los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2022)


In [11]:
# Imputación de variables categóricas
VARS_CATEGORICAS_CON_NA_FRECUENTE = ['baseline_cancer', 'baseline_cvd', 'baseline_dementia', 
                                     'baseline_diabetes', 'baseline_digestive', 'baseline_osteoart', 
                                     'baseline_psych', 'baseline_pulmonary']

# Imputación de variables numéricas
VARS_NUMERICAS_CON_NA = ['bmi', 'Age','ccsMort30Rate','hour']

# Variables para transformación logarítmica
VARS_NUMERICAS_LOG = ['bmi', 'Age']

# Variables para codificación por frecuencia (no ordinal)
VARS_CATEGORICAS = ['asa_status', 'ahrq_ccs', 'dow', 'gender', 'month', 'moonphase', 'mort30', 'race']

# Variables a utilizar en el entrenamiento
FEATURES = [
    'bmi', 'Age', 'asa_status', 'baseline_cancer', 'baseline_charlson', 
    'baseline_cvd', 'baseline_dementia', 'baseline_diabetes', 
    'baseline_digestive', 'baseline_osteoart', 'baseline_psych', 
    'baseline_pulmonary', 'ahrq_ccs', 'ccsComplicationRate', 'ccsMort30Rate', 
    'complication_rsi', 'dow', 'gender', 'hour', 'month', 'moonphase', 'mort30', 
    'mortality_rsi', 'race'
]

In [12]:
# Convertir las variables categóricas a tipo object
for var in VARS_CATEGORICAS:
    X_train[var] = X_train[var].astype('object')
    X_test[var] = X_test[var].astype('object')

In [13]:
X_train = X_train[FEATURES]

### Desarrollo de modelos de clasificación 

In [15]:
from sklearn.metrics import accuracy_score 
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split 

from sklearn.preprocessing import StandardScaler

### Regresión Logistica

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Crear una instancia del escalador
scaler = StandardScaler()

# Estandarizar las características de entrenamiento y prueba
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Crear una instancia del modelo de regresión logística
logit = LogisticRegression()

# Ajustar el modelo a los datos de entrenamiento
logit.fit(X_train_std, y_train)

# Realizar predicciones en los datos de prueba
logit_preds = logit.predict(X_test_std)

# Mostrar las predicciones
print(logit_preds)



[0 0 0 ... 0 0 1]


In [17]:
import warnings 
warnings.filterwarnings("ignore")

In [18]:
acc = round(accuracy_score(y_test,logit_preds),4)
roc_auc = round(roc_auc_score(y_test,logit_preds),4)

In [19]:
print(f"Accuracy:{acc}")
print(f"ROC-AUC:{roc_auc}")

Accuracy:0.7957
ROC-AUC:0.6569


In [20]:
from sklearn.model_selection import GridSearchCV, KFold 

logit = LogisticRegression(solver='saga')
hyperparams_grid = {'penalty':['l1', 'l2', 'elasticnet', None],
                    'C':[0.01,0.1,1,10],
                    'fit_intercept':[True,False]
}

grid_search=GridSearchCV(estimator=logit,param_grid=hyperparams_grid,scoring=['roc_auc','accuracy'],refit='roc_auc',
                        n_jobs=-1,cv=KFold(n_splits=20,shuffle=True,random_state=2023))

grid_search.fit(X_train_std,y_train)

accuracies = grid_search.cv_results_['mean_test_accuracy']
roc_aucs = grid_search.cv_results_['mean_test_roc_auc']
mean_acc = np.mean([x for x in accuracies if not np.isnan(x)])
mean_roc_aucs = np.mean([x for x in roc_aucs if not np.isnan(x)])


print(f'Mejores Hyperparametros:{grid_search.best_params_}')
print(f'Mejor Metrica:{grid_search.best_score_}')
print(f'Mean Accuracy:{mean_acc}')
print(f'Mean ROC_AUC:{mean_roc_aucs}')

Mejores Hyperparametros:{'C': 1, 'fit_intercept': True, 'penalty': 'l1'}
Mejor Metrica:0.8038388176117344
Mean Accuracy:0.7536119249131944
Mean ROC_AUC:0.8031188393303905


#### Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold

random_forest = RandomForestClassifier()

hyperparams_grid = {
    'n_estimators': [500, 1000],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 25],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=hyperparams_grid,
    scoring=['roc_auc', 'accuracy'],
    refit='roc_auc',
    n_jobs=-1,
    cv=KFold(n_splits=10, shuffle=True, random_state=2023)
)

grid_search.fit(X_train_std, y_train)

accuracies = grid_search.cv_results_['mean_test_accuracy']
roc_aucs = grid_search.cv_results_['mean_test_roc_auc']
mean_acc = np.mean([x for x in accuracies if not np.isnan(x)])
mean_roc_aucs = np.mean([x for x in roc_aucs if not np.isnan(x)])

print(f'Mejores Hyperparametros: {grid_search.best_params_}')
print(f'Mejor Métrica: {grid_search.best_score_}')
print(f'Mean Accuracy: {mean_acc}')
print(f'Mean ROC_AUC: {mean_roc_aucs}')


Mejores Hyperparametros: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 25, 'n_estimators': 1000}
Mejor Métrica: 0.9184326064884253
Mean Accuracy: 0.8542367330411585
Mean ROC_AUC: 0.9029492856715202


#### Naive Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB
naive_bayes = GaussianNB()
naive_bayes.fit(X_train_std, y_train)
naive_bayes_preds = naive_bayes.predict(X_test_std)
print(naive_bayes_preds)
from sklearn.model_selection import GridSearchCV
param_grid = {'var_smoothing': np.logspace(0,-9, num=100)}
grid_search = GridSearchCV(GaussianNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_std, y_train)
print(f'Mejores hiperparametros: {grid_search.best_params_}')
print(f'Mejor métrica: {grid_search.best_score_}')
# Calcular la precisión y el AUC-ROC
naive_bayes_acc = accuracy_score(y_test, naive_bayes_preds)
naive_bayes_roc_auc = roc_auc_score(y_test, naive_bayes_preds)
# Imprimir la precisión y el AUC-ROC
print(f'Accuracy: {naive_bayes_acc}')
print(f'ROC-AUC: {naive_bayes_roc_auc}')

[0 0 0 ... 0 0 1]
Mejores hiperparametros: {'var_smoothing': 1.0}
Mejor métrica: 0.7818229738591997
Accuracy: 0.7761329993167844
ROC-AUC: 0.6731866005595765


#### LDA (Linear Discriminant Analysis)

In [23]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_std, y_train)
lda_preds = lda.predict(X_test_std)
param_grid_lda = {'solver': ['svd', 'lsqr', 'eigen'], 'shrinkage': [None, 'auto']}
grid_search_lda = GridSearchCV(LinearDiscriminantAnalysis(), param_grid_lda, cv=5, scoring='accuracy')
grid_search_lda.fit(X_train_std, y_train)
print(f'Mejores hiperparametros: {grid_search_lda.best_params_}')
print(f'Mejor Metrica: {grid_search_lda.best_score_}')
# Calcular la precisión y el AUC-ROC
lda_acc = accuracy_score(y_test, lda_preds)
lda_roc_auc = roc_auc_score(y_test, lda_preds)
# Imprimir la precisión y el AUC-ROC
print(f'Accuracy: {lda_acc}')
print(f'ROC-AUC: {lda_roc_auc}')

Mejores hiperparametros: {'shrinkage': 'auto', 'solver': 'lsqr'}
Mejor Metrica: 0.7954897491916788
Accuracy: 0.7927579139148258
ROC-AUC: 0.662114884136198


#### SVM 

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

svm = SVC(kernel='linear', C=1, tol=0.01)
svm.fit(X_train_std, y_train)

svm_preds = svm.predict(X_test_std)

param_grid_svm = {'C': [0.1, 1, 10], 'tol': [0.01, 0.001, 0.0001]}
grid_search_svm = GridSearchCV(SVC(kernel='linear'), param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train_std, y_train)

print(f'Mejor hiperparametros: {grid_search_svm.best_params_}')
print(f'Mejor metrica: {grid_search_svm.best_score_}')

# Calcular la precisión y el AUC-ROC
svm_acc = accuracy_score(y_test, svm_preds)
svm_roc_auc = roc_auc_score(y_test, svm_preds)

# Imprimir la precisión y el AUC-ROC
print(f'Accuracy: {svm_acc}')
print(f'ROC-AUC: {svm_roc_auc}')


Mejor hiperparametros: {'C': 1, 'tol': 0.01}
Mejor metrica: 0.7957822889976818
Accuracy: 0.7918469596902755
ROC-AUC: 0.6403935759973731


#### Arbol de decisión

In [25]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train_std, y_train)
dt_preds = dt.predict(X_test_std)
param_grid_dt = {'criterion': ['gini', 'entropy'], 'max_depth': [None, 2, 4, 6, 8, 10], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
grid_search_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train_std, y_train)
# Obtener los mejores hiperparámetros
print(f'Best parameters: {grid_search_dt.best_params_}')
print(f'Best score: {grid_search_dt.best_score_}')
dt_acc = accuracy_score(y_test, dt_preds)
dt_roc_auc = roc_auc_score(y_test, dt_preds)
# Imprimir la precisión y el AUC-ROC
print(f'Accuracy: {dt_acc}')
print(f'ROC-AUC: {dt_roc_auc}')


Best parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best score: 0.8913503919594925
Accuracy: 0.8533363698474151
ROC-AUC: 0.8078616147389502


#### Quadratic Discriminant Analysis (QDA)

In [26]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train_std, y_train)
qda_preds = qda.predict(X_test_std)
print(qda_preds)

[0 0 0 ... 0 0 1]


In [27]:
param_grid_qda = {'reg_param': [0.0, 0.5, 1.0]}
grid_search_qda = GridSearchCV(QuadraticDiscriminantAnalysis(), param_grid_qda, cv=5, scoring='accuracy')
grid_search_qda.fit(X_train_std, y_train)
# Obtener los mejores hiperparámetros
print(f'Mejores hiperparámetros: {grid_search_qda.best_params_}')
print(f'Mejor métrica: {grid_search_qda.best_score_}')
# Calcular la precisión y el AUC-ROC
qda_acc = accuracy_score(y_test, qda_preds)
qda_roc_auc = roc_auc_score(y_test, qda_preds)
# Imprimir la precisión y el AUC-ROC
print(f'Accuracy: {qda_acc}')
print(f'ROC-AUC: {qda_roc_auc}')

Mejores hiperparámetros: {'reg_param': 0.5}
Mejor métrica: 0.7842631885980966
Accuracy: 0.7784103848781598
ROC-AUC: 0.6654670509344818


### AdaBoost

In [28]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
ada.fit(X_train_std, y_train)
ada_preds = ada.predict(X_test_std)
# Calcular la precisión y el AUC-ROC
ada_acc = accuracy_score(y_test, ada_preds)
ada_roc_auc = roc_auc_score(y_test, ada_preds)
param_grid_ada = {'n_estimators': [50, 100, 150, 200], 'learning_rate': [0.001, 0.01, 0.1, 1.0]}
grid_search_ada = GridSearchCV(AdaBoostClassifier(), param_grid_ada, cv=5, scoring='accuracy')
grid_search_ada.fit(X_train_std, y_train)
# Obtener los mejores hiperparámetros
print(f'Mejores hiperparámetros: {grid_search_ada.best_params_}')
print(f'Mejor métrica: {grid_search_ada.best_score_}')

# Imprimir la precisión y el AUC-ROC
print(f'Accuracy: {ada_acc}')
print(f'ROC-AUC: {ada_roc_auc}')

Mejores hiperparámetros: {'learning_rate': 1.0, 'n_estimators': 150}
Mejor métrica: 0.8988668729410687
Accuracy: 0.8925074015030745
ROC-AUC: 0.8244027702018764


### XGBoost

In [34]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_std, y_train)
xgb_preds = xgb.predict(X_test_std)
# Definir los hiperparámetros a ajustar
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

# Crear una instancia de GridSearchCV
grid_search_xgb = GridSearchCV(XGBClassifier(), param_grid_xgb, cv=5, scoring='accuracy')

# Ajustar GridSearchCV a los datos de entrenamiento
grid_search_xgb.fit(X_train_std, y_train)

# Obtener los mejores hiperparámetros
print(f'Mejores hiperparámetros: {grid_search_xgb.best_params_}')
print(f'Mejor métrica: {grid_search_xgb.best_score_}')
# Calcular la precisión y el AUC-ROC
xgb_acc = accuracy_score(y_test, xgb_preds)
xgb_roc_auc = roc_auc_score(y_test, xgb_preds)

# Imprimir la precisión y el AUC-ROC
print(f'Accuracy: {xgb_acc}')
print(f'ROC-AUC: {xgb_roc_auc}')


Mejores hiperparámetros: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Mejor métrica: 0.9132170037213274
Accuracy: 0.9068549305397404
ROC-AUC: 0.844471808148323


### LGBM

In [37]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(X_train_std, y_train)
lgbm_preds = lgbm.predict(X_test_std)
param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}
grid_search_lgbm = GridSearchCV(LGBMClassifier(), param_grid_lgbm, cv=5, scoring='accuracy')
grid_search_lgbm.fit(X_train_std, y_train)
# Obtener los mejores hiperparámetros
print(f'Mejores hiperparámetros: {grid_search_lgbm.best_params_}')
print(f'Mejor métrica: {grid_search_lgbm.best_score_}')

# Calcular la precisión y el AUC-ROC
lgbm_acc = accuracy_score(y_test, lgbm_preds)
lgbm_roc_auc = roc_auc_score(y_test, lgbm_preds)

# Imprimir la precisión y el AUC-ROC
print(f'Accuracy: {lgbm_acc}')
print(f'ROC-AUC: {lgbm_roc_auc}')

Mejores hiperparámetros: {'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.5}
Mejor métrica: 0.9129244162548804
Accuracy: 0.9107264859940788
ROC-AUC: 0.8465000692007717


#### Comparación de todos los modelos