### Proyecto Final Notebook 2
#### Doris Andrea Paz Garcia  	22005266
#### Franz Schubert Castillo Colocho 22003738
#### Estuardo Funes 20032042

In [45]:
#Importamos las librerías 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy import stats

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.transformation import LogTransformer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper



In [46]:
#Cargamos data para aplicar analisis de datos
df = pd.read_csv("dataset_proyecto.csv")
df.head()

Unnamed: 0,bmi,Age,asa_status,baseline_cancer,baseline_charlson,baseline_cvd,baseline_dementia,baseline_diabetes,baseline_digestive,baseline_osteoart,...,complication_rsi,dow,gender,hour,month,moonphase,mort30,mortality_rsi,race,complication
0,19.31,59.2,1,yes,0,no,no,no,no,no,...,-0.57,3,0,7.63,6,1,0,-0.43,1,0
1,18.73,59.1,0,no,0,no,no,no,no,no,...,0.21,0,0,12.93,0,1,0,-0.41,1,0
2,21.85,59.0,0,no,0,no,no,no,no,no,...,0.0,2,0,7.68,5,3,0,0.08,1,0
3,18.49,59.0,1,no,1,no,no,yes,yes,no,...,-0.65,2,1,7.58,4,3,0,-0.32,1,0
4,19.7,59.0,1,no,0,no,no,no,no,no,...,0.0,0,0,7.88,11,0,0,0.0,1,0


## 3. Ingeniería de Caracteristicas. 

In [47]:
def getDataColTypes(df):
    categoricas=[]
    continuas=[]
    discretas=[]
    for colName in df.columns:
        if(df[colName].dtype=='O'): 
            categoricas.append(colName)
        else: 
            if((df[colName].dtype=='int64') or (df[colName].dtypes=='float64')): 
                if(len(df[colName])<=30):
                    discretas.append(colName)
                else: 
                    continuas.append(colName)
    return discretas,continuas,categoricas

In [48]:
discretas,continuas,categoricas = getDataColTypes(df)
print(continuas)
print(discretas)
print(categoricas) 

['bmi', 'Age', 'asa_status', 'baseline_charlson', 'ahrq_ccs', 'ccsComplicationRate', 'ccsMort30Rate', 'complication_rsi', 'dow', 'gender', 'hour', 'month', 'moonphase', 'mort30', 'mortality_rsi', 'race', 'complication']
[]
['baseline_cancer', 'baseline_cvd', 'baseline_dementia', 'baseline_diabetes', 'baseline_digestive', 'baseline_osteoart', 'baseline_psych', 'baseline_pulmonary']


##### 3.1 Imputación de variables. 

In [49]:
#Realizamos la imputación en función de lo observado. 
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['bmi'] = df['bmi'].fillna(df['bmi'].mode()[0])
df['ccsMort30Rate'] = df['ccsMort30Rate'].fillna(df['ccsMort30Rate'].median())
df['hour'] = df['hour'].fillna(df['hour'].median())



##### 3.2 Tratamiento de Outliers 

In [50]:
## Creamos función para calcular el IQR, Limite inferior y limite superior. 

dataset = df
col = 'bmi' 
def detectOutliersLimits(dataset,col):
    '''
    Descripción: Calcula los limites superiores e inferiores para detección de outliers. 
    Input: dataset->pandas dataframe, col->string nombre de columna
    Output: tupla de floats con los limites superior e inferior (LI,LS)
    '''
    IQR = dataset[col].quantile(0.75)-dataset[col].quantile(0.25)
    LI = dataset[col].quantile(0.25) - (IQR*1.75)
    LS = dataset[col].quantile(0.75) + (IQR*1.75)
    
    return LI,LS

In [51]:
def cappingContinuas(dataset):
    '''
    Descripción: Aplica capping a las variables continuas  y convierte los valores negativos a cero
    Input: dataset -> pandas dataframe
    Output: pandas dataframe con las variables continuas cappeadas y valores negativos convertidos a cero
    '''
    continuas, _, _ = getDataColTypes(dataset)
    for col in continuas:
        LI, LS = detectOutliersLimits(dataset, col)
        dataset[col] = np.where(dataset[col] > LS, LS,
                                np.where(dataset[col] < LI, LI, dataset[col]))
        dataset[col] = np.where(dataset[col] < 0, 0, dataset[col])  # Convertir valores negativos a cero. 
    return dataset

In [52]:
df_capped = cappingContinuas(df)


##### 3.2 Codificación de variables categoricas. 

In [53]:
#One Hot Encoding. 
ohe_cancer=pd.get_dummies(df['baseline_cancer'],drop_first=True)
ohe_cancer.value_counts()
df['baseline_cancer']=ohe_cancer

In [54]:
#One Hot Encoding. 
ohe_cvd=pd.get_dummies(df['baseline_cvd'],drop_first=True)
ohe_cvd.value_counts()
df['baseline_cvd']=ohe_cvd

In [55]:
#One Hot Encoding. 
ohe_dementia=pd.get_dummies(df['baseline_dementia'],drop_first=True)
ohe_dementia.value_counts()
df['baseline_dementia']=ohe_dementia

In [56]:
#One Hot Encoding. 
ohe_diabetes=pd.get_dummies(df['baseline_diabetes'],drop_first=True)
ohe_diabetes.value_counts()
df['baseline_diabetes']=ohe_diabetes

In [57]:
#One Hot Encoding. 
ohe_digestive=pd.get_dummies(df['baseline_digestive'],drop_first=True)
ohe_digestive.value_counts()
df['baseline_digestive']=ohe_digestive

In [58]:
#One Hot Encoding. 
ohe_osteoart=pd.get_dummies(df['baseline_osteoart'],drop_first=True)
ohe_osteoart.value_counts()
df['baseline_osteoart']=ohe_osteoart

In [59]:
#One Hot Encoding. 
ohe_psych=pd.get_dummies(df['baseline_psych'],drop_first=True)
ohe_psych.value_counts()
df['baseline_psych']=ohe_psych

In [60]:
#One Hot Encoding. 
ohe_pulmonary=pd.get_dummies(df['baseline_pulmonary'],drop_first=True)
ohe_pulmonary.value_counts()
df['baseline_pulmonary']=ohe_pulmonary

### 4. Construcción del Pipeline. 

#### 4.1 Train Test Split para Entrenamiento y Prueba

In [61]:
from sklearn.model_selection import train_test_split

# Define tus características y la variable objetivo
X = df.drop(['complication'], axis=1)
y = df['complication']

# Divide los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2022)


In [62]:
# Imputación de variables categóricas
VARS_CATEGORICAS_CON_NA_FRECUENTE = ['baseline_cancer', 'baseline_cvd', 'baseline_dementia', 
                                     'baseline_diabetes', 'baseline_digestive', 'baseline_osteoart', 
                                     'baseline_psych', 'baseline_pulmonary']

# Imputación de variables numéricas
VARS_NUMERICAS_CON_NA = ['bmi', 'Age','ccsMort30Rate','hour']

# Variables para transformación logarítmica
VARS_NUMERICAS_LOG = ['bmi', 'Age']

# Variables para codificación por frecuencia (no ordinal)
VARS_CATEGORICAS = ['asa_status', 'ahrq_ccs', 'dow', 'gender', 'month', 'moonphase', 'mort30', 'race']

# Variables a utilizar en el entrenamiento
FEATURES = [
    'bmi', 'Age', 'asa_status', 'baseline_cancer', 'baseline_charlson', 
    'baseline_cvd', 'baseline_dementia', 'baseline_diabetes', 
    'baseline_digestive', 'baseline_osteoart', 'baseline_psych', 
    'baseline_pulmonary', 'ahrq_ccs', 'ccsComplicationRate', 'ccsMort30Rate', 
    'complication_rsi', 'dow', 'gender', 'hour', 'month', 'moonphase', 'mort30', 
    'mortality_rsi', 'race'
]


In [63]:
# Convertir las variables categóricas a tipo object
for var in VARS_CATEGORICAS:
    X_train[var] = X_train[var].astype('object')
    X_test[var] = X_test[var].astype('object')

In [64]:
X_train = X_train[FEATURES]

In [65]:
y_train = np.log(y_train + 0.01)
y_test = np.log(y_test + 0.01)


#### 4.2 Construcción del pipeline

In [66]:
complication_pipeline_v29062023 = Pipeline([
    
    #=========== IMPUTACIONES ===============
    
    
    #1. Indicador faltante en variables numericas para imputación
    ('missing_indicator_numeric',
        AddMissingIndicator(variables=VARS_CATEGORICAS_CON_NA_FRECUENTE)
    ),
    
    #2. Imputación de variables numéricas
    ('mean_imputation',
        MeanMedianImputer(imputation_method='mean', variables=VARS_CATEGORICAS_CON_NA_FRECUENTE)
    ),
    
    
    
    #============= CODIFICACIÓN DE VARIABLES CATEGORICAS NOMINALES ==================
    ('rare_label_encoder',
        RareLabelEncoder(n_categories=1, tol=0.01, variables=VARS_CATEGORICAS)
    ),
    
    ('categorical_encoder',
        OrdinalEncoder(encoding_method='ordered', variables=VARS_CATEGORICAS)
    ),
    
    #=============== TRANSFORMACIÓN DE VARIABLES CONTINUAS ============
    ('log_transformer',
        LogTransformer(variables=VARS_NUMERICAS_LOG)
    ),
    
    
     #=============== SCALER ============
    ('scaler',
        MinMaxScaler()
    ),
    
    ('modelo_lasso', 
         Lasso(alpha=0.01, random_state=2022)
    )
])

In [67]:
complication_pipeline_v29062023.fit(X_train, y_train)

Pipeline(steps=[('missing_indicator_numeric',
                 AddMissingIndicator(variables=['baseline_cancer',
                                                'baseline_cvd',
                                                'baseline_dementia',
                                                'baseline_diabetes',
                                                'baseline_digestive',
                                                'baseline_osteoart',
                                                'baseline_psych',
                                                'baseline_pulmonary'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['baseline_cancer', 'baseline_cvd',
                                              'baseline_dem...
                                  variables=['asa_status', 'ahrq_ccs', 'dow',
                                             'gender', 'month', 'moonphase',
           