Importamos las librerías.

In [45]:
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from tempfile import mkdtemp
from shutil import rmtree

Realizamos la ingesta de los datos.

In [76]:
# Load the labelled training set and the test set from the shared
# datasets folder (paths are relative to the notebook's directory).
df_train = pd.read_csv("../datasets/hospitalizaciones_train.csv")
df_test = pd.read_csv("../datasets/hospitalizaciones_test.csv")

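Transformamos la columna `Stay (in days)` en la variable binaria `Stay`: 1 si la estadía supera los 8 días y 0 en caso contrario.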

In [77]:
# Build the binary target `Stay`: 1 when the stay lasted more than 8 days,
# 0 otherwise (missing values compare as False and therefore become 0).
# The raw day count is then removed from the frame.
#
# The guard makes this cell safe to re-run: once the raw column has been
# dropped, running the cell again is a no-op instead of raising KeyError.
raw_stay_col = 'Stay (in days)'
if raw_stay_col in df_train.columns:
    df_train['Stay'] = (df_train[raw_stay_col] > 8).astype('int64')
    df_train = df_train.drop(columns=[raw_stay_col])

Establecemos el modelo a utilizar. Para cambiar de modelo basta con modificar la siguiente línea de código, sin necesidad de alterar los pipelines.

In [112]:
# Estimator plugged into the pipeline further down in the notebook.
# random_state is fixed because the tree permutes features at every split,
# so ties between equally good splits would otherwise be broken differently
# from run to run; the seed matches the one used for the train/test split.
model = DecisionTreeClassifier(criterion='entropy', max_depth=19, random_state=60)


Separamos el set de entrenamiento y testeo.

In [79]:
# Feature matrix: the eight predictor columns fed to the pipeline.
x = df_train[[
    'Department',
    'doctor_name',
    'Ward_Facility_Code',
    'Age',
    'gender',
    'Available Extra Rooms in Hospital',
    'Severity of Illness',
    'Admission_Deposit',
]]
# Target vector: the binary `Stay` column.
y = df_train['Stay']

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and uses a fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    stratify=y,
    random_state=60,
)

Establecemos, por medio de variables, qué procesos de transformación se les aplicarán a las distintas columnas. Para cambiarlos basta con modificar el contenido de las variables, sin necesidad de alterar los pipelines.

In [113]:
# Column groups consumed by the ColumnTransformer defined below.
# Moving a column from one list to another changes which preprocessing
# step it goes through.

# One-hot encoded columns.
nominal = [
    'Department',
    'doctor_name',
    'Ward_Facility_Code',
    'gender',
    'Available Extra Rooms in Hospital',
    'Severity of Illness',
]

# Ordinal-encoded columns.
ordinal = ['Age']

# Standard-scaled columns.
numerical = ['Admission_Deposit']

Definimos los pipelines relativos a la transformación de los datos.

In [120]:
# Pipeline for numerical columns: standardize with StandardScaler.
numerical_pipeline = Pipeline([('escaler', StandardScaler())])

# Pipeline for ordinal columns: replace each category with an integer code.
# NOTE(review): with default settings the encoder derives the category order
# from the data (sorted values) — confirm that this matches the intended
# ordering of the `Age` bands.
ordinal_pipeline = Pipeline([("encoder", OrdinalEncoder())])

# Pipeline for nominal columns: one-hot encode.
# handle_unknown='ignore' encodes a category that was not present at fit time
# as an all-zero row instead of raising, so predicting on a CV validation
# fold or on df_test cannot fail just because of an unseen level.
nominal_pipeline = Pipeline([("encoder2", OneHotEncoder(handle_unknown='ignore'))])

# Combine the three per-type pipelines into one transformer; each pipeline
# is applied only to the columns named in its list.
preprocessin_pipeline = ColumnTransformer([("ordinal_preprocesor", ordinal_pipeline, ordinal),
                                            ("nominal_preprocessor", nominal_pipeline, nominal),
                                            ("numerical_preprocessor",numerical_pipeline, numerical) ])

Realizamos el pipeline completo.

In [121]:
# End-to-end pipeline: column preprocessing followed by the estimator.
# The step names ("preprocessor", "estimator") are the prefixes used when
# addressing hyperparameters, e.g. "estimator__max_depth".
complete_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessin_pipeline),
        ("estimator", model),
    ]
)

Entrenamos el modelo.

In [122]:
# Fit the preprocessing steps and the estimator on the training split.
# The call is left as the cell's last expression so the fitted pipeline
# is displayed as the cell output.
complete_pipeline.fit(X_train, y=y_train)

Evaluamos el desempeño del modelo.

In [117]:
# Predict the binary `Stay` label for every row of the held-out split;
# the result is compared against y_test in the evaluation cells below.
prediction = complete_pipeline.predict(X_test)


array([1, 0, 1, ..., 0, 0, 1], dtype=int64)

In [118]:
# Confusion matrix of true labels (rows) versus predicted labels (columns)
# on the held-out split.
cm = confusion_matrix(y_true=y_test, y_pred=prediction)
print("Matriz de confusión:", cm, sep="\n")

Matriz de confusión:
[[18838 12079]
 [ 8729 42354]]


In [119]:

# Per-class precision, recall and F1 on the held-out split, as plain text.
report = classification_report(y_true=y_test, y_pred=prediction)
print("Reporte de Clasificación:", report, sep="\n")

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.68      0.61      0.64     30917
           1       0.78      0.83      0.80     51083

    accuracy                           0.75     82000
   macro avg       0.73      0.72      0.72     82000
weighted avg       0.74      0.75      0.74     82000



Fijamos los hiperparámetros para GridSearch en conjunto con los pipelines.

In [67]:
# Hyperparameter grid for the search. Every key carries the "estimator__"
# prefix so the value is routed to the pipeline step named "estimator".
rf_params = dict(
    # Split-quality criteria to compare.
    estimator__criterion=['gini', 'entropy'],
    # Minimum samples required to split a node: 2, 4, 6, 8, 10.
    estimator__min_samples_split=range(2, 11, 2),
    # Maximum tree depth: 2, 4, ..., 20.
    estimator__max_depth=range(2, 22, 2),
)

Utilizamos GridSearch para seleccionar los mejores hiperparámetros.

In [57]:
# Exhaustive 5-fold cross-validated search over rf_params on the full
# pipeline. Accuracy and recall are both recorded for every candidate;
# accuracy picks the winner that is refit on the whole training split.
# NOTE(review): this rebinds `model`, which earlier in the notebook names the
# bare estimator placed inside complete_pipeline — re-running the pipeline
# cell after this one would pick up the search object instead.
model = GridSearchCV(
    estimator=complete_pipeline,
    param_grid=rf_params,
    scoring=['accuracy', 'recall'],
    refit='accuracy',
    cv=5,
)
model.fit(X_train, y_train)


Vemos el resultado.

In [58]:
# Report the winning hyperparameter combination and its mean CV score
# (accuracy, the metric used for refit).
print(f"Mejores hiperparámetros: {model.best_params_!s}")
print(f"Mejor Score: {model.best_score_!s}\n")

# Full per-candidate cross-validation results as a table; the bare name on
# the last line makes the notebook render it.
scores = pd.DataFrame(data=model.cv_results_)
scores

Mejores hiperparámetros: {'estimator__criterion': 'gini', 'estimator__max_depth': 20, 'estimator__min_samples_split': 10}
Mejor Score: 0.7478262195121952



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__criterion,param_estimator__max_depth,param_estimator__min_samples_split,params,split0_test_accuracy,split1_test_accuracy,...,std_test_accuracy,rank_test_accuracy,split0_test_recall,split1_test_recall,split2_test_recall,split3_test_recall,split4_test_recall,mean_test_recall,std_test_recall,rank_test_recall
0,0.866394,0.034953,0.150035,0.007460,gini,2,2,"{'estimator__criterion': 'gini', 'estimator__m...",0.622957,0.622957,...,0.000007,91,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1
1,0.933240,0.175298,0.159637,0.020742,gini,2,4,"{'estimator__criterion': 'gini', 'estimator__m...",0.622957,0.622957,...,0.000007,91,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1
2,0.955474,0.053540,0.170017,0.010471,gini,2,6,"{'estimator__criterion': 'gini', 'estimator__m...",0.622957,0.622957,...,0.000007,91,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1
3,0.908136,0.041605,0.162678,0.013499,gini,2,8,"{'estimator__criterion': 'gini', 'estimator__m...",0.622957,0.622957,...,0.000007,91,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1
4,0.862150,0.016926,0.158472,0.009651,gini,2,10,"{'estimator__criterion': 'gini', 'estimator__m...",0.622957,0.622957,...,0.000007,91,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,8.758089,0.953828,0.168863,0.013067,entropy,20,2,"{'estimator__criterion': 'entropy', 'estimator...",0.746845,0.745732,...,0.000922,9,0.827314,0.822640,0.832359,0.827367,0.830328,0.828001,0.003287,77
96,8.190832,0.378506,0.184132,0.017870,entropy,20,4,"{'estimator__criterion': 'entropy', 'estimator...",0.746875,0.745884,...,0.000892,8,0.826922,0.822420,0.832505,0.827122,0.830132,0.827820,0.003399,78
97,8.695455,1.545453,0.188851,0.044027,entropy,20,6,"{'estimator__criterion': 'entropy', 'estimator...",0.746814,0.745960,...,0.000837,10,0.827142,0.822420,0.832041,0.826608,0.830254,0.827693,0.003309,80
98,8.275274,0.929348,0.166868,0.012720,entropy,20,8,"{'estimator__criterion': 'entropy', 'estimator...",0.746677,0.746326,...,0.000797,7,0.826922,0.822836,0.831894,0.826926,0.830426,0.827801,0.003157,79
