In [4]:
# Importamos las librerías necesarias

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder

In [5]:
# Load hospitalizaciones_train.csv for review and analysis.
# The path uses a forward slash: in a normal string literal '\h' is not a valid
# escape sequence, and a backslash separator only resolves on Windows.
train_df = pd.read_csv('Datasets/hospitalizaciones_train.csv', encoding='utf-8')

In [6]:
# Add the binary column "estadia": 1 when "Stay (in days)" is greater than 8, otherwise 0.
train_df['estadia'] = train_df['Stay (in days)'].gt(8).astype(int)

In [7]:
# Remove the original "Stay (in days)" column from the frame in place.
del train_df['Stay (in days)']

In [8]:
# Convert the categorical columns to integer codes.
le = preprocessing.LabelEncoder()
for label_column in ('Insurance', 'health_conditions', 'Type of Admission', 'gender',
                     'doctor_name', 'Ward_Facility_Code', 'Department'):
    # The same encoder is refit on every column, so once the loop ends it is
    # fitted on the last column only.
    train_df[label_column] = le.fit_transform(train_df[label_column])


In [9]:
# Convert the ordinal categorical columns to numeric codes.
# NOTE(review): the encoder is created with default settings, so the category
# order is inferred from the data rather than declared here — confirm the
# resulting codes follow the intended ordering of each column.
enc = OrdinalEncoder()
for ordinal_column in ('Age', 'Severity of Illness'):
    # OrdinalEncoder expects 2-D input, hence the double-bracket selection.
    train_df[ordinal_column] = enc.fit_transform(train_df[[ordinal_column]])

In [10]:
# Drop the columns considered unnecessary because of low or no correlation.
train_df.drop(
    ['staff_available', 'patientid', 'Visitors with Patient', 'Insurance', 'health_conditions'],
    axis='columns',
    inplace=True,
)

In [11]:
# Define the feature matrix X and the target y to proceed with the tree model.
# NOTE(review): both selections are positional, so they depend on the column
# order left by the preceding cells; presumably position 9 is the 'estadia'
# column created earlier — confirm against train_df.columns.
X=train_df.iloc[:,[0,1,4]]
y=train_df.iloc[:,9]

In [12]:
# Split the data into train and test sets (20% held out, fixed seed 42).

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# First pipeline: standard scaling -> 2-component PCA -> logistic regression.

pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=42)),
])

In [14]:
# Second pipeline: standard scaling -> 2-component PCA -> support vector classifier.

pipe_svm = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', svm.SVC(random_state=42)),
])
			

In [15]:
# Third pipeline: standard scaling -> 2-component PCA -> decision tree classifier.

pipe_dt = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', tree.DecisionTreeClassifier(random_state=42)),
])

In [16]:
# Store the three pipelines in a list so they can be trained and scored in a loop.

pipelines = [pipe_lr, pipe_svm, pipe_dt]

In [17]:
# Map each pipeline's position in the list to a display name, for organizational purposes.

pipe_dict = dict(enumerate(['Regresión Logística', 'SVM', 'Árbol de decisión']))

In [20]:
# Train every pipeline on the training split.

for candidate in pipelines:
    candidate.fit(X_train, y_train)

In [21]:
# Evaluate: print each pipeline's accuracy on the test split.

for position, fitted_pipe in enumerate(pipelines):
    test_accuracy = fitted_pipe.score(X_test, y_test)
    print('%s pipeline accuracy en test: %.3f' % (pipe_dict[position], test_accuracy))

Regresión Logística pipeline accuracy en test: 0.631
SVM pipeline accuracy en test: 0.686
Árbol de decisión pipeline accuracy en test: 0.691


In [22]:
# Identify the pipeline with the highest accuracy on the test split.
# Every pipeline is scored exactly once, since scoring can be expensive.
# NOTE(review): the winner is picked on the same test split that is used to
# report accuracy, so that figure is optimistic; a separate validation split
# or cross-validation would give an unbiased choice.

test_scores = [candidate.score(X_test, y_test) for candidate in pipelines]

# max() returns the first index holding the top score, so ties go to the
# earliest pipeline in the list. An empty `pipelines` list raises ValueError
# here rather than silently reporting a model that was never evaluated.
best_clf = max(range(len(test_scores)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]
print('Modelo con el mejor accuracy: %s' % pipe_dict[best_clf])

Modelo con el mejor accuracy: Árbol de decisión


In [None]:
# Save the selected pipeline to a file (compression level 1).

import joblib

joblib.dump(value=best_pipe, filename='Mejor_pipeline.pkl', compress=1)
print('Pipeline de %s guardado a archivo' % (pipe_dict[best_clf],))

In [None]:
# Load the saved pipeline back from disk and split the data again (seed 100 this time).
best_model = joblib.load('Mejor_pipeline.pkl')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [25]:
# Refit the reloaded pipeline on the new training split.
best_model.fit(X=X_train, y=y_train)

In [26]:
# Accuracy of the refitted pipeline on the new test split.
best_model.score(X=X_test, y=y_test)

0.6915243902439024