# Modelos con Pycaret

In [1]:
# Import libraries
import pycaret
from pycaret import classification as pyc
from pycaret.utils import enable_colab
enable_colab()
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd
import requests
import warnings
warnings.filterwarnings('always')

Colab mode enabled.


## Descargar datos

In [2]:
# download the dataset
# directory of the raw data files
_data_root = './data/covertype'

# path to the raw training data
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')

## Download data
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    #https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = 'https://docs.google.com/uc?export=download&confirm={{VALUE}}&id=1zZkMC3opeYnYPMd-oDRUgprpPcmUgj1t'
    r = requests.get(url, allow_redirects=True, stream=True)
    open(_data_filepath, 'wb').write(r.content)

In [3]:
# Check some data
df = pd.read_csv(_data_filepath, index_col=False)
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3126,297,19,210,66,1500,162,230,207,2374,Commanche,C7757,1
1,2892,53,9,0,0,3829,226,221,129,2659,Rawah,C7202,1
2,2637,334,12,467,78,658,192,224,171,977,Commanche,C4704,2
3,2882,32,13,376,66,3618,216,211,128,3072,Rawah,C7102,1
4,2747,59,11,127,14,2701,228,216,119,4723,Rawah,C7700,1


## Procesamiento
La función `setup` permite realizar procesamiento, solo es necesario definir el origen de datos y la variable objetivo. El resto de parámetros son opcionales y se usan según la necesidad.
- `session_id` define una semilla, esto para que los experimentos sean reproducibles.
- `train_size` define porcentaje utilizado para entrenamiento y test
- `normalize` define que se realizará normalizacion a las variables, por defecto 'zscore'
- `transformation` define que se realice transformacion de datos para buscar una distribución más 'gaussiana'
- `remove_multicollinearity` cuando se detecten variables en donde la Colinealidad supere un threshold se eliminara
- `multicollinearity_threshold` define el threshold a superar la colinealidad
- `log_experiment` almacena información del experimento para poder hacer seguimiento
- `experiment_name` Asigna nombre del experimento para poder identificarlo
- `categorical_features` transforma cada valor del dominio de la variable pasa a ser una variable

In [4]:
exp_cov_1 = pyc.setup(data = df, target = 'Cover_Type', 
            session_id=42, train_size = 0.8,
            normalize = True, 
            transformation = True,
            remove_multicollinearity = True, multicollinearity_threshold = 0.9,
            log_experiment = True, experiment_name = 'Cover_Type1',
            categorical_features = ['Wilderness_Area', 'Soil_Type'],
            silent=True) 

Unnamed: 0,Description,Value
0,session_id,42
1,Target,Cover_Type
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(464809, 13)"
5,Missing Values,False
6,Numeric Features,10
7,Categorical Features,2
8,Ordinal Features,False
9,High Cardinality Features,False


Toda la información se encuentra en la variable `exp_cov_1` vale la pena revisar el contenido, existe información de pipeline, datos, distribuciones, transformaciones y modelos predefinidos.

In [5]:
## Explore exp_cov_1
for i in reversed(exp_cov_1):
    #print(type(i))
    if type(i)== pd.core.frame.DataFrame:
        display(i)
        break

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_C7755,Soil_Type_C7756,Soil_Type_C7757,Soil_Type_C7790,Soil_Type_C8703,Soil_Type_C8707,Soil_Type_C8708,Soil_Type_C8771,Soil_Type_C8772,Soil_Type_C8776
274874,-1.134886,-1.511796,1.958353,-0.077955,0.925422,-1.064641,-1.492678,-2.365262,-0.765280,0.054188,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36639,-1.231406,1.542532,0.419001,1.275834,1.311204,-1.193573,-0.935802,-0.722609,0.370386,-0.762778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196301,0.304795,-1.157599,-2.052667,-0.041078,0.072731,-0.551769,0.070712,0.465622,0.207246,0.782369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16152,-0.613736,1.419991,0.153778,-1.188052,-0.632446,-0.405101,-1.043820,-0.255765,0.844599,0.698568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
426040,-0.910677,1.449533,1.798803,-0.479353,0.362750,-0.552670,-2.026285,-1.647629,0.872986,-0.095089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151387,-0.224300,0.471273,0.658470,0.668948,0.454268,-0.252880,-0.067709,1.775288,0.480452,0.084858,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
185942,-0.821698,0.087892,-1.415605,0.849792,-1.447931,-0.219222,0.480627,0.818655,0.073257,0.445340,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
409818,-0.733069,0.503314,1.083708,0.518919,1.321630,0.007003,-0.278841,1.861502,0.563643,0.397877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
345296,0.580187,1.237491,-1.019811,-0.701697,-0.692053,-0.980420,-0.603831,0.674030,0.844599,-0.231717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Revisar modelos disponibles para clasificación

In [6]:
pyc.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


# Comparar modelos AutoML

La función `compare_models` permite realizar autoML entre los modelos disponibles, sin embargo es posible incluir `include` o excluir `exclude` cualquiera de los modelos.

Tambien es posible ordenar los resultados usando `sort`para cualquiera de las metricas disponibles `Accuracy`, `AUC` ,`Recall` ,`Prec.`, `F1` ,`Kappa` ,`MCC`

In [7]:
best_model = pyc.compare_models(include=['dt','ada','lightgbm', 'rf', 'nb', 'dummy'], sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9481,0.9939,0.8939,0.9482,0.9478,0.9162,0.9164,16.294
dt,Decision Tree Classifier,0.9287,0.9424,0.8837,0.9287,0.9287,0.8855,0.8855,1.501
lightgbm,Light Gradient Boosting Machine,0.8535,0.9543,0.8076,0.8535,0.8529,0.7634,0.7638,4.632
ada,Ada Boost Classifier,0.5482,0.6038,0.372,0.6234,0.5637,0.3306,0.3395,5.607
dummy,Dummy Classifier,0.4873,0.5,0.1429,0.2375,0.3193,0.0,0.0,0.077
nb,Naive Bayes,0.0935,0.6747,0.458,0.5063,0.0598,0.0462,0.0676,0.197


In [8]:
best_model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

Una vez definido el mejor modelo, este puede tunearse aún más, no con híper parámetros estandar, sino con una busqueda fina de los valores.

In [9]:
tuned_best = pyc.tune_model(best_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7452,0.8943,0.4638,0.7409,0.726,0.5741,0.5774
1,0.748,0.8951,0.47,0.7416,0.7299,0.5791,0.582
2,0.7505,0.8977,0.4732,0.7463,0.7322,0.583,0.586
3,0.7458,0.8964,0.4661,0.7399,0.7277,0.5759,0.5785
4,0.7419,0.8931,0.4633,0.7359,0.7238,0.5691,0.5717
5,0.7461,0.8949,0.4762,0.742,0.7284,0.5763,0.5791
6,0.7433,0.8928,0.4688,0.7571,0.7251,0.5709,0.5739
7,0.7444,0.8962,0.4725,0.7579,0.7263,0.5731,0.5758
8,0.7439,0.894,0.4663,0.7381,0.7252,0.5719,0.575
9,0.7451,0.8959,0.4738,0.741,0.7259,0.5744,0.5771


# Revisar resultados
Es posible revisar de manera independiente o de manera dinamica

In [None]:
# check feature importance
pyc.plot_model(tuned_best, plot = 'feature')

In [None]:
pyc.plot_model(tuned_best, plot = 'error')

In [10]:
pyc.evaluate_model(tuned_best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

Esta función predice usando el modelo entrenado. Cuando los datos son ´None´, predice la etiqueta y la puntuación en el conjunto de prueba (creado durante la función de configuración).

Antes de finalizar el modelo, se recomienda realizar una verificación final prediciendo el conjunto de prueba (test) y revisando las métricas de evaluación. Todas las métricas de evaluación que hemos visto anteriormente son resultados de validación cruzada basados **solo en el conjunto de entrenamiento (80%)**. Ahora, vamos a utilizar los datos de test para ver si el rendimiento es diferentes y puede haber overfitting.

In [11]:
pyc.predict_model(tuned_best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7444,0.8937,0.4642,0.7382,0.7256,0.5727,0.5756


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_C7790,Soil_Type_C8703,Soil_Type_C8707,Soil_Type_C8708,Soil_Type_C8771,Soil_Type_C8772,Soil_Type_C8776,Cover_Type,Label,Score
0,-1.134886,-1.511796,1.958353,-0.077955,0.925422,-1.064641,-1.492678,-2.365262,-0.765280,0.054188,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.6472
1,-1.231406,1.542532,0.419001,1.275834,1.311204,-1.193573,-0.935802,-0.722609,0.370386,-0.762778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2,0.4543
2,0.304795,-1.157599,-2.052667,-0.041078,0.072731,-0.551769,0.070712,0.465622,0.207246,0.782369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.5071
3,-0.613736,1.419991,0.153778,-1.188052,-0.632446,-0.405101,-1.043820,-0.255765,0.844599,0.698568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.5976
4,-0.910677,1.449533,1.798803,-0.479353,0.362750,-0.552670,-2.026285,-1.647629,0.872986,-0.095089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.6320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92957,-0.224300,0.471273,0.658470,0.668948,0.454268,-0.252880,-0.067709,1.775288,0.480452,0.084858,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.6390
92958,-0.821698,0.087892,-1.415605,0.849792,-1.447931,-0.219222,0.480627,0.818655,0.073257,0.445340,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.8264
92959,-0.733069,0.503314,1.083708,0.518919,1.321630,0.007003,-0.278841,1.861502,0.563643,0.397877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.6910
92960,0.580187,1.237491,-1.019811,-0.701697,-0.692053,-0.980420,-0.603831,0.674030,0.844599,-0.231717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.5553


# Guardar modelo para inferencia

In [12]:
# finalize the model
final_best = pyc.finalize_model(tuned_best)

# save model to disk
pyc.save_model(final_best, 'model_cover_type')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['Wilderness_Area',
                                                             'Soil_Type'],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='Cover_Type', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight={}, criterion='entropy',
                                         max_depth=9, max_features='sqrt',
                    

## Usar nuevos datos para Inferencia con el modelo Guardado

Descarga nuevo conjunto de datos.

In [14]:
# download the dataset
# directory of the raw data files
_data_root = './data/covertype'

# path to the raw training data
_data_filepath_test = os.path.join(_data_root, 'covertype_test.csv')

## Download data
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath_test):
    #https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = 'https://docs.google.com/uc?export=download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'
    r = requests.get(url, allow_redirects=True, stream=True)
    open(_data_filepath_test, 'wb').write(r.content)

Carga modelo previamente almacenado

In [15]:
saved_model = pyc.load_model('model_cover_type')

Transformation Pipeline and Model Successfully Loaded


In [16]:
df_test = pd.read_csv(_data_filepath_test, index_col=False)

#### Validar predicción

In [17]:
new_prediction = pyc.predict_model(saved_model, data=df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7446,0.8935,0.467,0.7559,0.7264,0.5735,0.5763
