## CEIA - Análisis de datos

### Clase 8: Herramientas low-code que incluyen el procesamiento de datos

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import webbrowser
import os
from sklearn.model_selection import train_test_split

In [2]:
# Cargar el dataset de Titanic
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Pycaret - AutoML

In [3]:
# Por ejemplo en preparación para un modelo de clasificación
from pycaret.classification import *


In [4]:
import numpy as np
np.seterr(all='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [6]:
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Convertir los objetos a string
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str)

# Convertir pclass a string 
df['pclass'] = df['pclass'].astype(str)

# float64 para las numéricas 
df[['age', 'fare']] = df[['age', 'fare']].astype('float64')

# Ensure numerical columns are float64
df[['age', 'fare']] = df[['age', 'fare']].astype('float64')

In [8]:
# Setup de Pycaret
s = setup(
    data=df,
    target='survived',
    train_size=0.8,  # Proporción de datos de entrenamiento
    session_id=123,
    preprocess=True,
    normalize=True,
    normalize_method='minmax', # 'zscore' (StandardScaler) o'minmax'
    remove_outliers=True,
    outliers_threshold=0.01,
    ordinal_features={'pclass': ['3', '2', '1']},
    categorical_features=['sex', 'embark_town'],
    ignore_features=['parch', 'alive', 'alone', 'who', 'adult_male', 'embarked', 'class', 'deck'],
    categorical_imputation='mode',
    numeric_imputation='mean',
    fix_imbalance=True,
    fix_imbalance_method='smote',  # 'random', 'smote', 'adasyn', etc.
    verbose=True,  # True para mostrar información detallada
)

# Cargamos los datos preprocesados
# dataset_transf = get_config('dataset_transformed')

X_train_transformed = s.get_config('X_train_transformed')
y_train_transformed = s.get_config('y_train_transformed')

X_test_transformed = s.get_config('X_test_transformed')
y_test_transformed = s.get_config('y_test_transformed')


print(f"Datos preprocesados: \n\n {X_train_transformed.head()}")
print(f"Dimensiones del dataset preprocesado: {X_train_transformed.shape}")
print(f"Nulos: {X_train_transformed.isnull().sum().sum()}")


Unnamed: 0,Description,Value
0,Session id,123
1,Target,survived
2,Target type,Binary
3,Original data shape,"(891, 15)"
4,Transformed data shape,"(1049, 10)"
5,Transformed train set shape,"(870, 10)"
6,Transformed test set shape,"(179, 10)"
7,Ignore features,8
8,Ordinal features,1
9,Numeric features,3


Datos preprocesados: 

      pclass  sex       age  sibsp      fare  embark_town_Southampton  \
222     0.0  1.0  0.635587  0.000  0.015713                      1.0   
610     0.0  0.0  0.484795  0.125  0.061045                      1.0   
249     0.5  1.0  0.673285  0.125  0.050749                      1.0   
814     0.0  1.0  0.377984  0.000  0.015713                      1.0   
118     1.0  1.0  0.296306  0.000  0.483128                      0.0   

     embark_town_Cherbourg  embark_town_Queenstown  embark_town_nan  
222                    0.0                     0.0              0.0  
610                    0.0                     0.0              0.0  
249                    0.0                     0.0              0.0  
814                    0.0                     0.0              0.0  
118                    1.0                     0.0              0.0  
Dimensiones del dataset preprocesado: (870, 9)
Nulos: 0


In [25]:
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 870 entries, 222 to 1056
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   pclass                   870 non-null    float64
 1   sex                      870 non-null    float64
 2   age                      870 non-null    float64
 3   sibsp                    870 non-null    float64
 4   fare                     870 non-null    float64
 5   embark_town_Southampton  870 non-null    float64
 6   embark_town_Cherbourg    870 non-null    float64
 7   embark_town_Queenstown   870 non-null    float64
 8   embark_town_nan          870 non-null    float64
dtypes: float64(9)
memory usage: 68.0 KB


#### ¿Cómo seguiría el proceso? (Tomado de la documentación de Pycaret): https://github.com/pycaret/pycaret?tab=readme-ov-file#1-functional-api)


In [9]:
# Classification Functional API Example 

# model training and selection
best = compare_models() # Compara varios modelos usando cross-validation y parámetros default. Se queda con el mejor.

# evaluate trained model
print(f"Métricas del mejor modelo (sin ajuste) con el dataset de test: \n")
evaluate_model(best) # Evalúa el mejor modelo con el dataset de test


# predict on hold-out/test set
pred_holdout = predict_model(best) # Predecir con datos de test o aplicar transformaciones a datos no vistos y predecir
# predict on new data
# predictions = predict_model(best, data = new_data)

print(f"Predicción con dataset de test: \n {pred_holdout}")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8271,0.8614,0.7579,0.7881,0.7699,0.6319,0.635,0.048
rf,Random Forest Classifier,0.8243,0.8646,0.7464,0.786,0.7638,0.6242,0.6265,0.059
lightgbm,Light Gradient Boosting Machine,0.8214,0.8541,0.7501,0.7788,0.7622,0.6194,0.6216,0.395
knn,K Neighbors Classifier,0.8144,0.8531,0.732,0.7716,0.7466,0.6014,0.6055,0.033
ada,Ada Boost Classifier,0.8005,0.8508,0.7615,0.7325,0.7429,0.5804,0.585,0.04
et,Extra Trees Classifier,0.8004,0.8462,0.7136,0.7556,0.7307,0.5728,0.5762,0.055
dt,Decision Tree Classifier,0.7921,0.7789,0.7323,0.7321,0.7301,0.5611,0.5632,0.029
lr,Logistic Regression,0.7808,0.8488,0.7578,0.7027,0.7268,0.5446,0.5484,0.395
lda,Linear Discriminant Analysis,0.7808,0.8461,0.7394,0.7099,0.7213,0.5413,0.5449,0.03
ridge,Ridge Classifier,0.7766,0.8465,0.7283,0.7068,0.7135,0.531,0.5352,0.027


Métricas del mejor modelo (sin ajuste) con el dataset de test: 



interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8045,0.8509,0.7246,0.7576,0.7407,0.5839,0.5843


Predicción con dataset de test: 
     pclass     sex   age  sibsp        fare  embark_town  survived  \
561      3    male  40.0      0    7.895800  Southampton         0   
641      1  female  24.0      0   69.300003    Cherbourg         1   
400      3    male  39.0      0    7.925000  Southampton         1   
498      1  female  25.0      1  151.550003  Southampton         0   
875      3  female  15.0      0    7.225000    Cherbourg         1   
..     ...     ...   ...    ...         ...          ...       ...   
339      1    male  45.0      0   35.500000  Southampton         0   
841      2    male  16.0      0   10.500000  Southampton         0   
442      3    male  25.0      1    7.775000  Southampton         0   
815      1    male   NaN      0    0.000000  Southampton         0   
53       2  female  29.0      1   26.000000  Southampton         1   

     prediction_label  prediction_score  
561                 0            0.9366  
641                 1            0.9798  

In [10]:
# Ajuste de hiperparámetros
modelo_ajustado = tune_model(best)

# Guardar el modelo
save_model(modelo_ajustado, 'best_ajustado')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8889,0.9172,0.7857,0.9167,0.8462,0.76,0.7655
1,0.9028,0.931,0.8571,0.8889,0.8727,0.7941,0.7945
2,0.8592,0.8266,0.7778,0.84,0.8077,0.6968,0.6981
3,0.8028,0.8401,0.6667,0.7826,0.72,0.5693,0.5737
4,0.8451,0.883,0.8148,0.7857,0.8,0.6736,0.6739
5,0.8028,0.8535,0.6667,0.7826,0.72,0.5693,0.5737
6,0.8028,0.8418,0.7407,0.7407,0.7407,0.5816,0.5816
7,0.8592,0.9276,0.7778,0.84,0.8077,0.6968,0.6981
8,0.7887,0.8283,0.6296,0.7727,0.6939,0.5351,0.5417
9,0.8592,0.8285,0.8214,0.8214,0.8214,0.7051,0.7051


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['age', 'sibsp', 'fare'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['sex', 'embark_town'],...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=

In [11]:
# Predicción con el modelo ajustado
pred_mod_ajustado = predict_model(modelo_ajustado)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7933,0.8592,0.7391,0.7286,0.7338,0.5649,0.5649
