## CEIA - Análisis de datos

### Clase 8: Automatización de análisis de datos. EDA automático.

### 1. Biblioteca ydata-profiling

In [1]:
import pandas as pd
import seaborn as sns
from ydata_profiling import ProfileReport
import webbrowser
import os
from sklearn.model_selection import train_test_split

In [2]:
# Cargar el dataset de Titanic
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### 1. EDA automático con biblioteca ydata-profiling

In [3]:
# Creamos el reporte
profile_1 = ProfileReport(df, title="Pandas Profiling Report", explorative=False, correlations={
    # default "auto": {"calculate": True}
    "pearson": {"calculate": True},
    "spearman": {"calculate": True},
    "kendall": {"calculate": True},
    "cramers": {"calculate": True}, 
    }
)


In [4]:
# Exportar el repote a un archivo HTML o Json
profile_1.to_file("../recursos/titanic_report_1.html")

profile_1.to_file("../recursos/titanic_report_1.json") # Esta opción se usa para customizar el reporte 


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 15/15 [00:00<00:00, 473.70it/s]
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'putmask: first argument must be an array')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Pequena función para abrir el reporte en el navegador
def open_html_report(report_path):
    file_path = os.path.abspath(report_path)

    # Abrir el reporte en el browser
    webbrowser.open(f"file:///{file_path}")

    return

In [6]:
# Si lo queremos mostrar directamente en la notebook, correr la siguiente línea:
# profile.to_notebook_iframe() 

In [7]:
# Abrir el reporte 1 en el browser
open_html_report("../recursos/titanic_report_1.html")

In [8]:
# Exploración más exhaustiva con el flag "explorative"
profile_2 = ProfileReport(df, title="Pandas Profiling Report EDA", explorative=True)
profile_2.to_file("../recursos/titanic_report_2.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 15/15 [00:00<00:00, 194781.92it/s]
(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'putmask: first argument must be an array')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Abrir el reporte 2 en el browser
open_html_report("../recursos/titanic_report_2.html")

### 2. EDA automático con biblioteca sweetviz

In [10]:
import sweetviz as sv

In [11]:
sv.analyze(df).show_html("../recursos/sweetviz_report.html")

                                             |          | [  0%]   00:00 -> (? left)

Report ../recursos/sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [12]:
# Comparar 2 datasets (por ejemplo Train y Test)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Generar el reporte de comparación
comparacion = sv.compare([train_df, "Train"], [test_df, "Test"])

# Guardar el reporte HTML
comparacion.show_html("../recursos/sweetviz_comparacion.html")


                                             |          | [  0%]   00:00 -> (? left)

Report ../recursos/sweetviz_comparacion.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


### 3. Bibliotecas de Auto-ML

### Pycaret

In [13]:
# Por ejemplo en preparación para un modelo de clasificación
from pycaret.classification import *


In [14]:
import numpy as np
np.seterr(all='ignore')

{'divide': 'raise', 'over': 'raise', 'under': 'raise', 'invalid': 'raise'}

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [16]:
# Convertir los objetos a string
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str)

# Convertir pclass a string 
df['pclass'] = df['pclass'].astype(str)

# Ensure numerical columns are float64
df[['age', 'fare']] = df[['age', 'fare']].astype('float64')

In [17]:
# Setup de Pycaret
s = setup(
    data=df,
    target='survived',
    session_id=123,
    preprocess=True,
    normalize=False,
    normalize_method='minmax', # 'zscore' (StandardScaler) o'minmax'
    remove_outliers=True,
    outliers_threshold=0.01,
    ordinal_features={'pclass': ['3', '2', '1']},
    categorical_features=['sex', 'embark_town'],
    ignore_features=['parch', 'alive', 'alone', 'who', 'adult_male', 'embarked', 'class', 'sibsp', 'deck'],
    categorical_imputation='mode',
    numeric_imputation='mean',
    verbose=True,  # set to True to catch any internal errors
)

# Cargamos los datos preprocesados
dataset_transf = get_config('dataset_transformed')

print("Datos preprocesados:")
print(dataset_transf.head())
print(f"Dimensiones del dataset preprocesado: {dataset_transf.shape}")
print(f"Nulos: {dataset_transf.isnull().sum().sum()}")


Unnamed: 0,Description,Value
0,Session id,123
1,Target,survived
2,Target type,Binary
3,Original data shape,"(891, 15)"
4,Transformed data shape,"(884, 9)"
5,Transformed train set shape,"(616, 9)"
6,Transformed test set shape,"(268, 9)"
7,Ignore features,9
8,Ordinal features,1
9,Numeric features,2


Datos preprocesados:
     pclass  sex        age    fare  embark_town_Southampton  \
199     1.0  0.0  24.000000  13.000                      1.0   
468     0.0  1.0  28.941458   7.725                      0.0   
198     0.0  0.0  28.941458   7.750                      0.0   
574     0.0  1.0  16.000000   8.050                      1.0   
776     0.0  1.0  28.941458   7.750                      0.0   

     embark_town_Queenstown  embark_town_Cherbourg  embark_town_nan  survived  
199                     0.0                    0.0              0.0         0  
468                     1.0                    0.0              0.0         0  
198                     1.0                    0.0              0.0         1  
574                     0.0                    0.0              0.0         0  
776                     1.0                    0.0              0.0         0  
Dimensiones del dataset preprocesado: (884, 9)
Nulos: 0


In [18]:
dataset_transf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 884 entries, 199 to 779
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   pclass                   884 non-null    float64
 1   sex                      884 non-null    float64
 2   age                      884 non-null    float32
 3   fare                     884 non-null    float32
 4   embark_town_Southampton  884 non-null    float64
 5   embark_town_Queenstown   884 non-null    float64
 6   embark_town_Cherbourg    884 non-null    float64
 7   embark_town_nan          884 non-null    float64
 8   survived                 884 non-null    int8   
dtypes: float32(2), float64(6), int8(1)
memory usage: 56.1 KB


#### ¿Cómo seguiría el proceso? (Tomado de la documentación de Pycaret: https://github.com/pycaret/pycaret?tab=readme-ov-file#1-functional-api)


In [19]:
# Classification Functional API Example 

# model training and selection
best = compare_models()

# evaluate trained model
evaluate_model(best)

# predict on hold-out/test set
# pred_holdout = predict_model(best)

# # predict on new data
# new_data = df.copy().drop('survived', axis = 1)
# predictions = predict_model(best, data = new_data)

# # save model
# save_model(best, 'best_pipeline')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.825,0.8429,0.7152,0.8085,0.757,0.6213,0.6259,0.033
rf,Random Forest Classifier,0.8073,0.8447,0.715,0.7726,0.7386,0.5867,0.5915,0.049
lr,Logistic Regression,0.8024,0.8495,0.7239,0.7571,0.7355,0.5786,0.5835,0.309
lightgbm,Light Gradient Boosting Machine,0.8024,0.8421,0.7025,0.7673,0.7299,0.5751,0.5796,0.289
ridge,Ridge Classifier,0.7992,0.8492,0.7029,0.7587,0.7266,0.5688,0.5724,0.022
ada,Ada Boost Classifier,0.7992,0.8371,0.7199,0.7525,0.7327,0.5724,0.5757,0.033
lda,Linear Discriminant Analysis,0.7976,0.8492,0.7029,0.7562,0.7253,0.5659,0.5696,0.022
et,Extra Trees Classifier,0.7945,0.8309,0.7025,0.7526,0.7236,0.5606,0.5642,0.047
nb,Naive Bayes,0.7784,0.8197,0.7366,0.7097,0.7166,0.5358,0.5432,0.023
dt,Decision Tree Classifier,0.7544,0.7349,0.6696,0.6849,0.6747,0.4779,0.4801,0.025


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…