In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# apply seaborn's default theme to the plots drawn below
sns.set_theme()

## Cargar dataset

In [None]:
# load the penguins dataset into `df` and preview its first rows
df = sns.load_dataset('penguins')
df.head()

In [None]:
# show the DataFrame dimensions as (rows, columns)
df.shape

In [None]:
# count missing values per column
df.isna().sum()

In [None]:
# drop every row that contains a missing value (mutating `df` in place), then
# re-count missing values per column to confirm none remain
df.dropna(inplace=True)
df.isna().sum()

## EDA (análisis exploratorio de datos)

In [None]:
# count plot of sex, with the bars colored by the same variable via hue
sns.countplot(data=df, x='sex', hue='sex')

In [None]:
# count plot of species, with the bars colored by the same variable via hue
sns.countplot(data=df, x='species', hue='species')

In [None]:
# count plot of island, with the bars colored by the same variable via hue
sns.countplot(data=df, x='island', hue='island')

In [None]:
# pairwise relationship plots of the DataFrame's variables, colored by species
sns.pairplot(df, hue='species')

In [None]:
# box plot of body_mass_g for each species
sns.catplot(data=df, x='species', y='body_mass_g', kind='box', hue='species')

In [None]:
# box plot of body_mass_g for each species, one panel (column) per sex
sns.catplot(data=df, x='species', y='body_mass_g', kind='box', col='sex', hue='species')

In [None]:
# box plot of body_mass_g for each species, one panel (column) per island
sns.catplot(data=df, x='species', y='body_mass_g', kind='box', col='island', hue='species')

## Preparar datos

In [None]:
# preview the first rows of the DataFrame
df.head()

In [None]:
# one-hot encode the predictors (every column except the target `species`),
# dropping the first level of each categorical column, and keep the result as X
X = pd.get_dummies(df.drop(columns='species'), drop_first=True)
X

In [None]:
# list the distinct values of species
df['species'].unique()

In [16]:
# encode the species labels as integer class ids.
# The dtype guard makes this cell idempotent: re-running an unguarded .map()
# on already-numeric labels would turn every value into NaN, because none of
# the dictionary keys would match any more.
species_codes = {
    'Adelie': 0,
    'Chinstrap': 1,
    'Gentoo': 2
}
if not pd.api.types.is_numeric_dtype(df['species']):
    df['species'] = df['species'].map(species_codes)

In [None]:
# use the species column as the target vector y
y = df['species']
y

In [None]:
# split X and y into train (70%) and test (30%) sets with a fixed seed, then
# show how many samples of each class ended up in y_train
# NOTE(review): no stratify= argument is passed here, so class proportions in
# the two subsets are not forced to match — confirm this is intended
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
y_train.value_counts()

In [None]:
# split the whole DataFrame 80/20, stratifying on species, and show the
# per-class counts in the test part
# NOTE(review): `train` / `test` are not referenced again in the visible cells
train, test = train_test_split(df, test_size = 0.20, stratify = df['species'], random_state = 42)
test['species'].value_counts()

## Modelado

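Si utilizamos un árbol de decisión (`DecisionTreeClassifier`) no es necesario escalar las variables: cada división compara una única característica contra un umbral y se elige según la reducción de impureza (Gini por defecto), por lo que el resultado no depende de la escala de las variables.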

In [None]:
# import DecisionTreeClassifier and print its built-in documentation
from sklearn.tree import DecisionTreeClassifier

help(DecisionTreeClassifier)

In [None]:
# list the columns of X_train
X_train.columns

In [22]:
# definir función para entrenar modelo, predecir, mostrar métricas y gráfico de árbol
from sklearn.tree import plot_tree

def report_results(model, X_train, X_test, y_train, y_test, average='micro', multi_class='ovo'):
    """Fit ``model``, print test-set classification metrics and plot the tree.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training data.
    X_train, X_test : array-like or DataFrame
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Target vectors matching ``X_train`` / ``X_test``.
    average : str, default 'micro'
        Averaging mode forwarded to precision, recall and F1. With
        single-label multiclass targets, 'micro' makes all three equal to
        accuracy; pass 'macro' or 'weighted' for class-sensitive scores.
    multi_class : str, default 'ovo'
        Multiclass strategy forwarded to ``roc_auc_score``.

    Returns
    -------
    None
        Everything is reported through ``print`` and a matplotlib figure.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("accuracy: ", accuracy_score(y_test, y_pred))
    print("precision: ", precision_score(y_test, y_pred, average=average))
    print("recall (Sensitivity): ", recall_score(y_test, y_pred, average=average))
    print("F1-score: ", f1_score(y_test, y_pred, average=average))

    # AUC needs class probabilities; some estimators (e.g. SVC without
    # probability=True) do not expose predict_proba, so skip instead of crashing.
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)
        if y_proba.shape[1] == 2:
            # binary targets: roc_auc_score expects the positive-class scores only
            y_proba = y_proba[:, 1]
        print("AUC: ", roc_auc_score(y_test, y_proba, multi_class=multi_class))
    else:
        print("AUC: not available (model has no predict_proba)")

    print(classification_report(y_test, y_pred))

    # Only fitted decision trees carry a `tree_` attribute; drawing is skipped
    # for any other estimator rather than failing after the metrics are printed.
    if hasattr(model, 'tree_'):
        columns = getattr(X_train, 'columns', None)
        # plot_tree wants plain strings; some scikit-learn releases reject a
        # pandas Index, so convert explicitly (None when X_train is not a frame).
        feature_names = None if columns is None else [str(col) for col in columns]
        _, ax = plt.subplots(figsize=(12, 12), dpi=150)
        plot_tree(model, filled=True, feature_names=feature_names, ax=ax)

In [None]:
# instantiate the decision tree with a fixed seed (the same 42 used for the
# train/test split) so that Restart & Run All reproduces the same tree, then
# train and evaluate it with report_results
model = DecisionTreeClassifier(random_state=42)
# model = DecisionTreeClassifier(max_depth=2, random_state=42)  # cap the tree depth
report_results(model, X_train, X_test, y_train, y_test)

In [None]:
# show the fitted model's feature importances
model.feature_importances_

In [None]:
# tabulate the fitted model's feature importances, labelled with the column
# names of X, and list them from most to least important
df_features = pd.DataFrame(
    {'Feature importance': model.feature_importances_},
    index=X.columns,
)
df_features.sort_values(by='Feature importance', ascending=False)