# Práctico 3: Clasificación

## Imports

In [1]:
# Scientific computing
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Data analysis and manipulation
import pandas as pd

# Machine learning
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.tree import plot_tree
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, log_loss

## Penguins dataset

### Cargamos el dataset

In [None]:
penguins = pd.read_csv('penguins.csv', dtype={'species':'category', 'island': 'category', 'sex':'category'})

In [None]:
penguins.info()

In [None]:
# Collect the names of every column stored with the pandas 'category' dtype.
atributos_categoricos = penguins.select_dtypes(include='category').columns.tolist()
print(atributos_categoricos)

In [None]:
atributos_continuos = list(penguins.select_dtypes(include='float').columns)
print(atributos_continuos)

### Análisis exploratorio

In [None]:
penguins.head()

#### Análisis univariado

In [None]:
penguins.describe()

In [None]:
# For each categorical column print its name and the frequency of every
# category, framed by separator lines.
separador = '----------'
for col in penguins.select_dtypes(include='category'):
    print(col, separador, penguins[col].value_counts(), separador, sep='\n')

In [None]:
# One bar chart of category frequencies per categorical column, side by side.
n_paneles = len(atributos_categoricos)
fig, axes = plt.subplots(1, n_paneles, figsize=(10, 5))

for i, atributo in enumerate(atributos_categoricos):
    frecuencias = penguins[atributo].value_counts()
    frecuencias.plot.bar(ax=axes[i])

In [None]:
#Boxplots
fig, axes = plt.subplots(len(atributos_continuos),1,figsize=(5,6))

for i, atributo in enumerate(atributos_continuos):
    ax = sns.boxplot(x=atributo, ax=axes[i], data=penguins)
    ax.set_xlabel(atributo)
    
plt.tight_layout()
plt.show()

In [None]:
# Density plots: one kernel density estimate per continuous attribute,
# stacked vertically, with the mean and the median marked on each panel.
fig, axes = plt.subplots(len(atributos_continuos),1,figsize=(5,10))

for i, atributo in enumerate(atributos_continuos):
    ax = sns.kdeplot(x=atributo, ax=axes[i], data=penguins, fill=True)
    ax.set_xlabel(atributo)
    # Dashed vertical lines: default colour for the mean, red for the median.
    ax.axvline(x=penguins[atributo].mean(), linestyle='dashed',label='Media')
    ax.axvline(x=penguins[atributo].median(), linestyle='dashed', color='red',label='Mediana')
    ax.legend()
    
plt.tight_layout()
plt.show()

#### Análisis multivariado

In [None]:
corr = penguins.corr(numeric_only=True)
sns.heatmap(corr, cmap="Blues", annot=True, fmt='.2f')
plt.show()

In [None]:
g = sns.jointplot(
    data=penguins,
    x=atributos_continuos[0],
    y=atributos_continuos[1],
    kind="kde",
    fill=True,
    alpha=0.4
)
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
plt.show()

In [None]:
sns.jointplot(
    data=penguins,
    x=atributos_continuos[0],
    y=atributos_continuos[1],
    hue="sex",
    kind="kde",
    fill=True,
    alpha=0.4
)
plt.show()

In [None]:
sns.jointplot(
    data=penguins,
    x=atributos_continuos[0],
    y=atributos_continuos[1],
    hue="species",
    kind="kde",
    fill=True,
    alpha=0.4
)
plt.show()

In [None]:
sns.jointplot(
    data=penguins,
    x=atributos_continuos[0],
    y=atributos_continuos[1],
    hue="island",
    kind="kde",
    fill=True,
    alpha=0.4
)
plt.show()

In [None]:
sns.pairplot(penguins, hue="sex")
plt.show()

In [None]:
sns.pairplot(penguins, hue="species")
plt.show()

In [None]:
sns.pairplot(penguins, hue="island")
plt.show()

In [None]:
# Work on a copy of the data without the rows that contain missing values;
# the original frame is left untouched.
penguins_nan = penguins.dropna().copy()
# Combine species, sex and island into one tuple per row so that a single
# column can be used as 'hue' to distinguish every combination.
colors = penguins_nan[['species', 'sex', 'island']].apply(tuple, axis=1)
penguins_nan['colors'] = colors
penguins_nan.head()

In [None]:
sns.pairplot(penguins_nan, hue='colors')
plt.show()

## Regresión Logística

### Regresión logística simple: univariada y binaria

#### Entrenamiento

In [None]:
# Instanciamos el algoritmo
lr_clf = LogisticRegression(solver='newton-cg',penalty=None)
lr_clf

In [None]:
# Extract the NumPy arrays.
# X: body mass reshaped into a single-feature column matrix (n_samples, 1).
X=penguins_nan['body_mass_g'].to_numpy().reshape(-1, 1)
# y: the sex label of each penguin, used as the classification target.
y=penguins_nan['sex'].to_numpy()

In [None]:
# Entrenamos
lr_clf.fit(X,y)

In [None]:
# Coeficientes
w0 = lr_clf.intercept_
w1 = lr_clf.coef_
print(w0,w1)

In [None]:
# Obtenemos predicciones
y_pred = lr_clf.predict(X)
y_pred_proba = lr_clf.predict_proba(X)

In [None]:
# Accuracy and error rate on the training data.
# accuracy_score is documented as (y_true, y_pred); the value is the same in
# either order, but the true labels are passed first to follow the API.
acc = accuracy_score(y, y_pred)
err = 1-acc
print(err)

In [None]:
# Función de pérdida (BCE)
log_loss(y,y_pred_proba)

In [None]:
# Matriz de confusión
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix_lr_simple.png')

#### Graficamos la hipótesis obtenida

In [None]:
xmin = penguins['body_mass_g'].min()
xmax = penguins['body_mass_g'].max()

X_ax = np.linspace(xmin, xmax, num=1000).reshape(-1, 1)

pred_y = lr_clf.predict(X_ax)
pred_proba_y = lr_clf.predict_proba(X_ax)

In [None]:
# Training points (body mass vs. sex label), coloured by their label.
sns.scatterplot(x=X.ravel(), y=y.ravel(), hue=y.ravel())
# Predicted class along the body-mass grid, drawn as a step function.
plt.step(X_ax, pred_y, lw=1.5, color='red', label="Clase")
# Column 0 of predict_proba is the probability of lr_clf.classes_[0].
# NOTE(review): the legend only says 'Probabilidad' — confirm which class
# the curve is meant to show.
plt.plot(X_ax, pred_proba_y[:,0], color='green', label='Probabilidad')
plt.legend()
plt.xlabel('Body mass (g)')
plt.title('Regresión logística simple')
plt.savefig('rl_simple_bm_sex.png')

#### Curvas de nivel de la función de pérdida

In [None]:
res = 100
lin_w0 = np.linspace(-7.0, -4.0, res)
lin_w1 = np.linspace(0.0005, 0.0025, res)

In [None]:
W0, W1 = np.meshgrid(lin_w0, lin_w1)

In [None]:
# Evaluate the log loss on every (w0, w1) pair of the parameter grid.
# L[i, j] is the loss of the model sigmoid(w1 * x + w0) on the training data.
L = np.zeros(W0.shape)

for i, j in np.ndindex(*W0.shape):
    # Dedicated names for the grid values, so that the fitted intercept and
    # coefficient already stored in w0 / w1 are not overwritten by this cell.
    w0_ij = W0[i, j]
    w1_ij = W1[i, j]
    # Single-column array of sigmoid outputs; log_loss interprets it as the
    # probability of the positive class.
    proba = 1/(1+np.exp(-(X*w1_ij+w0_ij)))
    L[i, j] = log_loss(y, proba)

In [None]:
# Loss values at which contour lines are drawn.
levels = [0, 0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2, 2.2, 2.4, 2.6, 2.8, 3]
# Dashed black iso-loss lines with their values written inline.
cp = plt.contour(W0, W1, L,levels,colors='black', linestyles='dashed', linewidths=1)
plt.clabel(cp, inline=1, fontsize=10)
# Filled contours over the same grid and levels.
cp = plt.contourf(W0, W1, L,levels)
plt.xlabel('w0')
plt.ylabel('w1')
plt.savefig('rl_bm_sex_loss.png')

### Regresión logística: univariada, binaria y con polinomios

#### Entrenamiento

In [None]:
# Elegimos el grado
K = 10

In [None]:
# Obtenemos los features
poly = PolynomialFeatures(degree=K, include_bias=False)
X_poly = poly.fit_transform(X)

In [None]:
# Escalamos
scaler = StandardScaler()
X_norm = scaler.fit_transform(X_poly)

In [None]:
lr_poly_clf = LogisticRegression(solver='newton-cg',penalty=None)
lr_poly_clf.fit(X_norm, y)

In [None]:
# Coeficientes
w0 = lr_poly_clf.intercept_
w1 = lr_poly_clf.coef_
print(w0,w1)

In [None]:
# Obtenemos predicciones
y_pred = lr_poly_clf.predict(X_norm)
y_pred_proba = lr_poly_clf.predict_proba(X_norm)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
# Función de pérdida (BCE)
log_loss(y,y_pred_proba)

In [None]:
# Matriz de confusión
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr_poly_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix_lr_poly.png')

#### Graficamos la hipótesis obtenida

In [None]:
# Evaluation grid spanning the observed body-mass range (taken from the
# full 'penguins' frame, not from 'penguins_nan').
xmin = penguins['body_mass_g'].min()
xmax = penguins['body_mass_g'].max()

X_ax = np.linspace(xmin, xmax, num=500).reshape(-1, 1)

# Reuse the already fitted polynomial expansion and scaler (transform only,
# no refitting) so the grid goes through the same preprocessing as X.
X_ax_poly = poly.transform(X_ax)
X_ax_norm = scaler.transform(X_ax_poly)

# Predicted class and class probabilities at every grid point.
pred_y = lr_poly_clf.predict(X_ax_norm)
pred_proba_y = lr_poly_clf.predict_proba(X_ax_norm)

In [None]:
sns.scatterplot(x=X.ravel(), y=y.ravel(), hue=y.ravel())
plt.step(X_ax, pred_y, lw=1.5, color='red', label="Clase")
plt.plot(X_ax, pred_proba_y[:,0], color='green', label='Probabilidad')
plt.legend()
plt.xlabel('Body mass (g)')
plt.title('Regresión logística con polinomios K='+str(K))
plt.savefig('rl_poly_bm_sex_K_'+str(K)+'.png')

#### Error, loss y tamaño de los coeficientes en función del grado

In [None]:
# Per-degree metrics collected by the loop below (all on the training data).
sizes = []   # mean absolute value of the fitted coefficients
errors = []  # error rate (1 - accuracy)
losses = []  # log loss

# Fit one polynomial logistic regression per degree from 1 to 19.
# NOTE: the loop rebinds K, poly, scaler, X_poly, X_norm, lr_poly_clf, y_pred
# and y_pred_proba; after this cell they refer to the last degree tried.
for K in range(1,20):
    poly = PolynomialFeatures(degree=K, include_bias=False)
    X_poly = poly.fit_transform(X)
    
    # Standardize the polynomial features before fitting.
    scaler = StandardScaler()
    X_norm = scaler.fit_transform(X_poly)
    
    lr_poly_clf = LogisticRegression(solver='newton-cg',penalty=None)
    lr_poly_clf.fit(X_norm, y)
    y_pred = lr_poly_clf.predict(X_norm)
    y_pred_proba = lr_poly_clf.predict_proba(X_norm)
    errors.append(1-accuracy_score(y,y_pred))
    sizes.append(np.mean(np.abs(lr_poly_clf.coef_)))
    losses.append(log_loss(y,y_pred_proba))

In [None]:
df_errors = pd.DataFrame({"Grado":range(1,20), "Error":errors})
line_plot_errors = sns.lineplot(
    data=df_errors,
    x="Grado", y="Error",
    marker='o',
    dashes=False,
    errorbar = ('ci', False)
)
plt.title("Error empírico en función del grado")
line_plot_fig = line_plot_errors.get_figure()
line_plot_fig.savefig('bm_sex_errors.png')

In [None]:
df_sizes = pd.DataFrame({"Grado":range(1,20), "Log valor abs promedio de coef":np.log(sizes)})
line_plot_sizes = sns.lineplot(
    data=df_sizes,
    x="Grado", y="Log valor abs promedio de coef",
    marker='o',
    dashes=False,
    errorbar = ('ci', False)
)
plt.title("Valor abs promedio de coef en función del grado")
line_plot_fig = line_plot_sizes.get_figure()
line_plot_fig.savefig('bm_sex_sizes.png')

In [None]:
df_loss = pd.DataFrame({"Grado":range(1,20), "BCE":losses})
line_plot_losses = sns.lineplot(
    data=df_loss,
    x="Grado", y="BCE",
    marker='o',
    dashes=False,
    errorbar = ('ci', False)
)
plt.title("BCE en función del grado")
line_plot_fig = line_plot_losses.get_figure()
line_plot_fig.savefig('bm_sex_losses.png')

### Regresión logística: multivariada y binaria

#### Entrenamiento

In [None]:
# Extraemos los numpy arrays
X=penguins_nan[['body_mass_g','flipper_length_mm']].to_numpy()
y=penguins_nan['sex'].to_numpy()

In [None]:
# Normalizamos
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

In [None]:
# Entrenamos
lr_clf = LogisticRegression(solver='newton-cg',penalty=None)
lr_clf.fit(X_norm,y)

In [None]:
# Coeficientes
w0 = lr_clf.intercept_
w1 = lr_clf.coef_
print(w0,w1)

In [None]:
# Obtenemos predicciones
y_pred = lr_clf.predict(X_norm)
y_pred_proba = lr_clf.predict_proba(X_norm)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
# Función de pérdida (BCE)
log_loss(y,y_pred_proba)

In [None]:
# Matriz de confusión
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix_lr_multi_bin.png')

#### Graficamos la hipótesis obtenida

In [None]:
# Plot the fitted hypothesis: the decision regions of lr_clf over the plane
# of the two features in X_norm, with the training points on top.
ax = plt.subplot(1, 1, 1)
disp=DecisionBoundaryDisplay.from_estimator(
    lr_clf,
    X_norm,
    response_method="predict",
    ax=ax,
    # NOTE(review): X_norm is the StandardScaler output, so the axes show
    # standardized values rather than grams / millimetres — confirm labels.
    xlabel='Body mass (g)',
    ylabel='Flipper length (mm)',
    eps=0.02,
    grid_resolution = 1000,
    alpha=0.5, 
    cmap='Oranges'
    )

# Fixed colour per class label for the scatter points.
colors = {'FEMALE':'black', 'MALE':'orange'}
# Plot the data points on the axes returned by the display object.
disp.ax_.scatter(X_norm[:, 0], X_norm[:, 1], 
                 c=[colors[i] for i in y], edgecolor="k")
plt.savefig('boundary_penguins_lr.png')

### Regresión logística: multivariada y multiclase

#### Entrenamiento

In [None]:
# Extraemos los numpy arrays
X=penguins_nan[['culmen_length_mm','flipper_length_mm']].to_numpy()
y=penguins_nan['species'].to_numpy()

In [None]:
# Normalizamos
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

In [None]:
# Train the model.
# No multi_class argument: for a multiclass target with the newton-cg solver
# the default already fits a multinomial (softmax) model, and the explicit
# parameter is deprecated in recent scikit-learn releases.
lr_clf = LogisticRegression(solver='newton-cg', penalty=None)
lr_clf.fit(X_norm,y)

In [None]:
# Coeficientes
w0 = lr_clf.intercept_
w1 = lr_clf.coef_
print(w0,w1)

In [None]:
# Obtenemos predicciones
y_pred = lr_clf.predict(X_norm)
y_pred_proba = lr_clf.predict_proba(X_norm)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
# Loss function: log loss (cross-entropy) of the predicted class
# probabilities against the species labels. With more than two classes this
# is the multiclass cross-entropy rather than the binary one (BCE).
log_loss(y,y_pred_proba)

In [None]:
# Matriz de confusión
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix_multi_multi.png')

#### Graficamos la hipótesis obtenida

In [None]:
# Graficamos la hipotesis obtenida
ax = plt.subplot(1, 1, 1)
disp=DecisionBoundaryDisplay.from_estimator(
    lr_clf,
    X_norm,
    response_method="predict",
    ax=ax,
    xlabel='Culmen length (mm)',
    ylabel='Flipper length (mm)',
    eps=0.02,
    grid_resolution = 1000,
    alpha=0.5, 
    cmap='Oranges'
    )

colors = {'Adelie':'black', 'Gentoo':'orange', 'Chinstrap':'red'}
# Plotting the data points    
disp.ax_.scatter(X_norm[:, 0], X_norm[:, 1], 
                 c=[colors[i] for i in y], edgecolor="k")
plt.savefig('multiclass_boundary_penguins_lr.png')

## K vecinos más cercanos

### Binario y univariado

#### Entrenamiento

In [None]:
# Elegimos el k
k=30

In [None]:
# Debemos elegir la distancia
knn_clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')

In [None]:
knn_clf

In [None]:
X=penguins_nan['body_mass_g'].to_numpy().reshape(-1, 1)
y=penguins_nan['sex'].to_numpy()

In [None]:
knn_clf.fit(X,y)

In [None]:
# Obtenemos predicciones
y_pred = knn_clf.predict(X)
y_pred_proba = knn_clf.predict_proba(X)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=knn_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix.png')

#### Graficamos la hipótesis obtenida

In [None]:
xmin = penguins['body_mass_g'].min()
xmax = penguins['body_mass_g'].max()

X_ax = np.linspace(xmin, xmax, num=1000).reshape(-1, 1)

pred_y = knn_clf.predict(X_ax)
pred_proba_y = knn_clf.predict_proba(X_ax)

In [None]:
sns.scatterplot(x=X.ravel(), y=y.ravel(), hue=y.ravel())
plt.step(X_ax, pred_y, lw=1.5, color='red', label="Predicción")
plt.legend()
plt.title('KNN clases con k='+str(k))
plt.savefig('clases_bm_k_'+str(k)+'.png')

In [None]:
# Training points plus the class probabilities predicted by KNN along the
# body-mass grid (one step curve per class).
sns.scatterplot(x=X.ravel(), y=y.ravel(), hue=y.ravel())
plt.step(X_ax, pred_proba_y, lw=1.5, label=["FEMALE","MALE"])
plt.legend()
# Title and file name specific to the probability plot, so that it does not
# overwrite the class-prediction figure saved as 'clases_bm_k_<k>.png'.
plt.title('KNN probabilidades con k='+str(k))
plt.savefig('proba_bm_k_'+str(k)+'.png')

#### Error en función de k

In [None]:
errors = []

for K in range(1,300):
    knn_clf = KNeighborsClassifier(n_neighbors=K, metric='euclidean')
    knn_clf.fit(X, y)
    y_pred = knn_clf.predict(X)
    errors.append(1-accuracy_score(y,y_pred))

In [None]:
df_errors = pd.DataFrame({"K":range(1,300), "Error":errors})
line_plot_errors = sns.lineplot(
    data=df_errors,
    x="K", y="Error",
    dashes=False,
    errorbar = ('ci', False)
)
plt.title("Error empírico en función de k")
line_plot_fig = line_plot_errors.get_figure()
line_plot_fig.savefig('knn_bm_sex_errors.png')

### Binario y multivariado

#### Entrenamiento

In [None]:
X=penguins_nan[['body_mass_g','flipper_length_mm']].to_numpy()
y=penguins_nan['sex'].to_numpy()

In [None]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

In [None]:
# Entrenamos
k=30
knn_clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
knn_clf.fit(X_norm,y)

In [None]:
y_pred = knn_clf.predict(X_norm)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=knn_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix_2.png')

#### Graficamos la hipótesis obtenida

In [None]:
# Graficamos la hipotesis obtenida
ax = plt.subplot(1, 1, 1)
disp=DecisionBoundaryDisplay.from_estimator(
    knn_clf,
    X_norm,
    response_method="predict",
    ax=ax,
    xlabel='Body mass (g)',
    ylabel='Flipper length (mm)',
    eps=0.02,
    grid_resolution = 1000,
    alpha=0.5, 
    cmap='Oranges'
    )

colors = {'FEMALE':'black', 'MALE':'orange'}
# Plotting the data points    
disp.ax_.scatter(X_norm[:, 0], X_norm[:, 1], 
                 c=[colors[i] for i in y], edgecolor="k")
plt.savefig('boundary_penguins_knn.png')

### Multiclase y multivariado

#### Entrenamiento

In [None]:
X=penguins_nan[['culmen_length_mm','flipper_length_mm']].to_numpy()
y=penguins_nan['species'].to_numpy()

In [None]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

In [None]:
# Entrenamos
k=15
knn_clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
knn_clf.fit(X_norm,y)

In [None]:
y_pred = knn_clf.predict(X_norm)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
# Confusion matrix of the multiclass KNN model on the training data.
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=knn_clf.classes_)

disp.plot(cmap="OrRd")
# Dedicated file name: 'confusion_matrix_2.png' is already written by the
# binary multivariate KNN section and would be overwritten here.
plt.savefig('confusion_matrix_knn_multiclase.png')

#### Graficamos la hipótesis obtenida

In [None]:
# Graficamos la hipotesis obtenida
ax = plt.subplot(1, 1, 1)
disp=DecisionBoundaryDisplay.from_estimator(
    knn_clf,
    X_norm,
    response_method="predict",
    ax=ax,
    xlabel='Culmen length (mm)',
    ylabel='Flipper length (mm)',
    eps=0.02,
    grid_resolution = 1000,
    alpha=0.5, 
    cmap='Oranges'
    )

colors = {'Adelie':'black', 'Gentoo':'orange', 'Chinstrap':'red'}

# Ploteamos los puntos    
disp.ax_.scatter(X_norm[:, 0], X_norm[:, 1], 
                 c=[colors[i] for i in y], edgecolor="k")
plt.savefig('multiclass_boundary_penguins_knn.png')

## Árboles de decisión

### Binario y univariado

#### Entrenamiento

In [None]:
X=penguins_nan['body_mass_g'].to_numpy().reshape(-1, 1)
y=penguins_nan['sex'].to_numpy()

In [None]:
# Elegimos la profundidad
depth=5

In [None]:
tree_clf = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
tree_clf.fit(X, y)

In [None]:
# Obtenemos predicciones
y_pred = tree_clf.predict(X)
y_pred_proba = tree_clf.predict_proba(X)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix_arbol_univ.png')

#### Graficamos la hipótesis obtenida

In [None]:
# Graficamos el arbol
plt.figure(figsize=(12,5))
plot_tree(tree_clf,filled=True,fontsize=10)
plt.savefig('arbol_univ_entropy_d_'+str(depth)+'.png')

In [None]:
xmin = penguins['body_mass_g'].min()
xmax = penguins['body_mass_g'].max()

X_ax = np.linspace(xmin, xmax, num=1000).reshape(-1, 1)

pred_y = tree_clf.predict(X_ax)
pred_proba_y = tree_clf.predict_proba(X_ax)

In [None]:
# Training points plus the class predicted by the tree along the body-mass
# grid, drawn as a step function.
sns.scatterplot(x=X.ravel(), y=y.ravel(), hue=y.ravel())
plt.step(X_ax, pred_y, lw=1.5, color='red', label="Predicción")
plt.legend()
# Take the split criterion from the fitted tree so that the title and the
# file name always match the model (it is trained with 'entropy', not Gini).
criterio = tree_clf.criterion
plt.title('Árbol ('+criterio.capitalize()+') con profundidad='+str(depth))
plt.savefig('clases_arbol_'+criterio+'_univ_d_'+str(depth)+'.png')

In [None]:
sns.scatterplot(x=X.ravel(), y=y.ravel(), hue=y.ravel())
plt.step(X_ax, pred_proba_y, lw=1.5, label=["FEMALE","MALE"])
plt.legend()
plt.title('Árbol (Entropy) con profundidad ='+str(depth))
plt.savefig('proba_arbol_entropy_univ_d_'+str(depth)+'.png')

#### Error y número de hojas en función de la profundidad

In [None]:
errors = []
leafs = []

for depth in range(1,20):
    tree_clf = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
    tree_clf.fit(X, y)
    y_pred = tree_clf.predict(X)
    errors.append(1-accuracy_score(y,y_pred))
    leafs.append(tree_clf.get_n_leaves())

In [None]:
df_errors = pd.DataFrame({"Depth":range(1,20), "Error":errors})
line_plot_errors = sns.lineplot(
    data=df_errors,
    x="Depth", y="Error",
    marker='o',
    dashes=False,
    errorbar = ('ci', False)
)
plt.title("Error empírico en función de la profundidad")
line_plot_fig = line_plot_errors.get_figure()
line_plot_fig.savefig('tree_bm_sex_errors.png')

In [None]:
df_leafs = pd.DataFrame({"Depth":range(1,20), "Leafs":leafs})
line_plot_leafs = sns.lineplot(
    data=df_leafs,
    x="Depth", y="Leafs",
    marker='o',
    dashes=False,
    errorbar = ('ci', False)
)
plt.title("Número de hojas en función de la profundidad")
line_plot_fig = line_plot_leafs.get_figure()
line_plot_fig.savefig('tree_bm_sex_leafs.png')

### Binario y multivariado

#### Entrenamiento

In [None]:
X=penguins_nan[['body_mass_g','flipper_length_mm']].to_numpy()
y=penguins_nan['sex'].to_numpy()

In [None]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

In [None]:
# Entrenamos
depth=3
tree_clf = DecisionTreeClassifier(max_depth=depth,criterion='entropy')
tree_clf.fit(X_norm,y)

In [None]:
# Predict with the decision tree fitted in this section. The previous code
# used knn_clf, a leftover KNN model from an earlier section, so the error
# and confusion matrix below did not describe the tree at all.
y_pred = tree_clf.predict(X_norm)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix_arbol_bin_mulv.png')

#### Graficamos la hipótesis obtenida

In [None]:
# Graficamos el arbol
plt.figure(figsize=(10,5))
plot_tree(tree_clf,filled=True,fontsize=10)
plt.savefig('arbol_bin_multiv.png')

In [None]:
# Graficamos la hipotesis obtenida
ax = plt.subplot(1, 1, 1)
disp=DecisionBoundaryDisplay.from_estimator(
    tree_clf,
    X_norm,
    response_method="predict",
    ax=ax,
    xlabel='Body mass (g)',
    ylabel='Flipper length (mm)',
    eps=0.02,
    grid_resolution = 1000,
    alpha=0.5, 
    cmap='Oranges'
    )

colors = {'FEMALE':'black', 'MALE':'orange'}
# Plotting the data points    
disp.ax_.scatter(X_norm[:, 0], X_norm[:, 1], 
                 c=[colors[i] for i in y], edgecolor="k")
plt.savefig('boundary_penguins_tree.png')

### Multiclase y multivariado

#### Entrenamiento

In [None]:
X=penguins_nan[['culmen_length_mm','flipper_length_mm']].to_numpy()
y=penguins_nan['species'].to_numpy()

In [None]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)

In [None]:
# Entrenamos
depth=4
tree_clf = DecisionTreeClassifier(max_depth=depth, criterion='entropy')
tree_clf.fit(X_norm,y)

In [None]:
y_pred = tree_clf.predict(X_norm)

In [None]:
# Porcentaje de acierto y error
acc = accuracy_score(y_pred, y)
err = 1-acc
print(err)

In [None]:
cm=confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree_clf.classes_)

disp.plot(cmap="OrRd")
plt.savefig('confusion_matrix_arbol_mulc.png')

#### Graficamos la hipótesis obtenida

In [None]:
# Graficamos el arbol
plt.figure(figsize=(22,6))
plot_tree(tree_clf,filled=True,fontsize=10)
plt.savefig('arbol_multiclase.png')

In [None]:
# Graficamos la hipotesis obtenida
ax = plt.subplot(1, 1, 1)
disp=DecisionBoundaryDisplay.from_estimator(
    tree_clf,
    X_norm,
    response_method="predict",
    ax=ax,
    xlabel='Culmen length (mm)',
    ylabel='Flipper length (mm)',
    eps=0.02,
    grid_resolution = 1000,
    alpha=0.5, 
    cmap='Oranges'
    )

colors = {'Adelie':'black', 'Gentoo':'orange', 'Chinstrap':'red'}

# Ploteamos los puntos    
disp.ax_.scatter(X_norm[:, 0], X_norm[:, 1], 
                 c=[colors[i] for i in y], edgecolor="k")
plt.savefig('multiclass_boundary_penguins_tree.png')