### Clasificación multiclase

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


In [6]:
# Load the penguins example dataset bundled with seaborn and preview the first rows.
df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


### Valores faltantes

In [3]:
# Number of missing values in each column.
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [7]:
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- enables the experimental IterativeImputer import below
from sklearn.impute import SimpleImputer, IterativeImputer

# Split the columns by kind. The non-numeric set is taken as the complement of
# the numeric one rather than include='object', so text columns are still
# selected when pandas stores them with a dedicated string dtype instead of object.
numeric_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(exclude=np.number).columns

# Numeric gaps: filled with IterativeImputer (seeded so the result is reproducible).
df[numeric_cols] = IterativeImputer(random_state=42).fit_transform(df[numeric_cols])

# Non-numeric gaps: filled with the most frequent value of each column.
df[categorical_cols] = SimpleImputer(strategy='most_frequent').fit_transform(df[categorical_cols])

# Confirm that no missing values remain.
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

### Codificar categóricos

* species
* island
* sex

LabelEncoder y OneHotEncoder

In [10]:
# Separate the input features from the target column `species`.
X = df.drop(columns='species')
y = df.loc[:, 'species']


### Codificar la entrada con OneHotEncoder
Para la entrada (X) se utiliza pd.get_dummies o OneHotEncoder,
que generan una nueva columna por cada valor categórico.

In [18]:
from sklearn.preprocessing import OneHotEncoder

# Categorical input features to encode.
categoricals = df[['island', 'sex']]

# drop='first' omits the first category of each feature from the output columns.
# The encoder is left with its default sparse output and densified with
# .toarray(): the `sparse=False` argument was renamed to `sparse_output` and
# later removed from scikit-learn, so passing it fails on current releases.
onehot_encoder = OneHotEncoder(drop='first')
categorials_encoded = onehot_encoder.fit_transform(categoricals).toarray()

column_names = onehot_encoder.get_feature_names_out(categoricals.columns)

# Reuse the source index so a column-wise concat with `df` aligns row by row.
df_categoricals = pd.DataFrame(
    categorials_encoded,
    columns=column_names,
    index=categoricals.index,
)
df_categoricals.head(2)



Unnamed: 0,island_Dream,island_Torgersen,sex_Male
0,0.0,1.0,1.0
1,0.0,1.0,0.0


In [19]:
# Append the one-hot columns to the main frame. Any dummy columns left over from
# a previous run of this cell are dropped first, so re-running the cell does not
# create duplicated column names.
df = pd.concat(
    [df.drop(columns=df_categoricals.columns, errors='ignore'), df_categoricals],
    axis=1,
)
df.head(2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,island_Dream,island_Torgersen,sex_Male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0.0,1.0,1.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0.0,1.0,0.0


### Codificar salida (y) con LabelEncoder

Equivalente a .map() de pandas

En una misma columna genera un valor numérico por cada valor categórico

In [23]:
from sklearn.preprocessing import LabelEncoder

# Target column, still holding the species names as text.
y = df['species']

# Map each species name to an integer code and store it as a new column.
label_encoder = LabelEncoder()
df['species_int'] = label_encoder.fit_transform(y)
df.head(n=2)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,island_Dream,island_Torgersen,sex_Male,species_int
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0.0,1.0,1.0,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0.0,1.0,0.0,0


### Particionamiento de datos

In [22]:
# Feature matrix: drop the raw categorical columns (their one-hot columns stay)
# and both forms of the target. Leaving `species_int` in X would hand the label
# to the model as an input feature.
X = df.drop(['species', 'species_int', 'island', 'sex'], axis=1)
# Target vector: the integer-encoded species.
y = df['species_int']