Importar biblioteca necesaria

In [49]:
import pandas as pd

Cargar el dataset y realizar una exploración inicial del CSV cargado

In [50]:
# Read the penguins CSV from the dataset folder that sits next to this notebook's directory.
dataset_path = '../dataset/penguins_size.csv'
df = pd.read_csv(dataset_path)
# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


Observación de la estructura y los tipos de datos

In [51]:
# Print the structural summary: row count, non-null count per column, memory usage.
df.info()
# Show the dtype of every column as the cell's displayed result.
column_types = df.dtypes
column_types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


species               object
island                object
culmen_length_mm     float64
culmen_depth_mm      float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

Limpieza de datos. Se eliminan todas las filas que contienen al menos un valor NA, así como la fila cuyo valor de `sex` es un punto (.).

In [52]:
# The row whose 'sex' value is a literal dot ('.') is treated like a missing
# value: it is filtered out first, then every row with at least one NA is dropped.
# The steps are chained instead of calling dropna(inplace=True) on the filtered
# frame, which mutates a derived slice and can raise SettingWithCopyWarning.
df = df[df['sex'] != '.'].dropna()
# Count the remaining missing values per column (all zeros expected).
df.isnull().sum()


species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

Importar librerías necesarias

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.preprocessing import LabelEncoder


División del dataset

In [54]:
# Target is the species column; every other column is a predictor.
y = df['species']
X = df.drop(columns='species')

# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Preprocesamiento de datos. Se identifican las variables categóricas y se codifican con DictVectorizer. Se imprimen las dimensiones y los nombres de las columnas resultantes para verificar cómo va el proceso.

In [59]:
# Columns encoded as indicator features.
categorical_features = ['island', 'sex']

# DictVectorizer works on one dict per row, so each split is converted to records.
train_dict = X_train[categorical_features].to_dict(orient='records')
test_dict = X_test[categorical_features].to_dict(orient='records')

# The vocabulary is learned from the training rows only; the test rows are
# encoded with that same vocabulary.
dv = DictVectorizer(sparse=False)
X_train_cat = dv.fit_transform(train_dict)
X_test_cat = dv.transform(test_dict)

# Progress check: shapes of both encoded matrices and the generated column names.
for progress_item in (X_train_cat.shape, X_test_cat.shape, dv.feature_names_):
    print(progress_item)


(266, 5)
(67, 5)
['island=Biscoe', 'island=Dream', 'island=Torgersen', 'sex=FEMALE', 'sex=MALE']


Se identifican las variables numéricas y se escalan usando StandardScaler

In [60]:
# Numeric measurement columns to be standardized.
numerical_features = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train[numerical_features])

X_train_num = scaler.transform(X_train[numerical_features])
X_test_num = scaler.transform(X_test[numerical_features])

Combinación de variables numéricas y categóricas. Se imprimen las dimensiones para verificar cómo va el proceso.

In [61]:
# Place the scaled numeric columns first and the encoded categorical columns
# after them, joining the two matrices side by side (column-wise).
X_train_prepared = np.concatenate((X_train_num, X_train_cat), axis=1)
X_test_prepared = np.concatenate((X_test_num, X_test_cat), axis=1)

# Progress check: both matrices must end up with the same number of columns.
for prepared_matrix in (X_train_prepared, X_test_prepared):
    print(prepared_matrix.shape)

(266, 9)
(67, 9)


Tratar la variable objetivo. Comprobación de las clases.

In [66]:
# Learn the label encoding from the training targets only.
le = LabelEncoder().fit(y_train)

# Apply the same fitted encoding to both splits.
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# Display the learned classes as the cell's result.
le.classes_


array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

Regresión logística

In [67]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier, with the solver capped at 200 iterations,
# fitted on the prepared training matrix and the encoded labels.
log_reg = LogisticRegression(max_iter=200).fit(X_train_prepared, y_train_encoded)
# Display the fitted estimator as the cell's result.
log_reg

Máquinas de Soporte Vectorial (SVM)

In [68]:
from sklearn.svm import SVC

# Support vector classifier with probability estimates enabled.
# With probability=True, SVC shuffles the data internally to calibrate the
# probabilities, so the estimator is seeded to keep results reproducible
# across runs (same seed as the train/test split).
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train_prepared, y_train_encoded)

Árboles de Decisión

In [69]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier. The estimator's internal randomness is seeded so
# that a Restart & Run All rebuilds the same tree (same seed as the
# train/test split).
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_prepared, y_train_encoded)

K Vecinos Más Cercanos (KNN)

In [70]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 5, fitted on the prepared training
# matrix and the encoded labels.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_prepared, y_train_encoded)
# Display the fitted estimator as the cell's result.
knn_model

Serialización de los modelos

In [73]:
import pickle
from pathlib import Path

# Directory that receives one pickle file per trained model. It is created on
# demand so the cell does not fail with FileNotFoundError when it is missing.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> fitted estimator.
trained_models = {
    'log_reg_model.pkl': log_reg,      # logistic regression
    'svm_model.pkl': svm_model,        # support vector machine
    'dt_model.pkl': dt_model,          # decision tree
    'knn_model.pkl': knn_model,        # k-nearest neighbours
}

# Every file stores the same tuple layout: (dv, scaler, le, estimator), i.e.
# the fitted preprocessing objects together with the model that was trained
# on their output.
# NOTE: pickle files must only be loaded from trusted sources.
for file_name, estimator in trained_models.items():
    with open(models_dir / file_name, 'wb') as f:
        pickle.dump((dv, scaler, le, estimator), f)