### Carga del dataset

In [19]:
import seaborn as sns

# Load the penguins dataset and discard every row that has at least one missing value
df = sns.load_dataset("penguins").dropna()

df.head()


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


### División del dataset en train (80%) y test (20%)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Replace the species labels with integer codes (Adelie -> 0, Chinstrap -> 1, Gentoo -> 2)
species_codes = LabelEncoder().fit_transform(df['species'])
df['species'] = species_codes

# Target vector and feature frame (everything except the target column)
y = df['species']
X = df.drop(columns=['species'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Normalización de los datos

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Column groups: categorical columns are one-hot encoded, numerical columns are scaled
categorical = ['island', 'sex']
numerical = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

# One-hot encoding: the vectorizer is fitted on the training rows and reused on the test rows
vectorizer = DictVectorizer(sparse=False)
train_records = train_data_x[categorical].to_dict(orient='records')
test_records = test_data_x[categorical].to_dict(orient='records')
x_train_cat = vectorizer.fit_transform(train_records)
x_test_cat = vectorizer.transform(test_records)

# Scaling: the scaler is fitted on the training rows and reused on the test rows
scaler = StandardScaler()
x_train_num = scaler.fit_transform(train_data_x[numerical])
x_test_num = scaler.transform(test_data_x[numerical])

# Join the categorical and numerical ndarrays side by side with np.hstack()
# (see https://iedib.net/avirtual/mod/forum/discuss.php?d=59418)
x_train = np.hstack([x_train_cat, x_train_num])
x_test = np.hstack([x_test_cat, x_test_num])

### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

# Training (reference: https://www.kaggle.com/code/paulgreber/logistic-regression-with-palmer-s-penguins)
lr = LogisticRegression()
# fit() returns the estimator itself; re-assigning keeps the cell from displaying it
lr = lr.fit(x_train, train_data_y)

In [23]:
from sklearn.metrics import classification_report

# Predict on the held-out test data with the logistic regression model
prediction = lr.predict(x_test)

# Compare the predictions against the true test labels
report = classification_report(test_data_y, prediction)
print("\nClassification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        22

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67



### SVM

In [24]:
from sklearn.svm import SVC

# Linear-kernel SVM; probability=True asks scikit-learn to enable probability estimates
svm = SVC(
    kernel='linear',
    C=1.0,
    probability=True,
    random_state=1,
)
# fit() returns the estimator itself; re-assigning keeps the cell from displaying it
svm = svm.fit(x_train, train_data_y)

In [25]:
# Predict on the held-out test data with the SVM model
prediction = svm.predict(x_test)

# Compare the predictions against the true test labels
report = classification_report(test_data_y, prediction)
print("\nClassification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        22

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67



### Decision Trees

In [61]:
from sklearn.tree import DecisionTreeClassifier

# Gini-criterion decision tree limited to depth 4, with a fixed seed
dt = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=4)
# fit() returns the estimator itself; re-assigning keeps the cell from displaying it
dt = dt.fit(x_train, train_data_y)

In [64]:
# Predict on the held-out test data with the decision tree model
prediction = dt.predict(x_test)

# Compare the predictions against the true test labels
report = classification_report(test_data_y, prediction)
print("\nClassification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.93      0.96        28
           1       0.89      1.00      0.94        17
           2       1.00      1.00      1.00        22

    accuracy                           0.97        67
   macro avg       0.96      0.98      0.97        67
weighted avg       0.97      0.97      0.97        67



### KNN

In [28]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier using the Minkowski metric with p=2
knn = KNeighborsClassifier(
    n_neighbors=3,
    p=2,
    metric='minkowski',
)
# fit() returns the estimator itself; re-assigning keeps the cell from displaying it
knn = knn.fit(x_train, train_data_y)

In [29]:
# Predict on the held-out test data with the KNN model
prediction = knn.predict(x_test)

# Compare the predictions against the true test labels
report = classification_report(test_data_y, prediction)
print("\nClassification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        22

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67



## Serialización de los modelos

In [30]:
import pickle
from pathlib import Path

# Each file stores a (vectorizer, scaler, model) tuple: besides the classifier we
# serialize the fitted DictVectorizer and StandardScaler so that prediction inputs
# can be given the same one-hot encoding and scaling as the training data.
# NOTE: pickle files must only be loaded from trusted sources, since unpickling
# can execute arbitrary code.
models_dir = Path('../models')

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
models_dir.mkdir(parents=True, exist_ok=True)

# File stem -> fitted classifier; every model is written to '<stem>.pck'
fitted_models = {
    'lr': lr,
    'svm': svm,
    'dt': dt,
    'knn': knn,
}

for stem, model in fitted_models.items():
    with open(models_dir / f'{stem}.pck', 'wb') as f:
        pickle.dump((vectorizer, scaler, model), f)