In [58]:
# Import llibreries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle



In [59]:
# Carregar el dataser de pinguins
df = sns.load_dataset("penguins")
df.head()  

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [60]:
# Eliminar les files amb valors nuls
df = df.dropna()
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [61]:
# Crear un conjunt de dades de test
species_mapping = {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}
df.species = df.species.map(species_mapping)



In [62]:
# Dividir el dataset en train i test i test
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1, stratify = df.species)

df_train = df_train_full.copy()
y_train = df_train_full.species.values
y_test = df_test.species.values


del df_train['species']
del df_test['species']


In [63]:
# separar les variables categòriques i numèriques
categorical = ['island', 'sex']
numerical = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']  

In [64]:
# Podem revisar com de correlades estan les cada variable categorica amb la la especie
calculate_mi = lambda col: mutual_info_score(col, df_train_full.species)

df_mi = df_train[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi  

Unnamed: 0,MI
island,0.509658
sex,0.000434


In [65]:
# Podem revisar com de correlades estan les cada variable numerica amb la la especie
# Per poder realizar la transformacio de les variables numeriques en variables categoriques, aplicarem la discretitzacio
kbin_discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
discretized_numerical = kbin_discretizer.fit_transform(df_train[numerical])

discretized_numerical_df = pd.DataFrame(discretized_numerical, columns=numerical)

df_mi_num = discretized_numerical_df.apply(calculate_mi)
df_mi_num

bill_length_mm       0.555429
bill_depth_mm        0.549130
flipper_length_mm    0.603758
body_mass_g          0.478659
dtype: float64

In [66]:
# Normalitzar les variables numeriques
sc = StandardScaler()
sc.fit(df_train[numerical])
X_train_num = sc.transform(df_train[numerical])

X_train_num[:5]


array([[-1.55347522, -0.09167733, -0.86902436, -1.06601543],
       [ 0.55607311, -1.44091212,  1.00974165,  0.82922867],
       [-0.9263122 , -0.04170567, -0.72450389, -1.44506425],
       [-0.33715907,  0.95772751, -0.43546297, -0.24474299],
       [ 0.42303853, -1.04113885,  1.00974165,  1.14510268]])

In [67]:
# Crear un diccionari amb les dades de entrenament
train_dict = df_train[categorical + numerical].to_dict(orient='records')  
train_dict[0] 

{'island': 'Torgersen',
 'sex': 'Female',
 'bill_length_mm': 35.7,
 'bill_depth_mm': 17.0,
 'flipper_length_mm': 189.0,
 'body_mass_g': 3350.0}

In [68]:
# Crear un vectoritzador de diccionaris
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)  

In [69]:
# Transformar les dades categoques de entrenament
X_train_cat = dv.transform(df_train[categorical].to_dict(orient='records'))
X_train_cat[0]

array([0., 0., 0., 0., 0., 0., 1., 1., 0.])

In [70]:
# Unir les variables numeriques i categoriques
X_train = np.hstack([X_train_num, X_train_cat])
X_train[:5]

array([[-1.55347522, -0.09167733, -0.86902436, -1.06601543,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ],
       [ 0.55607311, -1.44091212,  1.00974165,  0.82922867,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.9263122 , -0.04170567, -0.72450389, -1.44506425,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ],
       [-0.33715907,  0.95772751, -0.43546297, -0.24474299,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [ 0.42303853, -1.04113885,  1.00974165,  1.14510268,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ]])

In [71]:
dv.get_feature_names_out()

array(['bill_depth_mm', 'bill_length_mm', 'body_mass_g',
       'flipper_length_mm', 'island=Biscoe', 'island=Dream',
       'island=Torgersen', 'sex=Female', 'sex=Male'], dtype=object)

In [72]:
# Entrenar el model de regressió logística
lr = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)
lr.fit(X_train, y_train)  

In [73]:
# preparar les dades de test
X_test_cat = dv.transform(df_test[categorical].to_dict(orient='records'))
X_test_num = sc.transform(df_test[numerical])

X_test = np.hstack([X_test_num, X_test_cat])
dv.get_feature_names_out()


array(['bill_depth_mm', 'bill_length_mm', 'body_mass_g',
       'flipper_length_mm', 'island=Biscoe', 'island=Dream',
       'island=Torgersen', 'sex=Female', 'sex=Male'], dtype=object)

In [74]:
# Predeir les dades de test
y_pred_lr = lr.predict(X_test)
print(y_pred_lr)  
print("----")
print(y_test)  
  

[2 2 1 0 2 2 0 0 2 2 1 1 0 0 1 2 0 2 0 0 0 0 2 2 0 0 2 0 1 0 1 2 2 2 2 0 2
 2 1 2 1 0 1 0 2 2 0 2 0 1 0 2 0 0 0 1 1 0 2 0 2 0 1 0 1 0 0]
----
[2 2 1 0 2 2 0 0 2 2 1 1 0 0 1 2 0 2 0 0 0 0 2 2 0 0 2 0 1 0 1 2 2 2 2 0 2
 2 1 2 1 0 1 0 2 2 0 2 0 1 0 2 0 0 0 1 1 0 2 0 2 0 1 0 1 0 0]


In [75]:
# entrenar el model de SVM
svm = SVC(kernel='linear',C=1.0, random_state=1, probability=True)

svm.fit(X_train, y_train)

In [76]:
# Predeir les dades de test
y_pred_svm = svm.predict(X_test)
print(y_pred_svm)  
print("----")
print(y_test)  

[2 2 1 0 2 2 0 0 2 2 1 1 0 0 1 2 0 2 0 0 0 0 2 2 0 0 2 0 1 0 1 2 2 2 2 0 2
 2 1 2 1 0 1 0 2 2 0 2 0 1 0 2 0 0 0 1 1 0 2 0 2 0 1 0 1 0 0]
----
[2 2 1 0 2 2 0 0 2 2 1 1 0 0 1 2 0 2 0 0 0 0 2 2 0 0 2 0 1 0 1 2 2 2 2 0 2
 2 1 2 1 0 1 0 2 2 0 2 0 1 0 2 0 0 0 1 1 0 2 0 2 0 1 0 1 0 0]


In [77]:
# Entrenar el model de arbres de decisió
dt = DecisionTreeClassifier(criterion='gini',max_depth=4,
                                    random_state=1)
dt.fit(X_train,y_train)

In [78]:
# Predeir les dades de test
y_pred_dt = dt.predict(X_test)
print(y_pred_dt)  
print("----")
print(y_test)  

[2 2 1 0 2 2 0 0 2 2 1 1 0 0 1 2 0 2 0 0 0 0 2 2 0 0 2 0 1 0 1 2 2 2 2 0 2
 2 1 2 1 0 1 0 2 2 0 2 0 1 0 2 0 0 0 1 1 0 2 0 2 0 1 0 1 0 0]
----
[2 2 1 0 2 2 0 0 2 2 1 1 0 0 1 2 0 2 0 0 0 0 2 2 0 0 2 0 1 0 1 2 2 2 2 0 2
 2 1 2 1 0 1 0 2 2 0 2 0 1 0 2 0 0 0 1 1 0 2 0 2 0 1 0 1 0 0]


In [79]:
# Entrenar el model de KNN
knn = KNeighborsClassifier(n_neighbors=3, p=2, metric='minkowski')

knn.fit(X_train, y_train)

In [80]:
# Predeir les dades de test
y_pred_knn = knn.predict(X_test)
print(y_pred_knn)  
print("----")
print(y_test)  

[2 2 1 0 2 2 0 0 2 2 1 1 0 0 1 2 0 2 0 0 1 0 2 2 0 0 2 0 1 0 1 2 2 2 2 0 2
 2 1 2 1 0 1 0 2 2 0 2 0 1 0 2 0 0 0 1 1 0 2 0 2 0 1 0 1 0 0]
----
[2 2 1 0 2 2 0 0 2 2 1 1 0 0 1 2 0 2 0 0 0 0 2 2 0 0 2 0 1 0 1 2 2 2 2 0 2
 2 1 2 1 0 1 0 2 2 0 2 0 1 0 2 0 0 0 1 1 0 2 0 2 0 1 0 1 0 0]


In [81]:
# Guardar els models

with open('../models/lr.pck', 'wb') as f:
    pickle.dump((dv, sc, lr), f)

with open('../models/svm.pck', 'wb') as f:
    pickle.dump((dv, sc, svm), f)

with open('../models/dt.pck', 'wb') as f:
    pickle.dump((dv, sc, dt), f)

with open('../models/knn.pck', 'wb') as f:
    pickle.dump((dv, sc, knn), f)