In [22]:
import pandas as pd 
import seaborn as sns

df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [23]:
df_clean = df.dropna()
df_clean.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [24]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df_clean,test_size=0.2,random_state=42)

y_train = df_train['species']
y_test = df_test['species']

X_train = df_train.drop(columns=['species'])
X_test = df_test.drop(columns=['species'])

In [25]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
X_train_dict = X_train.to_dict(orient='records')
X_test_dict = X_test.to_dict(orient='records')

X_train_encoded = dv.fit_transform(X_train_dict)
X_test_encoded = dv.transform(X_test_dict)

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_test_encoded = scaler.fit_transform(X_test_encoded)
X_test_encoded = scaler.transform(X_test_encoded)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model_lr = LogisticRegression(solver='lbfgs')

model_lr.fit(X_train_encoded, y_train)
y_pred = model_lr.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Adelie       0.00      0.00      0.00        31
   Chinstrap       0.00      0.00      0.00        13
      Gentoo       0.34      1.00      0.51        23

    accuracy                           0.34        67
   macro avg       0.11      0.33      0.17        67
weighted avg       0.12      0.34      0.18        67



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
from sklearn.svm import SVC

model_svm = SVC(kernel='linear', random_state=1, probability=True)

model_svm.fit(X_train_encoded, y_train)
y_pred = model_svm.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Adelie       0.46      0.81      0.59        31
   Chinstrap       0.54      0.54      0.54        13
      Gentoo       0.00      0.00      0.00        23

    accuracy                           0.48        67
   macro avg       0.33      0.45      0.38        67
weighted avg       0.32      0.48      0.38        67



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
model_tree.fit(X_train_encoded, y_train)
y_pred = model_tree.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Adelie       0.46      1.00      0.63        31
   Chinstrap       0.00      0.00      0.00        13
      Gentoo       0.00      0.00      0.00        23

    accuracy                           0.46        67
   macro avg       0.15      0.33      0.21        67
weighted avg       0.21      0.46      0.29        67



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier(n_neighbors=3, p=2, metric='minkowski')
model_knn.fit(X_train_encoded, y_train)
y_pred = model_knn.predict(X_test_encoded)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Adelie       0.46      1.00      0.63        31
   Chinstrap       0.00      0.00      0.00        13
      Gentoo       0.00      0.00      0.00        23

    accuracy                           0.46        67
   macro avg       0.15      0.33      0.21        67
weighted avg       0.21      0.46      0.29        67



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
import pickle

with open('../models/lr.pck', 'wb') as f:
    pickle.dump((scaler, model_lr), f)

with open('../models/svm.pck', 'wb') as f:
    pickle.dump((scaler, model_svm), f)

with open('../models/dt.pck', 'wb') as f:
    pickle.dump((scaler, model_tree), f)

with open('../models/knn.pck', 'wb') as f:
    pickle.dump((scaler, model_knn), f)