In [87]:
from sklearn.datasets import load_digits, load_iris
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [108]:
def load_penguin(return_X_y=True):
    df = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")
    if not return_X_y:
        return df
    df = df.dropna()
    penguin_gender = df.pop("sex")
    df = pd.concat([df, pd.get_dummies(penguin_gender)], axis=1)
    penguin_island = df.pop("island")
    df = pd.concat([df, pd.get_dummies(penguin_island)], axis=1)
    target_df = df.pop('species')
    y = target_df.values
    X = df.values
    return X, y

def train_validation_test_split(X, y, test_size=0.2, validation_size=0.25, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size, random_state=random_state, stratify=y_train)
    return X_train, X_validation, X_test, y_train, y_validation, y_test

In [125]:
df = load_penguin(return_X_y=False).dropna()
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [131]:
print("Labels:", list(df.species.unique()), "\nColumns:", list(df.columns))
df.describe()

Labels: ['Adelie', 'Chinstrap', 'Gentoo'] 
Columns: ['species', 'island', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']


Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,333.0,333.0,333.0,333.0
mean,43.992793,17.164865,200.966967,4207.057057
std,5.468668,1.969235,14.015765,805.215802
min,32.1,13.1,172.0,2700.0
25%,39.5,15.6,190.0,3550.0
50%,44.5,17.3,197.0,4050.0
75%,48.6,18.7,213.0,4775.0
max,59.6,21.5,231.0,6300.0


In [109]:
X, y = load_penguin(return_X_y=True)

In [119]:
lr_acc_list = list()
knn_acc_list = list()
lr_pre_list = list()
knn_pre_list = list()
lr_rec_list = list()
knn_rec_list = list()

for i in range(10):
    seed = i ** 2
    print("Iteration:", i, "\nSeed:", seed)
    X_train, X_validation, X_test, y_train, y_validation, y_test = train_validation_test_split(X, y, random_state=seed)
    
    model_lr = LogisticRegression(max_iter=2000)
    model_knn = KNeighborsClassifier()
    
    model_lr.fit(X_train, y_train)
    model_knn.fit(X_train, y_train)
    
    score_lr = model_lr.score(X_validation, y_validation)
    score_knn = model_knn.score(X_validation, y_validation)
    print("Validation Accuracy Score LR:", score_lr, "Validation Accuracy Score KNN:", score_knn)
    
    y_pred_lr = model_lr.predict(X_test)
    y_pred_knn = model_knn.predict(X_test)
    acc_lr = accuracy_score(y_pred_lr, y_test)
    acc_knn = accuracy_score(y_pred_knn, y_test)

    pre_lr = precision_score(y_pred_lr, y_test, average="macro")
    pre_knn = precision_score(y_pred_knn, y_test, average="macro")

    rec_lr = recall_score(y_pred_lr, y_test, average="macro")
    rec_knn = recall_score(y_pred_knn, y_test, average="macro")
    
    lr_acc_list.append(acc_lr)
    knn_acc_list.append(acc_knn)
    lr_pre_list.append(pre_lr)
    knn_pre_list.append(pre_knn)
    lr_rec_list.append(rec_lr)
    knn_rec_list.append(rec_knn)

    
    print("\nTest Results")
    print("Accuracy Score LR:", acc_lr, "Accuracy Score KNN:", acc_knn)
    print("Recall Score LR:", rec_lr, "Recall Score KNN:", rec_knn)
    print("Precision Score LR:", pre_lr, "Precision Score KNN:", pre_knn)
    
    print("\n\n")

Iteration: 0 
Seed: 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy Score LR: 0.9850746268656716 Validation Accuracy Score KNN: 0.835820895522388

Test Results
Accuracy Score LR: 1.0 Accuracy Score KNN: 0.7611940298507462
Recall Score LR: 1.0 Recall Score KNN: 0.7438568376068376
Precision Score LR: 1.0 Precision Score KNN: 0.7127805145046525



Iteration: 1 
Seed: 1
Validation Accuracy Score LR: 1.0 Validation Accuracy Score KNN: 0.7761194029850746

Test Results
Accuracy Score LR: 0.9850746268656716 Accuracy Score KNN: 0.8208955223880597
Recall Score LR: 0.9777777777777779 Recall Score KNN: 0.8075396825396827
Precision Score LR: 0.9885057471264368 Precision Score KNN: 0.7488368910782705



Iteration: 2 
Seed: 4
Validation Accuracy Score LR: 1.0 Validation Accuracy Score KNN: 0.6417910447761194

Test Results
Accuracy Score LR: 0.9850746268656716 Accuracy Score KNN: 0.7611940298507462
Recall Score LR: 0.9888888888888889 Recall Score KNN: 0.7594658553076403
Precision Score LR: 0.9761904761904763 Precision Score KNN: 0.6782293377120964


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [122]:
print("Final Result")

print("Mean Accuracy LR:", np.array(lr_acc_list).mean(), "\tMean Accuracy KNN:", np.array(knn_acc_list).mean())
print("Mean Precision LR:", np.array(lr_pre_list).mean(), "\tMean Precision KNN:", np.array(knn_pre_list).mean())
print("Mean Recall LR:", np.array(lr_rec_list).mean(), "\tMean Recall KNN:", np.array(knn_rec_list).mean())


Final Result
Mean Accuracy LR: 0.9895522388059701 	Mean Accuracy KNN: 0.7597014925373134
Mean Precision LR: 0.9870279146141214 	Mean Precision KNN: 0.686555829228243
Mean Recall LR: 0.9886918445539136 	Mean Recall KNN: 0.7521981221294594
