# Preprocessing

Importing basic libraries

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

Getting data

In [2]:
# Load the feature vectors extracted from the butterfly images
df = pd.read_csv(filepath_or_buffer="datasets/wavelet_tranformed_images.csv")

# preview the first five rows
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target
0,2973.250839,8298.6178,10227.408617,9899.004236,8172.015204,7666.261727,5731.084946,3562.954584,5904.698711,5827.974814,...,5297.444858,7661.98228,9049.195738,11482.640987,7904.055262,5415.34124,5721.064961,5454.107444,3736.927607,1
1,9392.519574,3363.991818,261.456004,-666.648022,1848.154949,8494.3149,6813.477792,7401.008493,8362.67607,2247.631703,...,3004.623785,7599.7644,1638.887139,2.299218,94.688386,-261.439864,6704.154927,6529.286922,6264.545671,1
2,9476.486271,9019.230762,13933.35444,16848.978927,10348.201979,8850.877022,8742.038947,8408.984258,7493.416788,10981.260791,...,8849.521604,7661.706875,8522.716959,14235.784755,16722.89722,11136.510658,9414.61309,6664.266194,6068.475597,1
3,8168.386832,10118.915588,14661.496027,16742.83937,11739.847637,8467.72361,9010.492396,8867.841306,5766.367136,12302.921173,...,10085.977485,5542.47349,11509.31357,15149.222839,16099.193973,14105.852093,8593.9548,5967.203608,6122.549613,1
4,7208.727067,4441.685966,3593.017295,6723.645418,5672.881758,5250.187082,6805.785308,6857.688493,4399.938188,3617.745543,...,2981.269398,5923.172841,4506.806156,7395.912936,9073.99267,10248.565067,7053.489414,5849.718059,5985.256209,1


Splitting the dataframe into features and target. It is known from previous notebooks that the number of PCA components is 16; this is used afterwards.

In [3]:
# Label vector: the `target` column as a NumPy array
y = df['target'].copy().to_numpy()

# Feature matrix: every remaining column as a NumPy array
X = df.drop(columns = ['target']).copy().to_numpy()

# Validating models

Storing the saved models in a dictionary

In [9]:
# Reading all saved models from `final_models/`. Each of them is structured in a
# pipeline which has the following composition: Pipeline([scaler, pca, estimator]).
#
# NOTE(review): pickle.load can execute arbitrary code contained in the file, so
# `final_models/` must only hold files produced by this project.

estimators = {}

# os.listdir returns entries in arbitrary order; sorting keeps the dictionary
# order (and the column order of any table built from it) reproducible.
estimator_names = sorted(os.listdir('final_models'))
for name in estimator_names:
    model_path = os.path.join('final_models', name)

    # Skip hidden entries and directories (e.g. `.ipynb_checkpoints`): they are
    # not pickled models and would make open()/pickle.load() fail.
    if name.startswith('.') or not os.path.isfile(model_path):
        continue

    with open(model_path, 'rb') as estimator_file:
        # The dictionary key is the file-name prefix before the first underscore;
        # two files sharing a prefix overwrite each other (last one in sorted order wins).
        estimators[name.split('_')[0]] = pickle.load(estimator_file)

Computing 10-fold cross-validation scores for each model

In [22]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, precision_score, f1_score, recall_score

# Stratified 10-fold splitter; shuffling with a fixed seed keeps the folds reproducible
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 23)

# The splitter is seeded, so every estimator is evaluated on the same folds:
# compute the index pairs once instead of once per estimator.
folds = list(skf.split(X, y))

# Scorer object; it is not used in this cell and is kept only so the name stays defined
custom_precision = make_scorer(precision_score)

# Metrics computed on every fold, in the order they appear in the result rows
metric_functions = {
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

# Result layout: cross_scores[estimator name][f'{subset}_{metric}_score'] is a
# NumPy array holding one score per fold
cross_scores = {}

# iterating across all estimators
for name, estimator in estimators.items():
    # Rows are created in a fixed order: train_* first, then test_*
    fold_scores = {
        f'{subset}_{metric}_score': []
        for subset in ('train', 'test')
        for metric in metric_functions
    }

    for train_index, test_index in folds:
        # splitting the matrices
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit an unfitted copy so the object stored in `estimators` is left exactly
        # as it was loaded. Fitting it in place would silently replace the saved
        # model with one trained on the last fold only.
        fold_estimator = clone(estimator)
        fold_estimator.fit(X_train, y_train)

        # making predictions on both sets
        y_train_pred = fold_estimator.predict(X_train)
        y_test_pred = fold_estimator.predict(X_test)

        # scoring predictions; weighted averaging, and a metric that would divide
        # by zero is reported as 0
        evaluated = {
            'train': (y_train, y_train_pred),
            'test': (y_test, y_test_pred),
        }
        for subset, (y_true, y_pred) in evaluated.items():
            for metric, score_function in metric_functions.items():
                fold_scores[f'{subset}_{metric}_score'].append(
                    score_function(y_true, y_pred, average = 'weighted', zero_division = 0)
                )

    # packing data: one array of per-fold scores per row
    data_per_estimator = {key: np.array(values) for key, values in fold_scores.items()}

    # adding data to the corresponding estimator
    cross_scores[name] = data_per_estimator

In [24]:
# One column per estimator, one row per metric; each cell holds the array of per-fold scores
pd.DataFrame.from_dict(cross_scores, orient = 'columns')

Unnamed: 0,dt,kn,lr,sv
train_precision_score,"[0.44648920319033514, 0.47082203639666603, 0.3...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.44665023482126837, 0.4309710274077042, 0.46...","[0.6467168208686419, 0.6609590541095733, 0.662..."
train_recall_score,"[0.38235294117647056, 0.3917112299465241, 0.35...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.4532085561497326, 0.4425133689839572, 0.459...","[0.5655080213903744, 0.553475935828877, 0.5714..."
train_f1_score,"[0.3731088927079756, 0.37260531221123366, 0.34...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.43861491117822293, 0.42348981819164555, 0.4...","[0.5570987275853514, 0.5434517269968134, 0.559..."
test_precision_score,"[0.14465452847805788, 0.2997985907649773, 0.29...","[0.2568027210884354, 0.4214678178963893, 0.320...","[0.30990864294435727, 0.451007326007326, 0.341...","[0.2111797924297924, 0.3621546894619917, 0.304..."
test_recall_score,"[0.16666666666666666, 0.2976190476190476, 0.28...","[0.25, 0.38095238095238093, 0.2771084337349397...","[0.32142857142857145, 0.42857142857142855, 0.3...","[0.25, 0.3333333333333333, 0.30120481927710846..."
test_f1_score,"[0.14683530754959326, 0.27875452369499987, 0.2...","[0.2429235547818643, 0.38461421097699294, 0.28...","[0.30474592867450007, 0.41688732701318515, 0.3...","[0.2076599778667448, 0.3040033389546879, 0.260..."
