In [None]:
# Fetch the dataset repository; the train/ and test/ folders read later in this
# notebook live under the cloned 'small_dog_cat_dataset' directory.
!git clone https://github.com/anminhhung/small_dog_cat_dataset

Cloning into 'small_dog_cat_dataset'...
remote: Enumerating objects: 2608, done.[K
remote: Total 2608 (delta 0), reused 0 (delta 0), pack-reused 2608[K
Receiving objects: 100% (2608/2608), 55.84 MiB | 36.28 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [None]:
# NOTE(review): no visible cell imports skopt (scikit-optimize); the hyperparameter
# search below uses hyperopt instead — confirm this install is still needed.
!pip install scikit-optimize

In [None]:
# Install and load the autotime extension; presumably the source of the
# "time: ..." line shown after each subsequent cell.
!pip install ipython-autotime
%load_ext autotime

In [None]:
import glob
import os
import pickle
import pprint
import time
import warnings
from functools import partial

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# https://github.com/hyperopt/hyperopt-sklearn
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt import space_eval
from skimage.feature import hog
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale, normalize
from sklearn.svm import SVC

time: 3.31 ms (started: 2023-06-01 04:06:13 +00:00)


In [None]:
def read_file(path, target_size=(64, 64)):
    """Load every image below ``path`` and label it with its sub-folder name.

    ``path`` is expected to contain one sub-directory per class; each file
    inside a sub-directory is read with ``cv2.imread`` and resized to
    ``target_size``.

    Args:
        path: Directory whose sub-directories are the class folders.
        target_size: Output size handed to ``cv2.resize`` (width, height).

    Returns:
        A ``(images, labels)`` tuple of NumPy arrays of equal length, where
        ``labels[i]`` is the name of the folder ``images[i]`` came from.
        Both arrays are empty when no readable image is found.
    """
    datas = []
    label = []

    # os.listdir order is arbitrary; sorting makes the sample order reproducible.
    for category in sorted(os.listdir(path)):
        category_dir = os.path.join(path, category)
        if not os.path.isdir(category_dir):
            # Stray files beside the class folders (README, .DS_Store, ...)
            # are not classes and cannot be listed.
            continue

        for image_name in sorted(os.listdir(category_dir)):
            image_path = os.path.join(category_dir, image_name)
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread signals an unreadable/non-image file by returning
                # None; passing that on to cv2.resize would fail obscurely.
                warnings.warn(f"Skipping unreadable image: {image_path}")
                continue
            datas.append(cv2.resize(image, target_size))
            label.append(category)

    return np.array(datas), np.array(label)

time: 1 ms (started: 2023-06-01 03:30:47 +00:00)


In [None]:
# Every image is resized to this common size while loading.
target_size = (64, 64)

# Split folders inside the cloned dataset repository.
train_dir, test_dir = 'small_dog_cat_dataset/train/', 'small_dog_cat_dataset/test/'

# Load images and labels for both splits.
train_data, train_label = read_file(train_dir, target_size)
test_data, test_label = read_file(test_dir, target_size)

time: 5.98 s (started: 2023-06-01 03:30:50 +00:00)


In [None]:
# Sanity check: each split must have as many labels as images.
tuple(map(len, (train_data, train_label, test_data, test_label)))

(2000, 2000, 600, 600)

time: 4.41 ms (started: 2023-06-01 03:30:56 +00:00)


In [None]:
# feature extraction
def hog_feature(data):
    """Compute a HOG descriptor for every image in ``data``.

    Each image is converted to grayscale and described with
    ``skimage.feature.hog`` using 16x16-pixel cells, 2x2-cell blocks and
    L2-Hys block normalisation.

    Args:
        data: Iterable of colour images in BGR channel order, as produced by
            ``cv2.imread``.

    Returns:
        NumPy array holding one HOG feature vector per input image (an empty
        array when ``data`` is empty).
    """
    hog_gray_features = []

    for image in data:
        # Images loaded by cv2.imread are BGR, so BGR2GRAY (not RGB2GRAY)
        # applies the luminance weights to the correct channels.
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # visualize=False: only the descriptor is needed, so skip rendering
        # the HOG image that was previously computed and thrown away.
        hog_features = hog(
            gray_image,
            visualize=False,
            block_norm='L2-Hys',
            pixels_per_cell=(16, 16),
            cells_per_block=(2, 2),
        )
        hog_gray_features.append(hog_features)

    return np.array(hog_gray_features)

time: 780 µs (started: 2023-06-01 03:30:56 +00:00)


In [None]:
train_data = hog_feature(train_data)
test_data = hog_feature(test_data)

time: 9.5 s (started: 2023-06-01 03:30:56 +00:00)


In [None]:
# Number of hyperopt evaluations passed to fmin as max_evals.
MAX_EVALS = 50
# Default value of objective()'s n_folds parameter.
N_FOLDS = 10

time: 626 µs (started: 2023-06-01 03:39:49 +00:00)


In [None]:
def objective(params, X=train_data, y=train_label, n_folds=N_FOLDS):
    """Hyperopt objective: cross-validated loss of a LogisticRegression.

    Args:
        params: Keyword arguments for ``LogisticRegression`` sampled from the
            search space.
        X: Feature matrix. The default is bound when this cell runs, so
            re-run the cell after ``train_data`` changes.
        y: Labels matching ``X`` (default bound the same way).
        n_folds: Number of cross-validation folds.

    Returns:
        Dict in the format hyperopt expects, with ``loss`` (1 - mean macro-F1
        across the folds), the evaluated ``params`` and ``status``.
    """
    clf = LogisticRegression(**params, random_state=0, verbose=0)
    # Honour n_folds instead of a hard-coded fold count.
    scores = cross_val_score(clf, X, y, cv=n_folds, scoring='f1_macro')

    # Average over folds: taking the single best fold rewards lucky splits
    # and gives the optimiser an optimistic, noisy signal.
    mean_score = float(scores.mean())

    # hyperopt minimises, so turn the score into a loss.
    loss = 1 - mean_score

    return {'loss': loss, 'params': params, 'status': STATUS_OK}

time: 793 µs (started: 2023-06-01 03:44:14 +00:00)


In [None]:
# Search space handed to hyperopt's fmin. Each key is a LogisticRegression
# keyword argument; class_weight is held fixed rather than searched.
space = dict(
    class_weight='balanced',
    warm_start=hp.choice('warm_start', [True, False]),
    fit_intercept=hp.choice('fit_intercept', [True, False]),
    tol=hp.uniform('tol', 0.00001, 0.0001),
    C=hp.uniform('C', 0.05, 3),
    solver=hp.choice(
        'solver',
        ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    ),
    # hp.choice over a range: fmin reports the index into the range.
    max_iter=hp.choice('max_iter', range(5, 1000)),
)

time: 7.28 ms (started: 2023-06-01 03:42:26 +00:00)


In [None]:
# Tree-structured Parzen Estimator as the suggestion algorithm.
tpe_algorithm = tpe.suggest

# Trials object to track progress
bayes_trials = Trials()

# Optimize
# rstate seeds the search so a re-run reproduces the same trials.
# NOTE(review): recent hyperopt releases expect a numpy Generator here, older
# ones took np.random.RandomState — confirm against the installed version.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algorithm,
    max_evals=MAX_EVALS,
    trials=bayes_trials,
    rstate=np.random.default_rng(0),
)

 10%|█         | 5/50 [00:05<00:44,  1.01trial/s, best loss: 0.24002400240024002]







 12%|█▏        | 6/50 [00:06<00:39,  1.12trial/s, best loss: 0.24002400240024002]




100%|██████████| 50/50 [00:57<00:00,  1.15s/trial, best loss: 0.23500587514687865]
time: 57.8 s (started: 2023-06-01 03:44:17 +00:00)


In [None]:
# Raw fmin result. Parameters defined with hp.choice are reported as the index
# of the chosen option, not as the option itself.
best

{'C': 0.0828019713492531,
 'fit_intercept': 0,
 'max_iter': 769,
 'solver': 0,
 'tol': 6.769488940216495e-05,
 'warm_start': 1}

time: 4.67 ms (started: 2023-06-01 03:46:40 +00:00)


In [None]:
# Option lists for hp.choice parameters, in the same order as in `space`, so
# that an index reported by fmin selects the entry at that position.
logistic_key = dict(
    warm_start=[True, False],
    fit_intercept=[True, False],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

time: 608 µs (started: 2023-06-01 04:01:02 +00:00)


In [None]:
# fmin reports every hp.choice parameter as the *index* of the chosen option.
# That includes max_iter, which is drawn from range(5, 1000), so the raw number
# is not the iteration count. space_eval maps the raw result back onto `space`,
# yielding real values for every key, including the fixed class_weight entry.
best_params = space_eval(space, best)

best_params

{'C': 0.0828019713492531,
 'fit_intercept': True,
 'max_iter': 769,
 'solver': 'newton-cg',
 'tol': 6.769488940216495e-05,
 'warm_start': False}

time: 9.93 ms (started: 2023-06-01 04:01:19 +00:00)


In [None]:
# Optimal model
# Build the estimator the same way objective() did during the search: balanced
# class weights and a fixed random_state. Entries already present in
# best_params take precedence over these defaults.
final_params = {'class_weight': 'balanced', 'random_state': 0, **best_params}
clf = LogisticRegression(**final_params)

time: 819 µs (started: 2023-06-01 04:01:30 +00:00)


In [None]:
# Train on the full training split, then score on the held-out test split.
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
# accuracy_score takes (y_true, y_pred).
accuracy_score(test_label, y_pred)

0.7083333333333334

time: 202 ms (started: 2023-06-01 04:01:33 +00:00)


In [None]:
# save model

# save
with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)

# load
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('model.pkl', 'rb') as f:
    clf2 = pickle.load(f)

# Score the *reloaded* model so the save/load round trip is actually verified.
y_pred = clf2.predict(test_data)
accuracy_score(test_label, y_pred)

0.7083333333333334

time: 33 ms (started: 2023-06-01 04:08:33 +00:00)
