<a href="https://colab.research.google.com/github/ADIthaker/CNN-Techniques/blob/master/Transfer_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!unzip -q flowers17.zip -d ~/.keras/datasets

In [15]:
!ls -la ~/.keras/datasets

total 12
drwxr-xr-x 3 root root 4096 Sep 14 17:16 .
drwxr-xr-x 4 root root 4096 Sep 14 17:16 ..
drwxr-xr-x 3 root root 4096 May 30  2020 flowers17


In [16]:
import json
import os
import pathlib
from glob import glob
import h5py
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.tree import *
from tensorflow.keras.applications import *
from tensorflow.keras.preprocessing.image import *
from tqdm import tqdm
import numpy as np
import sklearn.utils as skutils

In [17]:
INPUT_SIZE = (224, 224, 3)

In [18]:
def get_pretrained_networks():
  return [
          (VGG16(input_shape=INPUT_SIZE,
                 weights='imagenet',
                 include_top=False),
           7*7*512),
          (VGG19(input_shape=INPUT_SIZE,
                 weights='imagenet',
                 include_top=False),
           7*7*512),
          (Xception(input_shape=INPUT_SIZE,
                 weights='imagenet',
                 include_top=False),
           7*7*2048),
          (ResNet152V2(input_shape=INPUT_SIZE,
                 weights='imagenet',
                 include_top=False),
           7*7*2048),
          (InceptionResNetV2(input_shape=INPUT_SIZE,
                 weights='imagenet',
                 include_top=False),
           5*5*1536),
  ]

In [33]:
def get_classifiers():
  models = {}
  models['LogisticRegression'] = LogisticRegression()
  models['SGDClf'] = SGDClassifier()
  models['PAClf'] = PassiveAggressiveClassifier()
  models['DecisionTreeClf'] = DecisionTreeClassifier()
  models['ExtraTreeClf'] = ExtraTreeClassifier()

  n_trees = 100
  models[f'RandomForestClf-{n_trees}'] = RandomForestClassifier(n_estimators=n_trees)
  models[f'ExtraTreeClf-{n_trees}'] = ExtraTreesClassifier(n_estimators=n_trees)
  
  number_of_neighbours = range(3, 25)
  for n in number_of_neighbours:
    models[f'KNeighboursClf-{n}'] = KNeighborsClassifier(n_neighbors=n)
  
  reg = [1e-3, 1e-2, 1, 10]
  for r in reg:
    models[f'LinearSVC-{r}'] = LinearSVC(C=r)
    models[f'RidgeClf-{r}'] = RidgeClassifier(alpha=r)

  print(f'Defined {len(models)} models.')
  return models

In [20]:
dataset_path = (pathlib.Path.home() / '.keras' / 'datasets' / 'flowers17')
files_pattern = (dataset_path / 'images' / '*' / '*.jpg')
images_path = [*glob(str(files_pattern))]

In [21]:
labels = []
for index in tqdm(range(len(images_path))):
  image_path = images_path[index]
  label = image_path.split(os.path.sep)[-2]
  labels.append(label)

100%|██████████| 1360/1360 [00:00<00:00, 285927.49it/s]


In [22]:
final_report = {}
best_model = None
best_accuracy = -1
best_features = None

In [23]:
class FeatureExtractor(object):
    def __init__(self,
                 model,
                 input_size,
                 label_encoder,
                 num_instances,
                 feature_size,
                 output_path,
                 features_key='features',
                 buffer_size=1000):
        if os.path.exists(output_path):
            error_msg = (f'{output_path} already exists. '
                         f'Please delete it and try again.')
            raise FileExistsError(error_msg)

        self.model = model
        self.input_size = input_size
        self.le = label_encoder
        self.feature_size = feature_size

        self.db = h5py.File(output_path, 'w')
        self.features = self.db.create_dataset(features_key,
                                               (num_instances,
                                                feature_size),
                                               dtype='float')
        self.labels = self.db.create_dataset('labels',
                                             (num_instances,),
                                             dtype='int')

        self.buffer_size = buffer_size
        self.buffer = {'features': [], 'labels': []}
        self.current_index = 0

    def extract_features(self,
                         image_paths,
                         labels,
                         batch_size=64,
                         shuffle=True):
        if shuffle:
            image_paths, labels = skutils.shuffle(image_paths,
                                                  labels)

        encoded_labels = self.le.fit_transform(labels)

        self._store_class_labels(self.le.classes_)

        for i in tqdm(range(0, len(image_paths), batch_size)):
            batch_paths = image_paths[i: i + batch_size]
            batch_labels = encoded_labels[i:i + batch_size]
            batch_images = []

            for image_path in batch_paths:
                image = load_img(image_path,
                                 target_size=self.input_size)
                image = img_to_array(image)
                image = np.expand_dims(image, axis=0)
                image = imagenet_utils.preprocess_input(image)

                batch_images.append(image)

            batch_images = np.vstack(batch_images)
            feats = self.model.predict(batch_images,
                                       batch_size=batch_size)

            new_shape = (feats.shape[0], self.feature_size)
            feats = feats.reshape(new_shape)
            self._add(feats, batch_labels)

        self._close()

    def _add(self, rows, labels):
        self.buffer['features'].extend(rows)
        self.buffer['labels'].extend(labels)

        if len(self.buffer['features']) >= self.buffer_size:
            self._flush()

    def _flush(self):
        next_index = (self.current_index +
                      len(self.buffer['features']))
        buffer_slice = slice(self.current_index, next_index)
        self.features[buffer_slice] = self.buffer['features']
        self.labels[buffer_slice] = self.buffer['labels']
        self.current_index = next_index
        self.buffer = {'features': [], 'labels': []}

    def _store_class_labels(self, class_labels):
        data_type = h5py.special_dtype(vlen=str)
        label_ds = self.db.create_dataset('label_names',
                                          (len(class_labels),),
                                          dtype=data_type)
        label_ds[:] = class_labels

    def _close(self):
        if len(self.buffer['features']) > 0:
            self._flush()

        self.db.close()

In [None]:
for model, feature_size in get_pretrained_networks():
  output_path = dataset_path / f'{model.name}_features.hdf5'
  output_path = str(output_path)
  fe = FeatureExtractor(model=model, input_size=INPUT_SIZE, label_encoder=LabelEncoder(), num_instances=len(images_path),feature_size=feature_size,output_path=output_path)
  fe.extract_features(image_paths=images_path,labels=labels)
  db = h5py.File(output_path, 'r')

  TRAIN_PROPORTION = 0.8
  SPLIT_INDEX = int(len(labels) * TRAIN_PROPORTION)

  X_train, y_train = (db['features'][:SPLIT_INDEX],
                      db['labels'][:SPLIT_INDEX])
  X_test, y_test = (db['features'][SPLIT_INDEX:],
                    db['labels'][SPLIT_INDEX:])

  classifiers_report = {
      'extractor': model.name
  }

  print(f'Spot-checking with features from {model.name}')
  for clf_name, clf in get_classifiers().items():
      try:
          clf.fit(X_train, y_train)
      except Exception as e:
          print(f'\t{clf_name}: {e}')
          continue

      predictions = clf.predict(X_test)
      accuracy = accuracy_score(y_test, predictions)

      print(f'\t{clf_name}: {accuracy}')
      classifiers_report[clf_name] = accuracy

      if accuracy > best_accuracy:
          best_accuracy = accuracy
          best_model = clf_name
          best_features = model.name

  final_report[output_path] = classifiers_report
  db.close()

100%|██████████| 22/22 [00:19<00:00,  1.13it/s]


Spot-checking with features from vgg16
Defined 37 models.
	LogisticRegression: 0.8529411764705882
	SGDClf: 0.8786764705882353
	PAClf: 0.8713235294117647
	DecisionTreeClf: 0.5294117647058824
	ExtraTreeClf: 0.3897058823529412
	RandomForestClf-100: 0.7941176470588235
	ExtraTreeClf-100: 0.8014705882352942
	KNeighboursClf-3: 0.5
	KNeighboursClf-4: 0.4889705882352941
	KNeighboursClf-5: 0.4963235294117647
	KNeighboursClf-6: 0.48161764705882354
	KNeighboursClf-7: 0.47058823529411764
	KNeighboursClf-8: 0.4485294117647059
	KNeighboursClf-9: 0.4411764705882353
	KNeighboursClf-10: 0.45588235294117646
	KNeighboursClf-11: 0.45955882352941174
	KNeighboursClf-12: 0.4411764705882353
	KNeighboursClf-13: 0.45588235294117646
	KNeighboursClf-14: 0.4522058823529412
	KNeighboursClf-15: 0.4411764705882353
	KNeighboursClf-16: 0.4411764705882353
	KNeighboursClf-17: 0.43014705882352944
	KNeighboursClf-18: 0.4264705882352941
	KNeighboursClf-19: 0.41911764705882354
	KNeighboursClf-20: 0.41911764705882354
	KNeighbo



	LinearSVC-0.001: 0.8566176470588235
	RidgeClf-0.001: 0.8419117647058824




	LinearSVC-0.01: 0.8566176470588235
	RidgeClf-0.01: 0.8419117647058824




	LinearSVC-1: 0.8566176470588235
	RidgeClf-1: 0.8419117647058824




	LinearSVC-10: 0.8566176470588235
	RidgeClf-10: 0.8419117647058824


100%|██████████| 22/22 [00:21<00:00,  1.03it/s]


Spot-checking with features from vgg19
Defined 37 models.
	LogisticRegression: 0.9117647058823529
	SGDClf: 0.8786764705882353
	PAClf: 0.9154411764705882
	DecisionTreeClf: 0.4632352941176471
	ExtraTreeClf: 0.3382352941176471
	RandomForestClf-100: 0.8419117647058824
	ExtraTreeClf-100: 0.8125
	KNeighboursClf-3: 0.5367647058823529
	KNeighboursClf-4: 0.5367647058823529
	KNeighboursClf-5: 0.5183823529411765
	KNeighboursClf-6: 0.4852941176470588
	KNeighboursClf-7: 0.48161764705882354
	KNeighboursClf-8: 0.5036764705882353
	KNeighboursClf-9: 0.4889705882352941
	KNeighboursClf-10: 0.46691176470588236
	KNeighboursClf-11: 0.47058823529411764
	KNeighboursClf-12: 0.46691176470588236
	KNeighboursClf-13: 0.46691176470588236
	KNeighboursClf-14: 0.46691176470588236
	KNeighboursClf-15: 0.4632352941176471
	KNeighboursClf-16: 0.46691176470588236
	KNeighboursClf-17: 0.45588235294117646
	KNeighboursClf-18: 0.46691176470588236
	KNeighboursClf-19: 0.45588235294117646
	KNeighboursClf-20: 0.47058823529411764
	KN



	LinearSVC-0.001: 0.8970588235294118
	RidgeClf-0.001: 0.8933823529411765




	LinearSVC-0.01: 0.8970588235294118
	RidgeClf-0.01: 0.8933823529411765




	LinearSVC-1: 0.8970588235294118
	RidgeClf-1: 0.8933823529411765




	LinearSVC-10: 0.8970588235294118
	RidgeClf-10: 0.8970588235294118


100%|██████████| 22/22 [00:25<00:00,  1.18s/it]


Spot-checking with features from xception
Defined 37 models.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	LogisticRegression: 0.5588235294117647
	SGDClf: 0.5404411764705882
	PAClf: 0.5625
	DecisionTreeClf: 0.29044117647058826
	ExtraTreeClf: 0.21323529411764705
	RandomForestClf-100: 0.43014705882352944
	ExtraTreeClf-100: 0.44485294117647056
	KNeighboursClf-3: 0.3161764705882353
	KNeighboursClf-4: 0.2757352941176471
	KNeighboursClf-5: 0.2867647058823529
	KNeighboursClf-6: 0.29044117647058826
	KNeighboursClf-7: 0.3272058823529412
	KNeighboursClf-8: 0.33455882352941174
	KNeighboursClf-9: 0.3161764705882353
	KNeighboursClf-10: 0.30514705882352944
	KNeighboursClf-11: 0.30514705882352944
	KNeighboursClf-12: 0.28308823529411764
	KNeighboursClf-13: 0.28308823529411764
	KNeighboursClf-14: 0.29044117647058826
	KNeighboursClf-15: 0.28308823529411764
	KNeighboursClf-16: 0.30514705882352944
	KNeighboursClf-17: 0.3125
	KNeighboursClf-18: 0.3235294117647059
	KNeighboursClf-19: 0.3235294117647059
	KNeighboursClf-20: 0.31985294117647056
	KNeighboursClf-21: 0.30514705882352944
	KNeighboursClf-22: 0.30882352



	LinearSVC-0.001: 0.5588235294117647
	RidgeClf-0.001: 0.5183823529411765




	LinearSVC-0.01: 0.5588235294117647
	RidgeClf-0.01: 0.5183823529411765




	LinearSVC-1: 0.5588235294117647
	RidgeClf-1: 0.5220588235294118




	LinearSVC-10: 0.5588235294117647
	RidgeClf-10: 0.5220588235294118


100%|██████████| 22/22 [00:44<00:00,  2.03s/it]


Spot-checking with features from resnet152v2
Defined 37 models.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	LogisticRegression: 0.3786764705882353
	SGDClf: 0.29411764705882354
	PAClf: 0.39338235294117646
	DecisionTreeClf: 0.26838235294117646
	ExtraTreeClf: 0.19852941176470587
	RandomForestClf-100: 0.3161764705882353
	ExtraTreeClf-100: 0.3161764705882353
	KNeighboursClf-3: 0.1948529411764706
	KNeighboursClf-4: 0.16911764705882354
	KNeighboursClf-5: 0.16544117647058823
	KNeighboursClf-6: 0.16176470588235295
	KNeighboursClf-7: 0.18382352941176472
	KNeighboursClf-8: 0.1801470588235294
	KNeighboursClf-9: 0.17647058823529413
	KNeighboursClf-10: 0.1801470588235294
	KNeighboursClf-11: 0.17279411764705882
	KNeighboursClf-12: 0.17647058823529413
	KNeighboursClf-13: 0.17647058823529413
	KNeighboursClf-14: 0.17279411764705882
	KNeighboursClf-15: 0.17279411764705882
	KNeighboursClf-16: 0.16911764705882354
	KNeighboursClf-17: 0.17279411764705882
	KNeighboursClf-18: 0.17279411764705882
	KNeighboursClf-19: 0.18382352941176472
	KNeighboursClf-20: 0.17279411764705882
	KNeighboursClf-21: 0.16544117647058823
	



	LinearSVC-0.001: 0.40808823529411764
	RidgeClf-0.001: 0.48161764705882354




	LinearSVC-0.01: 0.40441176470588236
	RidgeClf-0.01: 0.4852941176470588




	LinearSVC-1: 0.4117647058823529
	RidgeClf-1: 0.4852941176470588




	LinearSVC-10: 0.40441176470588236
	RidgeClf-10: 0.4852941176470588


100%|██████████| 22/22 [00:40<00:00,  1.84s/it]


Spot-checking with features from inception_resnet_v2
Defined 37 models.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	LogisticRegression: 0.22058823529411764
	SGDClf: 0.14705882352941177
	PAClf: 0.1213235294117647
	DecisionTreeClf: 0.18382352941176472
	ExtraTreeClf: 0.21691176470588236
	RandomForestClf-100: 0.27941176470588236
	ExtraTreeClf-100: 0.25735294117647056
	KNeighboursClf-3: 0.15441176470588236
	KNeighboursClf-4: 0.12867647058823528
	KNeighboursClf-5: 0.1213235294117647
	KNeighboursClf-6: 0.14338235294117646
	KNeighboursClf-7: 0.12867647058823528
	KNeighboursClf-8: 0.12867647058823528
	KNeighboursClf-9: 0.13602941176470587
	KNeighboursClf-10: 0.1323529411764706
	KNeighboursClf-11: 0.1323529411764706
	KNeighboursClf-12: 0.1323529411764706
	KNeighboursClf-13: 0.12867647058823528
	KNeighboursClf-14: 0.12867647058823528
	KNeighboursClf-15: 0.13970588235294118
	KNeighboursClf-16: 0.13602941176470587
	KNeighboursClf-17: 0.15073529411764705
	KNeighboursClf-18: 0.13970588235294118
	KNeighboursClf-19: 0.13970588235294118
	KNeighboursClf-20: 0.14338235294117646
	KNeighboursClf-21: 0.14338235294117646




	LinearSVC-0.001: 0.15441176470588236
	RidgeClf-0.001: 0.22426470588235295


In [30]:
final_report['best_model'] = best_model
final_report['best_accuracy'] = best_accuracy
final_report['best_features'] = best_features

for key, value in final_report.items():
  print(key, ' : ', value)