### Import Libraries

In [1]:
import os
import glob
import random
import pickle

from tqdm import tqdm

import numpy as np

# OpenCV
import cv2

# Pandas
import pandas as pd

# TensorFlow
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet101
from tensorflow.keras.applications.densenet import DenseNet169
from tensorflow.keras.layers import Flatten

# Scikit-learn
from sklearn.decomposition import PCA

### Read the image file paths into separate lists for pneumonia and normal images

In [2]:
dataset_path = "../../dataset/ctscan/3A_images_resized/all"

# glob returns matches in arbitrary, OS-dependent order. Each pattern's matches are sorted so
# that the subset of images picked later (the first N per class) is the same on every run and
# on every machine. PNG matches are still listed before JPG matches.
pneumonia_files, normal_files = [], []
for pattern in ("*.png", "*.jpg"):
  pneumonia_files.extend(sorted(glob.glob(os.path.join(dataset_path, "Pneumonia", pattern))))
  normal_files.extend(sorted(glob.glob(os.path.join(dataset_path, "Normal", pattern))))

# all_files is a two-element list: element 0 is the list of Normal image paths and
# element 1 is the list of Pneumonia image paths.
all_files = [normal_files, pneumonia_files]
print("Image Files Count\nNormal: {}\npneumonia: {}\nTotal: {}".format(len(normal_files), len(pneumonia_files), len(all_files[0] + all_files[1])))

Image Files Count
Normal: 6332
pneumonia: 6334
Total: 12666


### Define Data Size

In [3]:
# Number of images to load per class; use 2500 or 5000.
pneumonia_count = 2500
normal_count = pneumonia_count

### Function to read image from file list and store the corresponding label

In [4]:
def get_dataset(files, label, data_count, img_size, image_id_count):
  """Load up to `data_count` images from `files` and tag each one with `label`.

  Every readable image is resized to (img_size, img_size) and stored in a dictionary with
  the keys "id", "filepath", "image" and "label". Ids are assigned consecutively, starting
  at `image_id_count`; a file that cannot be loaded is reported and does not consume an id.

  Args:
    files: iterable of image file paths.
    label: class label stored with every loaded image.
    data_count: stop after this many images have been loaded. A value below 1 never
      matches, so every file is read.
    img_size: width and height, in pixels, that each image is resized to.
    image_id_count: id given to the first image loaded by this call.

  Returns:
    A tuple (dataset, next_id): the list of sample dictionaries and the first id not used
    by this call, ready to be passed as `image_id_count` to the next call.
  """
  dataset = []  # One dictionary per successfully loaded image

  for path in tqdm(files):  # Loop over each file location
    try:
      img = cv2.imread(path)
      if img is None:
        # cv2.imread reports an unreadable file by returning None instead of raising.
        raise ValueError("image could not be read")
      img = cv2.resize(img, (img_size, img_size))
    except Exception as e:
      print("faulty image: {} {}".format(path, e))
      continue

    dataset.append({"id": image_id_count, "filepath": path, "image": img, "label": label})
    # Advance the id before checking the limit so the returned value is always the next
    # free id, whether the loop stops early or runs out of files.
    image_id_count += 1
    if len(dataset) == data_count:
      break
  return dataset, image_id_count

### Read the images from the file lists and store each one with its label

In [5]:
c_dataset, nc_dataset, t_dataset = [], [], []
image_id_count = 0       # Running id counter; every loaded image gets a unique id.
img_size = 224
# all_files => [NC, C]: position 0 holds the Normal paths, position 1 the pneumonia paths.
# The position is also passed to get_dataset as the class label.
for class_label, class_files in enumerate(all_files):
  limit = normal_count if class_label == 0 else pneumonia_count
  loaded, image_id_count = get_dataset(class_files, class_label, limit, img_size, image_id_count)
  if class_label == 0:
    nc_dataset = loaded
  else:
    c_dataset = loaded
tot_dataset = nc_dataset + c_dataset
print("Dataset Count\nNormal: {}\npneumonia: {}\nTotal: {}".format(len(nc_dataset), len(c_dataset), len(tot_dataset)))

 39%|██████████████████████████████▍                                              | 2499/6332 [00:20<00:32, 119.20it/s]
 39%|██████████████████████████████▍                                              | 2499/6334 [00:17<00:27, 139.63it/s]

Dataset Count
Normal: 2500
pneumonia: 2500
Total: 5000





### Extract only the images from the dataset to feed to the DNN

In [6]:
# Collect the "image" entry of every sample, in dataset order, into one array.
image_only = np.array([sample["image"] for sample in tot_dataset])

### Define batch size

In [7]:
print(image_only.shape)
total_ds = normal_count + pneumonia_count
image_only = image_only[:total_ds]

# Up to 5000 requested images are processed as a single batch; larger requests are
# split into batches of half the loaded image count.
loaded_images = image_only.shape[0]
if total_ds > 5000:
  batch_size = int(loaded_images / 2)
else:
  batch_size = loaded_images
print(batch_size)

(5000, 224, 224, 3)
5000


### Generate batches of images to feed into DNN

In [8]:
# Multiply every pixel value by 1/255 and serve the images in their stored order
# (shuffle is disabled so batch rows keep the same order as image_only).
img_datagen = ImageDataGenerator(rescale=1.0 / 255)
batch_img = img_datagen.flow(
  image_only,
  batch_size=batch_size,
  shuffle=False,
)

### Define a function to select a model from three (VGG16, ResNet101 and DenseNet169)

In [9]:
def all_models(img_size, model_sel):
  """Build the selected ImageNet-pretrained backbone without its classification head.

  Args:
    img_size: width and height of the square, 3-channel input images.
    model_sel: 1 for VGG16, 2 for ResNet101, 3 for DenseNet169.

  Returns:
    A tuple (model, feature_size) where feature_size is the number of values in the
    model's flattened output for one image (e.g. 25088 for VGG16 at img_size=224).

  Raises:
    ValueError: if model_sel is not 1, 2 or 3.
  """
  builders = {1: VGG16, 2: ResNet101, 3: DenseNet169}
  if model_sel not in builders:
    raise ValueError("model_sel must be 1 (VGG16), 2 (ResNet101) or 3 (DenseNet169), got {!r}".format(model_sel))

  model = builders[model_sel](input_shape=(img_size, img_size, 3), include_top=False, weights='imagenet')

  # Derive the flattened feature length from the model itself instead of hard-coding it,
  # so the value stays correct when img_size is not 224. output_shape[0] is the batch axis.
  feature_size = 1
  for dim in model.output_shape[1:]:
    feature_size *= int(dim)
  return model, feature_size

### Select one of the three models

In [14]:
# Feature extractor to use: 1) VGG16, 2) Resnet101, 3) Densenet169.
# To choose interactively instead, use:
#   int(input("Enter the number for: \n 1) VGG16 \n 2) Resnet101  \n 3) Densenet169 "))
select_model = 2

### Extract image features with the selected DNN model

In [15]:
all_features, reduced_features = [], []
all_feat = []

model, feature_size = all_models(img_size, select_model)

# Run every batch through the network and keep one flattened feature row per image.
for batch_index in tqdm(range(len(batch_img))):
  batch = batch_img[batch_index]  # fetch the batch once; indexing the iterator rebuilds it
  # Reshape on the actual number of images in this batch: the final batch can be
  # shorter than batch_size.
  all_features.append(model.predict(batch).reshape(len(batch), feature_size))

# Stack the per-batch features into one (n_images, feature_size) matrix.
# NOTE: all features are held in memory at once; with several batches the stacking
# makes a temporary copy.
feature_matrix = all_features[0] if len(all_features) == 1 else np.concatenate(all_features, axis=0)

# Fit a single PCA on all images so every reduced vector lives in the same basis;
# fitting one PCA per batch would project each batch onto unrelated components.
# PCA requires n_components <= min(n_samples, n_features), hence the clamp.
n_components = min(batch_size, feature_matrix.shape[0], feature_matrix.shape[1])
pca = PCA(n_components=n_components)
reduced_features = pca.fit_transform(feature_matrix)
all_feat.extend(reduced_features)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:32<00:00, 92.64s/it]


### Replace each image with its extracted feature vector

In [16]:
# Check up front that there is one feature vector per sample, so the dataset is never
# left half-converted by a failure in the middle of the loop.
sample_count = min(len(tot_dataset), total_ds)
if len(all_feat) < sample_count:
  raise ValueError("expected at least {} feature vectors, got {}".format(sample_count, len(all_feat)))

# Overwrite the raw pixels of each sample with its reduced feature vector (same order).
for sample, feature in zip(tot_dataset[:total_ds], all_feat):
  sample['image'] = feature

# print sample dataset pneumonia and Normal
print(len(tot_dataset))
print(tot_dataset[0])
# Pneumonia samples are appended after the Normal ones, so the first pneumonia sample sits
# at index len(nc_dataset) regardless of how many images per class were loaded.
first_pneumonia_index = len(nc_dataset)
if first_pneumonia_index < len(tot_dataset):
  print(tot_dataset[first_pneumonia_index])

5000
{'id': 0, 'filepath': '../../dataset/ctscan/3A_images_resized/all\\Normal\\Normal_1671_793_0000.png', 'image': array([-5.1546894e+01,  5.1121563e-01, -9.4625206e+00, ...,
       -2.0666956e-04,  3.2767650e-05,  8.2749648e-05], dtype=float32), 'label': 0}
{'id': 2500, 'filepath': '../../dataset/ctscan/3A_images_resized/all\\Pneumonia\\CP_1072_3115_0037.png', 'image': array([4.8701744e+00, 6.6690707e+00, 6.0499935e+00, ..., 1.9359033e-04,
       5.5109460e-05, 8.3378371e-05], dtype=float32), 'label': 1}


### Save the extracted features to a pickle file for future use

In [17]:
# Path prefix of the output file: directory plus the "pneumonia_" file-name prefix.
filepath = "../../pickle_files/al/ct_scan/pneumonia_"

# File-name suffix for each selectable model.
model_names = {1: "vgg16", 2: "resnet101", 3: "densenet169"}
if select_model not in model_names:
  # Fail with a clear message instead of a NameError on an undefined filename.
  raise ValueError("select_model must be 1, 2 or 3, got {!r}".format(select_model))
filename = f"ct_scan_pca_{len(tot_dataset)}_{model_names[select_model]}.pickle"

file = filepath + filename
# Create the output directory if it is missing so the save does not fail after the
# expensive feature extraction has already run.
os.makedirs(os.path.dirname(file), exist_ok=True)
with open(file, 'wb') as handle:
  pickle.dump(tot_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

# ----------------------------------------------------------------------------------