### Import Libraries

In [2]:
import os
import glob
import random
import pickle

from tqdm import tqdm

import numpy as np

# OpenCV
import cv2

# Pandas
import pandas as pd

# Tenforflow
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet101
from tensorflow.keras.applications.densenet import DenseNet169
from tensorflow.keras.layers import Flatten

# Scikit-learn
from sklearn.decomposition import PCA

### Read image file paths and store to a list both for Pneumonia and Normal

In [4]:
dataset_path = "../../dataset/xray_new/covid_19_Radiography_Dataset_Refined/"

all_files = []

pneumonia_files = glob.glob(os.path.join(dataset_path, "train", "Viral Pneumonia", "images", "*.png"))
pneumonia_files.extend(glob.glob(os.path.join(dataset_path, "test", "Viral Pneumonia", "images", "*.png")))

normal_files = glob.glob(os.path.join(dataset_path, "train", "Normal", "images","*.png"))
normal_files.extend(glob.glob(os.path.join(dataset_path, "test", "Normal", "images", "*.png")))

# Get all the files from the directory in a two element list.
# First element is list of file location to pneumonia images and second element is list of file location to Normal images.
all_files = [normal_files, pneumonia_files]
print("Image Files Count\nNormal: {}\npneumonia: {}\nTotal: {}".format(len(normal_files), len(pneumonia_files), len(all_files[0] + all_files[1])))

Image Files Count
Normal: 2400
pneumonia: 695
Total: 3095


### Define Data Size

In [5]:
normal_count = 800
pneumonia_count = 695

### Function to read image from file list and store the corresponding label

In [6]:
def get_dataset(files, label, data_count, img_size, image_id_count):
  dataset = []  # List to hold all the dataset. Each element is a dictionary

  count = 1
  for j in tqdm(files):  # Loop over each file location
    data_dict = {}
    data_dict["id"] = image_id_count
    data_dict["filepath"] = j
    try:
      img = cv2.imread(j)
      img = cv2.resize(img, (img_size, img_size))
      data_dict["image"] = img
      data_dict["label"] = label
      dataset.append(data_dict)
      if count == data_count:
        break
      count += 1
      image_id_count += 1
    except Exception as e:
      print("faulty image: {} {}".format(j, e))
  return dataset, image_id_count

### Read image from the file and store the corresponding label in a list

In [7]:
c_dataset, nc_dataset, t_dataset = [], [], []
image_id_count = 1       # counter to record the ids of files. Each file has a unique ID.
img_size = 224
# all_files => [NC, C]
for i, data in enumerate(all_files): # only two loops for pneumonia and Normal
  if i == 0:
    nc_dataset, image_id_count = get_dataset(data, i, normal_count, img_size, image_id_count)
  else:
    c_dataset, image_id_count = get_dataset(data, i, pneumonia_count, img_size, image_id_count)
tot_dataset = nc_dataset + c_dataset
print("Dataset Count\nNormal: {}\npneumonia: {}\nTotal: {}".format(len(nc_dataset), len(c_dataset), len(tot_dataset)))

 33%|██████████████████████████▎                                                    | 799/2400 [00:14<00:29, 53.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████▉| 694/695 [00:09<00:00, 72.15it/s]

Dataset Count
Normal: 800
pneumonia: 695
Total: 1495





### Extract image only from the dataset to send to DNN

In [8]:
image_only = []
for data in tot_dataset:
  image_only.append(data["image"])
image_only = np.array(image_only)

### Define batch size

In [9]:
batch_size = image_only.shape[0]
print(batch_size)

1495


### Generate batches of images to feed into DNN

In [10]:
img_datagen = ImageDataGenerator(rescale=1./255)
batch_img = img_datagen.flow(image_only, batch_size=batch_size, shuffle=False)

### Define a function to select a model from three (VGG16, ResNet101 and DenseNet169)

In [11]:
def all_models(img_size, model_sel):
  if model_sel == 1:
    vgg_pre_t = VGG16(input_shape=(img_size, img_size, 3), include_top=False, weights ='imagenet')
    return vgg_pre_t, 25088

  elif model_sel == 2:
    resnet_pre_t= ResNet101(input_shape=(img_size, img_size, 3), include_top=False, weights='imagenet')
    return resnet_pre_t, 100352

  elif model_sel == 3:
    densenet169_pre_t = DenseNet169(input_shape=(img_size, img_size, 3), include_top=False, weights ='imagenet' )
    return densenet169_pre_t, 81536

### Select model among 3

In [12]:
select_model = 1 # int(input("Enter the number for: \n 1) VGG16 \n 2) Resnet101  \n 3) Densenet169 "))

### Extract image feature from the selected DNN model

In [13]:
all_features, reduced_features = [], []
pca = PCA(n_components=batch_size)

all_feat = []
model, feature_size = all_models(img_size, select_model)
for data in tqdm(range(len(batch_img))):
  try:
    features = model.predict(batch_img[data]).flatten().reshape(batch_size, feature_size)
    feature_matrix = features.reshape(features.shape[0], -1)
    reduced_features = pca.fit_transform(feature_matrix)
  except:
    img_len = len(batch_img[data])
    features = model.predict(batch_img[data]).flatten().reshape(img_len, feature_size)
    feature_matrix = features.reshape(features.shape[0], -1)
    reduced_features = pca.fit_transform(feature_matrix)
  all_feat.extend(reduced_features)

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:40<00:00, 40.79s/it]


### Replace image value by image features

In [15]:
for i in range(len(tot_dataset)):
  tot_dataset[i]['image'] = all_feat[i]

# print sample dataset pneumonia and Normal
print(len(tot_dataset))
print(tot_dataset[0])
print(tot_dataset[900])

1495
{'id': 1, 'filepath': '../../dataset/xray_new/covid_19_Radiography_Dataset_Refined/train\\Normal\\images\\Normal-1000.png', 'image': array([-1.0655748e+01,  2.6115465e+00, -5.2301559e+00, ...,
        6.1773482e-11,  6.5538365e-12,  2.4485494e-12], dtype=float32), 'label': 0}
{'id': 900, 'filepath': '../../dataset/xray_new/covid_19_Radiography_Dataset_Refined/train\\Viral Pneumonia\\images\\Viral Pneumonia-1100.png', 'image': array([3.3517089e+00, 5.7182102e+00, 2.8066650e-01, ..., 6.2010910e-11,
       6.7252532e-12, 2.3683824e-12], dtype=float32), 'label': 1}


### Save extracted feature in pickle file for future use

In [16]:
filepath = "../../pickle_files/al/x_ray/pneumonia"
if select_model == 1:
  filename = f"x_ray_pca_{len(tot_dataset)}_vgg16.pickle"
elif select_model == 2:
  filename = f"x_ray_pca_{len(tot_dataset)}_resnet101.pickle"
elif select_model == 3:
  filename = f"x_ray_pca_{len(tot_dataset)}_densenet169.pickle"

file = filepath + filename
with open(file, 'wb') as handle:
  pickle.dump(tot_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

# ----------------------------------------------------------------------------------