In [None]:
# Mount Google Drive at /content/drive so the project data folder is reachable
# (google.colab is only available when running on Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install joblib==1.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Root data folder of the project on the mounted Google Drive;
# the file paths used in this notebook are built from it.
DATA_DIR = '/content/drive/MyDrive/W281 Project/data'

In [None]:
# load some lib
import os
import glob
import re
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.notebook import tqdm
import joblib
import pickle
import cv2 as cv
from sklearn.decomposition import PCA


# Per-image metadata table (file name, labels, train/test split);
# the first CSV column is used as the index.
df_meta = pd.read_csv(
    f"{DATA_DIR}/raw/Chest_xray_Corona_Metadata.csv",
    index_col=0,
)
df_meta

Unnamed: 0,X_ray_image_name,Label,Dataset_type,Label_2_Virus_category,Label_1_Virus_category
0,IM-0128-0001.jpeg,Normal,TRAIN,,
1,IM-0127-0001.jpeg,Normal,TRAIN,,
2,IM-0125-0001.jpeg,Normal,TRAIN,,
3,IM-0122-0001.jpeg,Normal,TRAIN,,
4,IM-0119-0001.jpeg,Normal,TRAIN,,
...,...,...,...,...,...
5928,person1637_virus_2834.jpeg,Pnemonia,TEST,,Virus
5929,person1635_virus_2831.jpeg,Pnemonia,TEST,,Virus
5930,person1634_virus_2830.jpeg,Pnemonia,TEST,,Virus
5931,person1633_virus_2829.jpeg,Pnemonia,TEST,,Virus


In [None]:
# Read the dataset summary (image count per label combination) and fill in the
# missing category labels.
df_data_summary = pd.read_csv(f"{DATA_DIR}/raw/Chest_xray_Corona_dataset_Summary.csv", index_col=0)

summary_is_normal = df_data_summary['Label'] == 'Normal'
# rows whose level-2 category is missing while the level-1 category is present
summary_needs_level_1_fallback = (
    df_data_summary['Label_2_Virus_category'].isna()
    & df_data_summary['Label_1_Virus_category'].notna()
)

# Normal rows become 'Normal'; rows without a level-2 category fall back to their
# level-1 category; every other row keeps its level-2 category.
df_data_summary['Label_2_Virus_category'] = np.where(
    summary_is_normal,
    'Normal',
    np.where(
        summary_needs_level_1_fallback,
        df_data_summary['Label_1_Virus_category'],
        df_data_summary['Label_2_Virus_category'],
    ),
)

# Normal rows also get 'Normal' as their level-1 category.
df_data_summary['Label_1_Virus_category'] = np.where(
    summary_is_normal,
    'Normal',
    df_data_summary['Label_1_Virus_category'],
)

df_data_summary

Unnamed: 0,Label,Label_1_Virus_category,Label_2_Virus_category,Image_Count
0,Normal,Normal,Normal,1576
1,Pnemonia,Stress-Smoking,ARDS,2
2,Pnemonia,Virus,Virus,1493
3,Pnemonia,Virus,COVID-19,58
4,Pnemonia,Virus,SARS,4
5,Pnemonia,bacteria,bacteria,2772
6,Pnemonia,bacteria,Streptococcus,5


In [None]:

# Build the category -> integer id map used as class labels.
# 'Stress-Smoking' rows are removed first; ids then follow the order in which
# the remaining level-1 categories first appear.
df_data_summary = df_data_summary[df_data_summary['Label_1_Virus_category'] != 'Stress-Smoking']
label_map = {
    category.lower(): class_id
    for class_id, category in enumerate(df_data_summary['Label_1_Virus_category'].drop_duplicates())
}
label_map

{'normal': 0, 'virus': 1, 'bacteria': 2}

# Image Visualization
Read in a few images and display one example per category.

In [None]:
# load image data
def grab_one_and_plot(df, col_category, data_set = 'train', sample_index = 153):
  """
    Plot one example chest X-ray per category, side by side.

    For every distinct non-missing value of ``df[col_category]`` one row is
    picked, its image is opened from the ``data_set`` folder of the raw dataset
    under DATA_DIR, and it is drawn in its own subplot.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata holding an 'X_ray_image_name' column and ``col_category``.
        It is only read, never modified.
    col_category : str
        Column whose distinct values define the categories to show.
    data_set : str
        Folder of the raw dataset the images are read from (default 'train').
        NOTE(review): rows are not filtered by split, so the picked row may
        belong to a different split than ``data_set`` -- confirm with callers.
    sample_index : int
        Position, within each category, of the row to display. It is clamped
        to the last row when a category has fewer rows than that.
  """
  # missing values can never satisfy `== cat`, so they are left out up front
  categories = df[col_category].dropna().unique()

  # instantiate canvas for plotting
  plt.figure(figsize=(15,5))

  for i, cat in enumerate(categories):

      print(cat)

      # locate the image and build path to the image
      cat_rows = df[df[col_category] == cat]
      img_meta = cat_rows.iloc[min(sample_index, len(cat_rows) - 1)]
      path = f"{DATA_DIR}/raw/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/{data_set}/{img_meta['X_ray_image_name']}"

      # open the image and plot it; the context manager closes the file handle
      with Image.open(path) as img:
          plt.subplot(1, len(categories), i+1)
          plt.imshow(img, cmap='gray')
          plt.axis('off')
          plt.title(f"{data_set}: {cat} Chest X-ray")

  plt.show()

# grab_one_and_plot(df_meta, "Label")


# Simple Model

1. Pre-process the metadata to get one label per image
2. Build a simple model on the multi-class labels

In [None]:
# Drop the 'Stress-Smoking' rows. .copy() makes the result an independent frame,
# so assigning columns to it later does not raise SettingWithCopyWarning.
df_meta = df_meta[df_meta['Label_1_Virus_category'] != 'Stress-Smoking'].copy()

In [None]:
# Distinct level-1 categories present in the metadata (NaN = no category recorded)
df_meta['Label_1_Virus_category'].unique()

array([nan, 'Virus', 'bacteria'], dtype=object)

In [None]:
# Fill the missing labels.
# Some bacteria/virus rows have no specific (level-2) label; for those the generic
# level-1 virus/bacteria label is reused. Normal images are labelled 'Normal'.

# Work on an independent copy: assigning columns to a frame that was produced by
# filtering another frame triggers SettingWithCopyWarning.
df_meta = df_meta.copy()

meta_is_normal = df_meta['Label'] == 'Normal'
# rows whose level-2 category is missing while the level-1 category is present
meta_needs_level_1_fallback = (
    df_meta['Label_2_Virus_category'].isna()
    & df_meta['Label_1_Virus_category'].notna()
)

df_meta['Label_2_Virus_category'] = np.where(
    meta_is_normal,
    'Normal',
    np.where(
        meta_needs_level_1_fallback,
        df_meta['Label_1_Virus_category'],
        df_meta['Label_2_Virus_category'],
    ),
)

df_meta['Label_1_Virus_category'] = np.where(
    meta_is_normal,
    'Normal',
    df_meta['Label_1_Virus_category'],
)

# keep only the columns used from here on, then add the integer class label
df_meta = df_meta[['Dataset_type', 'X_ray_image_name', 'Label_1_Virus_category']].copy()
df_meta['Label_1_Virus_category'] = df_meta['Label_1_Virus_category'].str.lower()
df_meta['label'] = df_meta['Label_1_Virus_category'].map(label_map)


# get a list of unique labels to train from
label_list = sorted(df_meta['label'].unique())
print(label_list)

# get train data (copied so columns can be added to it later without warnings)
df_train = df_meta[df_meta['Dataset_type'] == 'TRAIN'].copy()
df_train

[0, 1, 2]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['Label_2_Virus_category'] = np.where((df_meta['Label'] == 'Normal'),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['Label_1_Virus_category'] = np.where((df_meta['Label'] == 'Normal'),


Unnamed: 0,Dataset_type,X_ray_image_name,Label_1_Virus_category,label
0,TRAIN,IM-0128-0001.jpeg,normal,0
1,TRAIN,IM-0127-0001.jpeg,normal,0
2,TRAIN,IM-0125-0001.jpeg,normal,0
3,TRAIN,IM-0122-0001.jpeg,normal,0
4,TRAIN,IM-0119-0001.jpeg,normal,0
...,...,...,...,...
5304,TRAIN,1-s2.0-S0929664620300449-gr2_lrg-c.jpg,virus,1
5305,TRAIN,1-s2.0-S0929664620300449-gr2_lrg-b.jpg,virus,1
5306,TRAIN,1-s2.0-S0929664620300449-gr2_lrg-a.jpg,virus,1
5307,TRAIN,1-s2.0-S0140673620303706-fx1_lrg.jpg,virus,1


In [None]:
# get test data (copied so columns can be added to it later without
# raising SettingWithCopyWarning)
df_test = df_meta[df_meta['Dataset_type'] == 'TEST'].copy()
df_test

Unnamed: 0,Dataset_type,X_ray_image_name,Label_1_Virus_category,label
5309,TEST,IM-0021-0001.jpeg,normal,0
5310,TEST,IM-0019-0001.jpeg,normal,0
5311,TEST,IM-0017-0001.jpeg,normal,0
5312,TEST,IM-0016-0001.jpeg,normal,0
5313,TEST,IM-0015-0001.jpeg,normal,0
...,...,...,...,...
5928,TEST,person1637_virus_2834.jpeg,virus,1
5929,TEST,person1635_virus_2831.jpeg,virus,1
5930,TEST,person1634_virus_2830.jpeg,virus,1
5931,TEST,person1633_virus_2829.jpeg,virus,1


# Load the Feature Map
- Load all the pertinent feature maps (SIFT, colour histogram, LBP)

In [None]:
# load SIFT features (train and test dicts)
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# `with` closes each file handle as soon as the pickle has been read.
with open(f"{DATA_DIR}/processed/sift_features.pkl", "rb") as pkl_file:
    sift_features_train = pickle.load(pkl_file)
with open(f"{DATA_DIR}/processed/sift_features_test.pkl", "rb") as pkl_file:
    sift_features_test = pickle.load(pkl_file)
len(sift_features_train.keys()), len(sift_features_test.keys())

(21649, 3119)

In [None]:
# load colour histogram features (the HSV histogram pickles; train and test dicts)
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# `with` closes each file handle as soon as the pickle has been read.
with open(f"{DATA_DIR}/processed/hsv_hist_features_train.pkl", "rb") as pkl_file:
    clrs_hist_features_train = pickle.load(pkl_file)
with open(f"{DATA_DIR}/processed/hsv_hist_features_test.pkl", "rb") as pkl_file:
    clrs_hist_features_test = pickle.load(pkl_file)
len(clrs_hist_features_train.keys()), len(clrs_hist_features_test.keys())

(21649, 3119)

In [None]:
# load LBP features (train and test dicts)
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# `with` closes each file handle as soon as the pickle has been read.
with open(f"{DATA_DIR}/processed/lbp_features.pkl", "rb") as pkl_file:
    lbp_features_train = pickle.load(pkl_file)
with open(f"{DATA_DIR}/processed/lbp_features_test.pkl", "rb") as pkl_file:
    lbp_features_test = pickle.load(pkl_file)
len(lbp_features_train.keys()), len(lbp_features_test.keys())

(21649, 3119)

In [None]:
# Image identifier = the part of the file name before its first dot.
# assign() returns new frames, so nothing is written into a slice of df_meta
# (the in-place column assignment raised SettingWithCopyWarning).
# NOTE(review): file names that contain dots in their stem (e.g.
# '1-s2.0-S0929664620300449-gr2_lrg-c.jpg') are all cut down to '1-s2'. Whether
# that is intended depends on how the feature-dict keys were generated -- confirm
# before switching to a split on the last dot only.
df_train = df_train.assign(IMG=df_train['X_ray_image_name'].str.split(".").str[0])
df_test = df_test.assign(IMG=df_test['X_ray_image_name'].str.split(".").str[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['IMG'] = df_train['X_ray_image_name'].str.split(".").str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['IMG'] = df_test['X_ray_image_name'].str.split(".").str[0]


In [None]:
# Seems like for SIFT, PCA with 20 components captures ~85% of the total variance :)
# Seems like for LBP, PCA with 50 components captures ~80% of the total variance :)

In [None]:
# Keys of the SIFT feature dicts; the loops that assemble the train and test
# feature lists iterate over these.
train_image_keys = sift_features_train.keys()
test_image_keys = sift_features_test.keys()

In [None]:
# Build the training data: one flattened vector per feature type and image,
# plus the matching class label.

train_image_data = {'sift': [], 'clrs_hist':[], 'lbp': []}
train_label = []

# image id -> every label recorded for it. Built once so that each key needs a
# dict lookup instead of a full scan of df_train.
train_labels_by_img = {}
for img_id, img_label in zip(df_train['IMG'], df_train['label']):
  train_labels_by_img.setdefault(img_id, []).append(img_label)

# Number of SIFT descriptors kept per image. Images with fewer are skipped, so
# every flattened SIFT vector has the same length while the training set stays
# large and the computation cheap.
n_sift_descriptors = 100

for key in tqdm(train_image_keys):
  # the image id is the key without its first and its last two "_"-separated tokens
  img_name = "_".join(key.split("_")[1: -2])

  # grab the image label(s) and the SIFT descriptors
  labels = train_labels_by_img.get(img_name, [])
  sift_feature = sift_features_train[key]

  # skip images with too few SIFT descriptors, and image ids that match no
  # metadata row or more than one (faulty image naming)
  if (sift_feature.shape[0] < n_sift_descriptors) or len(labels) != 1:
    continue

  clrs_hist_feature = clrs_hist_features_train[key]
  lbp_feature = lbp_features_train[key]

  # transform and flatten each feature into a 1-D float vector
  sift_vector = sift_feature[:n_sift_descriptors,:].flatten().astype(float)
  # view the histogram as (256, 1, 3) and keep only index 1 of the last axis
  # NOTE(review): presumably one channel of the HSV histogram -- confirm which
  clrs_hist_vector = np.reshape(clrs_hist_feature, (256, 1, 3))[:,:,1].flatten().astype(float)
  lbp_vector = lbp_feature.flatten().astype(float)

  train_image_data['sift'].append(sift_vector)
  train_image_data['clrs_hist'].append(clrs_hist_vector)
  train_image_data['lbp'].append(lbp_vector)
  train_label.append(labels[0])

  0%|          | 0/21649 [00:00<?, ?it/s]

In [None]:
# Build the test data: one flattened vector per feature type and image,
# plus the matching class label.

test_image_data =  {'sift': [], 'clrs_hist':[], 'lbp': []}
test_label = []

# image id -> every label recorded for it. Built once so that each key needs a
# dict lookup instead of a full scan of df_test.
test_labels_by_img = {}
for img_id, img_label in zip(df_test['IMG'], df_test['label']):
  test_labels_by_img.setdefault(img_id, []).append(img_label)

# Number of SIFT descriptors kept per image. Images with fewer are skipped, so
# every flattened SIFT vector has the same length.
n_sift_descriptors = 100

for key in tqdm(test_image_keys):
  # the image id is the key without its first and its last two "_"-separated tokens
  img_name = "_".join(key.split("_")[1: -2])

  # grab the image label(s) and the SIFT descriptors
  labels = test_labels_by_img.get(img_name, [])
  sift_feature = sift_features_test[key]

  # skip images with too few SIFT descriptors, and image ids that match no
  # metadata row or more than one (faulty image naming)
  if (sift_feature.shape[0] < n_sift_descriptors) or len(labels) != 1:
    continue

  clrs_hist_feature = clrs_hist_features_test[key]
  lbp_feature = lbp_features_test[key]

  # transform and flatten each feature into a 1-D float vector
  sift_vector = sift_feature[:n_sift_descriptors,:].flatten().astype(float)
  # view the histogram as (256, 1, 3) and keep only index 1 of the last axis
  # NOTE(review): presumably one channel of the HSV histogram -- confirm which
  clrs_hist_vector = np.reshape(clrs_hist_feature, (256, 1, 3))[:,:,1].flatten().astype(float)
  lbp_vector = lbp_feature.flatten().astype(float)

  test_image_data['sift'].append(sift_vector)
  test_image_data['clrs_hist'].append(clrs_hist_vector)
  test_image_data['lbp'].append(lbp_vector)
  test_label.append(labels[0])

  0%|          | 0/3119 [00:00<?, ?it/s]

In [None]:
# turn each list of per-image training vectors into a single numpy array
for feature_name in ('sift', 'clrs_hist', 'lbp'):
  train_image_data[feature_name] = np.array(train_image_data[feature_name])

In [None]:
# turn each list of per-image test vectors into a single numpy array
for feature_name in ('sift', 'clrs_hist', 'lbp'):
  test_image_data[feature_name] = np.array(test_image_data[feature_name])

In [None]:
# convert the training labels to a numpy array and show the first five
train_label = np.array(train_label)
train_label[:5]

array([0, 0, 0, 0, 0])

In [None]:
# convert the test labels to a numpy array and show the first five
test_label = np.array(test_label)
test_label[:5]

array([0, 0, 0, 0, 0])

In [None]:
# Cache the assembled dataset so it does not have to be rebuilt on every run.
# The data consists of large numpy arrays, so joblib is used with gzip
# compression to keep the files small.

# save the train set: the feature dict, then the label array
joblib.dump(train_image_data, f"{DATA_DIR}/processed/combined_features_image_data.gz", compress='gzip')  
joblib.dump(train_label, f"{DATA_DIR}/processed/combined_caption_data.gz", compress='gzip')  

# save the test set: the feature dict, then the label array
joblib.dump(test_image_data, f"{DATA_DIR}/processed/combined_features_test_image_data.gz", compress='gzip')  
joblib.dump(test_label, f"{DATA_DIR}/processed/combined_test_caption_data.gz", compress='gzip')  

['/content/drive/MyDrive/W281 Project/data/processed/combined_caption_data.gz']