In [65]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
!pip install joblib==1.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [67]:
# set project folder directory
DATA_DIR = '/content/drive/MyDrive/W281 Project/data'

In [68]:
# load some lib
import os
import glob
import re
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.notebook import tqdm
import joblib
import pickle
import cv2 as cv
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# read in metadata
df_meta = pd.read_csv(f"{DATA_DIR}/raw/Chest_xray_Corona_Metadata.csv", index_col = 0)
df_meta

Unnamed: 0,X_ray_image_name,Label,Dataset_type,Label_2_Virus_category,Label_1_Virus_category
0,IM-0128-0001.jpeg,Normal,TRAIN,,
1,IM-0127-0001.jpeg,Normal,TRAIN,,
2,IM-0125-0001.jpeg,Normal,TRAIN,,
3,IM-0122-0001.jpeg,Normal,TRAIN,,
4,IM-0119-0001.jpeg,Normal,TRAIN,,
...,...,...,...,...,...
5928,person1637_virus_2834.jpeg,Pnemonia,TEST,,Virus
5929,person1635_virus_2831.jpeg,Pnemonia,TEST,,Virus
5930,person1634_virus_2830.jpeg,Pnemonia,TEST,,Virus
5931,person1633_virus_2829.jpeg,Pnemonia,TEST,,Virus


In [69]:
# read in summary of data set
df_data_summary = pd.read_csv(f"{DATA_DIR}/raw/Chest_xray_Corona_dataset_Summary.csv", index_col = 0)
df_data_summary['Label_2_Virus_category'] = np.where((df_data_summary['Label'] == 'Normal'), 
                                              'Normal',
                                              np.where(
                                                  (df_data_summary['Label_2_Virus_category'] != df_data_summary['Label_2_Virus_category']) & 
                                                  ~(df_data_summary['Label_1_Virus_category'] != df_data_summary['Label_1_Virus_category']),
                                                  df_data_summary['Label_1_Virus_category'],
                                                  df_data_summary['Label_2_Virus_category']
                                              ))

df_data_summary['Label_1_Virus_category'] = np.where((df_data_summary['Label'] == 'Normal'), 
                                              'Normal', 
                                              df_data_summary['Label_1_Virus_category']
                                              )

df_data_summary

Unnamed: 0,Label,Label_1_Virus_category,Label_2_Virus_category,Image_Count
0,Normal,Normal,Normal,1576
1,Pnemonia,Stress-Smoking,ARDS,2
2,Pnemonia,Virus,Virus,1493
3,Pnemonia,Virus,COVID-19,58
4,Pnemonia,Virus,SARS,4
5,Pnemonia,bacteria,bacteria,2772
6,Pnemonia,bacteria,Streptococcus,5


In [70]:


# creating data label map
df_data_summary = df_data_summary[df_data_summary['Label_1_Virus_category'] != 'Stress-Smoking']
label_map = df_data_summary[['Label_1_Virus_category']].drop_duplicates().reset_index(drop=True).to_dict()['Label_1_Virus_category']
label_map = { val.lower(): key for key, val in label_map.items()}
label_map

{'normal': 0, 'virus': 1, 'bacteria': 2}

# Simple Data Analysis

1. pre-process the image to grab the label per image
2. do a simple model with multiple labels

In [71]:
df_meta = df_meta[df_meta['Label_1_Virus_category'] != 'Stress-Smoking']

In [72]:
df_meta['Label_1_Virus_category'].unique()

array([nan, 'Virus', 'bacteria'], dtype=object)

In [73]:
# fill some label nan
# certain bacteria/virus does not have a distinct label, for these we will be using the regular virus/barteria term
# in case of normal, we will label it as nomal
df_meta['Label_2_Virus_category'] = np.where((df_meta['Label'] == 'Normal'), 
                                              'Normal',
                                              np.where(
                                                  (df_meta['Label_2_Virus_category'] != df_meta['Label_2_Virus_category']) & 
                                                  ~(df_meta['Label_1_Virus_category'] != df_meta['Label_1_Virus_category']),
                                                  df_meta['Label_1_Virus_category'],
                                                  df_meta['Label_2_Virus_category']
                                              ))


df_meta['Label_1_Virus_category'] = np.where((df_meta['Label'] == 'Normal'), 
                                              'Normal',
                                              df_meta['Label_1_Virus_category'])

# select and rename meta dataset columns
df_meta = df_meta[['Dataset_type', 'X_ray_image_name', 'Label_1_Virus_category']]
df_meta['Label_1_Virus_category'] = df_meta['Label_1_Virus_category'].str.lower()
df_meta['label'] = df_meta['Label_1_Virus_category'].map(label_map)


# get a list of unique labels to train from
label_list = sorted(list(df_meta['label'].unique()))
print(label_list)

# get train data
df_train = df_meta[df_meta['Dataset_type'] == 'TRAIN']
df_train

[0, 1, 2]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['Label_2_Virus_category'] = np.where((df_meta['Label'] == 'Normal'),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['Label_1_Virus_category'] = np.where((df_meta['Label'] == 'Normal'),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta['Label_1_Virus_category'] = df_meta['Label_

Unnamed: 0,Dataset_type,X_ray_image_name,Label_1_Virus_category,label
0,TRAIN,IM-0128-0001.jpeg,normal,0
1,TRAIN,IM-0127-0001.jpeg,normal,0
2,TRAIN,IM-0125-0001.jpeg,normal,0
3,TRAIN,IM-0122-0001.jpeg,normal,0
4,TRAIN,IM-0119-0001.jpeg,normal,0
...,...,...,...,...
5304,TRAIN,1-s2.0-S0929664620300449-gr2_lrg-c.jpg,virus,1
5305,TRAIN,1-s2.0-S0929664620300449-gr2_lrg-b.jpg,virus,1
5306,TRAIN,1-s2.0-S0929664620300449-gr2_lrg-a.jpg,virus,1
5307,TRAIN,1-s2.0-S0140673620303706-fx1_lrg.jpg,virus,1


In [74]:
# get test data
df_test = df_meta[df_meta['Dataset_type'] == 'TEST']
df_test

Unnamed: 0,Dataset_type,X_ray_image_name,Label_1_Virus_category,label
5309,TEST,IM-0021-0001.jpeg,normal,0
5310,TEST,IM-0019-0001.jpeg,normal,0
5311,TEST,IM-0017-0001.jpeg,normal,0
5312,TEST,IM-0016-0001.jpeg,normal,0
5313,TEST,IM-0015-0001.jpeg,normal,0
...,...,...,...,...
5928,TEST,person1637_virus_2834.jpeg,virus,1
5929,TEST,person1635_virus_2831.jpeg,virus,1
5930,TEST,person1634_virus_2830.jpeg,virus,1
5931,TEST,person1633_virus_2829.jpeg,virus,1


In [75]:
df_train['IMG'] = df_train['X_ray_image_name'].str.split(".").str[0]
df_test['IMG'] = df_test['X_ray_image_name'].str.split(".").str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['IMG'] = df_train['X_ray_image_name'].str.split(".").str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['IMG'] = df_test['X_ray_image_name'].str.split(".").str[0]


In [76]:
import pickle
# load the SIFT feature extracted data
features_train = pickle.load(open(f"{DATA_DIR}/processed/sift_features.pkl", "rb"))
features_test = pickle.load(open(f"{DATA_DIR}/processed/sift_features_test.pkl", "rb"))


# load lbp features
# features_train = pickle.load(open(f"{DATA_DIR}/processed/lbp_features.pkl", "rb"))
# features_test = pickle.load(open(f"{DATA_DIR}/processed/lbp_features_test.pkl", "rb"))

# load hsv hist features
# features_train = pickle.load(open(f"{DATA_DIR}/processed/hsv_hist_features_train.pkl", "rb"))
# features_test = pickle.load(open(f"{DATA_DIR}/processed/hsv_hist_features_test.pkl", "rb"))

In [77]:
def run_PCA(df, features):
    """
        Compute PCA 
        Plot Explained Variance Graph
    """
    data = df.copy()

    # create 10 PCA plots per category

    for label in [0, 1, 2]:
      sliced = data[data['label'] == label]

      counter = 0
      for i, row in tqdm(sliced.iterrows()):
        if counter == 10:
          break

        path = f"{DATA_DIR}/raw/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/train_resized_augmented/"

        img_name = "augmented_"+row['IMG']+"_"
        aug_images_list = list(pathlib.Path(path).glob(f'{img_name}*.jpeg'))
        img_path = str(aug_images_list[np.random.randint(0,4)])
        img = img_path[len(path):]

        feat = features[img][12:-12, 12:-12]

        # feat = np.reshape(feat, (256, 1, 3))
        # # only taking single channel
        # feat = feat[:,:,0]
        # print(feat)
        # print(feat.shape)
        # print(f"Begin PCA IMG {i}")

        # run PCA analysis
        # instantiate PCA
        # determine transformed features
        # determine explained variance using explained_variance_ration_ attribute
        # compute cumulative sum of eigenvalues
        pca=PCA() 
        train_pca=pca.fit_transform(feat) 
        exp_var_pca = pca.explained_variance_ratio_
        cum_sum_eigenvalues = np.cumsum(exp_var_pca)

        # print("Begin Plotting Explaination of Variance")
        # Create the visualization plot
        plt.clf()
        plt.bar(range(0,len(exp_var_pca)), exp_var_pca, 
                alpha=0.5, align='center', 
                label='Individual explained variance')
        
        plt.step(range(0,len(cum_sum_eigenvalues)), 
                cum_sum_eigenvalues, 
                where='mid',
                label='Cumulative explained variance')
        plt.ylabel('Explained variance ratio')
        plt.xlabel('Principal component index')
        plt.legend(loc='best')
        plt.title(f"Label: {label} PCA Explained Variance Plot")
        plt.tight_layout()
        plt.savefig(f"{DATA_DIR}/plots_sift_pca/label_{label}-img-{i}_exp_var_plot.png", facecolor='white')
        counter += 1
    return

# run_PCA(df_train, sift_features_train)

In [78]:
# Seam like for SIFT PCA components = 20 captures ~85% of the total variance :)
# Seam like for LBP PCA components = 50 captures ~80% of the total variance :)

In [81]:
# build a list of training data
# data_path = f'{DATA_DIR}/raw/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/augmented_train_resized'

train_image_data = []
train_label = []

# grab the sift description
# isolate key description to grab the correct image
for key, feat in tqdm(features_train.items()):
  # remove unwanted charaters
  img_name_list = key.split("_")[1: -2]
  img_name = "_".join(img_name_list)

  # grab the image label
  label = df_train[df_train['IMG'] == img_name]['label'].values

  ## SIFT features
  # ignore image that have low number of SIFT keypoint descriptions
  # here we decide to grab a threshold of 100 to: 
  # 1. keep training size large 
  # 2. keep a lower computation power
  # ignore faulty image nameing
  if (feat.shape[0] < 100) or len(label) != 1:
    continue

  label = label[0]
  desc = feat[:100,:].flatten()
  train_image_data.append(np.array(desc))
  train_label.append(label)


  # ignore image that have low number of SIFT keypoint descriptions
  # here we decide to grab a threshold of 100 to: 
  # 1. keep training size large 
  # 2. keep a lower computation power
  # ignore faulty image nameing
  if len(label) != 1:
    continue

  label = label[0]

  #feat = np.reshape(feat, (256, 1, 1)) ### HSV
  #feat = feat[12:-12, 12:-12] ### LBP
  
  feat = feat.flatten()
  train_image_data.append(np.array(feat.astype(float)))
  train_label.append(label)

  0%|          | 0/21649 [00:00<?, ?it/s]

TypeError: ignored

In [None]:
train_image_data = np.vstack(train_image_data)
train_label = np.array(train_label)
train_image_data[:5], train_label[:5]

In [None]:
def plot_classes(X, y, ax, title):

  # color code each cluster (person ID)
  colormap = plt.cm.tab20
  colorst = [colormap(i) for i in np.linspace(0, 1.0, len(np.unique(y)))]

  # project the features into 2 dimensions
  for k in range(len(np.unique(y))):
    ax.scatter(X[y==k, 0], X[y==k, 1], alpha=0.5, facecolors=colorst[k])

  ax.set_title(title)

In [None]:
def get_PCA(X_list, n_components=2):
  pca_list = []
  xpca_list = []
  for X in X_list:
    pca = PCA(n_components=n_components, svd_solver="randomized", whiten=True).fit(X)
    X_pca = pca.transform(X)
    pca_list.append(pca)
    xpca_list.append(X_pca)
  return pca_list, xpca_list


def get_tsne(X_list, n_components=2):
  xtsne_list = []
  for X in X_list:
    tsne = TSNE(n_components=n_components, random_state=0)
    X_tsne = tsne.fit_transform(X)
    xtsne_list.append(X_tsne)
  return xtsne_list

In [None]:
def plot_classes(X, y, ax, title):

  # color code each cluster (person ID)
  colormap = plt.cm.tab20
  colorst = [colormap(i) for i in np.linspace(0, 1.0, len(np.unique(y)))]

  # project the features into 2 dimensions
  for k in range(len(np.unique(y))):
    ax.scatter(X[y==k, 0], X[y==k, 1], alpha=0.5, facecolors=colorst[k])

  ax.set_title(title)

In [None]:
# dimensionality reduction
X_pca = get_PCA([train_image_data], n_components=10)
X_tsne = get_tsne([train_image_data], n_components=2)

In [None]:
# change names
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
plot_classes(X_tsne[0], train_label, ax[0], title='SIFT tSNE')
plot_classes(X_pca[1][0], train_label, ax[1], title='SIFT PCA')

legend = ["Normal", "Virus", "Bacteria"]
ax[0].legend(legend)
ax[1].legend(legend)

plt.show()