In [3]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import os

In [4]:
# Source folders of the raw dataset, one per class (relative to the notebook's working directory).
covid_path, normal_path, pneumonia_path = (
    './COVID-19 Radiography Database/COVID-19',
    './COVID-19 Radiography Database/NORMAL',
    './COVID-19 Radiography Database/Viral Pneumonia',
)

In [5]:
# Create separate folders for training and test data (the data is split 90% train / 10% test),
# with one sub-folder per class under each split.
dataset_root = './COVID-19 Radiography Database'

# Keep failing loudly when the dataset itself is missing (e.g. wrong working directory),
# instead of silently creating an empty folder tree in the wrong place.
if not os.path.isdir(dataset_root):
    raise FileNotFoundError(dataset_root)

for split_name in ('train', 'test'):
    for class_name in ('covid', 'normal', 'pneumonia'):
        # exist_ok=True makes this cell safe to re-run: a bare os.mkdir raises
        # FileExistsError as soon as the folders exist from a previous run.
        os.makedirs(os.path.join(dataset_root, split_name, class_name), exist_ok=True)

In [6]:
def _split_sizes(folder):
    """Return (train_size, total) for a class folder.

    total is the number of directory entries in folder and train_size is
    90% of that, rounded down.
    """
    total = len(os.listdir(folder))
    return int(np.floor(total * 0.9)), total


# Per-class sizes: number of entries going to the training split, and total entries.
covid_train_len, covid_len = _split_sizes(covid_path)
print(covid_train_len, covid_len)

normal_train_len, normal_len = _split_sizes(normal_path)
print(normal_train_len, normal_len)

pneumonia_train_len, pneumonia_len = _split_sizes(pneumonia_path)
print(pneumonia_train_len, pneumonia_len)

197 219
1206 1341
1210 1345


In [7]:
import glob
import shutil
import itertools

In [8]:
TRAIN_FRACTION = 0.9  # share of each class copied to train/; the remainder goes to test/
SPLIT_SEED = 42       # fixed seed so the train/test assignment is the same on every run


def copy_split(src_dir, train_dir, test_dir, train_fraction=TRAIN_FRACTION, seed=SPLIT_SEED):
    """Copy the ``*.png`` files of one class into a train folder and a test folder.

    The file list is enumerated exactly once, sorted, and then shuffled with a
    seeded generator before it is cut in two. glob yields files in arbitrary
    order, so enumerating the folder separately for the train and the test
    copy does not guarantee that the two slices are disjoint or reproducible;
    a single sorted + seeded list does.

    Parameters
    ----------
    src_dir : str
        Folder containing the class's ``*.png`` images.
    train_dir, test_dir : str
        Existing destination folders for the two partitions.
    train_fraction : float
        Fraction of the images copied to ``train_dir`` (rounded down).
    seed : int
        Seed for the shuffle that decides which images land in which partition.

    Returns
    -------
    tuple of (int, int)
        Number of images copied to ``train_dir`` and to ``test_dir``.
    """
    import random  # local import: `random` is not yet imported at this point of the notebook

    image_paths = sorted(glob.glob(os.path.join(src_dir, "*.png")))
    # A dedicated Random instance keeps the global random state untouched.
    random.Random(seed).shuffle(image_paths)

    # The split point is derived from the list that is actually copied, so the
    # two partitions always cover every PNG exactly once.
    train_count = int(len(image_paths) * train_fraction)
    for image_path in image_paths[:train_count]:
        shutil.copy(image_path, train_dir)
    for image_path in image_paths[train_count:]:
        shutil.copy(image_path, test_dir)
    return train_count, len(image_paths) - train_count


# Copy every class: 90% of its images to train/<class>, the other 10% to test/<class>.
for src_name, class_name in (('COVID-19', 'covid'),
                             ('NORMAL', 'normal'),
                             ('Viral Pneumonia', 'pneumonia')):
    copy_split('./COVID-19 Radiography Database/' + src_name,
               './COVID-19 Radiography Database/train/' + class_name,
               './COVID-19 Radiography Database/test/' + class_name)

In [14]:
CATEGORIES = ['covid', 'normal', 'pneumonia']
IMG_SIZE = 250


def load_split(datadir, categories=CATEGORIES, img_size=IMG_SIZE):
    """Load every image of one split as a resized grayscale array with its class index.

    Parameters
    ----------
    datadir : str
        Split folder containing one sub-folder per entry of ``categories``.
    categories : list of str
        Sub-folder names; the position of a name in this list is its label.
    img_size : int
        Images are resized to ``img_size`` x ``img_size`` pixels.

    Returns
    -------
    list
        One ``[image_array, class_num]`` pair per readable image.
    """
    samples = []
    for class_num, category in enumerate(categories):
        folder = os.path.join(datadir, category)
        # Sorted so the loading order does not depend on the filesystem.
        for filename in sorted(os.listdir(folder)):
            image_path = os.path.join(folder, filename)
            img_array = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if img_array is None:
                # cv2.imread signals an unreadable entry (corrupt file, non-image,
                # sub-directory) by returning None rather than raising; passing
                # that on to cv2.resize would fail with an opaque error.
                print("Skipping unreadable image:", image_path)
                continue
            samples.append([cv2.resize(img_array, (img_size, img_size)), class_num])
    return samples


#TRAINING DATA
DATADIR = './COVID-19 Radiography Database/train'
training_data = load_split(DATADIR)

#TEST DATA
DATADIR = './COVID-19 Radiography Database/test'
test_data = load_split(DATADIR)

In [15]:
# Report how many samples were loaded for the training split, then for the test split.
for split_samples in (training_data, test_data):
    print(len(split_samples))

2613
292


In [16]:
# Shuffle the training and test data, since the samples are still grouped by class at this point.
import random

# A dedicated, seeded generator makes the shuffled order repeatable for a given
# input order, without touching the global random state.
shuffle_rng = random.Random(42)
shuffle_rng.shuffle(training_data)
shuffle_rng.shuffle(test_data)


In [27]:
# Separate each [image, label] pair into a feature list and a label list.
train_y = [sample_label for _, sample_label in training_data]
test_y = [sample_label for _, sample_label in test_data]

# Stack the images and add a trailing channel axis: (n_samples, IMG_SIZE, IMG_SIZE, 1).
sample_shape = (-1, IMG_SIZE, IMG_SIZE, 1)
train_X = np.array([sample_image for sample_image, _ in training_data]).reshape(sample_shape)
test_X = np.array([sample_image for sample_image, _ in test_data]).reshape(sample_shape)

In [28]:
# Sanity check of the training tensor: overall shape, then the first image with its label.
first_train_image, first_train_label = train_X[0], train_y[0]
print(np.shape(train_X))
print(first_train_image, first_train_label)

(2613, 250, 250, 1)
[[[121]
  [117]
  [114]
  ...
  [164]
  [ 66]
  [173]]

 [[117]
  [121]
  [123]
  ...
  [181]
  [179]
  [180]]

 [[120]
  [122]
  [117]
  ...
  [184]
  [182]
  [178]]

 ...

 [[  0]
  [  0]
  [  1]
  ...
  [  0]
  [  0]
  [  0]]

 [[  0]
  [  0]
  [  0]
  ...
  [  0]
  [  0]
  [  0]]

 [[  0]
  [  0]
  [  0]
  ...
  [  0]
  [  0]
  [  0]]] 2


In [29]:
# Sanity check of the test tensor: overall shape, then the first image with its label.
first_test_image, first_test_label = test_X[0], test_y[0]
print(np.shape(test_X))
print(first_test_image, first_test_label)

(292, 250, 250, 1)
[[[73]
  [81]
  [79]
  ...
  [69]
  [65]
  [61]]

 [[72]
  [78]
  [77]
  ...
  [69]
  [64]
  [60]]

 [[71]
  [77]
  [79]
  ...
  [72]
  [66]
  [60]]

 ...

 [[ 0]
  [ 0]
  [ 0]
  ...
  [ 0]
  [ 0]
  [ 0]]

 [[ 0]
  [ 0]
  [ 0]
  ...
  [ 0]
  [ 0]
  [ 0]]

 [[ 0]
  [ 0]
  [ 0]
  ...
  [ 0]
  [ 0]
  [ 0]]] 1
