In [None]:
#Mount Google Drive inside the Colab runtime so the files stored there can be reached from the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive' #Directory under which the Drive content is mounted
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
#Upgrade and install the essential libraries (shell commands run from the notebook).
#NOTE(review): none of these installs pins a version, so a fresh run may pull different releases.
!pip install keras --upgrade
#NOTE(review): the import cell uses the standard-library `zipfile` module, not zipfile36 - confirm this install is still needed.
!pip install zipfile36
#pydicom is imported as `dicom` and is used to read the .dcm images of the Kaggle dataset.
!pip install pydicom

In [4]:
import keras
import numpy as np
import cv2
import os
import random
import shutil
import pandas as pd
import csv
import pydicom as dicom
import zipfile
from keras import optimizers
from keras.models import Sequential,Model
from keras.layers import Dropout, Flatten, Dense,Input
from keras.applications.resnet_v2 import ResNet50V2
from keras.applications.xception import Xception
from keras.callbacks import ModelCheckpoint
from keras.applications.imagenet_utils import preprocess_input
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras.initializers import RandomNormal

Using TensorFlow backend.


In [None]:
#Get the zip file I have shared; it contains the covid-chestxray-dataset images until 12 April.
#Through the link below get the shared zip file and add it to your drive:
# https://drive.google.com/file/d/1Bwn4vTQUUB0tHK5aHh--Rk6eOxs2jg3q/view?usp=sharing
#Both archives are read from the current working directory.
#Each archive is opened in a `with` block so its file handle is closed as soon as the extraction ends,
#and extractall() unpacks every member in one call instead of looping over namelist().
with zipfile.ZipFile('kaggle.zip') as archive: #Extract Kaggle Dataset into ./All
  archive.extractall('./All')
with zipfile.ZipFile('covid-chestxray-dataset.zip') as archive: #Extract covid-chestxray-Dataset into ./covid-chestxray-dataset
  archive.extractall('./covid-chestxray-dataset')

In [None]:
fold_num=1 #Select the fold number; it decides which covid19/prepared_csv_files/fold{N}/ training CSV files are copied and used

In [None]:
#Create the directory that receives the converted images.
#exist_ok=True makes a re-run harmless when the directory is already there, and makedirs also creates
#the parent 'All' directory if it is missing. Unlike the former bare `except: pass`, a real failure
#(for example a permission problem) is reported here instead of surfacing later as missing images.
os.makedirs('All/All', exist_ok=True)

In [None]:
#Clone the project repository; the prepared CSV files are copied from covid19/prepared_csv_files afterwards.
!git clone https://github.com/mr7495/covid19 #connect to our repository on GitHub

In [None]:
#Warning: the prepared All.csv and the train1.csv to train8.csv files of every fold were built from the covid-chestxray-dataset as it was on 12 April.
#They match the covid-chestxray-dataset.zip shared at https://drive.google.com/file/d/1Bwn4vTQUUB0tHK5aHh--Rk6eOxs2jg3q/view?usp=sharing (the file used by the cells above).
#If you load an updated covid-chestxray-dataset instead, you must adapt the CSV files yourself.
#The code that creates All.csv and the training CSV files is available in covid19/dataset preparing.ipynb.

#All.csv holds the annotations of every image and goes next to the extracted data.
shutil.copy('covid19/prepared_csv_files/All.csv','All')

#Fetch the CSV file of each of the 8 training phases of the selected fold.
for phase in range(1,9):
  phase_csv='covid19/prepared_csv_files/fold{}/train{}.csv'.format(fold_num,phase)
  shutil.copy(phase_csv,'.')
  globals()['train{}'.format(phase)]=[] #Define train1 ... train8 as empty module-level lists

In [None]:
#Read the paths of the images of both datasets.
def _walk_files(root):
  """Return the path of every file found under root, in os.walk order (empty list if root is missing)."""
  return [os.path.join(folder,name) for folder,_,names in os.walk(root) for name in names]

#Kaggle images come first, followed by the covid-chestxray-dataset images.
images=_walk_files('All/stage_2_train_images')+_walk_files('covid-chestxray-dataset/images')

In [None]:
#Convert every image listed in All/All.csv to a suitable format and save it in the All/All directory.
#
#Kaggle images are referenced in All.csv with a .png name but are stored as .dcm files, while some
#covid-chestxray-dataset images really are .png files. The dataset is therefore chosen by checking
#whether the matching .dcm file exists. The former bare `except:` made the same choice but also hid
#every other error (unreadable DICOM, typo in a path, ...) behind a confusing follow-up failure.
KAGGLE_DICOM_DIR='All/stage_2_train_images'
COVID_IMAGES_DIR='covid-chestxray-dataset/images'
CONVERTED_DIR='All/All'

def _save_dicom_as_png(dicom_path,png_name):
  """Read the DICOM file at dicom_path and write its pixel array to CONVERTED_DIR/png_name."""
  ds=dicom.dcmread(dicom_path)
  cv2.imwrite(os.path.join(CONVERTED_DIR,png_name),ds.pixel_array)

def _save_covid_image_as_gray(image_name):
  """Read COVID_IMAGES_DIR/image_name, convert it to grayscale and write it to CONVERTED_DIR/image_name.

  Raises:
    OSError: if OpenCV cannot read the source image. cv2.imread signals this by returning None,
      which would otherwise only show up as an opaque cv2.cvtColor error.
  """
  source_path=os.path.join(COVID_IMAGES_DIR,image_name)
  img=cv2.imread(source_path)
  if img is None:
    raise OSError('Could not read image (missing or unreadable): {}'.format(source_path))
  gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
  cv2.imwrite(os.path.join(CONVERTED_DIR,image_name),gray)

csv_all=pd.read_csv('All/All.csv') #The CSV file that contains the names of the images with their labels
for filename in csv_all['filename']:
  png_index=filename.find('.png')
  if png_index==-1: #No '.png' in the name: the image belongs to the covid-chestxray-dataset
    _save_covid_image_as_gray(filename)
    continue
  png_name=filename[:png_index+4] #Name cut right after '.png'
  dicom_path=os.path.join(KAGGLE_DICOM_DIR,filename[:png_index]+'.dcm')
  if os.path.isfile(dicom_path): #A matching .dcm file exists: Kaggle image
    _save_dicom_as_png(dicom_path,png_name)
  else: #Otherwise it is a genuine .png image of the covid-chestxray-dataset
    _save_covid_image_as_gray(png_name)

In [None]:
#Load the annotations and write all_test.csv, which lists every row of All.csv that is not a training row.
def _read_csv_rows(path):
  """Return every row of the CSV file at path (first row included) as a list of lists of strings."""
  with open(path,newline='',mode='r') as csvfile:
    return list(csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL))

All=_read_csv_rows('All/All.csv') #The list that is read from All.csv (its first row included)
all_train=[] #This list contains the training annotations
all_test=[] #Left empty by this cell
for i in range(1,9): #Add the rows of train1.csv to train8.csv; 1 to 8 indicate the 8 training phases
  all_train.extend(_read_csv_rows('train{}.csv'.format(i)))

#Membership is tested against a set of tuples: checking `row not in all_train` on the list rescans
#the whole training list for every row of All, which is quadratic in the dataset size.
train_rows=set(tuple(row) for row in all_train)
#NOTE(review): All also holds the first (header) row of All.csv; it stays out of all_test.csv only
#if the train CSV files contain an identical row - confirm against the prepared CSV files.
with open('all_test.csv',newline='', mode='w') as csvfile: #Add all the images that do not belong to the training phases to the test set
  csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  csvwriter.writerow(['filename','class'])
  for row in All:
    if tuple(row) not in train_rows:
      csvwriter.writerow(row)

In [None]:
#During training we only save the epochs that reach the best validation accuracy, so the network has to be validated after every epoch.
#Validating each epoch on all 11302 images would be terribly time-consuming,
#so we create s_test.csv, a randomly selected set of 631 images, and use it to evaluate the network during the training process.

In [None]:
#Create s_test.csv, the reduced test set used to evaluate the network during training.
#It keeps every COVID-19 row that is not a training row, plus at most MAX_ROWS_PER_CLASS 'normal'
#rows and at most MAX_ROWS_PER_CLASS rows of all the remaining classes together.
#NOTE(review): no random seed is set in the visible cells, so the selection differs between runs.
MAX_ROWS_PER_CLASS=300

random.shuffle(All) #Shuffle the All list in place; a single shuffle already gives a uniformly random order
train_rows=set(tuple(row) for row in all_train) #Set lookup instead of rescanning the all_train list for every row
with open('s_test.csv',newline='', mode='w') as csvfile:
  csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
  csvwriter.writerow(['filename','class'])
  ln=0 #Number of 'normal' rows written so far
  lp=0 #Number of rows of the other (neither COVID-19 nor normal) classes written so far
  for row in All:
    if tuple(row) in train_rows: #Training rows never go to the test set
      continue
    if row[1]=='COVID-19': #COVID-19 rows are all kept
      csvwriter.writerow(row)
    elif row[1]=='normal':
      if ln<MAX_ROWS_PER_CLASS:
        csvwriter.writerow(row)
        ln+=1
    elif lp<MAX_ROWS_PER_CLASS:
      csvwriter.writerow(row)
      lp+=1

In [None]:
#Remove the files that are no longer needed, to increase the free space.
def discard_path(path):
  """Delete the file or directory tree at path on a best-effort basis.

  Nothing happens when path does not exist, and removal errors are ignored, so calling this
  again (for example when the cell is re-run) is harmless.
  """
  if os.path.isdir(path):
    shutil.rmtree(path,ignore_errors=True)
    return
  try:
    os.remove(path)
  except OSError: #Missing file or removal refused: cleanup stays best-effort
    pass

#Every path is handled on its own: with a single try block, one failed removal (such as kaggle.zip
#already being deleted) skipped all the removals that followed it.
for unneeded in ('kaggle.zip','All/stage_2_train_images','All/stage_2_test_images','covid-chestxray-dataset'):
  discard_path(unneeded)