# Import libraries

In [1]:
%matplotlib inline
from PIL import Image
from os import listdir
from skimage import data, img_as_float
from skimage import exposure
from skimage.filters import gaussian
from skimage.transform import rotate, AffineTransform, warp
from skimage.util import random_noise
from sklearn.metrics import classification_report, confusion_matrix
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision import transforms, utils, datasets
from tqdm.notebook import tqdm
import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import skimage.io as io
import torch
import torch.nn as nn
import torch.nn.functional
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import warnings
warnings.filterwarnings('ignore')

# User-defined functions (UDF)

In [2]:
def Unique_values_from_Column (df, ColumnName):
    """Return the distinct values of one DataFrame column as a Python list.

    Args:
        df: DataFrame to read from.
        ColumnName: Label of the column whose distinct values are wanted.

    Returns:
        list: The distinct values, in the order ``Series.unique`` yields them.
    """
    column = df[ColumnName]
    distinct_values = column.unique()
    return distinct_values.tolist()

In [3]:
def remove_files_from_folder(directoryPath, filesEndWith):
    """Delete the files inside ``directoryPath`` that match a glob pattern.

    Args:
        directoryPath: Folder to search.
        filesEndWith: Glob pattern joined onto the folder path (e.g. ``"*.txt"``).
    """
    import glob, os, os.path
    pattern = os.path.join(directoryPath, filesEndWith)
    for matched_path in glob.glob(pattern):
        os.remove(matched_path)

In [4]:
def Remove_duplicated_images(df):
    """Return the rows of ``df`` whose ``ImagePath`` points to an existing file.

    Despite the name, images are not compared with each other: a row is
    flagged (column ``Duplicted`` set to True) when its ``ImagePath`` is not a
    file on disk, and flagged rows are dropped.

    Args:
        df: DataFrame with an ``ImagePath`` column of file paths.

    Returns:
        DataFrame: The kept rows, with an extra boolean ``Duplicted`` column
        that is False on every returned row. The input frame is not modified.
    """
    from pathlib import Path
    # Work on a copy: adding the flag column directly to ``df`` would mutate the
    # caller's frame (and write into a slice when ``df`` is one).
    flagged = df.copy()
    flagged['Duplicted'] = flagged['ImagePath'].apply(lambda x : not Path(x).is_file())
    return flagged.loc[flagged['Duplicted'] == False]

In [5]:
def create_directory (Path):
    """Create the directory ``Path`` (including parents) if nothing exists there yet.

    Args:
        Path: Location of the directory to create.

    Returns:
        bool: True when the directory was created, False when the path
        already existed.
    """
    import os
    if os.path.exists(Path):
        return False
    os.makedirs(Path)
    return True

# Data Selection & Cleaning

## Data1

### Loading data

In [6]:
# Metadata CSV of the first dataset (relative path).
CsvPath='Chest_xray_Corona_Metadata.csv'
df1 = pd.read_csv(CsvPath)

Add a column that describes the type of pneumonia (non-COVID-19 or COVID-19).

Add a column containing the path of each image in the source folder.

In [7]:
# Label_2 starts as a copy of the virus category, so the source column stays unchanged.
df1['Label_2'] = df1['Label_2_Virus_category']
# Image path = dataset root folder + Dataset_type sub-folder + image file name.
df1['ImagePath'] = "Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/"+ df1['Dataset_type']+"/"+df1['X_ray_image_name']

Classify Streptococcus, ARDS and SARS as Pneumonia diseases different from COVID-19.

In [8]:
# Collapse the Streptococcus / ARDS / SARS categories into a single
# "non-COVID-19" class. isin() relabels all of them in one vectorized pass.
df1.loc[df1['Label_2'].isin(['Streptococcus','ARDS','SARS']), 'Label_2'] = "non-COVID-19"
# Rows whose file name contains "bacteria" are also non-COVID-19.
# na=False treats a missing file name as "no match"; without it a NaN in the
# mask makes the boolean .loc selection raise.
df1.loc[df1['X_ray_image_name'].str.contains("bacteria", na=False), 'Label_2'] = "non-COVID-19"

### Data selection

In [9]:
# Keep only the columns used downstream; the trailing len() displays the row count.
df1=df1[["ImagePath", "Label","Label_2", "Label_2_Virus_category","X_ray_image_name"]]
len(df1)

5910

### Remove Duplicated data

In [10]:
# Keeps only the rows whose ImagePath is an existing file, then shows the remaining row count.
df1=Remove_duplicated_images(df1)
len(df1)

5876

### Save data in CSV file

In [11]:
# Persist the cleaned first dataset (the DataFrame index is written as well).
df1.to_csv('data1.csv')

## Data 2

### Loading Data

In [12]:
# Metadata CSV of the second dataset (relative path).
CsvPath2='covid-chestxray-dataset-master/covid-chestxray-dataset-master/metadata.csv'
df2 = pd.read_csv(CsvPath2)

Add derived columns so that the original columns stay unchanged while the new information is added.

In [13]:
# Derived columns give the second dataset the same column names as the first;
# the source columns ('finding', 'filename') are left untouched.
df2['Label_2'] = df2['finding']
df2['X_ray_image_name'] = df2['filename']
df2['Label_2_Virus_category']= df2['finding']
df2['ImagePath'] = "covid-chestxray-dataset-master/covid-chestxray-dataset-master/images/"+ df2['filename']
# Keep rows from the 'images' folder whose finding is not one of the excluded
# values. .copy() makes the filtered result an independent frame, so the
# assignments below do not write into a slice of the unfiltered data.
df2 = df2[
    (df2['finding'] != 'No Finding')
    & (df2['finding'] != 'COVID-19, ARDS')
    & (df2['finding'] != 'todo')
    & (df2['folder'] == 'images')
].copy()
# Spelling kept as-is: the merged data uses 'Pnemonia' as the label value.
df2['Label'] = "Pnemonia"
# Every finding other than COVID-19 becomes "non-COVID-19". Missing findings
# are left missing (notna() guard), exactly as a per-label equality loop would.
df2.loc[df2['Label_2'].notna() & (df2['Label_2'] != 'COVID-19'), 'Label_2'] = "non-COVID-19"
# na=False: a missing file name counts as "no match" instead of producing a
# NaN in the mask, which would make the boolean .loc selection raise.
df2.loc[df2['X_ray_image_name'].str.contains("bacteria", na=False), 'Label_2'] = "non-COVID-19"

### Data Selection

In [14]:
# Keep the same five columns as selected for the first dataset.
df2=df2[["ImagePath", "Label", "Label_2", "Label_2_Virus_category","X_ray_image_name"]]

### Remove Duplicated Data

In [15]:
# Row count before filtering on image-file existence.
len(df2)

488

In [16]:
# Keeps only the rows whose ImagePath is an existing file, then shows the remaining row count.
df2=Remove_duplicated_images(df2)
len(df2)

395

### Save data in CSV file

In [17]:
# Persist the cleaned second dataset (the DataFrame index is written as well).
df2.to_csv('data2.csv')

## Merge 2 datasets

In [18]:
# Stack both datasets into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df = pd.concat([df1, df2], ignore_index=True, sort=False)

### Remove Duplicated Data

In [19]:
# Row count of the merged frame.
len(df)

6271

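No further rows are removed here, because the same filter (which drops rows whose image file is not found on disk) was already applied to the first and second datasets before merging.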

In [20]:
# Re-apply the image-file existence filter to the merged frame and show the row count.
df=Remove_duplicated_images(df)
len(df)

6271

### Replace NULL Values

In [21]:
# Replace every missing value with an empty string.
df=df.fillna("")

### Visualize columns' possible values

In [22]:
# Display the merged frame (the rendered output is truncated by pandas).
df

Unnamed: 0,ImagePath,Label,Label_2,Label_2_Virus_category,X_ray_image_name,Duplicted
0,Coronahack-Chest-XRay-Dataset/Coronahack-Chest...,Normal,,,IM-0128-0001.jpeg,False
1,Coronahack-Chest-XRay-Dataset/Coronahack-Chest...,Normal,,,IM-0127-0001.jpeg,False
2,Coronahack-Chest-XRay-Dataset/Coronahack-Chest...,Normal,,,IM-0125-0001.jpeg,False
3,Coronahack-Chest-XRay-Dataset/Coronahack-Chest...,Normal,,,IM-0122-0001.jpeg,False
4,Coronahack-Chest-XRay-Dataset/Coronahack-Chest...,Normal,,,IM-0119-0001.jpeg,False
...,...,...,...,...,...,...
6266,covid-chestxray-dataset-master/covid-chestxray...,Pnemonia,non-COVID-19,Klebsiella,000012-3.jpg,False
6267,covid-chestxray-dataset-master/covid-chestxray...,Pnemonia,non-COVID-19,Influenza,000001-6.png,False
6268,covid-chestxray-dataset-master/covid-chestxray...,Pnemonia,non-COVID-19,Influenza,000002-7.png,False
6269,covid-chestxray-dataset-master/covid-chestxray...,Pnemonia,non-COVID-19,Legionella,000001-27.jpg,False


In [23]:
# Distinct values of the top-level label column.
Unique_values_from_Column (df, 'Label')

['Normal', 'Pnemonia']

In [24]:
# Distinct values of Label_2; '' marks rows whose value was missing before fillna("").
Unique_values_from_Column (df, 'Label_2')

['', 'non-COVID-19', 'COVID-19']

In [25]:
# Distinct values of the original virus-category column.
Unique_values_from_Column (df, 'Label_2_Virus_category')

['',
 'Streptococcus',
 'COVID-19',
 'ARDS',
 'SARS',
 'Pneumocystis',
 'Chlamydophila',
 'E.Coli',
 'Klebsiella',
 'Legionella',
 'Pneumonia',
 'Varicella',
 'Bacterial',
 'Influenza']

### Save dataframe

In [26]:
# Make sure the output folder exists (the returned flag is not used here), then write the merged data.
create_directory ("data")
df.to_csv("data/data.csv")