# 1-Import relevant packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 2-Load and visualise the data

Giemsa-stained thin blood smear slides from 150 P. falciparum-infected and 50 healthy patients were collected and photographed at Chittagong Medical College Hospital, Bangladesh. The smartphone’s built-in camera acquired images of slides for each microscopic field of view. The images were manually annotated by an expert slide reader at the Mahidol-Oxford Tropical Medicine Research Unit in Bangkok, Thailand. The de-identified images and annotations are archived at NLM (IRB#12972). We applied a level-set based algorithm to detect and segment the red blood cells. The dataset contains a total of 27,558 cell images with equal instances of parasitized and uninfected cells

TensorFlow data builder link: tfds.image.malaria.Malaria Publication link: https://lhncbc.nlm.nih.gov/publication/pub9932

## 2.1 Load the labels

In [2]:
parasited=pd.read_csv("/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/patientid_cellmapping_parasitized.csv")
uninfected = pd.read_csv("/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/patientid_cellmapping_uninfected.csv")

In [11]:
parasited.head()


Unnamed: 0,C100P61ThinF,['C100P61ThinF_IMG_20150918_144104_cell_162.png','C100P61ThinF_IMG_20150918_144104_cell_163.png','C100P61ThinF_IMG_20150918_144104_cell_164.png','C100P61ThinF_IMG_20150918_144104_cell_165.png','C100P61ThinF_IMG_20150918_144104_cell_166.png','C100P61ThinF_IMG_20150918_144104_cell_167.png','C100P61ThinF_IMG_20150918_144104_cell_168.png','C100P61ThinF_IMG_20150918_144104_cell_169.png','C100P61ThinF_IMG_20150918_144104_cell_170.png',...,Unnamed: 624,Unnamed: 625,Unnamed: 626,Unnamed: 627,Unnamed: 628,Unnamed: 629,Unnamed: 630,Unnamed: 631,Unnamed: 632,Unnamed: 633
0,C101P62ThinF,['C101P62ThinF_IMG_20150918_151006_cell_61.png','C101P62ThinF_IMG_20150918_151006_cell_62.png','C101P62ThinF_IMG_20150918_151006_cell_63.png','C101P62ThinF_IMG_20150918_151006_cell_64.png','C101P62ThinF_IMG_20150918_151006_cell_65.png','C101P62ThinF_IMG_20150918_151006_cell_66.png','C101P62ThinF_IMG_20150918_151006_cell_67.png','C101P62ThinF_IMG_20150918_151006_cell_68.png','C101P62ThinF_IMG_20150918_151006_cell_69.png',...,,,,,,,,,,
1,C102P63ThinF,['C102P63ThinF_IMG_20150918_161508_cell_190.png','C102P63ThinF_IMG_20150918_161508_cell_191.png','C102P63ThinF_IMG_20150918_161508_cell_192.png','C102P63ThinF_IMG_20150918_161508_cell_193.png','C102P63ThinF_IMG_20150918_161508_cell_194.png','C102P63ThinF_IMG_20150918_161508_cell_195.png','C102P63ThinF_IMG_20150918_161826_cell_170.png','C102P63ThinF_IMG_20150918_161826_cell_171.png','C102P63ThinF_IMG_20150918_161826_cell_172.png',...,,,,,,,,,,
2,C103P64ThinF,['C103P64ThinF_IMG_20150918_164250_cell_170.png','C103P64ThinF_IMG_20150918_164250_cell_171.png','C103P64ThinF_IMG_20150918_164250_cell_172.png','C103P64ThinF_IMG_20150918_164250_cell_173.png','C103P64ThinF_IMG_20150918_164250_cell_174.png','C103P64ThinF_IMG_20150918_164250_cell_175.png','C103P64ThinF_IMG_20150918_164331_cell_184.png','C103P64ThinF_IMG_20150918_164331_cell_185.png','C103P64ThinF_IMG_20150918_164331_cell_186.png',...,,,,,,,,,,
3,C104P65ThinF,['C104P65ThinF_IMG_20150918_170850_cell_204.png','C104P65ThinF_IMG_20150918_171154_cell_212.png','C104P65ThinF_IMG_20150918_171154_cell_213.png','C104P65ThinF_IMG_20150918_171751_cell_186.png','C104P65ThinF_IMG_20150918_171751_cell_187.png','C104P65ThinF_IMG_20150918_172053_cell_187.png','C104P65ThinF_IMG_20150918_172537_cell_149.png','C104P65ThinF_IMG_20150918_172639_cell_182.png','C104P65ThinF_IMG_20150918_172639_cell_183.png',...,,,,,,,,,,
4,C105P66ThinF,['C105P66ThinF_IMG_20150924_094800_cell_136.png','C105P66ThinF_IMG_20150924_094800_cell_137.png','C105P66ThinF_IMG_20150924_095034_cell_148.png','C105P66ThinF_IMG_20150924_095034_cell_149.png','C105P66ThinF_IMG_20150924_095034_cell_150.png','C105P66ThinF_IMG_20150924_095130_cell_155.png','C105P66ThinF_IMG_20150924_095130_cell_156.png','C105P66ThinF_IMG_20150924_095130_cell_157.png','C105P66ThinF_IMG_20150924_095130_cell_158.png',...,,,,,,,,,,


In [12]:
parasited.shape

(150, 634)

In [13]:
uninfected.head()


Unnamed: 0,C100P61ThinF,['C100P61ThinF_IMG_20150918_144104_cell_128.png','C100P61ThinF_IMG_20150918_144104_cell_131.png','C100P61ThinF_IMG_20150918_144104_cell_144.png','C100P61ThinF_IMG_20150918_144104_cell_21.png','C100P61ThinF_IMG_20150918_144104_cell_25.png','C100P61ThinF_IMG_20150918_144104_cell_34.png','C100P61ThinF_IMG_20150918_144104_cell_48.png','C100P61ThinF_IMG_20150918_144104_cell_65.png','C100P61ThinF_IMG_20150918_144348_cell_108.png',...,'C100P61ThinF_IMG_20150918_150041_cell_76.png'],Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77
0,C101P62ThinF,['C101P62ThinF_IMG_20150918_151006_cell_17.png','C101P62ThinF_IMG_20150918_151006_cell_29.png','C101P62ThinF_IMG_20150918_151006_cell_32.png','C101P62ThinF_IMG_20150918_151006_cell_53.png','C101P62ThinF_IMG_20150918_151006_cell_56.png','C101P62ThinF_IMG_20150918_151006_cell_59.png','C101P62ThinF_IMG_20150918_151006_cell_8.png','C101P62ThinF_IMG_20150918_151149_cell_23.png','C101P62ThinF_IMG_20150918_151149_cell_28.png',...,'C101P62ThinF_IMG_20150923_170344_cell_61.png','C101P62ThinF_IMG_20150923_170344_cell_66.png','C101P62ThinF_IMG_20150923_170344_cell_69.png','C101P62ThinF_IMG_20150923_170344_cell_70.png'],,,,,,
1,C102P63ThinF,['C102P63ThinF_IMG_20150918_161508_cell_101.png','C102P63ThinF_IMG_20150918_161508_cell_113.png','C102P63ThinF_IMG_20150918_161508_cell_133.png','C102P63ThinF_IMG_20150918_161508_cell_14.png','C102P63ThinF_IMG_20150918_161508_cell_143.png','C102P63ThinF_IMG_20150918_161508_cell_150.png','C102P63ThinF_IMG_20150918_161508_cell_172.png','C102P63ThinF_IMG_20150918_161508_cell_175.png','C102P63ThinF_IMG_20150918_161508_cell_37.png',...,'C102P63ThinF_IMG_20150918_163054_cell_87.png'],,,,,,,,,
2,C103P64ThinF,['C103P64ThinF_IMG_20150918_164250_cell_121.png','C103P64ThinF_IMG_20150918_164250_cell_128.png','C103P64ThinF_IMG_20150918_164250_cell_15.png','C103P64ThinF_IMG_20150918_164250_cell_153.png','C103P64ThinF_IMG_20150918_164250_cell_164.png','C103P64ThinF_IMG_20150918_164250_cell_19.png','C103P64ThinF_IMG_20150918_164250_cell_28.png','C103P64ThinF_IMG_20150918_164250_cell_39.png','C103P64ThinF_IMG_20150918_164250_cell_67.png',...,'C103P64ThinF_IMG_20150918_165510_cell_98.png'],,,,,,,,,
3,C104P65ThinF,['C104P65ThinF_IMG_20150918_170850_cell_140.png','C104P65ThinF_IMG_20150918_170850_cell_167.png','C104P65ThinF_IMG_20150918_170850_cell_174.png','C104P65ThinF_IMG_20150918_170850_cell_189.png','C104P65ThinF_IMG_20150918_170850_cell_29.png','C104P65ThinF_IMG_20150918_170850_cell_32.png','C104P65ThinF_IMG_20150918_170850_cell_66.png','C104P65ThinF_IMG_20150918_170850_cell_68.png','C104P65ThinF_IMG_20150918_170850_cell_83.png',...,'C104P65ThinF_IMG_20150918_172639_cell_34.png','C104P65ThinF_IMG_20150918_172639_cell_44.png','C104P65ThinF_IMG_20150918_172639_cell_5.png','C104P65ThinF_IMG_20150918_172639_cell_93.png','C104P65ThinF_IMG_20150918_172639_cell_95.png'],,,,,
4,C105P66ThinF,['C105P66ThinF_IMG_20150924_094800_cell_11.png','C105P66ThinF_IMG_20150924_094800_cell_132.png','C105P66ThinF_IMG_20150924_094800_cell_30.png','C105P66ThinF_IMG_20150924_094800_cell_39.png','C105P66ThinF_IMG_20150924_094800_cell_62.png','C105P66ThinF_IMG_20150924_094800_cell_63.png','C105P66ThinF_IMG_20150924_094800_cell_70.png','C105P66ThinF_IMG_20150924_094800_cell_74.png','C105P66ThinF_IMG_20150924_094800_cell_90.png',...,'C105P66ThinF_IMG_20150924_100655_cell_96.png'],,,,,,,,,


In [14]:
uninfected.shape

(200, 78)

For both datasets each row represents an individual and the columns represent cell images from that individual.

First we remove the first column

Then we can do:
- put all the image names in one single column and then add another column with with the label 
- After that we can join both uninfected and infected dataframes.

We can then: 
- "Shuffle" the rows (can use: from sklearn.utils import shuffle)
- Then sample the dataframe for the training and the test datasets.

In [17]:
uninfected.iloc[:,0]

0      C101P62ThinF
1      C102P63ThinF
2      C103P64ThinF
3      C104P65ThinF
4      C105P66ThinF
           ...     
195     C95P56ThinF
196     C96P57ThinF
197     C97P58ThinF
198     C98P59ThinF
199     C99P60ThinF
Name: C100P61ThinF, Length: 200, dtype: object

In [3]:
uninfected_new=uninfected.iloc[:, 1:]

In [4]:
parasited_new=parasited.iloc[:, 1:]

In [5]:
#now we create a vector of both of them:
uninfected_new = np.asarray(uninfected_new)
parasited_new = np.asarray(parasited_new)

In [27]:
uninfected_new

array([["['C101P62ThinF_IMG_20150918_151006_cell_17.png'",
        " 'C101P62ThinF_IMG_20150918_151006_cell_29.png'",
        " 'C101P62ThinF_IMG_20150918_151006_cell_32.png'", ..., nan, nan,
        nan],
       ["['C102P63ThinF_IMG_20150918_161508_cell_101.png'",
        " 'C102P63ThinF_IMG_20150918_161508_cell_113.png'",
        " 'C102P63ThinF_IMG_20150918_161508_cell_133.png'", ..., nan,
        nan, nan],
       ["['C103P64ThinF_IMG_20150918_164250_cell_121.png'",
        " 'C103P64ThinF_IMG_20150918_164250_cell_128.png'",
        " 'C103P64ThinF_IMG_20150918_164250_cell_15.png'", ..., nan, nan,
        nan],
       ...,
       ["['C97P58ThinF_IMG_20150917_145555_cell_103.png'",
        " 'C97P58ThinF_IMG_20150917_145555_cell_12.png'",
        " 'C97P58ThinF_IMG_20150917_145555_cell_122.png'", ..., nan, nan,
        nan],
       ["['C98P59ThinF_IMG_20150917_153030_cell_120.png'",
        " 'C98P59ThinF_IMG_20150917_153030_cell_136.png'",
        " 'C98P59ThinF_IMG_20150917_153030

In [6]:
#convert to a one dimensional array:
lab_uninfected=pd.DataFrame(uninfected_new.flatten())

In [7]:
lab_uninfected.head()

Unnamed: 0,0
0,['C101P62ThinF_IMG_20150918_151006_cell_17.png'
1,'C101P62ThinF_IMG_20150918_151006_cell_29.png'
2,'C101P62ThinF_IMG_20150918_151006_cell_32.png'
3,'C101P62ThinF_IMG_20150918_151006_cell_53.png'
4,'C101P62ThinF_IMG_20150918_151006_cell_56.png'


In [8]:
#add column with label:
lab_uninfected["infect_status"]=0
lab_uninfected.head()

Unnamed: 0,0,infect_status
0,['C101P62ThinF_IMG_20150918_151006_cell_17.png',0
1,'C101P62ThinF_IMG_20150918_151006_cell_29.png',0
2,'C101P62ThinF_IMG_20150918_151006_cell_32.png',0
3,'C101P62ThinF_IMG_20150918_151006_cell_53.png',0
4,'C101P62ThinF_IMG_20150918_151006_cell_56.png',0


In [9]:
lab_uninfected.shape

(15400, 2)

Now we need to:
- remove the square brackets 
- do the same with the parasited.

In [10]:
#now for the parasited DF:
##convert to a one dimensional array:
lab_parasited=pd.DataFrame(parasited_new.flatten())
##add one column with label:
lab_parasited["infect_status"]=1
lab_parasited.head()

Unnamed: 0,0,infect_status
0,['C101P62ThinF_IMG_20150918_151006_cell_61.png',1
1,'C101P62ThinF_IMG_20150918_151006_cell_62.png',1
2,'C101P62ThinF_IMG_20150918_151006_cell_63.png',1
3,'C101P62ThinF_IMG_20150918_151006_cell_64.png',1
4,'C101P62ThinF_IMG_20150918_151006_cell_65.png',1


In [11]:
lab_parasited.shape

(94950, 2)

this is a bit of a strange number ... aha! need to remove the NAs!!!From both dfs.

In [12]:
#remove NAs
lab_parasited=lab_parasited.dropna()
lab_uninfected=lab_uninfected.dropna()

In [13]:
print(lab_uninfected.shape, lab_parasited.shape)

(13711, 2) (13722, 2)


In [14]:
#find the square brackets and remove it from the rows that have ! 
lab_parasited.iloc[:,0]=lab_parasited.iloc[:, 0].str.replace("[", "")
lab_parasited.iloc[:,0]=lab_parasited.iloc[:, 0].str.replace("]", "")
lab_parasited.iloc[:,0]=lab_parasited.iloc[:, 0].str.replace("'", "")
lab_parasited.head()

Unnamed: 0,0,infect_status
0,C101P62ThinF_IMG_20150918_151006_cell_61.png,1
1,C101P62ThinF_IMG_20150918_151006_cell_62.png,1
2,C101P62ThinF_IMG_20150918_151006_cell_63.png,1
3,C101P62ThinF_IMG_20150918_151006_cell_64.png,1
4,C101P62ThinF_IMG_20150918_151006_cell_65.png,1


In [15]:
lab_parasited.tail()

Unnamed: 0,0,infect_status
94876,C99P60ThinF_IMG_20150918_142334_cell_5.png,1
94877,C99P60ThinF_IMG_20150918_142334_cell_6.png,1
94878,C99P60ThinF_IMG_20150918_142334_cell_7.png,1
94879,C99P60ThinF_IMG_20150918_142334_cell_8.png,1
94880,C99P60ThinF_IMG_20150918_142334_cell_9.png,1


In [16]:
#find the square brackets and remove it from the rows that have ! 
lab_uninfected.iloc[:,0]=lab_uninfected.iloc[:, 0].str.replace("[", "")
lab_uninfected.iloc[:,0]=lab_uninfected.iloc[:, 0].str.replace("]", "")
lab_uninfected.iloc[:,0]=lab_uninfected.iloc[:, 0].str.replace("'", "")
lab_uninfected.head()

Unnamed: 0,0,infect_status
0,C101P62ThinF_IMG_20150918_151006_cell_17.png,0
1,C101P62ThinF_IMG_20150918_151006_cell_29.png,0
2,C101P62ThinF_IMG_20150918_151006_cell_32.png,0
3,C101P62ThinF_IMG_20150918_151006_cell_53.png,0
4,C101P62ThinF_IMG_20150918_151006_cell_56.png,0


In [17]:
lab_uninfected.tail()

Unnamed: 0,0,infect_status
15386,C99P60ThinF_IMG_20150918_142128_cell_47.png,0
15387,C99P60ThinF_IMG_20150918_142128_cell_52.png,0
15388,C99P60ThinF_IMG_20150918_142128_cell_53.png,0
15389,C99P60ThinF_IMG_20150918_142128_cell_55.png,0
15390,C99P60ThinF_IMG_20150918_142128_cell_56.png,0


In [36]:
lab_parasited[0]=lab_parasited[0].str.replace(" ", "")
lab_uninfected[0]=lab_uninfected[0].str.replace(" ", "")

In [37]:
#save CSV files just in case

lab_parasited.to_csv("/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/lab_parasited.csv")
lab_uninfected.to_csv("/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/lab_uninfected.csv")

In [19]:
#merge the dataframes into one

merged=pd.concat([lab_parasited, lab_uninfected])

In [20]:
merged.head()

Unnamed: 0,0,infect_status
0,C101P62ThinF_IMG_20150918_151006_cell_61.png,1
1,C101P62ThinF_IMG_20150918_151006_cell_62.png,1
2,C101P62ThinF_IMG_20150918_151006_cell_63.png,1
3,C101P62ThinF_IMG_20150918_151006_cell_64.png,1
4,C101P62ThinF_IMG_20150918_151006_cell_65.png,1


In [21]:
merged.tail()

Unnamed: 0,0,infect_status
15386,C99P60ThinF_IMG_20150918_142128_cell_47.png,0
15387,C99P60ThinF_IMG_20150918_142128_cell_52.png,0
15388,C99P60ThinF_IMG_20150918_142128_cell_53.png,0
15389,C99P60ThinF_IMG_20150918_142128_cell_55.png,0
15390,C99P60ThinF_IMG_20150918_142128_cell_56.png,0


In [22]:
merged.shape

(27433, 2)

In [33]:
merged[0]=merged[0].str.replace(" ", "")


In [34]:
merged.head()

Unnamed: 0,0,infect_status
0,C101P62ThinF_IMG_20150918_151006_cell_61.png,1
1,C101P62ThinF_IMG_20150918_151006_cell_62.png,1
2,C101P62ThinF_IMG_20150918_151006_cell_63.png,1
3,C101P62ThinF_IMG_20150918_151006_cell_64.png,1
4,C101P62ThinF_IMG_20150918_151006_cell_65.png,1


Now save.

In [38]:
merged.to_csv('/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/labels.csv')

## 2.2 Explore images

In [24]:
#import relevant libraries
import shutil 
import tarfile
from zipfile import ZipFile #this causes problems in combination with the built in zip function ! !!!!!!!!

In [115]:
# specifying the zip file name 
cell_images = "cell_images.zip"
  
# opening the zip file in READ mode 
with ZipFile(cell_images, 'r') as zip: 
    # extracting all the files 
    print('Extracting all the files now...') 
    zip.extractall() 
    print('Done!') 

Extracting all the files now...
Done!


I think this is a virtual environment probelm maybe the zip library doens't work well with zip ... so need to do the function in an independent environment and notebook! 