In [1]:
import sklearn
from sklearn import utils, neighbors, ensemble, svm, metrics

In [2]:
# Load the libraries
import os
import imageio
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import seaborn as sns

In [3]:
#read_csv

labels=pd.read_csv("/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/labels.csv")



In [4]:
#shuffle the df
random.seed(30)
labels = sklearn.utils.shuffle(labels)

In [5]:
labels.head()

Unnamed: 0.1,Unnamed: 0,0,infect_status
8680,59549,C45P6ThinF_IMG_20151130_155110_cell_207.png,1
24749,12378,C62P23N_ThinF_IMG_20150818_133527_cell_103.png,0
26527,14374,C86P47ThinF_IMG_20150820_124943_cell_78.png,0
5550,47714,C176P137NThinF_IMG_20151201_122811_cell_162.png,1
25451,13157,C71P32_ThinF_IMG_20150813_163655_cell_58.png,0


In [6]:
labels.to_csv("/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/shuffled_labels.csv", index=False)

## Splitting the training and test sets

Now we have a shuffled label df, we need to decide what the best way to split the training and test sets.
- 80% and 20% ? or something else? how do we determine the best way to do it?
- in any case we should have 50% of images that are infected and uninfected in the training set ( and the test set)

In [7]:
#sampleing labels for the training set:
training=labels.sample(frac=0.8, replace=False, random_state=30)

In [8]:
training.shape

(21946, 3)

In [9]:
training.head()

Unnamed: 0.1,Unnamed: 0,0,infect_status
2297,18025,C129P90ThinF_IMG_20151004_134944_cell_25.png,1
10054,69701,C60P21thinF_IMG_20150804_105034_cell_119.png,1
1678,13931,C123P84ThinF_IMG_20151002_150931_cell_202.png,1
8372,58597,C39P4thinF_original_IMG_20150622_111942_cell_1...,1
11237,76152,C70P31_ThinF_IMG_20150819_142703_cell_1.png,1


In [10]:
training["infect_status"].value_counts()

0    10977
1    10969
Name: infect_status, dtype: int64

In [11]:
training["0"].value_counts()

C132P93ThinF_IMG_20151004_152642_cell_48.png       1
C132P93ThinF_IMG_20151004_151733_cell_166.png      1
C174P135NThinF_IMG_20151127_135342_cell_221.png    1
C117P78ThinF_IMG_20150930_220616_cell_65.png       1
C143P104ThinF_IMG_20151005_225746_cell_173.png     1
                                                  ..
C222ThinF_IMG_20151115_150925_cell_215.png         1
C59P20thinF_IMG_20150803_112858_cell_157.png       1
C110P71ThinF_IMG_20150930_105319_cell_144.png      1
C1_thinF_IMG_20150604_104722_cell_79.png           1
C137P98ThinF_IMG_20151005_160122_cell_73.png       1
Name: 0, Length: 21946, dtype: int64

Below I could have used the train.test.split function from sklearn.

In [13]:
#sampleing lables for the test set: 
#Identify what values are in labels and not in training:
# %%timeit
diff=set(labels["0"]).difference(training["0"])
where_diff =labels["0"].isin(diff)
#slice labels and add create df
test=labels[where_diff]
test.shape

(5487, 3)

In [14]:
test.head()

Unnamed: 0.1,Unnamed: 0,0,infect_status
26557,14413,C87P48ThinF_IMG_20150820_132514_cell_13.png,0
5098,46231,C174P135NThinF_IMG_20151127_135342_cell_220.png,1
8414,58639,C39P4thinF_original_IMG_20150622_112119_cell_6...,1
13598,94757,C99P60ThinF_IMG_20150918_141857_cell_32.png,1
26837,14722,C91P52ThinF_IMG_20150821_123314_cell_16.png,0


In [15]:
test["infect_status"].value_counts()

1    2753
0    2734
Name: infect_status, dtype: int64

In [16]:
#now save both as csvs
path="/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/"
training.to_csv((path+"training_labels.csv"), index=False)
test.to_csv((path+"test_labels.csv"), index=False)

In [17]:
#Just to check that thecsvs look fine 
a=pd.read_csv((path+"training_labels.csv"))
a.shape

(21946, 3)

In [18]:
b=pd.read_csv((path+"test_labels.csv"))
b.shape

(5487, 3)

In [19]:
c=pd.read_csv((path+"shuffled_labels.csv"))
c.shape

(27433, 3)

All looks good ecscept that we now have tw extra useless columns! 

Now we need to put all pictures in one folder and then create a function to grab from that folder the training and the test images.

## Split the training set in cross validation sets...

## Making a toy dataset label file

In [20]:
#randomly select 200 parasited and uninfected images:

toy=labels.sample(n=200, replace=False, random_state=30)



In [21]:
toy.shape

(200, 3)

In [22]:
toy["infect_status"].value_counts()

0    106
1     94
Name: infect_status, dtype: int64

This toy doesn't have the same number of uninfected and infected images, so I am going to try a different strategy:

In [23]:
grouped=labels.groupby("infect_status")
toy_df=grouped.apply(lambda x: x.sample(n=100, replace=False))
toy_df["infect_status"].value_counts()

1    100
0    100
Name: infect_status, dtype: int64

I will now check that they are unique images: 

In [24]:
print(len(toy_df.drop_duplicates()))

200


In [25]:
toy_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,0,infect_status
infect_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,23577,11064,C47P8thin_Original_Motic_IMG_20150714_093947_c...,0
0,21166,8356,C217ThinF_IMG_20151106_141500_cell_98.png,0
0,18461,5328,C168P129ThinF_IMG_20151118_154126_cell_78.png,0
0,24083,11641,C54P15thinF_IMG_20150728_105949_cell_103.png,0
0,24821,12458,C63P24N_ThinF_IMG_20150818_144456_cell_135.png,0


In [26]:
#just checking that the label is correct!
labels[labels["0"]=="C53P14thinF_IMG_20150726_114606_cell_11.png"]


Unnamed: 0.1,Unnamed: 0,0,infect_status
24018,11567,C53P14thinF_IMG_20150726_114606_cell_11.png,0


In [27]:
#shuffle: 
random.seed(30)
toy_df = sklearn.utils.shuffle(toy_df)

In [28]:
toy_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,0,infect_status
infect_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,13152,93717,C98P59ThinF_IMG_20150917_154452_cell_204.png,1
1,1781,14604,C124P85ThinF_IMG_20151002_155223_cell_170.png,1
0,14188,515,C107P68ThinF_IMG_20150924_222535_cell_69.png,0
0,23264,10718,C42P5thin_original_IMG_20150623_134547_cell_15...,0
1,11405,79822,C76P37ThinF_IMG_20150815_173019_cell_233.png,1


In [33]:
#now save df:
path="/Users/anaraquelpengelly/Desktop/MSC_health_data_science/term_2/machine_learning/project_malaria/Malaria_blood_image_classification/"
toy_df.to_csv((path+"toy_df.csv"), index=False)

In [34]:
#test:
toy_df=pd.read_csv(path+"toy_df.csv")
toy_df.head()

Unnamed: 0.1,Unnamed: 0,0,infect_status
0,93717,C98P59ThinF_IMG_20150917_154452_cell_204.png,1
1,14604,C124P85ThinF_IMG_20151002_155223_cell_170.png,1
2,515,C107P68ThinF_IMG_20150924_222535_cell_69.png,0
3,10718,C42P5thin_original_IMG_20150623_134547_cell_15...,0
4,79822,C76P37ThinF_IMG_20150815_173019_cell_233.png,1


Need to see how to access the rownames of a df in pandas..

In [35]:
#now make the training and the test DFs
##sampleing labels for the training set:
training_toy=toy_df.sample(frac=0.8, replace=False, random_state=30)
##sampleing labels for the test set: 
diff=set(toy_df["0"]).difference(training_toy["0"])
where_diff =toy_df["0"].isin(diff)
#slice labels and add create df
test_toy=toy_df[where_diff]
test_toy.shape

(40, 3)

In [36]:
test_toy.head()

Unnamed: 0.1,Unnamed: 0,0,infect_status
7,42434,C168P129ThinF_IMG_20151118_155802_cell_152.png,1
11,17144,C128P89ThinF_IMG_20151004_131030_cell_152.png,1
13,4203,C153P114ThinF_IMG_20151115_135911_cell_247.png,0
15,5748,C173P134NThinF_IMG_20151130_125408_cell_67.png,0
18,9519,C116P77ThinF_IMG_20150930_171219_cell_85.png,1


In [71]:
#save all the toys as csvs:
test_toy.to_csv(path+"test_toy.csv", index=False)
training_toy.to_csv(path+"training_toy.csv", index=False)