# Data preparation

In [1]:
from pathlib import Path
import numpy as np 
import pandas as pd
import pickle
from skimage import io
from tqdm.notebook import tqdm
tqdm().pandas();
import os

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

  from pandas import Panel


## 1. Extracting paths & metadata in a dataframe

In [2]:
# Extract and store all image paths into a list
def store_image_paths(input_path = Path.cwd()/'input/breast-histopathology-images'):
    """Collect every PNG file located exactly two directory levels below ``input_path``.

    Parameters
    ----------
    input_path : str or pathlib.Path
        Root directory searched with the glob pattern ``*/*/*.png``.
        The default is resolved against the working directory at the moment
        this function is defined, not at call time.

    Returns
    -------
    list of pathlib.Path
        The matching paths in sorted order.
    """
    # Path.glob yields entries in an arbitrary, filesystem-dependent order.
    # Sorting keeps the resulting row order (and therefore any index-based
    # sampling done later) identical from one run or machine to the next.
    image_paths = sorted(Path(input_path).glob('*/*/*.png'))
    return image_paths


# Extract all metadata into a dictionary
def store_metadata(image_paths) -> dict:
    """Parse the metadata encoded in each image file name.

    File names are expected to consist of five ``_``-separated fields, e.g.
    ``10253_idx5_x1001_y1001_class0.png``.

    Parameters
    ----------
    image_paths : sized iterable of str or pathlib.Path
        Paths of the image files; ``len()`` is called on it for the progress bar.

    Returns
    -------
    dict
        Column-oriented dictionary with the keys ``'path'``, ``'patient_id'``,
        ``'x_coord'``, ``'y_coord'`` and ``'target'``. ``'path'`` holds the
        input items unchanged; every other value is a string.

    Raises
    ------
    IndexError
        If a file name has fewer than five ``_``-separated fields.
    """
    path_data = {'path':[],'patient_id':[],'x_coord':[] ,'y_coord':[],'target':[]}
    for image_path in tqdm(image_paths, total = len(image_paths)):

        # Isolate the file name whichever separator the path uses. Splitting on
        # '\\' alone only works for Windows paths; with '/' separators the parent
        # directories would leak into the first field.
        filename = str(image_path).replace('\\', '/').rsplit('/', 1)[-1]

        # split to get such a list : ['10253', 'idx5', 'x1001', 'y1001', 'class0.png']
        filename_splitted = filename.split('_')

        path_data['path'].append(image_path)
        path_data['patient_id'].append(filename_splitted[0])
        # drop the leading 'x' / 'y' letter of the coordinate fields
        path_data['x_coord'].append(filename_splitted[2][1:])
        path_data['y_coord'].append(filename_splitted[3][1:])
        # sixth character of 'classN.png' is the label digit N
        path_data['target'].append(filename_splitted[4][5])

    return path_data

In [3]:
# Gather every patch path, then parse the metadata encoded in the file names
image_paths = store_image_paths()
path_data = store_metadata(image_paths)

# Build the dataframe and cast the parsed string fields to integers in one go
df_total = pd.DataFrame(path_data).astype(
    dict.fromkeys(["patient_id", "x_coord", "y_coord", "target"], int)
)




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=277524.0), HTML(value='')))




In [4]:
# Print a summary of the dataframe: column dtypes, non-null counts and memory usage
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277524 entries, 0 to 277523
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   path        277524 non-null  object
 1   patient_id  277524 non-null  int32 
 2   x_coord     277524 non-null  int32 
 3   y_coord     277524 non-null  int32 
 4   target      277524 non-null  int32 
dtypes: int32(4), object(1)
memory usage: 6.4+ MB


## 2. Extracting patches arrays

In [5]:
def get_img_arrays(df):
    """Load every image listed in ``df['path']`` and store it flattened in ``df``.

    Two columns are added to ``df`` **in place**:

    * ``img_array``   - the 1-D array returned by ``io.imread(path).flatten()``
      (the image is read as stored; no grayscale conversion is requested).
    * ``array_shape`` - the length of that flattened array.

    ``progress_apply`` is only available after tqdm's pandas integration has
    been registered, as done in the imports cell of this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'path'`` column of image file paths.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two columns added.
    """
    # Apply on the single column that is needed instead of row-wise (axis=1):
    # a row-wise apply builds a Series for every row and cannot assign its
    # result when the frame is empty.
    df['img_array'] = df['path'].progress_apply(lambda path: io.imread(path).flatten())
    # length of each flattened array, used later to spot unexpected image sizes
    df['array_shape'] = df['img_array'].progress_apply(lambda pixels: pixels.shape[0])
    return df

# get_img_arrays adds its columns to the frame it receives and returns that same
# object, so df_img and df_total refer to one and the same dataframe from here on.
df_img = get_img_arrays(df = df_total)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=277524.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=277524.0), HTML(value='')))




## 3. Removing outliers

In [6]:
# collect every image whose flattened array length differs from 7500
weird_imgs = df_img.loc[df_img['array_shape'].ne(7500)]

# remove those rows from df_img in place, addressing them by their index labels
df_img.drop(index=weird_imgs.index, inplace=True)

## 4. Selecting sample for model training

In [7]:
# quantity of cancerous and healthy data to be used for model training
size_cancerous_sample = 10000
size_healthy_sample = 10000

# Seeded generator: without a fixed seed every run of the notebook would draw a
# different sample, making the PCA and everything downstream irreproducible.
sampling_seed = 42
sampling_rng = np.random.default_rng(sampling_seed)

# index labels of each class, drawn without replacement so no patch is picked twice
cancerous_labels = df_img.index[(df_img['target'] == 1).to_numpy()].to_numpy()
healthy_labels = df_img.index[(df_img['target'] == 0).to_numpy()].to_numpy()

cancerous_indexes = sampling_rng.choice(cancerous_labels, size=size_cancerous_sample, replace=False)
healthy_indexes = sampling_rng.choice(healthy_labels, size=size_healthy_sample, replace=False)
all_indexes = np.concatenate((cancerous_indexes, healthy_indexes))

# explicit copy: sample_df gets new columns later on and must not be tied to df_img
sample_df = df_img.loc[all_indexes,:].copy()

## 5. Performing PCA on images

In [12]:
from datetime import datetime
from sklearn.decomposition import PCA

In [13]:
# pre-allocate a float matrix with one row of 7500 values per sampled image
img_arrays = np.zeros(shape=(len(sample_df), 7500))

# copy each flattened image into its row, following the row order of sample_df
for row_number, pixels in enumerate(sample_df['img_array']):
    img_arrays[row_number] = pixels

In [14]:
# wall-clock timestamp taken right before the fit starts
init_time = datetime.now()

# A float n_components asks scikit-learn to keep as many components as are
# needed to reach that fraction (here 0.8) of explained variance.
# fit() returns the fitted estimator itself.
images_pca = PCA(n_components=0.8).fit(img_arrays)

final_time = datetime.now()

# report how long the fit took
print(final_time - init_time)

0:05:43.816271


In [15]:
# img_arrays was filled row by row from sample_df['img_array'], so it already holds
# every sampled image in sample_df order; guard against the two being out of sync.
assert len(img_arrays) == len(sample_df), "img_arrays must be rebuilt from the current sample_df"

# Project all images with a single transform() call instead of one call per row.
pca_projection = images_pca.transform(img_arrays)

# store one 1-D component vector per row (positional assignment, same row order)
sample_df['pca_array'] = list(pca_projection)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20000.0), HTML(value='')))




In [29]:
# Preview the first rows of the sample, including the new pca_array column
sample_df.head()

Unnamed: 0,path,patient_id,x_coord,y_coord,target,img_array,array_shape,pca_array
192497,C:\Users\anton\Documents\ML Projects\Breast Cl...,16554,551,1301,1,"[222, 181, 199, 225, 177, 199, 224, 165, 191, ...",7500,"[339.27909168279393, 205.07866746574393, -91.3..."
227325,C:\Users\anton\Documents\ML Projects\Breast Cl...,9077,1501,1301,1,"[228, 209, 220, 218, 199, 215, 210, 190, 212, ...",7500,"[-1639.6465021716242, 170.60474865510534, -124..."
252580,C:\Users\anton\Documents\ML Projects\Breast Cl...,9257,1601,901,1,"[204, 132, 173, 202, 137, 170, 206, 150, 183, ...",7500,"[2007.3033646687752, -161.69294358216584, 25.2..."
16216,C:\Users\anton\Documents\ML Projects\Breast Cl...,10273,1351,2151,1,"[161, 97, 159, 132, 93, 149, 220, 188, 210, 18...",7500,"[3329.5729634833674, 345.93163163454653, -81.2..."
59085,C:\Users\anton\Documents\ML Projects\Breast Cl...,12751,1651,1651,1,"[139, 110, 147, 195, 162, 186, 133, 111, 152, ...",7500,"[1661.246158370532, -159.08556159077523, -135...."


## 6. Saving data as pickle file

In [36]:
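# Saving is disabled by default; uncomment the next line to write the prepared
# sample to ./training-data.pkl (an existing file at that path is overwritten).
# sample_df.to_pickle("./training-data.pkl")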