### Imports

In [2]:
import os, sys, random, shutil
import xml.etree.ElementTree as ET
from glob import glob
import pandas as pd
from shutil import copyfile
import pandas as pd
from sklearn import preprocessing, model_selection
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import patches
import numpy as np

### Preprocessing

In [5]:
annotations = sorted(glob('./data/whitebloodcells/dataset-master/dataset-master/Annotations/*.xml'))

In [24]:
df = []
cnt = 0
for file in annotations:
    prev_filename = file.split('\\')[-1].split('.')[0] + '.jpg'
    filename = str(cnt) + '.jpg'
    row = []
    parsedXML = ET.parse(file)
    
    for node in parsedXML.getroot().iter('object'):
        blood_cells = node.find('name').text
        xmin = int(node.find('bndbox/xmin').text)
        xmax = int(node.find('bndbox/xmax').text)
        ymin = int(node.find('bndbox/ymin').text)
        ymax = int(node.find('bndbox/ymax').text)

        row = [prev_filename, filename, blood_cells, xmin, xmax, ymin, ymax]
        df.append(row)
    cnt += 1


#### Save to df and csv

In [25]:
data = pd.DataFrame(df, columns=['prev_filename', 'filename', 'cell_type', 'xmin', 'xmax', 'ymin', 'ymax'])
data[['prev_filename','filename', 'cell_type', 'xmin', 'xmax', 'ymin', 'ymax']].to_csv('data\whitebloodcells\dataset-master\dataset-master\\blood_cell_detection.csv', index=False)
data.head(10)

Unnamed: 0,prev_filename,filename,cell_type,xmin,xmax,ymin,ymax
0,BloodImage_00000.jpg,0.jpg,RBC,216,316,359,464
1,BloodImage_00000.jpg,0.jpg,RBC,77,177,326,431
2,BloodImage_00000.jpg,0.jpg,RBC,540,640,353,458
3,BloodImage_00000.jpg,0.jpg,RBC,405,513,350,457
4,BloodImage_00000.jpg,0.jpg,RBC,160,245,72,177
5,BloodImage_00000.jpg,0.jpg,RBC,5,90,335,440
6,BloodImage_00000.jpg,0.jpg,RBC,540,640,39,149
7,BloodImage_00000.jpg,0.jpg,RBC,383,504,1,113
8,BloodImage_00000.jpg,0.jpg,RBC,9,108,82,168
9,BloodImage_00000.jpg,0.jpg,RBC,68,165,212,346


#### Transform to required dataframe structure

In [26]:
img_width = 640
img_height = 480

def width(df):
    return int(df.xmax - df.xmin)
def height(df):
    return int(df.ymax - df.ymin)
def x_center(df):
    return int(df.xmin + (df.width/2))
def y_center(df):
    return int(df.ymin + (df.height/2))
def w_norm(df):
    return df/img_width
def h_norm(df):
    return df/img_height

df = pd.read_csv('data\whitebloodcells\dataset-master\dataset-master\\blood_cell_detection.csv')

le = preprocessing.LabelEncoder()
le.fit(df['cell_type'])
print(le.classes_)
labels = le.transform(df['cell_type'])
df['labels'] = labels

df['width'] = df.apply(width, axis=1)
df['height'] = df.apply(height, axis=1)

df['x_center'] = df.apply(x_center, axis=1)
df['y_center'] = df.apply(y_center, axis=1)

df['x_center_norm'] = df['x_center'].apply(w_norm)
df['width_norm'] = df['width'].apply(w_norm)

df['y_center_norm'] = df['y_center'].apply(h_norm)
df['height_norm'] = df['height'].apply(h_norm)

df.head(15)

['RBC']


Unnamed: 0,prev_filename,filename,cell_type,xmin,xmax,ymin,ymax,labels,width,height,x_center,y_center,x_center_norm,width_norm,y_center_norm,height_norm
0,BloodImage_00000.jpg,0.jpg,RBC,216,316,359,464,0,100,105,266,411,0.415625,0.15625,0.85625,0.21875
1,BloodImage_00000.jpg,0.jpg,RBC,77,177,326,431,0,100,105,127,378,0.198437,0.15625,0.7875,0.21875
2,BloodImage_00000.jpg,0.jpg,RBC,540,640,353,458,0,100,105,590,405,0.921875,0.15625,0.84375,0.21875
3,BloodImage_00000.jpg,0.jpg,RBC,405,513,350,457,0,108,107,459,403,0.717187,0.16875,0.839583,0.222917
4,BloodImage_00000.jpg,0.jpg,RBC,160,245,72,177,0,85,105,202,124,0.315625,0.132812,0.258333,0.21875
5,BloodImage_00000.jpg,0.jpg,RBC,5,90,335,440,0,85,105,47,387,0.073438,0.132812,0.80625,0.21875
6,BloodImage_00000.jpg,0.jpg,RBC,540,640,39,149,0,100,110,590,94,0.921875,0.15625,0.195833,0.229167
7,BloodImage_00000.jpg,0.jpg,RBC,383,504,1,113,0,121,112,443,57,0.692187,0.189062,0.11875,0.233333
8,BloodImage_00000.jpg,0.jpg,RBC,9,108,82,168,0,99,86,58,125,0.090625,0.154688,0.260417,0.179167
9,BloodImage_00000.jpg,0.jpg,RBC,68,165,212,346,0,97,134,116,279,0.18125,0.151562,0.58125,0.279167


#### Save labels to txt and split data into train and test

In [62]:
df_train, df_valid = model_selection.train_test_split(df, test_size=0.1, random_state=13, shuffle=True)
print(df_train.shape, df_valid.shape)

os.mkdir('data/whitebloodcells/dataset-master/dataset-master/images/')
os.mkdir('data/whitebloodcells/dataset-master/dataset-master/images/train/')
os.mkdir('data/whitebloodcells/dataset-master/dataset-master/images/valid/')

os.mkdir('data/whitebloodcells/dataset-master/dataset-master/labels/')
os.mkdir('data/whitebloodcells/dataset-master/dataset-master/labels/train/')
os.mkdir('data/whitebloodcells/dataset-master/dataset-master/labels/valid/')

def segregate_data(df, img_path, label_path, train_img_path, train_label_path):
    filenames = []
    for filename in df.filename:
        filenames.append(filename)
    filenames = set(filenames)
  
    for filename in filenames:
        yolo_list = []

        for _,row in df[df.filename == filename].iterrows():
            yolo_list.append([row.labels, row.x_center_norm, row.y_center_norm, row.width_norm, row.height_norm])

        yolo_list = np.array(yolo_list)
        txt_filename = os.path.join(train_label_path,str(row.prev_filename.split('.')[0])+".txt")
        # Save the .img & .txt files to the corresponding train and validation folders
        np.savetxt(txt_filename, yolo_list, fmt=["%d", "%f", "%f", "%f", "%f"])
        shutil.copyfile(os.path.join(img_path,row.prev_filename), os.path.join(train_img_path,row.prev_filename))

src_img_path = "data/whitebloodcells/dataset-master/dataset-master/JPEGImages/"
src_label_path = "data/whitebloodcells/dataset-master/dataset-master/Annotations/"

train_img_path = "data/whitebloodcells/dataset-master/dataset-master/images/train"
train_label_path = "data/whitebloodcells/dataset-master/dataset-master/labels/train"

valid_img_path = "data/whitebloodcells/dataset-master/dataset-master/images/valid"
valid_label_path = 'data/whitebloodcells/dataset-master/dataset-master/labels/valid'

segregate_data(df_train, src_img_path, src_label_path, train_img_path, train_label_path)
segregate_data(df_valid, src_img_path, src_label_path, valid_img_path, valid_label_path)

print("No. of Training images", len(os.listdir('data/whitebloodcells/dataset-master/dataset-master/images/train/')))
print("No. of Training labels", len(os.listdir('data/whitebloodcells/dataset-master/dataset-master/labels/train/')))

print("No. of valid images", len(os.listdir('data/whitebloodcells/dataset-master/dataset-master/images/valid/')))
print("No. of valid labels", len(os.listdir('data/whitebloodcells/dataset-master/dataset-master/labels/valid/')))

(3468, 16) (386, 16)
No. of Training images 343
No. of Training labels 343
No. of valid images 234
No. of valid labels 234
