In [53]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [54]:
import warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the session, including pandas warnings about assigning to a slice of a
# DataFrame; consider narrowing the filter to the category actually intended.
warnings.filterwarnings('ignore')

In [55]:
# step-1: get path of each xml file
def replace_text(x):
    """Return ``x`` with every backslash replaced by a forward slash."""
    return x.replace('\\', '/')


# glob returns paths in whatever order the file system yields them, so sort
# the normalised paths: the row order of everything built from this list is
# then the same on every machine
xmlfiles = sorted(replace_text(path) for path in glob('./data_images/*.xml'))

In [56]:
xmlfiles

['./data_images/basmati (1).xml',
 './data_images/basmati (10).xml',
 './data_images/basmati (100).xml',
 './data_images/basmati (101).xml',
 './data_images/basmati (102).xml',
 './data_images/basmati (103).xml',
 './data_images/basmati (104).xml',
 './data_images/basmati (105).xml',
 './data_images/basmati (106).xml',
 './data_images/basmati (107).xml',
 './data_images/basmati (108).xml',
 './data_images/basmati (109).xml',
 './data_images/basmati (11).xml',
 './data_images/basmati (110).xml',
 './data_images/basmati (111).xml',
 './data_images/basmati (112).xml',
 './data_images/basmati (113).xml',
 './data_images/basmati (114).xml',
 './data_images/basmati (115).xml',
 './data_images/basmati (116).xml',
 './data_images/basmati (117).xml',
 './data_images/basmati (118).xml',
 './data_images/basmati (119).xml',
 './data_images/basmati (12).xml',
 './data_images/basmati (120).xml',
 './data_images/basmati (121).xml',
 './data_images/basmati (122).xml',
 './data_images/basmati (123).xml

In [57]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    """Read one XML annotation file and return one row per ``<object>``.

    Parameters
    ----------
    filename : str
        Path of the XML file to parse.

    Returns
    -------
    list of list
        One ``[image_name, width, height, name, xmin, xmax, ymin, ymax]``
        entry for every ``<object>`` element. Values are the element texts
        exactly as stored in the XML (no numeric conversion). The list is
        empty when the file contains no ``<object>`` element.

    Raises
    ------
    AttributeError
        If one of the expected elements (``filename``, ``size``, ``width``,
        ``height``, ``name``, ``bndbox`` or a box corner) is missing.
    """
    root = et.parse(filename).getroot()

    # image-level fields are shared by every object in the file
    image_name = root.find('filename').text
    size = root.find('size')
    width = size.find('width').text
    height = size.find('height').text

    rows = []
    for obj in root.findall('object'):
        name = obj.find('name').text
        box = obj.find('bndbox')
        # column order is xmin, xmax, ymin, ymax (not the order used in the XML)
        corners = [box.find(tag).text for tag in ('xmin', 'xmax', 'ymin', 'ymax')]
        rows.append([image_name, width, height, name] + corners)

    return rows

In [58]:
# parse every annotation file; each entry is the list of rows for one file
parser_all = [extract_text(xml_path) for xml_path in xmlfiles]

In [59]:
# flatten the per-file row lists into a single list of rows; unlike
# reduce(lambda x, y: x + y, ...) this does not re-copy the accumulated list
# on every step and it yields [] instead of raising when parser_all is empty
data = [row for rows in parser_all for row in rows]

In [60]:
# one row per annotated object; column order matches the rows built by extract_text
df = pd.DataFrame(
    data,
    columns=['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'],
)

In [61]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,basmati (1).png,300,300,Basmati,123,163,121,183
1,basmati (10).png,300,300,Basmati,135,177,113,171
2,basmati (100).png,300,300,Basmati,134,191,133,182
3,basmati (101).png,300,300,Basmati,116,164,142,204
4,basmati (102).png,300,300,Basmati,117,164,142,203


In [62]:
df.shape

(3116, 8)

In [63]:
df['name'].value_counts()

Nazirshail    634
Paijam        630
Jirashail     625
Chinigura     624
Basmati       603
Name: name, dtype: int64

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116 entries, 0 to 3115
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  3116 non-null   object
 1   width     3116 non-null   object
 2   height    3116 non-null   object
 3   name      3116 non-null   object
 4   xmin      3116 non-null   object
 5   xmax      3116 non-null   object
 6   ymin      3116 non-null   object
 7   ymax      3116 non-null   object
dtypes: object(8)
memory usage: 194.9+ KB


In [65]:
# type conversion
# the image-size and box columns hold the text read from the XML files; cast
# them to integers so the arithmetic in the following cells works on numbers
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()  # show the dtypes after the cast

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116 entries, 0 to 3115
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  3116 non-null   object
 1   width     3116 non-null   int32 
 2   height    3116 non-null   int32 
 3   name      3116 non-null   object
 4   xmin      3116 non-null   int32 
 5   xmax      3116 non-null   int32 
 6   ymin      3116 non-null   int32 
 7   ymax      3116 non-null   int32 
dtypes: int32(6), object(2)
memory usage: 121.8+ KB


In [66]:
# box centre: midpoint of the two edges, expressed as a fraction of the image size
img_w, img_h = df['width'], df['height']
df['center_x'] = (df['xmin'] + df['xmax']) / 2 / img_w
df['center_y'] = (df['ymin'] + df['ymax']) / 2 / img_h
# box width and height, also as a fraction of the image size
df['w'] = (df['xmax'] - df['xmin']) / img_w
df['h'] = (df['ymax'] - df['ymin']) / img_h

In [67]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,basmati (1).png,300,300,Basmati,123,163,121,183,0.476667,0.506667,0.133333,0.206667
1,basmati (10).png,300,300,Basmati,135,177,113,171,0.52,0.473333,0.14,0.193333
2,basmati (100).png,300,300,Basmati,134,191,133,182,0.541667,0.525,0.19,0.163333
3,basmati (101).png,300,300,Basmati,116,164,142,204,0.466667,0.576667,0.16,0.206667
4,basmati (102).png,300,300,Basmati,117,164,142,203,0.468333,0.575,0.156667,0.203333


# split data into train and test

In [69]:
# distinct image file names, in order of first appearance
filenames = df['filename']
images = filenames.unique()

In [70]:
len(images)

3116

In [71]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=['filename'])
# shuffle and pick 80% of images; the fixed random_state makes the same images
# land in the training set on every run
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['filename'])

In [72]:
# take the remaining 20% of images: every file name that was not sampled for
# training. A boolean isin() mask avoids pasting the whole training tuple into
# a query string that pandas would then have to parse.
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train), 'filename'])

In [73]:
len(img_train), len(img_test)

(2493, 623)

In [74]:
# split the annotation rows by the image set their file belongs to; isin()
# masks replace query strings built from the repr of the full filename tuples
train_df = df[df['filename'].isin(img_train)]
test_df = df[df['filename'].isin(img_test)]

In [75]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
1,basmati (10).png,300,300,Basmati,135,177,113,171,0.52,0.473333,0.14,0.193333
2,basmati (100).png,300,300,Basmati,134,191,133,182,0.541667,0.525,0.19,0.163333
3,basmati (101).png,300,300,Basmati,116,164,142,204,0.466667,0.576667,0.16,0.206667
4,basmati (102).png,300,300,Basmati,117,164,142,203,0.468333,0.575,0.156667,0.203333
5,basmati (103).png,300,300,Basmati,117,189,140,169,0.51,0.515,0.24,0.096667


In [76]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,basmati (1).png,300,300,Basmati,123,163,121,183,0.476667,0.506667,0.133333,0.206667
10,basmati (108).png,300,300,Basmati,146,175,117,191,0.535,0.513333,0.096667,0.246667
30,basmati (126).png,300,300,Basmati,121,189,150,186,0.516667,0.56,0.226667,0.12
39,basmati (134).png,300,300,Basmati,126,196,154,180,0.536667,0.556667,0.233333,0.086667
41,basmati (136).png,300,300,Basmati,146,179,136,200,0.541667,0.56,0.11,0.213333


# Assign an id number to each object name


In [77]:
# label encoding
def label_encoding(x):
    """Return the integer class id for the object name ``x``.

    The ids follow the position of the name in ``class_names``
    (``'Nazirshail'`` -> 0 ... ``'Basmati'`` -> 4). A name that is not listed
    raises ``KeyError``.
    """
    class_names = ('Nazirshail', 'Paijam', 'Jirashail', 'Chinigura', 'Basmati')
    name_to_id = {name: idx for idx, name in enumerate(class_names)}
    return name_to_id[x]

In [78]:
# add the numeric class id as a new last column. assign() returns a new frame,
# so nothing is written into a frame that was sliced out of df (which is what
# triggers pandas' SettingWithCopyWarning) and the cell can be re-run safely.
train_df = train_df.assign(id=train_df['name'].apply(label_encoding))
test_df = test_df.assign(id=test_df['name'].apply(label_encoding))

In [79]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
1,basmati (10).png,300,300,Basmati,135,177,113,171,0.52,0.473333,0.14,0.193333,4
2,basmati (100).png,300,300,Basmati,134,191,133,182,0.541667,0.525,0.19,0.163333,4
3,basmati (101).png,300,300,Basmati,116,164,142,204,0.466667,0.576667,0.16,0.206667,4
4,basmati (102).png,300,300,Basmati,117,164,142,203,0.468333,0.575,0.156667,0.203333,4
5,basmati (103).png,300,300,Basmati,117,189,140,169,0.51,0.515,0.24,0.096667,4
6,basmati (104).png,300,300,Basmati,120,188,140,173,0.513333,0.521667,0.226667,0.11,4
7,basmati (105).png,300,300,Basmati,125,188,131,181,0.521667,0.52,0.21,0.166667,4
8,basmati (106).png,300,300,Basmati,130,184,130,182,0.523333,0.52,0.18,0.173333,4
9,basmati (107).png,300,300,Basmati,149,176,120,190,0.541667,0.516667,0.09,0.233333,4
11,basmati (109).png,300,300,Basmati,138,213,146,171,0.585,0.528333,0.25,0.083333,4


# Save images and labels as text files

In [80]:
import os
from shutil import move

In [81]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

# exist_ok=True lets this cell be re-run without raising FileExistsError when
# the folders were already created by an earlier run
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [82]:
# keep the file name plus the label columns, then group the rows per image
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train, groupby_obj_test = (
    frame[cols].groupby('filename') for frame in (train_df, test_df)
)

In [84]:
# save each image in the train/test folder and its respective labels in a .txt file
def save_data(filename, folder_path, group_obj, src_folder='data_images'):
    """Move one image into ``folder_path`` and write its label file beside it.

    Parameters
    ----------
    filename : str
        Image file name; also the key looked up in ``group_obj``.
    folder_path : str
        Destination directory for the image and its ``.txt`` label file.
    group_obj : pandas GroupBy
        Rows grouped by the ``'filename'`` column; the group for ``filename``
        supplies the label rows.
    src_folder : str, optional
        Directory the image is moved from. Defaults to ``'data_images'``,
        the location that was previously hard-coded.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``group_obj`` has no group named ``filename``. Note that the image
        has already been moved at that point.
    """
    # move image to the destination folder
    src = os.path.join(src_folder, filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst)

    # the label file shares the image's base name, with a .txt extension
    stem = os.path.splitext(filename)[0]
    text_filename = os.path.join(folder_path, stem + '.txt')

    # set_index() takes 'filename' out of the columns and index=False then
    # leaves it out of the file, so each line holds only the remaining
    # columns, space separated and without a header row
    labels = group_obj.get_group(filename).set_index('filename')
    labels.to_csv(text_filename, sep=' ', index=False, header=False)

In [85]:
# one entry per training image: the group keys are the image file names
filename_series = pd.Series(list(groupby_obj_train.groups))

In [86]:
# move every training image and write its label file; save_data returns None,
# so the displayed Series only shows that each call completed
filename_series.apply(lambda fname: save_data(fname, train_folder, groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
2488    None
2489    None
2490    None
2491    None
2492    None
Length: 2493, dtype: object

In [87]:
# move every test image and write its label file into the test folder
filename_series_test = pd.Series(list(groupby_obj_test.groups))
filename_series_test.apply(lambda fname: save_data(fname, test_folder, groupby_obj_test))

0      None
1      None
2      None
3      None
4      None
       ... 
618    None
619    None
620    None
621    None
622    None
Length: 623, dtype: object