In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et 
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Collect every XML file that sits directly under ./data_images.
# Backslashes are replaced with forward slashes so the paths look the same on
# every OS, and the result is sorted because glob() makes no promise about the
# order it returns matches in; a stable order keeps downstream row order (and
# any seeded sampling) repeatable from run to run.
replace_text = lambda x: x.replace('\\','/')
xml_list = sorted(map(replace_text, glob('./data_images/*.xml')))


In [3]:
xml_list

['./data_images/2007_000027.xml',
 './data_images/2007_000032.xml',
 './data_images/2007_000033.xml',
 './data_images/2007_000039.xml',
 './data_images/2007_000042.xml',
 './data_images/2007_000061.xml',
 './data_images/2007_000063.xml',
 './data_images/2007_000068.xml',
 './data_images/2007_000121.xml',
 './data_images/2007_000123.xml',
 './data_images/2007_000129.xml',
 './data_images/2007_000170.xml',
 './data_images/2007_000175.xml',
 './data_images/2007_000187.xml',
 './data_images/2007_000241.xml',
 './data_images/2007_000243.xml',
 './data_images/2007_000250.xml',
 './data_images/2007_000256.xml',
 './data_images/2007_000272.xml',
 './data_images/2007_000323.xml',
 './data_images/2007_000332.xml',
 './data_images/2007_000333.xml',
 './data_images/2007_000346.xml',
 './data_images/2007_000363.xml',
 './data_images/2007_000364.xml',
 './data_images/2007_000392.xml',
 './data_images/2007_000423.xml',
 './data_images/2007_000452.xml',
 './data_images/2007_000464.xml',
 './data_image

In [4]:
# Step 2: read the XML files
# From each XML file we need to extract:
# filename, size (width, height), object (name, xmin, xmax, ymin, ymax)

def extract_text(filename):
    """Parse one annotation XML file into a list of per-object rows.

    Parameters
    ----------
    filename
        Passed unchanged to ``et.parse``.

    Returns
    -------
    list of list
        One row per ``<object>`` element found directly under the root, laid
        out as ``[image_name, width, height, name, xmin, xmax, ymin, ymax]``.
        Every entry is the raw element text; no numeric conversion is done
        here. A file without ``<object>`` elements yields ``[]``.
    """
    root = et.parse(filename).getroot()

    # Image-level values, repeated at the front of every row from this file.
    size_node = root.find('size')
    image_fields = [
        root.find('filename').text,
        size_node.find('width').text,
        size_node.find('height').text,
    ]

    def object_fields(obj_node):
        # Class name followed by the box corners in the row's column order.
        box = obj_node.find('bndbox')
        corners = [box.find(tag).text for tag in ('xmin', 'xmax', 'ymin', 'ymax')]
        return [obj_node.find('name').text] + corners

    return [image_fields + object_fields(node) for node in root.findall('object')]

In [5]:
# Parse every annotation file; each call returns a list of rows (one per object).
parser_all = list(map(extract_text,xml_list))

# Flatten the per-file lists into one list of rows. A nested comprehension
# touches each row once and also copes with an empty xml_list, whereas
# reduce(lambda x, y: x+y, ...) re-copies the growing list on every step and
# raises TypeError when there is nothing to reduce.
data = [row for rows in parser_all for row in rows]
df = pd.DataFrame(data, columns = ['image_name', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])
df.head()

Unnamed: 0,image_name,width,height,name,xmin,xmax,ymin,ymax
0,2007_000027.jpg,486,500,person,174,349,101,351
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123
3,2007_000032.jpg,500,281,person,195,213,180,229
4,2007_000032.jpg,500,281,person,26,44,189,238


In [6]:
df.shape

(40138, 8)

In [7]:
df['name'].value_counts()

name
person         17401
chair           3056
car             2492
dog             1598
bottle          1561
cat             1277
bird            1271
pottedplant     1202
sheep           1084
boat            1059
aeroplane       1002
tvmonitor        893
sofa             841
bicycle          837
horse            803
motorbike        801
diningtable      800
cow              771
train            704
bus              685
Name: count, dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40138 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   image_name  40138 non-null  object
 1   width       40138 non-null  object
 2   height      40138 non-null  object
 3   name        40138 non-null  object
 4   xmin        40138 non-null  object
 5   xmax        40138 non-null  object
 6   ymin        40138 non-null  object
 7   ymax        40138 non-null  object
dtypes: object(8)
memory usage: 2.4+ MB


In [9]:
# Type conversion: the size and box columns arrive as text and are needed as
# integers. The cast goes text -> float -> int (presumably so that values
# written with a decimal point still convert -- confirm against the data).
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(float).astype(int)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40138 entries, 0 to 40137
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   image_name  40138 non-null  object
 1   width       40138 non-null  int64 
 2   height      40138 non-null  int64 
 3   name        40138 non-null  object
 4   xmin        40138 non-null  int64 
 5   xmax        40138 non-null  int64 
 6   ymin        40138 non-null  int64 
 7   ymax        40138 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 2.4+ MB


In [10]:
# Derive box centre (center_x, center_y) and box size (w, h), each divided by
# the image width / height so the values are fractions of the image size.

df['center_x'] = df['xmin'].add(df['xmax']).div(2).div(df['width'])
df['center_y'] = df['ymin'].add(df['ymax']).div(2).div(df['height'])
df['w'] = df['xmax'].sub(df['xmin']).div(df['width'])
df['h'] = df['ymax'].sub(df['ymin']).div(df['height'])

In [11]:
df.head()

Unnamed: 0,image_name,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377


In [12]:
# Split data into train and test set.
# Start from the distinct image names and show how many there are.

images = df['image_name'].unique()
len(images)

17125

In [13]:
# 80% train / 20% test, drawn over image names.

img_df = pd.DataFrame(images, columns=['image_name'])
# A fixed random_state makes the draw repeatable after a kernel restart;
# without it every run produces a different split.
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['image_name'])
# Everything not drawn for training goes to test. A boolean isin() mask is
# used instead of interpolating the whole training tuple into a query string.
img_test = tuple(img_df.loc[~img_df['image_name'].isin(img_train), 'image_name'])
len(img_train), len(img_test)

(13700, 3425)

In [14]:
# Select the annotation rows whose image belongs to each subset.
# isin() avoids embedding thousands of file names in a query string (which
# also breaks for an empty tuple), and .copy() yields independent frames so
# columns can be added to them later without writing into a slice of df.
train_df = df[df['image_name'].isin(img_train)].copy()
test_df = df[df['image_name'].isin(img_test)].copy()

In [15]:
train_df.head()

Unnamed: 0,image_name,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377
5,2007_000033.jpg,500,366,aeroplane,9,499,107,263,0.508,0.505464,0.98,0.42623


In [16]:
test_df.head()

Unnamed: 0,image_name,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,2007_000027.jpg,486,500,person,174,349,101,351,0.538066,0.452,0.360082,0.5
13,2007_000063.jpg,500,375,dog,123,379,115,275,0.502,0.52,0.512,0.426667
14,2007_000063.jpg,500,375,chair,75,428,1,375,0.503,0.501333,0.706,0.997333
19,2007_000129.jpg,334,500,bicycle,70,255,202,500,0.486527,0.702,0.553892,0.596
20,2007_000129.jpg,334,500,bicycle,251,334,242,500,0.875749,0.742,0.248503,0.516


In [17]:
# label encoding

# Class name -> integer id. Defined once at module level so the mapping is not
# rebuilt on every label_encoding() call (it is applied row by row).
LABEL_TO_ID = {'person': 0, 'car': 1, 'chair': 2, 'bottle': 3, 'pottedplant': 4, 'bird': 5, 'dog': 6, 'sofa': 7,
               'bicycle': 8, 'horse': 9, 'boat': 10, 'motorbike': 11, 'cat': 12, 'tvmonitor': 13,
               'cow': 14, 'sheep':15, 'aeroplane': 16, 'train': 17, 'diningtable': 18,'bus': 19}


def label_encoding(x):
    """Return the integer class id for the class name ``x``.

    Raises
    ------
    KeyError
        If ``x`` is not one of the class names in ``LABEL_TO_ID``.
    """
    return LABEL_TO_ID[x]

In [18]:
# Add the integer class id for every annotation row in both subsets.
train_df['id'] = train_df['name'].map(label_encoding)
test_df['id'] = test_df['name'].map(label_encoding)

In [19]:
train_df.head(10)

Unnamed: 0,image_name,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
1,2007_000032.jpg,500,281,aeroplane,104,375,78,183,0.479,0.464413,0.542,0.373665,16
2,2007_000032.jpg,500,281,aeroplane,133,197,88,123,0.33,0.375445,0.128,0.124555,16
3,2007_000032.jpg,500,281,person,195,213,180,229,0.408,0.727758,0.036,0.174377,0
4,2007_000032.jpg,500,281,person,26,44,189,238,0.07,0.759786,0.036,0.174377,0
5,2007_000033.jpg,500,366,aeroplane,9,499,107,263,0.508,0.505464,0.98,0.42623,16
6,2007_000033.jpg,500,366,aeroplane,421,482,200,226,0.903,0.581967,0.122,0.071038,16
7,2007_000033.jpg,500,366,aeroplane,325,411,188,223,0.736,0.561475,0.172,0.095628,16
8,2007_000039.jpg,500,375,tvmonitor,156,344,89,279,0.5,0.490667,0.376,0.506667,13
9,2007_000042.jpg,500,335,train,263,500,32,295,0.763,0.48806,0.474,0.785075,17
10,2007_000042.jpg,500,335,train,1,235,36,299,0.236,0.5,0.468,0.785075,17


In [20]:
# save Image and Labels in text

import os
from shutil import move

In [21]:
# Destination folders for the two subsets.
train_folder = 'data_images/train'
test_folder = 'data_images/test'

# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# as soon as the folders are left over from an earlier run.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [22]:
# Keep only the columns written to the label files (plus the grouping key)
# and group each subset by image.
cols = ['image_name', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train, groupby_obj_test = (
    frame.loc[:, cols].groupby('image_name') for frame in (train_df, test_df)
)

In [23]:
#save each image in train/test folder and respective labels in .txt

def save_data(image_name, folder_path, groupby_obj):
    """Move one image into ``folder_path`` and write its label file beside it.

    Parameters
    ----------
    image_name : str
        File name of the image. It is looked up inside ``data_images`` and is
        also the key passed to ``groupby_obj.get_group``.
    folder_path : str
        Destination directory for the image and the ``.txt`` file.
    groupby_obj
        Object whose ``get_group(image_name)`` returns the label rows,
        including an ``image_name`` column that is dropped before writing.

    Returns
    -------
    None

    Notes
    -----
    The label file has the image's base name with a ``.txt`` extension and
    holds one space-separated row per label, without header or index.
    """
    src = os.path.join('data_images', image_name)
    dst = os.path.join(folder_path, image_name)
    # Skip the move only when it has evidently been done already (source gone,
    # destination present) so the notebook can be re-run. In every other case
    # move() is called and fails loudly if the image is missing.
    if os.path.exists(src) or not os.path.exists(dst):
        move(src, dst)

    # Save the labels: every column except the image name.
    text_filename = os.path.join(folder_path, os.path.splitext(image_name)[0]+'.txt')
    labels = groupby_obj.get_group(image_name).drop(columns='image_name')
    labels.to_csv(text_filename, sep=' ', index=False, header=False)
    

In [24]:
# One entry per training image: the group keys of the train groupby.
filename_series = pd.Series(list(groupby_obj_train.groups))

In [25]:
# Run save_data once per training image; the extra arguments are forwarded by
# keyword. save_data returns None, so the displayed result is a Series of None.
filename_series.apply(save_data, folder_path=train_folder, groupby_obj=groupby_obj_train)

0        None
1        None
2        None
3        None
4        None
         ... 
13695    None
13696    None
13697    None
13698    None
13699    None
Length: 13700, dtype: object

In [26]:
# Same procedure for the test subset: collect the group keys, then run
# save_data once per image with the test folder and test groupby.
filename_series_test = pd.Series(list(groupby_obj_test.groups))
filename_series_test.apply(save_data, folder_path=test_folder, groupby_obj=groupby_obj_test)

0       None
1       None
2       None
3       None
4       None
        ... 
3420    None
3421    None
3422    None
3423    None
3424    None
Length: 3425, dtype: object