In [1]:
import os
import pandas as pd
from glob import glob
from functools import reduce
from xml.etree import ElementTree as et


In [2]:
# step-1: get path of each xml file
# Load all xml file and store in a list
xml_list = glob('./data_images/*.xml')

In [3]:
# repalce \\ with /
# xml_list = list(map(lambda x : x.replace('\\','/'),xml_list))
xml_list = [x.replace('\\','/') for x in xml_list]

In [4]:
xml_list

['./data_images/2007_000027.xml',
 './data_images/2007_000032.xml',
 './data_images/2007_000033.xml',
 './data_images/2007_000039.xml',
 './data_images/2007_000042.xml',
 './data_images/2007_000061.xml',
 './data_images/2007_000063.xml',
 './data_images/2007_000068.xml',
 './data_images/2007_000121.xml',
 './data_images/2007_000123.xml',
 './data_images/2007_000129.xml',
 './data_images/2007_000170.xml',
 './data_images/2007_000175.xml',
 './data_images/2007_000187.xml',
 './data_images/2007_000241.xml',
 './data_images/2007_000243.xml',
 './data_images/2007_000250.xml',
 './data_images/2007_000256.xml',
 './data_images/2007_000272.xml',
 './data_images/2007_000323.xml',
 './data_images/2007_000332.xml',
 './data_images/2007_000333.xml',
 './data_images/2007_000346.xml',
 './data_images/2007_000363.xml',
 './data_images/2007_000364.xml',
 './data_images/2007_000392.xml',
 './data_images/2007_000423.xml',
 './data_images/2007_000452.xml',
 './data_images/2007_000464.xml',
 './data_image

In [5]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_info(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # get filename
    image_name = root.find('filename').text

    # extract height, width
    height = root.find('size').find('height').text
    width = root.find('size').find('width').text
    # store all info
    parser = []
    # extract all object in an image
    objs = root.findall('object')
    # extract xmin, ymin, xmax, ymax for each object in an image
    for i in objs:
        name = i.find('name').text
        x_min = i.find('bndbox').find('xmin').text
        y_min = i.find('bndbox').find('ymin').text
        x_max = i.find('bndbox').find('xmax').text
        y_max = i.find('bndbox').find('ymax').text
        parser.append([image_name, height, width, name, x_min, y_min, x_max, y_max])
    return parser


In [6]:
parser_all = list((extract_info(x) for x in xml_list))

In [7]:
parser_all

[[['2007_000027.jpg', '500', '486', 'person', '174', '101', '349', '351']],
 [['2007_000032.jpg', '281', '500', 'aeroplane', '104', '78', '375', '183'],
  ['2007_000032.jpg', '281', '500', 'aeroplane', '133', '88', '197', '123'],
  ['2007_000032.jpg', '281', '500', 'person', '195', '180', '213', '229'],
  ['2007_000032.jpg', '281', '500', 'person', '26', '189', '44', '238']],
 [['2007_000033.jpg', '366', '500', 'aeroplane', '9', '107', '499', '263'],
  ['2007_000033.jpg', '366', '500', 'aeroplane', '421', '200', '482', '226'],
  ['2007_000033.jpg', '366', '500', 'aeroplane', '325', '188', '411', '223']],
 [['2007_000039.jpg', '375', '500', 'tvmonitor', '156', '89', '344', '279']],
 [['2007_000042.jpg', '335', '500', 'train', '263', '32', '500', '295'],
  ['2007_000042.jpg', '335', '500', 'train', '1', '36', '235', '299']],
 [['2007_000061.jpg', '333', '500', 'boat', '274', '11', '437', '279'],
  ['2007_000061.jpg', '333', '500', 'boat', '184', '214', '281', '252']],
 [['2007_000063.jpg

In [8]:
# reduce dimension of parser_all to 2
data = reduce(lambda x, y: x+y, parser_all)

In [9]:
data

[['2007_000027.jpg', '500', '486', 'person', '174', '101', '349', '351'],
 ['2007_000032.jpg', '281', '500', 'aeroplane', '104', '78', '375', '183'],
 ['2007_000032.jpg', '281', '500', 'aeroplane', '133', '88', '197', '123'],
 ['2007_000032.jpg', '281', '500', 'person', '195', '180', '213', '229'],
 ['2007_000032.jpg', '281', '500', 'person', '26', '189', '44', '238'],
 ['2007_000033.jpg', '366', '500', 'aeroplane', '9', '107', '499', '263'],
 ['2007_000033.jpg', '366', '500', 'aeroplane', '421', '200', '482', '226'],
 ['2007_000033.jpg', '366', '500', 'aeroplane', '325', '188', '411', '223'],
 ['2007_000039.jpg', '375', '500', 'tvmonitor', '156', '89', '344', '279'],
 ['2007_000042.jpg', '335', '500', 'train', '263', '32', '500', '295'],
 ['2007_000042.jpg', '335', '500', 'train', '1', '36', '235', '299'],
 ['2007_000061.jpg', '333', '500', 'boat', '274', '11', '437', '279'],
 ['2007_000061.jpg', '333', '500', 'boat', '184', '214', '281', '252'],
 ['2007_000063.jpg', '375', '500', 'do

In [10]:
df = pd.DataFrame(data, columns=['filename', 'height', 'width', 'name', 'x_min', 'y_min', 'x_max', 'y_max'])

In [11]:
df.sample()

Unnamed: 0,filename,height,width,name,x_min,y_min,x_max,y_max
3914,2008_001451.jpg,375,500,person,203.9671,153.8174,226.8713,306.512


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14689 entries, 0 to 14688
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  14689 non-null  object
 1   height    14689 non-null  object
 2   width     14689 non-null  object
 3   name      14689 non-null  object
 4   x_min     14689 non-null  object
 5   y_min     14689 non-null  object
 6   x_max     14689 non-null  object
 7   y_max     14689 non-null  object
dtypes: object(8)
memory usage: 918.2+ KB


In [13]:
#  datatype conversion
cols = ['height','width', 'x_min', 'y_min', 'x_max', 'y_max']
df[cols] = df[cols].astype(float)
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14689 entries, 0 to 14688
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  14689 non-null  object
 1   height    14689 non-null  int32 
 2   width     14689 non-null  int32 
 3   name      14689 non-null  object
 4   x_min     14689 non-null  int32 
 5   y_min     14689 non-null  int32 
 6   x_max     14689 non-null  int32 
 7   y_max     14689 non-null  int32 
dtypes: int32(6), object(2)
memory usage: 573.9+ KB


In [14]:
df['center_x'] = ((df['x_min'] + df['x_max']) / 2) / df['width']
df['center_y'] = ((df['y_min'] + df['y_max']) / 2) / df['height']
df['w'] = (df['x_max'] - df['x_min']) / df['width']
df['h'] = (df['y_max'] - df['y_min']) / df['height']

In [15]:
df.head()

Unnamed: 0,filename,height,width,name,x_min,y_min,x_max,y_max,center_x,center_y,w,h
0,2007_000027.jpg,500,486,person,174,101,349,351,0.538066,0.452,0.360082,0.5
1,2007_000032.jpg,281,500,aeroplane,104,78,375,183,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,281,500,aeroplane,133,88,197,123,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,281,500,person,195,180,213,229,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,281,500,person,26,189,44,238,0.07,0.759786,0.036,0.174377


### Spit data to train and test


In [16]:
images = df['filename'].unique()
len(images)

5096

In [17]:
img_df = pd.DataFrame(images, columns=['filename'])
img_df.head()

Unnamed: 0,filename
0,2007_000027.jpg
1,2007_000032.jpg
2,2007_000033.jpg
3,2007_000039.jpg
4,2007_000042.jpg


In [18]:
#  80% train. 20% test
img_train = tuple(img_df['filename'].sample(frac=0.8)) # pick randomly 80% to create a train set
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])  # take the rest to create test set

In [19]:
len(img_test), len(img_train)

(1019, 4077)

In [20]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [21]:
train_df.shape, test_df.shape

((11896, 12), (2793, 12))

In [22]:
train_df

Unnamed: 0,filename,height,width,name,x_min,y_min,x_max,y_max,center_x,center_y,w,h
0,2007_000027.jpg,500,486,person,174,101,349,351,0.538066,0.452000,0.360082,0.500000
1,2007_000032.jpg,281,500,aeroplane,104,78,375,183,0.479000,0.464413,0.542000,0.373665
2,2007_000032.jpg,281,500,aeroplane,133,88,197,123,0.330000,0.375445,0.128000,0.124555
3,2007_000032.jpg,281,500,person,195,180,213,229,0.408000,0.727758,0.036000,0.174377
4,2007_000032.jpg,281,500,person,26,189,44,238,0.070000,0.759786,0.036000,0.174377
...,...,...,...,...,...,...,...,...,...,...,...,...
14677,2008_008767.jpg,374,500,train,0,29,149,181,0.149000,0.280749,0.298000,0.406417
14678,2008_008770.jpg,500,333,train,0,0,333,500,0.500000,0.500000,1.000000,1.000000
14679,2008_008770.jpg,500,333,pottedplant,252,79,266,96,0.777778,0.175000,0.042042,0.034000
14680,2008_008770.jpg,500,333,pottedplant,229,72,247,90,0.714715,0.162000,0.054054,0.036000


### Convert categorical name to number

In [23]:
def label_encoding(x):
    labels = {'person':0, 'car':1, 'chair':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6,
       'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 'cat':12, 'tvmonitor':13,
       'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}
    return labels[x]

In [24]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


In [25]:
train_df.head()

Unnamed: 0,filename,height,width,name,x_min,y_min,x_max,y_max,center_x,center_y,w,h,id
0,2007_000027.jpg,500,486,person,174,101,349,351,0.538066,0.452,0.360082,0.5,0
1,2007_000032.jpg,281,500,aeroplane,104,78,375,183,0.479,0.464413,0.542,0.373665,16
2,2007_000032.jpg,281,500,aeroplane,133,88,197,123,0.33,0.375445,0.128,0.124555,16
3,2007_000032.jpg,281,500,person,195,180,213,229,0.408,0.727758,0.036,0.174377,0
4,2007_000032.jpg,281,500,person,26,189,44,238,0.07,0.759786,0.036,0.174377,0


### Create .txt file contain id, center_x, center_y, w, h

In [26]:
from shutil import move

In [None]:
train_folder = './data_images/train'
test_folder = './data_images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [28]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h'] 
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [35]:
groupby_obj_train.get_group('2007_000032.jpg')

Unnamed: 0,filename,id,center_x,center_y,w,h
1,2007_000032.jpg,16,0.479,0.464413,0.542,0.373665
2,2007_000032.jpg,16,0.33,0.375445,0.128,0.124555
3,2007_000032.jpg,0,0.408,0.727758,0.036,0.174377
4,2007_000032.jpg,0,0.07,0.759786,0.036,0.174377


In [39]:
# set index = filename , create a .txt file without header (columns name), and index(filename)
# groupby_obj_train.get_group('2007_000032.jpg').set_index('filename').to_csv('sample.txt',header=False, index=False)

In [60]:
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image to destiantion
    src = os.path.join('data_images', filename)
    dst = os.path.join(folder_path, filename)
    move(src,dst)

    # save the labels
    txt_name = os.path.splitext(filename)[0]
    group_obj.get_group(filename).set_index('filename').to_csv(os.path.join(folder_path, f'{txt_name}.txt'),header=False, index=False, sep =' ')
    return os.path.join(folder_path, f'{txt_name}.txt')

In [61]:
# save_data('2007_000032.jpg','./data_images/train', groupby_obj_train)

'./data_images/train\\2007_000032.txt'

In [67]:
# save train data
for name, group in groupby_obj_train:
    save_data(name, train_folder, groupby_obj_train)


In [71]:
# save train data
for name, group in groupby_obj_test:
    save_data(name, test_folder, groupby_obj_test)


#### Note:
In groupby object, we can acess name of each object bu using the following command:

`for name, group in groupby_obj_train:`

If we just using `for name in groupby_obj_train:` we will get another thing

In [70]:
for name, group in groupby_obj_train:
    print(f"Group name: {name}")


Group name: 2007_000027.jpg
Group name: 2007_000032.jpg
Group name: 2007_000033.jpg
Group name: 2007_000039.jpg
Group name: 2007_000042.jpg
Group name: 2007_000061.jpg
Group name: 2007_000063.jpg
Group name: 2007_000068.jpg
Group name: 2007_000121.jpg
Group name: 2007_000123.jpg
Group name: 2007_000129.jpg
Group name: 2007_000170.jpg
Group name: 2007_000175.jpg
Group name: 2007_000241.jpg
Group name: 2007_000243.jpg
Group name: 2007_000250.jpg
Group name: 2007_000256.jpg
Group name: 2007_000272.jpg
Group name: 2007_000323.jpg
Group name: 2007_000333.jpg
Group name: 2007_000346.jpg
Group name: 2007_000363.jpg
Group name: 2007_000364.jpg
Group name: 2007_000392.jpg
Group name: 2007_000423.jpg
Group name: 2007_000452.jpg
Group name: 2007_000464.jpg
Group name: 2007_000480.jpg
Group name: 2007_000491.jpg
Group name: 2007_000515.jpg
Group name: 2007_000528.jpg
Group name: 2007_000549.jpg
Group name: 2007_000559.jpg
Group name: 2007_000572.jpg
Group name: 2007_000584.jpg
Group name: 2007_000