In [1]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [106]:
# step-1: get path of each xml file
xmlfiles = glob('/home/cpow/Desktop/pegasus/datasets/valid/*.xml')
# replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [107]:
xmlfiles

['/home/cpow/Desktop/pegasus/datasets/valid/image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff634e.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/image_194_jpg.rf.f4b70c520b6deab116ebe4c875fdcb08.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/image_213_jpg.rf.9f731d3d872c5ed77a8ababbef415f06.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/image_169_jpg.rf.969c4f8e28061136d7ce9d4887da1259.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/image_222_jpg.rf.66dc43b804487d560c3965d3077c1401.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/ppe_0926_jpg.rf.a1616137fe6bde3f797bf82a8b7a821d.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/Twalv0509_jpg.rf.936f594299864a5a7162d8803e9d0263.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/TsirinTu0020_jpg.rf.6616844bcb76ec053233b96e3f5e981e.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/image_155_jpg.rf.92495fba618249dbe74d79e77d05c1bc.xml',
 '/home/cpow/Desktop/pegasus/datasets/valid/image_168_jpg.rf.8e24a61b3f4300bf9a7800ac60f97e33.xml'

In [108]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [109]:
parser_all = list(map(extract_text,xmlfiles))

In [110]:
data = reduce(lambda x, y : x+y,parser_all)

In [111]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [112]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Boot,390,423,482,549
1,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Boot,453,513,543,590
2,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Gloves,484,518,384,430
3,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Vest,426,505,213,369
4,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Person,368,549,126,611


In [113]:
df.shape

(564, 8)

In [114]:
df['name'].value_counts()

name
Safety-Boot        118
Gloves              91
vest                78
Person              69
Safety-Vest         67
Helmet              53
Glass               35
No-Helmet           22
helmet              15
Goggles              4
protective_suit      3
no helmet            2
Vest                 2
No-Vest              1
no_helmet            1
no_vest              1
no vest              1
worker               1
Name: count, dtype: int64

In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  564 non-null    object
 1   width     564 non-null    object
 2   height    564 non-null    object
 3   name      564 non-null    object
 4   xmin      564 non-null    object
 5   xmax      564 non-null    object
 6   ymin      564 non-null    object
 7   ymax      564 non-null    object
dtypes: object(8)
memory usage: 35.4+ KB


In [116]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  564 non-null    object
 1   width     564 non-null    int64 
 2   height    564 non-null    int64 
 3   name      564 non-null    object
 4   xmin      564 non-null    int64 
 5   xmax      564 non-null    int64 
 6   ymin      564 non-null    int64 
 7   ymax      564 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 35.4+ KB


In [117]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [118]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Boot,390,423,482,549,0.635156,0.805469,0.051562,0.104688
1,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Boot,453,513,543,590,0.754687,0.885156,0.09375,0.073438
2,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Gloves,484,518,384,430,0.782813,0.635938,0.053125,0.071875
3,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Vest,426,505,213,369,0.727344,0.454688,0.123438,0.24375
4,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Person,368,549,126,611,0.716406,0.575781,0.282813,0.757812


In [119]:
images = df['filename'].unique()

In [120]:
len(images)

99

In [121]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [122]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [123]:
len(img_train), len(img_test)

(79, 20)

In [124]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [125]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Boot,390,423,482,549,0.635156,0.805469,0.051562,0.104688
1,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Boot,453,513,543,590,0.754687,0.885156,0.09375,0.073438
2,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Gloves,484,518,384,430,0.782813,0.635938,0.053125,0.071875
3,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Vest,426,505,213,369,0.727344,0.454688,0.123438,0.24375
4,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Person,368,549,126,611,0.716406,0.575781,0.282813,0.757812


In [126]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
13,image_213_jpg.rf.9f731d3d872c5ed77a8ababbef415...,640,640,Safety-Boot,323,378,545,641,0.547656,0.926562,0.085938,0.15
14,image_213_jpg.rf.9f731d3d872c5ed77a8ababbef415...,640,640,Safety-Boot,376,450,577,633,0.645312,0.945312,0.115625,0.0875
15,image_213_jpg.rf.9f731d3d872c5ed77a8ababbef415...,640,640,Person,311,498,100,641,0.632031,0.578906,0.292187,0.845313
16,image_213_jpg.rf.9f731d3d872c5ed77a8ababbef415...,640,640,No-Helmet,391,471,99,216,0.673438,0.246094,0.125,0.182812
17,image_213_jpg.rf.9f731d3d872c5ed77a8ababbef415...,640,640,Glass,405,471,135,173,0.684375,0.240625,0.103125,0.059375


In [127]:
def label_encoding(x):
    labels = {
        'hat': 0, 
        'vest': 1, 
        'no hat': 2, 
        'no vest': 3,
        'Glass': 4,
        'Gloves': 5,
        'Goggles': 6,
        'Helmet': 7,
        'No-Helmet': 8,
        'No-Vest': 9,
        'Person': 10,
        'Safety-Boot': 11,
        'Safety-Vest': 12,
        'Vest': 13,
        'helmet': 14,
        'no helmet': 15,
        'no vest': 16,
        'no_helmet': 17,
        'no_vest': 18,
        'protective_suit': 19,
        'worker': 20
    }
    return labels[x]


In [128]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [129]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Boot,390,423,482,549,0.635156,0.805469,0.051562,0.104688,11
1,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Boot,453,513,543,590,0.754687,0.885156,0.09375,0.073438,11
2,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Gloves,484,518,384,430,0.782813,0.635938,0.053125,0.071875,5
3,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Safety-Vest,426,505,213,369,0.727344,0.454688,0.123438,0.24375,12
4,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,Person,368,549,126,611,0.716406,0.575781,0.282813,0.757812,10
5,image_317_jpg.rf.7e994ca9e9a99feba22861c3eaff6...,640,640,No-Helmet,421,504,141,216,0.722656,0.278906,0.129688,0.117188,8
6,image_194_jpg.rf.f4b70c520b6deab116ebe4c875fdc...,640,640,Safety-Vest,185,286,177,361,0.367969,0.420312,0.157812,0.2875,12
7,image_194_jpg.rf.f4b70c520b6deab116ebe4c875fdc...,640,640,Helmet,190,247,89,141,0.341406,0.179688,0.089063,0.08125,7
8,image_194_jpg.rf.f4b70c520b6deab116ebe4c875fdc...,640,640,Gloves,184,232,187,231,0.325,0.326562,0.075,0.06875,5
9,image_194_jpg.rf.f4b70c520b6deab116ebe4c875fdc...,640,640,Gloves,248,293,167,211,0.422656,0.295312,0.070312,0.06875,5


In [130]:
import os
from shutil import move

In [131]:
train_folder = '/home/cpow/Desktop/pegasus/datasets/train'
test_folder = '/home/cpow/Desktop/pegasus/datasets/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

FileExistsError: [Errno 17] File exists: '/home/cpow/Desktop/pegasus/datasets/train'

In [132]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [133]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('/home/cpow/Desktop/pegasus/datasets/valid',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [134]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [135]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0     None
1     None
2     None
3     None
4     None
      ... 
74    None
75    None
76    None
77    None
78    None
Length: 79, dtype: object

In [136]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
dtype: object