In [1]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [75]:
# step-1: get path of each xml file
xmlfiles = glob('/home/cpow/Desktop/thermal/datasets/labelled_train/*.xml')
# replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [76]:
xmlfiles

['/home/cpow/Desktop/thermal/datasets/labelled_train/Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-3-screenshot_png_jpg.rf.8a78661b805f434845aca700ecbd1802.xml',
 '/home/cpow/Desktop/thermal/datasets/labelled_train/163_jpg.rf.061a5e83aa901933bf3d7204ec944865.xml',
 '/home/cpow/Desktop/thermal/datasets/labelled_train/Automated-Fire-Suppression-System-Relies-on-FLIR-Thermal-Imaging-Cameras-2-13-screenshot-6-_png_jpg.rf.b99812557fb05dee66dec0e7a516ea12.xml',
 '/home/cpow/Desktop/thermal/datasets/labelled_train/Fire-Assessment-w_-FLIR-Thermal-2-3-screenshot_png_jpg.rf.0b1a84d2ef2cb10b8ce3d75e6f426683.xml',
 '/home/cpow/Desktop/thermal/datasets/labelled_train/168_jpg.rf.3899f3381c5a872d1facca43103c8a5d.xml',
 '/home/cpow/Desktop/thermal/datasets/labelled_train/Automated-Fire-Suppression-System-Relies-on-FLIR-Thermal-Imaging-Cameras-1-47-screenshot-3-_png_jpg.rf.999df52dab81bc46ddcf942a6a2b14cc.xml',
 '/home/cpow/Desktop/thermal/datasets/labelled_train/Vodka-shot-on-fire-Thermal-camera-foota

In [77]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [78]:
parser_all = list(map(extract_text,xmlfiles))

In [79]:
data = reduce(lambda x, y : x+y,parser_all)

In [80]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [81]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-...,640,640,fire,152,208,497,550
1,Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-...,640,640,fire,76,107,351,432
2,163_jpg.rf.061a5e83aa901933bf3d7204ec944865.jpg,640,640,fire,170,515,295,570
3,Automated-Fire-Suppression-System-Relies-on-FL...,640,640,fire,343,386,253,315
4,Fire-Assessment-w_-FLIR-Thermal-2-3-screenshot...,640,640,fire,1,191,1,66


In [82]:
df.shape

(5238, 8)

In [83]:
df['name'].value_counts()

name
fire    5238
Name: count, dtype: int64

In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5238 entries, 0 to 5237
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  5238 non-null   object
 1   width     5238 non-null   object
 2   height    5238 non-null   object
 3   name      5238 non-null   object
 4   xmin      5238 non-null   object
 5   xmax      5238 non-null   object
 6   ymin      5238 non-null   object
 7   ymax      5238 non-null   object
dtypes: object(8)
memory usage: 327.5+ KB


In [85]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5238 entries, 0 to 5237
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  5238 non-null   object
 1   width     5238 non-null   int64 
 2   height    5238 non-null   int64 
 3   name      5238 non-null   object
 4   xmin      5238 non-null   int64 
 5   xmax      5238 non-null   int64 
 6   ymin      5238 non-null   int64 
 7   ymax      5238 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 327.5+ KB


In [86]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [87]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-...,640,640,fire,152,208,497,550,0.28125,0.817969,0.0875,0.082812
1,Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-...,640,640,fire,76,107,351,432,0.142969,0.611719,0.048438,0.126562
2,163_jpg.rf.061a5e83aa901933bf3d7204ec944865.jpg,640,640,fire,170,515,295,570,0.535156,0.675781,0.539062,0.429688
3,Automated-Fire-Suppression-System-Relies-on-FL...,640,640,fire,343,386,253,315,0.569531,0.44375,0.067187,0.096875
4,Fire-Assessment-w_-FLIR-Thermal-2-3-screenshot...,640,640,fire,1,191,1,66,0.15,0.052344,0.296875,0.101562


In [88]:
images = df['filename'].unique()

In [89]:
len(images)

1757

In [90]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [91]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [92]:
len(img_train), len(img_test)

(1406, 351)

In [93]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [94]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-...,640,640,fire,152,208,497,550,0.28125,0.817969,0.0875,0.082812
1,Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-...,640,640,fire,76,107,351,432,0.142969,0.611719,0.048438,0.126562
2,163_jpg.rf.061a5e83aa901933bf3d7204ec944865.jpg,640,640,fire,170,515,295,570,0.535156,0.675781,0.539062,0.429688
3,Automated-Fire-Suppression-System-Relies-on-FL...,640,640,fire,343,386,253,315,0.569531,0.44375,0.067187,0.096875
4,Fire-Assessment-w_-FLIR-Thermal-2-3-screenshot...,640,640,fire,1,191,1,66,0.15,0.052344,0.296875,0.101562


In [95]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
8,168_jpg.rf.3899f3381c5a872d1facca43103c8a5d.jpg,640,640,fire,182,403,32,351,0.457031,0.299219,0.345313,0.498437
14,001-How-To-See-The-Heat-Generated-From-Fire-Sc...,640,640,fire,81,225,452,615,0.239063,0.833594,0.225,0.254688
22,226_jpg.rf.af1c4906e809b66e09e480728fbd92aa.jpg,640,640,fire,578,641,566,641,0.952344,0.942969,0.098437,0.117188
29,Dahua-Thermal-IP-Camera-Flame_Fire-Detection-0...,640,640,fire,368,400,233,379,0.6,0.478125,0.05,0.228125
34,Fire-Assessment-w_-FLIR-Thermal-1-45-screensho...,640,640,fire,1,248,1,88,0.194531,0.069531,0.385937,0.135937


In [96]:
def label_encoding(x):
    labels = {
        'fire': 0, 

    }
    return labels[x]


In [97]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [98]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-...,640,640,fire,152,208,497,550,0.28125,0.817969,0.0875,0.082812,0
1,Dublin-Fire-Brigade-FLIR-Thermal-Drone-RPAS-0-...,640,640,fire,76,107,351,432,0.142969,0.611719,0.048438,0.126562,0
2,163_jpg.rf.061a5e83aa901933bf3d7204ec944865.jpg,640,640,fire,170,515,295,570,0.535156,0.675781,0.539062,0.429688,0
3,Automated-Fire-Suppression-System-Relies-on-FL...,640,640,fire,343,386,253,315,0.569531,0.44375,0.067187,0.096875,0
4,Fire-Assessment-w_-FLIR-Thermal-2-3-screenshot...,640,640,fire,1,191,1,66,0.15,0.052344,0.296875,0.101562,0
5,Fire-Assessment-w_-FLIR-Thermal-2-3-screenshot...,640,640,fire,333,495,49,141,0.646875,0.148438,0.253125,0.14375,0
6,Fire-Assessment-w_-FLIR-Thermal-2-3-screenshot...,640,640,fire,218,333,456,578,0.430469,0.807813,0.179688,0.190625,0
7,Fire-Assessment-w_-FLIR-Thermal-2-3-screenshot...,640,640,fire,333,388,613,641,0.563281,0.979688,0.085938,0.04375,0
9,Automated-Fire-Suppression-System-Relies-on-FL...,640,640,fire,378,440,275,319,0.639062,0.464062,0.096875,0.06875,0
10,Automated-Fire-Suppression-System-Relies-on-FL...,640,640,fire,136,215,583,641,0.274219,0.95625,0.123438,0.090625,0


In [99]:
import os
from shutil import move

In [100]:
train_folder = '/home/cpow/Desktop/thermal/datasets/train'
test_folder = '/home/cpow/Desktop/thermal/datasets/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

FileExistsError: [Errno 17] File exists: '/home/cpow/Desktop/thermal/datasets/train'

In [101]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [102]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('/home/cpow/Desktop/thermal/datasets/train',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [103]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [104]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

FileNotFoundError: [Errno 2] No such file or directory: '/home/cpow/Desktop/thermal/datasets/train/-1-_jpg.rf.02db23621ccd6faba8ed3e1c56264968.jpg'

In [105]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

FileNotFoundError: [Errno 2] No such file or directory: '/home/cpow/Desktop/thermal/datasets/train/-3-17-screenshot_png_jpg.rf.ba9c8b2989ef3935b2282cdaf4175169.jpg'