In [15]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et
import shutil

In [16]:
#load all xml file into a list:
Tree = et.parse(r"E:\DH_data\data_dh\annotations.xml")
root = Tree.getroot()

### creating a function for extracting elements for nested .xml file: 

In [17]:
# Define a function to find the specified elements and attributes

def extract_info(root):

    # Create a list to store extracted information

    extracted_data = []
 
    # Iterate through each 'image' tag in the root

    for image in root.findall('image'):

        # Get the 'name' attribute of the 'image' tag

        image_name = image.get('name')
        #split the str:
        my_list = image_name.split('/')
        # grabbing the 2nd elements from them:
        image_name = my_list[1]
        # get width:
        width =image.get('width')
        # get hight:
        height = image.get('height')
 
        # Iterate through each 'box' tag within each 'image' tag

        for box in image.findall('box'):

            # Get the 'label' and coordinates attributes of the 'box' tag

            label = box.get('label')

            xtl = box.get('xtl')

            ytl = box.get('ytl')

            xbr = box.get('xbr')

            ybr = box.get('ybr')

            # Find the 'attribute' tag within each 'box' tag (assuming there is only one 'attribute' tag per 'box')

            attribute_tag = box.find('attribute')
            

            # Get the text content of the 'attribute' tag

            attribute = attribute_tag.text if attribute_tag is not None else None
 
            # Append the extracted information to the list
            extracted_data.append([image_name,width,height,label,xtl,ytl,xbr,ybr,attribute])

    return extracted_data

In [18]:
# Run the function and print the result
data =extract_info(root)

In [19]:
#creating a pandas data frame from this xml file: 
df = pd.DataFrame(data,columns=["filename","width","height","label","xtl","ytl","xbr","ybr","attribute"])

In [20]:
df.head()

Unnamed: 0,filename,width,height,label,xtl,ytl,xbr,ybr,attribute
0,frame_0.jpg,1280,720,player,967.93,291.3,1103.5,557.25,Running
1,frame_0.jpg,1280,720,player,158.1,247.1,228.4,396.5,Running
2,frame_0.jpg,1280,720,player,1115.93,325.2,1280.0,660.3,Running
3,frame_0.jpg,1280,720,player,242.5,288.1,335.1,575.9,Standing
4,frame_0.jpg,1280,720,player,888.32,247.0,983.82,433.3,Running


In [21]:
df.shape

(6153, 9)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6153 entries, 0 to 6152
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   filename   6153 non-null   object
 1   width      6153 non-null   object
 2   height     6153 non-null   object
 3   label      6153 non-null   object
 4   xtl        6153 non-null   object
 5   ytl        6153 non-null   object
 6   xbr        6153 non-null   object
 7   ybr        6153 non-null   object
 8   attribute  5379 non-null   object
dtypes: object(9)
memory usage: 432.8+ KB


In [23]:
# type conversition:
cols = ["width","height","xtl","ytl","xbr","ybr"]
df[cols] = df[cols].astype(float)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6153 entries, 0 to 6152
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   filename   6153 non-null   object 
 1   width      6153 non-null   float64
 2   height     6153 non-null   float64
 3   label      6153 non-null   object 
 4   xtl        6153 non-null   float64
 5   ytl        6153 non-null   float64
 6   xbr        6153 non-null   float64
 7   ybr        6153 non-null   float64
 8   attribute  5379 non-null   object 
dtypes: float64(6), object(3)
memory usage: 432.8+ KB


In [25]:
#creating Center_x and center_y:
df["Center_x"] = round(((df['xtl']+df["xbr"])/2)/df['width'],2)
df["Center_y"] = round(((df['ytl']+df["ybr"])/2)/df['height'],2)

#creating With and hight of the bounding box:
df["W"] = round((df["xbr"]-df["xtl"])/df['width'],2)
df["H"] = round((df["ybr"]-df["ytl"])/df['height'],2)

In [26]:
df.head()

Unnamed: 0,filename,width,height,label,xtl,ytl,xbr,ybr,attribute,Center_x,Center_y,W,H
0,frame_0.jpg,1280.0,720.0,player,967.93,291.3,1103.5,557.25,Running,0.81,0.59,0.11,0.37
1,frame_0.jpg,1280.0,720.0,player,158.1,247.1,228.4,396.5,Running,0.15,0.45,0.05,0.21
2,frame_0.jpg,1280.0,720.0,player,1115.93,325.2,1280.0,660.3,Running,0.94,0.68,0.13,0.47
3,frame_0.jpg,1280.0,720.0,player,242.5,288.1,335.1,575.9,Standing,0.23,0.6,0.07,0.4
4,frame_0.jpg,1280.0,720.0,player,888.32,247.0,983.82,433.3,Running,0.73,0.47,0.07,0.26


In [27]:
# asing ID number for lebel:
def Lebel_encoder(x):
    labels = {'player':0 ,'football':1 , 'goalkeeper':2 , 'referee':3}
    return labels[x]

In [28]:
#applying lebel encoder on lebel :
df['cls'] = df['label'].apply(Lebel_encoder)

In [29]:
df.head()

Unnamed: 0,filename,width,height,label,xtl,ytl,xbr,ybr,attribute,Center_x,Center_y,W,H,cls
0,frame_0.jpg,1280.0,720.0,player,967.93,291.3,1103.5,557.25,Running,0.81,0.59,0.11,0.37,0
1,frame_0.jpg,1280.0,720.0,player,158.1,247.1,228.4,396.5,Running,0.15,0.45,0.05,0.21,0
2,frame_0.jpg,1280.0,720.0,player,1115.93,325.2,1280.0,660.3,Running,0.94,0.68,0.13,0.47,0
3,frame_0.jpg,1280.0,720.0,player,242.5,288.1,335.1,575.9,Standing,0.23,0.6,0.07,0.4,0
4,frame_0.jpg,1280.0,720.0,player,888.32,247.0,983.82,433.3,Running,0.73,0.47,0.07,0.26,0


In [30]:
# asing ID number for attribute:
def Lebel_encoder(x):
    labels = {'Running':0 ,'Standing':1 , 'Rolling':2, 'on_a_ball' :3, 'On_foot' :4 }
    return labels[x]

In [31]:
#taking only that imformation where attribute none are not there:
df=df[df['attribute'] .notnull()]

In [32]:
#checking unique name for attributes:
df['attribute'].unique()

array(['Running', 'Standing', 'Rolling', 'on_a_ball', 'On_foot'],
      dtype=object)

In [33]:
#applying lebel encoder on lebel :
df['Attributes'] = df['attribute'].apply(Lebel_encoder)

In [34]:
df.head()

Unnamed: 0,filename,width,height,label,xtl,ytl,xbr,ybr,attribute,Center_x,Center_y,W,H,cls,Attributes
0,frame_0.jpg,1280.0,720.0,player,967.93,291.3,1103.5,557.25,Running,0.81,0.59,0.11,0.37,0,0
1,frame_0.jpg,1280.0,720.0,player,158.1,247.1,228.4,396.5,Running,0.15,0.45,0.05,0.21,0,0
2,frame_0.jpg,1280.0,720.0,player,1115.93,325.2,1280.0,660.3,Running,0.94,0.68,0.13,0.47,0,0
3,frame_0.jpg,1280.0,720.0,player,242.5,288.1,335.1,575.9,Standing,0.23,0.6,0.07,0.4,0,1
4,frame_0.jpg,1280.0,720.0,player,888.32,247.0,983.82,433.3,Running,0.73,0.47,0.07,0.26,0,0


In [35]:
# now split the 80% for train and 20% test file:
images = df["filename"].unique()
#lets see how many images are there:
len(images)

921

In [36]:
# now creating a data frame based on unique file name value:
images_df = pd.DataFrame(images,columns=["filename"])
# now split the 80% for train and 20% test file:
images_train = tuple(images_df.sample(frac=.8)["filename"]) # 80% for train
images_test = tuple(images_df.query(f'filename not in {images_train}')['filename']) # 20% for train

In [37]:
len(images_train),len(images_test)

(737, 184)

In [38]:
#now spliting the data frame bsed on image name:
train_df = df.query(f'filename in {images_train}')
test_df = df.query(f'filename in {images_test}')

In [39]:
train_df.head()

Unnamed: 0,filename,width,height,label,xtl,ytl,xbr,ybr,attribute,Center_x,Center_y,W,H,cls,Attributes
0,frame_0.jpg,1280.0,720.0,player,967.93,291.3,1103.5,557.25,Running,0.81,0.59,0.11,0.37,0,0
1,frame_0.jpg,1280.0,720.0,player,158.1,247.1,228.4,396.5,Running,0.15,0.45,0.05,0.21,0,0
2,frame_0.jpg,1280.0,720.0,player,1115.93,325.2,1280.0,660.3,Running,0.94,0.68,0.13,0.47,0,0
3,frame_0.jpg,1280.0,720.0,player,242.5,288.1,335.1,575.9,Standing,0.23,0.6,0.07,0.4,0,1
4,frame_0.jpg,1280.0,720.0,player,888.32,247.0,983.82,433.3,Running,0.73,0.47,0.07,0.26,0,0


In [40]:
test_df.head()

Unnamed: 0,filename,width,height,label,xtl,ytl,xbr,ybr,attribute,Center_x,Center_y,W,H,cls,Attributes
6,frame_10.jpg,1280.0,720.0,player,720.4,246.1,803.0,441.0,Standing,0.6,0.48,0.06,0.27,0,1
7,frame_10.jpg,1280.0,720.0,player,826.2,309.92,985.8,643.5,Standing,0.71,0.66,0.12,0.46,0,1
8,frame_10.jpg,1280.0,720.0,player,855.9,283.0,1001.6,557.34,Running,0.73,0.58,0.11,0.38,0,0
9,frame_10.jpg,1280.0,720.0,player,68.9,243.8,137.5,401.0,Running,0.08,0.45,0.05,0.22,0,0
10,frame_10.jpg,1280.0,720.0,player,114.08,290.4,202.0,580.8,Standing,0.12,0.6,0.07,0.4,0,1


In [41]:
train_df.columns

Index(['filename', 'width', 'height', 'label', 'xtl', 'ytl', 'xbr', 'ybr',
       'attribute', 'Center_x', 'Center_y', 'W', 'H', 'cls', 'Attributes'],
      dtype='object')

In [42]:
#now slectiong out usefull columns:
cols =['filename','cls','Center_x','Center_y','W','H','Attributes']
groupby_obj_train= train_df[cols].groupby('filename')
groupby_obj_test= test_df[cols].groupby('filename')

In [43]:
groupby_obj_train.get_group('frame_0.jpg').set_index('filename').to_csv('sample.txt',sep= ' ' ,index = False,header = False)

#### now creting a folder for train data and inside it there will be two folder images , labels with the hel of "OS"

In [44]:
os.mkdir(os.path.join("E:\DH_data\data_dh","train"))
#creating folders in side train folder
if os.path.exists(os.path.join("E:\DH_data\data_dh","train")):
    os.mkdir(os.path.join("E:\DH_data\data_dh","train","images"))
    os.mkdir(os.path.join("E:\DH_data\data_dh","train","labels"))
#running loop from train file names
for files in images_train:
    src_img_path=os.path.join("E:/DH_data/data_dh/images/",files)
    des_img_path=os.path.join("E:\DH_data\data_dh","train","images")
    shutil.copy(src=src_img_path,dst=des_img_path)
    print(files,"copied successfully!!!")

frame_1644.jpg copied successfully!!!
frame_160.jpg copied successfully!!!
frame_1840.jpg copied successfully!!!
frame_1188.jpg copied successfully!!!
frame_44.jpg copied successfully!!!
frame_544.jpg copied successfully!!!
frame_1244.jpg copied successfully!!!
frame_308.jpg copied successfully!!!
frame_1026.jpg copied successfully!!!
frame_690.jpg copied successfully!!!
frame_1050.jpg copied successfully!!!
frame_810.jpg copied successfully!!!
frame_528.jpg copied successfully!!!
frame_1742.jpg copied successfully!!!
frame_372.jpg copied successfully!!!
frame_784.jpg copied successfully!!!
frame_1480.jpg copied successfully!!!
frame_902.jpg copied successfully!!!
frame_1640.jpg copied successfully!!!
frame_1392.jpg copied successfully!!!
frame_746.jpg copied successfully!!!
frame_1198.jpg copied successfully!!!
frame_868.jpg copied successfully!!!
frame_1630.jpg copied successfully!!!
frame_1154.jpg copied successfully!!!
frame_1204.jpg copied successfully!!!
frame_624.jpg copied succ

In [45]:
#creating txt_file bsed on test_image df data:
src = os.listdir(r"E:\DH_data\data_dh\train\images")
for i in src:
    print(i)
    filenam_txt = os.path.splitext(i)[0]+ '.txt'
    filename_txt_path=os.path.join(r"E:\DH_data\data_dh\train\labels",filenam_txt)
    groupby_obj_train.get_group(i).set_index('filename').to_csv(filename_txt_path,sep = ' ' ,index = False,header = False)
    print("Loaded",filename_txt_path)

frame_0.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_0.txt
frame_100.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_100.txt
frame_1000.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1000.txt
frame_1002.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1002.txt
frame_1006.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1006.txt
frame_1008.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1008.txt
frame_1010.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1010.txt
frame_1012.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1012.txt
frame_1014.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1014.txt
frame_1016.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1016.txt
frame_1018.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1018.txt
frame_102.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_102.txt
frame_1020.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1020.txt
frame_1022.jpg
Loaded E:\DH_data\data_dh\train\labels\frame_1022.txt
frame_1026.jpg
Loaded E:\DH_data\data_dh\tra

### now creting a folder for test data and inside it there will be two folder images , labels with the hel of "OS"

In [46]:
os.mkdir(os.path.join("E:\DH_data\data_dh","test"))
#creating folders in side train folder
if os.path.exists(os.path.join("E:\DH_data\data_dh","test")):
    os.mkdir(os.path.join("E:\DH_data\data_dh","test","images"))
    os.mkdir(os.path.join("E:\DH_data\data_dh","test","labels"))
#running loop from train file names
for files in images_test:
    src_img_path=os.path.join("E:/DH_data/data_dh/images/",files)
    des_img_path=os.path.join("E:\DH_data\data_dh","test","images")
    shutil.copy(src=src_img_path,dst=des_img_path)
    print(files,"copied successfully!!!")

frame_10.jpg copied successfully!!!
frame_1004.jpg copied successfully!!!
frame_1024.jpg copied successfully!!!
frame_1028.jpg copied successfully!!!
frame_1030.jpg copied successfully!!!
frame_1036.jpg copied successfully!!!
frame_1038.jpg copied successfully!!!
frame_1040.jpg copied successfully!!!
frame_1042.jpg copied successfully!!!
frame_1078.jpg copied successfully!!!
frame_1082.jpg copied successfully!!!
frame_1096.jpg copied successfully!!!
frame_1098.jpg copied successfully!!!
frame_1116.jpg copied successfully!!!
frame_1120.jpg copied successfully!!!
frame_1134.jpg copied successfully!!!
frame_1144.jpg copied successfully!!!
frame_1148.jpg copied successfully!!!
frame_116.jpg copied successfully!!!
frame_1164.jpg copied successfully!!!
frame_1166.jpg copied successfully!!!
frame_1170.jpg copied successfully!!!
frame_1182.jpg copied successfully!!!
frame_1212.jpg copied successfully!!!
frame_1218.jpg copied successfully!!!
frame_1224.jpg copied successfully!!!
frame_1232.jpg 

In [47]:
#creating txt_file bsed on test_image df data:
src = os.listdir(r"E:\DH_data\data_dh\test\images")
for i in src:
    filenam_txt = os.path.splitext(i)[0]+ '.txt'
    filename_txt_path=os.path.join(r"E:\DH_data\data_dh\test\labels",filenam_txt)
    groupby_obj_test.get_group(i).set_index('filename').to_csv(filename_txt_path,sep = ' ' ,index = False,header = False)
    print("Loaded",filename_txt_path)

Loaded E:\DH_data\data_dh\test\labels\frame_10.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1004.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1024.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1028.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1030.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1036.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1038.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1040.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1042.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1078.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1082.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1096.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1098.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1116.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1120.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1134.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1144.txt
Loaded E:\DH_data\data_dh\test\labels\frame_1148.txt
Loaded E:\DH_data\data_dh\test\labels\frame_116.