In [1]:
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
import numpy as np

In [2]:
import pandas as pd
from PIL import Image
import xml.etree.ElementTree as ET

def create_DF(files, images_dir):
    """
    Convierte una lista de PosixPaths en un dataframe
    """
    df = pd.DataFrame([])
    for file in files:
        tree = ET.parse(file)
        root = tree.getroot()

        image_dic = []

        for obj in root.findall('object'):
            label = obj.find('name').text       # pineapple
            image = file.stem + ".png"          # fruit0.png
            bndbox = obj.find('bndbox')         
            xmin = float(bndbox.find('xmin').text)      # 38
            ymin = float(bndbox.find('ymin').text)     # 18
            xmax = float(bndbox.find('xmax').text)     # 271)
            ymax = float(bndbox.find('ymax').text)      # 227)

            image_dic.append({
                "label" : label,
                "left" : xmin, 
                "top" : ymin, 
                "right" : xmax,
                "bottom" : ymax
            })
        
        aux_df = pd.DataFrame(image_dic)
        
        # try:
        image_file = next(images_dir.glob(f"**/{file.stem}*.png"))
        aux_df["height"] = float(root.find('size').find('height').text)
        aux_df["width"] = float(root.find('size').find('width').text)
        aux_df["image"] = image_file
        aux_df["image_name"] = root.find('filename').text
        df = pd.concat([df, aux_df])
        # except:
        #     print(file.stem)
    print(df)
    df["center_x"] = (df["right"] + df["left"])/2
    df["center_y"] = (df["bottom"] + df["top"])/2
    df["delta_x"] = df["right"] - df["left"]
    df["delta_y"] = df["bottom"] - df["top"]

    return df

In [5]:
train_files = list(Path("dataset/annotations/").glob("**/*xml"))
print(train_files)

[PosixPath('dataset/annotations/fruit0.xml'), PosixPath('dataset/annotations/fruit1.xml'), PosixPath('dataset/annotations/fruit10.xml'), PosixPath('dataset/annotations/fruit100.xml'), PosixPath('dataset/annotations/fruit101.xml'), PosixPath('dataset/annotations/fruit102.xml'), PosixPath('dataset/annotations/fruit103.xml'), PosixPath('dataset/annotations/fruit104.xml'), PosixPath('dataset/annotations/fruit105.xml'), PosixPath('dataset/annotations/fruit106.xml'), PosixPath('dataset/annotations/fruit107.xml'), PosixPath('dataset/annotations/fruit108.xml'), PosixPath('dataset/annotations/fruit109.xml'), PosixPath('dataset/annotations/fruit11.xml'), PosixPath('dataset/annotations/fruit110.xml'), PosixPath('dataset/annotations/fruit111.xml'), PosixPath('dataset/annotations/fruit112.xml'), PosixPath('dataset/annotations/fruit113.xml'), PosixPath('dataset/annotations/fruit114.xml'), PosixPath('dataset/annotations/fruit115.xml'), PosixPath('dataset/annotations/fruit116.xml'), PosixPath('dataset

In [6]:
df_train = create_DF(train_files, Path("dataset/images/"))

           label   left    top  right  bottom  height  width  \
0      pineapple   38.0   82.0  271.0   227.0   300.0  400.0   
1    snake fruit  244.0  174.0  280.0   207.0   300.0  400.0   
2   dragon fruit  254.0  228.0  351.0   300.0   300.0  400.0   
0      pineapple   38.0   87.0  275.0   241.0   300.0  400.0   
1    snake fruit  240.0  185.0  279.0   220.0   300.0  400.0   
..           ...    ...    ...    ...     ...     ...    ...   
1    snake fruit  144.0  181.0  162.0   204.0   300.0  400.0   
2         banana  204.0  217.0  253.0   262.0   300.0  400.0   
0      pineapple  155.0   80.0  261.0   175.0   300.0  400.0   
1    snake fruit  143.0  184.0  161.0   208.0   300.0  400.0   
2         banana  203.0  222.0  257.0   269.0   300.0  400.0   

                         image   image_name  
0    dataset/images/fruit0.png   fruit0.png  
1    dataset/images/fruit0.png   fruit0.png  
2    dataset/images/fruit0.png   fruit0.png  
0    dataset/images/fruit1.png   fruit1.png  
1

In [8]:
df_train.head()

Unnamed: 0,label,left,top,right,bottom,height,width,image,image_name,center_x,center_y,delta_x,delta_y
0,pineapple,38.0,82.0,271.0,227.0,300.0,400.0,dataset/images/fruit0.png,fruit0.png,154.5,154.5,233.0,145.0
1,snake fruit,244.0,174.0,280.0,207.0,300.0,400.0,dataset/images/fruit0.png,fruit0.png,262.0,190.5,36.0,33.0
2,dragon fruit,254.0,228.0,351.0,300.0,300.0,400.0,dataset/images/fruit0.png,fruit0.png,302.5,264.0,97.0,72.0
0,pineapple,38.0,87.0,275.0,241.0,300.0,400.0,dataset/images/fruit1.png,fruit1.png,156.5,164.0,237.0,154.0
1,snake fruit,240.0,185.0,279.0,220.0,300.0,400.0,dataset/images/fruit1.png,fruit1.png,259.5,202.5,39.0,35.0


In [9]:
valores_unicos_columna2 = df_train['image_name'].unique()

np.random.shuffle(valores_unicos_columna2)

df_val = df_train[df_train['image_name'].isin(valores_unicos_columna2[:20])]
df_train = df_train[~df_train['image_name'].isin(valores_unicos_columna2[:20])]

In [10]:
df_val.head(100)

Unnamed: 0,label,left,top,right,bottom,height,width,image,image_name,center_x,center_y,delta_x,delta_y
0,pineapple,37.0,47.0,225.0,165.0,300.0,400.0,dataset/images/fruit100.png,fruit100.png,131.0,106.0,188.0,118.0
1,snake fruit,203.0,176.0,241.0,202.0,300.0,400.0,dataset/images/fruit100.png,fruit100.png,222.0,189.0,38.0,26.0
2,banana,285.0,97.0,342.0,156.0,300.0,400.0,dataset/images/fruit100.png,fruit100.png,313.5,126.5,57.0,59.0
0,pineapple,115.0,76.0,260.0,226.0,300.0,400.0,dataset/images/fruit113.png,fruit113.png,187.5,151.0,145.0,150.0
1,snake fruit,292.0,132.0,314.0,153.0,300.0,400.0,dataset/images/fruit113.png,fruit113.png,303.0,142.5,22.0,21.0
2,banana,221.0,55.0,276.0,101.0,300.0,400.0,dataset/images/fruit113.png,fruit113.png,248.5,78.0,55.0,46.0
0,pineapple,120.0,113.0,237.0,264.0,300.0,400.0,dataset/images/fruit116.png,fruit116.png,178.5,188.5,117.0,151.0
1,snake fruit,261.0,162.0,279.0,181.0,300.0,400.0,dataset/images/fruit116.png,fruit116.png,270.0,171.5,18.0,19.0
2,banana,205.0,72.0,251.0,118.0,300.0,400.0,dataset/images/fruit116.png,fruit116.png,228.0,95.0,46.0,46.0
0,pineapple,161.0,70.0,396.0,260.0,300.0,400.0,dataset/images/fruit139.png,fruit139.png,278.5,165.0,235.0,190.0


In [11]:
classes = df_train["label"].unique()
classes

array(['pineapple', 'snake fruit', 'dragon fruit', 'banana'], dtype=object)

In [12]:
classes_idx = {clase:idx for idx,clase in enumerate(classes)}

In [15]:
cant_classes = len(classes)
cant_classes

4

In [14]:
import pickle
with open("data.pkl","wb") as f:
    pickle.dump((df_train, df_val, train_files, classes_idx, cant_classes, classes),f)