# Collect Data
The purpose of this notebook is to collect images from a public GitHub repository.<br>
In addition, there are self-labeled images in this directory.<br>
The XML files containing the annotations are then converted into a single CSV file.<br>
Adjust the paths below to match your own directory layout.<br>
Also remember to save our images (see [Kaggle](https://www.kaggle.com/datasets/bastianberle/eurocoins-images-object-detection)) in the same directory.

In [None]:
# Mount Google Drive at /content/drive so the data directory used below is
# reachable (this module is only available when running on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import xml.etree.ElementTree as ET
import os
import pandas as pd

import warnings
# NOTE(review): with no category given this silences every warning for the
# rest of the session, including deprecation warnings raised by the libraries
# used below - consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

In [None]:
# Root data directory: it is walked recursively for XML annotation files and
# is also where annotations.csv is written. Adjust if necessary.
rootdir = '/content/drive/MyDrive/data'

In [None]:
%cd /content/drive/MyDrive/data
!git clone https://github.com/SuperDiodo/euro-coin-dataset.git

In [None]:
# convert xml annotation to a csv compatible format
def get_annotation(xmlfile, filefolder):
  """Flatten one XML annotation file into one row per annotated object.

  Elements are looked up by tag name (``filename``, ``size/width``,
  ``size/height``, ``size/depth``, ``segmented`` and, per ``object``:
  ``name``, ``pose``, ``truncated``, ``difficult``, ``bndbox/xmin`` ...).
  NOTE(review): this assumes the annotation files use these Pascal VOC /
  labelImg tag names - the positional indices used previously match that
  layout, but confirm against the dataset.

  Args:
    xmlfile: Path (or file object) of the XML annotation, passed to
      ``ET.parse``.
    filefolder: Folder label stored in the first column of every row. The
      ``<folder>`` element of the XML is deliberately ignored because the
      files live in a different folder than the one recorded there.

  Returns:
    A list with one 14-item list per ``<object>`` element, in the order
    ``[folder, filename, width, height, depth, segmented, name, pose,
    truncated, difficult, xmin, ymin, xmax, ymax]``. All values taken from
    the XML are strings (``None`` if an optional element is missing). The
    list is empty when the file contains no objects.

  Raises:
    ValueError: If the annotation has no ``<filename>`` element.
  """
  root = ET.parse(xmlfile).getroot()

  # Look elements up by tag instead of by position: Element.getchildren() was
  # removed in Python 3.9, and positional access silently picks the wrong
  # element as soon as an optional tag is missing or reordered.
  filename = root.findtext('filename')
  if filename is None:
    raise ValueError(f'{xmlfile}: annotation has no <filename> element')

  # Some annotations store the image name without its extension.
  if '.jpg' not in filename.lower():
    filename += '.jpg'

  width = root.findtext('size/width')
  height = root.findtext('size/height')
  depth = root.findtext('size/depth')
  segmented = root.findtext('segmented')

  annos = []

  # objects
  for obj in root.findall('object'):
    annos.append(
        [
        filefolder,
        filename,
        width,
        height,
        depth,
        segmented,
        obj.findtext('name'),
        # pose / truncated / difficult previously all re-read the first child
        # of the object and therefore duplicated the value of `name`.
        obj.findtext('pose'),
        obj.findtext('truncated'),
        obj.findtext('difficult'),
        obj.findtext('bndbox/xmin'),
        obj.findtext('bndbox/ymin'),
        obj.findtext('bndbox/xmax'),
        obj.findtext('bndbox/ymax')
      ]
    )

  return annos

#### Store all annotations in a list

In [None]:
# Walk the whole data directory and gather the rows of every XML annotation
# file into one flat list (one entry per annotated object).
annotations = []

for subdir, _dirs, files in os.walk(rootdir):
  # The folder label is the name of the directory that holds the XML file.
  # os.path.basename is used instead of splitting on '/' so the label is also
  # correct on platforms whose path separator is not '/'.
  folder = os.path.basename(subdir)
  for filename in files:
    if filename.endswith('.xml'):
      annotations.extend(get_annotation(os.path.join(subdir, filename), folder))

In [None]:
# Build the annotation table. The column names are passed to the constructor
# so that an empty `annotations` list still yields a correctly labelled (empty)
# frame instead of a length-mismatch error when assigning df.columns.
df = pd.DataFrame(
    annotations,
    columns=['folder','filename','width','height','depth','segmented','name','pose','truncated','difficult','xmin','ymin','xmax','ymax']
)
# We had 2 folders with own annotated images; merge them under one label.
# The replacement is restricted to the folder column so that no other column
# can be touched by accident, and it is safe to re-run.
df['folder'] = df['folder'].replace('own_2', 'own')

In [None]:
# Show the last 30 rows as a quick sanity check of the parsed annotations.
df.tail(30)

Unnamed: 0,folder,filename,width,height,depth,segmented,name,pose,truncated,difficult,xmin,ymin,xmax,ymax
3893,own,20220517_181130.jpg,3024,3024,3,0,5,5,5,5,1525,509,2119,1103
3894,own,20220517_181130.jpg,3024,3024,3,0,50,50,50,50,981,821,1619,1478
3895,own,20220517_181130.jpg,3024,3024,3,0,20,20,20,20,419,878,972,1462
3896,own,20220517_181130.jpg,3024,3024,3,0,10,10,10,10,1250,1940,1766,2471
3897,own,20220517_181130.jpg,3024,3024,3,0,1,1,1,1,1835,1581,2297,2037
3898,own,20220517_181130.jpg,3024,3024,3,0,1,1,1,1,613,1456,1035,1881
3899,own,20220517_181130.jpg,3024,3024,3,0,1,1,1,1,847,1796,1278,2231
3900,own,20220517_181130.jpg,3024,3024,3,0,1,1,1,1,1125,1465,1566,1903
3901,own,20220517_181130.jpg,3024,3024,3,0,1,1,1,1,1522,1274,1960,1718
3902,own,20220517_181147.jpg,3024,3024,3,0,200,200,200,200,2144,634,2863,1337


In [None]:
# Number of distinct values in the filename column.
df[['filename']].nunique()

filename    614
dtype: int64

In [None]:
# Distinct folder labels present in the table.
df['folder'].unique()

array(['50', '5', '100', 'mixed', 'images', '200', '10', '1', '20', '2',
       'own'], dtype=object)

#### Save annotations

In [None]:
# Write the annotation table to <rootdir>/annotations.csv; index=False keeps
# the DataFrame's row index out of the file.
df.to_csv(os.path.join(rootdir, 'annotations.csv'), index = False)