---
# Imports

In [1]:
import os, sys
import pandas as pd
import numpy as np

from tqdm import tqdm

---
# Setup

In [2]:
DATA_PATH = os.path.join('..', 'data')
IMAGES_PATH = os.path.join(DATA_PATH, 'train_val_images', 'train_images')

assert os.path.exists(DATA_PATH), f"Data path {DATA_PATH} does not exist. Please create it and add the data files."
assert os.path.exists(IMAGES_PATH), f"Images path {IMAGES_PATH} does not exist. Please create it and add the image files."

---
# Data Import

In [3]:
img_info_df = pd.read_csv(os.path.join(DATA_PATH, 'img.csv'))
annotations_df = pd.read_csv(os.path.join(DATA_PATH, 'annot.csv'))

images = os.listdir(IMAGES_PATH)

In [4]:
img_info_df.drop(columns=['Unnamed: 0'], inplace=True)
annotations_df.drop(columns=['Unnamed: 0'], inplace=True)

---
# Data Validation

In [5]:
img_info_df

Unnamed: 0,id,width,height,set,file_name
0,a4ea732cd3d5948a,840,1024,train,train/a4ea732cd3d5948a.jpg
1,4bf43a7b2a898044,1024,683,train,train/4bf43a7b2a898044.jpg
2,1b55b309b0f50d02,1024,683,train,train/1b55b309b0f50d02.jpg
3,00c359f294f7dcd9,1024,680,train,train/00c359f294f7dcd9.jpg
4,04b5a37f762b0f51,768,1024,train,train/04b5a37f762b0f51.jpg
...,...,...,...,...,...
21773,d13b7ec72bd5eed5,1024,683,train,train/d13b7ec72bd5eed5.jpg
21774,0022933cdceee189,1024,554,train,train/0022933cdceee189.jpg
21775,6029c75e0325d164,1024,768,train,train/6029c75e0325d164.jpg
21776,0ebbecdc46b78d42,1024,681,train,train/0ebbecdc46b78d42.jpg


In [6]:
img_info_df['set'].value_counts()

train    21778
Name: set, dtype: int64

In [7]:
annotations_df

Unnamed: 0,id,image_id,bbox,utf8_string,points,area
0,a4ea732cd3d5948a_1,a4ea732cd3d5948a,"[525.83, 3.4, 197.64, 33.94]",Performance,"[525.83, 3.4, 723.47, 7.29, 722.76, 36.99, 525...",6707.90
1,a4ea732cd3d5948a_2,a4ea732cd3d5948a,"[534.67, 64.68, 91.22, 38.19]",Sport,"[535.73, 64.68, 623.41, 67.51, 625.89, 102.87,...",3483.69
2,a4ea732cd3d5948a_3,a4ea732cd3d5948a,"[626.95, 63.62, 96.52, 31.82]",Watch,"[626.95, 63.62, 721.7, 63.62, 723.47, 95.44, 6...",3071.27
3,a4ea732cd3d5948a_4,a4ea732cd3d5948a,"[577.4, 141.87, 147.13, 43.1]",...period.,"[580.02, 143.61, 724.53, 141.87, 723.66, 184.9...",6341.30
4,a4ea732cd3d5948a_5,a4ea732cd3d5948a,"[391.03, 163.9, 60.82, 38.65]",.,"[395.2, 163.9, 451.85, 191.94, 445.59, 202.55,...",2350.69
...,...,...,...,...,...,...
1052349,0ebbecdc46b78d42_15,0ebbecdc46b78d42,"[267.47, -0.14, 28.18, 27.47]",.,"[295.65, -0.14, 295.65, 27.33, 267.47, 27.03, ...",774.10
1052350,a37e1fb026b80a6d_1,a37e1fb026b80a6d,"[331.69, 462.84, 417.31, 201.08]",RÖR,"[331.69, 466.97, 749.0, 462.84, 749.0, 659.79,...",83912.69
1052351,a37e1fb026b80a6d_2,a37e1fb026b80a6d,"[876.75, 285.63, 36.98, 10.28]",Moderna,"[876.75, 287.61, 912.34, 285.63, 913.73, 293.9...",380.15
1052352,a37e1fb026b80a6d_3,a37e1fb026b80a6d,"[913.53, 282.86, 32.23, 11.27]",Museet,"[913.53, 284.84, 944.77, 282.86, 945.76, 292.3...",363.23


## Number of images check

In [8]:
images_set = set(img_info_df['id'])
images_with_annotations = set(annotations_df['image_id'])

In [9]:
print(f"Number of images in the dataset: {len(images)}")
print(f"Number of images in the dataset (From img.csv): {len(images_set)}")
print(f"Number of images with annotations (From annot.csv): {len(images_with_annotations)}")

Number of images in the dataset: 25119
Number of images in the dataset (From img.csv): 21778
Number of images with annotations (From annot.csv): 21778


In [10]:
missing_images_cnt = 0

for image in tqdm(images):
    image_id = image.split('.')[0]
    
    if image_id not in images_set:
        missing_images_cnt += 1
        
print(f"Number of images not in img.csv: {missing_images_cnt}")

100%|██████████| 25119/25119 [00:00<00:00, 1144907.98it/s]

Number of images not in img.csv: 3341





In [11]:
images_set == images_with_annotations

True

### Conclusion:
Use images only in img.csv

## Check if all images in img.csv are present in the images directory

In [12]:
images_id = images = [img.split('.')[0] for img in images]
images_id = set(images_id)

In [13]:
list(images_id)[:5]

['14d68006b52caafb',
 '8924f92be9064c4c',
 '7a5a1251c70b93a0',
 '01ab6b284642d087',
 '752df9bf6937c60c']

In [14]:
for img in tqdm(images_set):
    assert img in images_id, f"Image {img} not found in the dataset. Please check the image files."

100%|██████████| 21778/21778 [00:00<00:00, 1679819.64it/s]
