## Práctica 1 - Computer Vision

In [19]:
import os
import gdown

# Create the data directory. exist_ok=True makes this a no-op when it is
# already there and avoids the check-then-create race.
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# Google Drive URL in the direct-download form that gdown expects
url = "https://drive.google.com/uc?id=1iGBv-VT5mm1RiouD-U2qWcU3BYqp2OwE"
zip_filename = "practica_1_dataset.zip"
zip_path = os.path.join(data_dir, zip_filename)

# Download the archive only when it is not already on disk
if not os.path.exists(zip_path):
    downloaded = gdown.download(url, zip_path, quiet=False)
    # NOTE(review): some gdown releases report failure by returning None
    # instead of raising; fail loudly here so later cells do not run
    # against a missing archive.
    if downloaded is None:
        raise RuntimeError(f"Could not download dataset from {url}")
else:
    print("Data zipfile already exists")


Data zipfile already exists


In [43]:
import os
from pathlib import Path
from zipfile import ZipFile
from concurrent.futures import ThreadPoolExecutor

# Locations of the downloaded archive and of the split folders it contains.
data_dir = "data"
zip_filename = "practica_1_dataset.zip"
zip_path = os.path.join(data_dir, zip_filename)
subfolders = ["test", "train", "valid"]
full_paths = [os.path.join(data_dir, folder) for folder in subfolders]

# Extract only when at least one split folder is missing.
if not all(os.path.isdir(path) for path in full_paths):
    with ZipFile(zip_path, 'r') as zf:
        # Skip the macOS resource-fork entries stored under "__MACOSX".
        members = [name for name in zf.namelist() if not name.startswith("__MACOSX")]
        # A single extractall call surfaces any extraction error. The previous
        # thread-pool version never inspected its futures, so failures (for
        # example, directory-creation races between workers) were silently lost.
        zf.extractall(path=data_dir, members=members)
else:
    print("test, train and valid folders already exist")

In [21]:
import tensorflow as tf

# For every split, list its JPEG files in a deterministic order and report
# how many there are together with the dataset's element spec.
for folder in subfolders:
    folder_path = os.path.join(data_dir, folder)
    ds_files = tf.data.Dataset.list_files(f"{folder_path}/*.jpg", shuffle=False)
    print(
        f"Total image files in {folder}: {len(ds_files)}",
        f"Filenames shape: {ds_files.element_spec}",
        sep="\n",
    )

# Peek at one filename from the dataset built in the last loop iteration.
[example] = ds_files.take(1).as_numpy_iterator()
example

Total image files in test: 63
Filenames shape: TensorSpec(shape=(), dtype=tf.string, name=None)
Total image files in train: 448
Filenames shape: TensorSpec(shape=(), dtype=tf.string, name=None)
Total image files in valid: 127
Filenames shape: TensorSpec(shape=(), dtype=tf.string, name=None)


b'data/valid/IMG_2277_jpeg_jpg.rf.86c72d6192da48d941ffa957f4780665.jpg'

In [22]:
import pandas as pd
# Load the annotation table for the test split.
df = pd.read_csv("data/test/annotations.csv")

# Number of annotation rows for each (filename, class) pair.
count_class = df.groupby(by=["filename", "class"]).size()

print(f"{count_class.shape = }")
count_class.head()

count_class.shape = (83,)


filename                                                   class  
IMG_2289_jpeg_jpg.rf.fe2a7a149e7b11f2313f5a7b30386e85.jpg  puffin      1
IMG_2301_jpeg_jpg.rf.2c19ae5efbd1f8611b5578125f001695.jpg  penguin    23
IMG_2319_jpeg_jpg.rf.6e20bf97d17b74a8948aa48776c40454.jpg  penguin     8
IMG_2347_jpeg_jpg.rf.7c71ac4b9301eb358cd4a832844dedcb.jpg  penguin     2
IMG_2354_jpeg_jpg.rf.396e872c7fb0a95e911806986995ee7a.jpg  penguin     5
dtype: int64

In [23]:
# Box area = width * height, computed from the corner coordinates.
df["area"] = df["xmax"].sub(df["xmin"]).mul(df["ymax"].sub(df["ymin"]))

# Total annotated area for each (filename, class) pair.
sum_area = df.groupby(by=["filename", "class"])["area"].agg("sum")

print(f"{sum_area.shape = }")
sum_area.head()

sum_area.shape = (83,)


filename                                                   class  
IMG_2289_jpeg_jpg.rf.fe2a7a149e7b11f2313f5a7b30386e85.jpg  puffin      94864
IMG_2301_jpeg_jpg.rf.2c19ae5efbd1f8611b5578125f001695.jpg  penguin     32549
IMG_2319_jpeg_jpg.rf.6e20bf97d17b74a8948aa48776c40454.jpg  penguin     29583
IMG_2347_jpeg_jpg.rf.7c71ac4b9301eb358cd4a832844dedcb.jpg  penguin    250311
IMG_2354_jpeg_jpg.rf.396e872c7fb0a95e911806986995ee7a.jpg  penguin     14881
Name: area, dtype: int64

In [24]:
# Score of a class within an image: total box area times number of boxes.
# Both operands share the (filename, class) index, so the product aligns
# row by row; reset_index turns that index back into ordinary columns.
score = (sum_area * count_class).rename("score").reset_index()

print(f"{score.shape = }")
score.head()

score.shape = (83, 3)


Unnamed: 0,filename,class,score
0,IMG_2289_jpeg_jpg.rf.fe2a7a149e7b11f2313f5a7b3...,puffin,94864
1,IMG_2301_jpeg_jpg.rf.2c19ae5efbd1f8611b5578125...,penguin,748627
2,IMG_2319_jpeg_jpg.rf.6e20bf97d17b74a8948aa4877...,penguin,236664
3,IMG_2347_jpeg_jpg.rf.7c71ac4b9301eb358cd4a8328...,penguin,500622
4,IMG_2354_jpeg_jpg.rf.396e872c7fb0a95e911806986...,penguin,74405


In [35]:
# Assign each image the class with the highest score.
#
# groupby("filename").max() would take the maximum of every column
# independently, so "class" would become the alphabetically last class name
# present in the image rather than the class of the top-scoring row.
# idxmax on "score" selects the row that actually holds the per-image maximum
# (the first such row when scores tie).
df_labels = (
    score.loc[score.groupby("filename")["score"].idxmax(), ["filename", "class"]]
    .set_index("filename")
)
print(f"{df_labels.shape = }")
df_labels.head()

df_labels.shape = (63, 1)


Unnamed: 0_level_0,class
filename,Unnamed: 1_level_1
IMG_2289_jpeg_jpg.rf.fe2a7a149e7b11f2313f5a7b30386e85.jpg,puffin
IMG_2301_jpeg_jpg.rf.2c19ae5efbd1f8611b5578125f001695.jpg,penguin
IMG_2319_jpeg_jpg.rf.6e20bf97d17b74a8948aa48776c40454.jpg,penguin
IMG_2347_jpeg_jpg.rf.7c71ac4b9301eb358cd4a832844dedcb.jpg,penguin
IMG_2354_jpeg_jpg.rf.396e872c7fb0a95e911806986995ee7a.jpg,penguin


In [48]:
# Compare the number of index entries with the number of distinct filenames;
# equal shapes mean every image received exactly one label.
(df_labels.index.shape, df_labels.index.unique().shape)

((63,), (63,))

In [57]:
import pathlib
import shutil
# Move every labelled test image into a sub-folder named after its class,
# which is the layout image_dataset_from_directory expects.
#
# NOTE: this rebinds data_dir (a plain "data" string in earlier cells) to the
# test-split path.
data_dir = pathlib.Path("data/test/")

# The previous version created all class folders and only then checked whether
# they existed, so the check always passed and nothing was ever moved on a
# fresh run. Deciding per file keeps the cell idempotent: an image is moved
# only while it still sits directly under data_dir.
moved = 0
for filename, class_name in df_labels["class"].items():
    class_dir = data_dir / class_name
    class_dir.mkdir(parents=True, exist_ok=True)
    source = data_dir / filename
    if source.is_file():
        shutil.move(str(source), str(class_dir / filename))
        moved += 1

if moved == 0:
    print("Folders already organized")
df_labels.head()

Folders already organized


Unnamed: 0_level_0,class
filename,Unnamed: 1_level_1
IMG_2289_jpeg_jpg.rf.fe2a7a149e7b11f2313f5a7b30386e85.jpg,puffin
IMG_2301_jpeg_jpg.rf.2c19ae5efbd1f8611b5578125f001695.jpg,penguin
IMG_2319_jpeg_jpg.rf.6e20bf97d17b74a8948aa48776c40454.jpg,penguin
IMG_2347_jpeg_jpg.rf.7c71ac4b9301eb358cd4a832844dedcb.jpg,penguin
IMG_2354_jpeg_jpg.rf.396e872c7fb0a95e911806986995ee7a.jpg,penguin


In [62]:
# Build the test dataset from the per-class folder layout: images are resized
# to 224x224, labels are integer class indices, one image per batch.
data_dir = pathlib.Path("data/test/")
ds_test = tf.keras.utils.image_dataset_from_directory(
    directory=data_dir,
    label_mode="int",
    batch_size=1,
    image_size=(224, 224),
)

# Fetch a single batch and show the shapes of its image and label tensors.
image, label = next(iter(ds_test.take(1)))
print(image.shape, label.shape)

Found 63 files belonging to 7 classes.
(1, 224, 224, 3) (1,)


In [53]:
# Show the tensor specs (shape and dtype) of the (images, labels) pairs that
# ds_test yields.
ds_test.element_spec

(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None),
 TensorSpec(shape=(None,), dtype=tf.int32, name=None))

```python
# Draft of a manual tf.data input pipeline for the test split.
# NOTE(review): get_image and get_label are not defined in the visible part of
# the notebook; their contracts need to be confirmed before running this.
AUTOTUNE = tf.data.AUTOTUNE

# NOTE(review): this pattern only matches JPEGs directly under data/test/.
# An earlier cell moves the images into per-class sub-folders, so the pattern
# may match nothing afterwards; confirm which layout this draft targets.
ds_files = tf.data.Dataset.list_files("data/test/" + '*.jpg', shuffle=False)
df = pd.read_csv("data/test/annotations.csv")
# Store the class column as a pandas categorical.
df["class"] = pd.Categorical(df["class"])
# NOTE(review): cache() sits after shuffle() and before map(), so it caches
# the filename elements in their shuffled order rather than the mapped
# (image, label) pairs; confirm that this ordering is intended.
ds_images = (
    ds_files
    .shuffle(len(ds_files))
    .cache()
    .map(lambda x: (get_image(x), get_label(x, df)), num_parallel_calls=AUTOTUNE)
)

print("Total images:", len(ds_images))
print("Image shape:", ds_images.element_spec)