### Homework Nr. 4
#### Tasks:
1. Complete week 2 of third course and week 1 of fourth course from specialization;
2. Complete the practical task.

#### Practical Task:
This week we will do something very different: instead of using a model-centric approach (meaning the
data is fixed and we change the model and its hyperparameters), we will take a data-centric approach (meaning
the model is fixed and we change the data and its preprocessing steps).
We have a dataset similar to the MNIST digits dataset, but with Roman numerals.

Your task is to:
1. Analyze the dataset
2. Clean the dataset, fix incorrect labels
3. Add your own data (your images, image augmentation, etc.).
    If you choose to create your own data, you may find this script helpful for converting your images: convert.py
4. Decide your train/val splits
5. Don't touch the test set, it's a fixed set for evaluation
6. Train and test the model (run train.py)
7. Iterate over these steps and try to improve the results
8. Draw conclusions and see what the differences are

Good luck!

# Cleaning data

I will use widgets to display the images, select the ones that are bad, and then delete them.

In [2]:
import os
import ipywidgets as widgets
from IPython.display import display, clear_output
from PIL import Image

In [None]:
import os
import ipywidgets as widgets
from IPython.display import display, clear_output

############################
# 0) Configuration
############################
# Root of the training set; each immediate subfolder is scanned as one group.
# Built with os.path.join so the path also resolves on non-Windows systems
# (a literal backslash is only a path separator on Windows).
ROOT_DIR = os.path.join("data", "train")
# Number of images shown per page of the review UI.
CHUNK_SIZE = 12

# If you have a folder name where you last stopped, set it here:
START_FROM_FOLDER = "viii"  # Or None if you want to start from the beginning

############################
# 1) Gather images per folder
############################
def gather_images_by_folder_recursive(root_dir):
    """Map each immediate subfolder of *root_dir* to its sorted image paths.

    Every directory directly under ``root_dir`` becomes one key. Its value is
    the sorted list of full paths of all image files found anywhere below that
    directory (searched recursively). Non-directory entries in ``root_dir``
    are ignored, and a folder without images maps to an empty list.

    Folders are visited in sorted name order so that the iteration order of
    the returned dict is deterministic; ``os.listdir`` on its own returns
    entries in arbitrary order, which would make any "resume from folder X"
    logic based on dict order unreliable.

    Args:
        root_dir: Directory whose subfolders are scanned.

    Returns:
        dict mapping folder name -> sorted list of image file paths.

    Raises:
        OSError: If ``root_dir`` cannot be listed (e.g. it does not exist).
    """
    valid_exts = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'}
    folder_dict = {}

    for folder_name in sorted(os.listdir(root_dir)):
        folder_path = os.path.join(root_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue
        folder_dict[folder_name] = sorted(
            os.path.join(base, file_name)
            for base, _dirs, files in os.walk(folder_path)
            for file_name in files
            # Extensions are compared case-insensitively (".PNG" == ".png").
            if os.path.splitext(file_name)[1].lower() in valid_exts
        )

    return folder_dict

def flatten_dict_to_list(folder_dict, start_from_folder=None):
    """Turn folder_dict into a flat list of (path, folder_name) tuples.

    Folders are taken in the dict's iteration order. If ``start_from_folder``
    is given, every folder before it is skipped and the result starts with
    that folder's images. If it is given but not present in ``folder_dict``,
    a warning is printed and the result covers all folders, as the warning
    text states (previously an empty list was returned in that case).

    Args:
        folder_dict: Mapping of folder name -> list of image paths.
        start_from_folder: Folder name to resume from, or None to start from
            the beginning.

    Returns:
        list of ``(path, folder_name)`` tuples.
    """
    folder_names = list(folder_dict)
    first_idx = 0

    if start_from_folder is not None:
        try:
            first_idx = folder_names.index(start_from_folder)
        except ValueError:
            # Unknown resume point: fall back to the full list rather than
            # silently returning nothing.
            print(f"WARNING: start_from_folder '{start_from_folder}' not found in folder_dict. Starting from the first folder instead.")

    return [
        (path, folder_name)
        for folder_name in folder_names[first_idx:]
        for path in folder_dict[folder_name]
    ]

##########################
# 2) Chunks and UI helpers
##########################

def make_chunks(lst, chunk_size):
    """Yield consecutive slices of *lst*, each at most *chunk_size* items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``; an empty ``lst`` yields nothing.
    """
    offsets = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in offsets)

def create_image_widget(path, max_width=200, max_height=200):
    """Return a ``widgets.Image`` showing the file at *path*.

    The file is read as raw bytes and handed to the widget unchanged. The
    widget's ``format`` is taken from the file extension (lower-cased, with
    ``jpg`` mapped to ``jpeg``); any extension outside the known set falls
    back to ``png``. ``max_width`` / ``max_height`` are applied as pixel
    limits through the widget layout.
    """
    with open(path, 'rb') as image_file:
        raw_bytes = image_file.read()

    _, suffix = os.path.splitext(path)
    image_format = suffix.lower().lstrip('.')
    if image_format == 'jpg':
        image_format = 'jpeg'
    if image_format not in ('png', 'jpeg', 'gif', 'bmp', 'tiff'):
        image_format = 'png'

    return widgets.Image(
        value=raw_bytes,
        format=image_format,
        layout=widgets.Layout(
            max_width=f'{max_width}px',
            max_height=f'{max_height}px',
        ),
    )

def create_item_widget(path, folder_name):
    """Build the review tile for a single image.

    The tile is a vertical box holding, top to bottom: a label with the
    folder name, the image itself, and an unchecked 'Bad?' checkbox.

    Returns:
        Tuple ``(vbox, checkbox, path, folder_name)`` so callers can both
        display the tile and later read the checkbox state for that path.
    """
    tile_parts = [
        widgets.Label(value=folder_name),
        create_image_widget(path),
        widgets.Checkbox(value=False, description='Bad?'),
    ]
    bad_checkbox = tile_parts[-1]
    return (widgets.VBox(tile_parts), bad_checkbox, path, folder_name)

def build_chunk_widgets(images_with_folders):
    """Create the item widgets for every image, grouped into pages.

    ``images_with_folders`` is a list of ``(path, folder_name)`` tuples. It is
    split into pages of ``CHUNK_SIZE`` entries, and each entry is turned into
    the tuple returned by ``create_item_widget``.

    Returns:
        list of pages, each page being a list of item-widget tuples.
    """
    return [
        [create_item_widget(image_path, label) for (image_path, label) in page]
        for page in make_chunks(images_with_folders, CHUNK_SIZE)
    ]

##########################
# 3) Page management & buttons
##########################
# Module-level UI state shared by the page renderer and the button callbacks.
# One entry per page; each page is a list of
# (vbox, checkbox, path, folder_name) tuples. Starts out empty.
chunk_widgets = []
# Index into chunk_widgets of the page currently shown.
current_chunk_idx = 0
# Container whose children are replaced with the rows of the current page.
image_display_box = widgets.VBox()

def update_page(page_idx):
    """Redraw the output with the navigation bar and page *page_idx*.

    The previous output is cleared and the navigation bar is shown again.
    If there are no pages at all, a notice is printed and nothing else is
    drawn. Otherwise the tiles of the requested page are laid out in rows of
    up to four and shown inside ``image_display_box``.
    """
    clear_output(wait=True)
    display(nav_box)

    if not chunk_widgets:
        print("No images in chunk_widgets.")
        return

    tiles_per_row = 4
    # Only the VBox (first tuple element) of each item is needed for layout.
    tiles = [vbox for (vbox, _checkbox, _path, _folder) in chunk_widgets[page_idx]]
    rows = [
        widgets.HBox(tiles[start:start + tiles_per_row])
        for start in range(0, len(tiles), tiles_per_row)
    ]
    image_display_box.children = tuple(rows)
    display(image_display_box)

def on_prev_click(b):
    """Button callback: step back one page (if not on the first) and redraw.

    ``b`` is the argument supplied by the button click and is not used.
    """
    global current_chunk_idx
    has_previous_page = current_chunk_idx > 0
    if has_previous_page:
        current_chunk_idx = current_chunk_idx - 1
    update_page(current_chunk_idx)

def on_next_click(b):
    """Button callback: step forward one page (if not on the last) and redraw.

    ``b`` is the argument supplied by the button click and is not used.
    """
    global current_chunk_idx
    last_page_idx = len(chunk_widgets) - 1
    if current_chunk_idx < last_page_idx:
        current_chunk_idx = current_chunk_idx + 1
    update_page(current_chunk_idx)

def on_delete_click(b):
    """Button callback: delete the images ticked as bad on the current page.

    For every item on the current page whose checkbox is checked, the file is
    removed from disk. Successfully removed items are dropped from the page;
    items whose deletion failed stay on the page. The page is then redrawn
    and a summary (plus one line per failure) is printed.

    The messages are printed *after* the redraw because ``update_page``
    starts by clearing the output, which would otherwise discard them.

    ``b`` is the argument supplied by the button click and is not used.
    Does nothing when there are no pages.
    """
    if not chunk_widgets:
        return
    current_items = chunk_widgets[current_chunk_idx]

    deleted_paths = set()
    failure_messages = []
    for (_vbox, checkbox, path, _folder_name) in current_items:
        if not checkbox.value:
            continue
        try:
            os.remove(path)  # Real deletion
        except OSError as e:
            # Keep the item on the page and report the problem afterwards.
            failure_messages.append(f"Failed to delete {path}: {e}")
        else:
            deleted_paths.add(path)

    # Filter out the deleted images from current chunk
    chunk_widgets[current_chunk_idx] = [
        item for item in current_items if item[2] not in deleted_paths
    ]

    update_page(current_chunk_idx)
    for message in failure_messages:
        print(message)
    print(f"Deleted {len(deleted_paths)} image(s).")

# Navigation / deletion controls; each button is wired to its click handler
# right after it is created.
prev_button = widgets.Button(description="< Previous", button_style='info')
prev_button.on_click(on_prev_click)

next_button = widgets.Button(description="Next >", button_style='info')
next_button.on_click(on_next_click)

delete_button = widgets.Button(description="Delete Bad", button_style='danger')
delete_button.on_click(on_delete_click)

# Toolbar holding the three buttons side by side.
nav_box = widgets.HBox([prev_button, next_button, delete_button])


In [4]:
##########################
# 4) Run the pipeline
##########################
# Collect the images of every folder under ROOT_DIR, flatten them into one
# list (skipping all folders before START_FROM_FOLDER), and build the
# per-page widgets from that list.
folder_dict = gather_images_by_folder_recursive(ROOT_DIR)
images_with_folders = flatten_dict_to_list(
    folder_dict,
    start_from_folder=START_FROM_FOLDER,
)
chunk_widgets = build_chunk_widgets(images_with_folders)

# Display the navigation box
display(nav_box)

# Render the first page, or report that there is nothing to review.
if not chunk_widgets:
    print("No images found.")
else:
    update_page(current_chunk_idx)


HBox(children=(Button(button_style='info', description='< Previous', style=ButtonStyle()), Button(button_style…

VBox(children=(HBox(children=(VBox(children=(Label(value='viii'), Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\…

# Augmentation