In [None]:
import os
import warnings

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Never truncate cell contents when rendering frames, so long string values
# (such as the image paths inspected below) are shown in full.
pd.options.display.max_colwidth = None

In [None]:
# Load the pickled training frame (presumably written by an earlier pipeline
# step -- only unpickle files from a trusted source).
df_train = pd.read_pickle(filepath_or_buffer="intermediate/train_df.pkl")

In [None]:
def auto_crop_black_borders(img, threshold=10):
    """
    Trim near-black margins from every side of an image.

    The result is the tightest axis-aligned box that still contains every
    pixel whose intensity is strictly greater than ``threshold``.

    Parameters:
        img: Input image (NumPy array). A 3-dimensional array is converted to
            grayscale with cv2.COLOR_BGR2GRAY for the brightness test; any
            other array is tested directly.
        threshold: Intensities less than or equal to this value count as black.

    Returns:
        The cropped slice of ``img``, or ``img`` itself when no pixel exceeds
        the threshold.
    """
    intensity = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if img.ndim == 3 else img

    # True wherever the pixel is brighter than the black cut-off.
    bright = intensity > threshold
    rows_with_content = bright.any(axis=1)
    cols_with_content = bright.any(axis=0)

    if not (rows_with_content.any() and cols_with_content.any()):
        # Entirely black: there is no content box to crop to.
        return img

    row_hits = np.flatnonzero(rows_with_content)
    col_hits = np.flatnonzero(cols_with_content)
    top, bottom = row_hits[0], row_hits[-1]
    left, right = col_hits[0], col_hits[-1]

    # Upper bounds are inclusive indices, hence the +1 in the slices.
    return img[top:bottom + 1, left:right + 1]

In [None]:
def split_by_color_dominance(df, path_col='path', threshold=0.25):
    """
    Split a frame of image paths by how balanced each image's colors are.

    For every row the image at ``row[path_col]`` is read with OpenCV, its
    black borders are cropped, and for each of the red, green and blue
    channels the mean per-pixel share ``channel / (b + g + r)`` is computed.
    A row is "good" when the smallest of the three mean shares is strictly
    greater than ``threshold``; otherwise it is "bad".

    Rows are selected from ``df`` by position, so both results keep the
    original index, columns and dtypes, even when one side is empty.

    Parameters:
        df: DataFrame holding one image path per row.
        path_col: Name of the column containing the image paths.
        threshold: Minimum mean channel share required of every channel.

    Returns:
        Tuple ``(good_df, bad_df)`` of new DataFrames. Rows whose image could
        not be read appear in neither frame; a warning reports how many were
        skipped.
    """
    good_pos, bad_pos = [], []
    unreadable = 0

    for pos, path in enumerate(tqdm(df[path_col], total=len(df))):
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals a missing/corrupt file by returning None.
            unreadable += 1
            continue

        img = auto_crop_black_borders(img)
        b, g, r = cv2.split(img)
        # Promote to float32 before summing so the uint8 channels cannot wrap;
        # the epsilon avoids division by zero on pure-black pixels.
        total = b.astype(np.float32) + g + r + 1e-5
        ratios = [np.mean(c / total) for c in (r, g, b)]

        (good_pos if min(ratios) > threshold else bad_pos).append(pos)

    if unreadable:
        warnings.warn(
            f"{unreadable} image(s) could not be read and were left out of "
            "both result frames",
            stacklevel=2,
        )

    # Positional selection keeps dtypes and columns intact (rebuilding from
    # iterrows() Series does not); .copy() hands back independent frames.
    return df.iloc[good_pos].copy(), df.iloc[bad_pos].copy()

In [None]:
# Partition the training rows using the default path column and threshold.
clean_df, removed_df = split_by_color_dominance(df=df_train)

In [None]:
# Persist the rows that passed the color-dominance filter.
clean_df.to_pickle(path="intermediate/train_df_clean.pkl")

In [None]:
# Inspect the rows that passed the color-dominance filter.
display(clean_df)

In [None]:
# Manual review of the rejected rows: a few (2 or 3) examples should not have
# been removed, but overall the split looks acceptable.
display(removed_df)

In [None]:
# Preview one sample image. OpenCV loads pixels in BGR channel order (and
# returns None if the file cannot be read), so the channels are reordered
# before handing the array to matplotlib.
equi = cv2.imread('images/austria/1741687685_46.9949153_10.2621859.jpg')

img = cv2.cvtColor(equi, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB

# Draw the image with the axes hidden.
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
# Recompute the mean channel shares for one image, following the same steps
# as split_by_color_dominance (including the black-border crop) so the printed
# values are the ones the filter compares against its threshold.
img = cv2.imread('images/france/1741690227_46.0110081_6.5302306.jpg')
assert img is not None, "image could not be read"
img = auto_crop_black_borders(img)

b, g, r = cv2.split(img)
# float32 promotion prevents uint8 wrap-around; epsilon guards black pixels.
total = b.astype(np.float32) + g + r + 1e-5
ratios = [np.mean(c / total) for c in (r, g, b)]  # order: red, green, blue

print(ratios)

In [None]:
# Recompute the mean channel shares for one image, following the same steps
# as split_by_color_dominance (including the black-border crop) so the printed
# values are the ones the filter compares against its threshold.
img = cv2.imread('images/italy/1741690057_45.4795205_7.1412341.jpg')
assert img is not None, "image could not be read"
img = auto_crop_black_borders(img)

b, g, r = cv2.split(img)
# float32 promotion prevents uint8 wrap-around; epsilon guards black pixels.
total = b.astype(np.float32) + g + r + 1e-5
ratios = [np.mean(c / total) for c in (r, g, b)]  # order: red, green, blue

print(ratios)