In [None]:
import math
import os
import shutil

import pandas as pd
from tqdm import tqdm

In [None]:
def parse_filename(filename):
    """Extract ``(lat, lon)`` from an image filename.

    The name is split on ``_``; the second and third fields are read as
    latitude and longitude, e.g. ``"img_48.85_2.35.jpg"`` -> ``(48.85, 2.35)``.
    A trailing ``.jpg`` / ``.jpeg`` extension is stripped case-insensitively
    so that ``.JPG`` files are not silently skipped.

    Args:
        filename: Base name of the image file.

    Returns:
        A ``(lat, lon)`` tuple of floats, or ``(None, None)`` when the name
        does not follow the expected pattern or a coordinate is not a finite
        number (``float()`` happily parses "nan" and "inf", which would break
        the binning and clustering steps further down).
    """
    try:
        stem = filename
        lowered = filename.lower()
        for extension in ('.jpg', '.jpeg'):
            if lowered.endswith(extension):
                stem = filename[:-len(extension)]
                break

        parts = stem.split('_')
        lat = float(parts[1])
        lon = float(parts[2])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Non-string input, too few fields, or non-numeric fields.
        return None, None

    if not (math.isfinite(lat) and math.isfinite(lon)):
        return None, None
    return lat, lon

In [None]:
REGION_GRID_SIZE = 2  # Grid cell size passed to bucket_coord; try 5 or 10 for much coarser bins


def bucket_coord(coord, size):
    """Floor ``coord`` to a multiple of ``size`` and return it as an int.

    Defined in this cell because it is first used here; without it the cell
    raises NameError on a fresh kernel (the only other definition lives in a
    later cell of this notebook).
    """
    return int(coord // size * size)


# Bin the France images by latitude only (longitude is ignored in this cell).
country = 'france'
country_dir = 'images/france'

data = []
for fname in os.listdir(country_dir):
    lat, lon = parse_filename(fname)
    if lat is None or lon is None:
        continue  # filename does not carry usable coordinates
    region = f"{bucket_coord(lat, REGION_GRID_SIZE)}"
    path = os.path.join(country_dir, fname)
    label = f"{country}_{region}"
    data.append((path, country, region, label))

df = pd.DataFrame(data, columns=['path', 'country', 'region', 'country_region'])
display(df)

In [None]:
# Root folder that receives one sub-folder per `country_region` label.
inspection_dir = 'france_region_inspection'
os.makedirs(inspection_dir, exist_ok=True)

# Copy every image into the folder named after its label, keeping the
# original file name so images can be eyeballed region by region.
copy_jobs = zip(df['path'], df['country_region'])
for src_path, region_label in tqdm(copy_jobs, total=len(df), desc="Copying images"):
    region_folder = os.path.join(inspection_dir, region_label)
    os.makedirs(region_folder, exist_ok=True)
    shutil.copyfile(src_path, os.path.join(region_folder, os.path.basename(src_path)))

In [None]:
def _iter_image_coords(root):
    """Yield ``(path, country, lat, lon)`` for each parseable image under ``root/<country>/``."""
    for country_name in os.listdir(root):
        country_dir = os.path.join(root, country_name)
        if os.path.isdir(country_dir):
            for image_name in os.listdir(country_dir):
                lat, lon = parse_filename(image_name)
                # Keep only files whose name yielded both coordinates.
                if lat is not None and lon is not None:
                    yield os.path.join(country_dir, image_name), country_name, lat, lon


data = list(_iter_image_coords('images'))

df = pd.DataFrame(data, columns=['path', 'country', 'lat', 'lon'])
display(df)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# A country whose images span less than this many degrees in BOTH directions
# is kept as a single region.
MIN_LAT_SPAN = 2
MIN_LON_SPAN = 5
MIN_SAMPLES_FOR_CLUSTERING = 10
CANDIDATE_K = range(2, 6)  # k = 2..5; k = 1 is the fallback when nothing is clustered
KMEANS_SEED = 42

country_clusters = {}

# Start every image in a single per-country region so that countries which are
# not clustered below still get a usable label instead of NaN.
df['region_cluster'] = df['country'] + '_cluster0'

for country, group in df.groupby('country', sort=False):
    lat_span = group['lat'].max() - group['lat'].min()
    lon_span = group['lon'].max() - group['lon'].min()
    coords = group[['lat', 'lon']].to_numpy()

    too_small = lat_span < MIN_LAT_SPAN and lon_span < MIN_LON_SPAN
    if too_small or len(coords) < MIN_SAMPLES_FOR_CLUSTERING:
        # Country too small, or not enough data, to cluster.
        country_clusters[country] = 1
        continue

    # Pick k by silhouette score and keep the labels of the winning fit, so the
    # stored clustering is exactly the one that was scored.
    best_k, best_score, best_labels = 1, -1, None
    for k in CANDIDATE_K:
        # n_init is pinned because its default differs between scikit-learn versions.
        kmeans = KMeans(n_clusters=k, random_state=KMEANS_SEED, n_init=10)
        labels = kmeans.fit_predict(coords)
        if len(set(labels)) < 2:
            continue  # silhouette_score needs at least two distinct clusters
        score = silhouette_score(coords, labels)
        if score > best_score:
            best_k, best_score, best_labels = k, score, labels

    if best_labels is not None:
        df.loc[group.index, 'region_cluster'] = [
            f"{country}_cluster{label}" for label in best_labels
        ]

    country_clusters[country] = best_k

print(country_clusters)

In [None]:
# Show the frame again now that the clustering cell has added `region_cluster`.
display(df)

In [None]:
# Persist the clustered frame; create the output folder first so the write
# does not fail on a fresh checkout where `intermediate/` is missing.
output_path = "intermediate/region_clusters.pkl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_pickle(output_path)

In [None]:
inspection_country = 'argentina'
inspection_dir = f'{inspection_country}_region_inspection'  # Where to store visual inspection folders

# Work on a filtered view under its own name: overwriting `df` here would
# discard every other country and make the cell non-repeatable.
country_df = df[df['country'] == inspection_country]

# Rows without a cluster label go to a dedicated folder rather than crashing
# os.path.join with a NaN path component.
region_labels = country_df['region_cluster'].fillna(f'{inspection_country}_unclustered')

# Create inspection folders and copy images
os.makedirs(inspection_dir, exist_ok=True)

for src_path, region_label in tqdm(
    zip(country_df['path'], region_labels),
    total=len(country_df),
    desc="Copying images",
):
    region_folder = os.path.join(inspection_dir, region_label)
    os.makedirs(region_folder, exist_ok=True)

    # Keep the original filename so the image can be traced back to its source.
    dst_path = os.path.join(region_folder, os.path.basename(src_path))
    shutil.copyfile(src_path, dst_path)

# Old approach: bucket images into fixed-size coordinate bins

In [None]:
REGION_GRID_SIZE = 2  # Grid cell size passed to bucket_coord; 5 or 10 gives much coarser bins

def bucket_coord(coord, size):
    """Floor ``coord`` to a multiple of ``size`` and return the result as an int."""
    whole_cells = coord // size
    return int(whole_cells * size)

# Walk images/<country>/ and label each parseable image with its latitude bin.
data = []
for country in os.listdir('images'):
    country_path = os.path.join('images', country)
    if os.path.isdir(country_path):
        for fname in os.listdir(country_path):
            lat, lon = parse_filename(fname)
            if lat is not None and lon is not None:
                region = str(bucket_coord(lat, REGION_GRID_SIZE))
                data.append((
                    os.path.join(country_path, fname),
                    country,
                    region,
                    f"{country}_{region}",
                ))

df = pd.DataFrame(data, columns=['path', 'country', 'region', 'country_region'])
display(df)

In [None]:
REGION_GRID_SIZE = 2  # Grid cell size passed to bucket_coord; 5 or 10 gives much coarser bins

# Label each France image with its "<lat bin>_<lon bin>" grid cell.
country = 'france'
france_dir = 'images/france'

data = []
for fname in os.listdir(france_dir):
    coords = parse_filename(fname)
    if None in coords:
        continue  # at least one coordinate could not be parsed
    region = "_".join(str(bucket_coord(value, REGION_GRID_SIZE)) for value in coords)
    data.append((os.path.join(france_dir, fname), country, region, f"{country}_{region}"))

df = pd.DataFrame(data, columns=['path', 'country', 'region', 'country_region'])
display(df)