In [None]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.cluster import DBSCAN, KMeans
from sklearn.model_selection import train_test_split

# Check image coordinates

- Images with very similar coordinates are almost identical, so they must end up in the same split (all in train or all in test) to avoid data leakage
- Check image locations geographically (e.g. mountainous areas in France, Switzerland, Austria, Italy are likely very close together)

In [None]:
# Build a DataFrame with file path, country and coordinates of every image.
#
# Expected layout: images/<country>/.../<name>_<latitude>_<longitude>.jpg
IMAGE_DIR = "images"
IMAGE_EXT = ".jpg"
RECORD_COLUMNS = ["filename", "full_path", "country", "latitude", "longitude"]


def parse_image_record(image_dir, root, filename):
    """Turn one image file into a record dict.

    Parameters
    ----------
    image_dir : str
        Top-level directory that was passed to ``os.walk``.
    root : str
        Directory (as yielded by ``os.walk``) that contains ``filename``.
    filename : str
        File name ending in ``IMAGE_EXT``; latitude and longitude are the
        second and third ``_``-separated fields of the name.

    Returns
    -------
    dict
        Keys: filename, full_path, country, latitude, longitude.

    Raises
    ------
    IndexError
        If the file name has fewer than three ``_``-separated fields.
    ValueError
        If a coordinate is not a valid float, or the file sits directly in
        ``image_dir`` (no country folder to read the label from).
    """
    # The country is the first folder below image_dir. Using relpath + os.sep
    # instead of splitting on "/" keeps this working with Windows separators.
    rel_dir = os.path.relpath(root, image_dir)
    if rel_dir == os.curdir:
        raise ValueError("file is not inside a country folder")
    country = rel_dir.split(os.sep)[0]

    parts = filename[: -len(IMAGE_EXT)].split("_")
    return {
        "filename": filename,
        "full_path": os.path.join(root, filename),
        "country": country,
        "latitude": float(parts[1]),
        "longitude": float(parts[2]),
    }


def collect_image_records(image_dir=IMAGE_DIR):
    """Walk ``image_dir`` and return one record per parseable image file.

    Directories and files are visited in sorted order so that the row order
    (and therefore everything derived from it: cluster ids, the seeded
    train/test split) does not depend on the filesystem's listing order.
    Files that cannot be parsed are reported and skipped.
    """
    records = []
    for root, dirs, files in os.walk(image_dir):
        dirs.sort()  # in-place sort steers the order of the top-down walk
        for filename in sorted(files):
            if not filename.endswith(IMAGE_EXT):
                continue
            try:
                records.append(parse_image_record(image_dir, root, filename))
            except (IndexError, ValueError) as e:
                print(f"Skipping {filename}: {e}")
    return records


data = collect_image_records()

# Passing the columns explicitly keeps the schema intact even if no image was found.
df = pd.DataFrame(data, columns=RECORD_COLUMNS)

In [None]:
# Preview the first rows to sanity-check the parsed filename, path, country and coordinate columns.
df.head()

In [None]:
# (number of images, number of columns) of the parsed image table.
df.shape

In [None]:
# Share of images per country (normalize=True returns fractions instead of counts).
df['country'].value_counts(normalize = True)

In [None]:
# World map of every image location, all in one colour.
# Observation: +/- 8 regions can be identified (some a lot larger
# (more images - more variable) than others).
all_locations = go.Scattergeo(
    lat=df["latitude"],
    lon=df["longitude"],
    mode="markers",
    marker={"color": "#4287f5"},
)

# Hint: add geo_scope='europe' to the layout to zoom in on Europe.
fig = go.Figure(data=[all_locations], layout={"height": 500})

fig.show()

# Make clusters based on proximity

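To avoid data leakage, images taken close together must all land in the same split (either train or test).
<br>Use a distance threshold of 100 m: images within 100 m of another image are grouped into the same local cluster.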

In [None]:
# Group images that lie within LOCAL_RADIUS_M of each other into "local clusters".
# The haversine metric works on (latitude, longitude) in radians and measures
# distance on the unit sphere, so the radius in metres is divided by the Earth radius.
LOCAL_RADIUS_M = 100
EARTH_RADIUS_M = 6371000

coords_rad = np.radians(df[["latitude", "longitude"]])

# min_samples=1: every image belongs to a cluster, isolated images form their own.
db_local = DBSCAN(
    eps=LOCAL_RADIUS_M / EARTH_RADIUS_M,
    min_samples=1,
    metric="haversine",
)
db_local.fit(coords_rad)
df["local_cluster"] = db_local.labels_

In [None]:
# Spot check: the 10 images with the smallest longitude among those whose path contains
# 'argentina', to inspect which local_cluster ids neighbouring images received.
df[df['full_path'].str.contains('argentina')].sort_values('longitude').head(10)

In [None]:
# Number of distinct local clusters (the units that are later split into train / test).
df["local_cluster"].nunique()

# Create clusters distinguishable on world map

In [None]:
# Coarse geographic regions via k-means on the raw (latitude, longitude) degrees.
# With 8 clusters, South-America is split up rather than considering
# Mauritius / Reunion as a separate region.
# NOTE(review): the default of n_init differs between scikit-learn versions - consider
# pinning it so the regions are identical everywhere.
kmeans = KMeans(
    n_clusters=8,
    random_state=16,
)

region_features = df[["latitude", "longitude"]]
kmeans.fit(region_features)
df["region_cluster"] = kmeans.labels_

In [None]:
# Manually give the images inside this latitude/longitude box their own region id (8);
# all other rows keep the region found by k-means.
# NOTE(review): the box presumably covers Mauritius / Reunion - confirm.
in_island_box = (
    df["latitude"].between(-22, -18)
    & df["longitude"].between(52, 58)
)
df["region_cluster"] = np.where(in_island_box, 8, df["region_cluster"])

In [None]:
# One hex colour per region id; ids beyond the palette length wrap around.
color_palette = [
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#f5da42",
    "#8c564b",
    "#e377c2",
    "#42e3f5",
    "#b146e3",
]


def _region_color(region_id):
    """Return the palette colour for a region id (modulo the palette size)."""
    return color_palette[region_id % len(color_palette)]


df["color"] = df["region_cluster"].map(_region_color)

In [None]:
# Inspect the rows that carry region id 8.
df[df['region_cluster'] == 8]

In [None]:
# Preview the table again to check the columns that were added to df since it was built.
df.head()

In [None]:
# Share of images per region (fractions, not counts) - shows how unbalanced the regions are.
df['region_cluster'].value_counts(normalize = True)

In [None]:
# World map of the image locations, coloured by region; hovering shows the region id.
region_points = go.Scattergeo(
    lat=df["latitude"],
    lon=df["longitude"],
    mode="markers",
    text=df["region_cluster"],
    marker={"color": df["color"]},
)

# Hint: add geo_scope='europe' to the layout to zoom in on Europe.
fig = go.Figure(data=[region_points], layout={"height": 500})

fig.show()

# Split train and test set (stratified)

In [None]:
# Stratification needs one country label per local cluster, so collapse df to
# one row per local_cluster, taking the first value of every column.
per_cluster = df.groupby("local_cluster")
cluster_df = per_cluster.first().reset_index()

cluster_df.head()

In [None]:
# Split at local-cluster level (80% train / 20% test, stratified by country) so that
# images of the same local cluster can never be divided over both sets.
train_clusters, test_clusters = train_test_split(
    cluster_df["local_cluster"],
    test_size=0.2,
    stratify=cluster_df["country"],
    random_state=16,
)

# Every image inherits the set of its local cluster.
in_train = df["local_cluster"].isin(train_clusters)
df["set"] = np.where(in_train, "train", "test")

In [None]:
# Training images: every row whose local cluster was assigned to 'train'.
is_train = df["set"].eq("train")
df_train = df.loc[is_train]
df_train.shape

In [None]:
# Test images: every row whose local cluster was assigned to 'test'.
is_test = df["set"].eq("test")
df_test = df.loc[is_test]
df_test.shape

In [None]:
# Persist both splits for the downstream notebooks.
# Create the output folder first: to_pickle does not create missing directories,
# so on a fresh checkout this cell would otherwise fail.
os.makedirs("intermediate", exist_ok=True)
df_train.to_pickle("intermediate/df_train.pkl")
df_test.to_pickle("intermediate/df_test.pkl")

# Compare distribution of countries in train and test set

In [None]:
# Train set: number of distinct local clusters per country, as a percentage of all
# train clusters. This is the level at which the split was stratified.
cluster_counts_train = (
    df_train
    .groupby("country")["local_cluster"]
    .nunique()
)
cluster_pct_train = cluster_counts_train.div(cluster_counts_train.sum()).mul(100)
cluster_pct_train.round(2).sort_values(ascending=False)

In [None]:
# Test set: number of distinct local clusters per country, as a percentage of all
# test clusters. This is the level at which the split was stratified.
cluster_counts_test = (
    df_test
    .groupby("country")["local_cluster"]
    .nunique()
)
cluster_pct_test = cluster_counts_test.div(cluster_counts_test.sum()).mul(100)
cluster_pct_test.round(2).sort_values(ascending=False)

In [None]:
# Image-level (not cluster-level) share of each country in the train set.
df_train['country'].value_counts(normalize = True)

In [None]:
# Image-level (not cluster-level) share of each country in the test set.
# Finding: huge difference for New Zealand compared with the train set. Cause: many
# 'duplicate' images - some are exact copies.
df_test['country'].value_counts(normalize = True)