In [None]:
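# We're going to pull all the images and keep only those taken between
# mid-May and mid-June (summer, when the ice melts off/floats away),
# then run an unsupervised clustering on them and inspect the results.
# NOTE: the date filter further down currently runs from 15 May to 15 July,
# not mid-June -- confirm which end date is intended.

# After that, we'll run the unsupervised clustering and similarity search from
# notebooks 2/3, updating the paths as needed.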

In [None]:
!pip install torch torchvision timm --quiet

import os
import shutil
from pathlib import Path
from datetime import datetime
import pandas as pd
from torchvision import transforms
from tqdm import tqdm
import timm
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import torch
from PIL import Image
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# output dir
iceberg_season = Path("./iceberg_season")
iceberg_season.mkdir(parents=True, exist_ok=True)

# images
source_1 = Path("./image_data/halo_filtered")
source_2 = Path("./image_data_b/halo_filtered")

# list of image paths
files_1 = list(source_1.glob("*.jpg"))
files_2 = list(source_2.glob("*.jpg"))
all_files = files_1 + files_2 

# dataframe to hold file names/paths
df = pd.DataFrame({
    'filename': [f.name for f in all_files],
    'fullpath': [f for f in all_files]
})

# we need to take the timestamp out of the file name and
# convert it to a datetime object
df['ts_str'] = df['filename'].str.slice(0,15) # this is the yyyymmdd_hhmmss format
df['ts'] = pd.to_datetime(df['ts_str'], format='%Y%m%d_%H%M%S', errors = 'coerce')

# drop rows that fail
df = df.dropna(subset=['ts'])

# using arbitrary fake year for easy comparison later
df['month_day'] = df['ts'].apply(lambda x: x.replace(year=2000))

# iceberg season range
season_start = pd.Timestamp("2000-05-15")
season_end = pd.Timestamp("2000-07-15")

# filter for files within the season
mask = (df['month_day'] >= season_start) & (df['month_day'] <= season_end)
filtered = df[mask].reset_index(drop = True)

# get the image paths for use in the unsupervised model
in_season = filtered['fullpath'].tolist() # a list of paths "in season"


In [None]:
# Unsupervised clustering using DINO ViT embeddings.

# Settings for the ViT.
# 224 is a multiple of the model's 16-pixel patch size, so the whole resized
# image is covered by patches (the earlier value, 244, is not a multiple of 16).
image_size = 224
num_clusters = 5

# Load the pretrained DINO ViT-S/16 through torch.hub and switch it to
# inference mode.
model = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')
model.eval()

# Preprocessing applied to every image before it is fed to the model.
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    # ImageNet channel statistics, matching the normalisation used by the DINO
    # reference training/evaluation code.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Feature extraction: one embedding per image that could be read.
features = []
embedded_paths = []  # paths whose embedding made it into `features`, same order

for path in tqdm(in_season):
    try:
        # Context manager closes the underlying file as soon as it is decoded.
        with Image.open(path) as img:
            tensor = transform(img.convert("RGB")).unsqueeze(0)
        with torch.no_grad():
            feat = model(tensor).squeeze().numpy()
    except Exception as e:
        # Best effort: report the unreadable image and carry on with the rest.
        print(f"Skipping {path}: {e}")
        continue
    features.append(feat)
    embedded_paths.append(path)

# Keep `in_season` aligned one-to-one with `features`: anything that pairs paths
# with per-feature results (e.g. cluster labels) would otherwise be shifted by
# every skipped image.
in_season = embedded_paths

In [None]:
# K-means clustering of the image embeddings.
X = np.array(features)

# Fail early with a readable message instead of an error from inside scikit-learn.
if X.ndim != 2 or X.shape[0] < num_clusters:
    raise ValueError(
        f"Need at least {num_clusters} embedded images to form {num_clusters} "
        f"clusters, but the feature array has shape {X.shape}."
    )

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering reproducible across versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
labels = kmeans.fit_predict(X)

# zip() below would silently pair paths with the wrong labels if some images
# were skipped during feature extraction, so refuse to continue in that case.
if len(in_season) != len(labels):
    raise RuntimeError(
        f"{len(in_season)} image paths but {len(labels)} cluster labels: paths "
        "and features are out of sync, so images would be filed under the "
        "wrong cluster."
    )

# One sub-folder per cluster under <iceberg_season>/clusters.
output_dir = Path(iceberg_season) / "clusters"
for cluster_id in range(num_clusters):
    (output_dir / f"cluster_{cluster_id}").mkdir(parents=True, exist_ok=True)

# Copy every image into the folder of the cluster it was assigned to.
# NOTE(review): files are stored under their bare file name, so two inputs with
# the same name that land in the same cluster would overwrite each other --
# confirm names are unique across the source folders.
for path, label in tqdm(zip(in_season, labels), total=len(in_season)):
    dst = output_dir / f"cluster_{label}" / Path(path).name
    shutil.copy(path, dst)

In [None]:
# Graph to see how the embeddings cluster: PCA first, then t-SNE down to 2-D.
n_samples, n_features = X.shape

# PCA cannot return more components than min(n_samples, n_features) and t-SNE
# needs perplexity < n_samples, so the usual 50 / 30 are capped for small image
# sets instead of raising.
# `pca` and `tsne` hold the projected arrays, not the fitted estimators.
pca = PCA(n_components=min(50, n_samples, n_features)).fit_transform(X)
tsne = TSNE(
    n_components=2,
    perplexity=min(30, n_samples - 1),
    random_state=42,
).fit_transform(pca)

fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(tsne[:, 0], tsne[:, 1], c=labels, cmap="tab10", s=10)
ax.set_title("t-SNE of DINO Embeddings")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
# Cluster ids are categories, so show them as a legend rather than a
# continuous colour bar.
ax.legend(*scatter.legend_elements(), title="Cluster")
plt.show()