In [1]:
### From Raw Sign Images to Import-Ready Clustering JSON

In [2]:
# ============================================
# STEP 0: Setup and Imports
# ============================================

import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from PIL import Image

# Device setup: use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


Using device: cuda


In [4]:
# ============================================
# STEP 1 — Reorganize Dataset: Period → Sign
# ============================================

import os
import shutil

# Paths
src_root = "/home/vim/Desktop/asim_niaz/cuneiform-ocr-data/cropped_signs_metadata/data_full"               # current structure: period/sign/images
dst_root = "/home/vim/Desktop/asim_niaz/cuneiform-ocr-data/cropped_signs_metadata/generating_json/new_structure_data"    # new structure: sign/period/images

# File extensions (compared case-insensitively) that are treated as images.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def _link_or_copy(src_file, dst_file):
    """Make ``dst_file`` refer to ``src_file``: symlink first, copy as a fallback.

    An existing file or valid link at ``dst_file`` is left untouched. A dangling
    symlink (its target no longer exists) is removed and recreated; otherwise
    ``os.symlink`` would raise ``FileExistsError`` and the copy fallback would
    write through the dead link to its stale target.
    """
    if os.path.islink(dst_file) and not os.path.exists(dst_file):
        os.unlink(dst_file)
    elif os.path.lexists(dst_file):
        return

    try:
        # symlink is fast and storage-friendly
        os.symlink(os.path.abspath(src_file), dst_file)
    except OSError:
        # fallback: copy file if symlink not supported (e.g. Windows)
        shutil.copy2(src_file, dst_file)


os.makedirs(dst_root, exist_ok=True)

# Walk period/sign/images and mirror every image into sign/period/images.
for period in sorted(os.listdir(src_root)):
    period_path = os.path.join(src_root, period)
    if not os.path.isdir(period_path):
        continue

    for sign in sorted(os.listdir(period_path)):
        sign_path = os.path.join(period_path, sign)
        if not os.path.isdir(sign_path):
            continue

        # destination folder: <dst_root>/sign/period/
        dst_dir = os.path.join(dst_root, sign, period)
        os.makedirs(dst_dir, exist_ok=True)

        for file_name in os.listdir(sign_path):
            if not file_name.lower().endswith(IMAGE_EXTENSIONS):
                continue
            _link_or_copy(
                os.path.join(sign_path, file_name),
                os.path.join(dst_dir, file_name),
            )

print("Reorganized dataset into sign-first structure at:", dst_root)


Reorganized dataset into sign-first structure at: /home/vim/Desktop/asim_niaz/cuneiform-ocr-data/cropped_signs_metadata/generating_json/new_structure_data


In [5]:
# ============================================
# STEP 2 — Fine-Tune ResNet18 on Sign Classes
# ============================================

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms, datasets
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

# --- CONFIG ---
data_dir = dst_root  # sign-first image root (sign/period/images)
batch_size, num_epochs = 32, 50
lr = 1e-4
patience = 5
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- TRANSFORMS ---
# Every image is resized to 224x224, converted to a tensor and normalised
# per channel with the mean/std below.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
]
transform = transforms.Compose(preprocessing_steps)

# --- DATASET WITH PATHS ---
class ImageFolderWithPaths(datasets.ImageFolder):
    """``ImageFolder`` variant whose items also carry the source file path."""

    def __getitem__(self, index):
        """Return the parent's item tuple with the image's file path appended."""
        return super().__getitem__(index) + (self.imgs[index][0],)

dataset = ImageFolderWithPaths(root=data_dir, transform=transform)
num_classes = len(dataset.classes)
print(f" Found {num_classes} sign classes")

# --- Split dataset ---
# 80/20 train/validation split. The generator is seeded so the partition (and
# with it the early-stopping signal) is identical on every run.
split_seed = 42
val_size = int(0.2 * len(dataset))
train_size = len(dataset) - val_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# --- Model ---
# ImageNet-pretrained ResNet18 with the classifier head replaced by one sized
# for the sign classes.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Freeze most layers except top: only layer4 and the new fc head are trained.
# NOTE(review): model.train() below still switches the BatchNorm layers of the
# frozen blocks to training mode, so their running statistics keep updating.
for name, param in model.named_parameters():
    if not name.startswith("layer4") and not name.startswith("fc"):
        param.requires_grad = False

model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

# --- Early stopping setup ---
best_val_loss = float("inf")
epochs_no_improve = 0
best_model_path = "generating_json/resnet18_sign_model_best.pth"
# Create the checkpoint directory up front; otherwise torch.save would only
# fail after the first full epoch has already been trained.
os.makedirs(os.path.dirname(best_model_path) or ".", exist_ok=True)

print(" Fine-tuning Sign Model...")

for epoch in range(num_epochs):
    # Training loop
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    for imgs, labels, _paths in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # short final batch does not count as much as a full one.
        batch_count = labels.size(0)
        total_loss += loss.item() * batch_count
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += batch_count

    train_acc = 100 * correct / total
    val_loss, val_correct, val_total = 0.0, 0, 0

    # Validation pass: no gradient tracking, BatchNorm in eval mode.
    model.eval()
    with torch.no_grad():
        for imgs, labels, _paths in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            batch_count = labels.size(0)
            val_loss += loss.item() * batch_count
            _, preds = torch.max(outputs, 1)
            val_correct += (preds == labels).sum().item()
            val_total += batch_count

    # Per-sample averages over the whole epoch.
    avg_train_loss = total_loss / total
    avg_val_loss = val_loss / val_total
    val_acc = 100 * val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} Acc: {train_acc:.2f}% | "
          f"Val Loss: {avg_val_loss:.4f} Acc: {val_acc:.2f}%")

    # Early stopping: checkpoint on every validation-loss improvement and stop
    # after `patience` consecutive epochs without one.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), best_model_path)
        print(f" Improved — saved to {best_model_path}")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f" Early stopping at epoch {epoch+1}")
            break


 Found 979 sign classes
 Fine-tuning Sign Model...


Epoch 1/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:53<00:00, 12.73it/s]


Epoch 1/50 | Train Loss: 3.1508 Acc: 37.20% | Val Loss: 2.1106 Acc: 54.60%
 Improved — saved to generating_json/resnet18_sign_model_best.pth


Epoch 2/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:33<00:00, 13.93it/s]


Epoch 2/50 | Train Loss: 1.7495 Acc: 61.17% | Val Loss: 1.6508 Acc: 62.75%
 Improved — saved to generating_json/resnet18_sign_model_best.pth


Epoch 3/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:33<00:00, 13.93it/s]


Epoch 3/50 | Train Loss: 1.1718 Acc: 73.36% | Val Loss: 1.5220 Acc: 65.37%
 Improved — saved to generating_json/resnet18_sign_model_best.pth


Epoch 4/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:33<00:00, 13.92it/s]


Epoch 4/50 | Train Loss: 0.7487 Acc: 83.17% | Val Loss: 1.5111 Acc: 65.68%
 Improved — saved to generating_json/resnet18_sign_model_best.pth


Epoch 5/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:33<00:00, 13.93it/s]


Epoch 5/50 | Train Loss: 0.4411 Acc: 90.69% | Val Loss: 1.5818 Acc: 64.80%


Epoch 6/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:33<00:00, 13.93it/s]


Epoch 6/50 | Train Loss: 0.2602 Acc: 94.82% | Val Loss: 1.6689 Acc: 64.77%


Epoch 7/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:33<00:00, 13.95it/s]


Epoch 7/50 | Train Loss: 0.1644 Acc: 96.81% | Val Loss: 1.7470 Acc: 63.98%


Epoch 8/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:33<00:00, 13.97it/s]


Epoch 8/50 | Train Loss: 0.1099 Acc: 97.95% | Val Loss: 1.8165 Acc: 63.85%


Epoch 9/50 [Train]: 100%|███████████████████████████████████████████████████████████| 2977/2977 [03:33<00:00, 13.93it/s]


Epoch 9/50 | Train Loss: 0.0807 Acc: 98.53% | Val Loss: 1.8891 Acc: 64.00%
 Early stopping at epoch 9


In [6]:
# ============================================
# STEP 3 — Extract Sign Embeddings
# ============================================

from torchvision import models
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm

# Load best weights into a freshly built ResNet18 with the same head size
model = models.resnet18(weights=None)
model.fc = nn.Linear(model.fc.in_features, num_classes)
best_state = torch.load("generating_json/resnet18_sign_model_best.pth", map_location=device)
model.load_state_dict(best_state)
model.to(device)
model.eval()

# Feature extractor: every child module of the network except the final fc layer
backbone_layers = list(model.children())[:-1]
feature_extractor = nn.Sequential(*backbone_layers).to(device)
feature_extractor.eval()

# Data loader (no split, full set); shuffle stays off so rows keep dataset order
full_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

features, sign_labels, periods, paths = [], [], [], []

with torch.no_grad():
    for batch_imgs, batch_lbls, batch_paths in tqdm(full_loader, desc="Extracting Sign Features"):
        pooled = feature_extractor(batch_imgs.to(device))
        # drop the two trailing singleton spatial dimensions
        features.append(pooled.squeeze(-1).squeeze(-1).cpu().numpy())
        for class_idx, img_path in zip(batch_lbls, batch_paths):
            sign_labels.append(dataset.classes[class_idx])
            paths.append(img_path)
            # the period is the name of the folder that directly holds the image
            periods.append(os.path.basename(os.path.dirname(img_path)))

features = np.concatenate(features, axis=0)
np.savez(
    "generating_json/sign_model_embeddings.npz",
    features=features,
    signs=np.array(sign_labels),
    periods=np.array(periods),
    paths=np.array(paths),
)
print(" Saved 'sign_model_embeddings.npz'")


Extracting Sign Features: 100%|█████████████████████████████████████████████████████| 3721/3721 [04:05<00:00, 15.17it/s]


 Saved 'sign_model_embeddings.npz'


In [10]:
#!/usr/bin/env python3
# Sign Form Clustering and Dual JSON Export (paths + base64) from Embeddings
# Field order matches CroppedSignImageSchema

import json
import base64
import uuid
import os
from pathlib import Path
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
from collections import defaultdict
from tqdm import tqdm

# =========================
# Configuration
# =========================

# Input: .npz archive read below for the "features", "signs", "periods" and "paths" arrays.
EMBEDDINGS_PATH = "generating_json/sign_model_embeddings.npz"

# Outputs: the same records written twice, once with the image embedded as
# base64 and once with only the image path.
OUTPUT_PATH_BASE64 = "generating_json/sign_clustering_base64.json"
OUTPUT_PATH_PATHS = "generating_json/sign_clustering_paths.json"

# Within a (sign, period) group, clusters are ranked by size; the first
# MAX_CANONICAL_PER_PERIOD are labelled "canonicalN", the rest "variantN".
MAX_CANONICAL_PER_PERIOD = 2
# A non-canonical cluster is still flagged isMain when it has at least this many members.
MIN_CLUSTER_SIZE_FOR_VARIANT = 10
# Upper bound on the number of KMeans clusters chosen by choose_k().
MAX_K = 6
# Seed passed to KMeans as random_state.
RANDOM_STATE = 42

# =========================
# Helpers
# =========================

def image_to_base64(path: str) -> str:
    """Read the file at ``path`` as raw bytes and return them base64-encoded as text."""
    with open(path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read())
    return encoded.decode("utf-8")


def extract_fragment_number(image_path: str) -> str:
    """Return the part of the file name that precedes its first underscore.

    Example: NCBT.280_sign29_11_Neo-Babylonian.jpg → NCBT.280
    A file name without an underscore is returned whole.
    """
    file_name = os.path.basename(image_path)
    fragment, _separator, _remainder = file_name.partition("_")
    return fragment


def choose_k(n: int) -> int:
    """Pick the number of clusters for a group of ``n`` samples.

    Groups below 20, 50 and 100 samples get 1, 2 and 3 clusters respectively;
    larger groups get ``n // 30`` clusters, capped at ``MAX_K``.
    """
    small_group_tiers = ((20, 1), (50, 2), (100, 3))
    for upper_bound, cluster_count in small_group_tiers:
        if n < upper_bound:
            return cluster_count
    return min(MAX_K, n // 30)

# =========================
# Load embeddings
# =========================

# np.load on an .npz returns a lazily-read archive that keeps the file open;
# the context manager closes it once the arrays have been pulled into memory.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code. Only point EMBEDDINGS_PATH at files this notebook wrote.
with np.load(EMBEDDINGS_PATH, allow_pickle=True) as data:
    features = data["features"]   # (N, D)
    signs = data["signs"]
    periods = data["periods"]
    paths = data["paths"]

print(f"Loaded {len(features)} embeddings")

# =========================
# Group by (sign, period)
# =========================

# Map every (sign, period) pair to the row indices of its embeddings.
group_index = defaultdict(list)
for row, sign_period in enumerate(zip(signs, periods)):
    group_index[sign_period].append(row)

# Output records, filled by the clustering loop below.
docs_base64, docs_paths = [], []

# =========================
# Clustering + JSON creation
# =========================

for (sign, period), idxs in tqdm(group_index.items(), desc="Processing sign-period groups"):
    feats = features[idxs]
    img_paths = paths[idxs]

    n = len(feats)
    k = choose_k(n)

    if k == 1:
        # Too few samples to split: the whole group is a single cluster.
        labels = np.zeros(n, dtype=int)
    else:
        kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
        labels = kmeans.fit_predict(feats)

    # Group local indices by cluster label, keeping only members whose image
    # file exists. Filtering here (rather than at emit time) keeps clusterSize,
    # the size ranking and the representative consistent with the records that
    # are actually written; a cluster with no surviving member is dropped.
    clusters = defaultdict(list)
    for i, lbl in enumerate(labels):
        if Path(img_paths[i]).exists():
            clusters[lbl].append(i)

    # sort clusters by size (largest first)
    sorted_clusters = sorted(
        clusters.items(),
        key=lambda x: len(x[1]),
        reverse=True
    )

    for rank, (_, members) in enumerate(sorted_clusters):
        cluster_feats = feats[members]
        cluster_size = len(members)

        # Representative = member closest (cosine distance) to the cluster mean.
        centroid = cluster_feats.mean(axis=0)
        dists = cosine_distances([centroid], cluster_feats)[0]
        rep_local_idx = int(np.argmin(dists))
        rep_idx = members[rep_local_idx]

        # -------- form labeling --------
        # The largest MAX_CANONICAL_PER_PERIOD clusters are "canonicalN";
        # the remaining ones are numbered "variantN" starting from 1.
        if rank < MAX_CANONICAL_PER_PERIOD:
            form_label = f"canonical{rank + 1}"
        else:
            form_label = f"variant{rank + 1 - MAX_CANONICAL_PER_PERIOD}"

        is_main = (
            form_label.startswith("canonical") or
            cluster_size >= MIN_CLUSTER_SIZE_FOR_VARIANT
        )

        # -------- emit records --------
        for m in members:
            image_path = img_paths[m]
            uid = str(uuid.uuid4())

            # Fields common to both outputs, in schema order after the image field.
            shared_fields = {
                "fragment_number": extract_fragment_number(image_path),

                "sign": sign,
                "period": period,
                "form": form_label,

                "isCentroid": (m == rep_idx),
                "clusterSize": cluster_size,
                "isMain": is_main,
            }

            # ----- path-based JSON (schema order) -----
            docs_paths.append({
                "_id": uid,
                "image_path": image_path,
                **shared_fields,
            })

            # ----- base64-based JSON (schema order) -----
            docs_base64.append({
                "_id": uid,
                "image": image_to_base64(image_path),
                **shared_fields,
            })

# =========================
# Save outputs
# =========================

def _write_json(records, destination):
    """Serialise ``records`` to ``destination`` as UTF-8, 2-space-indented JSON."""
    with open(destination, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, ensure_ascii=False, indent=2)


# Each output is written only when it has at least one record.
if docs_paths:
    _write_json(docs_paths, OUTPUT_PATH_PATHS)
    print(f"Saved {len(docs_paths)} path documents to {OUTPUT_PATH_PATHS}")

if docs_base64:
    _write_json(docs_base64, OUTPUT_PATH_BASE64)
    print(f"Saved {len(docs_base64)} base64 documents to {OUTPUT_PATH_BASE64}")

if not (docs_paths or docs_base64):
    print("No documents generated.")


Loaded 119047 embeddings


Processing sign-period groups: 100%|████████████████████████████████████████████████| 4513/4513 [03:02<00:00, 24.74it/s]


Saved 119047 path documents to generating_json/sign_clustering_paths.json
Saved 119047 base64 documents to generating_json/sign_clustering_base64.json
