This file contains the code for splitting the duplication dataset into training and validation pairs and for fine-tuning the model.

In [None]:
!pip install timm albumentations torch torchvision





In [None]:
# Mount Google Drive at /content/drive (Colab only).
# The captured output below shows the face archive being read from
# /content/drive/MyDrive/processed_faces_duplication.zip.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Archive:  /content/drive/MyDrive/processed_faces_duplication.zip
   creating: /content/content/processed_faces/duplication/
   creating: /content/content/processed_faces/duplication/n000228/
  inflating: /content/content/processed_faces/duplication/n000228/0400_01.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0488_02.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0277_02.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0069_01.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0223_02.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0566_01.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0495_03.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0291_01.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0479_01.jpg  
  inflating: /content/content/processed_faces/duplication/n000228/0257_01.jpg  
  inflati

In [None]:
import itertools
import random
import pandas as pd
import os

# Root folder of the extracted dataset: one sub-folder per identity.
dup_base = "/content/content/processed_faces/duplication"

# -------------------
# 1️⃣ Get identities with at least 2 images
# -------------------
def get_images(person_dir):
    """List the image file names (not full paths) found directly in ``person_dir``.

    A name counts as an image when it ends in .jpg, .jpeg or .png, compared
    case-insensitively. Names are returned in ``os.listdir`` order.
    """
    image_names = []
    for entry in os.listdir(person_dir):
        if entry.lower().endswith(('.jpg', '.jpeg', '.png')):
            image_names.append(entry)
    return image_names

# Collect the identity folders that can yield at least one positive pair.
# The directory check runs BEFORE the folder is listed: get_images() calls
# os.listdir, which raises NotADirectoryError on a stray regular file sitting
# next to the identity folders. Entries are visited in sorted order so the
# resulting list does not depend on filesystem ordering.
identities = []
for d in sorted(os.listdir(dup_base)):
    person_dir = os.path.join(dup_base, d)
    if not os.path.isdir(person_dir):
        continue
    imgs = get_images(person_dir)
    if len(imgs) >= 2:  # need at least 2 images for positive pairs
        identities.append(d)

print(f"✅ Found {len(identities)} valid identities")

# -------------------
# 2️⃣ Positive pairs
# -------------------
max_pos_per_identity = 50  # limit number of positive pairs per person
pairs = []

# Seed the sampler so that re-running the notebook regenerates the same pairs
# (same value as the random_state passed to train_test_split later on).
random.seed(42)

# Identities and file names are visited in sorted order: os.listdir order is
# filesystem-dependent, and the seed alone would not make the sample repeatable.
for person in sorted(identities):
    person_dir = os.path.join(dup_base, person)
    imgs = [os.path.join(person_dir, i) for i in sorted(get_images(person_dir))]

    # Every unordered pair of images of this person is a candidate positive.
    all_combinations = list(itertools.combinations(imgs, 2))

    # Keep at most max_pos_per_identity of them, drawn without replacement.
    n_keep = min(len(all_combinations), max_pos_per_identity)
    for a, b in random.sample(all_combinations, n_keep):
        pairs.append([a, b, 1])  # 1 = same person

print(f"✅ Created {len(pairs)} positive pairs")

# -------------------
# 3️⃣ Negative pairs (different identities)
# -------------------
num_neg = len(pairs)  # balance negatives
neg_count = 0

# List each identity folder once up front; the folder contents do not change
# while sampling, so re-listing two folders per generated pair is wasted I/O.
images_by_identity = {
    person: get_images(os.path.join(dup_base, person)) for person in identities
}

# Only identities that actually hold images can take part in a pair. Failing
# fast here replaces a loop that would otherwise never terminate.
candidates = [person for person in identities if images_by_identity[person]]
if num_neg > 0 and len(candidates) < 2:
    raise ValueError("Need at least 2 identities with images to build negative pairs")

while neg_count < num_neg:
    # Two distinct identities, then one random image from each.
    id1, id2 = random.sample(candidates, 2)
    img1 = os.path.join(dup_base, id1, random.choice(images_by_identity[id1]))
    img2 = os.path.join(dup_base, id2, random.choice(images_by_identity[id2]))

    pairs.append([img1, img2, 0])  # 0 = different person
    neg_count += 1

# -------------------
# 4️⃣ Save CSV
# -------------------
# One table for all pairs (positives were appended first, negatives after).
pair_columns = ["img1", "img2", "label"]
pairs_df = pd.DataFrame(data=pairs, columns=pair_columns)
pairs_df.to_csv("identity_pairs.csv", index=False)

print(f"✅ Saved identity_pairs.csv with {len(pairs_df)} total pairs")


✅ Found 100 valid identities
✅ Created 5000 positive pairs
✅ Saved identity_pairs.csv with 10000 total pairs


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# 80/20 split of the pair table, stratified on the label so both splits keep
# the same share of positive and negative pairs.
# NOTE(review): the split is done row by row on pairs, so images of the same
# identity can land in both the training and the validation file; validation
# numbers therefore do not measure generalisation to unseen people.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=pairs_df['label'],
)
train_df, val_df = train_test_split(pairs_df, **split_options)

train_df.to_csv("identity_pairs_train.csv", index=False)
val_df.to_csv("identity_pairs_val.csv", index=False)

print(f"✅ Saved identity_pairs_train.csv ({len(train_df)}) and identity_pairs_val.csv ({len(val_df)})")


✅ Saved identity_pairs_train.csv (8000) and identity_pairs_val.csv (2000)


In [None]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import cv2
import os
import numpy as np

class IdentityPairsDataset(Dataset):
    """Image-pair dataset backed by a CSV file with columns img1, img2, label.

    Each item is ``(img1, img2, label)``: two float32 image tensors and a
    float32 scalar label (1 = same person, 0 = different person, as written by
    the pair-generation cell). Integer-typed images are scaled to [0, 1] on
    both code paths, which is also the scaling ``compute_similarity`` applies
    at inference time.
    """

    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (str): Path to CSV file with columns: img1, img2, label
            transform (callable, optional): Called as ``transform(image=rgb)``
                and expected to return a dict whose ``'image'`` entry is a
                tensor (the Albumentations convention used in this notebook).
        """
        self.pairs_df = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        """Number of pairs in the CSV."""
        return len(self.pairs_df)

    @staticmethod
    def _to_unit_float(tensor):
        """Return ``tensor`` as float32; integer pixel data (0-255) is scaled to [0, 1].

        Tensors that are already floating point are assumed to have been
        scaled/normalised by the transform and are only cast.
        """
        if tensor.is_floating_point():
            return tensor.float()
        return tensor.float() / 255.0

    def __getitem__(self, idx):
        """Load, preprocess and return the pair at position ``idx``.

        Raises:
            ValueError: if either image of the pair cannot be read.
        """
        row = self.pairs_df.iloc[idx]
        img1_path = row['img1']
        img2_path = row['img2']
        label = int(row['label'])

        # cv2.imread signals failure by returning None, so the check has to
        # happen before cvtColor, which would otherwise raise on None first.
        img1 = cv2.imread(img1_path)
        img2 = cv2.imread(img2_path)
        if img1 is None or img2 is None:
            raise ValueError(f"Image not found: {img1_path} or {img2_path}")

        # OpenCV loads BGR; the rest of the pipeline works on RGB.
        img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
        img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)

        if self.transform:
            img1 = self.transform(image=img1)['image']
            img2 = self.transform(image=img2)['image']
        else:
            # Default: resize to 224x224 and convert to float32 in [0, 1]
            img1 = cv2.resize(img1, (224, 224)).astype(np.float32) / 255.0
            img2 = cv2.resize(img2, (224, 224)).astype(np.float32) / 255.0
            # HWC -> CHW
            img1 = torch.tensor(img1.transpose(2, 0, 1))
            img2 = torch.tensor(img2.transpose(2, 0, 1))

        # Same value range regardless of which branch produced the tensors.
        img1 = self._to_unit_float(img1)
        img2 = self._to_unit_float(img2)

        return img1, img2, torch.tensor(label, dtype=torch.float32)

In [None]:
from torch.utils.data import DataLoader

# Optional: Albumentations transforms
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Pre-processing used by the datasets and by the similarity helpers below:
# resize every image to 224x224, then hand it over as a tensor.
# NOTE(review): there is no Normalize / scaling step in this pipeline —
# presumably ToTensorV2 keeps the input dtype and value range; confirm.
preprocessing_steps = [
    A.Resize(height=224, width=224),
    ToTensorV2(),
]
transform = A.Compose(preprocessing_steps)

In [None]:
from torch.utils.data import DataLoader

# Build one dataset per split from the CSV files written above.
# (The training cell further down constructs the same objects again.)
train_dataset, val_dataset = (
    IdentityPairsDataset(split_csv, transform=transform)
    for split_csv in ("identity_pairs_train.csv", "identity_pairs_val.csv")
)

# Only the training loader shuffles; validation keeps the CSV row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True, num_workers=2)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False, num_workers=2)

In [None]:
import torch.nn as nn
import timm

class ViTSiamese(nn.Module):
    """Siamese embedding network: a timm backbone followed by a linear projection.

    Both inputs of a pair go through the same backbone and projection, and each
    embedding is L2-normalised along dim 1.
    """

    def __init__(self, backbone_name="vit_base_patch16_224", pretrained=True, embedding_dim=512):
        super().__init__()
        encoder = timm.create_model(backbone_name, pretrained=pretrained)
        encoder.reset_classifier(0)  # drop the classification head, keep features
        self.backbone = encoder
        self.fc = nn.Linear(encoder.num_features, embedding_dim)

    def forward_one(self, x):
        """Embed a single batch of images and return unit-length vectors."""
        projected = self.fc(self.backbone(x))
        return nn.functional.normalize(projected, p=2, dim=1)

    def forward(self, x1, x2):
        """Embed both sides of a pair; returns ``(emb1, emb2)``."""
        return self.forward_one(x1), self.forward_one(x2)


In [None]:
class ContrastiveLoss(nn.Module):
    """Contrastive loss over pairs of embeddings.

    With ``d`` the pairwise distance between the two embeddings, a pair with
    label 1 contributes ``d**2`` and a pair with label 0 contributes
    ``max(margin - d, 0)**2``; the result is the mean over the batch.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the batch-mean contrastive loss as a scalar tensor."""
        distance = nn.functional.pairwise_distance(output1, output2)
        pull_term = label * distance.pow(2)
        push_term = (1 - label) * (self.margin - distance).clamp(min=0.0).pow(2)
        return (pull_term + push_term).mean()


In [None]:
from torch.utils.data import DataLoader
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data: one dataset and one loader per split (only training is shuffled).
train_dataset = IdentityPairsDataset("identity_pairs_train.csv", transform=transform)
val_dataset = IdentityPairsDataset("identity_pairs_val.csv", transform=transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=2)

# Model, loss and optimiser; every parameter of the network is optimised.
model = ViTSiamese().to(device)
criterion = ContrastiveLoss(margin=1.0)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 10

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_batch_losses = []
    for left, right, targets in train_loader:
        left = left.to(device)
        right = right.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        left_emb, right_emb = model(left, right)
        batch_loss = criterion(left_emb, right_emb, targets)
        batch_loss.backward()
        optimizer.step()
        train_batch_losses.append(batch_loss.item())

    # Mean of the per-batch losses for this epoch.
    avg_loss = sum(train_batch_losses) / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # ---- validation pass (no gradients, eval mode) ----
    model.eval()
    with torch.no_grad():
        val_batch_losses = []
        for left, right, targets in val_loader:
            left = left.to(device)
            right = right.to(device)
            targets = targets.to(device)
            left_emb, right_emb = model(left, right)
            val_batch_losses.append(criterion(left_emb, right_emb, targets).item())
        val_loss = sum(val_batch_losses) / len(val_loader)
        print(f"Validation Loss: {val_loss:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Epoch [1/10], Loss: 0.2482
Validation Loss: 0.2279
Epoch [2/10], Loss: 0.2277
Validation Loss: 0.2264
Epoch [3/10], Loss: 0.2176
Validation Loss: 0.2199
Epoch [4/10], Loss: 0.2088
Validation Loss: 0.2210
Epoch [5/10], Loss: 0.1994
Validation Loss: 0.2310
Epoch [6/10], Loss: 0.1897
Validation Loss: 0.2186
Epoch [7/10], Loss: 0.1825
Validation Loss: 0.2132
Epoch [8/10], Loss: 0.1785
Validation Loss: 0.2126
Epoch [9/10], Loss: 0.1674
Validation Loss: 0.2044
Epoch [10/10], Loss: 0.1609
Validation Loss: 0.2116


In [None]:
def compute_similarity(model, img1_path, img2_path, transform):
    """Cosine similarity between the model embeddings of two image files.

    Both files are read with OpenCV, converted to RGB, passed through
    ``transform``, scaled by 1/255 and embedded in eval mode without gradients.
    Returns a Python float, or ``None`` (after printing which file failed)
    when an image cannot be read.
    """
    model.eval()

    paths = (img1_path, img2_path)
    loaded = [cv2.imread(path) for path in paths]

    # Report the first unreadable file and give up on the pair.
    for path, image in zip(paths, loaded):
        if image is None:
            print(f"Error: Image not found at {path}")
            return None

    rgb_images = [cv2.cvtColor(image, cv2.COLOR_BGR2RGB) for image in loaded]

    # Transform, add a batch dimension, move to the device, scale to [0, 1].
    batches = [
        transform(image=rgb)['image'].unsqueeze(0).to(device).float() / 255.0
        for rgb in rgb_images
    ]

    with torch.no_grad():
        emb1, emb2 = model(batches[0], batches[1])
        return nn.functional.cosine_similarity(emb1, emb2).item()

# Spot-check on one hand-picked pair of images.
pair_paths = ("/content/org.jpeg", "/content/live.jpeg")
score = compute_similarity(model, *pair_paths, transform)
if score is not None:
    print("Similarity:", score)

Similarity: 0.9834152460098267


Evaluate the fine-tuned model on the first 500 pairs of the validation CSV file.

In [None]:
import torch
import torch.nn as nn
import cv2
import pandas as pd
from tqdm import tqdm

# Assuming your model and transform are already defined
# Pick the GPU when one is available, then put the already-trained model on
# that device in evaluation mode (both calls return the module itself).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device).eval()

# --------------------------
# Similarity function
# --------------------------
def compute_similarity(model, img1_path, img2_path, transform):
    """Embed two image files with ``model`` and return their cosine similarity.

    Returns ``None`` (after printing a warning naming both paths) when either
    file cannot be read; otherwise a Python float.
    """
    model.eval()

    first_bgr, second_bgr = cv2.imread(img1_path), cv2.imread(img2_path)
    if first_bgr is None or second_bgr is None:
        print(f"⚠️ Missing image: {img1_path} or {img2_path}")
        return None

    def _as_model_input(bgr):
        # BGR -> RGB, apply the shared transform, add a batch axis, scale to [0, 1].
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        return transform(image=rgb)['image'].unsqueeze(0).to(device).float() / 255.0

    first_input = _as_model_input(first_bgr)
    second_input = _as_model_input(second_bgr)

    with torch.no_grad():
        emb1, emb2 = model(first_input, second_input)
        similarity = nn.functional.cosine_similarity(emb1, emb2)
    return similarity.item()


# --------------------------
# Load CSV
# --------------------------
csv_path = "/content/identity_pairs_val.csv"  # change as needed
df = pd.read_csv(csv_path)

missing_columns = [col for col in ("img1", "img2", "label") if col not in df.columns]
if missing_columns:
    raise ValueError("CSV must have columns: img1, img2, label")

# --------------------------
# Compute similarity for 500 rows
# --------------------------
subset_df = df.head(500).copy()
scores = []

for _, row in tqdm(subset_df.iterrows(), total=len(subset_df)):
    sim = compute_similarity(model, row["img1"], row["img2"], transform)
    # A pair that could not be scored is recorded as NaN, not 0.0: a made-up
    # score of 0 would be averaged into the class means below and bias them,
    # whereas pandas' mean() skips NaN.
    scores.append(sim if sim is not None else float("nan"))

subset_df["similarity"] = scores
n_skipped = int(subset_df["similarity"].isna().sum())

# --------------------------
# Summary metrics
# --------------------------
same_person = subset_df[subset_df["label"] == 1]["similarity"]
diff_person = subset_df[subset_df["label"] == 0]["similarity"]

print("\n✅ Results Summary:")
print(f"Total pairs processed: {len(subset_df)}")
if n_skipped:
    print(f"Pairs skipped (unreadable image): {n_skipped}")
print(f"Average similarity (same person): {same_person.mean():.4f}")
print(f"Average similarity (different persons): {diff_person.mean():.4f}")

# --------------------------
# Save to CSV
# --------------------------
subset_df.to_csv("similarity_results_500.csv", index=False)
print("✅ Saved results to similarity_results_500.csv")


100%|██████████| 500/500 [00:16<00:00, 29.93it/s]



✅ Results Summary:
Total pairs processed: 500
Average similarity (same person): 0.9823
Average similarity (different persons): 0.9761
✅ Saved results to similarity_results_500.csv


In [None]:
import pandas as pd
import numpy as np
import cv2
from insightface.app import FaceAnalysis
from tqdm import tqdm

# Initialize InsightFace model
# Load the 'buffalo_l' model pack and configure detection at 224x224.
# NOTE(review): ctx_id=0 presumably requests the first GPU, but the log
# printed below lists only CPUExecutionProvider — confirm which device is used.
app = FaceAnalysis(name='buffalo_l')
app.prepare(ctx_id=0, det_size=(224, 224))

def face_similarity(img1_path, img2_path):
    """Cosine similarity between the face embeddings of two image files.

    Uses the first face that ``app.get`` returns for each image. Returns
    ``None`` when an image cannot be read (a warning is printed) or when no
    face is found in either image.
    """
    first_img, second_img = cv2.imread(img1_path), cv2.imread(img2_path)
    if first_img is None or second_img is None:
        print(f"⚠️ Missing image: {img1_path} or {img2_path}")
        return None

    # Run detection on both images before deciding whether the pair is usable.
    detections = [app.get(first_img), app.get(second_img)]
    if not all(detections):
        # No detected face
        return None

    emb1, emb2 = (faces[0]['embedding'] for faces in detections)

    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    return np.dot(emb1, emb2) / norm_product


# ================================
# 1️⃣ Load CSV and sample 200 pairs
# ================================
csv_path = "identity_pairs_val.csv"  # change if needed
results_path = "similarity_results_200.csv"
df = pd.read_csv(csv_path)

# Sample 200 pairs randomly (or every pair when the CSV holds fewer than 200,
# since DataFrame.sample raises when asked for more rows than exist).
n_pairs = min(200, len(df))
df_sample = df.sample(n=n_pairs, random_state=42).reset_index(drop=True)

# ================================
# 2️⃣ Compute similarity scores
# ================================
# Pairs for which face_similarity returns None (unreadable image or no face
# detected) are left out of the results table.
similarities = []
for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Processing pairs"):
    sim = face_similarity(row['img1'], row['img2'])
    if sim is not None:
        similarities.append((row['img1'], row['img2'], row['label'], sim))

# Convert to DataFrame
results_df = pd.DataFrame(similarities, columns=["img1", "img2", "label", "similarity"])
results_df.to_csv(results_path, index=False)

# ================================
# 3️⃣ Compute summary statistics
# ================================
same_scores = results_df[results_df['label'] == 1]['similarity']
diff_scores = results_df[results_df['label'] == 0]['similarity']

print("\n✅ Total pairs processed:", len(results_df))
print(f"Average similarity (same person): {same_scores.mean():.4f}")
print(f"Average similarity (different persons): {diff_scores.mean():.4f}")
# Report the file that was actually written above.
print(f"📁 Results saved to {results_path}")


download_path: /root/.insightface/models/buffalo_l
Downloading /root/.insightface/models/buffalo_l.zip from https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip...


100%|██████████| 281857/281857 [00:11<00:00, 25527.58KB/s]


Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /root/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127.5
set det-size: (224, 224)


Processing pairs: 100%|██████████| 200/200 [02:59<00:00,  1.12it/s]


✅ Total pairs processed: 198
Average similarity (same person): 0.5127
Average similarity (different persons): 0.0028
📁 Results saved to similarity_results.csv





In [None]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pac