In [2]:
import pandas as pd
from PIL import Image
from tqdm import tqdm
import numpy as np
import cv2
import os

import torch
from transformers import CLIPProcessor, CLIPModel

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
root_dir = "./face-matching"

seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x7fd0ceda8af0>

# Setup

In [10]:
ref_df = pd.read_csv("ref_img.csv", dtype={"ref_img": str})
ref_ids = ref_df["ref_img"].tolist()
print(f"Reference IDs: {ref_ids}")

Reference IDs: ['048', '025', '095', '043', '105', '071', '046', '096', '020', '085', '061', '073', '084', '026', '008']


In [11]:
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained(
    "openai/clip-vit-large-patch14", use_fast=True
)
model.to(device)
model.eval()

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [18]:
all_images = sorted([f[:-4] for f in os.listdir(f"{root_dir}/images") if f.endswith(".jpg")])

# Extract features

In [23]:
features = {}
for img_id in tqdm(all_images):
    img_path = f"{root_dir}/images/{img_id}.jpg"
    img = Image.open(img_path).convert("RGB")
    img = np.array(img)
    face_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    face_pil = Image.fromarray(face_img)

    with torch.no_grad():
        inputs = processor(images=face_pil, return_tensors="pt").to(device)
        feature = model.get_image_features(**inputs)
        features[img_id] = feature.cpu().numpy()

print(f"Extracted features for {len(features)} images")

100%|██████████| 109/109 [00:03<00:00, 32.41it/s]

Extracted features for 109 images





Find top 5 similar images for each reference

In [24]:
results = []

for ref_id in tqdm(ref_ids):
    ref_feature = features.get(ref_id)
    if ref_feature is None:
        continue

    # calculate cosine similarity with all images
    similarities = {}
    for img_id, feature in features.items():
        sim = np.dot(ref_feature[0], feature[0]) / (
            np.linalg.norm(ref_feature[0]) * np.linalg.norm(feature[0]) + 1e-8
        )
        similarities[img_id] = float(sim)

    # sort by similarity, exclude reference image, take top 5
    sorted_ids = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    top_5 = [img_id for img_id, _ in sorted_ids if img_id != ref_id][:5]

    results.append({"ref_img": ref_id, "photos": "|".join(top_5)})

100%|██████████| 15/15 [00:00<00:00, 543.51it/s]


# Submission

In [25]:
submission = pd.DataFrame(results)
submission

Unnamed: 0,ref_img,photos
0,48,101|102|073|093|039
1,25,041|006|083|067|033
2,95,014|021|047|097|072
3,43,034|027|090|013|079
4,105,024|029|023|104|046
5,71,070|011|017|065|088
6,46,060|019|010|069|056
7,96,035|098|018|068|015
8,20,031|053|077|091|038
9,85,106|009|037|103|033


In [28]:
submission.to_csv(f"{root_dir}/submission.csv", index=False)