## 任务 A：文本-图片标注任务

### A.1 数据预处理

In [11]:
import os
import fitz  # PyMuPDF
from PIL import Image
from tqdm import tqdm
import json

def extract_text_image_pairs(pdf_folder, output_image_folder, output_json="text_image_mapping.json"):
    os.makedirs(output_image_folder, exist_ok=True)
    result = []
    idx = 0

    for file_name in tqdm(os.listdir(pdf_folder)):
        if not file_name.endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_folder, file_name)
        doc = fitz.open(pdf_path)
        for page_number in range(4, min(len(doc), 10)):  # ← 从第5页开始，最多读取4页
            page = doc.load_page(page_number)
            text = page.get_text().strip()
            if not text:
                continue  # 跳过没有文本的页面

            # 将整个页面截图保存为图像
            pix = page.get_pixmap(dpi=200)
            image_path = os.path.join(output_image_folder, f"{os.path.splitext(file_name)[0]}_page_{page_number + 1}.png")
            pix.save(image_path)

            result.append({
                "text": text,
                "image_path": image_path
            })
            idx += 1


    # 保存 JSON 映射
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    
    print(f"共提取 {idx} 个文本-图像对，已保存至 {output_json}")
    return result

extract_text_image_pairs(
    pdf_folder=r"D:\DATA T2\INPUT",
    output_image_folder=r"D:\zz\mid_data\extracted_images",
    output_json=r"D:\zz\mid_data\text_image_mapping.json"
)


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:30<00:00,  1.51s/it]

共提取 117 个文本-图像对，已保存至 D:\zz\mid_data\text_image_mapping.json





[{'text': '4\n5\nLiving Collection\nThe Cassina Perspective\nTHE CASSINA\nPERSPECTIVE\nAvant-gardism, authenticity, excellence and the combination of technological skill with local \nmanual expertise. Cassina’s identity is founded on these solid, shared values to ex-press the best of \nItalian and international design with a cohesive, recognisable and cultured approach. Cassina has \nsuccessfully amalgamated culture, craftsmanship and industry, reconciling tradition with research, \nrigour with emotion and uniqueness with experimentation, while interpreting everyday trends \nthanks to its untiring work on aesthetics and codiﬁcation. For over 90 years, Meda’s pioneering, free-\nspirited company has been ahead of its time, walking an unknown path, with intuition or perhaps \nforesight, through the design and interior design worlds. Cassina, in fact, moved from artisan to \nindustrial production in the 50’s, and was the ﬁrst company to involve some of the most renowned \ndesigners and arc

In [13]:
import json
import pickle

def build_hashed_mapping(json_path, output_pickle_path="hashed_mapping.pkl", key="image_path"):
    """
    从 JSON 列表创建哈希表，并保存为 pickle
    :param json_path: 原始 JSON 路径（列表格式）
    :param output_pickle_path: 输出的 pickle 路径
    :param key: 使用哪个字段作为哈希表键（如 image_path 或 text）
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # 构建哈希字典：key => entire entry
    hashed_data = {entry[key]: entry for entry in data}

    # 保存为 pickle
    with open(output_pickle_path, "wb") as f:
        pickle.dump(hashed_data, f)

    print(f"✅ 已保存哈希结构数据，共计 {len(hashed_data)} 项，路径：{output_pickle_path}")
    return hashed_data
build_hashed_mapping(
    json_path="mid_data/text_image_mapping.json",
    output_pickle_path="mid_data/image_path_hash.pkl",
    key="image_path"
)

✅ 已保存哈希结构数据，共计 117 项，路径：mid_data/image_path_hash.pkl


{'D:\\zz\\mid_data\\extracted_images\\02 - LIVING COLLECTION 2024_page_5.png': {'text': '4\n5\nLiving Collection\nThe Cassina Perspective\nTHE CASSINA\nPERSPECTIVE\nAvant-gardism, authenticity, excellence and the combination of technological skill with local \nmanual expertise. Cassina’s identity is founded on these solid, shared values to ex-press the best of \nItalian and international design with a cohesive, recognisable and cultured approach. Cassina has \nsuccessfully amalgamated culture, craftsmanship and industry, reconciling tradition with research, \nrigour with emotion and uniqueness with experimentation, while interpreting everyday trends \nthanks to its untiring work on aesthetics and codiﬁcation. For over 90 years, Meda’s pioneering, free-\nspirited company has been ahead of its time, walking an unknown path, with intuition or perhaps \nforesight, through the design and interior design worlds. Cassina, in fact, moved from artisan to \nindustrial production in the 50’s, and

In [19]:
import json
import pickle
import time
import random
from pathlib import Path

# -----------------------
# 参数
# -----------------------
JSON_PATH  = Path("mid_data/text_image_mapping.json")
PICKLE_PATH = Path("mid_data/image_path_hash.pkl")
SAMPLE_SIZE = 100     # 抽样上限
REPEAT = 1000         # 把整个查找循环重复多少次，用来放大耗时
SEED = 42             # 让采样可复现

# -----------------------
# 加载数据
# -----------------------
with JSON_PATH.open(encoding="utf-8") as f:
    json_data = json.load(f)

with PICKLE_PATH.open("rb") as f:
    hash_data = pickle.load(f)

all_image_paths = [e["image_path"] for e in json_data]

random.seed(SEED)
sample_paths = random.sample(all_image_paths,
                             min(SAMPLE_SIZE, len(all_image_paths)))

# -----------------------
# 基准函数
# -----------------------
def list_lookup():
    for path in sample_paths:
        _ = next((e for e in json_data if e["image_path"] == path), None)

def dict_lookup():
    for path in sample_paths:
        _ = hash_data.get(path, None)

def timeit(fn, repeat=REPEAT):
    start = time.perf_counter()
    for _ in range(repeat):
        fn()
    return time.perf_counter() - start

# -----------------------
# 执行基准
# -----------------------
json_total = timeit(list_lookup)
hash_total = timeit(dict_lookup)

# 单次平均
ops = REPEAT * len(sample_paths)
json_avg = json_total / ops
hash_avg = hash_total / ops

# -----------------------
# 输出结果
# -----------------------
print(f"样本数: {len(sample_paths)}，总操作数: {ops}")
print(f"🔍 列表查找总耗时: {json_total:.6f}s，平均 {json_avg*1e6:.2f} μs")
if hash_avg > 0:
    print(f"⚡ 哈希查找总耗时: {hash_total:.6f}s，平均 {hash_avg*1e6:.2f} μs")
    print(f"🚀 加速比: ×{json_avg / hash_avg:.1f}")
else:
    # 依然测不到 → 用 <1 µs 表示
    print(f"⚡ 哈希查找总耗时: <1 µs / op（低于计时精度）")
    print("🚀 加速比: 无法准确计算（哈希查找过快）")


样本数: 100，总操作数: 100000
🔍 列表查找总耗时: 0.151966s，平均 1.52 μs
⚡ 哈希查找总耗时: 0.002956s，平均 0.03 μs
🚀 加速比: ×51.4


### A.2 Clip模型

In [21]:
import os
import json
import torch
import clip
from PIL import Image
from tqdm import tqdm

# 设置设备
device = "cuda" if torch.cuda.is_available() else "cpu"

# 加载模型
model, preprocess = clip.load("ViT-B/32", device=device)

# 加载图文对
with open("mid_data/text_image_mapping.json", "r", encoding="utf-8") as f:
    data_pairs = json.load(f)

# 结果保存
text_features_list = []
image_features_list = []
pair_ids = []

for i, item in enumerate(tqdm(data_pairs)):
    try:
        text = clip.tokenize(item["text"]).to(device)
        image = preprocess(Image.open(item["image_path"]).convert("RGB")).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text)

        # 归一化向量
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        image_features_list.append(image_features.cpu())
        text_features_list.append(text_features.cpu())
        pair_ids.append(i)

    except Exception as e:
        print(f"跳过第 {i} 个 pair")

# 拼接保存
image_tensor = torch.cat(image_features_list, dim=0)
text_tensor = torch.cat(text_features_list, dim=0)
torch.save({
    "image_features": image_tensor,
    "text_features": text_tensor,
    "pair_ids": pair_ids
}, "mid_data/clip_features.pt")

print("图文向量已提取并保存至 mid_data/clip_features.pt")


  0%|                                                                                          | 0/117 [00:00<?, ?it/s]

跳过第 0 个 pair
跳过第 1 个 pair
跳过第 2 个 pair
跳过第 3 个 pair
跳过第 4 个 pair


 10%|████████▎                                                                        | 12/117 [00:00<00:03, 34.69it/s]

跳过第 6 个 pair
跳过第 7 个 pair
跳过第 8 个 pair
跳过第 9 个 pair
跳过第 10 个 pair


 26%|████████████████████▊                                                            | 30/117 [00:00<00:01, 48.51it/s]

跳过第 13 个 pair
跳过第 14 个 pair
跳过第 15 个 pair
跳过第 16 个 pair
跳过第 17 个 pair
跳过第 18 个 pair
跳过第 19 个 pair
跳过第 20 个 pair
跳过第 21 个 pair
跳过第 22 个 pair
跳过第 23 个 pair
跳过第 24 个 pair
跳过第 25 个 pair
跳过第 26 个 pair
跳过第 27 个 pair
跳过第 28 个 pair


 31%|████████████████████████▉                                                        | 36/117 [00:00<00:01, 43.78it/s]

跳过第 30 个 pair
跳过第 31 个 pair
跳过第 32 个 pair
跳过第 33 个 pair
跳过第 34 个 pair
跳过第 36 个 pair
跳过第 37 个 pair
跳过第 38 个 pair
跳过第 39 个 pair
跳过第 40 个 pair


 41%|█████████████████████████████████▏                                               | 48/117 [00:01<00:01, 39.06it/s]

跳过第 42 个 pair
跳过第 43 个 pair
跳过第 44 个 pair
跳过第 45 个 pair
跳过第 46 个 pair
跳过第 48 个 pair
跳过第 49 个 pair
跳过第 50 个 pair
跳过第 51 个 pair
跳过第 52 个 pair


 50%|████████████████████████████████████████▊                                        | 59/117 [00:01<00:01, 37.41it/s]

跳过第 54 个 pair
跳过第 55 个 pair
跳过第 56 个 pair
跳过第 57 个 pair


 54%|███████████████████████████████████████████▌                                     | 63/117 [00:01<00:01, 33.72it/s]

跳过第 60 个 pair
跳过第 61 个 pair
跳过第 62 个 pair
跳过第 63 个 pair
跳过第 64 个 pair
跳过第 65 个 pair
跳过第 66 个 pair
跳过第 67 个 pair
跳过第 68 个 pair


 67%|██████████████████████████████████████████████████████                           | 78/117 [00:02<00:01, 29.58it/s]

跳过第 71 个 pair
跳过第 72 个 pair
跳过第 73 个 pair
跳过第 74 个 pair
跳过第 75 个 pair
跳过第 77 个 pair
跳过第 78 个 pair
跳过第 79 个 pair
跳过第 80 个 pair


 73%|██████████████████████████████████████████████████████████▊                      | 85/117 [00:02<00:01, 18.41it/s]

跳过第 83 个 pair
跳过第 85 个 pair
跳过第 86 个 pair
跳过第 87 个 pair
跳过第 88 个 pair
跳过第 89 个 pair
跳过第 90 个 pair


 81%|█████████████████████████████████████████████████████████████████▊               | 95/117 [00:03<00:00, 23.38it/s]

跳过第 92 个 pair
跳过第 93 个 pair
跳过第 95 个 pair
跳过第 96 个 pair
跳过第 97 个 pair
跳过第 98 个 pair
跳过第 99 个 pair


 96%|████████████████████████████████████████████████████████████████████████████▌   | 112/117 [00:03<00:00, 35.39it/s]

跳过第 101 个 pair
跳过第 102 个 pair
跳过第 103 个 pair
跳过第 104 个 pair
跳过第 105 个 pair
跳过第 106 个 pair
跳过第 107 个 pair
跳过第 108 个 pair
跳过第 109 个 pair
跳过第 110 个 pair


100%|████████████████████████████████████████████████████████████████████████████████| 117/117 [00:03<00:00, 33.92it/s]

跳过第 112 个 pair
跳过第 113 个 pair
跳过第 114 个 pair
跳过第 115 个 pair
跳过第 116 个 pair
图文向量已提取并保存至 mid_data/clip_features.pt





### A.3 EfficientNet-B0 + TF-IDF

In [1]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# -------------------------------
# 1. 加载图文映射数据，并构建 TF‑IDF 向量器
# -------------------------------
mapping_path = "mid_data/text_image_mapping.json"
with open(mapping_path, "r", encoding="utf-8") as f:
    data_pairs = json.load(f)

# 提取所有文本，训练 TF-IDF
all_texts = [item["text"] for item in data_pairs]
vectorizer = TfidfVectorizer(max_features=2048)  # 限制向量维度
tfidf_matrix = vectorizer.fit_transform(all_texts).toarray()
# 用于 Dataset
for i, item in enumerate(data_pairs):
    item["tfidf_vector"] = tfidf_matrix[i]

# -------------------------------
# 2. 自定义 Dataset
# -------------------------------
class ImageTextTFIDFDataset(Dataset):
    def __init__(self, data_pairs, transform=None):
        self.data = data_pairs
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # 加载图片
        img = Image.open(item["image_path"]).convert("RGB")
        if self.transform:
            img = self.transform(img)
        # 文本向量
        tfidf_vec = torch.tensor(item["tfidf_vector"], dtype=torch.float32)
        return img, tfidf_vec

# -------------------------------
# 3. 定义图像特征提取器（EfficientNet-B0） + 融合模型
# -------------------------------
class CrossModalRegressor(nn.Module):
    def __init__(self, tfidf_dim):
        super().__init__()
        # 载入预训练 EfficientNet-B0，去掉最后的分类头
        effnet = models.efficientnet_b0(pretrained=True)
        self.cnn = nn.Sequential(
            effnet.features,
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),           # 输出维度：1280
        )
        # 从 1280 维映射到隐藏，再到 TF‑IDF 维
        self.head = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, tfidf_dim)
        )

    def forward(self, x):
        feat = self.cnn(x)
        out = self.head(feat)
        return out

# -------------------------------
# 4. 训练准备
# -------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 图像预处理
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(  # EfficientNet 推荐的 ImageNet stats
        mean=[0.485, 0.456, 0.406],
        std =[0.229, 0.224, 0.225]
    )
])

# Dataset & DataLoader
dataset = ImageTextTFIDFDataset(data_pairs, transform=transform)
loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=0)

# 模型、损失、优化器
model = CrossModalRegressor(tfidf_dim=tfidf_matrix.shape[1]).to(device)
criterion = nn.MSELoss()  # 回归预测 TF‑IDF 向量
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

# -------------------------------
# 5. 训练循环
# -------------------------------
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for images, tfidf_vecs in tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        images = images.to(device)
        tfidf_vecs = tfidf_vecs.to(device)

        optimizer.zero_grad()
        preds = model(images)
        loss = criterion(preds, tfidf_vecs)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * images.size(0)

    avg_loss = total_loss / len(dataset)
    print(f"[Epoch {epoch+1}] Avg Loss: {avg_loss:.4f}")

# -------------------------------
# 6. 保存模型
# -------------------------------
os.makedirs("mid_data", exist_ok=True)
torch.save({
    "model_state": model.state_dict(),
    "vectorizer": vectorizer
}, "mid_data/cross_modal_effnet_tfidf.pth")
print("训练完成，模型已保存至 mid_data/cross_modal_effnet_tfidf.pth")


Epoch 1/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.46s/it]


[Epoch 1] Avg Loss: 0.0043


Epoch 2/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:12<00:00,  1.53s/it]


[Epoch 2] Avg Loss: 0.0017


Epoch 3/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.43s/it]


[Epoch 3] Avg Loss: 0.0012


Epoch 4/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.38s/it]


[Epoch 4] Avg Loss: 0.0011


Epoch 5/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.39s/it]


[Epoch 5] Avg Loss: 0.0010


Epoch 6/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.38s/it]


[Epoch 6] Avg Loss: 0.0010


Epoch 7/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.38s/it]


[Epoch 7] Avg Loss: 0.0009


Epoch 8/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.39s/it]


[Epoch 8] Avg Loss: 0.0008


Epoch 9/10: 100%|████████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.40s/it]


[Epoch 9] Avg Loss: 0.0008


Epoch 10/10: 100%|███████████████████████████████████████████████████████████████████████| 8/8 [00:11<00:00,  1.42s/it]

[Epoch 10] Avg Loss: 0.0007
训练完成，模型已保存至 mid_data/cross_modal_effnet_tfidf.pth





## 任务 B： 本地搜索引擎

In [4]:
import random
import json
from PIL import Image
import clip
import torch

# 路径配置
mapping_json = "mid_data/text_image_mapping.json"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 加载CLIP模型
model, preprocess = clip.load("ViT-B/32", device=device)

# 加载图文对映射
with open(mapping_json, "r", encoding="utf-8") as f:
    data = json.load(f)

# 随机选择3个图像
samples = random.sample(data, k=3)

# 显示图像并输出文本
for idx, item in enumerate(samples, 1):
    image_path = item["image_path"]
    text = item["text"]
    
    print(f"🔹 图片 {idx}: {image_path}")
    print(f"📝 对应文本: {text}\n")

    try:
        img = Image.open(image_path).convert("RGB")
        img.show(title=f"Image {idx}")
    except Exception as e:
        print(f"❌ 无法打开图片: {e}")


🔹 图片 1: D:\zz\mid_data\extracted_images\09 - DETAILS COLLECTION 2024_page_9.png
📝 对应文本: 12
13
Details Collection
DIE KUNST DES TAFELNS 
Die Design-Accessoires, die mit der Kunst einer 
stilvollen Tafel (des Tisches ) und ihrer kompletten 
Gestaltung verbunden sind, sind das Ergebnis 
einer sorgfältigen Designforschung. Qualität und 
Geschmack lassen Teller, Besteck und Textilien zu 
Meisterwerken der Sorgfalt und Hingabe werden, 
dank der bedeutenden Zusammenarbeit mit 
italienischen Spitzenkünstlern wie Ginori 1735 und 
den Beiträgen berühmter internationaler Designer 
wie Le Corbusier, Charlotte Perriand, Afra und 
Tobia Scarpa. Jeder Gegenstand ist von einer 
ausgeklügelten ästhetischen Forschung begleitet, 
die die Kostbarkeit der Verarbeitung unterstreicht.
ART DE LA TABLE
Les accessoires de design liés à l’art de la table 
et à sa préparation complète sont le résultat 
d’une recherche conceptuelle minutieuse. 
Grâce à d’importantes collaborations avec des 
excellences italiennes 

In [3]:
import os
import json
import torch
import clip
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 参数
image_folder = "mid_data/extracted_images"
mapping_json = "mid_data/text_image_mapping.json"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. 加载模型
model, preprocess = clip.load("ViT-B/32", device=device)

# 2. 加载图像数据和路径
with open(mapping_json, "r", encoding="utf-8") as f:
    pairs = json.load(f)

image_paths = [item["image_path"] for item in pairs]
images = []
for p in tqdm(image_paths, desc="预处理图像"):
    try:
        img = Image.open(p).convert("RGB")
        img = preprocess(img)
        images.append(img)
    except:
        print(f"跳过损坏图像: {p}")

# 3. 批量嵌入图像向量
image_input = torch.stack(images).to(device)
with torch.no_grad():
    image_features = model.encode_image(image_input)
    image_features /= image_features.norm(dim=-1, keepdim=True)  # 归一化

# 4. 搜索函数
def search_images_by_text(query, top_k=1, threshold=0.1):
    model.eval()
    with torch.no_grad():
        text_tokens = clip.tokenize([query]).to(device)
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # 计算余弦相似度
        sims = (image_features @ text_features.T).squeeze().cpu().numpy()

    top_indices = np.argsort(sims)[::-1][:top_k]
    best_score = sims[top_indices[0]]

    if best_score < threshold:
        print("❌ 无浏览结果（得分低于阈值）")
    else:
        print(f"✅ Top-{top_k} 匹配图像（相似度：{best_score:.4f}）:")
        for i in top_indices:
            print(f" - 相似度: {sims[i]:.4f} -> 图像路径: {image_paths[i]}")
            # 显示图像（可选）
            Image.open(image_paths[i]).show()

# 5. 示例查询
query_text = "L I V I N G  C O L L E C T I O N G h o s t  W a l l 2 0 2 3 Design Mikal Harrsen"
search_images_by_text(query_text)


预处理图像: 100%|████████████████████████████████████████████████████████████████████| 117/117 [00:08<00:00, 13.21it/s]


✅ Top-1 匹配图像（相似度：0.2910）:
 - 相似度: 0.2910 -> 图像路径: D:\zz\mid_data\extracted_images\IMaestri_Products_2022_page_9.png


In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# 1. Load text–image pairs and build TF‑IDF vectors
mapping_path = "mid_data/text_image_mapping.json"
with open(mapping_path, "r", encoding="utf-8") as f:
    data_pairs = json.load(f)
all_texts = [item["text"] for item in data_pairs]
vectorizer = TfidfVectorizer(max_features=2048)
tfidf_matrix = vectorizer.fit_transform(all_texts).toarray()
for i, item in enumerate(data_pairs):
    item["tfidf_vector"] = tfidf_matrix[i]

# 2. Custom dataset
class ImageTextTFIDFDataset(Dataset):
    def __init__(self, data_pairs, transform=None):
        self.data = data_pairs
        self.transform = transform
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        item = self.data[idx]
        img = Image.open(item["image_path"]).convert("RGB")
        if self.transform:
            img = self.transform(img)
        tfidf_vec = torch.tensor(item["tfidf_vector"], dtype=torch.float32)
        return img, tfidf_vec

# 3. Model: EfficientNet backbone → regression head
class CrossModalRegressor(nn.Module):
    def __init__(self, tfidf_dim):
        super().__init__()
        effnet = models.efficientnet_b0(pretrained=True)
        self.cnn = nn.Sequential(
            effnet.features,
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )
        self.head = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.Linear(512, tfidf_dim),
        )
    def forward(self, x):
        with torch.no_grad():
            h = self.cnn(x)
        return self.head(h)

# 4. Training loop (simplified)
model = CrossModalRegressor(tfidf_dim=2048).cuda()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.head.parameters(), lr=1e-4)

dataloader = DataLoader(
    ImageTextTFIDFDataset(data_pairs, transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])), batch_size=16, shuffle=True
)

for epoch in range(5):
    for imgs, vecs in tqdm(dataloader):
        imgs, vecs = imgs.cuda(), vecs.cuda()
        preds = model(imgs)
        loss = criterion(preds, vecs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} loss: {loss.item():.4f}")

In [None]:
# Build and apply TF‑IDF vectorizer
all_texts = [item["text"] for item in data_pairs]
vectorizer = TfidfVectorizer(max_features=2048, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(all_texts)
print(vectorizer.get_feature_names_out()[:20])

In [None]:
def encode_text_with_clip(text_list):
    tokens = clip.tokenize(text_list).to(device)
    with torch.no_grad():
        emb = model.encode_text(tokens).cpu().numpy()
    return emb

# Example: compare a document to theme sentences
themes = ["urban landscape", "portrait photography", "food photo"]
embed_doc = encode_text_with_clip([query_text])
embed_themes = encode_text_with_clip(themes)

scores = cosine_similarity(embed_doc, embed_themes)[0]
print(dict(zip(themes, scores)))

In [None]:
import json
import pickle

def build_hashed_mapping(json_path, output_pickle_path="hashed_mapping.pkl", key="image_path"):
    with open(json_path, "r", encoding="utf-8") as f:
        items = json.load(f)
    hashed = {item[key]: idx for idx, item in enumerate(items)}
    with open(output_pickle_path, "wb") as f:
        pickle.dump(hashed, f)
    print(f"✅ Hashed index saved to {output_pickle_path} (size: {len(hashed)})")