In [9]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torchvision.models as models
from torchvision import transforms
from sklearn.metrics.pairwise import cosine_similarity

# --- 1. Image loading ---
def load_images(folder_path, img_size=(224, 224)):
    """Load every PNG/JPEG file in a folder as a resized RGB PIL image.

    Args:
        folder_path: Directory whose entries are scanned (non-recursively).
        img_size: ``(width, height)`` passed to ``Image.resize``.

    Returns:
        Dict mapping each file name (not the full path) to its converted and
        resized PIL image. Files that fail to load are reported on stdout
        and skipped.
    """
    images = {}
    for filename in tqdm(os.listdir(folder_path), 'Donwload imgs'):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue
        try:
            # The context manager guarantees the underlying file handle is
            # released even when convert()/resize() raises or the file holds
            # several frames; the converted copy does not depend on it.
            with Image.open(os.path.join(folder_path, filename)) as source:
                images[filename] = source.convert('RGB').resize(img_size)
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the
            # whole scan, so it is reported and skipped.
            print(f"Ошибка загрузки {filename}: {e}")
    return images

# --- 2. Feature extraction (ResNet101) ---
def get_embeddings(images):
    """Compute one feature vector per image with a pretrained ResNet-101.

    The network's final child module is dropped so that the forward pass
    returns features rather than class scores. Images are processed one at
    a time with gradient tracking disabled.

    Args:
        images: Dict mapping file name to a PIL image. No resizing happens
            here, so the images are presumably already at the size the
            network expects (verify against the caller).

    Returns:
        Dict mapping each file name to its embedding as a NumPy array with
        all singleton dimensions squeezed out.
    """
    backbone = models.resnet101(pretrained=True)
    # Keep every child module except the last one.
    feature_extractor = torch.nn.Sequential(*list(backbone.children())[:-1])
    feature_extractor.eval()

    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])

    def embed(picture):
        # unsqueeze(0) adds the batch dimension the model expects.
        batch = preprocess(picture).unsqueeze(0)
        return feature_extractor(batch).squeeze().numpy()

    with torch.no_grad():
        return {
            name: embed(picture)
            for name, picture in tqdm(images.items(), 'Embeddings')
        }

# --- 3. Similar-image search (via scikit-learn) ---
def find_similar_images(embeddings, top_k=6):
    """Rank, for every image, the ``top_k`` most cosine-similar other images.

    Args:
        embeddings: Dict mapping file name to a 1-D embedding vector; all
            vectors must have the same length.
        top_k: Maximum number of neighbours to return per image. It is
            clamped to ``len(embeddings) - 1`` so an image is never listed
            as its own neighbour; values <= 0 yield empty rankings.

    Returns:
        Dict mapping each file name to a space-separated string of the most
        similar file names, best match first. Empty input gives ``{}``.
    """
    filenames = list(embeddings)
    if not filenames:
        return {}

    emb_matrix = np.array([embeddings[fn] for fn in filenames])

    # cosine_similarity L2-normalises its input itself and handles all-zero
    # rows, so no manual normalisation (which would divide by zero) is needed.
    sim_matrix = cosine_similarity(emb_matrix)

    # Exclude each image from its own ranking. -inf is used rather than -1
    # because -1 is itself a valid cosine similarity.
    np.fill_diagonal(sim_matrix, -np.inf)

    # Never ask for more neighbours than there are other images.
    k = max(0, min(top_k, len(filenames) - 1))

    results = {}
    # Wrapping the list (not the enumerate object) lets tqdm see the total.
    for i, query_fn in enumerate(tqdm(filenames, 'Pairs')):
        # Sort ascending, flip to descending, then keep the k best.
        top_indices = np.argsort(sim_matrix[i])[::-1][:k]
        results[query_fn] = ' '.join(filenames[idx] for idx in top_indices)
    return results

# --- 4. Saving the results ---
def save_to_csv(results, output_file='submission.csv'):
    """Write the rankings to a two-column CSV file and report the path.

    Args:
        results: Dict mapping a query file name to its ranking string.
        output_file: Destination path of the CSV file.

    The file has a header row ``filename,ranking`` and no index column;
    rows follow the iteration order of ``results``.
    """
    names = list(results)
    rankings = list(results.values())
    table = pd.DataFrame({'filename': names, 'ranking': rankings})
    table.to_csv(output_file, index=False)
    print(f"Результаты сохранены в {output_file}")


# Pipeline driver: load the images, embed them, rank neighbours, save a CSV.
folder_path = "dataset/"  # Set this to the folder containing the images
images = load_images(folder_path)
print(f"Загружено изображений: {len(images)}")

# Embed every loaded image, rank the most similar images for each one,
# and write the rankings to the default output file.
embeddings = get_embeddings(images)
results = find_similar_images(embeddings)
save_to_csv(results)

Donwload imgs: 100%|██████████| 9605/9605 [00:26<00:00, 361.93it/s]


Загружено изображений: 9605


Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /Users/arturkuzminskij/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:15<00:00, 11.4MB/s] 
Embeddings: 100%|██████████| 9605/9605 [29:32<00:00,  5.42it/s]  
Pairs: 9605it [00:05, 1898.95it/s]

Результаты сохранены в submission.csv



