In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class MMIMDbNumpyDataset(Dataset):
    """Dataset over two ``.npy`` files: an image array and a per-record metadata array.

    Indexing returns ``(img_tensor, genres, description)`` where ``img_tensor``
    is a float32 tensor in channel-first layout with values divided by 255.

    Args:
        images_path: Path to the image array; it is opened memory-mapped and
            read-only, so the images are not loaded into RAM up front.
        data_path: Path to the metadata array. Each record is either a dict with
            ``'genres'`` and ``'description'`` keys or a sequence whose items 1
            and 2 hold those two values.
        transform: Optional callable applied to the image tensor.
    """

    def __init__(self, images_path, data_path, transform=None):
        import warnings

        # Memory-map the images so only the rows that are indexed get read.
        self.images = np.load(images_path, mmap_mode='r')
        # Metadata records (data.npy).
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load files from a trusted source.
        self.data = np.load(data_path, allow_pickle=True)
        self.transform = transform

        n_images = self.images.shape[0]
        n_records = len(self.data)
        if n_images != n_records:
            # __len__ clamps to the shorter array; say so instead of failing
            # later with an out-of-range lookup in the middle of an epoch.
            warnings.warn(
                f"images ({n_images}) and metadata ({n_records}) differ in length; "
                f"only the first {min(n_images, n_records)} samples are used"
            )

        # Quick sanity output.
        print("Obrazy shape:", self.images.shape)
        if n_records > 0:
            print("Pierwszy rekord data:", self.data[0])

    def __len__(self):
        """Number of samples for which both an image and a metadata record exist."""
        return min(self.images.shape[0], len(self.data))

    def __getitem__(self, idx):
        """Return ``(img_tensor, genres, description)`` for sample ``idx``.

        Raises:
            ValueError: If the image is not 3-D or has no axis of size 3 in the
                first or last position.
        """
        # Copy out of the read-only memmap: torch.from_numpy warns about
        # non-writable arrays, and the copy detaches the sample from the file.
        img = np.array(self.images[idx])

        # Accept both layouts: (3,H,W) is used as is, (H,W,3) is moved to channel-first.
        if img.ndim != 3:
            raise ValueError(f"Nieoczekiwany shape obrazu: {img.shape}")
        if img.shape[0] == 3:
            chw = img
        elif img.shape[-1] == 3:
            chw = np.ascontiguousarray(np.transpose(img, (2, 0, 1)))
        else:
            raise ValueError(f"Nieoczekiwany shape obrazu: {img.shape}")

        img_tensor = torch.from_numpy(chw).float() / 255.0

        if self.transform:
            img_tensor = self.transform(img_tensor)

        # A record is either a dict {'genres': ..., 'description': ...} or a
        # positional sequence with genres at index 1 and description at index 2.
        meta = self.data[idx]
        if isinstance(meta, dict):
            genres = meta['genres']
            description = meta['description']
        else:
            genres = meta[1]
            description = meta[2]
        # Other fields can be returned here as needed.

        return img_tensor, genres, description

# ====== UŻYCIE ======

import torchvision.transforms as T

# Spatial size every image is resized to by the transform below.
img_size = (160, 160)

# Per-sample preprocessing handed to the dataset: tensor -> PIL image ->
# resize -> tensor again -> per-channel normalisation.
transform = T.Compose(
    [
        T.ToPILImage(),
        T.Resize(size=img_size),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

dataset = MMIMDbNumpyDataset(
    '/net/pr2/projects/plgrid/plggdnnp/datasets/MM-IMDb/images.npy',
    '/net/pr2/projects/plgrid/plggdnnp/datasets/MM-IMDb/data.npy',
    transform,
)

loader = DataLoader(dataset=dataset, num_workers=4, shuffle=True, batch_size=32)

# Smoke test: fetch one shuffled batch, show its image shape and the first
# sample's genres and description, then stop.
for img, genres, desc in loader:
    print("Batch shape:", img.shape)
    print("Pierwsze genres w batchu:", genres[0])
    print("Pierwszy opis:", desc[0])
    break


In [None]:
import torch
import torchvision.transforms as T
from torchvision.models import resnet18
import h5py
from tqdm import trange
import numpy as np

# Settings
root_path = '/net/pr2/projects/plgrid/plggdnnp/datasets/MM-IMDb'
h5_path = 'images.h5'
batch_size = 32
img_size = (96, 96)  # Size the images are rescaled to

# Model: ResNet18 with its final FC layer replaced by Identity, so the forward
# pass returns the pooled features instead of class scores.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=`; left unchanged because the installed version is not visible here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_img = resnet18(pretrained=True)
model_img.fc = torch.nn.Identity()
model_img.eval().to(device)

transform = T.Compose([
    T.ToPILImage(),
    T.Resize(img_size),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Open the HDF5 file; the features are computed and saved while it is open.
with h5py.File(f'{root_path}/{h5_path}', 'r') as f:
    images = f['images']
    N = images.shape[0]
    print(type(images), images.dtype)
    print(f"Rozmiar danych: {images.shape}")
    all_features = []

    with torch.no_grad():
        for start in trange(0, N, batch_size):
            end = min(start + batch_size, N)
            # One sliced read per batch instead of one HDF5 read per image;
            # the arrays obtained are the same, with far fewer round trips.
            chunk = images[start:end]
            batch = []
            for img in chunk:
                # NOTE(review): (2,1,0) reverses all three axes. That yields
                # (H, W, C) only if the stored layout is (C, W, H); for
                # (C, H, W) storage it swaps height and width. TODO confirm
                # against how images.h5 was written.
                img = np.transpose(img, (2,1,0))
                img_t = transform(img)
                batch.append(img_t)
            batch_tensor = torch.stack(batch).to(device)
            feats = model_img(batch_tensor)
            all_features.append(feats.cpu().numpy())

    # After all batches:
    all_features = np.concatenate(all_features, axis=0)  # [N, 512]
    # NOTE(review): the later embedding-extraction cell saves to this same
    # filename, so running both overwrites this result.
    np.save('image_features_resnet18.npy', all_features)
print("Ekstrakcja zakończona, zapisano image_features_resnet18.npy")


In [None]:
! conda install conda-forge::transformers

In [None]:
from torchvision.models import resnet18
from transformers import AutoTokenizer, AutoModel

# ---- 2. Model setup ----
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# ResNet18 with the final FC layer swapped for Identity
resnet = resnet18(pretrained=True)
resnet.fc = torch.nn.Identity()
resnet.to(device)
resnet.eval()

# NOTE(review): per the torchvision docs, ToTensor treats a numpy.ndarray as
# (H, W, C) and returns (C, H, W) scaled to [0, 1]. An array that is already
# (3, H, W) would get its axes permuted, so confirm the layout of whatever is
# passed through this transform.
img_transform = T.Compose(
    [
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# DistilBERT tokenizer and encoder
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
text_model = AutoModel.from_pretrained("distilbert-base-uncased")
text_model = text_model.to(device)
text_model.eval()

In [None]:
from tqdm import tqdm

# ---- 3. Batched feature extraction ----
dataset = MMIMDbNumpyDataset(
    '/net/pr2/projects/plgrid/plggdnnp/datasets/MM-IMDb/images.npy',
    '/net/pr2/projects/plgrid/plggdnnp/datasets/MM-IMDb/data.npy'
)
# shuffle=False keeps the saved feature rows in dataset order.
loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=4)

# Per-channel mean/std (the same values used by the notebook's T.Normalize
# calls), shaped to broadcast over a (B, 3, H, W) batch.
norm_mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1)
norm_std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1)

img_feats_list = []
txt_feats_list = []

with torch.no_grad():
    # The dataset yields (image, genres, description); genres are not used
    # here, but all three values must be unpacked.
    for imgs, _genres, descs in tqdm(loader):
        # ==== 1. Images ====
        # The dataset already returns float channel-first tensors scaled to
        # [0, 1], so only normalisation is left. Casting to uint8 would
        # truncate the values to 0/1, and T.ToTensor would permute the axes.
        imgs_batch = (imgs.to(device) - norm_mean) / norm_std
        img_feats = resnet(imgs_batch).cpu().numpy()  # (B, 512)
        img_feats_list.append(img_feats)

        # ==== 2. Texts ====
        # Tokenize the whole batch at once
        enc = tokenizer(list(descs), padding=True, truncation=True, max_length=128, return_tensors="pt")
        enc = {k: v.to(device) for k, v in enc.items()}
        out = text_model(**enc)
        # Hidden state of the first token of each sequence
        txt_feats = out.last_hidden_state[:, 0, :].cpu().numpy()  # (B, 768)
        txt_feats_list.append(txt_feats)

# ---- 4. Save to files ----
img_feats_arr = np.concatenate(img_feats_list, axis=0)
txt_feats_arr = np.concatenate(txt_feats_list, axis=0)
# NOTE(review): the HDF5-based extraction cell saves to this same image-feature
# filename, so whichever cell runs last wins.
np.save('image_features_resnet18.npy', img_feats_arr)
np.save('text_features_distilbert.npy', txt_feats_arr)

print("Ekstrakcja embeddingów zakończona!")