We have 5 different types of features (modalities):

- meta features
- text basic features
- text TF-IDF features
- text embeddings
- image embeddings

For each item in the dataset, we want to get a tuple containing one vector per modality.

In [None]:
from tqdm.auto import tqdm

In [2]:
import json
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from torch.utils.data import Dataset

# getLogger() without a name returns the root logger, so raising its level to
# INFO makes the dataset's progress messages visible.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


class MultiModalDataset(Dataset):
    """Map-style dataset yielding one feature vector per modality for every item.

    All files of the requested split are read from ``data_folder`` when the
    dataset is constructed:

    - ``meta_{split}.csv``: meta features (must contain an ``id`` column)
    - ``basic_text_{split}.csv``: basic text features, keyed by ``id``
    - ``tfidf_{split}.csv``: TF-IDF text features, keyed by ``id``
    - ``img_embs_{split}.npy`` and ``id2idx_img_embs_{split}.json``
    - ``text_embs_{split}.npy`` and ``id2idx_text_embs_{split}.json``
    - ``id2target.json``: targets, loaded for "train" and "val" only

    The row order of the meta CSV defines the item order of the dataset.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, data_folder: str, split: str):
        """Load every modality of ``split`` from ``data_folder``.

        Args:
            data_folder: Directory containing the feature files listed in the
                class docstring.
            split: One of "train", "val" or "test".

        Raises:
            ValueError: If ``split`` is not a known split name.
        """
        # Fail fast: an unknown split used to load nothing and only surfaced
        # later as an AttributeError on the first len()/indexing call.
        if split not in self._SPLITS:
            raise ValueError(f"Unknown split {split!r}; expected one of {self._SPLITS}.")

        logging.info(f"Initializing dataset with split: {split}.")
        self.split = split
        folder = Path(data_folder)

        if self.split in ["train", "val"]:
            logging.info("Loading targets...")
            self.id2target = self._load_json(folder / "id2target.json")

        logging.info("Loading meta features...")
        self.meta_feats = pd.read_csv(folder / f"meta_{split}.csv")

        logging.info("Loading text basic features...")
        self.text_basic_feats = pd.read_csv(folder / f"basic_text_{split}.csv")

        logging.info("Loading text TF-IDF features...")
        self.text_tfidf_feats = pd.read_csv(folder / f"tfidf_{split}.csv")

        logging.info("Loading image embeddings...")
        self.img_embs = np.load(folder / f"img_embs_{split}.npy")
        self.id2idx_img_embs = self._load_json(folder / f"id2idx_img_embs_{split}.json")

        logging.info("Loading text embeddings...")
        self.text_embs = np.load(folder / f"text_embs_{split}.npy")
        self.id2idx_text_embs = self._load_json(folder / f"id2idx_text_embs_{split}.json")

        # Pre-compute id -> row positions once, so __getitem__ does not have to
        # scan each full DataFrame with a boolean mask for every sample.
        self._meta_rows = self._rows_by_id(self.meta_feats)
        self._text_basic_rows = self._rows_by_id(self.text_basic_feats)
        self._text_tfidf_rows = self._rows_by_id(self.text_tfidf_feats)

        logging.info("All files loaded successfully.")

    @staticmethod
    def _load_json(path):
        """Read a JSON file, closing the file handle afterwards."""
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)

    @staticmethod
    def _rows_by_id(frame):
        """Return a dict mapping each ``id`` value to its row positions in ``frame``."""
        return frame.groupby("id").indices

    @staticmethod
    def _features_for(frame, rows_by_id, sample_id):
        """Return the float32 features stored in ``frame`` for ``sample_id``.

        Selects the same rows as ``frame[frame["id"] == sample_id]`` (none if
        the id is absent), drops the ``id`` column and squeezes the result, so
        a single matching row becomes a 1-D vector.
        """
        positions = rows_by_id.get(sample_id, [])
        return frame.iloc[positions].drop(columns=["id"]).to_numpy(dtype=np.float32).squeeze()

    def __len__(self):
        """Return the number of items, i.e. the number of rows in the meta features."""
        return len(self.meta_feats)

    def __getitem__(self, index):
        """Return all modalities of the item at position ``index``.

        Returns:
            Tuple ``(meta_feats, text_basic_feats, text_tfidf_feats, img_emb,
            text_emb, target)``. ``target`` is ``None`` for the "test" split.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``.
            KeyError: If the item's id is missing from the targets or from an
                embedding index map.
        """
        # IndexError (not the KeyError that a failed label lookup would give)
        # is what lets `for sample in dataset` / `list(dataset)` terminate.
        if not 0 <= index < len(self):
            raise IndexError(f"index {index} is out of range for dataset of size {len(self)}")

        sample_id = self.meta_feats["id"].iloc[index]

        if self.split in ["train", "val"]:
            target = self.id2target[str(sample_id)]
        else:
            target = None

        meta_feats = self._features_for(self.meta_feats, self._meta_rows, sample_id)
        text_basic_feats = self._features_for(self.text_basic_feats, self._text_basic_rows, sample_id)
        text_tfidf_feats = self._features_for(self.text_tfidf_feats, self._text_tfidf_rows, sample_id)

        # The JSON maps are keyed by the stringified id and give the row of the
        # embedding matrix that belongs to this item.
        img_emb = self.img_embs[self.id2idx_img_embs[str(sample_id)]]
        text_emb = self.text_embs[self.id2idx_text_embs[str(sample_id)]]

        return meta_feats, text_basic_feats, text_tfidf_feats, img_emb, text_emb, target

# Train dataset

In [None]:
# Build the train split; the constructor reads every modality file of the split from ../data.
dataset = MultiModalDataset(data_folder="../data", split="train")

INFO:root:Initializing dataset with split: train.
INFO:root:Loading targets...
INFO:root:Loading meta features...
INFO:root:Loading text basic features...


In [None]:
# Export every train sample to its own file so it can be loaded later without
# the full feature tables.
save_to_folder = Path("../data/train")
save_to_folder.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(len(dataset))):
    sample = dataset[i]
    # The file is named after the positional index, not the item id. The tuple is
    # stored as a pickled object array, so reading it back needs
    # np.load(..., allow_pickle=True).
    np.save(save_to_folder / f"{i}.npy", np.array(sample, dtype=object), allow_pickle=True)

  0%|          | 0/155979 [00:00<?, ?it/s]

IndexError: index 155979 is out of bounds for axis 0 with size 155979

# Val dataset

In [None]:
# Rebind `dataset` to the validation split; the constructor reads its modality files from ../data.
dataset = MultiModalDataset(data_folder="../data", split="val")

INFO:root:Initializing dataset with split: val.
INFO:root:Loading targets...
INFO:root:Loading meta features...
INFO:root:Loading text basic features...
INFO:root:Loading text TF-IDF features...
INFO:root:Loading image embeddings...
INFO:root:Loading text embeddings...
INFO:root:All files loaded successfully.


In [None]:
# Export every validation sample to its own file.
save_to_folder = Path("../data/val")
save_to_folder.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(len(dataset))):
    sample = dataset[i]
    # The file is named after the positional index, not the item id. The tuple is
    # stored as a pickled object array, so reading it back needs
    # np.load(..., allow_pickle=True).
    np.save(save_to_folder / f"{i}.npy", np.array(sample, dtype=object), allow_pickle=True)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'int'>


# Test dataset

In [None]:
# Rebind `dataset` to the test split; no targets are loaded for it.
dataset = MultiModalDataset(data_folder="../data", split="test")

INFO:root:Initializing dataset with split: test.
INFO:root:Loading meta features...
INFO:root:Loading text basic features...
INFO:root:Loading text TF-IDF features...
INFO:root:Loading image embeddings...
INFO:root:Loading text embeddings...
INFO:root:All files loaded successfully.


In [None]:
# Export every test sample to its own file; the last tuple element (target) is
# None for this split.
save_to_folder = Path("../data/test")
save_to_folder.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(len(dataset))):
    sample = dataset[i]
    # The file is named after the positional index, not the item id. The tuple is
    # stored as a pickled object array, so reading it back needs
    # np.load(..., allow_pickle=True).
    np.save(save_to_folder / f"{i}.npy", np.array(sample, dtype=object), allow_pickle=True)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'NoneType'>
