### Setup

In [1]:
# Root of the working tree; the data/ and programs/ sub-directories used by
# this notebook are resolved relative to it.
work_dir = "/scratch/cse/phd/anz198717/XC"
# Name of the dataset folder under {work_dir}/data.
dataset = "MM-AmazonTitles-300K"
# Directory of the dataset being processed.
corpus_dset = f"{work_dir}/data/{dataset}"

In [2]:
import site
site.addsitedir(f"{work_dir}/programs/ExtremeMethods")
import xc.tools.build_from_msr as msr

## Only for MSR internal datasets

In [3]:
# args=f"--in_dir {corpus_dset} --ot_dir {corpus_dset} \
#     --docs_input corpus_data.txt --lbls_input corpus_x_y.txt"
# sys.argv = f"TOKEN {args}".split()
# print(args)
# args = msr.setup()
# lines = msr.build_docs(args)
# msr.build_lbls(args, lines)

In [4]:
# `os` is imported in this cell because it executes before the cell that holds
# the notebook's main import block; without it a fresh "Restart & Run All"
# fails with NameError on os.makedirs.
import os

# Output directory for the per-split sparse matrices written by save().
data_dir = f"{corpus_dset}/images"
# Directory holding the raw "<uid>->..." text maps (train/test/label).
data_txt = f"{corpus_dset}/raw_data"
# Tab-separated image database whose lines are indexed by byte offset.
img_path = f"{corpus_dset}/img.bin"
# exist_ok=True keeps the cell idempotent when it is re-run.
os.makedirs(data_dir, exist_ok=True)

## Building image database hash map

In [5]:
from xc.libs.utils import pbar
import scipy.sparse as sp
import numpy as np
import os


def read_ptrs(file):
    """Index the lines of a tab-separated binary file by their leading key.

    Every line is expected to look like ``<uid>\\t<payload>``. For each uid the
    byte offset at which each of its lines starts is recorded, in file order.

    Args:
        file: Path of the file to index.

    Returns:
        dict mapping the utf-8 decoded uid to the list of byte offsets of the
        lines that start with it. An empty dict is returned when ``file``
        does not exist.

    Raises:
        ValueError: if a line contains no tab separator.
    """
    offsets_by_uid = {}
    if not os.path.exists(file):
        return offsets_by_uid

    # Binary mode keeps tell() usable while iterating line by line.
    with open(file, "rb") as handle:
        line_start = 0
        for raw_line in pbar(handle):
            key_bytes, _ = raw_line.split(b"\t", 1)
            key = key_bytes.decode('utf-8')
            offsets_by_uid.setdefault(key, []).append(line_start)
            # The position after this line is where the next line begins.
            line_start = handle.tell()
    return offsets_by_uid


def build_sparse_mat(doc_map, dict_ptrs):
    """Build a CSR matrix of image offsets for every document in ``doc_map``.

    Each line of ``doc_map`` is read as ``<uids>->...`` where ``<uids>`` is a
    comma-separated list of keys. Every key is looked up in ``dict_ptrs`` and
    each offset found gets its own, globally unique column; row ``i`` holds
    the offsets of the i-th line. Offsets are stored shifted by +1, so a key
    missing from ``dict_ptrs`` (placeholder -1) becomes 0 and is not stored.

    Args:
        doc_map: Path of the latin1-encoded text map, one document per line.
        dict_ptrs: Mapping from key to a list of integer offsets (the driver
            cell passes the result of ``read_ptrs``).

    Returns:
        scipy.sparse.csr_matrix of dtype float64 and shape
        ``(number of lines, total number of offset slots)``. An empty map
        file yields an empty ``(0, 0)`` matrix.
    """
    # Close the map file deterministically instead of leaking the handle.
    with open(doc_map, "r", encoding="latin1") as map_file:
        uids = [line.split("->", 1)[0] for line in pbar(map_file)]

    ptrs, rows = [], []
    for row, uid in pbar(enumerate(uids), desc="buildling"):
        # NOTE: offsetting by 1 so that a genuine offset of 0 remains a
        # stored non-zero while the -1 "missing" placeholder turns into 0.
        sub_ptrs = np.concatenate(
            [dict_ptrs.get(key, [-1]) for key in uid.split(",")]) + 1
        ptrs.append(sub_ptrs)
        # Integer row ids: float arrays are not valid sparse indices.
        rows.append(np.full(sub_ptrs.size, row, dtype=np.int64))

    # np.concatenate rejects an empty list, so handle an empty map up front.
    if not ptrs:
        return sp.csr_matrix((0, 0), dtype=np.float64)

    ptrs = np.concatenate(ptrs)
    rows = np.concatenate(rows)
    # Slots were handed out consecutively, so the columns are simply 0..n-1.
    cols = np.arange(ptrs.size, dtype=np.int64)

    # Assemble the CSR matrix directly from (row, col, value) triplets; this
    # avoids the per-element fancy assignment into a LIL matrix. float64 is
    # the dtype a default-constructed lil_matrix would have produced.
    image_mat = sp.csr_matrix(
        (ptrs, (rows, cols)), shape=(len(uids), ptrs.size), dtype=np.float64)
    # Drop the explicit zeros left by missing keys so nnz counts real offsets.
    image_mat.eliminate_zeros()
    return image_mat


def save(data_path, img):
    """Write a sparse matrix to ``{data_path}.img.bin.npz`` if it has entries.

    Args:
        data_path: Path prefix of the output file (suffix is appended here).
        img: Sparse matrix to store; it is skipped when it has no stored
            non-zero entries.
    """
    if img.nnz <= 0:
        # Nothing to persist for an all-zero matrix.
        return
    sp.save_npz(f"{data_path}.img.bin.npz", img)

In [7]:
# Index the image database: uid -> list of byte offsets of its lines.
dict_ptrs = read_ptrs(img_path)

# Build one offset matrix per split from its raw "<uids>->..." map file.
tst_img = build_sparse_mat(f"{data_txt}/test.raw.txt", dict_ptrs)
trn_img = build_sparse_mat(f"{data_txt}/train.raw.txt", dict_ptrs)
lbl_img = build_sparse_mat(f"{data_txt}/label.raw.txt", dict_ptrs)

# Persist each matrix as {data_dir}/<split>.img.bin.npz; save() skips a
# matrix that has no stored non-zero entries.
save(f"{data_dir}/test", tst_img)
save(f"{data_dir}/label", lbl_img)
save(f"{data_dir}/train", trn_img)

docs: 970237it [00:00, 1087165.72it/s]
docs: 1305265it [00:01, 1074220.80it/s]
970237it [00:00, 1296265.58it/s]
buildling: 970237it [00:15, 64480.39it/s]
1305265it [00:00, 1336204.46it/s]
buildling: 1305265it [00:20, 63698.07it/s]
