## Setup

In [1]:
import os

# --- Locations ---------------------------------------------------------------
# Root of the XC workspace. The default is the original cluster scratch path;
# set the XC_WORK_DIR environment variable to run the notebook elsewhere
# without editing this cell.
work_dir = os.environ.get("XC_WORK_DIR", "/scratch/cse/phd/anz198717/XC")
# Dataset folder name under <work_dir>/Corpus.
corpus_dataset = "MM-AmazonTitles-300K"
# Dataset root handed to every fit/predict/extract call further down.
corpus_dset = f"{work_dir}/Corpus/{corpus_dataset}"

# --- Run / model selection ---------------------------------------------------
# `version` and `model_type` are used to build params.model_dir in the
# parameter cell; the remaining names are copied onto `params` there.
version = "test_xc"
model_type = "MufinTextXC"
img_model = "ViT"
txt_model = "sentencebert"
ranker = "MufinXAttnRanker"

In [3]:
# Standard-library imports; none of them depend on the configuration below.
import argparse
import os
import site
import sys

# The parameter cell calls parser.parse_args(), which reads sys.argv[1:].
# Replace the argv the notebook kernel was started with by a bare program
# name so that call sees no command-line arguments.
sys.argv = ["MUFIN"]

# Add the project directory to sys.path before `import mufin` below
# (presumably where the `mufin` package lives - confirm).
site.addsitedir(f"{work_dir}/programs/ExtremeMethods")

# Environment switches are set before the numeric / project / torch imports.
# NOTE(review): KEEP_TOP_K and RESTRICTMEM are not read anywhere in this
# notebook; presumably mufin consumes them - confirm their meaning there.
os.environ['KEEP_TOP_K'] = "-1"
os.environ['RESTRICTMEM'] = "0"
# Expose only GPUs 0 and 1 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

import numpy as np
import scipy.sparse as sp
import mufin as mn
import torch

In [None]:
# Build the shared `params` namespace used by every stage of the notebook.
# No arguments are registered on the parser: parse_args() only provides an
# empty Namespace (sys.argv was reset in the import cell) and every setting is
# attached as an attribute below. Later cells override some of these values.
parser = argparse.ArgumentParser(description='Inference')
params = parser.parse_args()
params.A = 0.55
params.B = 1.5

# Data and encoder selection (values come from the configuration cell).
params.data_dir = corpus_dset
params.img_model = img_model
params.filter_labels = "filter_labels_test.txt"
params.txt_model = txt_model
params.data_path = None

# Network / training defaults.
params.module = 1
params.max_worker_thread = 6
params.bucket = 1
params.accumulate = 1
params.margin = 0.3
params.neg_sample = 3
params.project_dim = 768
params.n_heads = 2
params.head_dims = 1024
params.n_layer = 1
params.dropout = 0.1
params.keep_all = True
params.batch_size = 512
params.num_workers = 6
params.validate = True
params.model_out_name = "model.pkl"
params.optim = "Adam"
params.prefetch_factor = 2

# Adjust according to need
params.ranker = ranker
# NOTE(review): the dataset name is spelled out here instead of reusing
# `corpus_dataset`; keep the two in sync (or switch to the variable) if the
# corpus is ever changed.
params.model_dir = f"{work_dir}/models/MM-AmazonTitles-300K/{model_type}/v_{version}"
params.result_dir = f"{corpus_dset}/temp"
params.model_fname = model_type
params.num_labels = 1305265
params.top_k = 100
# One index per label: 0 .. num_labels - 1.
params.lbl_indices = np.arange(params.num_labels)

# No encoder initialisation file when the model type name contains
# "PreTrained"; otherwise point at the encoder written by the module 3 cell.
params.encoder_init = None if "PreTrained" in model_type else "module3/encoder.pkl"
os.makedirs(params.model_dir, exist_ok=True)
print(params)

## M1 Training

In [None]:
# Module 1 settings, then build the network / optimizer / model from `params`.
# Several values set in the parameter cell are overridden here
# (max_worker_thread 6 -> 10, batch_size 512 -> 1023).
params.module = 1
# NOTE(review): zero epochs although model.fit() is called in the next cell -
# confirm this is intentional.
params.num_epochs = 0
# The original cell assigned 20 here and 1000 further down; only the final
# value ever took effect, so it is set once.
params.surrogate_warm = 1000
params.lr = 0.02
params.at_least = 5
params.ignore_img = False
params.ignore_txt = False
params.max_csim = 0.9
params.max_worker_thread = 10
params.min_leaf_sz = 32
params.min_splits = -1
params.sampling = True
params.warm_start = 0
params.multi_pos = 1
params.preload = False
params.hard_pos = False
# NOTE(review): 1023 rather than 1024 - confirm this is not a typo.
params.batch_size = 1023

net = mn.construct_network(params)
optim = mn.optimizer_utils.Optimizer()
model = mn.construct_model(params, net, optim)

In [None]:
# Fit module 1. After the dataset root, the file arguments come in three
# groups (train, test, label); paths are presumably resolved by mufin relative
# to `corpus_dset` - confirm.
model.fit(
    corpus_dset,
    # train files
    "images/train.img.bin",
    "raw_data/train_map.txt",
    "trn_X_Y.txt",
    # test files
    "images/test.img.bin",
    "raw_data/test_map.txt",
    "tst_X_Y.txt",
    # label files
    "images/label.img.bin",
    "raw_data/label_map.txt",
)

In [None]:
# Retrain module 1 using only the train and label files (no test files are
# passed, unlike the fit call).
model.retrain(
    corpus_dset,
    # train files
    "images/train.img.bin",
    "raw_data/train_map.txt",
    "trn_X_Y.txt",
    # label files
    "images/label.img.bin",
    "raw_data/label_map.txt",
)

## Setup for M2 and M3

In [None]:
# Module 2: rebuild network and model with params.module == 2, then write one
# prediction matrix per split to <result_dir>/module2/<split>.npz.
params.module = 2
shorty_path = f"{params.result_dir}/module2"
os.makedirs(shorty_path, exist_ok=True)

net = mn.construct_network(params)
optim = mn.optimizer_utils.Optimizer()
model = mn.construct_model(params, net, optim)

for mode in ("test", "train", "label"):
    # The three trailing arguments are left as None for every split.
    tst_mat = model.predict(
        corpus_dset,
        f"images/{mode}.img.bin",
        f"raw_data/{mode}_map.txt",
        None,
        None,
        None,
    )
    sp.save_npz(os.path.join(shorty_path, f"{mode}.npz"), tst_mat)


In [None]:
# Module 3: rebuild network and model with params.module == 3, then dump the
# extracted embeddings of every split and the encoder to <result_dir>/module3.
params.module = 3

emb_path = f"{params.result_dir}/module3"
os.makedirs(emb_path, exist_ok=True)

net = mn.construct_network(params)
optim = mn.optimizer_utils.Optimizer()
model = mn.construct_model(params, net, optim)

for mode in ("test", "train", "label"):
    tst_emb = model.extract(
        corpus_dset,
        f"images/{mode}.img.bin",
        f"raw_data/{mode}_map.txt",
    )
    # One output file per key of the extraction result: <split>.<key>.
    for key in tst_emb.keys():
        tst_emb[key].save(f"{emb_path}/{mode}.{key}")

# Stored under the file name that params.encoder_init refers to
# ("module3/encoder.pkl").
encoder = model.extract_encoder()
torch.save(encoder, os.path.join(emb_path, "encoder.pkl"))


## M4 Training

In [None]:
# Module 4 (ranker) settings, followed by construction of the network,
# optimizer and model from the shared `params` namespace.
# Values set by earlier cells that are overridden here: lr, num_epochs,
# batch_size, n_heads and model_out_name.
params.module = 4
# NOTE(review): the parameter cell sets `neg_sample`; `sample_neg` here is a
# different attribute name - confirm both spellings are intended.
params.sample_neg = 5
params.sample_pos = 12
params.cosine_margin = 0.5
params.ranker_project_dim = 768
params.lr = 0.005
params.n_layer = 1
params.lr_mf_enc = 0.01
params.lr_mf_clf = 0.1
params.sampling = True
params.ranker_warm = 1000
params.num_epochs = 20
params.batch_size = 512
params.n_heads = 12
# Output file name is derived from the ranker name chosen in the config cell.
params.model_out_name = f"model_{params.ranker}.pkl"

net = mn.construct_network(params)
optim = mn.optimizer_utils.Optimizer()
model = mn.construct_model(params, net, optim)


In [None]:
# Fit the module 4 ranker on the files written by the module 3 cell.
# File arguments come in three groups (train, test, label); plain string
# literals are used because none of the paths needs interpolation.
model.fit(
    corpus_dset,
    # train files
    "module3/train.img.pretrained",
    "module3/train.txt.pretrained",
    "trn_X_Y.txt",
    # test files
    "module3/test.img.pretrained",
    "module3/test.txt.pretrained",
    "tst_X_Y.txt",
    # label files
    "module3/label.img.pretrained",
    "module3/label.txt.pretrained",
)

In [None]:
# Score the test split against the labels with the module 4 model.
# The result is later indexed with the keys "module4/m2" and "module4/m4"
# inside evaluations(). Plain string literals are used because none of the
# paths needs interpolation.
score_mat = model.predict(
    corpus_dset,
    # test files
    "module3/test.img.pretrained",
    "module3/test.txt.pretrained",
    None,
    # label files
    "module3/label.img.pretrained",
    "module3/label.txt.pretrained",
)

## Evaluation

In [None]:
from xclib.evaluation import xc_metrics as xm
from xc.libs.utils import load_overlap
from xclib.data import data_utils as du

# Sparse matrix read from <corpus_dset>/tst_X_Y.txt (presumably the
# ground-truth test labels - confirm).
tst_y = du.read_sparse_file(f"{corpus_dset}/tst_X_Y.txt")
# Metrics object built from that matrix; evaluations() calls acc.eval(...).
acc = xm.Metrics(tst_y)
# Row / column indices loaded from the filter file; evaluations() zeroes these
# (doc, label) positions in the score matrices before scoring.
docs, lbls = load_overlap(corpus_dset, "filter_labels_test.txt")
def _blend_scores(m2, m4, weight):
    """Return ``weight * m2 + (1 - weight) * m4`` as a new matrix."""
    # `multiply` returns a new matrix, so the inputs need no defensive copy.
    return m2.multiply(weight) + m4.multiply(1 - weight)


def evaluations(score_dict, acc, docs, lbls, al=0.9,
                alphas=(0.1, 0.3, 0.5, 0.7, 0.9), K=5):
    """Filter, evaluate and blend the two module 4 score matrices.

    The matrices stored under ``"module4/m2"`` and ``"module4/m4"`` are
    filtered IN PLACE: every ``(docs[i], lbls[i])`` entry is set to zero and
    explicit zeros are dropped, so ``score_dict`` is modified for the caller.
    Each filtered matrix is evaluated and printed, then one blend
    ``alpha * m2 + (1 - alpha) * m4`` per value in ``alphas`` is evaluated
    and printed.

    Parameters
    ----------
    score_dict : mapping
        Must contain the keys ``"module4/m2"`` and ``"module4/m4"``. The
        values must support ``mat[rows, cols] = 0``, ``eliminate_zeros()``,
        ``multiply(scalar)`` and ``+``.
    acc : object
        Evaluator exposing ``eval(matrix, K=...)``; its return value is
        printed as is.
    docs, lbls : index sequences of equal length
        Row and column indices of the entries to remove before evaluation.
    al : float, default 0.9
        Weight of ``m2`` in the returned blend.
    alphas : iterable of float, default (0.1, 0.3, 0.5, 0.7, 0.9)
        Blend weights that are evaluated and printed.
    K : int, default 5
        Forwarded to ``acc.eval``.

    Returns
    -------
    matrix
        ``al * m2 + (1 - al) * m4`` computed from the filtered matrices.
    """
    m2 = score_dict["module4/m2"]
    m4 = score_dict["module4/m4"]

    # Same order as before: filter and report m2 first, then m4.
    for mat in (m2, m4):
        mat[docs, lbls] = 0
        mat.eliminate_zeros()
        print(acc.eval(mat, K=K))

    for alpha in alphas:
        blended = _blend_scores(m2, m4, alpha)
        print(f"alpha={alpha}")
        print(acc.eval(blended, K=K))

    return _blend_scores(m2, m4, al)

In [None]:
# Report metrics for both score matrices and the alpha sweep, and keep the
# blend that weights m2 by 0.9.
final_score_mat = evaluations(score_mat, acc, docs, lbls, al=0.9)