In [None]:
%load_ext autoreload
%autoreload 2

from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/EECS576/VIDRec/DSSM')

import os
print(os.getcwd())

Mounted at /content/drive
/content/drive/My Drive/EECS576/VIDRec/DSSM


In [None]:
!ls
!pip list
!pip install lmdb colorlog tensorboardX colorama torch-geometric clip

'Copy of MicroLens-50k_pairs.tsv'   log		      MicroLens-100k_pairs.csv	 REC	  vbpr.yaml
 feature			    log_tensorboard   MicroLens-100k_pairs.tsv	 run.py
 ks				    main.py	      MicroLens-50k_pairs.csv	 VBPR
Package                            Version
---------------------------------- -------------------
absl-py                            1.4.0
accelerate                         1.1.1
aiohappyeyeballs                   2.4.3
aiohttp                            3.11.2
aiosignal                          1.3.1
alabaster                          1.0.0
albucore                           0.0.19
albumentations                     1.4.20
altair                             4.2.2
annotated-types                    0.7.0
anyio                              3.7.1
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
array_record                       0.5.1
arviz                              0.20.0
astropy                            6.1.6
astropy-iers-data             

In [None]:
import torch

# Report whether a GPU is visible to PyTorch before training is launched.
gpu_ready = torch.cuda.is_available()
if not gpu_ready:
    print("No CUDA GPUs are available.")
else:
    print("CUDA is available. Number of GPUs:", torch.cuda.device_count())

# Raw values as well, for a quick copy/paste check.
print(gpu_ready)
print(torch.cuda.device_count())


CUDA is available. Number of GPUs: 1
True
1


In [None]:
# This file serves the purpose of transforming raw interaction data into the data forms required to execute IDRec baselines.


import pandas as pd
import numpy as np
import os
import torch
SEQ_LEN = 10
file_l = ['MicroLens-100k_pairs.tsv']
data_l = ['ks']
for file_name, data_name in zip(file_l, data_l):
    # Each row of the pairs file is: <user id> \t <space-separated item ids>.
    dat_seq = pd.read_csv(file_name, sep='\t', header=None)
    dat_arr = np.array(dat_seq)

    # Flatten every sequence into (item, user, position) triples; the position
    # of an item inside its sequence is used as the timestamp.
    inter = []
    for seq in dat_arr:
        uid = seq[0]
        iseq = seq[1].split()
        for i, item in enumerate(iseq):
            inter.append([item, uid, i])

    inter_df = np.array(inter)
    dat = pd.DataFrame(inter_df)
    dat.columns = ['item_id', 'user_id', 'timestamp']
    # np.array() above turned every field into a string; restore the integer
    # timestamp so that sorting is numeric rather than lexicographic.
    dat['timestamp'] = dat['timestamp'].astype(int)
    dat.sort_values(by='timestamp', inplace=True, ascending=True)
    user_list = dat['user_id'].values

    # Row positions (in timestamp order) of every user's interactions.
    rows_by_user = {}
    for i, key in enumerate(user_list):
        rows_by_user.setdefault(key, []).append(i)

    # `indices` is created unconditionally. It used to be assigned inside the
    # grouping loop's `else` branch, which left it undefined whenever no user
    # had more than one interaction.
    indices = []
    for rows in rows_by_user.values():
        # Keep only the last SEQ_LEN + 3 interactions of each user.
        indices.extend(rows[-(SEQ_LEN + 3):])

    final_dat = dict()
    for k in dat:
        final_dat[k] = dat[k].values[indices]

    final_dat = pd.DataFrame(final_dat)
    print(final_dat.head(3))
    # Number of users, number of items, number of interactions.
    print(final_dat['user_id'].nunique(), final_dat['item_id'].nunique(), final_dat.shape[0])
    os.makedirs(f'./{data_name}/', exist_ok=True)
    final_dat.to_csv(f'./{data_name}/{data_name}.inter', index=False)

  item_id user_id  timestamp
0    1958       1          0
1    6346       1          1
2   15223       1          2
100000 19671 678355


In [None]:
# The following part generates the popularity count file (i.e. the pop.npy file needed in baseline code) of the dataset


SEQ_LEN = 10


class Data:
    """Interaction table with integer-remapped ids and a per-user split.

    The frame passed to the constructor is kept by reference and modified in
    place: its 'user_id' and 'item_id' columns are overwritten with integer
    ids, and `build()` sorts it by 'timestamp'.
    """

    def __init__(self, df):
        self.inter_feat = df
        self._data_processing()

    def _data_processing(self):
        """Replace user/item tokens by integer ids and record the vocabularies.

        Ids start at 1; position 0 of every vocabulary is the '[PAD]' token.
        """
        self.id2token = {}
        self.token2id = {}
        for field in ['user_id', 'item_id']:
            codes, uniques = pd.factorize(self.inter_feat[field])
            vocab = np.array(['[PAD]'] + list(uniques))
            self.id2token[field] = vocab
            self.token2id[field] = {token: pos for pos, token in enumerate(vocab)}
            # factorize() codes are 0-based; shift by one to free slot 0 for '[PAD]'.
            self.inter_feat[field] = codes + 1

        # Vocabulary sizes include the '[PAD]' entry.
        self.user_num = len(self.id2token['user_id'])
        self.item_num = len(self.id2token['item_id'])
        self.inter_num = len(self.inter_feat)
        self.uid_field = 'user_id'
        self.iid_field = 'item_id'
        self.user_seq = None
        self.train_feat = None
        self.feat_name_list = ['inter_feat']

    def build(self):
        """Sort by timestamp and split every user's rows into train/valid/test.

        For each user the last row goes to test, the second-to-last to valid
        and everything before that to train.

        Returns:
            (train_feat, valid_feat, test_feat): dicts mapping each column
            name of `inter_feat` to an array of the selected rows.

        Raises:
            IndexError: if some user has fewer than two rows.
        """
        self.sort(by='timestamp')
        frame = self.inter_feat
        users = frame['user_id'].values
        items = frame['item_id'].values
        grouped = self._grouped_index(users)

        # Full item sequence of every user, in timestamp order.
        self.user_seq = {uid: items[rows] for uid, rows in grouped.items()}

        def take(rows):
            # Select the given row positions from every column.
            return {col: self.inter_feat[col].values[rows] for col in self.inter_feat}

        train_rows = []
        for rows in grouped.values():
            train_rows.extend(list(rows)[:-2])
        train_feat = take(train_rows)

        valid_feat = take([rows[-2] for rows in grouped.values()])
        test_feat = take([rows[-1] for rows in grouped.values()])

        self.train_feat = train_feat
        return train_feat, valid_feat, test_feat

    def _grouped_index(self, group_by_list):
        """Map every distinct key to the list of positions where it occurs."""
        positions = {}
        for pos, key in enumerate(group_by_list):
            positions.setdefault(key, []).append(pos)
        return positions

    def _build_seq(self, train_feat):
        """Cut every user's training rows into sequences of at most SEQ_LEN + 1 items.

        `train_feat` is expected to hold each user's rows contiguously (as
        produced by `build()`). A run longer than SEQ_LEN + 1 first drops its
        oldest `len % (SEQ_LEN + 1)` rows and is then cut into full-length
        chunks; shorter runs are kept whole.

        Side effect: stores the flat list of all retained item ids in
        `self.seq_train_item`.

        Returns:
            dict with 'user_id' (array, one entry per sequence) and
            'item_seq' (list of item-id arrays).
        """
        max_len = SEQ_LEN + 1
        # A trailing -1 sentinel makes the last user's run get flushed too.
        users = np.append(train_feat['user_id'], -1)

        uid_list, spans = [], []
        run_start = 0
        current_uid = users[0]
        for pos, uid in enumerate(users):
            if current_uid != uid:
                run_len = pos - run_start
                if run_len > max_len:
                    # Skip the oldest rows so the remainder splits evenly.
                    first = run_start + run_len % max_len
                    for chunk_start in range(first, pos, max_len):
                        uid_list.append(current_uid)
                        spans.append(slice(chunk_start, chunk_start + max_len))
                else:
                    uid_list.append(current_uid)
                    spans.append(slice(run_start, pos))
                current_uid = uid
                run_start = pos

        seq_train_feat = {}
        seq_train_feat['user_id'] = np.array(uid_list)
        seq_train_feat['item_seq'] = [train_feat['item_id'][span] for span in spans]

        seq_train_item = []
        for seq in seq_train_feat['item_seq']:
            seq_train_item += list(seq)

        self.seq_train_item = seq_train_item
        return seq_train_feat

    def sort(self, by, ascending=True):
        """Sort `inter_feat` in place by the given column(s)."""
        self.inter_feat.sort_values(by=by, ascending=ascending, inplace=True)



data_list = ['ks', ]

for idx in range(len(data_list)):
    name = data_list[idx]
    inter = pd.read_csv(
        f'./{name}/{name}.inter',
        delimiter=',',
        dtype={'item_id': str, 'user_id': str, 'timestamp': int},
        header=0,
        names=['item_id', 'user_id', 'timestamp'],
    )

    # Count the distinct raw item tokens before Data() overwrites the column
    # with integer ids.
    item_num = inter['item_id'].nunique()
    D = Data(inter)
    train, valid, test = D.build()
    D._build_seq(train)
    train_items = D.seq_train_item

    # Occurrences of every item id among the retained training sequences;
    # the list is indexed by item id.
    train_item_counts = [0] * (item_num + 1)
    for item in train_items:
        train_item_counts[item] += 1
    # The exponent 1.0 leaves the counts unchanged (only converts them to float).
    item_counts_powered = np.power(train_item_counts, 1.0)

    # Normalise the counts of ids 1..item_num into a distribution, then put a
    # fixed value of 1 in front for id 0.
    real_item_counts = item_counts_powered[1:item_num + 1]
    pop_prob_list = real_item_counts / sum(real_item_counts)
    pop_prob_list = np.append([1], pop_prob_list)
    print(('prob max: {}, prob min: {}, prob mean: {}'.format(
        max(pop_prob_list), min(pop_prob_list), np.mean(pop_prob_list))))

    np.save(f'./{name}/pop', pop_prob_list)

prob max: 1.0, prob min: 0.0, prob mean: 0.00010166734444896296


In [None]:
# Run the project's main.py entry point in a subprocess; the captured log
# below shows it printing the training/evaluation configuration.
!python main.py

[rank0]:[W1202 03:55:35.389067891 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
02 Dec 03:55    INFO  [1;35m[Training]: [0m[1;36mtrain_batch_size[0m = [1;33m[1024][0m
[0m02 Dec 03:55    INFO  [1;35m[Evaluation]: [0m[1;36meval_batch_size[0m = [1;33m[128][0m
[0m02 Dec 03:55    INFO  [1;34mrecsys_decay_params_len: 3  modal_params_decay_len: 2[0m
[0m02 Dec 03:55    INFO  [1;35m
World_Size[0m = 1 
[0m
[0m02 Dec 03:55    INFO  
[1;35mGeneral Hyper Parameters:
[0m[1;36mseed[0m =[1;33m 2020[0m
[1;36mstate[0m =[1;33m INFO[0m
[1;36mreproducibility[0m =[1;33m True[0m
[1;36mdata_path[0m =[1;33m ./ks[0m
[1;36mcheckpoint_dir[0m =[1;33m VBPR/saved[0m
[1;36mshow_progress[0m =[

In [None]:
import torch
import os
from REC.config import Config
from REC.utils import get_model
from REC.data.utils import load_data

class InferenceTrainer:
    """Inference-only helper: restores a checkpoint and caches item features.

    Attributes are read from `config` by key ('device', 'use_modality').
    """

    # Prefix that torch's DataParallel / DistributedDataParallel wrappers put
    # in front of every parameter name when the wrapped model is saved.
    _WRAPPER_PREFIX = 'module.'

    def __init__(self, config, model):
        self.config = config
        self.model = model
        self.device = config['device']
        self.use_modality = config['use_modality']
        # Filled by compute_item_feature().
        self.item_feature = None

    def resume_checkpoint(self, resume_file):
        """Load `checkpoint['state_dict']` into the model and switch to eval mode.

        Args:
            resume_file: path of a checkpoint written with torch.save that
                contains a 'state_dict' entry.

        Raises:
            KeyError: if the checkpoint has no 'state_dict' entry.
            RuntimeError: (from load_state_dict) if the keys do not match the model.
        """
        # NOTE(review): torch.load unpickles the file, so only load checkpoints
        # from a trusted source. If the checkpoint holds nothing but tensors,
        # consider passing weights_only=True.
        checkpoint = torch.load(resume_file, map_location=self.device)
        state_dict = checkpoint['state_dict']

        # Strip only a *leading* wrapper prefix. A blanket str.replace would
        # also mangle names that merely contain the substring, e.g.
        # 'item_module.weight' -> 'item_weight'.
        prefix = self._WRAPPER_PREFIX
        new_state_dict = {
            (k[len(prefix):] if k.startswith(prefix) else k): v
            for k, v in state_dict.items()
        }
        self.model.load_state_dict(new_state_dict)
        self.model.eval()

    @torch.no_grad()
    def compute_item_feature(self):
        """Cache the result of `model.compute_item_all()` in `self.item_feature`."""
        self.item_feature = self.model.compute_item_all()

def load_pretrained_vbpr(config_path, checkpoint_path, device='cuda'):
    """Build the configured model, restore its weights and cache item features.

    Args:
        config_path: config file handed to `Config(config_file_list=[...])`.
        checkpoint_path: checkpoint file passed to `resume_checkpoint`.
        device: device string given to `torch.device`; stored in
            `config['device']` before the data and model are created.

    Returns:
        (trainer, model, dataload)
    """
    config = Config(config_file_list=[config_path])
    config['device'] = torch.device(device)

    dataload = load_data(config)
    # config['model'] names the model class to instantiate.
    model_cls = get_model(config['model'])
    model = model_cls(config, dataload).to(config['device'])

    trainer = InferenceTrainer(config, model)
    trainer.resume_checkpoint(checkpoint_path)
    trainer.compute_item_feature()
    return trainer, model, dataload

def get_user_recommendations(trainer, model, user_id, k=20, pad_index=0):
    """Get top-k recommendations for a user.

    Args:
        trainer: object exposing `device` and the cached `item_feature`.
        model: object whose `predict(user, item_feature)` returns one score
            per item for the given user.
        user_id: user index passed to the model.
        k: number of items to return; clamped to the number of candidates.
        pad_index: item index that is never recommended. The vocabularies in
            this notebook put '[PAD]' at index 0, and without the mask that
            placeholder shows up in the top-k list. Pass None to disable.

    Returns:
        List of item indices, best score first.
    """
    user = torch.tensor([user_id]).to(trainer.device)

    with torch.no_grad():
        scores = model.predict(user, trainer.item_feature).view(-1)
        candidates = scores.numel()
        if pad_index is not None and 0 <= pad_index < candidates:
            # Work on a copy so the model's own output tensor is not modified.
            scores = scores.clone()
            scores[pad_index] = float('-inf')
            candidates -= 1
        # torch.topk raises when k exceeds the tensor size, so clamp it.
        top_k_indices = torch.topk(scores, max(0, min(k, candidates))).indices

    return top_k_indices.cpu().numpy().tolist()

def get_recommendations_for_user_sample(trainer, model, dataload, num_users=50, k=10, seed=42):
    """Recommend items for a random sample of users.

    Args:
        trainer, model: forwarded to `get_user_recommendations`.
        dataload: object exposing `inter_feat` (with 'user_id' / 'item_id'
            columns of internal ids) and `id2token` vocabularies.
        num_users: sample size; capped at the number of users present.
        k: number of recommendations per user.
        seed: seed for torch's global RNG, which drives the sampling.

    Returns:
        dict mapping the user's original id (as int) to
        {'history': [item tokens], 'recommendations': [item tokens]}.
    """
    torch.manual_seed(seed)
    inter_feat = dataload.inter_feat

    # Internal user id -> internal item ids, in the order they appear in inter_feat.
    user_interactions = {}
    for user_id, item_id in zip(inter_feat['user_id'], inter_feat['item_id']):
        user_interactions.setdefault(user_id, []).append(item_id)

    available_users = list(user_interactions)
    num_users = min(num_users, len(available_users))
    sampled_users = torch.randperm(len(available_users))[:num_users].tolist()
    selected_users = [available_users[i] for i in sampled_users]

    user_ids = dataload.id2token['user_id']
    item_ids = dataload.id2token['item_id']
    recommendations = {}

    for user_id in selected_users:
        history = [item_ids[item_id] for item_id in user_interactions[user_id]]
        # Query the model with the internal user index, not the original
        # token. Item indices are decoded through id2token on the way out, so
        # the user index going in has to be in the same internal id space.
        # NOTE(review): assumes model.predict takes the same user index that
        # dataload.inter_feat stores -- confirm against the model.
        top_k_items = get_user_recommendations(trainer, model, int(user_id), k)
        top_k_items = [item_ids[item_id] for item_id in top_k_items]

        # Results stay keyed by the original (token) user id.
        recommendations[int(user_ids[user_id])] = {
            'history': history,
            'recommendations': top_k_items
        }
    return recommendations

if __name__ == "__main__":
    config_path = "vbpr.yaml"
    checkpoint_path = "VBPR/saved/VBPR-Dec-02-2024_03-55-38.pth"
    trainer, model, dataload = load_pretrained_vbpr(config_path, checkpoint_path)

    recommendations = get_recommendations_for_user_sample(
        trainer, model, dataload, num_users=100000, k=20
    )

    # Print only the users whose last history entry appears among their
    # recommendations.
    for user_id, result in recommendations.items():
        last_seen = result['history'][-1]
        if last_seen not in result['recommendations']:
            continue
        print(f"User {user_id}: Top 20 recommended items: {result['recommendations']}")

  checkpoint = torch.load(resume_file, map_location=self.device)


User 62568: Top 20 recommended items: ['178', '7306', '15884', '16428', '4263', '14230', '5689', '9415', '18813', '2065', '13131', '15223', '1900', '2399', '13712', '14148', '5580', '5722', '1452', '12254']
User 31938: Top 20 recommended items: ['12736', '16289', '19475', '5028', '17662', '5689', '14692', '6626', '[PAD]', '18475', '15467', '885', '3329', '4541', '18669', '17117', '6192', '11054', '8271', '7177']
User 751: Top 20 recommended items: ['16728', '925', '16303', '13981', '15423', '3996', '12343', '14229', '10833', '17110', '2581', '3028', '1671', '16823', '8999', '19204', '12444', '13045', '8741', '8950']
User 5042: Top 20 recommended items: ['8018', '5719', '10681', '16197', '11296', '10770', '14683', '17047', '3046', '835', '4984', '10921', '1501', '7335', '19443', '6673', '19455', '3895', '16230', '13457']
User 52874: Top 20 recommended items: ['13741', '13085', '12127', '4075', '15800', '7798', '7092', '17396', '885', '14243', '17311', '15048', '9579', '15223', '18066', 