In [None]:
%load_ext autoreload
%autoreload 2

from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/EECS576/final_project/MicroLens/Code/IDRec/DSSM')

import os
print(os.getcwd())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/EECS576/final_project/MicroLens/Code/IDRec/DSSM


In [None]:
!ls
!pip list
!pip install lmdb colorlog tensorboardX colorama torch-geometric clip

DSSM	   log		    MicroLens-100k_pairs.csv  MicroLens-50k_pairs.tsv  rec-nfm.txt
dssm.yaml  log_tensorboard  MicroLens-100k_pairs.tsv  REC		       run.py
ks	   main.py	    MicroLens-50k_pairs.csv   rec-dssm.txt
Package                            Version
---------------------------------- -------------------
absl-py                            1.4.0
accelerate                         1.1.1
aiohappyeyeballs                   2.4.3
aiohttp                            3.11.2
aiosignal                          1.3.1
alabaster                          1.0.0
albucore                           0.0.19
albumentations                     1.4.20
altair                             4.2.2
annotated-types                    0.7.0
anyio                              3.7.1
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
array_record                       0.5.1
arviz                              0.20.0
astropy                            6.1.6
astropy-iers-data             

In [None]:
import torch

# Human-readable summary of the CUDA state.
has_cuda = torch.cuda.is_available()
if not has_cuda:
    print("No CUDA GPUs are available.")
else:
    print("CUDA is available. Number of GPUs:", torch.cuda.device_count())

# Raw values of the same two queries.
print(has_cuda)
print(torch.cuda.device_count())


CUDA is available. Number of GPUs: 1
True
1


In [None]:
# This cell transforms the raw interaction data into the format required to run the IDRec baselines.


import pandas as pd
import numpy as np
import os
import torch
SEQ_LEN = 10
file_l = ['MicroLens-100k_pairs.tsv']
data_l = ['ks']
for idx in range(len(file_l)):
    # Tab-separated input without a header: column 0 is used as the user id,
    # column 1 is split on whitespace into that user's item ids.
    dat_seq = pd.read_csv(file_l[idx], sep='\t', header=None)
    dat_arr = np.array(dat_seq)

    # Explode each sequence into (item, user, position) rows; the position inside
    # the sequence is stored in the 'timestamp' column.
    inter = []
    for seq in dat_arr:
        uid = seq[0]
        iseq = seq[1].split()
        for i, item in enumerate(iseq):
            inter.append([item, uid, i])

    inter_df = np.array(inter)
    dat = pd.DataFrame(inter_df)
    dat.columns = ['item_id', 'user_id', 'timestamp']
    # np.array() above turned every field into a string; restore integer positions
    # so that the sort below is numeric.
    dat['timestamp'] = dat['timestamp'].astype(int)
    dat.sort_values(by='timestamp', inplace=True, ascending=True)
    user_list = dat['user_id'].values

    # Row positions (in the sorted table) of every user's interactions.
    index = {}
    for i, key in enumerate(user_list):
        index.setdefault(key, []).append(i)

    # Keep only the last SEQ_LEN + 3 interactions of each user.
    # `indices` is initialised here, once per file: it used to be created inside the
    # `else` branch of the grouping loop, which left it undefined when no user
    # appeared twice and let values leak from one file to the next.
    indices = []
    for user_rows in index.values():
        indices.extend(user_rows[-(SEQ_LEN + 3):])

    final_dat = dict()
    for k in dat:
        final_dat[k] = dat[k].values[indices]

    final_dat = pd.DataFrame(final_dat)
    print(final_dat.head(3))
    # Number of users, number of items, number of interactions kept.
    print(final_dat['user_id'].nunique(), final_dat['item_id'].nunique(), final_dat.shape[0])
    os.makedirs(f'./{data_l[idx]}/', exist_ok=True)
    final_dat.to_csv(f'./{data_l[idx]}/{data_l[idx]}.inter', index=False)

  item_id user_id  timestamp
0    1958       1          0
1    6346       1          1
2   15223       1          2
100000 19671 678355


In [None]:
# The following part generates the popularity count file (i.e. the pop.npy file needed in baseline code) of the dataset


SEQ_LEN = 10
class Data:
    """Interaction table with remapped integer ids and a per-user train/valid/test split.

    The DataFrame handed to the constructor must provide the columns
    'user_id', 'item_id' and 'timestamp'. It is stored by reference and is
    modified in place (id columns are overwritten, rows are re-sorted).
    """

    def __init__(self, df):
        """Store `df` as `inter_feat` and immediately remap its id columns."""
        self.inter_feat = df
        self._data_processing()

    def _data_processing(self):
        """Replace user/item tokens by integer ids starting at 1.

        Id 0 is reserved for the '[PAD]' token. `id2token[field]` is the array
        of tokens indexed by id, `token2id[field]` the inverse dictionary.
        """
        self.id2token, self.token2id = {}, {}
        for field in ('user_id', 'item_id'):
            codes, uniques = pd.factorize(self.inter_feat[field])
            vocab = np.array(['[PAD]'] + list(uniques))
            self.id2token[field] = vocab
            self.token2id[field] = {token: pos for pos, token in enumerate(vocab)}
            # Shift by one so that 0 stays free for '[PAD]'.
            self.inter_feat[field] = codes + 1

        # Both counts include the '[PAD]' entry.
        self.user_num = len(self.id2token['user_id'])
        self.item_num = len(self.id2token['item_id'])
        self.inter_num = len(self.inter_feat)
        self.uid_field = 'user_id'
        self.iid_field = 'item_id'
        self.user_seq = None
        self.train_feat = None
        self.feat_name_list = ['inter_feat']

    def build(self):
        """Sort by timestamp and split every user's interactions.

        Per user, the last row goes to test, the second-to-last to validation
        and everything before that to training. Also fills `self.user_seq`
        (user id -> array of all item ids of that user) and `self.train_feat`.

        Returns:
            (train_feat, valid_feat, test_feat): dictionaries mapping each
            column name of `inter_feat` to an array of selected values.

        Raises:
            IndexError: if some user has a single interaction only.
        """
        self.sort(by='timestamp')
        users = self.inter_feat['user_id'].values
        items = self.inter_feat['item_id'].values
        rows_by_user = self._grouped_index(users)

        self.user_seq = {uid: items[rows] for uid, rows in rows_by_user.items()}

        def gather(rows):
            # Pick the given row positions out of every column.
            return {col: self.inter_feat[col].values[rows] for col in self.inter_feat}

        train_feat = gather([r for rows in rows_by_user.values() for r in rows[:-2]])
        valid_feat = gather([rows[-2] for rows in rows_by_user.values()])
        test_feat = gather([rows[-1] for rows in rows_by_user.values()])

        self.train_feat = train_feat
        return train_feat, valid_feat, test_feat

    def _grouped_index(self, group_by_list):
        """Map every distinct key to the list of positions where it occurs.

        Keys appear in order of first occurrence; positions are ascending.
        """
        positions = {}
        for pos, key in enumerate(group_by_list):
            positions.setdefault(key, []).append(pos)
        return positions

    def _build_seq(self, train_feat):
        """Cut the training rows into per-user windows of at most SEQ_LEN + 1 items.

        `train_feat['user_id']` is scanned for runs of equal user ids. A run
        that fits into one window is kept whole. A longer run first loses its
        leading `len(run) % window` rows and is then cut into consecutive
        windows of exactly `window` rows.

        Side effect: `self.seq_train_item` becomes the flat list of all item
        ids contained in the produced windows.

        Returns:
            dict with 'user_id' (array, one entry per window) and 'item_seq'
            (list of item-id arrays, one per window).
        """
        window = SEQ_LEN + 1
        owners, spans = [], []

        # The trailing -1 sentinel closes the final run inside the loop.
        users = np.append(train_feat['user_id'], -1)
        run_start, run_uid = 0, users[0]
        for pos, uid in enumerate(users):
            if uid == run_uid:
                continue

            run_len = pos - run_start
            if run_len > window:
                # Skip the remainder at the front so the rest splits evenly.
                run_start += run_len % window
                for chunk in torch.split(torch.arange(run_start, pos), window):
                    owners.append(run_uid)
                    spans.append(slice(chunk[0], chunk[-1] + 1))
            else:
                owners.append(run_uid)
                spans.append(slice(run_start, pos))

            run_start, run_uid = pos, uid

        item_column = train_feat['item_id']
        item_windows = [item_column[span] for span in spans]
        self.seq_train_item = [item for win in item_windows for item in win]
        return {'user_id': np.array(owners), 'item_seq': item_windows}

    def sort(self, by, ascending=True):
        """Sort `inter_feat` in place by the given column(s)."""
        self.inter_feat.sort_values(by=by, ascending=ascending, inplace=True)



data_list = ['ks', ]

for idx in range(len(data_list)):
    # Load the interaction table of this dataset; ids are read as strings.
    inter = pd.read_csv(
        f'./{data_list[idx]}/{data_list[idx]}.inter',
        delimiter=',',
        dtype={'item_id':str, 'user_id':str, 'timestamp':int},
        header=0,
        names=['item_id', 'user_id', 'timestamp'],
    )

    item_num = inter['item_id'].nunique()
    D = Data(inter)
    train, valid, test = D.build()
    # Called for its side effect: it fills D.seq_train_item.
    D._build_seq(train)
    train_items = D.seq_train_item

    # Occurrences of every item id in the training windows; slot 0 is the padding id.
    train_item_counts = np.zeros(item_num + 1, dtype=np.int64)
    np.add.at(train_item_counts, np.asarray(train_items, dtype=np.int64), 1)
    item_counts_powered = np.power(train_item_counts, 1.0)

    # Normalise the counts of the real items (ids 1..item_num) to probabilities,
    # then put a 1 in front for the padding slot.
    pop_prob_list = list(item_counts_powered[1:])
    pop_prob_list = pop_prob_list / sum(np.array(pop_prob_list))
    pop_prob_list = np.append([1], pop_prob_list)

    summary = 'prob max: {}, prob min: {}, prob mean: {}'.format(
        max(pop_prob_list), min(pop_prob_list), np.mean(pop_prob_list))
    print(summary)

    np.save(f'./{data_list[idx]}/pop',pop_prob_list)

prob max: 1.0, prob min: 0.0, prob mean: 0.00010166734444896296


In [None]:
import pandas as pd
import numpy as np
import os
import torch

import ast

# Per-user interaction lists, in the row order of the .inter file.
interaction_file = 'ks/ks.inter'
interactions = pd.read_csv(interaction_file, delimiter=',', dtype={'item_id':int, 'user_id':int, 'timestamp':int}, header=0, names=['item_id', 'user_id', 'timestamp'])
user_interactions = interactions.groupby('user_id')['item_id'].apply(list).to_dict()

# Parse the recommendation dump. A line is accepted when it has the shape
# "<word> <user_id>: ... [<comma-separated items>] ..."; anything else is reported
# and skipped.
recommendation_file = 'rec-dssm.txt'
recommendations = {}
with open(recommendation_file, 'r') as file:
    for line in file:
        parts = line.strip().split(':', 1)
        if len(parts) < 2:
            print(f"Skipping line due to unexpected format: {line}")
            continue

        try:
            user_id = int(parts[0].strip().split()[1])
            items_text = parts[1].strip().split('[')[1].split(']')[0]
            # literal_eval only accepts Python literals, so text from the file can
            # never execute code (the previous eval() could).
            recommended_items = ast.literal_eval(items_text)
        except (IndexError, SyntaxError, ValueError):
            print(f"Skipping line due to unexpected format: {line}")
            continue

        # A single item parses to a scalar; wrap it so membership tests work.
        if not isinstance(recommended_items, tuple):
            recommended_items = (recommended_items,)
        recommendations[user_id] = recommended_items

# Count the users for whom at least one of their last k interactions was recommended.
k = 5
total_count = 0
for user_id, interaction_history in user_interactions.items():
    if user_id in recommendations:
        last_k_items = interaction_history[-k:]
        recommended_items = recommendations[user_id]

        if any(item in recommended_items for item in last_k_items):
            print(f"User ID: {user_id}, Last {k} Interactions: {last_k_items}, Recommended Items: {recommended_items}")
            total_count += 1

print(total_count)

User ID: 1531, Last 5 Interactions: [15252, 2998, 18653, 17974, 16121], Recommended Items: (2193, 6517, 6518, 6323, 3544, 4277, 1122, 7573, 1260, 12681, 3520, 7244, 3411, 9699, 93, 2205, 10867, 2998, 5971, 848)
User ID: 2421, Last 5 Interactions: [7664, 15302, 8304, 7437, 7992], Recommended Items: (8588, 8587, 2090, 7797, 3741, 500, 2257, 11773, 4043, 13667, 13345, 3136, 2636, 2149, 969, 9223, 3552, 766, 9517, 7992)
User ID: 4458, Last 5 Interactions: [11572, 3210, 6087, 17255, 19188], Recommended Items: (11572, 10535, 10495, 12688, 10791, 4278, 5552, 452, 2185, 12108, 3247, 2759, 8245, 7359, 8998, 975, 1621, 1690, 6803, 3165)
User ID: 4922, Last 5 Interactions: [15775, 6539, 9512, 680, 11924], Recommended Items: (9551, 2710, 12031, 6539, 4784, 2368, 3904, 10629, 4783, 12801, 16135, 5399, 5365, 7683, 7994, 7392, 8826, 1500, 5402, 4985)
User ID: 5555, Last 5 Interactions: [16027, 13611, 12457, 1131, 647], Recommended Items: (2207, 2375, 1131, 3334, 8080, 11807, 420, 2491, 1364, 8810, 27

In [None]:
# !python main.py

In [None]:
import torch
from REC.model.IdModel.dssm import DSSM
from REC.trainer.trainer import Trainer
from REC.config import Config
from REC.utils import init_logger, get_model, init_seed
from REC.data.utils import load_data

class InferenceTrainer(Trainer):
    """Trainer subclass reduced to what is needed for scoring with a saved model.

    `Trainer.__init__` is not invoked; only the attributes assigned in
    `__init__` below are set up.
    """

    def __init__(self, config, model):
        """Store the model and the config entries used by the methods of this class.

        Args:
            config: mapping providing 'device', 'checkpoint_dir' and 'use_modality'.
            model: the model whose weights will be loaded and used for scoring.
        """
        self.config = config
        self.model = model
        self.device = config['device']
        self.checkpoint_dir = config['checkpoint_dir']
        self.use_modality = config['use_modality']
        self.item_feature = None
        self.tot_item_num = None

    def resume_checkpoint(self, resume_file):
        """Load the weights stored under 'state_dict' in `resume_file` and switch to eval mode.

        Args:
            resume_file: path of a checkpoint readable by `torch.load`.
        """
        checkpoint = torch.load(resume_file, map_location=self.device)

        state_dict = checkpoint['state_dict']
        # Removes every occurrence of 'module.' from the parameter names
        # (presumably the prefix added by DataParallel-style wrappers - confirm).
        new_state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        self.model.load_state_dict(new_state_dict)
        self.model.eval()

    @torch.no_grad()
    def compute_item_feature(self, config, data):
        """Compute the item representations once and cache them in `self.item_feature`.

        Without modality features the model's `compute_item_all()` result is
        stored. Otherwise items are pushed through `model.compute_item` in
        batches and the batch outputs are concatenated (component-wise when
        the model returns tuples).

        Raises:
            ValueError: if the modality branch is taken and the loader yields no batch.
        """
        if not self.use_modality:
            self.item_feature = self.model.compute_item_all()
            return

        # Both names are needed by this branch but were never imported in this cell.
        from torch.utils.data import DataLoader
        # NOTE(review): import path assumed from the REC package layout - confirm.
        from REC.data.dataset import BatchDataset

        item_data = BatchDataset(config, data)
        item_loader = DataLoader(item_data, batch_size=100, num_workers=10, shuffle=False, pin_memory=True)

        batch_outputs = []
        for items in item_loader:
            items = items.to(self.device)
            batch_outputs.append(self.model.compute_item(items))

        # Previously the loop variable was inspected after the loop, which is
        # unbound when the loader is empty.
        if not batch_outputs:
            raise ValueError("item loader produced no batches; cannot compute item features")

        if isinstance(batch_outputs[-1], tuple):
            self.item_feature = torch.cat([x[0] for x in batch_outputs]), torch.cat([x[1] for x in batch_outputs])
        else:
            self.item_feature = torch.cat(batch_outputs)

def load_checkpoint_for_inference(config_path, checkpoint_path, device='cuda'):
    """
    Load model and data for inference

    Args:
        config_path: path of the YAML configuration passed to `Config`.
        checkpoint_path: path of the saved checkpoint to restore.
        device: device specification accepted by `torch.device`. When a CUDA
            device is requested but CUDA is unavailable, the CPU is used instead.

    Returns:
        (trainer, model, dataload): the `InferenceTrainer` with cached item
        features, the restored model and the loaded dataset object.
    """
    config = Config(config_file_list=[config_path])

    target_device = torch.device(device)
    if target_device.type == 'cuda' and not torch.cuda.is_available():
        # Lets the notebook run on CPU-only runtimes instead of failing in model.to().
        print("CUDA requested but not available; falling back to CPU.")
        target_device = torch.device('cpu')
    config['device'] = target_device

    print("Loading data...")
    dataload = load_data(config)
    print("Data loaded.")

    print("Initializing model...")
    model = get_model(config['model'])(config, dataload)
    model = model.to(config['device'])

    trainer = InferenceTrainer(config, model)

    print(f"Loading checkpoint from {checkpoint_path}")
    trainer.resume_checkpoint(checkpoint_path)

    print("Computing item features...")
    trainer.compute_item_feature(config, dataload)

    return trainer, model, dataload

def get_top_k_recommendations(trainer, model, user_id, k=10):
    """Get top-k recommendations for a user

    Args:
        trainer: object exposing `device` and the cached `item_feature`.
        model: object whose `predict(user, item_feature)` returns one score per item.
        user_id: id of the user, in the form expected by `model.predict`.
        k: number of items requested; capped at the number of scored items.

    Returns:
        List of item indices (positions in the score vector), best first.
    """
    user = torch.tensor([user_id]).to(trainer.device)

    with torch.no_grad():
        scores = model.predict(user, trainer.item_feature).view(-1)
        # torch.topk raises when k exceeds the number of candidates.
        k = min(k, scores.numel())
        top_k_indices = torch.topk(scores, k).indices

    return top_k_indices.cpu().tolist()

def get_recommendations_for_user_sample(trainer, model, dataload, num_users=50, k=10, seed=42):
    """Recommend top-k items for a random sample of users.

    Args:
        trainer, model: passed through to `get_top_k_recommendations`.
        dataload: object exposing `inter_feat` (with 'user_id' and 'item_id'
            columns) and `id2token` (arrays mapping ids back to raw tokens
            for 'user_id' and 'item_id').
        num_users: sample size; capped at the number of distinct users.
        k: number of recommendations per user.
        seed: seed for the global torch RNG used to draw the sample.

    Returns:
        dict keyed by the raw user token converted to int; each value holds
        'history' and 'recommendations' as lists of raw item tokens.
    """
    torch.manual_seed(seed)

    inter_feat = dataload.inter_feat
    user_interactions = {}
    for user_id, item_id in zip(inter_feat['user_id'], inter_feat['item_id']):
        user_interactions.setdefault(user_id, []).append(item_id)

    available_users = list(user_interactions.keys())
    num_users = min(num_users, len(available_users))
    sampled_users = torch.randperm(len(available_users))[:num_users]
    selected_users = [available_users[i] for i in sampled_users.tolist()]
    user_ids = dataload.id2token['user_id']
    item_ids = dataload.id2token['item_id']
    recommendations = {}

    for user_id in selected_users:
        history = [item_ids[item_id] for item_id in user_interactions[user_id]]

        # The model is queried with the id as stored in inter_feat. The raw token
        # (user_ids[user_id]) was passed before, which addresses a different user
        # whenever ids and tokens do not coincide; items are handled as stored ids
        # throughout, so users must be as well.
        top_k_items = get_top_k_recommendations(trainer, model, int(user_id), k)
        top_k_items = [item_ids[item_id] for item_id in top_k_items]

        # Results are reported under the raw user token.
        recommendations[int(user_ids[user_id])] = {
            'history': history,
            'recommendations': top_k_items
        }
    return recommendations

if __name__ == "__main__":
    # Configuration file and trained checkpoint used for this run.
    config_path = "dssm.yaml"
    checkpoint_path = "DSSM/saved/DSSM-Nov-16-2024_03-48-34.pth"

    trainer, model, dataload = load_checkpoint_for_inference(config_path, checkpoint_path)
    user_recommendations = get_recommendations_for_user_sample(
        trainer, model, dataload, num_users=100000, k=20
    )

    print(f"\nRecommendations for {len(user_recommendations)} users:")
    # Only users whose last history item is among their recommendations are listed.
    for user_id, result in user_recommendations.items():
        if result['history'][-1] not in result['recommendations']:
            continue
        print(f"User {user_id}: Top 20 recommended items: {result['recommendations']}")

Loading data...
Data loaded. Available attributes: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_build_aug_seq', '_build_seq', '_data_processing', '_from_scratch', '_grouped_index', '_load_inter_feat', 'avg_actions_of_items', 'avg_actions_of_users', 'build', 'config', 'copy', 'counter', 'dataset_name', 'dataset_path', 'feat_name_list', 'get_norm_adj_mat', 'id2token', 'iid_field', 'inter_feat', 'inter_num', 'item_counter', 'item_num', 'logger', 'save', 'sort', 'sparsity', 'token2id', 'train_feat', 'uid_field', 'user_counter', 'user_num', 'user_seq']
Initializing model...
Loading checkpoint from DSSM/saved/DSSM-Nov-16-2024_03-48-34.pth


  checkpoint = torch.load(resume_file, map_location=self.device)


Computing item features...

Recommendations for 100000 users:
User 751: Top 20 recommended items: ['16303', '925', '13981', '16728', '17303', '16823', '9797', '9868', '13860', '12444', '7858', '12709', '14229', '684', '9879', '2581', '17110', '14410', '18225', '5648']
User 5042: Top 20 recommended items: ['10770', '3046', '14683', '835', '16197', '5719', '11296', '8018', '4984', '17047', '10681', '3895', '10921', '7335', '3175', '17971', '19455', '1501', '6673', '18585']
User 86440: Top 20 recommended items: ['8500', '16389', '12590', '15952', '3978', '15374', '19313', '1192', '11918', '5135', '6998', '19723', '7284', '126', '14665', '14199', '54', '19534', '7233', '8855']
User 79819: Top 20 recommended items: ['16581', '17780', '15465', '10610', '11668', '10405', '3241', '16955', '4502', '7849', '7910', '254', '16809', '18923', '11839', '14628', '7559', '10434', '13926', '10135']
User 55093: Top 20 recommended items: ['6522', '9791', '4493', '2793', '1841', '6039', '13417', '19180', '