In [71]:
from tkinter.constants import HIDDEN

from google.colab import drive
import shutil
import os
def copy_from_drive(src_path, dst_path):
    """Copy a file or a directory tree from ``src_path`` to ``dst_path`` once.

    The copy is skipped (with a message) when ``dst_path`` already exists, so
    re-running the cell does not copy the data again. A source that is
    neither a directory nor a regular file is reported instead of being
    ignored silently, which makes a mistyped or not-yet-synced Drive path
    visible right here rather than in a later data-loading cell.

    Args:
        src_path: Path of the file or directory to copy.
        dst_path: Destination path; must not exist yet for a copy to happen.

    Returns:
        None.
    """
    if os.path.exists(dst_path):
        print(f"skip:{dst_path} exists")
        return

    if os.path.isdir(src_path):
        # copytree creates dst_path and any missing parent directories.
        shutil.copytree(src_path, dst_path)
    elif os.path.isfile(src_path):
        # shutil.copy does not create parent directories; do it here so the
        # file branch behaves like the directory branch.
        parent_dir = os.path.dirname(dst_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        shutil.copy(src_path, dst_path)
    else:
        print(f"skip:{src_path} not found")

# Mount Google Drive, then stage the helper package and the data files on the
# local Colab disk. Each entry is copied from MyDrive to /content under the
# same name; copy_from_drive skips entries that already exist locally.
drive.mount('/content/drive')
for _name in (
    'tool',
    'MicroLens-50k_pairs.csv',
    'cover_emb128.lmdb',
    'title_emb1024.lmdb',
):
    copy_from_drive(f'/content/drive/MyDrive/{_name}', f'/content/{_name}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
skip:/content/tool exists
skip:/content/MicroLens-50k_pairs.csv exists
skip:/content/cover_emb128.lmdb exists
skip:/content/title_emb1024.lmdb exists


In [72]:
!pip install faiss-cpu
!pip install lmdb
from tool import preprocess
from tool import customdataset
from tool import evaluate
import faiss
import torch.nn as nn
import numpy as np
import torch
import torch.nn.functional as F
from datetime import datetime



In [73]:
# Fix the random seed before any model or loader is built so reruns are comparable.
# NOTE(review): which RNGs set_seed covers (random / numpy / torch) is decided in tool.preprocess — confirm.
preprocess.set_seed(42)

In [74]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [75]:
# --- Data locations -------------------------------------------------------
path = 'MicroLens-50k_pairs.csv'     # interaction log passed to preprocess.openAndSort
lmdb_path = 'title_emb1024.lmdb'     # LMDB store read by DSSM for title embeddings
save_dir = './embeddings'            # output folder for exported embeddings / FAISS index

# --- Column names ---------------------------------------------------------
# Raw columns of the interaction log.
user, item, timestamp = 'user', 'item', 'timestamp'
# Columns holding the contiguous integer indices added after the split.
user_id, item_id = 'user_id', 'item_id'

# --- Evaluation / data-loader settings --------------------------------------
top_k = 10          # cutoff used for HR@k / nDCG@k
num_workers = 10    # worker count passed to the test loader
k_neg = 10          # not referenced by the cells shown in this notebook

# --- Model and training hyper-parameters ------------------------------------
L2_NORM = False                              # L2-normalise tower outputs when True
LMDB_DIM = 1024                              # width of the stored title embeddings
EMBEDDING_DIM = 256                          # width of the learned user/item id embeddings
HIDDEN_SIZE = [256, 128, 64]                 # layer widths of the user and item towers
TITLE_HIDDEN_SIZE = [256, EMBEDDING_DIM]     # layer widths of the id+title fusion MLP
DROPOUT = 0.2
EPOCHS = 30
BATCH_SIZE = 1024

In [76]:
# Load the interaction log. The helper returns the frame together with the
# user and item counts (and prints a short summary of the dataset).
dataset_pd, num_users, num_items = preprocess.openAndSort(
    path,
    user_id=user,
    item_id=item,
    timestamp=timestamp,
)

dataset base information：
- number of users：50000
- number of items：19220
- number of rows：359708


In [77]:

# Split the interactions into train / test frames; the split strategy itself
# lives in tool.preprocess.split.
# NOTE(review): the recorded output shows 309708 + 49424 = 359132 rows, fewer
# than the 359708 rows loaded — presumably split drops some rows; confirm.
train_df, test_df = preprocess.split(dataset_pd, user, item, timestamp)
for _label, _frame in (("Train", train_df), ("Test", test_df)):
    print(f"{_label} size: {len(_frame)}")


Train size: 309708
Test size: 49424


In [78]:
# Map raw ids to contiguous integer indices (raw id -> index), numbered in
# order of first appearance in dataset_pd. The indices are what the model's
# embedding tables are addressed with.
user2id = {raw: idx for idx, raw in enumerate(dataset_pd[user].unique())}
item2id = {raw: idx for idx, raw in enumerate(dataset_pd[item].unique())}

# Attach the index columns by building new frames with .assign instead of
# writing into the frames returned by preprocess.split. This keeps the cell
# idempotent and avoids chained-assignment issues if split returned slices.
train_df = train_df.assign(**{
    user_id: train_df[user].map(user2id),
    item_id: train_df[item].map(item2id),
})
test_df = test_df.assign(**{
    user_id: test_df[user].map(user2id),
    item_id: test_df[item].map(item2id),
})

# Reverse lookup (index -> raw item id). It is derived from item2id, i.e. it
# covers every item in dataset_pd, not only those present in train_df.
item_id_to_item = {idx: raw for raw, idx in item2id.items()}

In [79]:
# DSSM implementation in PyTorch


class DSSM(nn.Module):
    """
    Two-tower DSSM: a user tower and an item tower.

    User side:  id embedding -> ``mlp_user``.
    Item side:  id embedding concatenated with a frozen title embedding
                -> ``mlp_item_title`` -> ``mlp_item``.

    Both towers are built from ``mlp_hidden_size`` (e.g. ``[128, 64]``), so
    the output width of each tower is ``mlp_hidden_size[-1]``.

    The title embeddings are loaded once from the LMDB store named by the
    module-level ``lmdb_path`` (using the module-level ``item_id_to_item``
    mapping) and kept in the non-trainable buffer ``frozen_extra_emb``.
    """

    def __init__(self, num_users, num_items, emb_dim=EMBEDDING_DIM, mlp_hidden_size=HIDDEN_SIZE, dropout=DROPOUT,title_hidden_size=TITLE_HIDDEN_SIZE,title_dropout=DROPOUT,lmdb_dim=LMDB_DIM):
        """
        Args:
            num_users: number of rows of the user embedding table.
            num_items: number of rows of the item embedding table.
            emb_dim: width of the learned id embeddings.
            mlp_hidden_size: layer widths of the user tower and the item tower.
            dropout: dropout probability inside the two towers.
            title_hidden_size: layer widths of the id+title fusion MLP.
            title_dropout: dropout probability inside the fusion MLP.
            lmdb_dim: width of the title embeddings stored in LMDB.

        Raises:
            ValueError: if the fusion MLP's output width does not match the
                input width expected by the item tower.
        """
        super().__init__()

        # mlp_item's first layer expects emb_dim inputs, and it is fed the
        # output of mlp_item_title: fail early with a clear message instead
        # of a shape error in the first forward pass.
        fused_dim = title_hidden_size[-1] if len(title_hidden_size) > 0 else emb_dim + lmdb_dim
        if len(mlp_hidden_size) > 0 and fused_dim != emb_dim:
            raise ValueError(
                f"title_hidden_size must end in emb_dim ({emb_dim}) so its output "
                f"fits the item tower, got output width {fused_dim}"
            )

        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)

        # Explicit initialisation (optional but recommended).
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)

        # Load the title embeddings from LMDB and register them as a buffer:
        # a buffer takes no part in gradient updates, but follows the module
        # through .to(device) and is stored in the state_dict.
        title_emb = preprocess.load_tensor_from_lmdb(lmdb_path,num_items,item_id_to_item,lmdb_dim)
        self.register_buffer('frozen_extra_emb', title_emb)

        # Build the MLPs.
        self.mlp_user = self.build_mlp(emb_dim, mlp_hidden_size, dropout)
        self.mlp_item = self.build_mlp(emb_dim, mlp_hidden_size, dropout)
        # Maps the concatenated (id embedding + title embedding) vector back
        # to the width of the item id embedding.
        self.mlp_item_title = self.build_mlp(emb_dim+lmdb_dim, title_hidden_size, title_dropout)

    @property
    def title_emb_tensor(self):
        """Read-only alias of the ``frozen_extra_emb`` buffer.

        Kept so existing code that reads ``model.title_emb_tensor`` still
        works; the tensor lives on whatever device the module is on.
        """
        return self.frozen_extra_emb

    def build_mlp(self, input_dim, hidden_sizes, dropout):
        """Stack Linear -> BatchNorm1d -> Tanh -> Dropout for every width.

        Args:
            input_dim: width of the input vectors.
            hidden_sizes: output width of each successive Linear layer.
            dropout: dropout probability used after every layer.

        Returns:
            nn.Sequential holding the layers (empty when ``hidden_sizes`` is empty).
        """
        layers = []
        for h in hidden_sizes:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.Tanh())
            layers.append(nn.Dropout(dropout))
            input_dim = h
        return nn.Sequential(*layers)

    def _fuse_item_title(self, item_ids):
        """Return the fused (id + title) item representation fed to ``mlp_item``."""
        id_emb = self.item_emb(item_ids)              # (B, emb_dim)
        # Index the registered buffer directly: it already sits on the
        # module's device, so the full table is not copied on every call.
        title = self.frozen_extra_emb[item_ids]       # (B, lmdb_dim)
        fused = torch.cat([id_emb, title], dim=-1)    # (B, emb_dim + lmdb_dim)
        return self.mlp_item_title(fused)

    def forward(self, user_id, item_id, l2_norm=L2_NORM):
        """
        Encode a batch of users and a batch of items.

        Args:
            user_id: tensor of user indices.
            item_id: tensor of item indices.
            l2_norm: when True, L2-normalise both outputs along dim 1.

        Returns:
            (u_vec, i_vec): the user-tower and item-tower outputs, one row
            per input index. No score is computed here; callers take the
            dot product themselves.
        """
        u = self.user_emb(user_id)              # (B, emb_dim)
        # Fusion MLP runs before the towers, matching the original call
        # order so seeded dropout draws stay the same.
        i = self._fuse_item_title(item_id)
        u_vec = self.mlp_user(u)                # (B, d)
        i_vec = self.mlp_item(i)                # (B, d)

        if l2_norm:
            u_vec = F.normalize(u_vec, p=2, dim=1)
            i_vec = F.normalize(i_vec, p=2, dim=1)

        return u_vec, i_vec

    def get_users_embedding(self,user_ids,l2_norm=L2_NORM):
        """Return the user-tower output for ``user_ids`` (optionally L2-normalised)."""
        u = self.user_emb(user_ids)             # (B, emb_dim)

        u_vec = self.mlp_user(u)                # (B, d)

        if l2_norm:
            u_vec = F.normalize(u_vec, p=2, dim=1)
        return u_vec

    def get_items_embedding(self,item_ids,l2_norm=L2_NORM):
        """Return the item-tower output for ``item_ids`` (optionally L2-normalised)."""
        i_vec = self.mlp_item(self._fuse_item_title(item_ids))
        if l2_norm:
            i_vec = F.normalize(i_vec, p=2, dim=1)
        return i_vec

    def save_embeddings(self, num_users, num_items, device, save_dir='./embeddings',l2_norm = L2_NORM):
        """
        Export all user/item tower outputs and a FAISS index over the items.

        Writes ``user_embeddings.npy``, ``item_embeddings.npy`` and
        ``item_index.faiss`` (a flat inner-product index) into ``save_dir``,
        creating the directory when needed.

        Side effects: moves the module to ``device`` and switches it to eval
        mode; it is left in eval mode when the method returns.

        Args:
            num_users: users with indices ``0 .. num_users-1`` are exported.
            num_items: items with indices ``0 .. num_items-1`` are exported.
            device: device on which the embeddings are computed.
            save_dir: output directory.
            l2_norm: when True, export L2-normalised vectors.
        """
        import os
        import faiss
        os.makedirs(save_dir, exist_ok=True)

        self.eval()
        self.to(device)

        user_ids = torch.arange(num_users, dtype=torch.long, device=device)
        item_ids = torch.arange(num_items, dtype=torch.long, device=device)

        with torch.no_grad():
            user_embeds = self.get_users_embedding(user_ids, l2_norm=l2_norm)
            item_embeds = self.get_items_embedding(item_ids, l2_norm=l2_norm)

        # The arrays are cast to float32 before being saved and indexed.
        user_embeds = user_embeds.cpu().numpy().astype(np.float32)
        item_embeds = item_embeds.cpu().numpy().astype(np.float32)

        # Save the raw vectors.
        np.save(f"{save_dir}/user_embeddings.npy", user_embeds)
        np.save(f"{save_dir}/item_embeddings.npy", item_embeds)

        # Build the FAISS index (inner product).
        dim = item_embeds.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(item_embeds)

        faiss.write_index(index, f"{save_dir}/item_index.faiss")
        print("Saved user/item embeddings and FAISS index.")



In [80]:

def train_model(model,train_df,
                epochs=10,
                batch_size=64,
                lr=1e-3,
                device=None):
    """
    Train ``model`` with an in-batch softmax loss and print per-epoch stats.

    Each batch holds (user, positive item) pairs only. The score matrix
    ``u_vec @ i_vec.T`` is built for the batch and the cross-entropy target
    of row ``i`` is column ``i``, so the positives of the other rows act as
    negatives.

    Args:
        model: module whose forward returns ``(u_vec, i_vec)``; it is moved
            to ``device``.
        train_df: training interactions handed to
            ``customdataset.build_train_loader_inbatch``; the index columns
            are named by the module-level ``user_id`` / ``item_id``.
        epochs: number of passes over the loader.
        batch_size: batch size passed to the loader builder.
        lr: Adam learning rate.
        device: target device; defaults to ``cuda:0`` when available, else CPU.

    Returns:
        None.

    Raises:
        ValueError: if the training loader yields no batches.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # The loader yields (user_id, pos_item_id) pairs; no explicit negatives.
    train_loader = customdataset.build_train_loader_inbatch(train_df, batch_size=batch_size,user_col=user_id, item_col=item_id)

    # Fail with a clear message instead of a ZeroDivisionError when the
    # average loss is computed for an empty loader.
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader produced no batches; check train_df and batch_size")

    for epoch in range(1, epochs + 1):
        model.train()
        dt_start = datetime.now()
        epoch_loss = 0.0

        for batch in train_loader:
            user_ids, pos_item_ids = batch
            user_ids = user_ids.to(device)
            pos_item_ids = pos_item_ids.to(device)

            # 1. Forward pass (returns the user / item vectors).
            u_vec, i_vec = model(user_ids, pos_item_ids, l2_norm=L2_NORM)

            # 2. Score matrix: every user scored against every positive item
            #    of the batch.
            logits = torch.matmul(u_vec, i_vec.T)  # shape: (B, B)

            # 3. Labels: the correct item of user i sits on the diagonal
            #    (position i).
            # NOTE(review): when the same item (or user) occurs more than
            # once in a batch, its other occurrences are treated as
            # negatives here — consider masking duplicates.
            labels = torch.arange(logits.size(0), device=device)  # [0, 1, ..., B-1]

            # 4. Cross-entropy loss.
            loss = F.cross_entropy(logits, labels)

            # 5. Backward pass.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Logging.
        avg_loss = epoch_loss / num_batches
        dt = datetime.now() - dt_start

        print(f"[Epoch {epoch:02d}/{epochs}] avg InBatch Softmax Loss = {avg_loss:.4f}, time = {dt.total_seconds():.2f}s")




In [81]:
# Build the two-tower model and train it. The notebook-level `device` is
# passed explicitly so train_model does not fall back to its own default
# ('cuda:0' / 'cpu') and every cell works on the same device.
model = DSSM(num_users,num_items)
model.to(device)
train_model(model=model,epochs=EPOCHS, train_df=train_df,batch_size=BATCH_SIZE, device=device)

[Epoch 01/30] avg InBatch Softmax Loss = 9.3466, time = 8.70s
[Epoch 02/30] avg InBatch Softmax Loss = 7.8317, time = 8.45s
[Epoch 03/30] avg InBatch Softmax Loss = 7.0787, time = 8.52s
[Epoch 04/30] avg InBatch Softmax Loss = 6.5863, time = 8.53s
[Epoch 05/30] avg InBatch Softmax Loss = 6.2141, time = 8.45s
[Epoch 06/30] avg InBatch Softmax Loss = 5.9744, time = 8.42s
[Epoch 07/30] avg InBatch Softmax Loss = 5.8167, time = 8.42s
[Epoch 08/30] avg InBatch Softmax Loss = 5.7046, time = 8.43s
[Epoch 09/30] avg InBatch Softmax Loss = 5.6199, time = 8.38s
[Epoch 10/30] avg InBatch Softmax Loss = 5.5498, time = 8.46s
[Epoch 11/30] avg InBatch Softmax Loss = 5.4869, time = 8.46s
[Epoch 12/30] avg InBatch Softmax Loss = 5.4295, time = 8.42s
[Epoch 13/30] avg InBatch Softmax Loss = 5.3769, time = 8.42s
[Epoch 14/30] avg InBatch Softmax Loss = 5.3303, time = 8.40s
[Epoch 15/30] avg InBatch Softmax Loss = 5.2902, time = 8.40s
[Epoch 16/30] avg InBatch Softmax Loss = 5.2508, time = 8.37s
[Epoch 1

In [82]:
# Export the user/item tower outputs and a FAISS inner-product index over the
# items into save_dir. save_embeddings switches the model to eval mode and
# leaves it there, which is the mode the evaluation cell below runs in.
model.save_embeddings(num_users=num_users,num_items=num_items,device=device,save_dir=save_dir)

Saved user/item embeddings and FAISS index.


In [83]:
# Inputs for the evaluation cell: the test loader, the pool of all item
# indices (used by the random baseline) and the item index that
# save_embeddings wrote to save_dir.
test_loader = customdataset.build_test_loader(
    test_df,
    num_items,
    user_col=user_id,
    item_col=item_id,
    batch_size=1024,
    num_workers=num_workers,
)
item_pool = [*range(num_items)]
faiss_index = faiss.read_index(f"{save_dir}/item_index.faiss")

In [84]:
def _report(label, hr, ndcg):
    """Print one result line in the shared 'HR@k / nDCG@k' format."""
    print(f"{label} HR@{top_k} = {hr:.4f}, nDCG@{top_k} = {ndcg:.4f}")


# Two baselines (random and popularity) followed by the trained model, each
# evaluated on the same test loader and reported as soon as it is computed.
hr_r, ndcg_r = evaluate.evaluate_random(test_loader, item_pool, top_k=top_k)
_report("Random", hr_r, ndcg_r)

hr_p, ndcg_p = evaluate.evaluate_popular(test_loader, train_df, top_k=top_k)
_report("Popular", hr_p, ndcg_p)

hr_m, ndcg_m = evaluate.evaluate_model(test_loader, model, faiss_index, device, top_k=top_k)
_report("Model  ", hr_m, ndcg_m)  # padded so the columns line up


Random HR@10 = 0.0005, nDCG@10 = 0.0002
Popular HR@10 = 0.0029, nDCG@10 = 0.0014
Model   HR@10 = 0.0252, nDCG@10 = 0.0110
