In [1]:
# Import necessary libraries
import os
import torch
import numpy as np
import matplotlib.pyplot as plt

from model.sasrec import SASRecModel
from trainers import Trainer
from utils import EarlyStopping, check_path, set_seed, set_logger
from dataset import get_seq_dic, get_dataloder, get_rating_matrix

# Set up arguments
class Args:
    """Plain attribute container holding the run configuration for this notebook.

    The values below are class-level defaults. The training cell attaches
    further attributes to the instance at runtime (e.g. ``cuda_condition``,
    ``item_size``, ``checkpoint_path`` and the rating matrices).

    ``__repr__`` is defined so that ``logger.info(str(args))`` records the
    actual settings instead of the default ``<__main__.Args object at 0x...>``.
    """

    # --- paths / run identity ---
    data_dir = "./data/"
    output_dir = "output/"
    data_name = "input_search_augmented_final_20241127"
    do_eval = False
    load_model = None
    train_name = "test_model"

    # Placeholders; ``num_users`` is overwritten in the training cell once the data is loaded.
    num_items = 10
    num_users = 10

    # --- optimisation ---
    lr = 0.001
    batch_size = 256
    epochs = 10
    no_cuda = False
    log_freq = 1
    patience = 2
    num_workers = 0  # Set num_workers to 0 to avoid BrokenPipeError on Windows
    seed = 42
    weight_decay = 0.0
    adam_beta1 = 0.9
    adam_beta2 = 0.999
    gpu_id = "0"
    variance = 5

    # NOTE(review): the training cell instantiates SASRecModel regardless of this
    # value — confirm whether 'bert4rec' is intended here.
    model_type = 'bert4rec'
#     model_type = 'sasrec_model'

    # --- model shape ---
    max_seq_length = 50
    hidden_size = 256
    num_hidden_layers = 2
    hidden_act = "gelu"
    num_attention_heads = 2
    attention_probs_dropout_prob = 0.5
    hidden_dropout_prob = 0.5
    initializer_range = 0.02

    def __repr__(self):
        """Return ``Args(name=value, ...)`` covering class defaults and instance overrides.

        Only simple scalars are printed verbatim; any other value (arrays,
        matrices, lists, ...) is shown as ``<TypeName>`` to keep log lines short.
        """
        merged = {
            name: value
            for name, value in vars(type(self)).items()
            if not name.startswith("_") and not callable(value)
        }
        # Instance attributes win over class-level defaults of the same name.
        merged.update(vars(self))

        parts = []
        for name, value in merged.items():
            if isinstance(value, (bool, int, float, str, type(None))):
                parts.append("{}={!r}".format(name, value))
            else:
                parts.append("{}=<{}>".format(name, type(value).__name__))
        return "Args(" + ", ".join(parts) + ")"

args = Args()

In [2]:
# model train
if __name__ == "__main__":
    # Check the output directory BEFORE opening the log file inside it
    # (check_path presumably creates the directory when missing — verify in utils).
    check_path(args.output_dir)

    # Initialize logger
    log_path = os.path.join(args.output_dir, args.train_name + '.log')
    logger = set_logger(log_path)

    # Set seed for reproducibility
    set_seed(args.seed)

    # Set CUDA environment
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    args.cuda_condition = torch.cuda.is_available() and not args.no_cuda

    # Load data; sizes are stored as max id + 1 and user count + 1.
    seq_dic, max_item, num_users = get_seq_dic(args)
    args.item_size = max_item + 1
    args.num_users = num_users + 1

    # Prepare checkpoint paths
    args.checkpoint_path = os.path.join(args.output_dir, args.train_name + '.pt')
    args.same_target_path = os.path.join(args.data_dir, args.data_name+'_same_target.npy')

    # Load dataloaders
    train_dataloader, eval_dataloader, test_dataloader = get_dataloder(args, seq_dic)

    # Initialize and log model
    logger.info(str(args))
    model = SASRecModel(args=args)
    logger.info(model)

    # Initialize trainer
    trainer = Trainer(model, train_dataloader, eval_dataloader, test_dataloader, args, logger)

    # Generate rating matrices for evaluation
    args.valid_rating_matrix, args.test_rating_matrix = get_rating_matrix(args.data_name, seq_dic, max_item)

    # Training and evaluation
    if args.do_eval:
        if args.load_model is None:
            logger.info("No model input!")
            # `exit` is the interactive helper injected by `site`; raising SystemExit
            # stops the cell here, so `result_info` below is never read undefined.
            raise SystemExit(0)

        args.checkpoint_path = os.path.join(args.output_dir, args.load_model + '.pt')
        trainer.load(args.checkpoint_path)
        logger.info(f"Load model from {args.checkpoint_path} for test!")
        scores, result_info = trainer.test(0)
    else:
        early_stopping = EarlyStopping(args.checkpoint_path, logger=logger, patience=args.patience, verbose=True)
        for epoch in range(args.epochs):
            trainer.train(epoch)
            scores, _ = trainer.valid(epoch)
            # Early stopping watches only the last validation score
            # (MRR according to the original note — confirm against Trainer.valid).
            early_stopping(np.array(scores[-1:]), trainer.model)
            if early_stopping.early_stop:
                logger.info("Early stopping")
                break

        logger.info("---------------Test Score---------------")
        # Reload the weights stored at checkpoint_path before the final test pass.
        trainer.model.load_state_dict(torch.load(args.checkpoint_path))
        scores, result_info = trainer.test(0)

    logger.info(args.train_name)
    logger.info(result_info)

2024-11-27 12:30:44,466 - <__main__.Args object at 0x7fa09455d7d0>
2024-11-27 12:30:44,597 - SASRecModel(
  (item_embeddings): Embedding(30091, 256, padding_idx=0)
  (position_embeddings): Embedding(50, 256)
  (LayerNorm): LayerNorm()
  (dropout): Dropout(p=0.5, inplace=False)
  (item_encoder): TransformerEncoder(
    (blocks): ModuleList(
      (0): TransformerBlock(
        (layer): MultiHeadAttention(
          (query): Linear(in_features=256, out_features=256, bias=True)
          (key): Linear(in_features=256, out_features=256, bias=True)
          (value): Linear(in_features=256, out_features=256, bias=True)
          (softmax): Softmax(dim=-1)
          (attn_dropout): Dropout(p=0.5, inplace=False)
          (dense): Linear(in_features=256, out_features=256, bias=True)
          (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
          (out_dropout): Dropout(p=0.5, inplace=False)
        )
        (feed_forward): FeedForward(
          (dense_1): Linear(in_fea

In [3]:
# model save
# Serialises the whole in-memory `model` object (not only its state_dict) to the
# current working directory; the prediction section reloads this exact file.
# NOTE(review): the file name says "bert4rec" although the training cell builds a
# SASRecModel — confirm the naming is intentional.
torch.save(model, "bert4rec_20241127.pt")

In [20]:
# # Import your model class (make sure to import the correct one)
# from model.sasrec import SASRecModel  # or import the appropriate model class

# # Instantiate the model
# model = SASRecModel(args)

# # Load the state dictionary into the model
# model.load_state_dict(torch.load("/data/log-data-2024/yh/LLM_EB/src_dp/output/llmeb.pt", map_location=device))

# # Move the model to the device (CPU or GPU)
# model = model.to(device)

# # Now you can call your predict function
# pred = predict(model, ids, device).tolist()

  model.load_state_dict(torch.load("/data/log-data-2024/yh/LLM_EB/src_dp/output/llmeb.pt", map_location=device))


RuntimeError: Error(s) in loading state_dict for SASRecModel:
	Missing key(s) in state_dict: "item_embeddings.weight", "position_embeddings.weight", "LayerNorm.weight", "LayerNorm.bias", "item_encoder.blocks.0.layer.query.weight", "item_encoder.blocks.0.layer.query.bias", "item_encoder.blocks.0.layer.key.weight", "item_encoder.blocks.0.layer.key.bias", "item_encoder.blocks.0.layer.value.weight", "item_encoder.blocks.0.layer.value.bias", "item_encoder.blocks.0.layer.dense.weight", "item_encoder.blocks.0.layer.dense.bias", "item_encoder.blocks.0.layer.LayerNorm.weight", "item_encoder.blocks.0.layer.LayerNorm.bias", "item_encoder.blocks.0.feed_forward.dense_1.weight", "item_encoder.blocks.0.feed_forward.dense_1.bias", "item_encoder.blocks.0.feed_forward.dense_2.weight", "item_encoder.blocks.0.feed_forward.dense_2.bias", "item_encoder.blocks.0.feed_forward.LayerNorm.weight", "item_encoder.blocks.0.feed_forward.LayerNorm.bias", "item_encoder.blocks.1.layer.query.weight", "item_encoder.blocks.1.layer.query.bias", "item_encoder.blocks.1.layer.key.weight", "item_encoder.blocks.1.layer.key.bias", "item_encoder.blocks.1.layer.value.weight", "item_encoder.blocks.1.layer.value.bias", "item_encoder.blocks.1.layer.dense.weight", "item_encoder.blocks.1.layer.dense.bias", "item_encoder.blocks.1.layer.LayerNorm.weight", "item_encoder.blocks.1.layer.LayerNorm.bias", "item_encoder.blocks.1.feed_forward.dense_1.weight", "item_encoder.blocks.1.feed_forward.dense_1.bias", "item_encoder.blocks.1.feed_forward.dense_2.weight", "item_encoder.blocks.1.feed_forward.dense_2.bias", "item_encoder.blocks.1.feed_forward.LayerNorm.weight", "item_encoder.blocks.1.feed_forward.LayerNorm.bias". 
	Unexpected key(s) in state_dict: "module.item_embeddings.weight", "module.position_embeddings.weight", "module.LayerNorm.weight", "module.LayerNorm.bias", "module.item_encoder.blocks.0.layer.query.weight", "module.item_encoder.blocks.0.layer.query.bias", "module.item_encoder.blocks.0.layer.key.weight", "module.item_encoder.blocks.0.layer.key.bias", "module.item_encoder.blocks.0.layer.value.weight", "module.item_encoder.blocks.0.layer.value.bias", "module.item_encoder.blocks.0.layer.dense.weight", "module.item_encoder.blocks.0.layer.dense.bias", "module.item_encoder.blocks.0.layer.LayerNorm.weight", "module.item_encoder.blocks.0.layer.LayerNorm.bias", "module.item_encoder.blocks.0.feed_forward.dense_1.weight", "module.item_encoder.blocks.0.feed_forward.dense_1.bias", "module.item_encoder.blocks.0.feed_forward.dense_2.weight", "module.item_encoder.blocks.0.feed_forward.dense_2.bias", "module.item_encoder.blocks.0.feed_forward.LayerNorm.weight", "module.item_encoder.blocks.0.feed_forward.LayerNorm.bias", "module.item_encoder.blocks.1.layer.query.weight", "module.item_encoder.blocks.1.layer.query.bias", "module.item_encoder.blocks.1.layer.key.weight", "module.item_encoder.blocks.1.layer.key.bias", "module.item_encoder.blocks.1.layer.value.weight", "module.item_encoder.blocks.1.layer.value.bias", "module.item_encoder.blocks.1.layer.dense.weight", "module.item_encoder.blocks.1.layer.dense.bias", "module.item_encoder.blocks.1.layer.LayerNorm.weight", "module.item_encoder.blocks.1.layer.LayerNorm.bias", "module.item_encoder.blocks.1.feed_forward.dense_1.weight", "module.item_encoder.blocks.1.feed_forward.dense_1.bias", "module.item_encoder.blocks.1.feed_forward.dense_2.weight", "module.item_encoder.blocks.1.feed_forward.dense_2.bias", "module.item_encoder.blocks.1.feed_forward.LayerNorm.weight", "module.item_encoder.blocks.1.feed_forward.LayerNorm.bias". 

## Prediction

In [4]:
from collections import Counter
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

# Pick the device first so the pickled model can be mapped onto it while loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device('cpu')

# model = torch.load("bert4rec.pt")
# map_location keeps this cell working on a CPU-only host even when the model was
# saved from a GPU, and guarantees model and `device` agree for predict().
# NOTE: this unpickles a full Python object — only load files you produced yourself.
# NOTE(review): recent PyTorch releases may additionally need weights_only=False here.
model = torch.load("bert4rec_20241127.pt", map_location=device)

In [5]:
# Standalone predict function
def predict(model, input_ids, device, top_k=40):
    """Return the ids of the highest-scoring items for every input sequence.

    Args:
        model: module exposing ``forward(input_ids, all_sequence_output=False)``
            and an ``item_embeddings`` embedding table. The forward output is
            indexed as ``[:, -1, :]``, i.e. it must be 3-dimensional.
        input_ids: batch of item-id sequences; a nested list or an existing tensor.
        device: torch device the input batch is moved to.
        top_k: number of items to return per sequence (default 40, the value that
            used to be hard-coded). Clamped to the number of rows in the item
            embedding table so small catalogues do not crash ``argpartition``.

    Returns:
        ``numpy.ndarray`` of shape ``(batch, k)`` holding item ids sorted by
        descending score, where ``k = min(top_k, number_of_items)``.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    model.eval()
    # as_tensor accepts lists and tensors alike; torch.tensor() on an existing
    # tensor triggers the "copy construct from a tensor" UserWarning.
    input_ids = torch.as_tensor(input_ids, dtype=torch.long).to(device)

    with torch.no_grad():
        sequence_output = model.forward(input_ids, all_sequence_output=False)
        last_state = sequence_output[:, -1, :]  # Last item in the sequence

        # Score every item by the dot product with its embedding.
        item_emb = model.item_embeddings.weight
        rating_pred = torch.matmul(last_state, item_emb.transpose(0, 1))
        rating_pred = rating_pred.detach().cpu().numpy().copy()

    k = min(top_k, rating_pred.shape[1])
    rows = np.arange(len(rating_pred))[:, None]

    # argpartition selects the k best candidates (unordered) ...
    candidates = np.argpartition(rating_pred, -k)[:, -k:]
    # ... which are then ordered by descending score.
    candidate_scores = rating_pred[rows, candidates]
    order = np.argsort(candidate_scores)[:, ::-1]

    return candidates[rows, order]

def get_attention_weight(model, input_ids, device):
    """Return the model's item-embedding weight matrix.

    Despite its name this function does not return attention weights: the only
    value it has ever returned is ``model.item_embeddings.weight``. It previously
    also ran a full forward pass whose result was discarded; that dead
    computation has been removed.

    Args:
        model: module exposing ``item_embeddings``; it is switched to eval mode.
        input_ids: unused, kept so existing call sites keep working.
        device: unused, kept so existing call sites keep working.

    Returns:
        The ``item_embeddings.weight`` parameter itself (not a copy).
    """
    model.eval()
    return model.item_embeddings.weight

In [6]:
with open('/data/log-data-2024/SASRec/BSARec/src/data/input_search_augmented_final_20241127.txt', 'r') as f:
    input_data = f.readlines()

# Each line is whitespace separated; the first token is skipped and the remaining
# tokens are parsed as integer item ids. Sequences shorter than 51 are left-padded
# with 0 (longer ones are kept as they are).
input_ids = []
for line in input_data:
    items = [int(token) for token in line.replace("\n", "").split()[1:]]
    input_ids.append([0] * (51 - len(items)) + items)

# Same sequences with the 0 padding stripped again, used only for length statistics.
confirm = [[token for token in seq if token != 0] for seq in tqdm(input_ids)]

length = [len(seq) for seq in confirm]
print(np.mean(length), np.std(length), np.min(length), np.max(length))

100%|███████████████████████████████| 307010/307010 [00:01<00:00, 191038.89it/s]


10.369053125305365 24.01168706773932 3 702


In [7]:
# Keep only the 50 most recent positions of every sequence.
windows = [seq[-50:] for seq in input_ids]

# The final item of each window is held out as the label; the rest is the model input.
label = [window[-1] for window in windows]
input_ids = [window[:-1] for window in windows]

# Score the first 1000 sequences only.
lab, ids = label[:1000], input_ids[:1000]

ids = torch.tensor(ids, dtype=torch.long).to(device)

pred = predict(model, ids, device).tolist()  # SASRec predictions

  after removing the cwd from sys.path.


In [8]:
# `ids` is a tensor right after the SASRec cell and a plain list once this cell
# has run; the guard lets the cell be re-executed without an AttributeError.
if torch.is_tensor(ids):
    ids = ids.cpu().tolist()

# Popularity baseline: count how often each item id occurs in the evaluation
# inputs. The padding id 0 is excluded explicitly; the previous version dropped
# whichever token happened to be most frequent (presumably the padding) and kept
# 21 items instead of 20.
token_counts = Counter(token for seq in ids for token in seq if token != 0)
popular_items = [token for token, _ in token_counts.most_common(20)]

# Every user receives the same top-20 list.
cnt = [popular_items for _ in range(len(lab))]  # PopRec predictions

In [37]:
# # llm for embeddings
# lr = pd.read_csv("/data/log-data-2024/yh/LLM_RS/top_prediction.csv")
# lr = lr.fillna(1)
# lr_pred = []
# for i in lr.index:
#     lr_pred.append([int(w) for w in list(lr.loc[i][lr.columns[1:]])])

In [9]:
def hr_at_k(recommendations, true_labels, k=20):
    """Hit rate@k: share of labels that appear in the first ``k`` recommendations.

    ``recommendations[n]`` is matched with ``true_labels[n]`` by position and the
    hit count is divided by ``len(true_labels)``.
    """
    truncated = [recs[:k] for recs in recommendations]
    hits = sum(
        1 for pos, target in enumerate(true_labels) if target in truncated[pos]
    )
    return hits / len(true_labels)

def precision_at_k(recommendations, true_labels, k=20):
    """Mean precision@k when every user has exactly one true label.

    A user scores ``1 / k`` if the label is among the first ``k`` recommendations
    and ``0`` otherwise; the per-user scores are averaged over all zipped pairs.
    """
    precision_scores = [
        # one relevant item at most, so precision is either 1/k or 0
        (1 if true_label in user_recommendations[:k] else 0) / k
        for user_recommendations, true_label in zip(recommendations, true_labels)
    ]
    # average precision over all users
    return sum(precision_scores) / len(precision_scores)

def recall_at_k(recommendations, true_labels, k=20):
    """Mean recall@k when every user has exactly one true label.

    A user scores ``1`` if the label is among the first ``k`` recommendations and
    ``0`` otherwise; the per-user scores are averaged over all zipped pairs.
    """
    recall_scores = [
        # with a single relevant item recall is simply hit (1) or miss (0)
        1 if true_label in user_recommendations[:k] else 0
        for user_recommendations, true_label in zip(recommendations, true_labels)
    ]
    # average recall over all users
    return sum(recall_scores) / len(recall_scores)

def total_print(k):
    """Print HR@k and precision@k (rounded to 3 decimals) for SASRec and PopRec.

    Reads the notebook-level ``pred`` (SASRec), ``cnt`` (PopRec) and ``lab``
    (ground-truth labels) variables.
    """
    sasrec_hr = round(hr_at_k(pred, lab, k = k), 3)
    print("SASRec HR@{}: ".format(k), sasrec_hr)
#     print("LLM as RS HR@{}: ".format(k), round(hr_at_k(lr_pred, lab, k = k), 3))
    poprec_hr = round(hr_at_k(cnt, lab, k = k), 3)
    print("PopRec HR@{}: ".format(k), poprec_hr)

    print("")

    sasrec_precision = round(precision_at_k(pred, lab, k = k), 3)
    print("SASRec precision@{}: ".format(k), sasrec_precision)
#     print("LLM as RS precision@{}: ".format(k), round(precision_at_k(lr_pred, lab, k = k), 3))
    poprec_precision = round(precision_at_k(cnt, lab, k = k), 3)
    print("PopRec precision@{}: ".format(k), poprec_precision)

In [10]:
# Report both metrics at every cutoff, separated by a blank line.
cutoffs = (1, 3, 5, 10, 15, 20)
for position, cutoff in enumerate(cutoffs):
    if position > 0:
        print("")
    total_print(cutoff)

SASRec HR@1:  0.117
PopRec HR@1:  0.017

SASRec precision@1:  0.117
PopRec precision@1:  0.017

SASRec HR@3:  0.162
PopRec HR@3:  0.054

SASRec precision@3:  0.054
PopRec precision@3:  0.018

SASRec HR@5:  0.201
PopRec HR@5:  0.065

SASRec precision@5:  0.04
PopRec precision@5:  0.013

SASRec HR@10:  0.228
PopRec HR@10:  0.102

SASRec precision@10:  0.023
PopRec precision@10:  0.01

SASRec HR@15:  0.263
PopRec HR@15:  0.118

SASRec precision@15:  0.018
PopRec precision@15:  0.008

SASRec HR@20:  0.292
PopRec HR@20:  0.147

SASRec precision@20:  0.015
PopRec precision@20:  0.007


# Prediction for new data

In [14]:
import pandas as pd
import pickle
from tqdm import tqdm

# Device table and raw sequence file; the sequence file has no header row, so its
# single tab-separated layout yields integer column labels starting at 0.
df = pd.read_csv("/data/log-data-2024/2.sequence_generate_ksc/data/sequence_device_match_241127.csv")
cf = pd.read_csv(
    "/data/log-data-2024/20241127_Final/input_search_final_20241127.txt",
    sep="\t",
    header=None,
)
# Column-wise concat: rows are paired by index — presumably both files list the
# same devices in the same order (TODO confirm).
df = pd.concat([df, cf], axis=1)

# Two lookup dictionaries stored as pickles.
# NOTE: pickle.load executes arbitrary code; only use trusted files.
with open('/data/log-data-2024/20241123_Final/match_dict_final.pickle', 'rb') as f:
    dic1 = pickle.load(f)

with open('/data/log-data-2024/20241123_Final/match_dict_final2.pickle', 'rb') as f:
    dic2 = pickle.load(f)

# Values of the "treatment1" column of the sample file.
sample_frame = pd.read_csv("/data/log-data-2024/20241127_Final/8man_sample_20241127.csv")
samp = list(sample_frame["treatment1"])

In [15]:
# Invert dic1 (value -> key) and use it to re-key dic2. Keys of dic2 that do not
# occur among dic1's values are skipped explicitly instead of hiding every
# possible error behind a bare `except`.
rev = {value: key for key, value in dic1.items()}
dic = {rev[key]: value for key, value in dic2.items() if key in rev}

# Flag the rows whose device_id is part of the sample. Membership is tested
# against a set: the list lookup was O(len(samp)) per row.
samp_ids = set(samp)
df["use"] = [1 if device_id in samp_ids else 0 for device_id in tqdm(df["device_id"])]

# Keep the flagged rows only, one row per device.
df = df[df["use"] == 1]
df = df.drop_duplicates(subset = 'device_id').reset_index(drop = True)

100%|███████████████████████████████████| 73505/73505 [00:20<00:00, 3616.25it/s]


In [41]:
# # LLM as RS를 위한 처리
# convert = {}
# for i in dic1:
#     try:
#         convert[i] = dic2[dic1[i]]
#     except:
#         pass

# df = df[["device_id", 0]].reset_index(drop = True)
# df[1] = [i.split()[1:] for i in df[0]]

# rev = dict(zip(list(dic2.values()), list(dic2.keys())))

# res = []
# for i in df[1]:
#     temp = []
#     for w in i:
#         try:
#             num = convert[int(w)] 
#             temp.append(num+ ". " + rev[num])
#         except:
#             pass
#     res.append(temp)
# df["llmasrs"] = res

# df.to_csv("/data/log-data-2024/2.sequence_generate_ksc/data/prediction_set_for_LLMasRS.csv")

In [17]:
# input_file_path = "/data/log-data-2024/SASRec/BSARec/src/data/input_search_prediction_final.txt"
# with open(input_file_path, 'r') as f:
#     input_data = f.readlines()

# Column 0 of df holds one whitespace-separated sequence per row. The leading
# token is dropped and the remaining item ids are kept as one string per row.
# The previous try/except around str(w) could never fire (w is already a str)
# and its bare `except` would only have hidden unrelated errors.
# (A disabled variant mapped each id through `dic`: temp.append(dic[int(w)]).)
input_data = [" ".join(row.split()[1:]) for row in df[0]]

# Parse the ids and left-pad with 0 up to args.max_seq_length. Longer sequences
# get no padding (a negative pad_len yields an empty list) and are not truncated here.
input_ids = []
for line in input_data:
    items = [int(token) for token in line.split()]
    pad_len = args.max_seq_length - len(items)
    input_ids.append([0] * pad_len + items)
    
# confirm = []
# for i in tqdm(input_ids):
#     temp = []
#     for w in i:
#         if w != 0:
#             temp.append(w)
#     confirm.append(temp)

# length = [len(i) for i in confirm]
# print(np.mean(length), np.std(length), np.min(length), np.max(length))

In [18]:
# Keep at most the 50 most recent items of every sequence. A single slice
# replaces the loop that re-copied the list once per dropped element.
ii = [seq[-50:] for seq in input_ids]

In [19]:
# Predict in chunks of 1000 sequences. The chunk count is derived from len(ii):
# the former fixed range(20) silently ignored everything past row 20000 and
# passed empty batches to the model when there were fewer rows.
PRED_BATCH_SIZE = 1000

pred = []
for start in range(0, len(ii), PRED_BATCH_SIZE):
    batch = ii[start:start + PRED_BATCH_SIZE]
    pred.extend(predict(model, batch, device).tolist())

In [20]:
# Pair every device_id with its list of predicted item ids and write the table to CSV.
# `samp` is rebound here: it previously held the list of sampled device ids.
# `pred` must contain exactly one entry per row of df, otherwise the DataFrame
# constructor raises on the length mismatch.
samp = pd.DataFrame({"treatment1" : df["device_id"], "treatment1 prediction" : pred}).reset_index(drop = True)
# to_csv is called without index=False, so the row index is written as the first column.
samp.to_csv("/data/log-data-2024/20241127_Final/8man_sample_20241127_predicted.csv")

In [48]:
# samp = pd.read_csv("/data/log-data-2024/8man_sample_new.csv")
# samp = samp[["control", "treatment1", "treatment2", "treatment3"]]
# samp["treatment1 prediction"] = pred
# samp.to_csv("/data/log-data-2024/8man_sample_new_new.csv")