# **Commerce Purchase Behavior Prediction baseline SASRec code**
> RecSys Advanced 강의의 Commerce Purchase Behavior Prediction 대회에 참가하신 여러분 환영합니다! 🎉     
> 아래 baseline에서는 RecBole을 활용해 SASRec 모델을 학습하고 예측 파일을 생성하는 프로세스에 대해 알아보겠습니다.

## Contents
- Prepare Environments
- Import Library & Load Dataset
- Recbole dataset으로 변환
- Modeling
- Inference & Save Submission File


## 1. Prepare Environments

* 데이터 로드를 위한 구글 드라이브를 마운트합니다.
* 필요한 라이브러리를 설치합니다.

In [None]:
# Mount Google Drive. Skip this cell when not running on Colab.
from google.colab import drive
# The drive is mounted at two locations: '/gdrive' (forcing a remount) and '/content/drive'.
drive.mount('/gdrive', force_remount=True)
drive.mount('/content/drive')

Mounted at /gdrive
Mounted at /content/drive


In [None]:
# Download the competition data archive and extract it into the current working directory.
# NOTE(review): the archive is fetched with wget from a public URL, not read from Google Drive.
!wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000290/data/data.tar.gz
# The verbose file listing printed by tar is discarded.
!tar -xvf data.tar.gz > /dev/null

--2024-01-24 05:50:27--  https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000290/data/data.tar.gz
Resolving aistages-api-public-prod.s3.amazonaws.com (aistages-api-public-prod.s3.amazonaws.com)... 52.219.202.59, 52.219.206.3, 52.219.148.22, ...
Connecting to aistages-api-public-prod.s3.amazonaws.com (aistages-api-public-prod.s3.amazonaws.com)|52.219.202.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400028916 (381M) [binary/octet-stream]
Saving to: ‘data.tar.gz’


2024-01-24 05:50:38 (36.2 MB/s) - ‘data.tar.gz’ saved [400028916/400028916]

tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.macl'


In [None]:
# Install the required libraries (-q keeps pip's output quiet).
!pip install recbole -q
!pip install kmeans_pytorch -q
!pip install ray -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.8/64.8 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Import Library & Load Dataset
* 학습에 필요한 라이브러리를 로드합니다.

In [None]:
import pandas as pd
import os
import numpy as np
import math
import random
import torch
import json
from tqdm import tqdm
from collections import defaultdict

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import SASRec
from recbole.trainer import Trainer
from recbole.utils import init_seed
from recbole.utils.case_study import full_sort_topk
from recbole.quick_start.quick_start import load_data_and_model

In [None]:
def set_seed(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED``, and switches cuDNN to
    deterministic behaviour.

    Args:
        seed: The integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for any process started afterwards; the running interpreter's
    # hash seed is fixed at start-up and is not changed by this assignment.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Some cuDNN kernels can be non-deterministic even with a fixed seed
    # unless deterministic mode is requested explicitly.
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different algorithm from run to run,
    # so it has to be disabled as well for reproducible results.
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [None]:
# Load the training data from the extracted parquet file.
train_df = pd.read_parquet('/content/data/train.parquet')

## 3. Recbole dataset으로 변환
* Recbole library에서 활용하는 dataset으로 변환

In [None]:
# Parse event_time into datetimes, order the log chronologically and keep only the needed columns.
train_df['event_time'] = pd.to_datetime(train_df['event_time'], format='%Y-%m-%d %H:%M:%S %Z')
train_df = train_df.sort_values(by=['event_time'])
train_df = train_df[['user_id','item_id','user_session','event_time']]
# Store the timestamps as floats so they can be exported as a float column below.
train_df['event_time'] = train_df['event_time'].values.astype(float)

# Build id <-> index lookup tables for users and items.
# Indices follow the order of first appearance in the time-sorted log; the unique
# values are computed once per column and shared by both directions of the mapping.
unique_users = train_df['user_id'].unique()
unique_items = train_df['item_id'].unique()
user2idx = {user: idx for idx, user in enumerate(unique_users)}  # user id -> index
idx2user = dict(enumerate(unique_users))  # index -> user id
item2idx = {item: idx for idx, item in enumerate(unique_items)}  # item id -> index
idx2item = dict(enumerate(unique_items))  # index -> item id

# Add the index columns, drop rows containing missing values, and rename the
# columns to the "<name>:<type>" headers used in the exported .inter file.
train_df['user_idx'] = train_df['user_id'].map(user2idx)
train_df['item_idx'] = train_df['item_id'].map(item2idx)
train_df = train_df.dropna().reset_index(drop=True)
recbole_df = train_df.rename(columns={'user_idx': 'user_idx:token', 'item_idx': 'item_idx:token', 'event_time': 'event_time:float'})

# exist_ok=True lets this cell be re-run without failing on the already-created directory.
os.makedirs('./SASRec_dataset', exist_ok=True)
recbole_df[['user_idx:token', 'item_idx:token', 'event_time:float']].to_csv('./SASRec_dataset/SASRec_dataset.inter', sep='\t',index=None)

## 4. Modeling

In [None]:
# RecBole configuration: dataset fields and filtering, SASRec hyper-parameters,
# training schedule and evaluation protocol.
config_dict = {
    'data_path': '/content',# The path of input dataset.
    'USER_ID_FIELD': 'user_idx',
    'ITEM_ID_FIELD': 'item_idx',
    'TIME_FIELD': 'event_time',
    'user_inter_num_interval': "[5,Inf)",# Users whose number of interactions is in the interval will be retained.
    'item_inter_num_interval': "[5,Inf)",# Items whose number of interactions is in the interval will be retained.
    'load_col': {'inter': ['user_idx', 'item_idx', 'event_time']},

    'train_batch_size' : 4096,
    'hidden_size': 64,
    'n_layers': 2,
    'n_heads': 4,
    'inner_size': 64,
    'hidden_dropout_prob': 0.2,
    'attn_dropout_prob': 0.2,
    'hidden_act': 'gelu',
    'layer_norm_eps': 1e-12,
    'initializer_range': 0.02,
    # NOTE(review): 'pooling_mode', 'fusion_type' and 'attribute_predictor' do not look like
    # SASRec parameters and are presumably ignored by the model - confirm before relying on them.
    'pooling_mode': 'sum',
    'loss_type': 'BPR',
    'fusion_type': 'gate',
    'attribute_predictor': 'linear',
    # RecBole reads the number of training epochs from 'epochs'. The previous key 'epoch'
    # is not a RecBole parameter, so the library default was used instead of 1.
    'epochs' : 1, # Increase the number of epochs for better performance.
    'stopping_step': 5,

    'MAX_ITEM_LIST_LENGTH': 50,# Maximum length of each generated sequence. Defaults to 50.
    'eval_args': {
        'split': {'LS': 'valid_and_test'},
        'group_by': 'user', # the data will be grouped by the column of USER_ID_FIELD and split in user dimension.
        'order': 'TO', # sort the data by the column of TIME_FIELD in ascending order and the split them in this order.
         # uni100 means uniformly sample 100 negative items for each positive item in testing set,
         # and evaluate the model on these positive items with their sampled negative items.
        'mode': 'uni100'
                },
    'metrics': ['Recall','NDCG'],
    'topk': 10,
    'valid_metric': 'NDCG@10',
    'checkpoint_dir' : '/content'
}

config = Config(model='SASRec',
                config_dict=config_dict,
                dataset='SASRec_dataset')

# Seed RecBole with the values held in the config, then build the dataset and its splits.
init_seed(config['seed'], config['reproducibility'])
dataset = create_dataset(config)
# The third (test) split is not used in this cell.
train_data, valid_data, _ = data_preparation(config, dataset)




In [None]:
# Instantiate the SASRec model and move it to the configured device.
model = SASRec(config, train_data.dataset).to(config['device'])
print("model information : ", model)

# Initialise the trainer.
trainer = Trainer(config, model)

# Train the model, validating on valid_data; saved=True asks the trainer to save the model.
trainer.fit(train_data, valid_data, saved=True, show_progress=config["show_progress"])

model information :  SASRec(
  (item_embedding): Embedding(29288, 64, padding_idx=0)
  (position_embedding): Embedding(50, 64)
  (trm_encoder): TransformerEncoder(
    (layer): ModuleList(
      (0-1): 2 x TransformerLayer(
        (multi_head_attention): MultiHeadAttention(
          (query): Linear(in_features=64, out_features=64, bias=True)
          (key): Linear(in_features=64, out_features=64, bias=True)
          (value): Linear(in_features=64, out_features=64, bias=True)
          (softmax): Softmax(dim=-1)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (dense): Linear(in_features=64, out_features=64, bias=True)
          (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
          (out_dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): FeedForward(
          (dense_1): Linear(in_features=64, out_features=64, bias=True)
          (dense_2): Linear(in_features=64, out_features=64, bias=True)
          (LayerNorm): LayerNor

[1;35mTrain     0[0m: 100%|█████████████████████| 1630/1630 [03:20<00:00,  8.12it/s, [1;33mGPU RAM: 7.04 G/39.56 G[0m][0m
[1;35mEvaluate   [0m: 100%|█████████████████████| 9249/9249 [05:48<00:00, 26.57it/s, [1;33mGPU RAM: 7.04 G/39.56 G[0m][0m
[1;35mTrain     1[0m:   9%|█▉                    | 147/1630 [00:18<03:03,  8.09it/s, [1;33mGPU RAM: 7.04 G/39.56 G[0m][0m


KeyboardInterrupt: 

## 5. Inference & Save Submission File


In [None]:
!ls /content/ # check the path of the saved model file

data	     drive  log_tensorboard  SASRec_dataset
data.tar.gz  log    sample_data      SASRec-Jan-24-2024_06-25-48.pth


In [None]:
from tqdm import tqdm
train_df = train_df.sort_values(by=['user_session','event_time'])

# Group item indices by user index; the keys of this dict are the users to predict for.
users = defaultdict(list) # a defaultdict returns the default value for a missing key
for u, i in zip(train_df['user_idx'], train_df['item_idx']):
    users[u].append(i)

# Load the saved model and its data; change the path to the name of your saved model file.
config, model, dataset, _ , _, test_data = load_data_and_model(
    model_file='/content/SASRec-Jan-24-2024_06-25-48.pth'
)
print('Data and model load compelete')

# The 10 most frequent items (ties broken by ascending item index), used as the
# fallback recommendation for users the model cannot score.
popular_top_10 = train_df.groupby('item_idx').count().rename(columns = {"user_idx": "user_counts"}).sort_values(by=['user_counts', 'item_idx'], ascending=[False, True])[:10].index
result = []

# Loop-invariant lookups, resolved once instead of once per user.
known_user_tokens = dataset.field2token_id[dataset.uid_field]
inference_device = config['device']
fallback_items = list(popular_top_10)

# Users missing from the RecBole user vocabulary get the popular items;
# presumably these are the users removed by the interaction-count filter.
for uid in tqdm(users):
    user_token = str(uid)
    if user_token in known_user_tokens:
        recbole_id = dataset.token2id(dataset.uid_field, user_token)
        topk_score, topk_iid_list = full_sort_topk([recbole_id], model, test_data, k=10, device=inference_device)
        # Map RecBole's internal item ids back to the item_idx tokens written to the .inter file.
        predicted_item_list = dataset.id2token(dataset.iid_field, topk_iid_list.cpu())
        predicted_item_list = predicted_item_list[-1]
        predicted_item_list = list(map(int,predicted_item_list))
    else: # cold-start users
        predicted_item_list = fallback_items

    # Translate the indices back to the original user and item ids.
    result.extend((idx2user[uid], idx2item[iid]) for iid in predicted_item_list)


pd.DataFrame(result, columns=["user_id", "item_id"]).to_csv("output.csv", index=False)

## Required Package

recbole==1.2.0 <br>
kmeans-pytorch==0.3 <br>
ray==2.9.1 <br>
pandas==1.5.3 <br>
scipy==1.11.4 <br>
numpy==1.23.5 <br>




## 콘텐츠 라이선스

저작권 : <font color='blue'><b>©2023 by Upstage X fastcampus Co., Ltd. All rights reserved.</b></font>

<b><font color='red'>WARNING</font> : 본 교육 콘텐츠의 지식재산권은 업스테이지 및 패스트캠퍼스에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.</b>