In [1]:
import os
import sys
sys.path.append('../../')
import torch 
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import math
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from tqdm import tqdm
import recs.path as path
from recs.cf import CollaborativeFiltering, generate_cf_predict_df
from recs.util import id2cat, recall_evaluate, readjson2dict, save2json, get_cat2id


In [2]:
# Preload    
u2idx = readjson2dict("crm_idx")
i2idx = readjson2dict("fund_info_idx")
fund_info_web_path = os.path.join(path.data_path, "fund_info_web.pkl")
crm_path = os.path.join(path.data_path, "crm.pkl")
trans_buy_path = os.path.join(path.data_path, "trans_buy.pkl")


fund_info_web = pd.read_pickle(fund_info_web_path)
fund_info_web = fund_info_web[fund_info_web.fund_id.isin(i2idx.keys())]
fund_info_web.fund_id = fund_info_web.fund_id.apply(lambda x: i2idx[x])
crm = pd.read_pickle(crm_path)

exist_funds = fund_info_web.fund_id.unique().tolist()

trans_buy_df = pd.read_pickle(trans_buy_path)
trans_buy_df = trans_buy_df[trans_buy_df.fund_id.isin(exist_funds)]

user_trade_freq = trans_buy_df.id_number.value_counts().index.tolist()

groups_index = []
groups_index.append([0, 527])
groups_index.append([528, 3760])
groups_index.append([3761, 38181])
groups_index.append([38182, len(user_trade_freq)])

In [3]:
predict_period = [202001, 202007]
history_period = [201400, 202000]
bestsel_period = [201800, 202000]
target_user_date = 201912
# history_period = [201400, 202100]
# bestsel_period = [201900, 202100]
# target_user_date = 202012
trans_buy = trans_buy_df.copy()
# training
train_df = trans_buy[
    (trans_buy.buy_date < history_period[1]*100) &
    (trans_buy.buy_date > history_period[0]*100) 
]

best_selling_funds = trans_buy_df[
            (trans_buy_df.buy_date > bestsel_period[0]*100) &
            (trans_buy_df.buy_date < bestsel_period[1]*100)
        ].fund_id.value_counts().index.to_series()

trans_buy = trans_buy_df.copy()
# training
train_df = trans_buy[
    (trans_buy.buy_date < history_period[1]*100) &
    (trans_buy.buy_date > history_period[0]*100) 
]
train_df = train_df.groupby(["id_number", "fund_id"]).size().reset_index(name="Time")
best_selling_num = 50
if (best_selling_num != -1):
    best_selling_exist_funds = list(set(exist_funds) & set(best_selling_funds.tolist()[:best_selling_num]))
else:
    best_selling_exist_funds = list(set(exist_funds))

train_df = train_df[train_df.fund_id.isin(best_selling_exist_funds)]

In [4]:
# train_df.id_number.value_counts()
cat_dict, cats = get_cat2id(train_df.id_number)
train_df.id_number = cats

cat_dict

{8: 0,
 24: 1,
 25: 2,
 26: 3,
 29: 4,
 30: 5,
 32: 6,
 39: 7,
 40: 8,
 42: 9,
 45: 10,
 47: 11,
 48: 12,
 49: 13,
 53: 14,
 56: 15,
 59: 16,
 61: 17,
 62: 18,
 66: 19,
 67: 20,
 70: 21,
 74: 22,
 75: 23,
 76: 24,
 77: 25,
 79: 26,
 82: 27,
 83: 28,
 87: 29,
 92: 30,
 93: 31,
 98: 32,
 102: 33,
 103: 34,
 105: 35,
 107: 36,
 108: 37,
 115: 38,
 118: 39,
 119: 40,
 120: 41,
 121: 42,
 122: 43,
 126: 44,
 127: 45,
 132: 46,
 133: 47,
 134: 48,
 137: 49,
 138: 50,
 140: 51,
 142: 52,
 146: 53,
 147: 54,
 152: 55,
 154: 56,
 155: 57,
 158: 58,
 162: 59,
 165: 60,
 166: 61,
 169: 62,
 170: 63,
 171: 64,
 173: 65,
 175: 66,
 176: 67,
 178: 68,
 181: 69,
 182: 70,
 185: 71,
 186: 72,
 188: 73,
 191: 74,
 193: 75,
 196: 76,
 197: 77,
 198: 78,
 201: 79,
 204: 80,
 205: 81,
 209: 82,
 215: 83,
 217: 84,
 219: 85,
 220: 86,
 221: 87,
 222: 88,
 227: 89,
 228: 90,
 230: 91,
 231: 92,
 232: 93,
 236: 94,
 237: 95,
 239: 96,
 240: 97,
 241: 98,
 242: 99,
 244: 100,
 248: 101,
 254: 102,
 255: 103,


In [5]:
len(cat_dict)

27825

In [6]:
USER_ID = "id_number"
ITEM_ID = "fund_id"
RATE_KEY = "Time"


class PMF(nn.Module):
    def __init__(self, num_users, num_items, emb_size, lam_u, lam_v):
        super(PMF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        
        nn.init.normal_(self.user_emb.weight)
        nn.init.normal_(self.item_emb.weight)
        
        self.user_emb.weight.mul(0.1)
        self.item_emb.weight.mul(0.1)
        
        self.lam_u = lam_u
        self.lam_v = lam_v
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        output = (u*v).sum(1)
        
        # Frobenius norm
        u_reg = self.lam_u * torch.sum(u**2)
        v_reg = self.lam_v * torch.sum(v**2)
        
        return output, u_reg, v_reg

def train_epocs(model, train_df, epochs=10000, lr=0.01, wd=0.0):
    
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for epoch in tqdm(range(epochs)):
        user_tensor = torch.LongTensor(train_df[USER_ID].values).cuda()
        item_tensor = torch.LongTensor(train_df[ITEM_ID].values).cuda()
        ratings = torch.FloatTensor(train_df[RATE_KEY].values).cuda()
        
        pred, u_reg, v_reg = model(user_tensor, item_tensor)
        loss = F.mse_loss(pred, ratings) + u_reg + v_reg
        if epoch % 1000 == 0:
            print(f"Epoch: {epoch}, Loss:{loss}")
            
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    

In [7]:
num_users = len(train_df[USER_ID])
num_items = len(train_df[ITEM_ID])
print(num_users)
print(num_items)
model = PMF(num_users, num_items, emb_size=1000, lam_u=0.01, lam_v=0.01).cuda()
train_epocs(model, train_df=train_df)


59395
59395


  0%|          | 4/10000 [00:00<05:23, 30.88it/s]

Epoch: 0, Loss:1195895.625


 10%|█         | 1012/10000 [00:19<02:43, 55.12it/s]

Epoch: 1000, Loss:4.946209907531738


 20%|██        | 2008/10000 [00:37<02:22, 56.14it/s]

Epoch: 2000, Loss:4.577517032623291


 30%|███       | 3010/10000 [00:56<02:05, 55.68it/s]

Epoch: 3000, Loss:4.578078746795654


 40%|████      | 4012/10000 [01:14<01:47, 55.94it/s]

Epoch: 4000, Loss:4.579052925109863


 50%|█████     | 5008/10000 [01:33<01:31, 54.81it/s]

Epoch: 5000, Loss:4.5804901123046875


 60%|██████    | 6010/10000 [01:51<01:15, 52.53it/s]

Epoch: 6000, Loss:4.582452297210693


 70%|███████   | 7004/10000 [02:10<01:08, 43.90it/s]

Epoch: 7000, Loss:4.585075378417969


 80%|████████  | 8005/10000 [02:29<00:36, 55.32it/s]

Epoch: 8000, Loss:4.588479042053223


 90%|█████████ | 9003/10000 [02:48<00:22, 45.26it/s]

Epoch: 9000, Loss:4.593582630157471


100%|██████████| 10000/10000 [03:08<00:00, 53.00it/s]


In [8]:
target_users_id = crm[crm.yyyymm == target_user_date].id_number.unique().tolist()
target_users_id[0]

38831

In [10]:
available_users = train_df.id_number.unique().tolist()

rec_dict = {}

items = torch.tensor(
        train_df[ITEM_ID].unique().tolist()
    ).cuda()

for user in tqdm(train_df.id_number.unique().tolist()):
    ignore_item = trans_buy_df[
        (trans_buy_df.id_number == user)
    ].fund_id.unique().tolist()

    user_tensor = torch.tensor([user]).cuda()
        
    predictions, _, __ = model(user_tensor, items)
    predictions = predictions.tolist()
    
    if (user not in available_users):
        rec_dict[str(user)] = best_selling_funds[~best_selling_funds.isin(ignore_item)].tolist()[:10]
        continue
        
    rdict = {
        "items": items.cpu().numpy().tolist(),
        "recStr": predictions
    }
    
    rec_df = pd.DataFrame(rdict).sort_values(by=['recStr'], ascending=False)
    
    rec_df = rec_df[~rec_df["items"].isin(ignore_item)]

    rec_dict[str(user)] = rec_df["items"].tolist()[:10] 

100%|██████████| 27825/27825 [00:59<00:00, 466.35it/s]


In [11]:
# testing
test_df = trans_buy[
    (trans_buy.buy_date > predict_period[0] *100) &
    (trans_buy.buy_date < predict_period[1] *100)
]
# Testing Result
ans_dict = {}
test_users = test_df[
    test_df.id_number.isin(target_users_id)
].id_number.unique().tolist()

for user in tqdm(test_users):
    ignore_item = trans_buy[
        (trans_buy.id_number == user) & 
        (trans_buy.buy_date < predict_period[0]*100)
    ].fund_id.unique().tolist()

    ans_dict[str(user)] = test_df[
        ~(test_df.fund_id.isin(ignore_item))
        &
        (test_df.id_number == user)
    ].fund_id.unique().tolist()
    
len(rec_dict)
dec_rec_result = {}
for user in tqdm(rec_dict):            
    dec_user = id2cat(cat_dict, int(user))
    dec_rec_result[str(dec_user)] = rec_dict[user]    

rec_dict = dec_rec_result

100%|██████████| 13258/13258 [00:31<00:00, 419.86it/s]
100%|██████████| 27825/27825 [00:25<00:00, 1088.97it/s]


In [None]:
rec_dict[13651]

In [None]:
ans_dict

In [12]:
pred_score = recall_evaluate(rec_dict, ans_dict)
pred_score


100%|██████████| 13258/13258 [00:00<00:00, 279547.17it/s]


{'recall@3': 0.0077842785214754726,
 'recall@5': 0.013755688006947257,
 'recall@10': 0.0279050200527296}

In [None]:
SystemExit()