In [1]:
import os
import sys
sys.path.append('../../')
import torch 
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import math
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from tqdm import tqdm
import recs.path as path
from recs.cf import CollaborativeFiltering, generate_cf_predict_df
from recs.util import id2cat, recall_evaluate, readjson2dict, save2json


In [2]:
# Preload    
u2idx = readjson2dict("crm_idx")
i2idx = readjson2dict("fund_info_idx")
fund_info_web_path = os.path.join(path.data_path, "fund_info_web.pkl")
crm_path = os.path.join(path.data_path, "crm.pkl")
trans_buy_path = os.path.join(path.data_path, "trans_buy.pkl")


fund_info_web = pd.read_pickle(fund_info_web_path)
fund_info_web = fund_info_web[fund_info_web.fund_id.isin(i2idx.keys())]
fund_info_web.fund_id = fund_info_web.fund_id.apply(lambda x: i2idx[x])
crm = pd.read_pickle(crm_path)

exist_funds = fund_info_web.fund_id.unique().tolist()

trans_buy_df = pd.read_pickle(trans_buy_path)
trans_buy_df = trans_buy_df[trans_buy_df.fund_id.isin(exist_funds)]

user_trade_freq = trans_buy_df.id_number.value_counts().index.tolist()

groups_index = []
groups_index.append([0, 527])
groups_index.append([528, 3760])
groups_index.append([3761, 38181])
groups_index.append([38182, len(user_trade_freq)])

In [3]:
history_period = [201400, 202100]
bestsel_period = [201900, 202100]
target_user_date = 202012

trans_buy = trans_buy_df.copy()
# training
train_df = trans_buy[
    (trans_buy.buy_date < history_period[1]*100) &
    (trans_buy.buy_date > history_period[0]*100) 
]

best_selling_funds = trans_buy_df[
            (trans_buy_df.buy_date > bestsel_period[0]*100) &
            (trans_buy_df.buy_date < bestsel_period[1]*100)
        ].fund_id.value_counts().index.to_series()

trans_buy = trans_buy_df.copy()
# training
train_df = trans_buy[
    (trans_buy.buy_date < history_period[1]*100) &
    (trans_buy.buy_date > history_period[0]*100) 
]
train_df = train_df.groupby(["id_number", "fund_id"]).size().reset_index(name="Time")
best_selling_num = 50
if (best_selling_num != -1):
    best_selling_exist_funds = list(set(exist_funds) & set(best_selling_funds.tolist()[:best_selling_num]))
else:
    best_selling_exist_funds = list(set(exist_funds))

train_df = train_df[train_df.fund_id.isin(best_selling_exist_funds)]

In [4]:
train_df.fund_id.value_counts()


2936    5626
3197    3932
2940    3624
2099    3193
2485    3137
2541    3049
2416    2921
3215    2760
2518    2628
722     2264
2274    2262
3130    2174
1251    2136
2135    2098
1790    2062
2301    1924
2717    1851
3072    1751
3039    1746
2516    1710
2131    1687
3196    1405
995     1310
926     1221
2971    1188
2730    1111
2313    1102
1791    1026
921      984
20       974
603      969
3133     967
2489     963
2268     949
1838     933
1839     931
1461     926
1579     909
1580     838
1235     800
3065     761
2904     743
2897     742
1167     738
3066     730
3042     663
985      662
2145     635
2908     595
21       577
Name: fund_id, dtype: int64

In [5]:
USER_ID = "id_number"
ITEM_ID = "fund_id"
RATE_KEY = "Time"


class PMF(nn.Module):
    def __init__(self, num_users, num_items, emb_size, lam_u, lam_v):
        super(PMF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        
        nn.init.normal_(self.user_emb.weight)
        nn.init.normal_(self.item_emb.weight)
        
        self.user_emb.weight.mul(0.1)
        self.item_emb.weight.mul(0.1)
        
        self.lam_u = lam_u
        self.lam_v = lam_v
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        output = (u*v).sum(1)
        
        # Frobenius norm
        u_reg = self.lam_u * torch.sum(u**2)
        v_reg = self.lam_v * torch.sum(v**2)
        
        return output, u_reg, v_reg

def train_epocs(model, train_df, epochs=10000, lr=0.01, wd=0.0):
    
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for epoch in tqdm(range(epochs)):
        user_tensor = torch.LongTensor(train_df[USER_ID].values).cuda()
        item_tensor = torch.LongTensor(train_df[ITEM_ID].values).cuda()
        ratings = torch.FloatTensor(train_df[RATE_KEY].values).cuda()
        
        pred, u_reg, v_reg = model(user_tensor, item_tensor)
        loss = F.mse_loss(pred, ratings) + u_reg + v_reg
        if epoch % 1000 == 0:
            print(f"Epoch: {epoch}, Loss:{loss}")
            
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    

In [6]:
num_users = len(train_df[USER_ID])
num_items = len(train_df[ITEM_ID])
print(num_users)
print(num_items)
model = PMF(num_users, num_items, emb_size=1000, lam_u=0.01, lam_v=0.01).cuda()
train_epocs(model, train_df=train_df)


80887
80887


  0%|          | 4/10000 [00:00<05:05, 32.70it/s]

Epoch: 0, Loss:1614499.625


 10%|█         | 1008/10000 [00:28<03:32, 42.26it/s]

Epoch: 1000, Loss:5.982850074768066


 20%|██        | 2008/10000 [00:51<03:09, 42.24it/s]

Epoch: 2000, Loss:5.728758811950684


 30%|███       | 3004/10000 [01:16<03:29, 33.47it/s]

Epoch: 3000, Loss:5.729884624481201


 40%|████      | 4008/10000 [01:46<02:24, 41.46it/s]

Epoch: 4000, Loss:5.731063365936279


 50%|█████     | 5008/10000 [02:10<01:59, 41.95it/s]

Epoch: 5000, Loss:5.7332072257995605


 60%|██████    | 6008/10000 [02:33<01:34, 42.39it/s]

Epoch: 6000, Loss:5.7359514236450195


 70%|███████   | 7008/10000 [02:57<01:10, 42.33it/s]

Epoch: 7000, Loss:5.739982604980469


 80%|████████  | 8003/10000 [03:20<00:48, 40.78it/s]

Epoch: 8000, Loss:5.744726181030273


 90%|█████████ | 9007/10000 [03:44<00:23, 42.42it/s]

Epoch: 9000, Loss:5.750947952270508


100%|██████████| 10000/10000 [04:07<00:00, 40.32it/s]


In [7]:
target_users_id = crm[crm.yyyymm == target_user_date].id_number.unique().tolist()
target_users_id[0]

49721

In [10]:

    
available_users = train_df.id_number.unique().tolist()

rec_dict = {}

items = torch.tensor(
        train_df[ITEM_ID].unique().tolist()
    ).cuda()

for user in tqdm(target_users_id):    
    ignore_item = trans_buy_df[
        (trans_buy_df.id_number == user)
    ].fund_id.unique().tolist()

    user_tensor = torch.tensor([user]).cuda()
        
    predictions, _, __ = model(user_tensor, items)
    predictions = predictions.tolist()
    
    if (user not in available_users):
        rec_dict[str(user)] = best_selling_funds[~best_selling_funds.isin(ignore_item)].tolist()[:10]
        continue
        
    rdict = {
        "items": items.cpu().numpy().tolist(),
        "recStr": predictions
    }
    
    rec_df = pd.DataFrame(rdict).sort_values(by=['recStr'], ascending=False)
    
    rec_df = rec_df[~rec_df["items"].isin(ignore_item)]

    rec_dict[str(user)] = rec_df["items"].tolist()[:10] 

100%|██████████| 79848/79848 [02:21<00:00, 562.39it/s]


In [11]:
len(rec_dict)
dec_rec_result = {}
for user in tqdm(rec_dict):            
    dec_user = id2cat(u2idx, int(user))
    dec_funds = []
    for item in rec_dict[user]:
        dec_funds.append(id2cat(i2idx, item))
    dec_rec_result[dec_user] = dec_funds



100%|██████████| 79848/79848 [05:00<00:00, 265.52it/s]


In [12]:
save2json(dec_rec_result, folder='/result')

'/tf/recommenders/export/result/1635653384.0998619.json'