In [1]:

import json
import pandas as pd
from tqdm import tqdm

import recs.path as path
from recs.cf_mf import MatrixFactorization, generate_cf_predict_df
from recs.util import id2cat, recall_evaluate, readjson2dict, save2json

In [2]:
# Preload
with open('data/crm_idx.json') as jf:
    crm2idx = json.load(jf)

with open('data/fund_info_idx.json') as jf:
    fund2idx = json.load(jf)

with open('data/exist_funds_2021.json') as jf:
    exist_funds = json.load(jf)

crm = pd.read_pickle('data/crm.pkl')

exist_funds_idx = []
for i in exist_funds:
    try:
        exist_funds_idx.append(fund2idx[i])
    except:
        continue
exist_funds = exist_funds_idx

trans_buy_df = pd.read_pickle('data/trans_buy.pkl')

In [3]:
user_freq = trans_buy_df.id_number.value_counts().index.tolist()
groups_index = []
groups_index.append([0, 527])
groups_index.append([528, 3760])
groups_index.append([3761, 38181])
groups_index.append([38182, len(user_freq)])

In [11]:
def cf_group(predict_period, history_period, bestsel_period, target_user_date, 
        best_selling_num=150, svds_k = 100, valid_sample = 580, group=-1):
    if (bestsel_period[1] > history_period[1]):
        raise SystemExit("Error: topk_end > history_end")

    if (history_period[1] > predict_period[0]):
        raise SystemExit("Error: predict_start > history_end")

    if (target_user_date > history_period[1]):
        raise SystemExit("Error: target_user_date > history_end")

    trans_buy = trans_buy_df.copy()
        
    if group in [0, 1, 2, 3]:
        trans_buy = trans_buy[trans_buy.id_number.isin(
            user_freq[ groups_index[group][0] : groups_index[group][1] ]
        )]
        
    # best selling top k
    topk = trans_buy[trans_buy.isin(exist_funds)][
        (trans_buy.buy_date > bestsel_period[0]*100) &
        (trans_buy.buy_date < bestsel_period[1]*100)].fund_id.value_counts().index.tolist()    

    target_users_id = crm[crm.yyyymm == target_user_date].id_number.unique().tolist()    
    
    # training
    train_df = trans_buy[
        (trans_buy.buy_date < history_period[1]*100) &
        (trans_buy.buy_date > history_period[0]*100) 
    ]
        
    train_df = train_df.groupby(["id_number", "fund_id"]).size().reset_index(name="Time")

    if (best_selling_num != -1):
        exist_funds_topk = list(set(exist_funds) & set(topk[:best_selling_num]))
    else:
        exist_funds_topk = list(set(exist_funds))

    train_df = train_df[train_df.fund_id.isin(exist_funds_topk)]
    cf_preds_df = generate_cf_predict_df(train_df, svds_k)    
    cf_rec_model = MatrixFactorization(cf_preds_df)

    rec_dict = {}
    available_users = train_df.id_number.unique().tolist()
    
    for user in tqdm(available_users):
        rec_dict[str(user)] = cf_rec_model.rec_items(user).fund_id.tolist()    

    return rec_dict

In [13]:
# Baseline
predict_period = [202001, 202007]
history_period = [201400, 202000]
bestsel_period = [201800, 202000]

target_user_date = 201912
g1 = cf_group(predict_period, history_period, bestsel_period, target_user_date, group=0)
g2 = cf_group(predict_period, history_period, bestsel_period, target_user_date, group=1)
g3 = cf_group(predict_period, history_period, bestsel_period, target_user_date, group=2)
g4 = cf_group(predict_period, history_period, bestsel_period, target_user_date, group=3)

Generating Needed Dataframe...
cf_preds_df created.


100%|██████████| 521/521 [00:01<00:00, 439.43it/s]


Generating Needed Dataframe...
cf_preds_df created.


100%|██████████| 3100/3100 [00:06<00:00, 463.66it/s]


Generating Needed Dataframe...
cf_preds_df created.


100%|██████████| 24183/24183 [00:52<00:00, 462.40it/s]


Generating Needed Dataframe...
cf_preds_df created.


100%|██████████| 1689/1689 [00:03<00:00, 460.64it/s]


In [20]:
freq = trans_buy_df.id_number.value_counts()
freq[freq<5]

31662    4
59141    4
52992    4
63549    4
4364     4
        ..
22695    1
28834    1
69774    1
79299    1
8188     1
Name: id_number, Length: 28244, dtype: int64

In [22]:
trans_buy = trans_buy_df.copy()

trans_buy = trans_buy[trans_buy.id_number.isin(
    user_freq[ groups_index[3][0] : groups_index[3][1] ]
)]

topk = trans_buy[trans_buy.isin(exist_funds)][
        (trans_buy.buy_date > bestsel_period[0]*100) &
        (trans_buy.buy_date < bestsel_period[1]*100)].fund_id.value_counts().index.tolist() 

In [27]:
top10_group4 = []
for i in topk:
    top10_group4.append(id2cat(fund2idx, i))
top10_group4[:10]

['T34', '83Y', 'T38', 'X12', '68C', 'Y57', 'L17', '79A', 'L91', 'T35']

In [None]:
target_users_id = crm[crm.yyyymm == 202012].id_number.unique().tolist() 


In [35]:
g1['30']
_buy30 = trans_buy_df[trans_buy_df.id_number == 30].fund_id.tolist()
list(set(g1['30']) & set(_buy30))

[1482, 1616, 1073, 1074, 1362, 1140, 1147, 1564, 1214, 1151]