In [1]:
import os
import sys
sys.path.append('../../')
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import recs.path as path
from recs.clf import Classifier
from recs.util import readjson2dict, id2cat, save_model, save2json, convert_result_json_2_csv
from sklearn.model_selection import train_test_split

In [2]:
u2idx = readjson2dict("crm_idx")
i2idx = readjson2dict("fund_info_idx")

read_path = os.path.join(path.data_path, "merge_ns_diff.pkl")
trans_buy_path = os.path.join(path.data_path, "trans_buy.pkl")

trans_buy = pd.read_pickle(trans_buy_path)
merge = pd.read_pickle(read_path)
merge = merge.replace(float('-inf'), 0)

print(f"merge data length: {len(merge)} ")
print(merge.y.value_counts())
merge

merge data length: 616552 
0    349368
1    267184
Name: y, dtype: int64


Unnamed: 0,id_number,fund_id,yyyymm,y,local_foreign_total,local_total,local_demand_deposit,local_fixed_deposit,foreign_total,foreign_fixed_deposit,...,currency_4,currency_5,currency_6,currency_7,currency_8,currency_9,currency_10,currency_11,currency_12,currency_13
2,13651,748,201906,1,16.179238,16.069354,16.069354,0.00000,13.916462,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,13651,1461,202002,1,16.042256,15.753636,15.753636,0.00000,14.658772,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,13651,1579,202006,1,15.433278,15.185336,15.185336,0.00000,13.917308,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,13651,2053,201708,1,15.772270,15.712601,15.712601,0.00000,12.923644,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,13651,2339,201912,1,15.137757,13.303332,13.303332,0.00000,14.963754,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
783190,67343,2682,201903,0,14.981091,14.782627,11.224203,14.75373,13.266349,13.258908,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
783191,67343,733,201903,0,14.981091,14.782627,11.224203,14.75373,13.266349,13.258908,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
783192,67343,3140,201903,0,14.981091,14.782627,11.224203,14.75373,13.266349,13.258908,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
783193,67343,3200,201903,0,14.981091,14.782627,11.224203,14.75373,13.266349,13.258908,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
x = merge.drop(columns=['y', 'id_number', 'fund_id'])
y = merge['y']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
params = {
    "best_sell_n": 53,
    "n_estimators":250,
    "max_depth":3,
    "best_sell_period": [201800 ,202000]
}
c = Classifier(x_train, x_test, y_train, y_test)
c.do_gradient_boosting(max_depth=params["max_depth"], n_estimators=params["n_estimators"])
c.check_score()
c.score_result

Start Gradient Boosting Training...
      Iter       Train Loss   Remaining Time 
         1           1.3585           17.34m
         2           1.3483           17.20m
         3           1.3408           17.08m
         4           1.3346           17.04m
         5           1.3272           16.96m
         6           1.3220           16.82m
         7           1.3151           16.74m
         8           1.3094           16.69m
         9           1.3040           16.62m
        10           1.3000           16.56m
        20           1.2642           15.78m
        30           1.2432           15.03m
        40           1.2235           14.32m
        50           1.2106           13.62m
        60           1.1962           12.93m
        70           1.1880           12.24m
        80           1.1791           11.54m
        90           1.1695           10.86m
       100           1.1622           10.18m
       200           1.1039            3.39m


{'model_score': 0.7322160972019763,
 'accuracy_score': 0.7309647963279837,
 'f1_score': 0.6510135597142888,
 'precision_score': 0.7430183695521672,
 'recall_score': 0.5792833607907742}

In [5]:
# inference input data
equity_path = os.path.join(path.data_path, "equity.pkl")
crm_path = os.path.join(path.data_path, "crm_diff.pkl")
exist_funds_path = os.path.join(path.data_path, 'exist_funds_2021')

crm = pd.read_pickle(crm_path)
crm = crm.replace(float('-inf'), 0)

equity = pd.read_pickle(equity_path)
i2idx = readjson2dict('fund_info_idx')
exist_funds = readjson2dict(exist_funds_path)

exist_funds_idx = []
for i in exist_funds:
    tmp = i2idx.get(i, None)
    if (tmp):
        exist_funds_idx.append(tmp)


In [6]:
best_sell_n = list(trans_buy[
    (trans_buy.buy_date > params['best_sell_period'][0]*100) & 
    (trans_buy.buy_date < params['best_sell_period'][1]*100)
].fund_id.value_counts().index)[:params["best_sell_n"]]

In [7]:
targets = crm[crm.yyyymm == 202012]
funds = equity[
    equity.fund_id.isin(exist_funds_idx) & 
    (equity.yyyymm == equity.yyyymm.max()) &
    equity.fund_id.isin(best_sell_n)
]
targets['k'] = 0
funds['k'] = 0
print(f'{len(targets)} users, {len(funds)} items')
pred_input = targets.merge(funds, how='outer')
pred_input.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


79848 users, 53 items


Index(['yyyymm', 'id_number', 'local_foreign_total', 'local_total',
       'local_demand_deposit', 'local_fixed_deposit', 'foreign_total',
       'foreign_fixed_deposit', 'foreign_demand_deposit', 'invest_type', 'age',
       'monthly_trade_vol', 'stock_inventory_val', 'KPI',
       'local_foreign_total_diff', 'local_total_diff',
       'local_demand_deposit_diff', 'local_fixed_deposit_diff',
       'foreign_total_diff', 'foreign_fixed_deposit_diff',
       'foreign_demand_deposit_diff', 'monthly_trade_vol_diff',
       'stock_inventory_val_diff', 'local_foreign_total_pct',
       'local_total_pct', 'local_demand_deposit_pct',
       'local_fixed_deposit_pct', 'foreign_total_pct',
       'foreign_fixed_deposit_pct', 'foreign_demand_deposit_pct',
       'monthly_trade_vol_pct', 'stock_inventory_val_pct', 'age_group', 'k',
       'fund_id', 'value', 'diff_month', 'diff_quarter', 'diff_half_year',
       'diff_year', 'diff_2year', 'fund_type', 'local_or_foreign',
       'currency_type', '

In [8]:
x_predict = pred_input[x_test.columns]
x_user_fund = pred_input[['id_number', 'fund_id']]

In [9]:
# Predict
predict_prob = c.model.predict_proba(x_predict)
pred_y = c.model.predict(x_predict)
print(f"Total number of forecasts: {len(predict_prob)}.")
print("Prediction y Probability:")
print(predict_prob)
print("Prediction y:")
print(pred_y)
print("Prediction y Counts:")
print(np.unique(pred_y, return_counts=True))

Total number of forecasts: 4231944.
Prediction y Probability:
[[0.42991493 0.57008507]
 [0.67540746 0.32459254]
 [0.31550337 0.68449663]
 ...
 [0.16631368 0.83368632]
 [0.13913586 0.86086414]
 [0.58689378 0.41310622]]
Prediction y:
[1 0 1 ... 1 1 0]
Prediction y Counts:
(array([0, 1]), array([2059006, 2172938]))


In [10]:
x_user_fund['prob'] = predict_prob[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
x_user_fund[x_user_fund['id_number'] == 65136].sort_values(by=['prob'], ascending=False)[:10]

Unnamed: 0,id_number,fund_id,prob
2116235,65136,3197,0.907504
2116234,65136,3196,0.888254
2116219,65136,2516,0.827447
2116206,65136,2099,0.814005
2116233,65136,3130,0.808675
2116230,65136,2940,0.780516
2116196,65136,985,0.768185
2116197,65136,995,0.768185
2116207,65136,2131,0.742787
2116232,65136,3072,0.724056


In [12]:
x_user_fund[x_user_fund['id_number'] == 0].sort_values(by=['prob'], ascending=False)[:10]

Unnamed: 0,id_number,fund_id,prob
1636956,0,3197,0.856234
1636955,0,3196,0.828332
1636917,0,985,0.744931
1636918,0,995,0.744931
1636951,0,2940,0.73884
1636927,0,2099,0.726525
1636937,0,2301,0.614913
1636940,0,2516,0.608674
1636941,0,2518,0.581946
1636954,0,3130,0.578233


In [13]:
rec_result = {}
count = 0

# sort_values to speed up
x_user_fund.sort_values(by=['prob'], ascending=False, inplace=True)
for user in tqdm(x_user_fund['id_number'].unique()):
    ignore_item = trans_buy[(trans_buy['id_number'] == user)]['fund_id'].unique().tolist()    
    tmp = x_user_fund[x_user_fund['id_number'] == user]
    
    rec_fund = tmp[~tmp['fund_id'].isin(ignore_item)].head(10)['fund_id'].apply(
        lambda x :id2cat(i2idx, x)
    ).tolist()
    
    rec_result[id2cat(u2idx, user)] = rec_fund

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
100%|██████████| 79848/79848 [1:39:41<00:00, 13.35it/s]


In [14]:
# check format
rec_result['H2799640320']

['Y38', 'Y37', 'MU5', 'J1N', 'X01', 'T38', '79A', '78X', 'J80', 'V09']

In [15]:
r_name = "r_2021_11_21_gboost_d3_n250_b50"
json_name = r_name+".json"
model_save_path = os.path.join(path.result_path, json_name) 
with open(model_save_path, 'w') as jf:
    json.dump(rec_result, jf)

json_path = os.path.join(path.export_path, json_name)
result_path = os.path.join(path.export_path , "result")
save_path = os.path.join(result_path, f"{r_name}.csv")
convert_result_json_2_csv(json_path, save_path)

Convert /tf/recommenders/export/r_2021_11_21_gboost_d3_n250_b50.json to /tf/recommenders/export/result/r_2021_11_21_gboost_d3_n250_b50.csv success.
