In [1]:
import numpy as np
import pandas as pd
import math
import os
from sklearn.preprocessing import LabelEncoder
# Let pandas show up to 200 characters per column and up to 200 rows.
for display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, 200)

In [2]:
def export_file(date, pairs_dict, file_name):
    """Write a ``{key: [values]}`` mapping as space-separated lines of text.

    The file is created at ``date=<date>/<file_name>`` relative to the current
    working directory; the directory is created when it does not exist yet.
    Every dictionary entry becomes one line: the key, followed by each value
    converted with ``int()``. Values that are NaN are skipped, so an entry
    whose values are all NaN (or empty) produces a line holding only the key.

    Parameters
    ----------
    date : str
        Partition label used to build the ``date=<date>`` directory name.
    pairs_dict : dict
        Mapping from a key to an iterable of numeric values.
    file_name : str
        Name of the file to create inside the partition directory.

    Returns
    -------
    None
        The target path and the number of written lines are printed.
    """
    dir_name = f'date={date}'
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(dir_name, exist_ok=True)
    file_path = os.path.join(dir_name, file_name)
    # Report the path itself rather than the repr of the file object.
    print(f'write to file path: {file_path}')

    counts = 0
    # The context manager closes the handle even when a write or int() fails.
    with open(file_path, 'w') as handle:
        for key, value in pairs_dict.items():
            fields = [f'{key}']
            fields.extend(f'{int(item)}' for item in value if not math.isnan(item))
            handle.write(' '.join(fields) + '\n')
            counts += 1
    print(counts)

In [3]:
# Date partition (YYYYMMDD string) used both to locate the preprocessed input
# files and to name the `date=<test_date>` output directory.
test_date = '20211227'

In [4]:
# Load the preprocessed inputs of the selected date partition.
# NOTE(review): the directory is an absolute, machine-specific path.
preprocessed_dir = f'/home/jovyan/df-smart-channel/graph/data/preprocessed/date={test_date}'
context_train = pd.read_csv(f'{preprocessed_dir}/context_train.csv')
context_test = pd.read_csv(f'{preprocessed_dir}/context_test.csv')
user_subtag = pd.read_csv(f'{preprocessed_dir}/user_subtag.csv')
item_subtag = pd.read_csv(f'{preprocessed_dir}/item_subtag.csv')

In [6]:
# context_train.cust_no.nunique()

In [7]:
# user_subtag.cust_no.nunique()

In [5]:
# Replace raw customer ids by integer labels. The encoder is fitted on the
# training customers only; LabelEncoder.transform raises ValueError for labels
# it has not seen, so the test and subtag frames may only contain customers
# that also occur in the training frame.
le_user = LabelEncoder()
le_user.fit(context_train['cust_no'])
context_train['cust_no'] = le_user.transform(context_train['cust_no'])
context_test['cust_no'] = le_user.transform(context_test['cust_no'])
user_subtag['cust_no'] = le_user.transform(user_subtag['cust_no'])

In [6]:
# Replace raw item ids by integer labels, fitting on the training items only
# (LabelEncoder.transform raises ValueError for items absent from training).
le_item = LabelEncoder()
le_item.fit(context_train['item_id'])
context_train['item_id'] = le_item.transform(context_train['item_id'])
context_test['item_id'] = le_item.transform(context_test['item_id'])
item_subtag['item_id'] = le_item.transform(item_subtag['item_id'])

### get item-subtag

In [7]:
# Read the 'mapping' sheet of the subtag workbook (found in the working directory).
mapping_df = pd.read_excel('subtag_map.xlsx', sheet_name='mapping')

In [8]:
# Preview the first three rows of the subtag mapping.
mapping_df.head(3)

Unnamed: 0,subtag_03,code
0,mobile_active,0
1,forex_digital_account,1
2,account,2


In [12]:
# Distinct subtag names listed in the mapping sheet.
available_items = mapping_df['subtag_03'].unique()

In [13]:
# Drop the first three characters of every subtag description
# (presumably a fixed-width prefix — TODO confirm against the raw data).
# The untouched text is stored once in `subtag_eng_desc_raw` and the stripped
# column is always derived from it, so re-running this cell no longer removes
# a further three characters each time.
if 'subtag_eng_desc_raw' not in item_subtag.columns:
    item_subtag['subtag_eng_desc_raw'] = item_subtag['subtag_eng_desc']
item_subtag['subtag_eng_desc'] = item_subtag['subtag_eng_desc_raw'].str[3:]

In [14]:
# Keep only the rows whose subtag name appears in the mapping sheet.
has_known_subtag = item_subtag['subtag_eng_desc'].isin(available_items)
item_subtag = item_subtag.loc[has_known_subtag]

In [15]:
# Preview the rows that are left after filtering on known subtags.
# NOTE(review): the output recorded with this cell shows only the column
# header (no rows) — confirm the preceding filter is not discarding every item.
item_subtag.head(3)

Unnamed: 0,item_id,date,subtag_eng_desc


In [16]:
# Attach the mapping columns to every item row by matching the subtag name.
item_subtags = item_subtag.merge(
    mapping_df,
    how='left',
    left_on='subtag_eng_desc',
    right_on='subtag_03',
    copy=False,
)

In [17]:
# Distinct subtag codes per item, as a plain {item_id: [code, ...]} dictionary.
subtags_by_item = item_subtags.groupby('item_id')['code'].unique()
item_subtag_dict = {
    item: list(codes) for item, codes in subtags_by_item.items()
}

In [18]:
# Write one line per item (item id followed by its subtag codes) to
# date=<test_date>/item_subtag.txt.
export_file(test_date, item_subtag_dict, 'item_subtag.txt')

write to file path: <_io.TextIOWrapper name='date=20211227/item_subtag.txt' mode='w' encoding='UTF-8'>
0


### get user-subtag

In [19]:
# One Series per user subtag, each taken from (or derived from) a user_subtag column.
# Active mobile-banking user
mobile_active = user_subtag['mobile_login_90']
# Has a foreign-currency account
forex_digital_account = user_subtag['dd_my']
# Deposit account holder
account = user_subtag['dd_md']
# Deposit-only customer
account_only = user_subtag['onlymd_ind']
# Credit-card-only customer
credit_card_only = user_subtag['onlycc_ind']
# Card-and-deposit customer holding the designated e.Fingo card
card_pi_only_ubear = user_subtag['efingo_card_ind']
# Has a personal loan (the amount column is not null)
personal_loan_account_cust = user_subtag['cl_cpa_amt'].notna()
# Customer without a financial consultant
no_fc_cust = user_subtag['fc_ind'].eq(0)

In [21]:
# Order the rows by encoded customer id. sort_values() keeps each row's
# original index label, so Series taken from user_subtag before this sort
# still match its rows by label, but no longer by position.
user_subtag = user_subtag.sort_values('cust_no')

In [22]:
# Build {encoded cust_no: [subtag code, ...]}. The position of a Series in this
# list is the code written for it.
# NOTE(review): presumably these positions mirror the `code` column of
# subtag_map.xlsx — confirm for every entry.
subtag_flags = [
    mobile_active,               # 0
    forex_digital_account,       # 1
    account,                     # 2
    account_only,                # 3
    credit_card_only,            # 4
    card_pi_only_ubear,          # 5
    personal_loan_account_cust,  # 6
    no_fc_cust,                  # 7
]

# user_subtag may have been re-ordered (sort_values keeps the index labels)
# after the flag Series were taken from it. Re-order every flag Series by label
# so that it lines up row-for-row with user_subtag.cust_no; looking flags up
# with a running position would hand each customer another row's flags.
aligned_flags = [flag.loc[user_subtag.index] for flag in subtag_flags]

user_subtag_dict = {}
for ids, *flags in zip(user_subtag.cust_no, *aligned_flags):
    # Plain truthiness is kept from the original loop.
    # NOTE(review): NaN is truthy, so a missing indicator counts as "has subtag".
    user_subtag_dict[ids] = [code for code, flag in enumerate(flags) if flag]

In [23]:
# Write one line per customer (customer id followed by its subtag codes) to
# date=<test_date>/user_subtag.txt.
export_file(test_date, user_subtag_dict, 'user_subtag.txt')

write to file path: <_io.TextIOWrapper name='date=20211227/user_subtag.txt' mode='w' encoding='UTF-8'>
0


### get user-item pairs

In [25]:
## version1: retain no click users
# Total clicks per (customer, item) pair, keeping pairs clicked at least once.
cust_item_pair_train = (
    context_train.groupby(['cust_no', 'item_id'])['click'].sum()
    .reset_index()
    .loc[lambda pairs: pairs['click'] > 0]
)
cust_item_pair_test = (
    context_test.groupby(['cust_no', 'item_id'])['click'].sum()
    .reset_index()
    .loc[lambda pairs: pairs['click'] > 0]
)

# Every customer observed in each context, clicked or not.
cust_no = context_train['cust_no'].unique()
cust_no_test = context_test['cust_no'].unique()
init_df = pd.DataFrame({'cust_no': cust_no})
init_df_test = pd.DataFrame({'cust_no': cust_no_test})

# Left joins keep customers without a clicked pair; their item_id is NaN.
pair_columns = ['cust_no', 'item_id']
combined_train = init_df.merge(cust_item_pair_train, on='cust_no', how='left')[pair_columns]
combined_test = init_df_test.merge(cust_item_pair_test, on='cust_no', how='left')[pair_columns]

# Distinct items per customer, as {cust_no: [item_id, ...]} dictionaries.
items_by_cust_train = combined_train.groupby('cust_no')['item_id'].unique()
items_by_cust_test = combined_test.groupby('cust_no')['item_id'].unique()
cust_items_dict_train = {
    cust: list(items) for cust, items in items_by_cust_train.items()
}
cust_items_dict_test = {
    cust: list(items) for cust, items in items_by_cust_test.items()
}

KeyError: 'cust_no'

In [None]:
## version2: filter positive only (2%)
# Total clicks per (customer, item) pair, keeping pairs clicked at least once.
cust_item_pair_train = context_train.groupby(['cust_no', 'item_id'])['click'].sum().reset_index()
cust_item_pair_test = context_test.groupby(['cust_no', 'item_id'])['click'].sum().reset_index()
cust_item_pair_train = cust_item_pair_train[cust_item_pair_train.click > 0]
cust_item_pair_test = cust_item_pair_test[cust_item_pair_test.click > 0]

# Customers observed in the training context; both joins start from them.
cust_no = context_train.cust_no.unique()
init_df = pd.DataFrame({'cust_no': cust_no})

# Left join: training customers without a clicked pair keep a NaN item_id.
combined_train = pd.merge(init_df, cust_item_pair_train, on='cust_no', how='left')[['cust_no', 'item_id']]

combined_test = pd.merge(init_df, cust_item_pair_test, on='cust_no', how='left')[['cust_no', 'item_id']]
# The left join leaves NaN for customers without a clicked test pair, and
# astype(int) raises on NaN. Drop those rows first so only positive pairs
# remain, then restore the integer dtype.
combined_test = combined_test.dropna(subset=['item_id'])
combined_test = combined_test.astype({'item_id': int})

# Distinct items per customer, as {cust_no: [item_id, ...]} dictionaries.
items_by_cust_train = combined_train.groupby('cust_no')['item_id'].unique()
items_by_cust_test = combined_test.groupby('cust_no')['item_id'].unique()

cust_items_dict_train = dict(items_by_cust_train.apply(list))
cust_items_dict_test = dict(items_by_cust_test.apply(list))

In [51]:
# Rebuild only the test-side mapping. Customers of init_df that have no row in
# cust_item_pair_test are kept, with NaN as their item_id.
combined_test = init_df.merge(cust_item_pair_test, on='cust_no', how='left')[['cust_no', 'item_id']]

items_by_cust_test = combined_test.groupby('cust_no')['item_id'].unique()
cust_items_dict_test = {
    cust: list(items) for cust, items in items_by_cust_test.items()
}

In [105]:
# Show the test rows that have an item attached.
combined_test.loc[combined_test['item_id'].notna()]

Unnamed: 0,cust_no,item_id
6,7093,5.0
262,187615,9.0
313,213071,19.0
527,361471,15.0
701,503510,11.0
844,127884,15.0
845,176207,15.0
846,332565,6.0
847,344156,15.0
848,377493,3.0


In [18]:
def get_user_item(train_context=None, test_context=None):
    """Build ``{cust_no: [item_id, ...]}`` dictionaries for train and test.

    Parameters
    ----------
    train_context, test_context : pandas.DataFrame, optional
        Frames with ``cust_no``, ``item_id`` and ``click`` columns. When an
        argument is omitted, the notebook-level ``context_train`` /
        ``context_test`` frame is used, so ``get_user_item()`` behaves as it
        did before the parameters existed.

    Returns
    -------
    tuple of (dict, dict)
        ``(train, test)``. Both dictionaries are restricted to customers that
        occur in the training frame. The train dictionary holds every such
        customer (a customer without any pair gets ``[nan]``). The test
        dictionary holds only customers that have at least one test pair, and
        its item ids are integers.

    Notes
    -----
    The click totals are computed but not used for filtering: every observed
    (customer, item) pair is kept, clicked or not.
    """
    # Fall back to the notebook globals only when no frame is passed in.
    if train_context is None:
        train_context = context_train
    if test_context is None:
        test_context = context_test

    pair_keys = ['cust_no', 'item_id']
    ## context to user-item pair
    cust_item_pair_train = train_context.groupby(pair_keys)['click'].sum().reset_index()
    cust_item_pair_test = test_context.groupby(pair_keys)['click'].sum().reset_index()

    # Both joins start from the customers seen in training.
    init_df = pd.DataFrame({'cust_no': train_context['cust_no'].unique()})

    combined_train = pd.merge(init_df, cust_item_pair_train, on='cust_no', how='left')[pair_keys]

    combined_test = pd.merge(init_df, cust_item_pair_test, on='cust_no', how='left')[pair_keys]
    # Drop customers without a test pair directly instead of going through a
    # -1 sentinel (which would also discard a genuine item id of -1), then
    # restore the integer dtype that the NaN-producing join turned into float.
    combined_test = combined_test.dropna(subset=['item_id'])
    combined_test = combined_test.astype({'item_id': int})

    items_by_cust_train = combined_train.groupby('cust_no')['item_id'].unique()
    items_by_cust_test = combined_test.groupby('cust_no')['item_id'].unique()

    cust_items_dict_train = {cust: list(items) for cust, items in items_by_cust_train.items()}
    cust_items_dict_test = {cust: list(items) for cust, items in items_by_cust_test.items()}
    return cust_items_dict_train, cust_items_dict_test

In [19]:
# Build the train and test customer -> items dictionaries.
cust_items_dict_train, cust_items_dict_test = get_user_item()

In [46]:
# Write the training customer -> items lines under date=<test_date>/.
# NOTE(review): the output recorded with this cell names date=20211226 while
# test_date is '20211227' — presumably from an earlier run; re-execute to confirm.
export_file(test_date, cust_items_dict_train, 'train_user_item_retain.txt')

write to file path: <_io.TextIOWrapper name='date=20211226/train_user_item_retain.txt' mode='w' encoding='UTF-8'>
601331


In [82]:
# Write the test customer -> items lines under date=<test_date>/.
# NOTE(review): the output recorded with this cell names date=20211226 while
# test_date is '20211227' — presumably from an earlier run; re-execute to confirm.
export_file(test_date, cust_items_dict_test, 'test_user_item_retain.txt')

write to file path: <_io.TextIOWrapper name='date=20211226/test_user_item_retain.txt' mode='w' encoding='UTF-8'>
104441
