# Import Library

In [2]:
import sys
sys.path.append('/Users/baron/Research/ncku/Recommendation/credict_recsys')
#sys.path.append('../')

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from apps.inference import make_final
from tqdm import tqdm

In [None]:
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names

# 資料讀取

In [2]:
df = pd.read_csv("../../data/tbrain_small.csv")
len(df)

22130579

In [3]:
final_df = pd.read_csv("../../data/需預測的顧客名單及提交檔案範例.csv")
len(final_df)

500000

In [4]:
df = df.drop(columns=['Unnamed: 0'])

# Model 1
利用使用者屬性資料

In [5]:
df = df[["dt", "chid", "shop_tag", "txn_cnt", "txn_amt", "masts", "educd", "trdtp", "naty", "poscd", "cuorg", "gender_code", "age", "primary_card"]]

## Data Preprocess

- 做nan處理
- txn_cnt 有負數可以進行排除
- 將id 重新編號
- 訓練樣本是否需要平衡？平衡後訓練人數會不會下降？
- 有些人數沒有出現過在需預測的類別中，因此需思考要如何對這些人做預測


任一欄位為NAN則排除

In [6]:
df.dropna(inplace=True)

排除消費次數為負數的數值

In [7]:
df = df[df["txn_cnt"]>0]

將chid重新編號, 取得最後輸出時需要的chid

In [8]:
num_to_id = { i:id for i, id in enumerate(final_df["chid"].unique())}
id_to_num = { id:i for i, id in enumerate(final_df["chid"].unique())}

In [9]:
df["adj_id"] = df["chid"].map(id_to_num)

將以下類別欄位直接轉為int
- 婚姻
- 教育程度
- 行業別
- 國籍
- 職位別
- 客戶來源
- 正卡信用額度
- 性別代碼
- 年紀
- 正附卡註記

In [10]:
df['masts'] = df['masts'].astype(int)
df['educd'] = df['educd'].astype(int)
df['trdtp'] = df['trdtp'].astype(int)
df['naty'] = df['naty'].astype(int)
df['poscd'] = df['poscd'].astype(int)
df['cuorg'] = df['cuorg'].astype(int)
df['gender_code'] = df['gender_code'].astype(int)
df['age'] = df['age'].astype(int)
df['primary_card'] = df['primary_card'].astype(int)

將消費金額轉為log型態

In [11]:
df["txn_amt_log"] = df["txn_amt"].apply(np.log)

In [12]:
df.head(10)

Unnamed: 0,chid,shop_tag,txn_cnt,txn_amt,masts,educd,trdtp,naty,poscd,cuorg,gender_code,age,primary_card,adj_id,txn_amt_log
0,10267183,2,1,21701.307598,2,6,15,1,99,30,0,6,0,354909,9.985128
1,10115966,2,1,6698.199203,1,4,9,1,1,30,0,5,1,354911,8.809594
2,10484590,2,2,6693.510475,2,2,15,1,99,30,0,3,1,354959,8.808894
3,10079974,2,1,3271.02509,2,2,2,1,2,30,0,4,1,354992,8.092859
4,10233949,2,1,2829.165439,1,5,15,1,99,30,1,6,1,354949,7.947737
5,10488184,2,2,8407.70004,2,3,12,1,6,30,1,4,1,355015,9.036903
6,10038204,2,2,7604.385348,2,3,11,1,2,30,1,3,1,352405,8.93648
7,10050073,2,2,31743.259948,1,6,15,1,99,30,0,6,1,354960,10.365436
8,10436047,2,1,4333.009825,2,3,11,1,2,30,0,2,1,355022,8.374018
9,10193951,2,5,24049.208333,2,2,17,1,2,30,0,3,1,354941,10.087857


In [19]:
i = final_df["chid"].unique()[0]
df[df["chid"]==i].iloc[0]


chid            1.012824e+07
shop_tag        2.200000e+01
txn_cnt         1.000000e+00
txn_amt         1.531701e+03
masts           1.000000e+00
educd           4.000000e+00
trdtp           8.000000e+00
naty            1.000000e+00
poscd           1.000000e+00
cuorg           3.000000e+01
gender_code     0.000000e+00
age             4.000000e+00
primary_card    1.000000e+00
adj_id          0.000000e+00
txn_amt_log     7.334134e+00
Name: 11483832, dtype: float64

In [20]:
# 建立使用者屬性
user_attributes = dict()
train_id = df["chid"].unique()
generate_col = ["masts", "txn_cnt", "educd", "trdtp", "naty", "poscd", "cuorg", "gender_code", "age", "primary_card"]
for i in tqdm(final_df["chid"].unique(), desc="get attributes"):
    # 若是id有存在於訓練資料中，則取得對應數值，否則為0
    if i in train_id:
        filter_col = df[df["chid"]==i].iloc[0]
        user_attributes[i] = {col: filter_col[col].astype("int") for col in generate_col}
    else:
        user_attributes[i] = {col: 0 for col in generate_col}

get attributes:   3%|██▏                                                                              | 13273/500000 [03:09<1:55:49, 70.04it/s]


KeyboardInterrupt: 

In [16]:
df = df.drop(columns=["chid", "txn_amt"])

## Model Training

In [None]:
def run_model(train_data, feature_names,linear_feature_columns, dnn_feature_columns, target, save_path):
    # traing data & testing data
    train, test = train_test_split(train_data, test_size=0.2, random_state=66)
    
    #training 
    train_model_input = {name:train[name].values for name in feature_names}
    test_model_input = {name:test[name].values for name in feature_names}

    model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
    model.compile("adam", "mse",
                  metrics=['mse'], )
    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=100, verbose=2, validation_split=0.2)
    
    # save model 
    model.save_weights(save_path)
    
    return model

def get_feature(data, sparse_features, dense_features):
    #normalize dense
    mms = MinMaxScaler(feature_range=(0,1))
    data[dense_features] = mms.fit_transform(data[dense_features])
    
    #generate faeture columns 
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    
    return data, feature_names, linear_feature_columns, dnn_feature_columns

In [None]:
sparse_features = ["adj_id", "shop_tag", "masts", "txn_cnt", "educd", "trdtp", "naty", "poscd", "cuorg", "gender_code", "age", "primary_card"]
dense_features = ["dt"]
target = ["txn_amt_log"]
save_path = "../model/DeepFM_1.h5"
#generate faeture columns
df, feature_names, linear_feature_columns, dnn_feature_columns = get_feature(df, sparse_features, dense_features)

model = run_model(df, sparse_features, dense_features, target, save_path)

## output csv

In [None]:
def make_final(model, tags, feature_names, columns, user_attributes):
    final_df = pd.read_csv("../../data/需預測的顧客名單及提交檔案範例.csv")
    output_path = "../fm_1_100.csv"
    id_to_num = {id:i for i, id in enumerate(final_df["chid"].unique())}
    output = list()
    for user in tqdm(final_df["chid"].values, desc="output to csv"):
        inference_data = get_inference_data(user, tags, id_to_num, user_attributes)
        user_df = pd.DataFrame(inference_data, columns=columns)
        user_model_input = {name:user_df[name].values for name in feature_names}
        pred_final = model.predict(user_model_input, batch_size=256)
        user_max_tags = tags[np.argsort(pred_final, axis=0)[::-1][:3]]
        output.append(np.insert(user_max_tags, 0, user))
    
    print(output)
    output_csv = pd.DataFrame(output, columns=["chid", "top1", "top2", "top3"])
    print(output_csv.head(10))
    output_csv.to_csv(output_path, index=False)

def get_inference_data(user, tags, id_to_num, user_attributes):
    #generate data
    inference_data = [[id_to_num[user],
                      tag,
                      user_attributes[user]["masts"],
                      user_attributes[user]["txn_cnt"],
                      user_attributes[user]["educd"],
                      user_attributes[user]["trdtp"],
                      user_attributes[user]["naty"],
                      user_attributes[user]["poscd"],
                      user_attributes[user]["cuorg"],
                      user_attributes[user]["gender_code"],
                      user_attributes[user]["age"],
                      user_attributes[user]["primary_card"],
                      25]
                      for tag in tags]

    return inference_data


In [None]:
columns = sparse_features + dense_features
make_final(model, tags, feature_names, columns, user_attributes)

# Model 2
利用使用者屬性資料(使用sample shop tags 資料)

In [3]:
df = pd.read_csv("../../data/tbrain_sample.csv")
len(df)

  exec(code_obj, self.user_global_ns, self.user_ns)


12184616

In [4]:
final_df = pd.read_csv("../../data/需預測的顧客名單及提交檔案範例.csv")
len(final_df)

500000

In [5]:
df = df[["dt", "chid", "shop_tag", "txn_cnt", "txn_amt", "masts", "educd", "trdtp", "naty", "poscd", "cuorg", "gender_code", "age", "primary_card"]]

## Data Preprocess

- 做nan處理
- txn_cnt 有負數可以進行排除
- 將id 重新編號
- 訓練樣本是否需要平衡？平衡後訓練人數會不會下降？
- 有些人數沒有出現過在需預測的類別中，因此需思考要如何對這些人做預測


任一欄位為NAN則排除

In [6]:
df.dropna(inplace=True)

排除消費次數為負數的數值

In [7]:
df = df[df["txn_cnt"]>0]

將chid重新編號, 取得最後輸出時需要的chid

In [8]:
num_to_id = { i:id for i, id in enumerate(final_df["chid"].unique())}
id_to_num = { id:i for i, id in enumerate(final_df["chid"].unique())}

In [9]:
df["adj_id"] = df["chid"].map(id_to_num)

將以下類別欄位直接轉為int
- 婚姻
- 教育程度
- 行業別
- 國籍
- 職位別
- 客戶來源
- 正卡信用額度
- 性別代碼
- 年紀
- 正附卡註記

In [10]:
df['masts'] = df['masts'].astype(int)
df['educd'] = df['educd'].astype(int)
df['trdtp'] = df['trdtp'].astype(int)
df['naty'] = df['naty'].astype(int)
df['poscd'] = df['poscd'].astype(int)
df['cuorg'] = df['cuorg'].astype(int)
df['gender_code'] = df['gender_code'].astype(int)
df['age'] = df['age'].astype(int)
df['primary_card'] = df['primary_card'].astype(int)

將消費金額轉為log型態

In [11]:
df["txn_amt_log"] = df["txn_amt"].apply(np.log)

In [12]:
df.head(10)

Unnamed: 0,chid,shop_tag,txn_cnt,txn_amt,masts,educd,trdtp,naty,poscd,cuorg,gender_code,age,primary_card,adj_id,txn_amt_log
0,10194083,45,2,2829.165439,1,3,2,1,2,30,0,3,1,8700,7.947737
1,10366793,45,3,10403.646053,1,4,5,1,99,30,1,5,1,91451,9.249912
2,10456333,45,5,12547.922828,1,4,10,1,4,30,1,6,1,443203,9.43731
3,10132551,45,2,10554.588142,1,5,15,1,99,30,0,5,0,357792,9.264316
4,10138738,45,5,11596.021405,1,4,10,1,99,30,1,6,1,159213,9.358417
5,10148523,45,1,3590.822394,1,4,11,1,1,4,1,5,1,135939,8.186137
6,10249913,45,1,7752.713479,1,5,11,1,2,30,1,6,1,17324,8.955798
7,10279409,45,6,18132.147105,2,3,2,1,2,30,1,3,1,472099,9.805442
8,10346547,45,2,9667.152211,1,3,11,1,2,30,0,5,1,343900,9.176489
9,10073483,45,1,8019.906666,1,2,3,1,1,30,1,3,1,176278,8.989682


In [20]:
# 建立使用者屬性
user_attributes = dict()
train_id = df["chid"].unique()
generate_col = ["masts", "txn_cnt", "educd", "trdtp", "naty", "poscd", "cuorg", "gender_code", "age", "primary_card"]
for i in tqdm(final_df["chid"].unique(), desc="get attributes"):
    # 若是id有存在於訓練資料中，則取得對應數值，否則為0
    if i in train_id:
        filter_col = df[df["chid"]==i].iloc[0]
        user_attributes[i] = {col: filter_col[col].astype("int") for col in generate_col}
    else:
        user_attributes[i] = {col: 0 for col in generate_col}

get attributes:   3%|██▏                                                                              | 13273/500000 [03:09<1:55:49, 70.04it/s]


KeyboardInterrupt: 

In [16]:
df = df.drop(columns=["chid", "txn_amt"])

## Model Training

In [None]:
def run_model(train_data, feature_names,linear_feature_columns, dnn_feature_columns, target, save_path):
    # traing data & testing data
    train, test = train_test_split(train_data, test_size=0.2, random_state=66)
    
    #training 
    train_model_input = {name:train[name].values for name in feature_names}
    test_model_input = {name:test[name].values for name in feature_names}

    model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
    model.compile("adam", "mse",
                  metrics=['mse'], )
    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2)
    
    # save model 
    model.save_weights(save_path)
    
    return model

def get_feature(data, sparse_features, dense_features):
    #normalize dense
    mms = MinMaxScaler(feature_range=(0,1))
    data[dense_features] = mms.fit_transform(data[dense_features])
    
    #generate faeture columns 
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    
    return data, feature_names, linear_feature_columns, dnn_feature_columns

In [None]:
sparse_features = ["adj_id", "shop_tag", "masts", "txn_cnt", "educd", "trdtp", "naty", "poscd", "cuorg", "gender_code", "age", "primary_card"]
dense_features = ["dt"]
target = ["txn_amt_log"]
save_path = "../model/DeepFM_2.h5"
#generate faeture columns
df, feature_names, linear_feature_columns, dnn_feature_columns = get_feature(df, sparse_features, dense_features)

model = run_model(df, sparse_features, dense_features, target, save_path)

## output csv

In [None]:
def make_final(model, tags, feature_names, columns, user_attributes):
    final_df = pd.read_csv("../../data/需預測的顧客名單及提交檔案範例.csv")
    output_path = "../fm_2.csv"
    id_to_num = {id:i for i, id in enumerate(final_df["chid"].unique())}
    output = list()
    for user in tqdm(final_df["chid"].values, desc="output to csv"):
        inference_data = get_inference_data(user, tags, id_to_num, user_attributes)
        user_df = pd.DataFrame(inference_data, columns=columns)
        user_model_input = {name:user_df[name].values for name in feature_names}
        pred_final = model.predict(user_model_input, batch_size=256)
        user_max_tags = tags[np.argsort(pred_final, axis=0)[::-1][:3]]
        output.append(np.insert(user_max_tags, 0, user))
    
    print(output)
    output_csv = pd.DataFrame(output, columns=["chid", "top1", "top2", "top3"])
    print(output_csv.head(10))
    output_csv.to_csv(output_path, index=False)

def get_inference_data(user, tags, id_to_num, user_attributes):
    #generate data
    inference_data = [[id_to_num[user],
                      tag,
                      user_attributes[user]["masts"],
                      user_attributes[user]["txn_cnt"],
                      user_attributes[user]["educd"],
                      user_attributes[user]["trdtp"],
                      user_attributes[user]["naty"],
                      user_attributes[user]["poscd"],
                      user_attributes[user]["cuorg"],
                      user_attributes[user]["gender_code"],
                      user_attributes[user]["age"],
                      user_attributes[user]["primary_card"],
                      25]
                      for tag in tags]

    return inference_data


In [None]:
columns = sparse_features + dense_features
make_final(model, tags, feature_names, columns, user_attributes)