# Import Library

In [25]:
import sys
sys.path.append('/Users/baron/Research/ncku/Recommendation/credict_recsys')

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from apps.inference import make_final
from tqdm import tqdm

In [None]:
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names

# 資料讀取

In [2]:
# Load the transaction CSV that is later passed to run_model as training data
# (the path is relative to the notebook's working directory).
df = pd.read_csv("../../data/tbrain_small.csv")
# Show how many rows were loaded.
len(df)

22130579

In [8]:
# Load the CSV whose "chid" column is used below as the list of customers
# that need a prediction (presumably also the submission template — TODO confirm).
final_df = pd.read_csv("../../data/需預測的顧客名單及提交檔案範例.csv")
# Show how many rows were loaded.
len(final_df)

500000

In [3]:
# Discard the "Unnamed: 0" column
# (presumably a row index written by an earlier CSV export — TODO confirm).
df = df.drop("Unnamed: 0", axis=1)

# Model 1
利用使用者屬性資料

In [4]:
# Restrict the frame to the columns Model 1 works with: customer id, shop tag,
# transaction count/amount and the customer attribute columns.
df = df.loc[
    :,
    [
        "chid",
        "shop_tag",
        "txn_cnt",
        "txn_amt",
        "masts",
        "educd",
        "trdtp",
        "naty",
        "poscd",
        "cuorg",
        "gender_code",
        "age",
        "primary_card",
    ],
]

## Data Preprocess

- 做nan處理
- txn_cnt 有負數可以進行排除
- 將id 重新編號
- 訓練樣本是否需要平衡？平衡後訓練人數會不會下降？
- 有些顧客從未在需預測的類別中出現過消費紀錄，因此需思考要如何對這些顧客做預測


任一欄位為NAN則排除

In [5]:
# Drop every row that has a missing value in any of the selected columns.
df = df.dropna()

排除消費次數小於或等於 0 的資料（程式碼以 `txn_cnt > 0` 篩選，因此 0 也會被排除）

In [6]:
# Keep only rows with a strictly positive transaction count
# (rows where txn_cnt is zero are removed as well as negative ones).
df = df.loc[df["txn_cnt"].gt(0)]

將chid重新編號, 取得最後輸出時需要的chid

In [9]:
# Two-way mapping between the original customer ids and dense 0-based indices,
# numbered in order of first appearance in final_df["chid"].
num_to_id = dict(enumerate(final_df["chid"].unique()))
# The ids are unique, so inverting the forward mapping yields the reverse lookup.
id_to_num = {chid: index for index, chid in num_to_id.items()}

In [10]:
# Attach the re-numbered customer index as "adj_id".
# NOTE(review): Series.map yields NaN for any chid that is not a key of
# id_to_num, which would make this column float — confirm every training chid
# also appears in final_df.
df = df.assign(adj_id=df["chid"].map(id_to_num))

將以下類別欄位直接轉為int
- 婚姻
- 教育程度
- 行業別
- 國籍
- 職位別
- 客戶來源
- 正卡信用額度（註：Model 1 所選取的欄位中不包含此欄位，下方程式碼也未對其做轉換）
- 性別代碼
- 年紀
- 正附卡註記

In [14]:
# Cast the categorical customer-attribute columns to integers in one call.
df = df.astype(
    {
        "masts": int,
        "educd": int,
        "trdtp": int,
        "naty": int,
        "poscd": int,
        "cuorg": int,
        "gender_code": int,
        "age": int,
        "primary_card": int,
    }
)

將消費金額轉為log型態

In [12]:
# Add the natural log of the transaction amount; this column is the
# regression target configured further down.
df = df.assign(txn_amt_log=lambda frame: frame["txn_amt"].apply(np.log))

In [15]:
# Preview the first ten rows of the prepared frame.
df.iloc[:10]

Unnamed: 0,chid,shop_tag,txn_cnt,txn_amt,masts,educd,trdtp,naty,poscd,cuorg,gender_code,age,primary_card,adj_id,txn_amt_log
0,10267183,2,1,21701.307598,2,6,15,1,99,30,0,6,0,354909,9.985128
1,10115966,2,1,6698.199203,1,4,9,1,1,30,0,5,1,354911,8.809594
2,10484590,2,2,6693.510475,2,2,15,1,99,30,0,3,1,354959,8.808894
3,10079974,2,1,3271.02509,2,2,2,1,2,30,0,4,1,354992,8.092859
4,10233949,2,1,2829.165439,1,5,15,1,99,30,1,6,1,354949,7.947737
5,10488184,2,2,8407.70004,2,3,12,1,6,30,1,4,1,355015,9.036903
6,10038204,2,2,7604.385348,2,3,11,1,2,30,1,3,1,352405,8.93648
7,10050073,2,2,31743.259948,1,6,15,1,99,30,0,6,1,354960,10.365436
8,10436047,2,1,4333.009825,2,3,11,1,2,30,0,2,1,355022,8.374018
9,10193951,2,5,24049.208333,2,2,17,1,2,30,0,3,1,354941,10.087857


In [None]:
# Build the per-customer attribute lookup handed to make_final.
# For every chid in final_df, store the attribute values from that customer's
# first row in df; customers absent from the training data get 0 for every
# attribute.
attribute_columns = [
    "masts",
    "txn_cnt",
    "educd",
    "trdtp",
    "naty",
    "poscd",
    "cuorg",
    "gender_code",
    "age",
    "primary_card",
]
train_id = df["chid"].unique()

# A single pass over df replaces the previous per-customer boolean-mask scans
# (ten full scans of df for each customer). drop_duplicates keeps the first
# occurrence of each chid, i.e. the same row that `.values[0]` used to pick.
first_seen_attributes = (
    df.drop_duplicates(subset="chid", keep="first")
    .set_index("chid")[attribute_columns]
    .to_dict(orient="index")
)

user_attributes = dict()
for i in tqdm(final_df["chid"].unique(), desc="get attributes"):
    known = first_seen_attributes.get(i)
    # Give each unseen customer a fresh all-zero dict so entries are never shared.
    user_attributes[i] = known if known is not None else dict.fromkeys(attribute_columns, 0)

In [16]:
# Remove the raw customer id and raw amount; "adj_id" and "txn_amt_log"
# are the columns used in their place from here on.
df = df.drop(["chid", "txn_amt"], axis="columns")

## Model Training

In [None]:
def run_model(train_data, sparse_features, dense_features, target, save_path):
    """Train a DeepFM regression model and save its weights.

    Args:
        train_data: DataFrame containing every column named in
            ``sparse_features``, ``dense_features`` and ``target``.
            It is not modified.
        sparse_features: Names of the categorical columns. Each one gets an
            embedding with ``max value + 1`` entries, so the values are
            assumed to be non-negative integer codes — TODO confirm at the
            call site.
        dense_features: Names of the numeric columns; they are min-max scaled
            to [0, 1]. May be empty.
        target: Column name(s) of the regression target.
        save_path: Destination passed to ``model.save_weights``.

    Returns:
        The fitted DeepFM model.
    """
    # Normalize dense features on a copy so the caller's frame stays untouched.
    # (The previous code assigned into `train` before it was defined, which
    # raised UnboundLocalError on every call.)
    data = train_data
    if dense_features:
        data = train_data.copy()
        scaler = MinMaxScaler(feature_range=(0, 1))
        data[dense_features] = scaler.fit_transform(data[dense_features])

    # Generate feature columns; vocabulary_size is forced to a plain int.
    sparse_columns = [
        SparseFeat(feat, vocabulary_size=int(data[feat].max()) + 1, embedding_dim=4)
        for feat in sparse_features
    ]
    dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
    fixlen_feature_columns = sparse_columns + dense_columns
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # Training data & testing data. The 20% hold-out is split off but, as
    # before, not used inside this function; validation during fitting comes
    # from `validation_split` below.
    train, _held_out = train_test_split(data, test_size=0.2, random_state=66)

    # Training
    train_model_input = {name: train[name].values for name in feature_names}

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
    model.compile("adam", "mse", metrics=['mse'])
    model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )

    # Save model weights
    model.save_weights(save_path)

    return model

In [None]:
# Feature configuration for Model 1.
sparse_features = [
    "adj_id",
    "shop_tag",
    "masts",
    "txn_cnt",
    "educd",
    "trdtp",
    "naty",
    "poscd",
    "cuorg",
    "gender_code",
    "age",
    "primary_card",
]
# NOTE(review): "dt" is not among the columns kept by the Model 1 column
# selection earlier in this notebook, so indexing df with it would raise
# KeyError — confirm the intended dense feature(s).
dense_features = ["dt"]
target = ["txn_amt_log"]
save_path = "../model/DeepFM_1.h5"

# Train the model and keep the fitted instance for the output step.
model = run_model(
    train_data=df,
    sparse_features=sparse_features,
    dense_features=dense_features,
    target=target,
    save_path=save_path,
)

## output csv

In [None]:
# Hand the trained model and the per-customer attribute lookup to the project
# helper imported from apps.inference (its behaviour is not visible here).
# NOTE(review): `tags`, `feature_names` and `columns` are not defined in any
# visible cell of this notebook (`feature_names` only exists as a local inside
# run_model) — confirm they are set before running this cell.
make_final(model, tags, feature_names, columns, user_attributes)