In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names

# 資料讀取

In [21]:
df = pd.read_csv("../../data/tbrain_small.csv")
len(df)

22130579

In [5]:
final_df = pd.read_csv("../../data/需預測的顧客名單及提交檔案範例.csv")
len(final_df["chid"].unique())

500000

In [6]:
df = df.drop(columns=['Unnamed: 0'])
df.columns

Index(['dt', 'chid', 'shop_tag', 'txn_cnt', 'txn_amt', 'domestic_offline_cnt',
       'domestic_online_cnt', 'overseas_offline_cnt', 'overseas_online_cnt',
       'domestic_offline_amt_pct', 'domestic_online_amt_pct',
       'overseas_offline_amt_pct', 'overseas_online_amt_pct', 'card_1_txn_cnt',
       'card_2_txn_cnt', 'card_3_txn_cnt', 'card_4_txn_cnt', 'card_5_txn_cnt',
       'card_6_txn_cnt', 'card_7_txn_cnt', 'card_8_txn_cnt', 'card_9_txn_cnt',
       'card_10_txn_cnt', 'card_11_txn_cnt', 'card_12_txn_cnt',
       'card_13_txn_cnt', 'card_14_txn_cnt', 'card_other_txn_cnt',
       'card_1_txn_amt_pct', 'card_2_txn_amt_pct', 'card_3_txn_amt_pct',
       'card_4_txn_amt_pct', 'card_5_txn_amt_pct', 'card_6_txn_amt_pct',
       'card_7_txn_amt_pct', 'card_8_txn_amt_pct', 'card_9_txn_amt_pct',
       'card_10_txn_amt_pct', 'card_11_txn_amt_pct', 'card_12_txn_amt_pct',
       'card_13_txn_amt_pct', 'card_14_txn_amt_pct', 'card_other_txn_amt_pct',
       'masts', 'educd', 'trdtp', 'naty

In [7]:
df.head()

Unnamed: 0,dt,chid,shop_tag,txn_cnt,txn_amt,domestic_offline_cnt,domestic_online_cnt,overseas_offline_cnt,overseas_online_cnt,domestic_offline_amt_pct,...,masts,educd,trdtp,naty,poscd,cuorg,slam,gender_code,age,primary_card
0,1,10267183,2,1,21701.307598,1,0,0,0,1.0,...,2.0,6.0,15.0,1.0,99.0,30.0,,0.0,6.0,0
1,1,10115966,2,1,6698.199203,1,0,0,0,1.0,...,1.0,4.0,9.0,1.0,1.0,30.0,133000.939458,0.0,5.0,1
2,1,10484590,2,2,6693.510475,2,0,0,0,1.0,...,2.0,2.0,15.0,1.0,99.0,30.0,82728.10308,0.0,3.0,1
3,1,10079974,2,1,3271.02509,1,0,0,0,1.0,...,2.0,2.0,2.0,1.0,2.0,30.0,117920.305695,0.0,4.0,1
4,1,10233949,2,1,2829.165439,1,0,0,0,1.0,...,1.0,5.0,15.0,1.0,99.0,30.0,181351.200025,1.0,6.0,1


# Toy Example

In [33]:
df = df[["dt", "chid", "shop_tag", "txn_amt", "masts"]]

## Data Preprocess

- txn_cnt 有負數可以進行排除
- 做ＮＡＮ處理
- 將id 重新編號
- 訓練樣本是否需要平衡？平衡後訓練人數會不會下降？
- 有些人數沒有出現過在需預測的類別中，因此需思考要如何對這些人做預測

### 排除消費次數為負數的數值

In [12]:
#df = df[df["txn_cnt"]>0]

### 任一欄位為NAN則排除

In [34]:
df.dropna(inplace=True)

In [35]:
len(df["chid"].unique())

498038

### 將chid重新編號

取得最後輸出時需要的chid

In [36]:
num_to_id = { i:id for i, id in enumerate(final_df["chid"].unique())}
id_to_num = { id:i for i, id in enumerate(final_df["chid"].unique())}

In [37]:
df["adj_id"] = df["chid"].map(id_to_num)
df.head()

Unnamed: 0,dt,chid,shop_tag,txn_amt,masts,adj_id
0,1,10267183,2,21701.307598,2.0,354909
1,1,10115966,2,6698.199203,1.0,354911
2,1,10484590,2,6693.510475,2.0,354959
3,1,10079974,2,3271.02509,2.0,354992
4,1,10233949,2,2829.165439,1.0,354949


將婚姻類別轉為int

In [38]:
df['masts'] = df['masts'].astype(int)
df.head()

Unnamed: 0,dt,chid,shop_tag,txn_amt,masts,adj_id
0,1,10267183,2,21701.307598,2,354909
1,1,10115966,2,6698.199203,1,354911
2,1,10484590,2,6693.510475,2,354959
3,1,10079974,2,3271.02509,2,354992
4,1,10233949,2,2829.165439,1,354949


## 將消費金額轉為log型態

In [39]:
df["txn_amt_log"] = df["txn_amt"].apply(np.log)
df.head()

Unnamed: 0,dt,chid,shop_tag,txn_amt,masts,adj_id,txn_amt_log
0,1,10267183,2,21701.307598,2,354909,9.985128
1,1,10115966,2,6698.199203,1,354911,8.809594
2,1,10484590,2,6693.510475,2,354959,8.808894
3,1,10079974,2,3271.02509,2,354992,8.092859
4,1,10233949,2,2829.165439,1,354949,7.947737


In [40]:
train = df.drop(columns=["chid", "txn_amt"])
train.head()

Unnamed: 0,dt,shop_tag,masts,adj_id,txn_amt_log
0,1,2,2,354909,9.985128
1,1,2,1,354911,8.809594
2,1,2,2,354959,8.808894
3,1,2,2,354992,8.092859
4,1,2,1,354949,7.947737


### DeepCTR test

In [49]:
sparse_features = ["adj_id", "shop_tag", "masts"]
dense_features = ["dt"]
target = ["txn_amt_log"]

normalize dense features

In [29]:
mms = MinMaxScaler(feature_range=(0,1))
train[dense_features] = mms.fit_transform(train[dense_features])
train.head()

Unnamed: 0,chid,shop_tag,txn_cnt,masts,adj_id
17447537,10101577,37,0.000639,1.0,62666
6599788,10253940,15,0.0,1.0,370269
6941918,10265500,15,0.0,1.0,71089
8388154,10372923,15,0.000639,1.0,164432
12702868,10414655,26,0.001278,1.0,490131


#### generate feature columns

In [None]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=train[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [19]:
# traing data & testing data
train, test = train_test_split(train, test_size=0.2, random_state=66)

#### Train

In [None]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}


model = DeepFM(linear_feature_columns,dnn_feature_columns,task='binary')
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )

history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

#### Predict

In [None]:
pred_ans = model.predict(test_model_input, batch_size=256)

### Final output

In [51]:
shop_tag = train["shop_tag"].unique()
# user
#for
#tag
final_df = pd.DataFrame([[354911, tag, 1, 25] for tag in shop_tag], columns=sparse_features+dense_features)
final_df.head()

Unnamed: 0,adj_id,shop_tag,masts,dt
0,354911,2,1,25
1,354911,6,1,25
2,354911,10,1,25
3,354911,12,1,25
4,354911,13,1,25


In [None]:
final_model_input = {name:final_df[name].values for name in feature_names}
pred_final = model.predict(final_model_input, batch_size=256)