In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [17]:
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names

# 資料讀取

In [2]:
df = pd.read_csv("../../data/tbrain_small.csv")
len(df)

22130579

In [3]:
final_df = pd.read_csv("../../data/需預測的顧客名單及提交檔案範例.csv")
len(final_df["chid"].unique())

500000

In [4]:
df = df.drop(columns=['Unnamed: 0'])
df.columns

Index(['dt', 'chid', 'shop_tag', 'txn_cnt', 'txn_amt', 'domestic_offline_cnt',
       'domestic_online_cnt', 'overseas_offline_cnt', 'overseas_online_cnt',
       'domestic_offline_amt_pct', 'domestic_online_amt_pct',
       'overseas_offline_amt_pct', 'overseas_online_amt_pct', 'card_1_txn_cnt',
       'card_2_txn_cnt', 'card_3_txn_cnt', 'card_4_txn_cnt', 'card_5_txn_cnt',
       'card_6_txn_cnt', 'card_7_txn_cnt', 'card_8_txn_cnt', 'card_9_txn_cnt',
       'card_10_txn_cnt', 'card_11_txn_cnt', 'card_12_txn_cnt',
       'card_13_txn_cnt', 'card_14_txn_cnt', 'card_other_txn_cnt',
       'card_1_txn_amt_pct', 'card_2_txn_amt_pct', 'card_3_txn_amt_pct',
       'card_4_txn_amt_pct', 'card_5_txn_amt_pct', 'card_6_txn_amt_pct',
       'card_7_txn_amt_pct', 'card_8_txn_amt_pct', 'card_9_txn_amt_pct',
       'card_10_txn_amt_pct', 'card_11_txn_amt_pct', 'card_12_txn_amt_pct',
       'card_13_txn_amt_pct', 'card_14_txn_amt_pct', 'card_other_txn_amt_pct',
       'masts', 'educd', 'trdtp', 'naty

In [5]:
df.head()

Unnamed: 0,dt,chid,shop_tag,txn_cnt,txn_amt,domestic_offline_cnt,domestic_online_cnt,overseas_offline_cnt,overseas_online_cnt,domestic_offline_amt_pct,...,masts,educd,trdtp,naty,poscd,cuorg,slam,gender_code,age,primary_card
0,1,10267183,2,1,21701.307598,1,0,0,0,1.0,...,2.0,6.0,15.0,1.0,99.0,30.0,,0.0,6.0,0
1,1,10115966,2,1,6698.199203,1,0,0,0,1.0,...,1.0,4.0,9.0,1.0,1.0,30.0,133000.939458,0.0,5.0,1
2,1,10484590,2,2,6693.510475,2,0,0,0,1.0,...,2.0,2.0,15.0,1.0,99.0,30.0,82728.10308,0.0,3.0,1
3,1,10079974,2,1,3271.02509,1,0,0,0,1.0,...,2.0,2.0,2.0,1.0,2.0,30.0,117920.305695,0.0,4.0,1
4,1,10233949,2,1,2829.165439,1,0,0,0,1.0,...,1.0,5.0,15.0,1.0,99.0,30.0,181351.200025,1.0,6.0,1


# Toy Example

In [6]:
df = df[["dt", "chid", "shop_tag", "txn_amt", "masts"]]

## Data Preprocess

- txn_cnt 有負數可以進行排除
- 做ＮＡＮ處理
- 將id 重新編號
- 訓練樣本是否需要平衡？平衡後訓練人數會不會下降？
- 有些人數沒有出現過在需預測的類別中，因此需思考要如何對這些人做預測

### 排除消費次數為負數的數值

In [12]:
#df = df[df["txn_cnt"]>0]

### 任一欄位為NAN則排除

In [7]:
df.dropna(inplace=True)

In [8]:
len(df["chid"].unique())

498038

### 將chid重新編號

取得最後輸出時需要的chid

In [9]:
num_to_id = { i:id for i, id in enumerate(final_df["chid"].unique())}
id_to_num = { id:i for i, id in enumerate(final_df["chid"].unique())}

In [10]:
df["adj_id"] = df["chid"].map(id_to_num)
df.head()

Unnamed: 0,dt,chid,shop_tag,txn_amt,masts,adj_id
0,1,10267183,2,21701.307598,2.0,354909
1,1,10115966,2,6698.199203,1.0,354911
2,1,10484590,2,6693.510475,2.0,354959
3,1,10079974,2,3271.02509,2.0,354992
4,1,10233949,2,2829.165439,1.0,354949


將婚姻類別轉為int

In [11]:
df['masts'] = df['masts'].astype(int)
df.head()

Unnamed: 0,dt,chid,shop_tag,txn_amt,masts,adj_id
0,1,10267183,2,21701.307598,2,354909
1,1,10115966,2,6698.199203,1,354911
2,1,10484590,2,6693.510475,2,354959
3,1,10079974,2,3271.02509,2,354992
4,1,10233949,2,2829.165439,1,354949


## 將消費金額轉為log型態

In [12]:
df["txn_amt_log"] = df["txn_amt"].apply(np.log)
df.head()

Unnamed: 0,dt,chid,shop_tag,txn_amt,masts,adj_id,txn_amt_log
0,1,10267183,2,21701.307598,2,354909,9.985128
1,1,10115966,2,6698.199203,1,354911,8.809594
2,1,10484590,2,6693.510475,2,354959,8.808894
3,1,10079974,2,3271.02509,2,354992,8.092859
4,1,10233949,2,2829.165439,1,354949,7.947737


In [13]:
train = df.drop(columns=["chid", "txn_amt"])
train.head()

Unnamed: 0,dt,shop_tag,masts,adj_id,txn_amt_log
0,1,2,2,354909,9.985128
1,1,2,1,354911,8.809594
2,1,2,2,354959,8.808894
3,1,2,2,354992,8.092859
4,1,2,1,354949,7.947737


### DeepCTR test

In [14]:
sparse_features = ["adj_id", "shop_tag", "masts"]
dense_features = ["dt"]
target = ["txn_amt_log"]

normalize dense features

In [15]:
mms = MinMaxScaler(feature_range=(0,1))
train[dense_features] = mms.fit_transform(train[dense_features])
train.head()

Unnamed: 0,dt,shop_tag,masts,adj_id,txn_amt_log
0,0.0,2,2,354909,9.985128
1,0.0,2,1,354911,8.809594
2,0.0,2,2,354959,8.808894
3,0.0,2,2,354992,8.092859
4,0.0,2,1,354949,7.947737


#### generate feature columns

In [18]:
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=train[feat].max() + 1,embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [19]:
# traing data & testing data
train, test = train_test_split(train, test_size=0.2, random_state=66)

#### Train

In [26]:
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}


model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
model.compile("adam", "mse",
              metrics=['mse'], )

history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

Train on 14163565 samples, validate on 3540892 samples
Epoch 1/10
14163565/14163565 - 1790s - loss: 0.4254 - mean_squared_error: 0.4082 - val_loss: 0.3819 - val_mean_squared_error: 0.3612
Epoch 2/10
14163565/14163565 - 1744s - loss: 0.3738 - mean_squared_error: 0.3528 - val_loss: 0.3715 - val_mean_squared_error: 0.3505
Epoch 3/10
14163565/14163565 - 1741s - loss: 0.3728 - mean_squared_error: 0.3516 - val_loss: 0.3739 - val_mean_squared_error: 0.3526
Epoch 4/10
14163565/14163565 - 1751s - loss: 0.3723 - mean_squared_error: 0.3508 - val_loss: 0.3700 - val_mean_squared_error: 0.3486
Epoch 5/10
14163565/14163565 - 1743s - loss: 0.3716 - mean_squared_error: 0.3501 - val_loss: 0.3712 - val_mean_squared_error: 0.3500
Epoch 6/10
14163565/14163565 - 1743s - loss: 0.3713 - mean_squared_error: 0.3498 - val_loss: 0.3700 - val_mean_squared_error: 0.3487
Epoch 7/10
14163565/14163565 - 1740s - loss: 0.3711 - mean_squared_error: 0.3497 - val_loss: 0.3696 - val_mean_squared_error: 0.3484
Epoch 8/10
141

#### Predict

In [27]:
pred_ans = model.predict(test_model_input, batch_size=256)
print(pred_ans)

[[8.816601 ]
 [9.089866 ]
 [8.679361 ]
 ...
 [8.987803 ]
 [8.5921755]
 [8.492655 ]]


### Final output

In [22]:
shop_tag = train["shop_tag"].unique()
# user
#for
#tag
final_df = pd.DataFrame([[354911, tag, 1, 25] for tag in shop_tag], columns=sparse_features+dense_features)
final_df.head()

Unnamed: 0,adj_id,shop_tag,masts,dt
0,354911,12,1,25
1,354911,37,1,25
2,354911,15,1,25
3,354911,36,1,25
4,354911,10,1,25


In [28]:
final_model_input = {name:final_df[name].values for name in feature_names}
pred_final = model.predict(final_model_input, batch_size=256)

In [37]:
shop_tag

array([12, 37, 15, 36, 10,  6, 13,  2, 18, 48, 19, 25, 21, 22, 39, 26],
      dtype=int64)

In [50]:
max_index = np.argsort(pred_final, axis=0)[::-1][:3]
print(f"index:\n {max_index},\nshop_tag:\n{shop_tag[max_index]}")

index:
 [[14]
 [10]
 [15]],
shop_tag:
[[39]
 [19]
 [26]]


In [29]:
pred_final

array([[13.182239 ],
       [13.210162 ],
       [12.991496 ],
       [12.82268  ],
       [13.129737 ],
       [13.273463 ],
       [12.897992 ],
       [13.313661 ],
       [13.194458 ],
       [12.794162 ],
       [13.3712015],
       [12.961331 ],
       [13.103572 ],
       [12.892783 ],
       [13.503781 ],
       [13.33014  ]], dtype=float32)

In [51]:
test[test["adj_id"]==354911][["shop_tag", "txn_amt_log"]]

Unnamed: 0,shop_tag,txn_amt_log
12152704,25,9.08666
10013536,19,10.398011
57906,2,9.84419
1578728,2,10.488068
6597534,15,9.579403
9830092,19,10.345238
12922535,26,8.642904
16082369,37,7.800066
13054049,26,10.075165
8769710,15,9.717112
