In [1]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
from deepctr_torch.models import DeepFM

In [3]:
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

In [12]:
sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]
names=["label"]+sparse_features+dense_features
data = pd.read_csv('data/criteo_large.txt',sep='\t', 
                  names=["label"]+dense_features+sparse_features)

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

       label    I1   I2     I3    I4        I5    I6    I7    I8     I9  ...  \
0          0   1.0    1    5.0   0.0    1382.0   4.0  15.0   2.0  181.0  ...   
1          0   2.0    0   44.0   1.0     102.0   8.0   2.0   2.0    4.0  ...   
2          0   2.0    0    1.0  14.0     767.0  89.0   4.0   2.0  245.0  ...   
3          0   0.0  893    0.0   0.0    4392.0   0.0   0.0   0.0    0.0  ...   
4          0   3.0   -1    0.0   0.0       2.0   0.0   3.0   0.0    0.0  ...   
...      ...   ...  ...    ...   ...       ...   ...   ...   ...    ...  ...   
99995      0   1.0   60   37.0   0.0       1.0   0.0   4.0   0.0   23.0  ...   
99996      1   0.0    0   12.0   0.0  173121.0   0.0   0.0   3.0   10.0  ...   
99997      0  10.0    2    1.0  26.0     482.0  60.0  10.0  11.0   60.0  ...   
99998      0   0.0  390   43.0   4.0  345365.0   0.0   0.0   4.0    4.0  ...   
99999      0   0.0   -1  137.0  19.0    9504.0   0.0   0.0  22.0   22.0  ...   

            C17       C18       C19    

In [13]:
# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

In [14]:
# 2.count #unique features for each sparse field,and record dense feature field name

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4 )
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [15]:
# 3.generate input data for model

train, test = train_test_split(data, test_size=0.2, random_state=2020)
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [16]:
# 4.Define Model,train,predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'], )

In [17]:
history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

cpu
Train on 64000 samples, validate on 16000 samples, 250 steps per epoch
Epoch 1/10
15s - loss:  0.4875 - binary_crossentropy:  0.4875 - val_binary_crossentropy:  0.4591
Epoch 2/10
15s - loss:  0.3509 - binary_crossentropy:  0.3508 - val_binary_crossentropy:  0.5132
Epoch 3/10
15s - loss:  0.2356 - binary_crossentropy:  0.2356 - val_binary_crossentropy:  0.5630
Epoch 4/10
15s - loss:  0.1914 - binary_crossentropy:  0.1914 - val_binary_crossentropy:  0.6076
Epoch 5/10
15s - loss:  0.1677 - binary_crossentropy:  0.1677 - val_binary_crossentropy:  0.6567
Epoch 6/10
15s - loss:  0.1523 - binary_crossentropy:  0.1522 - val_binary_crossentropy:  0.7208
Epoch 7/10
15s - loss:  0.1396 - binary_crossentropy:  0.1395 - val_binary_crossentropy:  0.8021
Epoch 8/10
15s - loss:  0.1293 - binary_crossentropy:  0.1292 - val_binary_crossentropy:  0.8686
Epoch 9/10
15s - loss:  0.1210 - binary_crossentropy:  0.1209 - val_binary_crossentropy:  0.9358
Epoch 10/10
15s - loss:  0.1130 - binary_crossentrop

In [18]:
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

test LogLoss 1.0578
test AUC 0.665
