In [1]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [7]:
from deepctr.models import DeepFM

In [8]:
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

In [9]:
# Load the Criteo sample file. The path is relative, so the notebook has to be run
# from the directory that contains 'criteo_sample.txt'.
data = pd.read_csv('./criteo_sample.txt')

# Column-name lists used by every later cell: C1..C26 are handled as sparse
# features, I1..I13 as dense features, and 'label' is the prediction target.
sparse_features = ['C{}'.format(idx) for idx in range(1, 27)]
dense_features = ['I{}'.format(idx) for idx in range(1, 14)]
target = ['label']

# Missing values: dense columns are filled with 0, sparse columns with the
# placeholder string '-1' (which then acts as its own category).
data[dense_features] = data[dense_features].fillna(0)
data[sparse_features] = data[sparse_features].fillna('-1')

In [10]:
# 1. Replace every sparse column by integer ids (one fresh LabelEncoder per column),
#    then rescale all dense columns into the range [0, 1].
# NOTE(review): both transforms are fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook — confirm that is acceptable
# for the metrics being reported.
for feat in sparse_features:
    lbe = LabelEncoder()
    encoded_ids = lbe.fit_transform(data[feat])
    data[feat] = encoded_ids

mms = MinMaxScaler(feature_range=(0, 1))
scaled_dense = mms.fit_transform(data[dense_features])
data[dense_features] = scaled_dense

In [11]:
# 2. Describe every model input: one SparseFeat (embedding size 4) per sparse field
#    and one 1-dimensional DenseFeat per dense field.
#
# vocabulary_size is derived from the largest encoded id (+1) instead of nunique().
# The two agree only while the ids in a column are contiguous 0..k-1; max() + 1 stays
# large enough for every id even if rows are filtered after encoding.
sparse_feature_columns = [
    SparseFeat(feat, vocabulary_size=int(data[feat].max()) + 1, embedding_dim=4)
    for feat in sparse_features
]
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_feature_columns + dense_feature_columns

# The linear part and the DNN part of the model share the same column list.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# Names of the inputs, used later to build the input dicts.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [12]:
# 3. Split the frame into train and test parts (test_size=0.2, fixed random_state so
#    the split is repeatable) and build the model inputs: one dict per split that
#    maps each feature name to the matching column of that split.
train, test = train_test_split(data, random_state=2020, test_size=0.2)

train_model_input = {col: train[col] for col in feature_names}
test_model_input = {col: test[col] for col in feature_names}

In [13]:
# 4. Build the DeepFM model for a binary task and compile it. Binary cross-entropy
#    is used as the loss and is also reported as a metric.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [14]:
# Train for 10 epochs with batches of 256 rows; validation_split=0.2 sets aside a
# fifth of the training inputs for the val_* numbers. verbose=2 logs one line per epoch.
history = model.fit(
    x=train_model_input,
    y=train[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/10
1/1 - 5s - loss: 0.7803 - binary_crossentropy: 0.7803 - val_loss: 0.7653 - val_binary_crossentropy: 0.7653
Epoch 2/10
1/1 - 0s - loss: 0.7573 - binary_crossentropy: 0.7573 - val_loss: 0.7510 - val_binary_crossentropy: 0.7510
Epoch 3/10
1/1 - 0s - loss: 0.7352 - binary_crossentropy: 0.7352 - val_loss: 0.7374 - val_binary_crossentropy: 0.7374
Epoch 4/10
1/1 - 0s - loss: 0.7140 - binary_crossentropy: 0.7139 - val_loss: 0.7243 - val_binary_crossentropy: 0.7243
Epoch 5/10
1/1 - 0s - loss: 0.6935 - binary_crossentropy: 0.6935 - val_loss: 0.7117 - val_binary_crossentropy: 0.7117
Epoch 6/10
1/1 - 0s - loss: 0.6738 - binary_crossentropy: 0.6738 - val_loss: 0.6994 - val_binary_crossentropy: 0.6994
Epoch 7/10
1/1 - 0s - loss: 0.6545 - binary_crossentropy: 0.6545 - val_loss: 0.6874 - val_binary_crossentropy: 0.6874
Epoch 8/10
1/1 - 0s - loss: 0.6356 - binary_crossentropy: 0.6356 - val_loss: 0.6756 - val_binary_crossentropy: 0.6756
Epoch 9/10
1/1 - 0s - loss: 0.6169 - binary_crossentropy

In [16]:
# Score the held-out split: predict on the test inputs, then report log loss and
# ROC AUC against the true labels, each rounded to 4 decimal places.
pred_ans = model.predict(test_model_input, batch_size=256)
test_labels = test[target].values

test_logloss = round(log_loss(test_labels, pred_ans), 4)
print("test LogLoss", test_logloss)

test_auc = round(roc_auc_score(test_labels, pred_ans), 4)
print("test AUC", test_auc)

test LogLoss 0.6319
test AUC 0.6237
