# movielens regression

In [None]:
import pandas as pd
import torch
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from deepctr_torch.inputs import SparseFeat, get_feature_names
from deepctr_torch.models import DeepFM


In [None]:
# Load the MovieLens sample; the path is relative to the notebook's working directory.
data = pd.read_csv("./movielens_sample.txt")

# Categorical columns that are label-encoded below and fed to the model.
sparse_features = [
    "movie_id",
    "user_id",
    "gender",
    "age",
    "occupation",
    "zip",
]

# Target column, kept as a one-element list so that `data[target]` is a DataFrame.
target = ['rating']

In [None]:
# Preview the first 10 rows as loaded from the CSV, before any encoding.
data.head(10)

## 1. Label encoding for sparse features (this dataset has no dense features to transform)

In [None]:
# Replace every categorical column by integer codes.
# A fresh encoder is fitted per column, so codes are independent between columns.
for feat in sparse_features:
    lbe = LabelEncoder()
    lbe.fit(data[feat])
    data[feat] = lbe.transform(data[feat])

In [None]:
# Same preview after label encoding: the sparse columns now hold integer codes.
data.head(10)

# 2. Count the number of unique values for each sparse field

In [None]:
# One SparseFeat per categorical column. Its vocabulary size is the number of
# distinct values in the column (Series.nunique() returns the count of unique values).
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
]

# The linear and DNN inputs use the very same list of feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
# Inspect the generated feature-column definitions.
fixlen_feature_columns

In [None]:
# Inspect the feature names returned by get_feature_names().
feature_names

#  3.generate input data for model



In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# itself reproducible after a kernel restart (an unseeded split changes on every run).
train, test = train_test_split(data, test_size=0.2, random_state=2020)

# The model takes a dict mapping each feature name to that feature's column.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}


# 4.Define Model,train,predict and evaluate

In [None]:
# Pick the compute device. CUDA is used only when it is both requested
# (use_cuda) and actually available; otherwise everything runs on the CPU.
use_cuda = False
device = 'cuda:0' if (use_cuda and torch.cuda.is_available()) else 'cpu'
if device != 'cpu':
    print('cuda ready...')

In [None]:
# Build a DeepFM regressor on the selected device and train it with Adam on MSE.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='regression',
    device=device,
)
model.compile("adam", "mse", metrics=['mse'])

# validation_split=0.2 asks fit() to evaluate on part of the training data.
history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

# Predictions for the held-out test rows.
pred_ans = model.predict(test_model_input, batch_size=256)

In [None]:
# Mean squared error on the held-out test rows, rounded to 4 decimals.
print("test MSE", round(mean_squared_error(test[target].values, pred_ans), 4))

In [None]:
# Model predictions for the test rows.
pred_ans

In [None]:
# Actual ratings of the test rows, for comparison with the predictions.
test['rating'].values

# multivalue-movielens

In [None]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM
import numpy as np

map() 会根据提供的函数对指定序列做映射。在 Python 3 中 map() 返回的是迭代器，需要用 list() 转换后才能得到结果列表。
>>> list(map(lambda x: x ** 2, [1, 2, 3, 4, 5]))  # 使用 lambda 匿名函数
[1, 4, 9, 16, 25]
 
提供了两个列表，对相同位置的列表数据进行相加
>>> list(map(lambda x, y: x + y, [1, 3, 5, 7, 9], [2, 4, 6, 8, 10]))
[3, 7, 11, 15, 19]

可以这样认为，lambda 作为一个表达式，定义了一个匿名函数。以上例的 `lambda x: x ** 2` 为例，x 为入口参数，x ** 2 为函数体，用普通函数来表示为：

    def square(x):
        return x ** 2

非常容易理解，在这里 lambda 简化了函数定义的书写形式，使代码更为简洁；但是使用函数的定义方式更为直观，易理解。

In [None]:
def split(x):
    """Encode a '|'-separated string as a list of integer ids.

    Every key seen for the first time is registered in the module-level
    ``key2index`` dict with the next free id. Ids start at 1 because the
    value 0 is reserved as the padding value for sequence inputs.

    Args:
        x: String of keys joined by '|'.

    Returns:
        List of integer ids, one per key, in the order the keys appear in ``x``.
    """
    # setdefault() only stores the new id when the key is still unknown;
    # len(key2index) + 1 keeps valid ids strictly positive.
    return [key2index.setdefault(key, len(key2index) + 1) for key in x.split('|')]

## 1.Label Encoding for sparse features,and process sequence features

In [None]:
# Label-encode every categorical column with a fresh encoder per column.
# NOTE(review): when the notebook is run top-to-bottom, `data` was already
# label-encoded in the regression section, so this re-encodes integer codes.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
# The sequence feature ('genres') is preprocessed in a later cell.

In [None]:
# Inspect the genre -> id mapping.
# NOTE(review): as far as this notebook shows, `key2index` is first assigned in
# the following cell (`key2index = {}`), so on a fresh kernel run top-to-bottom
# this cell raises NameError. Run it after the genre-encoding cell.
key2index

In [None]:
# Vocabulary for genre names; split() fills it in as a side effect.
key2index = {}

# Encode each row's '|'-separated movie genres as a list of integer ids.
genres_list = [split(genres) for genres in data['genres'].values]

# Number of genres per row; the longest row defines the padded sequence length.
genres_length = np.array([len(ids) for ids in genres_list])
max_len = max(genres_length)

# Notice : padding=`post`, i.e. shorter rows are padded at the end up to max_len.
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

In [None]:
# Inspect the padded genre-id matrix produced by pad_sequences().
genres_list

## 2.count #unique features for each sparse field and generate feature config for sequence feature

In [None]:
# Fixed-length categorical features, each with a 4-dimensional embedding.
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]

# Variable-length 'genres' feature.
# Notice : value 0 is for padding for sequence input feature, hence the
# vocabulary holds len(key2index) real ids plus one slot for the padding id.
varlen_feature_columns = [
    VarLenSparseFeat(
        SparseFeat(
            'genres',
            vocabulary_size=len(key2index) + 1,
            embedding_dim=4,
        ),
        maxlen=max_len,
        combiner='mean',
    )
]

# The linear and DNN inputs get equal (but separately built) column lists.
linear_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]
dnn_feature_columns = [*fixlen_feature_columns, *varlen_feature_columns]

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
# Inspect the fixed-length feature-column definitions.
fixlen_feature_columns

In [None]:
# Inspect the variable-length ('genres') feature-column definition.
varlen_feature_columns

## 3. Generate input data for model
    

In [None]:
# Model input: one entry per categorical column plus the padded genre matrix.
model_input = {
    **{name: data[name] for name in sparse_features},
    "genres": genres_list,
}


In [None]:
# Pick the compute device. CUDA is used only when requested and available.
use_cuda = False
device = 'cuda:0' if (use_cuda and torch.cuda.is_available()) else 'cpu'
if device != 'cpu':
    print('cuda ready...')

# DeepFM regressor over the fixed-length and variable-length features.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='regression',
    device=device,
)
model.compile("adam", "mse", metrics=['mse'])

# Fit on the whole frame; validation_split=0.2 is passed to fit() for validation.
history = model.fit(
    model_input,
    data[target].values,
    batch_size=256,
    epochs=100,
    verbose=2,
    validation_split=0.2,
)

In [None]:
# Predict on the same inputs that were passed to fit() above; this section has
# no separate test set, so these are not held-out predictions.
pred_ans = model.predict(model_input, batch_size=256)

In [None]:
# Model predictions for every row of `data`.
pred_ans

In [None]:
# Actual ratings, for comparison with the predictions.
data['rating']

# CTR预测例子

In [1]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [2]:
# Load the Criteo sample; the path is relative to the notebook's working directory.
data = pd.read_csv('./criteo_sample.txt')

# Column names: 26 categorical fields C1..C26 and 13 numeric fields I1..I13.
sparse_features = ['C{}'.format(i) for i in range(1, 27)]
dense_features = ['I{}'.format(i) for i in range(1, 14)]
target = ['label']

# Fill missing values: the string '-1' for categorical fields, 0 for numeric ones.
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)

In [20]:
# Inspect the C1 column.
# NOTE(review): the saved output (integer codes, dtype int32, execution count 20)
# appears to have been captured after the label-encoding cell had run; in a
# fresh top-to-bottom run this cell executes before the encoding.
data['C1']

0       0
1      11
2       0
3       0
4       0
       ..
195     0
196    21
197     0
198     0
199    21
Name: C1, Length: 200, dtype: int32

# 1.Label Encoding for sparse features,and do simple Transformation for dense features


In [4]:
# NOTE(review): the scaler and the encoders are fitted on the full frame, i.e.
# before the train/test split that happens further down.

# Rescale every numeric field to the [0, 1] range.
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# Replace every categorical field by integer codes (fresh encoder per column).
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

In [5]:
# Display the frame after encoding and scaling (pandas truncates the rendering).
# NOTE(review): `data.head()` would be enough here and keeps the saved output small.
data

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.000000,0.001332,0.092362,0.000000,0.034825,0.000000,0.000000,0.673469,0.000000,...,8,66,0,0,3,0,1,96,0,0
1,0,0.000000,0.000000,0.006750,0.402299,0.059628,0.117284,0.003322,0.714286,0.154739,...,7,52,0,0,47,0,7,112,0,0
2,0,0.000000,0.000333,0.000710,0.137931,0.003968,0.077873,0.019934,0.714286,0.505803,...,8,49,0,0,25,0,6,53,0,0
3,0,0.000000,0.004664,0.000355,0.045977,0.033185,0.094967,0.016611,0.081633,0.028046,...,8,37,0,0,156,0,0,32,0,0
4,0,0.000000,0.000333,0.036945,0.310345,0.003922,0.067426,0.013289,0.653061,0.035783,...,8,14,5,3,9,0,0,5,1,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0.000000,0.000333,0.040142,0.034483,0.005984,0.273029,0.006645,0.061224,0.206963,...,0,74,5,1,30,5,0,118,17,48
196,1,0.000000,0.000666,0.000355,0.011494,0.003168,0.005698,0.003322,0.244898,0.014507,...,1,25,0,0,138,0,0,68,0,0
197,1,0.027027,0.000333,0.002131,0.034483,0.000000,0.000000,0.063123,0.061224,0.002901,...,4,40,17,2,41,0,0,12,16,11
198,0,0.000000,0.007662,0.002131,0.252874,0.000400,0.072650,0.265781,0.367347,0.491296,...,4,7,18,1,123,0,0,10,16,49


# 2.count #unique features for each sparse field,and record dense feature field name


In [6]:
# Categorical fields become SparseFeat (vocabulary size = number of distinct
# codes in the column); numeric fields become one-dimensional DenseFeat.
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=data[feat].nunique())
    for feat in sparse_features
] + [
    DenseFeat(feat, 1) for feat in dense_features
]

# The linear and DNN inputs use the very same list of feature columns.
dnn_feature_columns = linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [7]:
# Inspect the generated feature-column definitions.
linear_feature_columns

[SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group'),
 SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C2', group_name='default_group'),
 SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C3', group_name='default_group'),
 SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C4', group_name='default_group'),
 SparseFeat(name='C5', vocabulary_size=12, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C5', group_name='default_group'),
 SparseFeat(name='C6', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C6', group_name='default_group'),
 SparseFeat(name='C7', vocabulary_size=183, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C7', group_name='default_group'),
 SparseFeat

## 3.generate input data for model


In [8]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# itself reproducible after a kernel restart (an unseeded split changes on every run).
train, test = train_test_split(data, test_size=0.2, random_state=2020)

# The model takes a dict mapping each feature name to that feature's column.
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

In [9]:
# Peek at the first 10 rows of the encoded and scaled frame.
data.head(10)

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,0.001332,0.092362,0.0,0.034825,0.0,0.0,0.673469,0.0,...,8,66,0,0,3,0,1,96,0,0
1,0,0.0,0.0,0.00675,0.402299,0.059628,0.117284,0.003322,0.714286,0.154739,...,7,52,0,0,47,0,7,112,0,0
2,0,0.0,0.000333,0.00071,0.137931,0.003968,0.077873,0.019934,0.714286,0.505803,...,8,49,0,0,25,0,6,53,0,0
3,0,0.0,0.004664,0.000355,0.045977,0.033185,0.094967,0.016611,0.081633,0.028046,...,8,37,0,0,156,0,0,32,0,0
4,0,0.0,0.000333,0.036945,0.310345,0.003922,0.067426,0.013289,0.653061,0.035783,...,8,14,5,3,9,0,0,5,1,47
5,0,0.0,0.0,0.02238,0.45977,0.002898,0.028965,0.013289,0.755102,0.044487,...,8,105,5,3,77,0,0,13,1,34
6,0,0.0,0.123584,0.001421,0.011494,0.003522,0.030864,0.046512,0.510204,0.472921,...,4,46,18,3,58,0,2,41,3,71
7,1,0.513514,0.003664,0.010657,0.114943,2e-06,0.001425,0.109635,0.959184,0.121857,...,8,80,5,1,128,0,0,12,16,17
8,0,0.0,0.000333,0.012789,0.252874,0.009233,0.103039,0.0299,0.714286,0.130561,...,8,57,0,0,113,0,9,12,0,0
9,0,0.054054,0.003997,0.002842,0.264368,5.9e-05,0.005223,0.006645,0.163265,0.022244,...,0,95,5,2,28,0,8,34,11,74


# 4.Define Model,train,predict,evaluate

In [14]:
# Pick the compute device. CUDA is used only when requested and available.
use_cuda = False
device = 'cuda:0' if (use_cuda and torch.cuda.is_available()) else 'cpu'
if device != 'cpu':
    print('cuda ready...')

# DeepFM binary classifier; l2_reg_embedding is passed through to the model.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)
model.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)

# validation_split=0.2 asks fit() to evaluate on part of the training data.
model.fit(
    train_model_input,
    train[target].values,
    batch_size=32,
    epochs=70,
    validation_split=0.2,
    verbose=2,
)

# Evaluate on the held-out test rows (second predict() argument is 256, as before).
pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cpu
Train on 128 samples, validate on 32 samples, 4 steps per epoch
Epoch 1/70
0s - loss:  0.6166 - binary_crossentropy:  0.6166 - auc:  0.5453 - val_binary_crossentropy:  0.5793 - val_auc:  0.7246
Epoch 2/70
0s - loss:  0.4696 - binary_crossentropy:  0.4696 - auc:  0.9768 - val_binary_crossentropy:  0.5946 - val_auc:  0.6377
Epoch 3/70
0s - loss:  0.3474 - binary_crossentropy:  0.3474 - auc:  1.0000 - val_binary_crossentropy:  0.6003 - val_auc:  0.5266
Epoch 4/70
0s - loss:  0.1872 - binary_crossentropy:  0.1872 - auc:  1.0000 - val_binary_crossentropy:  0.6505 - val_auc:  0.5024
Epoch 5/70
0s - loss:  0.0954 - binary_crossentropy:  0.0954 - auc:  1.0000 - val_binary_crossentropy:  0.7233 - val_auc:  0.4831
Epoch 6/70
0s - loss:  0.0582 - binary_crossentropy:  0.0582 - auc:  1.0000 - val_binary_crossentropy:  0.7743 - val_auc:  0.4879
Epoch 7/70
0s - loss:  0.0398 - binary_crossentropy:  0.0398 - auc:  1.0000 - val_binary_crossentropy:  0.8263 - val_auc:  0.4831
Epoch 8/70
0s - loss: 

0s - loss:  0.0009 - binary_crossentropy:  0.0009 - auc:  1.0000 - val_binary_crossentropy:  1.2575 - val_auc:  0.4589
Epoch 64/70
0s - loss:  0.0009 - binary_crossentropy:  0.0009 - auc:  1.0000 - val_binary_crossentropy:  1.2604 - val_auc:  0.4589
Epoch 65/70
0s - loss:  0.0009 - binary_crossentropy:  0.0009 - auc:  1.0000 - val_binary_crossentropy:  1.2634 - val_auc:  0.4589
Epoch 66/70
0s - loss:  0.0008 - binary_crossentropy:  0.0008 - auc:  1.0000 - val_binary_crossentropy:  1.2656 - val_auc:  0.4589
Epoch 67/70
0s - loss:  0.0008 - binary_crossentropy:  0.0008 - auc:  1.0000 - val_binary_crossentropy:  1.2677 - val_auc:  0.4589
Epoch 68/70
0s - loss:  0.0008 - binary_crossentropy:  0.0008 - auc:  1.0000 - val_binary_crossentropy:  1.2715 - val_auc:  0.4589
Epoch 69/70
0s - loss:  0.0008 - binary_crossentropy:  0.0008 - auc:  1.0000 - val_binary_crossentropy:  1.2719 - val_auc:  0.4589
Epoch 70/70
0s - loss:  0.0008 - binary_crossentropy:  0.0008 - auc:  1.0000 - val_binary_cross

In [15]:
# Model outputs for the training rows, to compare against the training labels.
pred = model.predict(train_model_input, 256)

In [17]:
# Model outputs for the training rows (the full array is displayed).
pred

array([[1.10217617e-04],
       [3.01186046e-05],
       [4.93904190e-05],
       [1.15745264e-04],
       [1.18597108e-03],
       [6.59942234e-05],
       [1.48401727e-04],
       [2.38176781e-05],
       [2.17674722e-04],
       [4.35395777e-05],
       [1.07054751e-04],
       [9.37636869e-05],
       [3.25871442e-05],
       [6.31685689e-05],
       [5.89048832e-05],
       [3.35569275e-05],
       [1.97419959e-05],
       [8.12507205e-05],
       [9.98389482e-01],
       [9.94843602e-01],
       [9.96833980e-01],
       [9.97778952e-01],
       [2.94820202e-04],
       [1.45310900e-04],
       [2.84458547e-05],
       [1.88346326e-04],
       [5.05146782e-05],
       [9.97907043e-01],
       [4.84730117e-05],
       [1.75593814e-04],
       [1.07931955e-04],
       [3.58702964e-05],
       [5.09825077e-05],
       [9.97884810e-01],
       [9.96704876e-01],
       [9.97035742e-01],
       [4.56492708e-05],
       [4.02543737e-05],
       [9.37036166e-05],
       [9.97111440e-01],


In [16]:
# Ground-truth labels of the training rows, for side-by-side comparison with `pred`.
train[target].values

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
    