In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from collections import defaultdict
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
path = 'data/data.csv'
data = pd.read_csv(path)
data.head()

Unnamed: 0,user_id,label,gender,age,order_cnt_3d,order_cnt_7d,order_cnt_14d,order_cnt_30d,order_cnt_60d,coupon_id,coupon_product_name,during_days,discount_total,original_price,pay_amt
0,1829056,1,1,58,1,2,4,7,27,5058,7天4次卡,7,4,800,650
1,84498884,1,1,45,2,2,2,4,4,6098,5天3次卡,5,3,600,490
2,53034061,1,1,23,0,1,1,7,7,5206,6天5次卡,6,5,1000,700
3,58544645,1,1,46,2,2,6,11,35,5105,30天15次卡,30,15,3000,1580
4,22567227,1,2,31,2,4,10,10,10,6100,7天7次卡,7,7,1400,1100


In [7]:
data.coupon_id.unique()

array([5058, 6098, 5206, 5105, 6100, 6111, 5215, 5209, 5087, 5203, 5115,
       5247, 5331, 5207, 5039, 5205, 5211, 5090, 5208, 5094, 5336, 5237,
       5104, 5243, 5214, 5091, 5221, 5102, 6302, 5076, 5068, 5088, 5099,
       6275, 6102, 5251, 5061, 5218, 4943, 5212, 5120, 5220, 5213, 5101,
       5891, 6222, 6094, 5886, 5079, 5118, 5210, 6217, 5125, 5204, 5093,
       4922, 5332, 6274, 5117, 5122, 6381, 6281, 6253, 5108, 5119, 5189,
       5894, 4955, 5779, 6263, 5085, 5216, 6185, 5106, 5249, 5083, 4931,
       5241, 6286, 5080, 5040, 4983, 6276, 5056, 5217, 6248, 6326, 6107,
       5124, 5630, 5219, 6273, 5182, 6299, 5245, 4977, 6157, 5195, 5899,
       6171, 5201, 5335, 6189, 6093, 5009, 5897, 5231, 5017, 6265, 6285,
       4930, 5198, 5097, 6099, 5185, 4976, 5086, 5914, 5095, 6091, 6218,
       6092, 4984, 5916, 5250, 6255, 6331, 4923, 6149, 6309, 6266, 6319,
       6095, 4980, 6308, 4981, 5628, 6096, 6251, 5900, 4985, 6277, 5084,
       6161, 5906, 5239, 5235, 6175, 4969, 6279, 63

In [3]:
data['order_cnt_3d'] = (data['order_cnt_3d'] - data['order_cnt_3d'].min())/(data['order_cnt_3d'].max() - data['order_cnt_3d'].min())

In [4]:
data.columns

Index(['user_id', 'label', 'gender', 'age', 'order_cnt_3d', 'order_cnt_7d',
       'order_cnt_14d', 'order_cnt_30d', 'order_cnt_60d', 'coupon_id',
       'coupon_product_name', 'during_days', 'discount_total',
       'original_price', 'pay_amt'],
      dtype='object')

In [37]:
data.dtypes

user_id                  int64
label                    int64
gender                   int64
age                      int64
order_cnt_3d           float64
order_cnt_7d             int64
order_cnt_14d            int64
order_cnt_30d            int64
order_cnt_60d            int64
coupon_id                int64
coupon_product_name     object
during_days              int64
discount_total           int64
original_price           int64
pay_amt                  int64
dtype: object

In [5]:
data['age'] = pd.cut(data['age'], bins=[0, 15, 25, 35, 45, 60, 100], labels=['0-15', '15-25', '25-35',
                                                                             '35-45', '45-60', '60-100'])

In [6]:
user_sparse_features = ['user_id', 'age', 'gender']
# 'order_cnt_7d','order_cnt_14d','order_cnt_30d','order_cnt_60d'
user_dense_features = ['order_cnt_3d']
item_features = ['coupon_id', 'during_days', 'discount_total', 'original_price']
    

In [8]:
feature_max_idx = {}
lbe_list = []
# 类别特征编码
for feature in user_sparse_features + item_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
    lbe_list.append(lbe)
    # 记录每个类别特征最大取值数量
    feature_max_idx[feature] = data[feature].max() + 1
feature_max_idx

{'user_id': 379950,
 'age': 5,
 'gender': 2,
 'coupon_id': 298,
 'during_days': 17,
 'discount_total': 28,
 'original_price': 34}

In [27]:
lbe_list[0].classes_

array([  911919,   912175,   912254, ..., 88423337, 88423431, 88423504])

In [26]:
lbe_list[-4].classes_

array([4922, 4923, 4930, 4931, 4943, 4955, 4965, 4969, 4976, 4977, 4980,
       4981, 4983, 4984, 4985, 5009, 5017, 5039, 5040, 5042, 5048, 5056,
       5058, 5061, 5068, 5076, 5079, 5080, 5083, 5084, 5085, 5086, 5087,
       5088, 5090, 5091, 5093, 5094, 5095, 5097, 5099, 5101, 5102, 5104,
       5105, 5106, 5108, 5115, 5117, 5118, 5119, 5120, 5122, 5124, 5125,
       5182, 5185, 5189, 5195, 5198, 5201, 5203, 5204, 5205, 5206, 5207,
       5208, 5209, 5210, 5211, 5212, 5213, 5214, 5215, 5216, 5217, 5218,
       5219, 5220, 5221, 5227, 5229, 5231, 5233, 5235, 5237, 5239, 5241,
       5243, 5245, 5247, 5249, 5250, 5251, 5331, 5332, 5335, 5336, 5626,
       5628, 5630, 5631, 5632, 5638, 5779, 5884, 5886, 5887, 5889, 5891,
       5893, 5894, 5896, 5897, 5899, 5900, 5901, 5902, 5904, 5906, 5907,
       5908, 5909, 5911, 5914, 5915, 5916, 5918, 5919, 5920, 6091, 6092,
       6093, 6094, 6095, 6096, 6098, 6099, 6100, 6102, 6107, 6111, 6145,
       6149, 6150, 6151, 6153, 6155, 6157, 6158, 61

In [13]:
def sparseFeature(feat, feat_num, feat_len=1, embed_dim=4):
    """
    create dictionary for varlen sparse feature
    :param feat: feature name
    :param feat_num: the total number of sparse features that do not repeat
    :param feat_len: while feature is array: feat_len > 1
    :param embed_dim: embedding dimension
    :return:
    """
    return {'feat': feat, 'feat_num': feat_num, 'feat_len': feat_len, 'embed_dim': embed_dim}


In [14]:
# 每个特征组装成字典格式
embed_dim=16
user_sparse_feat_cols = []
user_sparse_feat_cols.append(sparseFeature(feat='user_id', feat_num=feature_max_idx['user_id'], embed_dim=embed_dim))
user_sparse_feat_cols = user_sparse_feat_cols + [sparseFeature(feat=uf, feat_num=feature_max_idx[uf]) for uf in ['age', 'gender']]

user_dense_feat_cols = []
user_dense_feat_cols.append({'feat':'order_cnt_3d'})

item_feat_cols = []
item_feat_cols.append(sparseFeature(feat='coupon_id', feat_num=feature_max_idx['coupon_id'], embed_dim=embed_dim))
item_type = ['during_days', 'discount_total', 'original_price']
item_feat_cols = item_feat_cols + [sparseFeature(feat=mt, feat_num=feature_max_idx[mt]) for mt in item_type]



In [15]:
user_sparse_feat_cols

[{'feat': 'user_id', 'feat_num': 379950, 'feat_len': 1, 'embed_dim': 16},
 {'feat': 'age', 'feat_num': 5, 'feat_len': 1, 'embed_dim': 4},
 {'feat': 'gender', 'feat_num': 2, 'feat_len': 1, 'embed_dim': 4}]

In [16]:
user_dense_feat_cols

[{'feat': 'order_cnt_3d'}]

In [17]:
item_feat_cols

[{'feat': 'coupon_id', 'feat_num': 298, 'feat_len': 1, 'embed_dim': 16},
 {'feat': 'during_days', 'feat_num': 17, 'feat_len': 1, 'embed_dim': 4},
 {'feat': 'discount_total', 'feat_num': 28, 'feat_len': 1, 'embed_dim': 4},
 {'feat': 'original_price', 'feat_num': 34, 'feat_len': 1, 'embed_dim': 4}]

In [18]:
train, test = train_test_split(data, test_size=0.1)

train_X = [{feat: train[feat].values for feat in user_sparse_features },
           {feat: train[feat].values for feat in user_dense_features },
           {feat: train[feat].values for feat in item_features}]
train_X

[{'user_id': array([289342, 327184,  28287, ..., 311450, 160351, 330304]),
  'age': array([0, 3, 1, ..., 1, 2, 0]),
  'gender': array([0, 0, 1, ..., 0, 0, 1])},
 {'order_cnt_3d': array([0.30769231, 0.        , 0.1025641 , ..., 0.        , 0.        ,
         0.        ])},
 {'coupon_id': array([ 21,  67, 139, ...,  63, 140,  67]),
  'during_days': array([10,  4,  8, ...,  9,  3,  4]),
  'discount_total': array([8, 4, 8, ..., 5, 3, 4]),
  'original_price': array([10,  5, 10, ...,  6,  4,  5])}]

In [19]:
train_y = train['label'].values.astype('int32')

test_X = [{feat: test[feat].values for feat in user_sparse_features  },
          {feat: test[feat].values for feat in user_dense_features },
          {feat: test[feat].values for feat in item_features}]

test_y = test['label'].values.astype('int32')
train_y

array([0, 1, 0, ..., 0, 0, 1], dtype=int32)

In [20]:
import os
import sys
sys.path.append('/Users/lxh/Desktop/recommend-tf2.0/src')
import numpy as np
import faiss
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from model2 import Dssm, DNN

In [21]:

os.environ['CUDA_VISIBLE_DEVICES'] = '2, 3'
# ========================= Hyper Parameters =======================
embed_dim = 16
dnn_dropout = 0.3
hidden_units = [256, 128, 64]

learning_rate = 0.005
batch_size = 512
epochs = 1

# ========================== Create dataset =======================
# user_feat_cols,user_dense_feat_cols, item_feat_cols, train_X, train_y, test_X, test_y = create_ml_100k_dataset_test(embed_dim=embed_dim)
user_feat_cols,user_dense_feat_cols, item_feat_cols = user_sparse_feat_cols,user_dense_feat_cols,item_feat_cols
# ============================Build Model==========================

model = Dssm(user_sparse_feature_columns=user_feat_cols,user_dense_feature_columns=user_dense_feat_cols,  \
                 item_sparse_feature_columns=item_feat_cols, dnn_dropout=dnn_dropout)
    # model = dssm.summary()
    # model.summary()
    # ============================Compile============================
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=Adam(learning_rate=learning_rate))
#         loss = sampledsoftmaxloss

# ==============================Fit==============================
model.fit(
    train_X,
    train_y,
    epochs=epochs,
    # callbacks=[EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)],  # checkpoint
#         callbacks=[checkpoint],
    batch_size=batch_size,
    validation_split=0.1
)

path = 'saved_model_sg_data2'
model.save(path, save_format='tf') 


   





INFO:tensorflow:Assets written to: saved_model_sg_data2/assets


INFO:tensorflow:Assets written to: saved_model_sg_data2/assets


In [22]:
custom_objects = {"Dssm": Dssm}
loaded_model = tf.keras.models.load_model(path, custom_objects=custom_objects)
model = loaded_model.build_graph()



In [23]:
user_embed_model = Model(inputs=[model.user_input,model.user_dense_input], outputs=model.user_embed)
item_embed_model = Model(inputs=model.item_input, outputs=model.item_embed)



In [25]:
user_embs = user_embed_model.predict([test_X[0],test_X[1]])

item_embs = item_embed_model.predict(test_X[2])
user_embs = tf.squeeze(user_embs)
item_embs = tf.squeeze(item_embs)

# ===========================recommend==============================
index = faiss.IndexFlatIP(item_embs.shape[1])
# faiss.normalize_L2(item_embs)
index.add(np.array(item_embs))
# faiss.normalize_L2(user_embs)
D, I = index.search(np.ascontiguousarray(user_embs), 10)

item_index_mapping = {}  # {item_matrix_index: item_id}
index = 0
for i, item_id in enumerate(train_X[2]['coupon_id']):
    item_index_mapping[index] = int(item_id)
    index += 1


recommed_dict = {}
for i, uid in enumerate(test_X[0]['user_id']):
    recommed_dict.setdefault(uid, [])
    try:
        pred = [item_index_mapping[x] for x in I[i]]
        recommed_dict[uid] = pred
    except:
        print(i)

print(recommed_dict)

{340058: [33, 141, 61, 63, 21, 95, 63, 43, 74, 77], 356912: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 8395: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 289181: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 235081: [47, 141, 88, 15, 0, 65, 74, 85, 22, 63], 295314: [63, 73, 140, 138, 136, 3, 17, 61, 0, 141], 157717: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 181331: [47, 141, 88, 15, 0, 65, 74, 85, 22, 63], 286542: [17, 63, 34, 22, 33, 136, 65, 34, 34, 214], 126422: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 339493: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 289963: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 292283: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 176875: [17, 63, 34, 22, 33, 136, 65, 34, 34, 214], 89398: [94, 67, 85, 22, 63, 75, 215, 138, 47, 52], 183686: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 15927: [67, 85, 22, 63, 75, 215, 138, 47, 52, 96], 360959: [44, 46, 60, 22, 69, 32, 63, 34, 197, 96], 54190: [67, 85, 22, 63, 75, 215, 138, 47, 52, 96], 322657: [63, 73, 140, 138, 136,

In [75]:
## 向量测试
user_embs

<tf.Tensor: shape=(45127, 32), dtype=float32, numpy=
array([[6.2370969e-18, 2.5232330e-18, 6.2691216e-18, ..., 1.6382405e-17,
        0.0000000e+00, 0.0000000e+00],
       [3.1030102e-04, 0.0000000e+00, 1.3750757e-03, ..., 0.0000000e+00,
        0.0000000e+00, 1.4940371e-04],
       [6.2060205e-04, 0.0000000e+00, 2.7501513e-03, ..., 0.0000000e+00,
        0.0000000e+00, 2.9880743e-04],
       ...,
       [5.4951883e-18, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 4.5493813e-18],
       [1.2412041e-03, 0.0000000e+00, 5.5003027e-03, ..., 0.0000000e+00,
        0.0000000e+00, 5.9761485e-04],
       [3.0947205e-18, 0.0000000e+00, 2.5066629e-18, ..., 9.7066866e-18,
        0.0000000e+00, 0.0000000e+00]], dtype=float32)>

In [29]:
lbe_list[0].classes_[340057]

84496179

In [30]:
lbe_list[-4].classes_[32]

5087

In [38]:
data[data.user_id==340058]

Unnamed: 0,user_id,label,gender,age,order_cnt_3d,order_cnt_7d,order_cnt_14d,order_cnt_30d,order_cnt_60d,coupon_id,coupon_product_name,during_days,discount_total,original_price,pay_amt
5688,340058,1,0,1,0.0,1,1,1,4,136,5天3次卡,3,1,1,490


In [42]:
dfs = pd.read_csv('data/data.csv')


In [47]:
dfs[dfs.coupon_id==5087]

9         5天3次卡
23        5天3次卡
32        5天3次卡
42        5天3次卡
43        5天3次卡
          ...  
450977    5天3次卡
451007    5天3次卡
451031    5天3次卡
451135    5天3次卡
451170    5天3次卡
Name: coupon_product_name, Length: 9563, dtype: object

In [52]:
cid_list = [33, 141, 61, 63, 21, 95, 63, 43, 74, 77]
cid_list2 = list(map(lambda x:lbe_list[-4].classes_[x-1],cid_list))
cid_list2

[5087, 6107, 5201, 5204, 5048, 5331, 5204, 5102, 5215, 5218]

In [55]:
dfs[dfs.coupon_id.isin(cid_list2)].coupon_product_name.unique()

array(['6天10次卡', '5天3次卡', '5天5次卡', '30天10次卡', '6天15次卡', '8天5次卡', '20天7次卡',
       '7天5次卡'], dtype=object)

In [56]:
data[data.user_id==356912]

Unnamed: 0,user_id,label,gender,age,order_cnt_3d,order_cnt_7d,order_cnt_14d,order_cnt_30d,order_cnt_60d,coupon_id,coupon_product_name,during_days,discount_total,original_price,pay_amt
336637,356912,0,0,0,0.025641,1,7,11,11,44,30天15次卡,13,11,14,1580


In [57]:

cid_list = [44, 46, 60, 22, 69, 32, 63, 34, 197, 96]
cid_list2 = list(map(lambda x:lbe_list[-4].classes_[x-1],cid_list))
cid_list2

[5104, 5106, 5198, 5056, 5210, 5086, 5204, 5088, 6280, 5332]

In [58]:
dfs[dfs.coupon_id.isin(cid_list2)].coupon_product_name.unique()

array(['15天10次卡', '10天5次卡', '14天10次卡', '8天5次卡', '7天10次卡', '20天45次卡',
       '6天4次卡', '30天7次卡', '7天7次卡'], dtype=object)

In [59]:
data[data.user_id==53967]

Unnamed: 0,user_id,label,gender,age,order_cnt_3d,order_cnt_7d,order_cnt_14d,order_cnt_30d,order_cnt_60d,coupon_id,coupon_product_name,during_days,discount_total,original_price,pay_amt
425922,53967,0,0,2,0.0,0,0,4,4,34,15天7次卡,10,5,6,880


In [60]:
cid_list = [63, 73, 140, 138, 136, 3, 17, 61, 0, 141]
cid_list2 = list(map(lambda x:lbe_list[-4].classes_[x-1],cid_list))
dfs[dfs.coupon_id.isin(cid_list2)].coupon_product_name.unique()

array(['25天20次卡', '10天10次卡', '8天5次卡', '5天5次卡', '20天7次卡', '10天15次卡',
       '5天3次卡', '10天5次卡', '25天45次卡', '30天90次卡'], dtype=object)

In [61]:
data[data.user_id==156429]


Unnamed: 0,user_id,label,gender,age,order_cnt_3d,order_cnt_7d,order_cnt_14d,order_cnt_30d,order_cnt_60d,coupon_id,coupon_product_name,during_days,discount_total,original_price,pay_amt
254245,156429,0,0,3,0.0,0,1,4,4,62,8天5次卡,6,3,4,640


In [62]:

cid_list = [141, 88, 15, 17, 0, 65, 74, 85, 85, 22]
cid_list2 = list(map(lambda x:lbe_list[-4].classes_[x-1],cid_list))
dfs[dfs.coupon_id.isin(cid_list2)].coupon_product_name.unique()

array(['6天5次卡', '6天10次卡', '15天7次卡', '15天10次卡', '5天5次卡', '10天15次卡',
       '30天45次卡', '30天7次卡', '30天90次卡'], dtype=object)

In [63]:
data[data.user_id==72646]


Unnamed: 0,user_id,label,gender,age,order_cnt_3d,order_cnt_7d,order_cnt_14d,order_cnt_30d,order_cnt_60d,coupon_id,coupon_product_name,during_days,discount_total,original_price,pay_amt
55391,72646,0,1,1,0.102564,9,29,55,92,138,7天7次卡,5,5,6,1100


In [64]:

cid_list = [22, 63, 75, 43, 215, 138, 47, 51, 52, 96]
cid_list2 = list(map(lambda x:lbe_list[-4].classes_[x-1],cid_list))
dfs[dfs.coupon_id.isin(cid_list2)].coupon_product_name.unique()

array(['30天10次卡', '30天30次卡', '8天5次卡', '7天10次卡', '30天60次卡', '15天20次卡',
       '14天20次卡', '15天10次卡', '5天5次卡', '10天5次卡'], dtype=object)

In [65]:
data[data.user_id==305937]


Unnamed: 0,user_id,label,gender,age,order_cnt_3d,order_cnt_7d,order_cnt_14d,order_cnt_30d,order_cnt_60d,coupon_id,coupon_product_name,during_days,discount_total,original_price,pay_amt
366363,305937,0,0,1,0.0,0,6,9,17,64,6天5次卡,4,3,4,700


In [66]:

    
cid_list =  [140, 44, 46, 60, 22, 69, 32, 63, 34, 197]
cid_list2 = list(map(lambda x:lbe_list[-4].classes_[x-1],cid_list))
dfs[dfs.coupon_id.isin(cid_list2)].coupon_product_name.unique()

array(['15天10次卡', '10天5次卡', '10天10次卡', '14天10次卡', '8天5次卡', '20天45次卡',
       '6天4次卡', '30天7次卡', '7天7次卡'], dtype=object)

In [67]:
data[data.user_id==81737]

Unnamed: 0,user_id,label,gender,age,order_cnt_3d,order_cnt_7d,order_cnt_14d,order_cnt_30d,order_cnt_60d,coupon_id,coupon_product_name,during_days,discount_total,original_price,pay_amt
319755,81737,1,1,2,0.0,0,1,2,5,17,3天3次卡,1,1,1,450


In [68]:

    
cid_list = [24, 85, 141, 61, 63, 21, 95, 63, 43, 74]
cid_list2 = list(map(lambda x:lbe_list[-4].classes_[x-1],cid_list))
dfs[dfs.coupon_id.isin(cid_list2)].coupon_product_name.unique()

array(['6天10次卡', '5天5次卡', '30天10次卡', '15天8次卡', '8天5次卡', '20天7次卡',
       '30天7次卡', '7天5次卡'], dtype=object)

In [72]:
len(train.coupon_id.unique().tolist())

292

In [73]:
len(test.coupon_id.unique().tolist())

220

In [74]:
set(test.coupon_id.unique().tolist()).difference(set(train.coupon_id.unique().tolist()))

{108, 158, 187, 212, 223, 270}

- 1.为什么出现推荐的商品重复？
  - 因为灌入faiss的商品重复了，我们预测数据中商品要全包含所有的商品，因为商品一般而言是固定的，直接全部灌入faiss计算相似
- 2.多久更新一次用户和商品向量？
  - 一般而言特征有静态特征和动态特征。如果要求每天推最新的，那么就需要定时任务每天计算所有用户和商品特征向量，然后相似推荐。
  - 如果是用户/商品更新没那么频繁，特征更新也没那么频繁，可以提前存起来直接推荐。或是存储商品计算用户最新向量/存储用户计算商品最新向量相似推荐。
  - 如果特征有实时特征，那么就需要分钟/小时级别的推荐了
- 3.用户冷启动如何推荐？
  - 如果高频用户可以存下id对应的特征向量，直接在线计算商品的特征向量，然后相似度推荐。新用户可以输入特征计算向量，但更多可以固定推荐策略
- 4.需要id作为特征吗？
  - 不用，因为是dnn输出的最后一层向量计算相似，用户id嵌入的向量在我们的任务场景里没什么主要作用。
  - 如果是用户特征只有id，那么他的嵌入向量作用可能比较明显，相似用户嵌入向量相近，但是最主要的还是dnn最后一层向量有用
- 5.那为什么语言任务需要使用字符文字的embedding？
  - 因为字符是固定的，通用的，而且使用的都是字符的嵌入向量，位置坐标等特征，语义明确

