In [1]:
!pip install deepmatch

Collecting deepmatch
  Downloading https://files.pythonhosted.org/packages/e4/ae/6fec2e57b922ce8832653278bc9bd7a3d844d725f3c2b7c3cd544cba1a90/deepmatch-0.2.0-py3-none-any.whl
Collecting deepctr==0.8.2
[?25l  Downloading https://files.pythonhosted.org/packages/5d/61/fb1c7f06f0fed2be82068f365824532afcf0bbed77e85cdb4107196ea0bf/deepctr-0.8.2-py3-none-any.whl (110kB)
[K     |████████████████████████████████| 112kB 14.6MB/s 
Installing collected packages: deepctr, deepmatch
Successfully installed deepctr-0.8.2 deepmatch-0.2.0


In [2]:
!pip install faiss-gpu

Collecting faiss-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/7d/32/8b29e3f99224f24716257e78724a02674761e034e6920b4278cc21d19f77/faiss_gpu-1.6.5-cp36-cp36m-manylinux2014_x86_64.whl (67.6MB)
[K     |████████████████████████████████| 67.7MB 56kB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.6.5


In [3]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')

# Root of the mounted Drive.
path = "/content/drive/My Drive"

# Make the Drive root the working directory so the relative paths used by the
# following cells resolve against it, then display its contents.
os.chdir(path)
os.listdir(path)

Mounted at /content/drive


['.ipynb_checkpoints', 'machine learning', 'TIANCHI04', 'N05', 'out']

In [4]:
import time
from tqdm import tqdm
import collections
import math
import pickle
from datetime import datetime
import numpy as np
import pandas as pd
import random

In [5]:
import pandas as pd  
import numpy as np
from tqdm import tqdm  
from collections import defaultdict  
import os, math, warnings, math, pickle
from tqdm import tqdm
import collections
import faiss
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss
warnings.filterwarnings('ignore')

In [6]:
# Explicit compact dtypes, passed to pd.read_csv by the loader functions to
# reduce memory usage.
# Column types for the article metadata CSV.
article_dtypes = {
    "article_id": "int32",
    "category_id": "int16",
    "created_at_ts": "int64",
    "words_count": "int16"}

# Column types for the click-log CSVs.
click_log_dtypes = {
    "user_id": "int32",
    "click_article_id": "int32",
    "click_timestamp": "int64",
    "click_environment": "int8",
    "click_deviceGroup": "int8",
    "click_os": "int8",
    "click_country": "int8",
    "click_region": "int8",
    "click_referrer_type": "int8"}

In [7]:
def get_train_click_df(data_save_path, name='train_click_log.csv'):
    """Load the training click log.

    The file path is built by plain string concatenation of
    ``data_save_path`` and ``name`` (so ``data_save_path`` must already end
    with a path separator). Columns are read with ``click_log_dtypes``.

    Returns:
        The click log as a pandas DataFrame.
    """
    csv_path = data_save_path + name
    return pd.read_csv(csv_path, dtype=click_log_dtypes)

def get_test_click_df(data_save_path, name='testA_click_log.csv'):
    """Load the test click log.

    The file path is built by plain string concatenation of
    ``data_save_path`` and ``name`` (so ``data_save_path`` must already end
    with a path separator). Columns are read with ``click_log_dtypes``.

    Returns:
        The click log as a pandas DataFrame.
    """
    csv_path = data_save_path + name
    return pd.read_csv(csv_path, dtype=click_log_dtypes)

def get_item_info_df(data_save_path, name='articles.csv'):
    """Load the article feature table.

    The file path is built by plain string concatenation of
    ``data_save_path`` and ``name``. Columns are read with ``article_dtypes``
    and ``article_id`` is renamed to ``click_article_id`` so the key matches
    the click-log column name.

    Returns:
        The article features as a pandas DataFrame.
    """
    csv_path = data_save_path + name
    articles = pd.read_csv(csv_path, dtype=article_dtypes)
    return articles.rename(columns={'article_id': 'click_article_id'})

In [8]:
# Data directory, relative to the current working directory. The trailing
# slash is required because the loaders build paths by string concatenation.
data_path = "N05/data/"

In [9]:
# Load the two click logs. The file names suggest an offline train/validation
# split (TODO confirm). Both loaders read a click-log CSV with
# click_log_dtypes, so the "test" loader is reused for the validation file.
trn_df = get_train_click_df(data_path, "offline_trn_df.csv")
val_df = get_test_click_df(data_path, "offline_val_df.csv")

In [10]:
# Stack both click logs into one frame. pd.concat replaces DataFrame.append,
# which is deprecated since pandas 1.4 and removed in pandas 2.0; as with
# append's defaults, the original row index labels are kept unchanged.
sample_df = pd.concat([trn_df, val_df])
# Article metadata (the loader renames article_id to click_article_id).
item_info_df = get_item_info_df(data_path)

In [11]:
# Snapshot of the raw (not yet label-encoded) ids: one row per distinct user
# and per distinct article, in order of first appearance in sample_df. The
# trailing underscore marks these as the raw-id variants; they are paired
# positionally with the encoded ids when the index -> raw id maps are built.
user_profile_ = sample_df[["user_id"]].drop_duplicates('user_id')
item_profile_ = sample_df[["click_article_id"]].drop_duplicates('click_article_id')

In [12]:
# Id features used by the two towers.
user_features = ["user_id", "click_article_id"]
# Click-context features that the user tower declares as SparseFeat columns.
# Their vocabulary sizes are read from feature_max_idx when the model is
# built, so they must be encoded here as well; otherwise that lookup raises
# KeyError.
context_features = ["click_environment", "click_deviceGroup", "click_os",
                    "click_country", "click_region", "click_referrer_type"]
feature_max_idx = {}
for feature in user_features + context_features:
    lbe = LabelEncoder()
    # Encode to 1..n so that index 0 stays free for sequence padding.
    sample_df[feature] = lbe.fit_transform(sample_df[feature]) + 1
    # Vocabulary size: largest encoded index plus one (to account for index 0).
    feature_max_idx[feature] = sample_df[feature].max() + 1

In [13]:
# One row per encoded user. The click-context columns are kept because
# gen_model_input looks them up per user from this frame; with only user_id
# present that lookup raises KeyError.
# NOTE(review): drop_duplicates keeps each user's first row in sample_df, so
# the context values are those of that single click — confirm this is the
# intended per-user representation.
user_profile = sample_df[["user_id", "click_environment", "click_deviceGroup", "click_os",
                          "click_country", "click_region", "click_referrer_type"]].drop_duplicates('user_id')
# One row per encoded article.
item_profile = sample_df[["click_article_id"]].drop_duplicates('click_article_id')

In [14]:
# Encoded index -> raw id lookup tables. The encoded and raw profiles are
# paired positionally, which relies on both having been de-duplicated from
# sample_df in the same row order.
user_index_2_rawid = {
    encoded_id: raw_id
    for encoded_id, raw_id in zip(user_profile['user_id'], user_profile_['user_id'])
}
item_index_2_rawid = {
    encoded_id: raw_id
    for encoded_id, raw_id in zip(item_profile['click_article_id'], item_profile_['click_article_id'])
}

In [15]:
# Index the profile by encoded user_id so rows can be fetched with .loc.
# Guarded and not in-place so the cell can be re-run: once user_id has been
# moved into the index, a second set_index("user_id") would raise KeyError.
if "user_id" in user_profile.columns:
    user_profile = user_profile.set_index("user_id")

In [16]:
# Build the training and evaluation samples for the two-tower recall model.
# negsample is the number of negatives drawn per positive while sliding a
# window over each user's click history.
def gen_data_set(click_df, negsample=0):
    """Turn a click log into sliding-window samples.

    Every sample is a tuple
    ``(user_id, reversed_history, item_id, label, history_length)`` where
    ``reversed_history`` lists the items clicked before ``item_id``, most
    recent first.

    Args:
        click_df: DataFrame with ``user_id``, ``click_article_id`` and
            ``click_timestamp`` columns. It is sorted by ``click_timestamp``
            IN PLACE, so the caller's frame is reordered.
        negsample: number of negative items sampled per positive sample;
            0 disables negative sampling.

    Returns:
        ``(train_set, test_set)``: two shuffled lists of sample tuples. The
        test set holds one positive sample per user (the last click predicted
        from all earlier clicks); that sample is also part of the train set.
    """
    click_df.sort_values("click_timestamp", inplace=True)
    item_ids = click_df["click_article_id"].unique()
    # Loop-invariant: build the full item set once instead of once per user.
    all_items = set(item_ids)

    train_set = []
    test_set = []
    # The per-user frame gets its own name so the history list below does not
    # shadow it.
    for reviewerId, user_clicks in tqdm(click_df.groupby('user_id')):

        pos_list = user_clicks["click_article_id"].tolist()

        if negsample > 0:
            # Negatives come from the articles this user never clicked ...
            neg_candidate_set = list(all_items - set(pos_list))
            # ... with negsample draws (with replacement) per positive.
            neg_list = np.random.choice(neg_candidate_set, size=len(pos_list) * negsample, replace=True)

        # A user with a single click still has to appear in the training data,
        # otherwise the learned embeddings would have gaps.
        if len(pos_list) == 1:
            train_set.append((reviewerId, [], pos_list[0], 1, len(pos_list)))
            test_set.append((reviewerId, [], pos_list[0], 1, len(pos_list)))

        # Sliding window: predict click i from the clicks before it.
        last_idx = len(pos_list) - 1
        for i in range(1, len(pos_list)):
            # Most recent click first; computed once and shared by the samples
            # built for this position.
            hist = pos_list[:i][::-1]
            hist_len = len(hist)

            train_set.append((reviewerId, hist, pos_list[i], 1, hist_len))
            if i != last_idx:
                for negi in range(negsample):
                    # Negative sample: (user_id, history, neg_item, 0, len(history)).
                    train_set.append((reviewerId, hist, neg_list[i * negsample + negi], 0, hist_len))
            else:
                # The longest window (the user's last click) is the test sample.
                test_set.append((reviewerId, hist, pos_list[i], 1, hist_len))

    random.shuffle(train_set)
    random.shuffle(test_set)

    return train_set, test_set

In [17]:
# Pad the history sequences so every sample has the same sequence length.
def gen_model_input(train_set, user_profile, seq_max_len):
    """Convert sample tuples into the model's input dict and label array.

    Args:
        train_set: list of ``(user_id, history, item_id, label, history_length)``
            tuples as produced by ``gen_data_set``.
        user_profile: DataFrame indexed by user_id that contains the click
            context columns listed below; a missing column raises KeyError.
        seq_max_len: length every history is padded / truncated to.

    Returns:
        ``(model_input, label)``: a dict of numpy arrays keyed by feature name
        and the array of sample labels.
    """
    train_uid = np.array([line[0] for line in train_set])
    train_seq = [line[1] for line in train_set]
    train_iid = np.array([line[2] for line in train_set])
    # The label is field 3 of each tuple; field 2 is the target item id
    # (already used for train_iid above).
    train_label = np.array([line[3] for line in train_set])
    train_hist_len = np.array([line[4] for line in train_set])

    # Pad with 0 at the end; over-long histories are cut at the end as well.
    train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)
    train_model_input = {"user_id": train_uid, "click_article_id": train_iid, "hist_article_id": train_seq_pad,
                         "hist_len": train_hist_len}

    # Fetch each sample's profile row once, instead of repeating the full-frame
    # .loc lookup for every feature.
    profile_rows = user_profile.loc[train_uid]
    for key in ["click_environment", "click_deviceGroup", "click_os", "click_country","click_region","click_referrer_type"]:
        train_model_input[key] = profile_rows[key].values

    return train_model_input, train_label

In [None]:
train_set, test_set = gen_data_set(sample_df, 1)
# Context managers close (and flush) the pickle files deterministically
# instead of leaving the handles open until garbage collection.
with open('train_set_0.pkl', 'wb') as fout:
    pickle.dump(train_set, fout)
with open('test_set_0.pkl', 'wb') as fout:
    pickle.dump(test_set, fout)
# To skip the slow rebuild, reload the cached sets instead (file names match
# the dumps above):
#train_set = pickle.load(open('train_set_0.pkl', 'rb'))
#test_set = pickle.load(open('test_set_0.pkl', 'rb'))

100%|██████████| 200000/200000 [24:39<00:00, 135.15it/s]


In [None]:
# Length every click history is padded / truncated to by gen_model_input.
SEQ_LEN = 100
# Model inputs and labels for the train and test sample sets.
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

In [None]:
# 2.count #unique features for each sparse field and generate feature config for sequence feature

embedding_dim = 32

# SparseFeat(name, vocabulary_size, embedding_dim)
# VarLenSparseFeat(SparseFeat(), SEQ_LEN, 'mean', 'hist_len'),  sequence feature
# The history feature passes embedding_name="click_article_id", the same name
# as the item tower's feature.
user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                        SparseFeat("click_environment", feature_max_idx['click_environment'], 16),
                        SparseFeat("click_deviceGroup", feature_max_idx['click_deviceGroup'], 16),
                        SparseFeat("click_os", feature_max_idx['click_os'], 16),
                        SparseFeat("click_country", feature_max_idx['click_country'], 16),
                        SparseFeat("click_region", feature_max_idx['click_region'], 16),
                        SparseFeat("click_referrer_type", feature_max_idx['click_referrer_type'], 16),
                        VarLenSparseFeat(SparseFeat('hist_article_id', feature_max_idx['click_article_id'], embedding_dim,
                                                    embedding_name="click_article_id"), SEQ_LEN, 'mean', 'hist_len'),]

item_feature_columns = [SparseFeat('click_article_id', feature_max_idx['click_article_id'], embedding_dim),]

# 3.Define Model and train

K.set_learning_phase(True)

import tensorflow as tf
# Compare the major version numerically: a plain string comparison is
# lexicographic and would, for example, rank '10.0.0' below '2.0.0'.
if int(tf.__version__.split('.')[0]) >= 2:
    tf.compat.v1.disable_eager_execution()

#model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=100, user_dnn_hidden_units=(128, 64, embedding_dim))
model = MIND(user_feature_columns,item_feature_columns,dynamic_k=False,p=1,k_max=5,num_sampled=100,user_dnn_hidden_units=(128,64, embedding_dim))

model.compile(optimizer="adam", loss=sampledsoftmaxloss)  # "binary_crossentropy")

history = model.fit(train_model_input, train_label,  # train_label,
                    batch_size=512, epochs=30, verbose=1, validation_split=0.0, )
# NOTE(review): the file is still named after the YoutubeDNN variant although
# the trained model is MIND.
model.save("N05/model/ytb_model.h5")

In [None]:
# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
all_item_model_input = {"click_article_id": item_profile['click_article_id'].values,}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

# The predict call has to run: with it commented out, user_embs is undefined
# on a fresh kernel and the slicing below raises NameError.
user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# For MIND the interest axis is indexed with i in [0, k_max); with k_max=5 the
# previous index 5 is out of range.
# NOTE(review): which interest vector to retrieve with is a modelling choice;
# 0 is used here because it is valid for any k_max — confirm.
interest_idx = 0
user_embs = user_embs[:, interest_idx, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)
# Context managers close (and flush) the pickle files deterministically.
with open('N05/FE/offline_user_embs.pkl', 'wb') as fout:
    pickle.dump(user_embs, fout)
with open('N05/FE/offline_item_embs.pkl', 'wb') as fout:
    pickle.dump(item_embs, fout)

In [None]:
# Load the sample submission file from the data directory.
submit_df = pd.read_csv("N05/data/sample_submit.csv")

In [None]:
import numpy as np
import faiss
from tqdm import tqdm
from deepmatch.utils import recall_N

# Encoded user id -> single-element list holding that user's held-out item.
test_true_label = dict((line[0], [line[2]]) for line in test_set)

# Exact inner-product index over all item embeddings (no L2 normalisation).
index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
# Top-50 items per test user: D holds the scores, I the row positions in
# item_embs. The query array is made contiguous before it is handed to faiss.
D, I = index.search(np.ascontiguousarray(user_embs), 50)

In [None]:
s = []
hit = 0
# raw user id -> {raw article id: retrieval score}
user_recall_items_dict = collections.defaultdict(dict)
# Hoisted out of the loops: the encoded article id stored at each index row.
encoded_item_ids = item_profile['click_article_id'].values
for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    # Recover the raw user id through the lookup table, as is done for items;
    # `uid - 1` is only correct when the raw ids happen to be exactly 0..n-1.
    raw_uid = user_index_2_rawid[uid]
    for rank, item_pos in enumerate(I[i]):
        if item_pos < 0:
            # faiss pads with -1 when fewer than k neighbours exist; without
            # this guard -1 would silently index the last article.
            continue
        raw_item = item_index_2_rawid[encoded_item_ids[item_pos]]
        user_recall_items_dict[raw_uid][raw_item] = D[i][rank]

In [None]:
# Flatten each user's {article: score} mapping into a list of (article, score)
# pairs. No explicit sort is applied: the pairs keep the dict's insertion order.
ytb_sort_recall_dict = {
    user_id: list(item_dict.items())
    for user_id, item_dict in tqdm(user_recall_items_dict.items())
}

In [None]:
# The context manager closes (and flushes) the file deterministically instead
# of leaving the handle open until garbage collection.
with open('N05/recall/ytb_sort_recall_dict.pkl', 'wb') as fout:
    pickle.dump(ytb_sort_recall_dict, fout)

In [None]:
# Spot check: score of the second-ranked result for the first query row.
D[0][1]

In [None]:
# Spot check of the entry stored under key 1. Because this is a
# defaultdict(dict), looking up a missing key inserts an empty dict for it.
user_recall_items_dict[1]

# NOTE(review): this path differs from the "N05/data/sample_submit.csv" used
# by the earlier load of the same file — confirm which one is correct for the
# current working directory.
submit_df = pd.read_csv("../data/sample_submit.csv")

# NOTE(review): youtubednn_u2i_dict is not defined in the visible part of this
# notebook; unless it is defined elsewhere this line raises NameError. The
# pickle file handle below is also never closed explicitly.
youtube_user_recall_items_dict = youtubednn_u2i_dict(sample_df, topk=40)
pickle.dump(youtube_user_recall_items_dict, open('youtube_user_recall_items_dict.pkl', 'wb'))