In [0]:
#https://blog.csdn.net/u013714645/article/details/97899342
import pandas as pd
from deepctr.inputs import SparseFeat, VarLenSparseFeat, DenseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss
from google.colab import drive
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import math
from sklearn.externals import joblib
import os
import random
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import csv

In [0]:
# Notebook-wide settings.
SEQ_LEN = 50    # maximum click-history length passed to gen_model_input / VarLenSparseFeat
now_phase = 6   # click files for phases 0..now_phase are loaded; also names the submit file
data_path = '/content/gdrive/My Drive/project_data/'

# Mount Google Drive under /content/gdrive so that data_path is reachable.
drive.mount('/content/gdrive')

In [0]:
! pip install deepmatch
! pip install deepctr
! pip install faiss-cpu

In [0]:
! wget http://tianchi-public-us-east-download.oss-us-east-1.aliyuncs.com/231785/underexpose_train.zip -O /content/gdrive/My Drive/underexpose_train.zip 
! wget http://tianchi-public-us-east-download.oss-us-east-1.aliyuncs.com/231785/underexpose_test.zip -O /content/gdrive/My Drive/underexpose_test.zip
! unzip -o /content/gdrive/My Drive/underexpose_train.zip  
! unzip -o /content/gdrive/My Drive/underexpose_test.zip 

In [0]:
# Make the project data folder on the mounted Drive the working directory.
%cd /content/gdrive/My Drive/project_data

In [0]:
# List the contents of the current working directory.
!ls

In [0]:
def read_item_feat(path, n_dims=128):
    """Load the item feature file into a numeric DataFrame.

    Each row of the file is an item id followed by two bracketed vectors of
    ``n_dims`` numbers each. Splitting on commas leaves a '[' glued to the
    first value and a ']' glued to the last value of each vector, so those
    boundary columns are read as text and must be cleaned up.

    Args:
        path: path of the header-less CSV file.
        n_dims: number of values per vector (default 128, i.e. 256 feature
            columns in total).

    Returns:
        DataFrame with a 'movie_id' column followed by feature columns named
        '0' .. str(2 * n_dims - 1); the four boundary columns are float64.
    """
    n_features = 2 * n_dims
    col_name = ['movie_id'] + [str(i) for i in range(n_features)]

    item_feat = pd.read_csv(path, header=None, names=col_name)

    # First and last column of each vector; dict.fromkeys drops duplicates
    # (only possible when n_dims == 1) while keeping the order.
    boundary_cols = dict.fromkeys(
        [str(0), str(n_dims - 1), str(n_dims), str(n_features - 1)])
    for col in boundary_cols:
        # Whole-column assignment with astype(float) guarantees a float dtype;
        # writing a Python list through iloc can leave the column as object.
        item_feat[col] = item_feat[col].astype(str).str.strip('[] ').astype(float)
    return item_feat

In [0]:
def gen_data_set(data):
    """Build next-click samples from a click log.

    For every user the clicks are ordered by timestamp and each click after
    the first becomes one sample whose history is all earlier clicks. The
    sample for the user's final click goes to the test set, all others go to
    the training set.

    Each sample is a list:
        [user_id, history (most recent first), target movie_id, label (1),
         history length, gender, age, occupation, text feature vector]

    The text feature vector is taken from the most recently clicked item
    (index i - 1). Using the row of the target click (index i) would hand the
    model the content vector of the very item it has to predict.
    The image feature columns '128'..'255' are not used.

    Args:
        data: DataFrame with columns 'user_id', 'movie_id', 'timestamp',
            'gender', 'age', 'occupation' and '0'..'127'. It is not modified.

    Returns:
        (train_set, test_set), both shuffled lists of samples.
    """
    # Sort a copy: sorting in place would silently reorder the caller's frame.
    clicks = data.sort_values("timestamp")
    txt_cols = [str(i) for i in range(0, 128)]

    train_set = []
    test_set = []

    for reviewerID, user_clicks in tqdm(clicks.groupby('user_id')):
        pos_list = user_clicks['movie_id'].tolist()
        gender_list = user_clicks['gender'].tolist()
        age_list = user_clicks['age'].tolist()
        occupation_list = user_clicks['occupation'].tolist()
        txt_feature = user_clicks[txt_cols].values.tolist()

        last = len(pos_list) - 1
        for i in range(1, len(pos_list)):
            history = pos_list[:i][::-1]
            sample = [reviewerID, history, pos_list[i], 1, len(history),
                      gender_list[i], age_list[i], occupation_list[i],
                      txt_feature[i - 1]]
            if i != last:
                train_set.append(sample)
            else:
                test_set.append(sample)

    random.shuffle(train_set)
    random.shuffle(test_set)
    return train_set, test_set

def gen_model_input(train_set,seq_max_len):
    """Turn samples from `gen_data_set` into model input arrays.

    Args:
        train_set: list of samples laid out as
            [user_id, history, movie_id, label, history length,
             gender, age, occupation, text feature vector].
        seq_max_len: length every history is padded / truncated to.

    Returns:
        (model_input, label) where model_input is a dict of numpy arrays keyed
        by "user_id", "movie_id", "hist_movie_id", "hist_len", "gender",
        "age", "occupation" and "train_txt", and label is a numpy array.
    """
    def column(k):
        return [sample[k] for sample in train_set]

    # Histories are padded with 0 at the end and cut at the end.
    train_seq_pad = pad_sequences(column(1), maxlen=seq_max_len, padding='post', truncating='post', value=0)

    # The padded sequence never holds more than seq_max_len items, so the
    # reported length is clipped to match; otherwise a length-based mean over
    # the sequence would divide by more items than are actually present.
    train_hist_len = np.minimum(np.array(column(4)), seq_max_len)

    train_model_input = {"user_id": np.array(column(0)), "movie_id": np.array(column(2)),
                         "hist_movie_id": train_seq_pad, "hist_len": train_hist_len,
                         "gender": np.array(column(5)), "age": np.array(column(6)),
                         "occupation": np.array(column(7)), "train_txt": np.array(column(8))}
    train_label = np.array(column(3))

    return train_model_input, train_label

In [0]:
# Item content vectors and user demographics.
item_feat = read_item_feat(os.path.join(data_path, 'underexpose_item_feat.csv'))
user_feat = pd.read_csv(os.path.join(data_path, 'underexpose_user_feat.csv'), header=None,
                        names=['user_id', 'age', 'gender', 'occupation'])
# Gender becomes 0 for 'M' and 1 for every other value (missing values included).
user_feat['gender'] = (user_feat['gender'] != 'M').astype(int)

# Read the click logs of phases 0..now_phase. The frames are collected in lists
# and concatenated once: DataFrame.append no longer exists in pandas 2.x and
# appending inside the loop copies the accumulated data on every iteration.
click_cols = ['user_id', 'movie_id', 'timestamp']
test_parts = []
train_parts = []
for c in range(now_phase + 1):
    test_tmp = pd.read_csv(os.path.join(data_path, 'underexpose_test_click-{}.csv'.format(c)),
                           header=None, names=click_cols)
    train_tmp = pd.read_csv(os.path.join(data_path, 'underexpose_train_click-{}.csv'.format(c)),
                            header=None, names=click_cols)
    test_parts.append(test_tmp)
    # The test-phase clicks are also added to the training log.
    train_parts.extend([test_tmp, train_tmp])

test_click = (pd.concat(test_parts, ignore_index=True)
              .drop_duplicates(subset=click_cols, keep='last'))
train_click = (pd.concat(train_parts, ignore_index=True)
               .drop_duplicates(subset=click_cols, keep='last')
               .sort_values('timestamp'))

print('item_feat:', item_feat['movie_id'].nunique())
print('item_total:', train_click['movie_id'].nunique())
print('no item feat:',len(set(train_click['movie_id'].unique()).difference(set(item_feat['movie_id'].unique()))))

# Attach user demographics, order clicks per user, then attach item features.
test_click = pd.merge(test_click, user_feat, how='left', on=['user_id'])
train_click = pd.merge(train_click, user_feat, how='left', on=['user_id'])

test_click = test_click.sort_values(by=['user_id','timestamp'])
train_click = train_click.sort_values(by=['user_id','timestamp'])

test_click = pd.merge(test_click, item_feat, how='left', on=['movie_id'])
train_click = pd.merge(train_click, item_feat, how='left', on=['movie_id'])

# Missing values are filled from the next row that has one.
# NOTE(review): rows are ordered by user, so a gap can be filled with another
# user's / item's values, and trailing gaps stay NaN -- confirm this is intended.
test_click = test_click.bfill()
train_click = train_click.bfill()
joblib.dump(train_click, os.path.join(data_path, 'train_click.pkl'))

In [0]:
data = joblib.load(os.path.join(data_path, 'train_click.pkl'))
# 1. Label-encode the sparse features, then build the sequence samples with
#    `gen_data_set` and `gen_model_input`.
labelencoder_dict = {}   # feature name -> fitted LabelEncoder (used to decode ids later)
feature_max_idx = {}     # feature name -> vocabulary size passed to SparseFeat
features = ['user_id', 'movie_id', 'gender', 'age', 'occupation']
for feature in features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
    # LabelEncoder codes run from 0 to max, so an embedding table that is
    # indexed by them needs max + 1 rows; max alone leaves the largest id
    # without a row.
    feature_max_idx[feature] = data[feature].max() + 1
    labelencoder_dict[feature] = lbe

# One row per encoded item id; the order of this frame defines the row order of
# the item embeddings computed later.
item_profile = data[["movie_id"]].drop_duplicates('movie_id')

train_set, test_set= gen_data_set(data)
train_model_input, train_label = gen_model_input(train_set, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, SEQ_LEN)

In [0]:
import tensorflow as tf

# 2. Describe the input features of the user side and the item side.

embedding_dim = 32
user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                        SparseFeat("gender", feature_max_idx['gender'], 16),
                        SparseFeat("age", feature_max_idx['age'], 16),
                        SparseFeat("occupation", feature_max_idx['occupation'], 16),
                        DenseFeat("train_txt", 128),
                        #DenseFeat("train_img", 128),
                        # The click history shares the "movie_id" embedding table.
                        VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                    embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                        ]

item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

# 3. Define the model and train it.

K.set_learning_phase(True)

# Compare the major version as a number: comparing version strings is
# lexicographic and would, for example, rank '10.0.0' below '2.0.0'.
if int(tf.__version__.split('.')[0]) >= 2:
    tf.compat.v1.disable_eager_execution()

# The last hidden layer has embedding_dim units -- presumably so that user
# vectors and item embeddings have the same size for the inner-product search.
model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=100, user_dnn_hidden_units=(128,64, embedding_dim))
# model = MIND(user_feature_columns,item_feature_columns,dynamic_k=False,p=1,k_max=2,num_sampled=100,user_dnn_hidden_units=(128,64, embedding_dim),init_std=0.001)

model.compile(optimizer="adam", loss=sampledsoftmaxloss)

history = model.fit(train_model_input, train_label,
                    batch_size=512, epochs=20, verbose=1, validation_split=0.0, )

# 4. Compute user vectors for the test samples and embeddings for every item.
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profile['movie_id'].values,}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)


In [0]:
import faiss
from deepmatch.utils import recall_N

# Held-out item per encoded user id, wrapped in a list as recall_N is called with it.
test_true_label = {line[0]:[line[2]] for line in test_set}
# Row r of item_embs belongs to item_ids[r] (both come from item_profile).
item_ids = item_profile['movie_id'].values

# Exact inner-product search over all item embeddings.
index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
scores, neighbours = index.search(np.ascontiguousarray(user_embs), 50)

result = {}   # encoded user id -> list of 50 recommended encoded item ids
recalls = []
hit = 0
test_uids = test_user_model_input['user_id']
for i, uid in tqdm(enumerate(test_uids), total=len(test_uids)):
    try:
        pred = [item_ids[x] for x in neighbours[i]]
        recalls.append(recall_N(test_true_label[uid], pred, N=50))
        # Test the held-out item itself: `[item] in pred` asks whether a list
        # is one of the predicted ids, which is never the case.
        if any(item in pred for item in test_true_label[uid]):
            hit += 1
        result[uid] = pred
    except Exception as err:
        # Keep going, but report which row failed and why.
        print(i, repr(err))

print("recall", np.mean(recalls))
print("hit rate", hit / len(test_user_model_input['user_id']))
joblib.dump(result, os.path.join(data_path+'result.pkl'))

In [0]:
# Write one CSV line per user: the original user id followed by the original ids
# of the recommended items, both decoded with the stored label encoders.
with open(data_path+'underexpose_submit-{}.csv'.format(now_phase), 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(
        np.concatenate(
            (labelencoder_dict['user_id'].inverse_transform([encoded_uid]),
             labelencoder_dict['movie_id'].inverse_transform(encoded_items)),
            axis=0)
        for encoded_uid, encoded_items in result.items()
    )