In [1]:
%pylab inline

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""


import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import gc, os
import pickle
import warnings
import multiprocessing as mp
import lightgbm as lgb
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder


# 节省内存的一个函数
# 减少内存
def reduce_mem(df):
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,100*(start_mem-end_mem)/start_mem,(time.time()-starttime)/60))
    return df

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [2]:
data_path = './data_v2/'
save_path = './5000_sample/'
pd.set_option('display.max_columns', None)

In [3]:
vid_info = pd.read_csv(data_path + 'vid_info.csv')
candidate_items = pd.read_csv(data_path + 'candidate_items_A.csv')
seq_train = pd.read_csv(data_path + 'main_vv_seq_train.csv')

vid_info = reduce_mem(vid_info)
candidate_items = reduce_mem(candidate_items)
seq_train = reduce_mem(seq_train)

# 视频库标签编码 , 这里的 训练和测试都一样
data_did_lb = LabelEncoder()
vid_info_lb = LabelEncoder()
cid_info_lb = LabelEncoder()

seq_train['did'] = data_did_lb.fit_transform(seq_train[['did']])

vid_info['vid'] = vid_info_lb.fit_transform(vid_info[['vid']])
vid_info['cid'] = cid_info_lb.fit_transform(vid_info[['cid']])

seq_train['vid'] = vid_info_lb.transform(seq_train[['vid']])
candidate_items['vid'] = vid_info_lb.transform(candidate_items[['vid']])

vid_info['stars'] = vid_info['stars'].apply(eval)
vid_info['tags'] = vid_info['tags'].apply(eval)
vid_info['key_word'] = vid_info['key_word'].apply(eval)

vid_info['stars'] = vid_info['stars'].apply(set)
vid_info['tags'] = vid_info['tags'].apply(set)
vid_info['key_word'] = vid_info['key_word'].apply(set)

# 获取当前数据的历史点击和最后一次点击
def get_test_train(train_):
    
    train_.sort_values(by=['did','seq_no'],inplace=True,ascending=False)
    train_['site'] =  train_.groupby('did').cumcount()+1
    
    local_final_log = train_[train_['site'] == 1].reset_index(drop=True)
    train_d = train_[train_['site'] != 1].reset_index(drop=True)

    del local_final_log['site']
    del train_d['site']

    return local_final_log,train_d

# 划分数据集
local_final_log,train_data = get_test_train(seq_train.copy()) 
all_data = seq_train 

del seq_train
print(all_data.shape,train_data.shape)

-- Mem. usage decreased to  7.56 Mb (44.8% reduction),time spend:0.00 min
-- Mem. usage decreased to  0.10 Mb (0.0% reduction),time spend:0.00 min
-- Mem. usage decreased to 154.78 Mb (54.7% reduction),time spend:0.01 min


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(5596413, 8) (5425504, 8)


In [4]:
from transformers import AutoTokenizer

import torch
from torch.utils.data import DataLoader, Dataset

from transformers import AutoConfig, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

In [5]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [6]:
cand_vid = candidate_items['vid'].tolist()

In [None]:
tokenizer.add_tokens(cand_vid)

In [None]:
import numpy as np
import tensorflow as tf

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat,get_feature_names
from deepctr.models import DSIN

In [33]:
def get_xy_fd(df, dense_fea, sparse_fea, behavior_fea, emb_dim=16, max_len=10, hash_flag=False):

    # 稀疏型特征
    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) for feat in sparse_fea]
    # 稠密型特征
    dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]

    feature_columns = sparse_feature_columns + dense_feature_columns 
    # 以最近两小时为 session 0 
    feature_columns += [ 
        VarLenSparseFeat(SparseFeat('sess_0_vid', vocabulary_size=df['candi_vid'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='candi_vid'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_0_cid', vocabulary_size=df['cid'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='cid'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_0_is_intact', vocabulary_size=df['is_intact'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='is_intact'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_0_classify_id', vocabulary_size=df['classify_id'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='classify_id'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_0_series_id', vocabulary_size=df['series_id'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='series_id'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_0_cpn', vocabulary_size=df['cpn'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='cpn'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_0_fpn', vocabulary_size=df['fpn'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='fpn'),
                         maxlen=10),
        ]
    # 以两小时后的历史记录 为 session 1
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_vid', vocabulary_size=df['candi_vid'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='candi_vid'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_1_cid', vocabulary_size=df['cid'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='cid'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_1_is_intact', vocabulary_size=df['is_intact'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='is_intact'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_1_classify_id', vocabulary_size=df['classify_id'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='classify_id'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_1_series_id', vocabulary_size=df['series_id'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='series_id'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_1_cpn', vocabulary_size=df['cpn'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='cpn'),
                         maxlen=10),
        VarLenSparseFeat(SparseFeat('sess_1_fpn', vocabulary_size=df['fpn'].nunique() + 1, embedding_dim=emb_dim, use_hash=hash_flag, embedding_name='fpn'),
                         maxlen=10),
        ]

    sess_number = np.array([2, 1, 0])

    feature_dict = {'fpn_score','nen_score','next_score','vid_pop','vid_pop_7','vid_pop_2','wr_mean','fr_mean',
       'wr','fr','time_diff','wr_favor','fr_favor','dura_mean','dura_max','dura_min','til_mean',
       'cid_s','isi_s','cla_s','ser_s','stars_sim','tags_sim','key_word_sim','duration_s','vid_pop_s','title_s',
       'candi_vid','cid','serialno','is_intact','classify_id','series_id','cpn','fpn',
       'sess_0_vid','sess_0_cid','sess_0_is_intact','sess_0_classify_id','sess_0_series_id','sess_0_cpn','sess_0_fpn',
       'sess_1_vid','sess_1_cid','sess_1_is_intact','sess_1_classify_id','sess_1_series_id','sess_1_cpn','sess_1_fpn',
    }


    # x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x = { get_feature_names(feat) for feat in feature_dict }
    x["sess_length"] = 2

    return x, feature_columns


In [None]:
trn_x, feature_columns = get_xy_fd(trn_data, dense_fea, sparse_fea, behavior_fea, emb_dim=16, max_len=10 ,hash_flag = True)

In [8]:
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
cateid = np.array([1, 2, 2])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])

sess1_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [0, 0, 0, 0]])
sess1_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [0, 0, 0, 0]])

sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
sess2_cate_id = np.array([[1, 2, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

sess_number = np.array([2, 1, 0])

In [31]:
sess1_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [0, 0, 0, 0]])
sess1_iid

array([[1, 2, 3, 0],
       [3, 2, 1, 0],
       [0, 0, 0, 0]])

In [32]:
sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
sess2_iid

array([[1, 2, 3, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [None]:
import numpy as np
import tensorflow as tf

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat,get_feature_names
from deepctr.models import DSIN

In [None]:
# 把特征分开
sparse_fea = ['did','candi_vid','cid','is_intact','classify_id','series_id','cpn','fpn']

dense_fea = ['fpn_score','nen_score','next_score','vid_pop_7','vid_pop_2','wr_mean','fr_mean',
       'wr','fr','wr_favor','fr_favor','dura_mean','dura_max',
       'dura_min','til_mean','cid_s','isi_s','cla_s','ser_s','stars_sim',
       'tags_sim','key_word_sim','vid_emb',
       ]

behavior_fea = ['cid','classify_id','is_intact','series_id','cpn','fpn']

In [None]:
tf.compat.v1.disable_eager_execution()
model = DSIN(feature_columns, behavior_fea, sess_max_count=2,
              dnn_hidden_units=[4, 4, 4], dnn_dropout=0.5, )

model.summary()

model.compile('adam', 'binary_crossentropy',
              metrics=['binary_crossentropy'])

In [28]:
import numpy as np
import tensorflow as tf

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat,get_feature_names
from deepctr.models import DSIN


def get_xy_fd(hash_flag=False):
    feature_columns = [SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
                       SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
                       SparseFeat('item', 3 + 1, embedding_dim=4, use_hash=hash_flag),
                       SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
                       DenseFeat('pay_score', 1)]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_0_item', 3 + 1, embedding_dim=4, use_hash=hash_flag, embedding_name='item'),
                         maxlen=4), VarLenSparseFeat(
            SparseFeat('sess_0_cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag, embedding_name='cate_id'),
            maxlen=4)]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_item', 3 + 1, embedding_dim=4, use_hash=hash_flag, embedding_name='item'),
                         maxlen=4), 
                         VarLenSparseFeat(
            SparseFeat('sess_1_cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag, embedding_name='cate_id'),
            maxlen=4)]

    behavior_feature_list = ["item", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cateid = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [0, 0, 0, 0]])
    sess1_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_cate_id = np.array([[1, 2, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'cate_id': cateid,
                    'sess_0_item': sess1_iid, 'sess_0_cate_id': sess1_cate_id, 'pay_score': score,
                    'sess_1_item': sess2_iid, 'sess_1_cate_id': sess2_cate_id, }

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["sess_length"] = sess_number
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list



if tf.__version__ >= '2.0.0':
    tf.compat.v1.disable_eager_execution()

x, y, feature_columns, behavior_feature_list = get_xy_fd(True)



In [18]:
model = DSIN(feature_columns, behavior_feature_list, sess_max_count=2,
                dnn_hidden_units=[4, 4, 4], dnn_dropout=0.5, )

In [35]:
hist_emb_size = list(
        map(lambda fc: fc.embedding_dim, filter(lambda fc: fc.name in behavior_feature_list, feature_columns)))

In [33]:
hist_emb_size

[4, 4]

In [37]:
behavior_feature_list

['item', 'cate_id']

In [None]:
feature_columns

In [None]:
model.compile('adam', 'binary_crossentropy',
                metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)

In [None]:
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("Hello I'm a [MASK] model.")