# YoutubeDNN 召回实现

## 1. 下载文件

In [1]:
from urllib.request import urlretrieve
import zipfile
import pandas as pd
import os

filename = 'ml-100k.zip'
if not os.path.exists(filename):
    # 下载文件
    urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", filename)
    zip_ref = zipfile.ZipFile(filename, 'r')
    zip_ref.extractall()
    print(f'Download File: {filename}')
print(f'{filename} existed.')

ml-100k.zip existed.


## 2. Preprocess

处理四种数据：movies, users, ratings, genre

In [2]:
# user --- u.user
users_col = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_col)

# rating --- u.data
ratings_col = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_col)

# movies and genres --- aggregate u.item and u.genre
movies_col = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL']
genres_col = ['genre_unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies_col = movies_col + genres_col
movies = pd.read_csv('ml-100k/u.item', sep='|', names=movies_col)

print(users.dtypes, '\n')
print(ratings.dtypes, '\n')
print(movies.dtypes, '\n')

user_id        int64
age            int64
gender        object
occupation    object
zip_code      object
dtype: object 

user_id      int64
movie_id     int64
rating       int64
timestamp    int64
dtype: object 

movie_id                int64
movie_title            object
release_date           object
video_release_date    float64
IMDb_URL               object
genre_unknown           int64
Action                  int64
Adventure               int64
Animation               int64
Children                int64
Comedy                  int64
Crime                   int64
Documentary             int64
Drama                   int64
Fantasy                 int64
Film-Noir               int64
Horror                  int64
Musical                 int64
Mystery                 int64
Romance                 int64
Sci-Fi                  int64
Thriller                int64
War                     int64
Western                 int64
dtype: object 



将电影所属的 genre 拼接成一个多值属性，比如 `3,4,5,15` 的形式

In [3]:
genre_encoded = {x:i for i, x in enumerate(genres_col)}
all_genre = [','.join([str(i) for i,x in enumerate(arr) if x == 1]) for arr in movies[genres_col].values]
movies['all_genres'] = all_genre

In [4]:
movies.head(3)

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,genre_unknown,Action,Adventure,Animation,Children,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,all_genres
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,345
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,1216
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,16


将 ratings, movies, users 全部聚合在一起

In [5]:
ratings_all = ratings.merge(movies, on='movie_id').merge(users, on='user_id')
ratings_all = ratings_all.drop(columns=genres_col)
ratings_all.shape

(100000, 13)

根据 ratings 的数值来判断喜欢还是不喜欢，>= 3 则为喜欢

In [6]:
import numpy as np
# TODO 这里可以换为 0 和 1
ratings_all['like_type'] = np.where(ratings_all['rating']>=3, 'like', 'dislike')
ratings_all['movie_name'] = ratings_all['movie_title'].str[:-6]

按照 user_id 来排序，内部再根据时间戳来排序

In [7]:
ratings_all=ratings_all.sort_values(by=['user_id', 'timestamp'])
ratings_all.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_release_date,IMDb_URL,all_genres,age,gender,occupation,zip_code,like_type,movie_name
1544,1,172,5,874965478,"Empire Strikes Back, The (1980)",01-Jan-1980,,http://us.imdb.com/M/title-exact?Empire%20Stri...,128141517,24,M,technician,85711,like,"Empire Strikes Back, The"
1623,1,168,5,874965478,Monty Python and the Holy Grail (1974),01-Jan-1974,,http://us.imdb.com/M/title-exact?Monty%20Pytho...,5,24,M,technician,85711,like,Monty Python and the Holy Grail
1510,1,165,5,874965518,Jean de Florette (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Jean%20de%20F...,8,24,M,technician,85711,like,Jean de Florette
1617,1,156,4,874965556,Reservoir Dogs (1992),01-Jan-1992,,http://us.imdb.com/M/title-exact?Reservoir%20D...,616,24,M,technician,85711,like,Reservoir Dogs
1503,1,196,5,874965677,Dead Poets Society (1989),01-Jan-1989,,http://us.imdb.com/M/title-exact?Dead%20Poets%...,8,24,M,technician,85711,like,Dead Poets Society
1690,1,166,5,874965677,Manon of the Spring (Manon des sources) (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Manon%20des%2...,8,24,M,technician,85711,like,Manon of the Spring (Manon des sources)
1620,1,187,4,874965678,"Godfather: Part II, The (1974)",01-Jan-1974,,http://us.imdb.com/M/title-exact?Godfather:%20...,168,24,M,technician,85711,like,"Godfather: Part II, The"
1534,1,14,5,874965706,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il...",814,24,M,technician,85711,like,"Postino, Il"
1556,1,127,5,874965706,"Godfather, The (1972)",01-Jan-1972,,"http://us.imdb.com/M/title-exact?Godfather,%20...",168,24,M,technician,85711,like,"Godfather, The"
1599,1,250,4,874965706,"Fifth Element, The (1997)",09-May-1997,,http://us.imdb.com/M/title-exact?Fifth%20Eleme...,115,24,M,technician,85711,like,"Fifth Element, The"


将可能会不连续的 user_id 映射转化为连续的 user_id

In [8]:
user_ids = ratings_all['user_id'].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}

movie_ids = ratings_all["movie_id"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}

title_ids = ratings_all["movie_name"].unique().tolist()
title2title_encoded = {x: i for i, x in enumerate(title_ids)}

ratings_all['user'] = ratings_all['user_id'].map(user2user_encoded)
ratings_all['movie'] = ratings_all['movie_id'].map(movie2movie_encoded)
ratings_all['title_d'] = ratings_all['movie_name'].map(title2title_encoded)

In [9]:
ratings_all.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_release_date,IMDb_URL,all_genres,age,gender,occupation,zip_code,like_type,movie_name,user,movie,title_d
1544,1,172,5,874965478,"Empire Strikes Back, The (1980)",01-Jan-1980,,http://us.imdb.com/M/title-exact?Empire%20Stri...,128141517,24,M,technician,85711,like,"Empire Strikes Back, The",0,0,0
1623,1,168,5,874965478,Monty Python and the Holy Grail (1974),01-Jan-1974,,http://us.imdb.com/M/title-exact?Monty%20Pytho...,5,24,M,technician,85711,like,Monty Python and the Holy Grail,0,1,1
1510,1,165,5,874965518,Jean de Florette (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Jean%20de%20F...,8,24,M,technician,85711,like,Jean de Florette,0,2,2
1617,1,156,4,874965556,Reservoir Dogs (1992),01-Jan-1992,,http://us.imdb.com/M/title-exact?Reservoir%20D...,616,24,M,technician,85711,like,Reservoir Dogs,0,3,3
1503,1,196,5,874965677,Dead Poets Society (1989),01-Jan-1989,,http://us.imdb.com/M/title-exact?Dead%20Poets%...,8,24,M,technician,85711,like,Dead Poets Society,0,4,4
1690,1,166,5,874965677,Manon of the Spring (Manon des sources) (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Manon%20des%2...,8,24,M,technician,85711,like,Manon of the Spring (Manon des sources),0,5,5
1620,1,187,4,874965678,"Godfather: Part II, The (1974)",01-Jan-1974,,http://us.imdb.com/M/title-exact?Godfather:%20...,168,24,M,technician,85711,like,"Godfather: Part II, The",0,6,6
1534,1,14,5,874965706,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il...",814,24,M,technician,85711,like,"Postino, Il",0,7,7
1556,1,127,5,874965706,"Godfather, The (1972)",01-Jan-1972,,"http://us.imdb.com/M/title-exact?Godfather,%20...",168,24,M,technician,85711,like,"Godfather, The",0,8,8
1599,1,250,4,874965706,"Fifth Element, The (1997)",09-May-1997,,http://us.imdb.com/M/title-exact?Fifth%20Eleme...,115,24,M,technician,85711,like,"Fifth Element, The",0,9,9


In [10]:
# 用户看过所有电影并且根据喜欢和不喜欢进行分类
movie_list = ratings_all.groupby(['user','like_type'])['movie'].apply(list).reset_index()
# 每个用户看过的所有电影
title_list = ratings_all.groupby(['user'])['title_d'].apply(list).reset_index()
# 每个用户看过电影的所有题材类型
genre_list = ratings_all.groupby(['user'])['all_genres'].unique().apply(list).reset_index()
genre_list

Unnamed: 0,user,all_genres
0,0,"[1,2,8,14,15,17, 5, 8, 6,16, 1,6,8, 8,14, 1,15..."
1,1,"[8,14,17, 8,15, 8, 6,11,13,16, 11,16, 5, 5,6,8..."
2,2,"[6,10,13,16, 8, 13,16, 1,16, 8,14, 5,8,16, 5,1..."
3,3,"[8,15, 11,16, 1,16, 1,13,14,16, 1,2,15,17, 6,8..."
4,4,"[0, 1,2,15, 1, 1,15,17, 1,2,13, 1,2,5,15, 1,15..."
...,...,...
938,938,"[1,2, 1,8,17, 1,16, 8,15, 1,6,8, 8, 1,2,5,15, ..."
939,939,"[8,14,17, 5,12,14, 8, 5, 5,8, 15,16, 1,2,15,17..."
940,940,"[8,15, 1,16, 5, 1,2,15,16, 1,2,16, 3,5,16, 1,2..."
941,941,"[8, 8,16, 5,8, 5, 1,8,14, 1,5,12, 8,15, 2,4, 1..."


去除重复的 genres 项

In [11]:
genre_list['all_genres'] = genre_list['all_genres'].apply(lambda x: [i for i in list(set(','.join(x))) if i.isdigit()] )
genre_list

Unnamed: 0,user,all_genres
0,0,"[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]"
1,1,"[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]"
2,2,"[1, 2, 5, 4, 7, 8, 6, 3, 0]"
3,3,"[1, 2, 5, 4, 7, 8, 6, 3]"
4,4,"[1, 2, 5, 4, 7, 9, 8, 6, 3, 0]"
...,...,...
938,938,"[1, 2, 5, 4, 7, 9, 8, 6, 3]"
939,939,"[1, 2, 5, 0, 4, 7, 8, 6, 3]"
940,940,"[1, 2, 5, 4, 7, 8, 6, 3]"
941,941,"[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]"


将电影分为用户喜欢和不喜欢的两种类别

In [12]:
user_video_list = movie_list.pivot(index='user', columns='like_type', values='movie').reset_index()
# 填充无效值，可能会存在没有喜欢的或者没有不喜欢的
user_video_list.fillna(ratings_all['movie'].max()+1, inplace=True)
user_video_list.head(3)

like_type,user,dislike,like
0,0,"[31, 32, 33, 35, 36, 55, 71, 81, 97, 99, 107, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,1,"[279, 298, 130, 313, 314]","[272, 250, 273, 274, 275, 276, 277, 278, 280, ..."
2,2,"[302, 317, 309, 318, 275, 250, 321, 253, 322, ...","[316, 125, 278, 319, 320, 324, 325, 326, 328, ..."


In [13]:
user_data = ratings_all[['user','occupation','gender']]
# 相当于复制一份数据？
user_data =user_data.drop_duplicates()
user_data = user_data.reset_index()
user_data = user_data.drop('index',axis=1)
user_data

Unnamed: 0,user,occupation,gender
0,0,technician,M
1,1,other,F
2,2,writer,M
3,3,technician,M
4,4,other,F
...,...,...,...
938,938,student,F
939,939,administrator,M
940,940,student,M
941,941,librarian,F


In [14]:
dataset = user_video_list.merge(title_list, on='user').merge(genre_list).merge(user_data)
dataset['like'] = dataset['like'].apply(lambda x: x if type(x) is list else [x])
dataset['dislike'] = dataset['dislike'].apply(lambda x: x if type(x) is list else [x])
dataset['predict_labels'] = dataset['like'].apply(lambda x: x[-1])
dataset['like'] = dataset['like'].apply(lambda x: x[:-1])
dataset

Unnamed: 0,user,dislike,like,title_d,all_genres,occupation,gender,predict_labels
0,0,"[31, 32, 33, 35, 36, 55, 71, 81, 97, 99, 107, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]",technician,M,269
1,1,"[279, 298, 130, 313, 314]","[272, 250, 273, 274, 275, 276, 277, 278, 280, ...","[271, 249, 272, 273, 274, 275, 276, 277, 278, ...","[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]",other,F,315
2,2,"[302, 317, 309, 318, 275, 250, 321, 253, 322, ...","[316, 125, 278, 319, 320, 324, 325, 326, 328, ...","[300, 314, 315, 307, 14, 277, 316, 317, 318, 2...","[1, 2, 5, 4, 7, 8, 6, 3, 0]",writer,M,354
3,3,[361],"[250, 275, 309, 345, 254, 344, 355, 278, 350, ...","[249, 274, 307, 343, 253, 342, 352, 277, 348, ...","[1, 2, 5, 4, 7, 8, 6, 3]",technician,M,55
4,4,"[365, 368, 369, 201, 370, 371, 373, 176, 156, ...","[126, 249, 362, 39, 363, 364, 15, 9, 37, 203, ...","[125, 248, 359, 39, 360, 361, 15, 9, 37, 202, ...","[1, 2, 5, 4, 7, 9, 8, 6, 3, 0]",other,F,34
...,...,...,...,...,...,...,...,...
938,938,"[542, 251, 872, 936]","[332, 685, 250, 8, 247, 15, 285, 980, 1181, 71...","[250, 330, 677, 250, 862, 249, 8, 246, 15, 283...","[1, 2, 5, 4, 7, 9, 8, 6, 3]",student,F,172
939,939,"[321, 31, 254, 361, 326, 54, 1402, 267, 161, 2...","[272, 307, 276, 543, 346, 279, 310, 714, 302, ...","[271, 319, 305, 275, 537, 344, 31, 253, 278, 3...","[1, 2, 5, 0, 4, 7, 8, 6, 3]",administrator,M,311
940,940,"[361, 249]","[250, 309, 298, 11, 444, 10, 918, 290, 15, 27,...","[249, 307, 296, 358, 11, 441, 10, 908, 288, 15...","[1, 2, 5, 4, 7, 8, 6, 3]",student,M,30
941,941,[171],"[712, 279, 346, 307, 305, 253, 357, 250, 308, ...","[704, 278, 344, 170, 305, 303, 252, 354, 249, ...","[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]",librarian,F,48


预处理完毕，开始分割数据集为训练集和测试集

## 3. 构建模型

首先引入依赖

In [15]:
import tensorflow as tf
from keras.layers import Layer, Embedding, Dense, Input, BatchNormalization
from keras.models import Model
from tensorflow import keras

Masked Embedding Aggregation

In [16]:
class MaskedEmbeddingsAggregatorLayer(Layer):
    def __init__(self, agg_mode='sum', *args, **kwargs):
        super(MaskedEmbeddingsAggregatorLayer, self).__init__(**kwargs)
        
        if agg_mode not in ['sum', 'mean']:
            raise NotImplementedError
        self.agg_mode = agg_mode
    
    @tf.function
    def call(self, inputs, mask=None):
        # 对不规则张量进行 mask 操作
        masked_embeddings = tf.ragged.boolean_mask(inputs, mask)
        if self.agg_mode == 'sum':
            aggregated = tf.reduce_sum(masked_embeddings, axis=1)
        elif self.agg_mode == 'mean':
            aggregated = tf.reduce_mean(masked_embeddings, axis=1)
        return aggregated
    
    def get_config(self):
        return {'agg_mode': self.agg_mode}

L2 Normalize Layer

In [17]:
class L2NormLayer(Layer):
    def __init__(self, **kwargs):
        super(L2NormLayer, self).__init__(**kwargs)
    
    @tf.function
    def call(self, inputs, mask=None):
        if mask is not None:
            inputs = tf.ragged.boolean_mask(inputs, mask).to_tensor()
        return tf.math.l2_normalize(inputs, axis=-1)
    
    def compute_mask(self, inputs, mask):
        return mask

Model

In [18]:
class YoutubeDNNRecall(Model):
    def __init__(self, feature_columns, feature_vocab, ebd_dim, **kwargs):
        super(YoutubeDNNRecall, self).__init__(**kwargs)
        self.feature_columns = feature_columns
        # 注意设置 mask_zero 为 true
        self.feature_ebd = Embedding(input_dim=feature_vocab, input_length=1, output_dim=ebd_dim, embeddings_initializer='random_normal', mask_zero=True, name='feature_embeddings')
        self.label_ebd = Embedding(input_dim=feature_vocab, input_length=1, output_dim=ebd_dim, embeddings_initializer='random_normal', mask_zero=True, name='label_embeddings')
        self.mask_ebd = MaskedEmbeddingsAggregatorLayer('mean', name='aggregate_embedding')
        self.dense1 = Dense(units=64, activation='relu', name='dense_1')
        self.dense2 = Dense(units=64, activation='relu', name='dense_2')
        self.dense3 = Dense(units=64, activation='relu', name='dense_3')
        self.bn = BatchNormalization()
        self.l2 = L2NormLayer(name='l2_norm')
        self.final = Dense(feature_vocab, activation=tf.nn.softmax, name='dense_output')
        
    def summary(self, line_length=None, positions=None, print_fn=None, expand_nested=False, show_trainable=False):
#         inputs = {f['name']: Input(shape=(), dtype=tf.string if f['dtype'] == str else f['dtype'], name=f['name']) for f in self.feature_columns}
        inputs = [Input(shape=(None,)) for i in range(4)]
        model = Model(inputs, outputs=self.call(inputs))
        keras.utils.plot_model(model, 'model.png', show_shapes=True)
        model.summary()
        
    def call(self, inputs, training=None, mask=None):
        # TODO 对于不同的数据集需要对名称进行处理
#         feature_ebd = self.mask_ebd(self.l2(self.feature_ebd(inputs['title_d'])))
#         liked_ebd = self.mask_ebd(self.l2(self.feature_ebd(inputs['like'])))
#         disliked_ebd = self.mask_ebd(self.l2(self.feature_ebd(inputs['dislike'])))
#         genre_ebd = self.mask_ebd(self.l2(self.feature_ebd(inputs['all_genres'])))
        feature_ebd = self.mask_ebd(self.l2(self.feature_ebd(inputs[0])))
        liked_ebd = self.mask_ebd(self.l2(self.feature_ebd(inputs[1])))
        disliked_ebd = self.mask_ebd(self.l2(self.feature_ebd(inputs[2])))
        genre_ebd = self.mask_ebd(self.l2(self.feature_ebd(inputs[3])))
        x = tf.concat([feature_ebd, liked_ebd, disliked_ebd, genre_ebd], axis=1)
#         x = self.bn(self.dense1(x))
        x = self.dense1(x)
#         x = self.bn(self.dense2(x))
        x = self.dense2(x)
        x = self.bn(self.dense3(x))
        return self.final(x)

In [19]:
feature_columns = [
    {'name': 'title_d', 'dtype': object},
    {'name': 'like', 'dtype': object},
    {'name': 'dislike', 'dtype': object},
    {'name': 'all_genres', 'dtype': object}
]
model = YoutubeDNNRecall(feature_columns, ratings_all['movie'].max()+2, 16)
model.summary()
optimizer = keras.optimizers.get('adam')
loss = 'sparse_categorical_crossentropy'
model.compile(loss=loss, optimizer=optimizer)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                              

## 4. 开始训练

In [20]:
train_data = dataset[dataset.user <= 600]
test_data = dataset[dataset.user>600]
train_data

Unnamed: 0,user,dislike,like,title_d,all_genres,occupation,gender,predict_labels
0,0,"[31, 32, 33, 35, 36, 55, 71, 81, 97, 99, 107, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]",technician,M,269
1,1,"[279, 298, 130, 313, 314]","[272, 250, 273, 274, 275, 276, 277, 278, 280, ...","[271, 249, 272, 273, 274, 275, 276, 277, 278, ...","[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]",other,F,315
2,2,"[302, 317, 309, 318, 275, 250, 321, 253, 322, ...","[316, 125, 278, 319, 320, 324, 325, 326, 328, ...","[300, 314, 315, 307, 14, 277, 316, 317, 318, 2...","[1, 2, 5, 4, 7, 8, 6, 3, 0]",writer,M,354
3,3,[361],"[250, 275, 309, 345, 254, 344, 355, 278, 350, ...","[249, 274, 307, 343, 253, 342, 352, 277, 348, ...","[1, 2, 5, 4, 7, 8, 6, 3]",technician,M,55
4,4,"[365, 368, 369, 201, 370, 371, 373, 176, 156, ...","[126, 249, 362, 39, 363, 364, 15, 9, 37, 203, ...","[125, 248, 359, 39, 360, 361, 15, 9, 37, 202, ...","[1, 2, 5, 4, 7, 9, 8, 6, 3, 0]",other,F,34
...,...,...,...,...,...,...,...,...
596,596,"[539, 975, 332, 974, 721, 1533]","[265, 272, 310, 309, 336, 716, 298, 345, 683, ...","[264, 271, 308, 307, 334, 708, 534, 965, 296, ...","[1, 2, 5, 4, 7, 8, 6, 3]",other,M,833
597,597,"[690, 326, 870, 118]","[346, 714, 171, 312, 309, 689, 281, 127, 716, ...","[344, 682, 706, 170, 310, 307, 681, 280, 324, ...","[1, 2, 5, 4, 9, 7, 8, 6, 3, 0]",marketing,F,66
598,598,"[762, 321, 35, 294, 796, 1040, 1364]","[275, 540, 298, 716, 758, 974, 880, 938, 116, ...","[274, 753, 319, 535, 296, 35, 708, 749, 964, 8...","[1, 2, 5, 4, 7, 8, 6, 3]",student,F,937
599,599,"[604, 631, 646, 192, 1229, 210, 1228, 844, 123...","[171, 10, 66, 459, 147, 8, 87, 70, 17, 508, 54...","[170, 10, 66, 456, 146, 8, 87, 70, 17, 504, 54...","[1, 2, 5, 0, 4, 7, 8, 6]",programmer,M,376


In [21]:
from keras.preprocessing.sequence import pad_sequences as ps

x = [ps(train_data['title_d']), ps(train_data['like']), ps(train_data['dislike']), ps(train_data['all_genres'])]
y = train_data['predict_labels'].values

model.fit(x, y, epochs=500)
test_x = [ps(test_data['title_d']), ps(test_data['like']), ps(test_data['dislike']), ps(test_data['all_genres'])]
preds = model.predict(test_x)
test_data['predicted_label'] = np.array([np.argmax(a) for a in preds])
test_data

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 

Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 239/500
Epoch 240/500
Epoch 241/500
Epoch 242/500
Epoch 243/500
Epoch 244/500
Epoch 245/500
Epoch 246/500
Epoch 247/500
Epoch 248/500
Epoch 249/500
Epoch 250/500
Epoch 251/500
Epoch 252/500
Epoch 253/500
Epoch 254/500
Epoch 255/500
Epoch 256/500
Epoch 257/500
Epoch 258/500
Epoch 259/500
Epoch 260/500
Epoch 261/500
Epoch 262/500
Epoch 263/500
Epoch 264/500
Epoch 265/500
Epoch 266/500
Epoch 267/500
Epoch 268/500
Epoch 269/500
Epoch 270/500
Epoch 

Epoch 298/500
Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 303/500
Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 

Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 411/500
Epoch 412/500
Epoch 413/500
Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epoch 429/500
Epoch 430/500
Epoch 431/500
Epoch 432/500
Epoch 433/500
Epoch 434/500
Epoch 435/500
Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 451/500
Epoch 452/500
Epoch 453/500
Epoch 454/500
Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 459/500
Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 

Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,user,dislike,like,title_d,all_genres,occupation,gender,predict_labels,predicted_label
601,601,[326],"[309, 786, 870, 361, 298, 308, 457, 716, 127, ...","[307, 777, 860, 358, 296, 306, 324, 454, 708, ...","[1, 2, 5, 4, 7, 8, 6, 3]",other,F,43,6
602,602,"[254, 179, 295, 936, 136, 405, 152]","[249, 243, 17, 184, 176, 153, 379, 412, 376, 2...","[248, 242, 253, 17, 183, 175, 152, 178, 376, 4...","[1, 2, 5, 0, 4, 7, 8, 6, 3]",programmer,M,921,706
603,603,"[63, 87, 82, 423, 268, 158, 426, 653]","[72, 8, 56, 132, 241, 27, 163, 148, 652, 372, ...","[72, 8, 56, 131, 63, 240, 27, 87, 162, 147, 64...","[1, 2, 5, 4, 7, 9, 8, 6]",educator,M,657,321
604,604,"[309, 340, 11, 539, 81, 53, 1042, 930, 39, 610...","[272, 171, 302, 349, 275, 298, 758, 276, 761, ...","[271, 170, 307, 300, 347, 274, 296, 338, 749, ...","[1, 2, 5, 0, 4, 7, 8, 6, 3]",engineer,M,481,82
605,605,"[918, 298, 362, 1043, 201, 114, 71, 219, 501, ...","[275, 539, 336, 17, 256, 21, 721, 10, 9, 131, ...","[274, 534, 334, 17, 255, 21, 713, 10, 9, 130, ...","[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]",programmer,M,1181,171
...,...,...,...,...,...,...,...,...,...
938,938,"[542, 251, 872, 936]","[332, 685, 250, 8, 247, 15, 285, 980, 1181, 71...","[250, 330, 677, 250, 862, 249, 8, 246, 15, 283...","[1, 2, 5, 4, 7, 9, 8, 6, 3]",student,F,172,250
939,939,"[321, 31, 254, 361, 326, 54, 1402, 267, 161, 2...","[272, 307, 276, 543, 346, 279, 310, 714, 302, ...","[271, 319, 305, 275, 537, 344, 31, 253, 278, 3...","[1, 2, 5, 0, 4, 7, 8, 6, 3]",administrator,M,311,539
940,940,"[361, 249]","[250, 309, 298, 11, 444, 10, 918, 290, 15, 27,...","[249, 307, 296, 358, 11, 441, 10, 908, 288, 15...","[1, 2, 5, 4, 7, 8, 6, 3]",student,M,30,37
941,941,[171],"[712, 279, 346, 307, 305, 253, 357, 250, 308, ...","[704, 278, 344, 170, 305, 303, 252, 354, 249, ...","[1, 2, 5, 0, 4, 7, 9, 8, 6, 3]",librarian,F,48,305


In [23]:
tf.argsort(preds,direction='DESCENDING',axis=-1)

<tf.Tensor: shape=(342, 1683), dtype=int32, numpy=
array([[   6,  303,  305, ...,  781,  858, 1342],
       [ 706,   82,   48, ...,  677,  446,  545],
       [ 321,  323,  376, ..., 1241,  518,  388],
       ...,
       [  37,    6,  473, ..., 1578,  866,  227],
       [ 305,  250,  937, ...,  114, 1123,  781],
       [ 305,  250,  937, ...,  114, 1123,  781]])>

In [24]:
tf.nn.top_k(preds, k=20, sorted=True, name=None)

TopKV2(values=<tf.Tensor: shape=(342, 20), dtype=float32, numpy=
array([[0.02021154, 0.01737822, 0.01495254, ..., 0.00811043, 0.00794169,
        0.00784175],
       [0.03165988, 0.02925201, 0.02405199, ..., 0.01190513, 0.01174857,
        0.01172739],
       [0.05991151, 0.04807309, 0.02854769, ..., 0.00789055, 0.00784466,
        0.00762796],
       ...,
       [0.02392564, 0.02302889, 0.01973427, ..., 0.01053532, 0.00913705,
        0.00905776],
       [0.01341405, 0.01283483, 0.00841596, ..., 0.00607528, 0.00565608,
        0.00559272],
       [0.01341405, 0.01283483, 0.00841596, ..., 0.00607528, 0.00565608,
        0.00559272]], dtype=float32)>, indices=<tf.Tensor: shape=(342, 20), dtype=int32, numpy=
array([[  6, 303, 305, ..., 281, 256, 786],
       [706,  82,  48, ..., 309, 989,  52],
       [321, 323, 376, ..., 713,  82,  66],
       ...,
       [ 37,   6, 473, ...,  21, 838, 548],
       [305, 250, 937, ..., 171, 127,   6],
       [305, 250, 937, ..., 171, 127,   6]])>)