In [1]:

# Get the data from Movielens website
from urllib.request import urlretrieve
import zipfile
import pandas as pd

urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()
print("Done. Dataset contains:")
print(zip_ref.read('ml-100k/u.info'))

#Process the dataset for movies, users,ratings and genre
# Load each data set (users, movies, and ratings).
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0.
users["user_id"] = users["user_id"].apply(lambda x: str(x-1))
movies["movie_id"] = movies["movie_id"].apply(lambda x: str(x-1))
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings["movie_id"] = ratings["movie_id"].apply(lambda x: str(x-1))
ratings["user_id"] = ratings["user_id"].apply(lambda x: str(x-1))
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))


Done. Dataset contains:
b'943 users\n1682 items\n100000 ratings\n'


In [2]:
#Get all the genres for a movie
import numpy as np
genre_occurences = movies[genre_cols].sum().to_dict()

genres_encoded = {x: i for i, x in enumerate(genre_cols)}

def get_genres(movies, genres):
  def get_all_genres(gs):
    active = [str(genres_encoded[genre]) for genre, g in zip(genres, gs) if g==1]
    if len(active) == 0:
      return '0'
    return ','.join((active))
  movies['all_genres'] = [
      get_all_genres(gs) for gs in zip(*[movies[genre] for genre in genres])]

get_genres(movies, genre_cols)

In [3]:
movies.head(3)

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,all_genres
0,0,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1995,345
1,1,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,1995,1216
2,2,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1995,16


In [4]:
rating_details_sample = ratings.merge(movies, on='movie_id').merge(users, on='user_id')


In [5]:

rating_details_sample.shape

(100000, 33)

In [6]:
rating_details_sample.head(10)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,release_date,video_release_date,imdb_url,genre_unknown,Action,...,Sci-Fi,Thriller,War,Western,year,all_genres,age,sex,occupation,zip_code
0,195,241,3.0,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,1997,5,49,M,writer,55105
1,195,256,2.0,881251577,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,1,0,0,0,1997,12515,49,M,writer,55105
2,195,110,4.0,881251793,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,...,0,0,0,0,1996,514,49,M,writer,55105
3,195,24,4.0,881251955,"Birdcage, The (1996)",08-Mar-1996,,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,...,0,0,0,0,1996,5,49,M,writer,55105
4,195,381,4.0,881251843,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,...,0,0,0,0,1994,58,49,M,writer,55105
5,195,201,3.0,881251728,Groundhog Day (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Groundhog%20D...,0,0,...,0,0,0,0,1993,514,49,M,writer,55105
6,195,152,5.0,881251820,"Fish Called Wanda, A (1988)",01-Jan-1988,,http://us.imdb.com/M/title-exact?Fish%20Called...,0,0,...,0,0,0,0,1988,5,49,M,writer,55105
7,195,285,5.0,881250949,"English Patient, The (1996)",15-Nov-1996,,http://us.imdb.com/M/title-exact?English%20Pat...,0,0,...,0,0,1,0,1996,81417,49,M,writer,55105
8,195,65,3.0,881251911,While You Were Sleeping (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?While%20You%2...,0,0,...,0,0,0,0,1995,514,49,M,writer,55105
9,195,844,4.0,881251954,That Thing You Do! (1996),28-Sep-1996,,http://us.imdb.com/M/title-exact?That%20Thing%...,0,0,...,0,0,0,0,1996,5,49,M,writer,55105


In [7]:
rating_details_sample['user_id']=rating_details_sample['user_id'].astype(int)
rating_details_sample['movie_id']=rating_details_sample['movie_id'].astype(int)

In [8]:

rating_details_sample=rating_details_sample.set_index(['user_id','unix_timestamp']).sort_index()

In [9]:
rating_details_sample =rating_details_sample.reset_index()

In [10]:
# Get the like and dislike movie list
import numpy as np
rating_details_sample['movie_type'] = np.where(rating_details_sample['rating'] >=3, 'like','dislike')
rating_details_sample['movie_name'] = rating_details_sample['title'].str[:-6]

In [11]:
rating_details_sample.head(10)

Unnamed: 0,user_id,unix_timestamp,movie_id,rating,title,release_date,video_release_date,imdb_url,genre_unknown,Action,...,War,Western,year,all_genres,age,sex,occupation,zip_code,movie_type,movie_name
0,0,874965478,171,5.0,"Empire Strikes Back, The (1980)",01-Jan-1980,,http://us.imdb.com/M/title-exact?Empire%20Stri...,0,1,...,1,0,1980,128141517,24,M,technician,85711,like,"Empire Strikes Back, The"
1,0,874965478,167,5.0,Monty Python and the Holy Grail (1974),01-Jan-1974,,http://us.imdb.com/M/title-exact?Monty%20Pytho...,0,0,...,0,0,1974,5,24,M,technician,85711,like,Monty Python and the Holy Grail
2,0,874965518,164,5.0,Jean de Florette (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Jean%20de%20F...,0,0,...,0,0,1986,8,24,M,technician,85711,like,Jean de Florette
3,0,874965556,155,4.0,Reservoir Dogs (1992),01-Jan-1992,,http://us.imdb.com/M/title-exact?Reservoir%20D...,0,0,...,0,0,1992,616,24,M,technician,85711,like,Reservoir Dogs
4,0,874965677,195,5.0,Dead Poets Society (1989),01-Jan-1989,,http://us.imdb.com/M/title-exact?Dead%20Poets%...,0,0,...,0,0,1989,8,24,M,technician,85711,like,Dead Poets Society
5,0,874965677,165,5.0,Manon of the Spring (Manon des sources) (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Manon%20des%2...,0,0,...,0,0,1986,8,24,M,technician,85711,like,Manon of the Spring (Manon des sources)
6,0,874965678,186,4.0,"Godfather: Part II, The (1974)",01-Jan-1974,,http://us.imdb.com/M/title-exact?Godfather:%20...,0,1,...,0,0,1974,168,24,M,technician,85711,like,"Godfather: Part II, The"
7,0,874965706,13,5.0,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il...",0,0,...,0,0,1994,814,24,M,technician,85711,like,"Postino, Il"
8,0,874965706,126,5.0,"Godfather, The (1972)",01-Jan-1972,,"http://us.imdb.com/M/title-exact?Godfather,%20...",0,1,...,0,0,1972,168,24,M,technician,85711,like,"Godfather, The"
9,0,874965706,249,4.0,"Fifth Element, The (1997)",09-May-1997,,http://us.imdb.com/M/title-exact?Fifth%20Eleme...,0,1,...,0,0,1997,115,24,M,technician,85711,like,"Fifth Element, The"


In [12]:
user_ids = rating_details_sample["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

movie_ids = rating_details_sample["movie_id"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

title_ids = rating_details_sample["movie_name"].unique().tolist()
title2title_encoded = {x: i for i, x in enumerate(title_ids)}
title_encoded2title = {i: x for i, x in enumerate(title_ids)}


rating_details_sample["user"] = rating_details_sample["user_id"].map(user2user_encoded)
rating_details_sample["movie"] = rating_details_sample["movie_id"].map(movie2movie_encoded)
rating_details_sample["title_d"] = rating_details_sample["movie_name"].map(title2title_encoded)

In [13]:
sample_data=rating_details_sample[['user','occupation','sex']]

In [14]:
sample_data=sample_data.reset_index()

In [15]:
rating_details_sample["movie"].max()


1681

In [16]:
movie_list = rating_details_sample.groupby(['user','movie_type'])['movie'].apply(list).reset_index()
title_list = rating_details_sample.groupby(['user'])['title_d'].apply(list).reset_index()
genre_list = rating_details_sample.groupby(['user'])['all_genres'].unique().apply(list).reset_index()

In [17]:
# Get the unique set of genre for all the users
genre_list['all_genres']=genre_list['all_genres'].apply(lambda x: list(set(','.join(x))) ) 
genre_list['all_genres']=genre_list['all_genres'].apply(lambda x:[ x for x in x if x.isdigit() ])


In [18]:
user_video_list = movie_list.pivot(index='user', columns='movie_type', values='movie').reset_index()

In [19]:
user_video_list.fillna(rating_details_sample["movie"].max()+1, inplace=True)

In [20]:
sample_data = sample_data.drop('index',axis=1)

In [21]:
sample_data =sample_data.drop_duplicates()

In [22]:
user_final_list =pd.merge(user_video_list,title_list, how= 'left')
user_title_list1 = pd.merge(user_final_list,genre_list, how='left')
user_title_list = pd.merge(user_title_list1,sample_data, how='left')

In [23]:
user_title_list1.head(3)

Unnamed: 0,user,dislike,like,title_d,all_genres
0,0,"[31, 32, 33, 35, 36, 55, 71, 81, 97, 99, 107, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]"
1,1,"[279, 298, 130, 313, 314]","[272, 250, 273, 274, 275, 276, 277, 278, 280, ...","[271, 249, 272, 273, 274, 275, 276, 277, 278, ...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]"
2,2,"[302, 317, 309, 318, 275, 250, 321, 253, 322, ...","[316, 125, 278, 319, 320, 324, 325, 326, 328, ...","[300, 314, 315, 307, 14, 277, 316, 317, 318, 2...","[1, 3, 4, 7, 2, 6, 8, 0, 5]"


In [24]:
user_title_list['like'] =user_title_list['like'].apply(lambda x: x if type(x) is list else [x])
user_title_list['dislike'] =user_title_list['dislike'].apply(lambda x: x if type(x) is list else [x])

In [25]:
user_title_list['predict_labels'] = user_title_list['like'].apply(lambda x: (x[-1]))

In [26]:
user_title_list['like']=user_title_list['like'].apply(lambda x: (x[:-1]))

In [27]:
pd.DataFrame(user_title_list[['user','dislike','like','title_d','all_genres','predict_labels']]).head(4)

Unnamed: 0,user,dislike,like,title_d,all_genres,predict_labels
0,0,"[31, 32, 33, 35, 36, 55, 71, 81, 97, 99, 107, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]",269
1,1,"[279, 298, 130, 313, 314]","[272, 250, 273, 274, 275, 276, 277, 278, 280, ...","[271, 249, 272, 273, 274, 275, 276, 277, 278, ...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]",315
2,2,"[302, 317, 309, 318, 275, 250, 321, 253, 322, ...","[316, 125, 278, 319, 320, 324, 325, 326, 328, ...","[300, 314, 315, 307, 14, 277, 316, 317, 318, 2...","[1, 3, 4, 7, 2, 6, 8, 0, 5]",354
3,3,[361],"[250, 275, 309, 345, 254, 344, 355, 278, 350, ...","[249, 274, 307, 343, 253, 342, 352, 277, 348, ...","[1, 3, 4, 7, 2, 6, 8, 5]",55


In [28]:
user_title_list_e=user_title_list[(user_title_list.user >= 1)&
                                  (user_title_list.user <= 500)]

In [29]:
user_title_list.shape

(943, 8)

In [30]:

EMBEDDING_DIMS = 16
DENSE_UNITS = 64
DROPOUT_PCT = 0.0
ALPHA = 0.0
NUM_CLASSES=rating_details_sample["movie"].max()+2

LEARNING_RATE = 0.003 

In [31]:
import tensorflow as tf
class MaskedEmbeddingsAggregatorLayer(tf.keras.layers.Layer):
    def __init__(self, agg_mode='sum', **kwargs):
        super(MaskedEmbeddingsAggregatorLayer, self).__init__(**kwargs)

        if agg_mode not in ['sum', 'mean']:
            raise NotImplementedError('mode {} not implemented!'.format(agg_mode))
        self.agg_mode = agg_mode
    
    @tf.function
    def call(self, inputs, mask=None):
        masked_embeddings = tf.ragged.boolean_mask(inputs, mask)
        if self.agg_mode == 'sum':
            aggregated =  tf.reduce_sum(masked_embeddings, axis=1)
        elif self.agg_mode == 'mean':
            aggregated = tf.reduce_mean(masked_embeddings, axis=1)
        
        return aggregated
    
    def get_config(self):
        # this is used when loading a saved model that uses a custom layer
        return {'agg_mode': self.agg_mode}

In [32]:
class L2NormLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(L2NormLayer, self).__init__(**kwargs)
    
    @tf.function
    def call(self, inputs, mask=None):
        if mask is not None:
            inputs = tf.ragged.boolean_mask(inputs, mask).to_tensor()
        return tf.math.l2_normalize(inputs, axis=-1)

    def compute_mask(self, inputs, mask):
        return mask
    

In [37]:
#---inputs
import tensorflow as tf
import datetime
import os
input_title = tf.keras.Input(shape=(None, ), name='title_d')
inp_video_liked = tf.keras.layers.Input(shape=(None,), name='like')
inp_video_disliked = tf.keras.layers.Input(shape=(None,), name='dislike')
input_genre = tf.keras.Input(shape=(None, ), name='genre')


#--- layers
features_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='features_embeddings')
labels_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='labels_embeddings')

avg_embeddings = MaskedEmbeddingsAggregatorLayer(agg_mode='mean', name='aggregate_embeddings')

dense_1 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_1')
dense_2 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_2')
dense_3 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_3')
l2_norm_1 = L2NormLayer(name='l2_norm_1')

dense_output = tf.keras.layers.Dense(NUM_CLASSES, activation=tf.nn.softmax, name='dense_output')

#--- features
features_embeddings = features_embedding_layer(input_title)
l2_norm_features = l2_norm_1(features_embeddings)
avg_features = avg_embeddings(l2_norm_features)

labels_liked_embeddings = labels_embedding_layer(inp_video_liked)
l2_norm_liked = l2_norm_1(labels_liked_embeddings)
avg_liked = avg_embeddings(l2_norm_liked)

labels_disliked_embeddings = labels_embedding_layer(inp_video_disliked)
l2_norm_disliked = l2_norm_1(labels_disliked_embeddings)
avg_disliked = avg_embeddings(l2_norm_disliked)

labels_genre_embeddings = labels_embedding_layer(input_genre)
l2_norm_genre = l2_norm_1(labels_genre_embeddings)
avg_genre = avg_embeddings(l2_norm_genre)



concat_inputs = tf.keras.layers.Concatenate(axis=1)([avg_features,
                                                     avg_liked,
                                                     avg_disliked,
                                                     avg_genre
                                                     ])
# Dense Layers

dense_1_features = dense_1(concat_inputs)
dense_1_relu = tf.keras.layers.ReLU(name='dense_1_relu')(dense_1_features)
dense_1_batch_norm = tf.keras.layers.BatchNormalization(name='dense_1_batch_norm')(dense_1_relu)

dense_2_features = dense_2(dense_1_relu)
dense_2_relu = tf.keras.layers.ReLU(name='dense_2_relu')(dense_2_features)
#dense_2_batch_norm = tf.keras.layers.BatchNormalization(name='dense_2_batch_norm')(dense_2_relu)

dense_3_features = dense_3(dense_2_relu)
dense_3_relu = tf.keras.layers.ReLU(name='dense_3_relu')(dense_3_features)
dense_3_batch_norm = tf.keras.layers.BatchNormalization(name='dense_3_batch_norm')(dense_3_relu)
outputs = dense_output(dense_3_batch_norm)

#Optimizer
optimiser = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

#--- prep model
model = tf.keras.models.Model(
    inputs=[input_title, inp_video_liked, 
            inp_video_disliked
            ,input_genre
            ],
    outputs=[outputs]
)
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy')

In [38]:
user_title_list_e

Unnamed: 0,user,dislike,like,title_d,all_genres,occupation,sex,predict_labels
1,1,"[279, 298, 130, 313, 314]","[272, 250, 273, 274, 275, 276, 277, 278, 280, ...","[271, 249, 272, 273, 274, 275, 276, 277, 278, ...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]",other,F,315
2,2,"[302, 317, 309, 318, 275, 250, 321, 253, 322, ...","[316, 125, 278, 319, 320, 324, 325, 326, 328, ...","[300, 314, 315, 307, 14, 277, 316, 317, 318, 2...","[1, 3, 4, 7, 2, 6, 8, 0, 5]",writer,M,354
3,3,[361],"[250, 275, 309, 345, 254, 344, 355, 278, 350, ...","[249, 274, 307, 343, 253, 342, 352, 277, 348, ...","[1, 3, 4, 7, 2, 6, 8, 5]",technician,M,55
4,4,"[365, 368, 369, 201, 370, 371, 373, 176, 156, ...","[126, 249, 362, 39, 363, 364, 15, 9, 37, 203, ...","[125, 248, 359, 39, 360, 361, 15, 9, 37, 202, ...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]",other,F,34
5,5,"[272, 250, 349, 307, 127, 313, 457, 27, 294, 4...","[265, 302, 171, 280, 278, 308, 347, 125, 128, ...","[264, 271, 300, 170, 279, 249, 347, 277, 306, ...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]",executive,M,253
...,...,...,...,...,...,...,...,...
496,496,"[265, 275, 340, 995, 584, 195, 62, 366, 38, 93...","[298, 250, 309, 361, 125, 716, 35, 12, 87, 169...","[264, 274, 296, 249, 307, 358, 14, 708, 338, 3...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]",student,M,234
497,497,"[125, 349, 20, 247, 10, 250, 136, 633, 214, 84...","[171, 302, 275, 785, 8, 40, 1006, 25, 918, 131...","[170, 300, 274, 776, 8, 40, 995, 25, 908, 130,...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]",writer,M,421
498,498,"[287, 537, 250, 495, 626, 53, 488, 608, 74, 1482]","[689, 254, 309, 276, 785, 277, 345, 40, 17, 27...","[681, 253, 307, 275, 776, 276, 343, 40, 17, 27...","[1, 3, 4, 7, 2, 6, 8, 0, 5]",programmer,M,905
499,499,"[272, 308, 32, 30, 1162, 304, 368, 473, 111, 1...","[298, 250, 345, 309, 321, 310, 125, 974, 340, ...","[271, 296, 249, 306, 343, 307, 319, 308, 14, 9...","[1, 3, 4, 9, 7, 2, 6, 8, 0, 5]",administrator,M,305


In [39]:
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True,dpi=96)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [40]:
model.fit([tf.keras.preprocessing.sequence.pad_sequences(user_title_list_e['title_d']),
           tf.keras.preprocessing.sequence.pad_sequences(user_title_list_e['like']),
           tf.keras.preprocessing.sequence.pad_sequences(user_title_list_e['dislike'])
           ,
            tf.keras.preprocessing.sequence.pad_sequences(user_title_list_e['all_genres'])
           ],user_title_list_e['predict_labels'].values,callbacks=[tensorboard_callback],
           steps_per_epoch=1, epochs=1000,verbose=1)



Epoch 1/1000


InvalidArgumentError:  indices[3824] = 4501 is not in [0, 4500)
	 [[{{node model_1/aggregate_embeddings/PartitionedCall_3/RaggedMask/boolean_mask/GatherV2}}]] [Op:__inference_train_function_7665]

Function call stack:
train_function


In [None]:
user_list_1=user_title_list[user_title_list.user>600]

In [None]:
user_list_1.head(10)

Unnamed: 0,user,dislike,like,title_d,all_genres,occupation,sex,predict_labels
601,601,[326],"[309, 786, 870, 361, 298, 308, 457, 716, 127, ...","[307, 777, 860, 358, 296, 306, 324, 454, 708, ...","[8, 4, 2, 5, 7, 6, 3, 1]",other,F,43
602,602,"[254, 179, 295, 936, 136, 405, 152]","[249, 243, 17, 184, 176, 153, 379, 412, 376, 2...","[248, 242, 253, 17, 183, 175, 152, 178, 376, 4...","[4, 8, 2, 5, 7, 6, 0, 3, 1]",programmer,M,921
603,603,"[63, 87, 82, 423, 268, 158, 426, 653]","[72, 8, 56, 132, 241, 27, 163, 148, 652, 372, ...","[72, 8, 56, 131, 63, 240, 27, 87, 162, 147, 64...","[8, 4, 2, 5, 7, 6, 9, 1]",educator,M,657
604,604,"[309, 340, 11, 539, 81, 53, 1042, 930, 39, 610...","[272, 171, 302, 349, 275, 298, 758, 276, 761, ...","[271, 170, 307, 300, 347, 274, 296, 338, 749, ...","[4, 8, 2, 5, 7, 6, 0, 3, 1]",engineer,M,481
605,605,"[918, 298, 362, 1043, 201, 114, 71, 219, 501, ...","[275, 539, 336, 17, 256, 21, 721, 10, 9, 131, ...","[274, 534, 334, 17, 255, 21, 713, 10, 9, 130, ...","[0, 8, 4, 2, 5, 7, 6, 9, 3, 1]",programmer,M,1181
606,606,[39],"[785, 477, 241, 512, 529, 498, 377, 109, 486, ...","[776, 474, 240, 507, 524, 495, 374, 109, 483, ...","[4, 8, 2, 5, 7, 6, 3, 1]",healthcare,F,191
607,607,"[309, 344, 307, 760, 347, 276, 57, 1202, 17, 1...","[272, 171, 20, 298, 689, 273, 331, 275, 318, 3...","[271, 170, 307, 20, 342, 296, 305, 681, 751, 3...","[9, 4, 8, 2, 5, 7, 6, 0, 3, 1]",other,M,134
608,608,"[275, 115, 298, 321, 885, 323, 127, 457, 874, ...","[305, 250, 303, 289, 30, 221, 712, 308, 782]","[303, 274, 249, 301, 287, 115, 30, 220, 296, 7...","[8, 4, 2, 5, 6, 3, 1]",student,F,444
609,609,"[298, 254, 323, 27, 58, 84, 463, 711]","[279, 348, 275, 714, 253, 712, 305, 89, 512, 6...","[278, 346, 274, 296, 706, 253, 321, 252, 704, ...","[9, 8, 4, 2, 5, 7, 6, 0, 3, 1]",student,M,428
610,610,"[785, 282, 904, 334]","[171, 302, 275, 316, 278, 317, 306, 689, 253, ...","[170, 300, 274, 314, 277, 315, 304, 681, 252, ...","[8, 4, 2, 5, 7, 6, 0, 3, 1]",librarian,M,330


In [None]:

predict =model.predict([tf.keras.preprocessing.sequence.pad_sequences(user_list_1['title_d']),
          tf.keras.preprocessing.sequence.pad_sequences(user_list_1['like']),
          tf.keras.preprocessing.sequence.pad_sequences(user_list_1['dislike']),
          tf.keras.preprocessing.sequence.pad_sequences(user_list_1['all_genres'])
          ])




In [None]:
predictions = np.array([np.argmax(a) for a in predict])

In [None]:
user_list_1['predicted_label'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
user_list_1

Unnamed: 0,user,dislike,like,title_d,all_genres,occupation,sex,predict_labels,predicted_label
601,601,[326],"[309, 786, 870, 361, 298, 308, 457, 716, 127, ...","[307, 777, 860, 358, 296, 306, 324, 454, 708, ...","[8, 4, 2, 5, 7, 6, 3, 1]",other,F,43,539
602,602,"[254, 179, 295, 936, 136, 405, 152]","[249, 243, 17, 184, 176, 153, 379, 412, 376, 2...","[248, 242, 253, 17, 183, 175, 152, 178, 376, 4...","[4, 8, 2, 5, 7, 6, 0, 3, 1]",programmer,M,921,916
603,603,"[63, 87, 82, 423, 268, 158, 426, 653]","[72, 8, 56, 132, 241, 27, 163, 148, 652, 372, ...","[72, 8, 56, 131, 63, 240, 27, 87, 162, 147, 64...","[8, 4, 2, 5, 7, 6, 9, 1]",educator,M,657,786
604,604,"[309, 340, 11, 539, 81, 53, 1042, 930, 39, 610...","[272, 171, 302, 349, 275, 298, 758, 276, 761, ...","[271, 170, 307, 300, 347, 274, 296, 338, 749, ...","[4, 8, 2, 5, 7, 6, 0, 3, 1]",engineer,M,481,200
605,605,"[918, 298, 362, 1043, 201, 114, 71, 219, 501, ...","[275, 539, 336, 17, 256, 21, 721, 10, 9, 131, ...","[274, 534, 334, 17, 255, 21, 713, 10, 9, 130, ...","[0, 8, 4, 2, 5, 7, 6, 9, 3, 1]",programmer,M,1181,336
...,...,...,...,...,...,...,...,...,...
938,938,"[542, 251, 872, 936]","[332, 685, 250, 8, 247, 15, 285, 980, 1181, 71...","[250, 330, 677, 250, 862, 249, 8, 246, 15, 283...","[8, 4, 2, 5, 7, 6, 9, 3, 1]",student,F,172,336
939,939,"[321, 31, 254, 361, 326, 54, 1402, 267, 161, 2...","[272, 307, 276, 543, 346, 279, 310, 714, 302, ...","[271, 319, 305, 275, 537, 344, 31, 253, 278, 3...","[4, 8, 2, 5, 7, 6, 0, 3, 1]",administrator,M,311,336
940,940,"[361, 249]","[250, 309, 298, 11, 444, 10, 918, 290, 15, 27,...","[249, 307, 296, 358, 11, 441, 10, 908, 288, 15...","[8, 4, 2, 5, 7, 6, 3, 1]",student,M,30,97
941,941,[171],"[712, 279, 346, 307, 305, 253, 357, 250, 308, ...","[704, 278, 344, 170, 305, 303, 252, 354, 249, ...","[0, 8, 4, 2, 5, 7, 6, 9, 3, 1]",librarian,F,48,336


In [None]:
print(tf.argsort(predict,direction='DESCENDING',axis=-1))

tf.Tensor(
[[ 539    6  250 ... 1561 1164   20]
 [ 916  726  336 ...  430  811   63]
 [ 786  685  326 ...  665 1665 1615]
 ...
 [  97  376  473 ...  430   96  162]
 [ 336  716  937 ... 1343  220  159]
 [ 336  716  937 ... 1343  220  159]], shape=(342, 1683), dtype=int32)


In [None]:
tf.nn.top_k(
    predict,
    k=20,
    sorted=True,
    name=None
)

TopKV2(values=<tf.Tensor: shape=(342, 20), dtype=float32, numpy=
array([[0.01828908, 0.01587116, 0.0143125 , ..., 0.0081482 , 0.00813982,
        0.00785915],
       [0.27056417, 0.17075664, 0.15351553, ..., 0.00449383, 0.00404884,
        0.00347649],
       [0.01993676, 0.01731513, 0.01550991, ..., 0.00978938, 0.00964544,
        0.00950752],
       ...,
       [0.14553824, 0.12785448, 0.07932664, ..., 0.00572439, 0.00541566,
        0.00513971],
       [0.02236108, 0.02009165, 0.01455817, ..., 0.00756362, 0.00729877,
        0.00706648],
       [0.02236108, 0.02009165, 0.01455817, ..., 0.00756362, 0.00729877,
        0.00706648]], dtype=float32)>, indices=<tf.Tensor: shape=(342, 20), dtype=int32, numpy=
array([[ 539,    6,  250, ...,   33,  367,  875],
       [ 916,  726,  336, ...,  311,  247, 1093],
       [ 786,  685,  326, ..., 1040, 1019,  706],
       ...,
       [  97,  376,  473, ...,   16,  305,   52],
       [ 336,  716,  937, ...,   27,  508,   82],
       [ 336,  716,  9

In [None]:
s=[i for i in range(len(movie2movie_encoded)) if movie2movie_encoded[i] == 365 ]

In [None]:
movies[movies.movie_id == ''.join(str(s[0]))]

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,all_genres
368,368,Black Sheep (1996),02-Feb-1996,,http://us.imdb.com/M/title-exact?Black%20Sheep...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1996,5


### Visualize Embeddings 

In [None]:
import io
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
#weights =model.layers[4].get_weights()[0]
weights = features_embedding_layer.get_weights()[0][1:]

for num, word in enumerate(title2title_encoded):
  vec = weights[num+1] # skip 0, it's padding.
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
out_v.close()
out_m.close()

In [None]:
try:
  from google.colab import files
except ImportError:
   pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')