# 1. Candidate Generation
### 데이터 읽기 및 전처리

In [59]:
from urllib.request import urlretrieve
import zipfile
import pandas as pd
import numpy as np
import random

# data 읽기
# ------------- user columns ----------------------------
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']


# read data
users = pd.read_csv(
    'data/ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')


# ------------- watch columns ----------------------------
watch_cols = ['user_id', 'movie_id','rating', 'time-stamp']


watches = pd.read_csv(
    'data/ml-100k/u.data', sep='\t', names=watch_cols, encoding='latin-1')



# ------------- search columns ----------------------------
search_cols = ['user', 'search_hist']
searches = pd.read_csv(
    'data/u.search', sep='\t', names=search_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# ------------- movies columns ----------------------------
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'data/u.item', sep='|', names=movies_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0.
# DON'T SHIFT TO START AT 0 'CAUSE WE WILL FAIL TO CONVERT TYPE LATER
# users["user_id"] = users["user_id"].apply(lambda x: str(x-1))
# movies["movie_id"] = movies["movie_id"].apply(lambda x: str(x-1))
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
# watches["movie_id"] = watches["movie_id"].apply(lambda x: str(x-1))
# watches["user_id"] = watches["user_id"].apply(lambda x: str(x-1))
# searches["user_id"] = searches["user"].apply(lambda x: str(x-1))

# example_age 추가
movies['example_age'] = (pd.to_datetime("now") - pd.to_datetime(movies['release_date']))\
            /np.timedelta64(1,'D') 

# normalize
# norm_val = (val - min)/(max-min) 
# because max-min is the largest number, any value between max & min divided by max-min is always in [0,1]
def normalize_col(df,col_name):
    df[col_name] = (df[col_name] - df[col_name].min()) / (df[col_name].max() - df[col_name].min())
    return df

movies = normalize_col(movies,'example_age')
watches = normalize_col(watches,'watch_hist_time')


# data 합치기
# WILL FAIL IF SHIFTED AT 0 HERE
data = watches.merge(movies, on='movie_id').merge(users, on='user_id')
data['user_id']=data['user_id'].astype(int)
data['movie_id']=data['movie_id'].astype(int)
data = data.set_index(['user_id']).sort_index()
data = data.reset_index()
data['movie_name']=data['title'].str[:-6] # 년도 부분 자르기


# occupation 인코딩
occupations = data["occupation"].unique().tolist()
occupations_encoded = {x: i for i, x in enumerate(occupations)}
occupationsencoded2occupations = {i: x for i, x in enumerate(occupations)}

# search history 인코딩
search_hists = searches["search_hist"].unique().tolist()
search_encoded = {x: i for i, x in enumerate(search_hists)}
searchencoded2search = {i: x for i, x in enumerate(search_hists)}

# User index encoding
user_ids = data["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

# movie index encoding
movie_ids = data["movie_id"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

# movie title encoding
title_ids = data["title"].unique().tolist()
title2title_encoded = {x: i for i, x in enumerate(title_ids)}
title_encoded2title = {i: x for i, x in enumerate(title_ids)}

# change to encoding
data["user"] = data["user_id"].map(user2user_encoded)
data["movie"] = data["movie_id"].map(movie2movie_encoded)
data["title_d"] = data["title"].map(title2title_encoded)
searches["search_hist"] = searches["search_hist"].map(search_encoded)
data["occupation"] = data["occupation"].map(occupations_encoded)
# searches["search_hist"] = searches["search_hist"]
searches = normalize_col(searches,'search_hist')

watch_hist = data.groupby(['user'])['movie_id'].apply(list).reset_index()
search_hist = searches.groupby(['user'])['search_hist'].apply(list).reset_index()
watch_hist_time = data.groupby(['user'])['watch_hist_time'].apply(list).reset_index()
example_age = data.groupby(['user'])['example_age'].apply(list).reset_index()

user_video_list = data.pivot(index='user_id', columns='movie_id', values='movie').reset_index()
user_video_list.fillna(data["movie_id"].max()+1, inplace=True)

sample_data=data[['user','occupation','sex']]
sample_data=sample_data.reset_index()
sample_data = sample_data.drop('index',axis=1)
sample_data = sample_data.drop_duplicates()

user_movie_list = pd.merge(sample_data,watch_hist, how= 'left')
user_movie_list = pd.merge(user_movie_list,watch_hist_time, how='left')
user_movie_list = pd.merge(user_movie_list,search_hist, how='left')
user_movie_list = pd.merge(user_movie_list,example_age, how='left')
user_movie_list['search_hist'] = user_movie_list['search_hist'].apply(lambda x: x if type(x) is list else []) # NaN 처리
user_movie_list['predict_labels'] = user_movie_list['movie_id'].apply(lambda x: int(random.uniform(0,data["movie"].max()))) #label을 마지막 값으로..



train_data = user_movie_list[(user_movie_list.user >= 1)&
                                  (user_movie_list.user <= 5)]
test_data = user_movie_list[(user_movie_list.user >= 6)&
                                  (user_movie_list.user <= 10)]





In [60]:
# movies # 영화 정보 데이터
user_movie_list

Unnamed: 0,user,occupation,sex,movie_id,watch_hist_time,search_hist,example_age,predict_labels
0,0,0,M,"[10, 3, 9, 5]","[0.024163568773234202, 0.0, 0.2131350681536555...",[0.0],"[0.9577264264593144, 0.0, 1.0, 0.8799693352316...",3
1,1,1,F,"[2, 7]","[0.007390688617454417, 0.1644538856434767]","[0.0, 1.0]","[0.3599824772752164, 0.07994743182564888]",2
2,2,2,M,"[3, 8]","[0.17919100725792175, 0.2091520623119136]",[0.5],"[0.0, 1.0]",3
3,3,0,M,[8],[1.0],[1.0],[1.0],6
4,4,1,F,[4],[0.017923526287838557],[],[1.0],7
5,5,3,M,[6],[0.1614887590724022],[],[1.0],0
6,6,4,M,"[2, 8]","[0.4109134360063728, 0.17919100725792175]",[],"[0.3599824772752164, 1.0]",2
7,7,4,M,"[7, 10, 1]","[0.20477075588599752, 0.1644538856434767, 0.03...",[0.0],"[0.07994743182564888, 0.9577264264593144, 1.0]",6
8,8,5,M,[1],[0.4109134360063728],[],[1.0],1
9,9,6,M,[1],[0.20069923880332802],[],[1.0],6


In [61]:
train_data # train data

Unnamed: 0,user,occupation,sex,movie_id,watch_hist_time,search_hist,example_age,predict_labels
1,1,1,F,"[2, 7]","[0.007390688617454417, 0.1644538856434767]","[0.0, 1.0]","[0.3599824772752164, 0.07994743182564888]",2
2,2,2,M,"[3, 8]","[0.17919100725792175, 0.2091520623119136]",[0.5],"[0.0, 1.0]",3
3,3,0,M,[8],[1.0],[1.0],[1.0],6
4,4,1,F,[4],[0.017923526287838557],[],[1.0],7
5,5,3,M,[6],[0.1614887590724022],[],[1.0],0


### Hyperparameter Definition

In [62]:
# CONTINUE FROM HERE - 30-09-2021
import tensorflow as tf
EMBEDDING_DIMS = 16
DENSE_UNITS = 64
DROPOUT_PCT = 0.0
ALPHA = 0.0
NUM_CLASSES=data["movie"].max() + 2
LEARNING_RATE = 0.003
tf.random.set_seed(
    1324
)

### custom layers 

In [63]:
import tensorflow as tf
class MaskedEmbeddingsAggregatorLayer(tf.keras.layers.Layer):
    def __init__(self, agg_mode='sum', **kwargs):
        super(MaskedEmbeddingsAggregatorLayer, self).__init__(**kwargs)

        if agg_mode not in ['sum', 'mean']:
            raise NotImplementedError('mode {} not implemented!'.format(agg_mode))
        self.agg_mode = agg_mode
    
    @tf.function
    def call(self, inputs, mask=None):
#         https://www.tensorflow.org/api_docs/python/tf/ragged/boolean_mask
        masked_embeddings = tf.ragged.boolean_mask(inputs, mask)
        if self.agg_mode == 'sum':
            aggregated =  tf.reduce_sum(masked_embeddings, axis=1)
        elif self.agg_mode == 'mean':
            aggregated = tf.reduce_mean(masked_embeddings, axis=1)
        return aggregated
    
    def get_config(self):
        # this is used when loading a saved model that uses a custom layer
        return {'agg_mode': self.agg_mode}
    
class L2NormLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(L2NormLayer, self).__init__(**kwargs)
    
    @tf.function
    def call(self, inputs, mask=None):
        if mask is not None:
            inputs = tf.ragged.boolean_mask(inputs, mask).to_tensor()
        return tf.math.l2_normalize(inputs, axis=-1)

    def compute_mask(self, inputs, mask):
        return mask


### model

In [64]:
#---inputs
import tensorflow as tf
import datetime
import os
input_watch_hist = tf.keras.Input(shape=(None, ), name='watch_hist')
input_watch_hist_time = tf.keras.layers.Input(shape=(None,), name='watch_hist_time')
input_search_hist = tf.keras.layers.Input(shape=(None,), name='search_hist')
input_example_age = tf.keras.Input(shape=(None, ), name='example_age')
input_occupation = tf.keras.Input(shape=(None, ), name='occupation')


#--- layers
features_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='features_embeddings')
labels_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='labels_embeddings')

avg_embeddings = MaskedEmbeddingsAggregatorLayer(agg_mode='mean', name='aggregate_embeddings')

dense_1 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_1')
dense_2 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_2')
dense_3 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_3')
l2_norm_1 = L2NormLayer(name='l2_norm_1')

dense_output = tf.keras.layers.Dense(NUM_CLASSES, activation=tf.nn.softmax, name='dense_output')

#--- features
features_embeddings = features_embedding_layer(input_watch_hist)
l2_norm_features = l2_norm_1(features_embeddings)
avg_features = avg_embeddings(l2_norm_features)

labels_watch_embeddings = labels_embedding_layer(input_watch_hist_time)
l2_norm_watched = l2_norm_1(labels_watch_embeddings)
avg_watched = avg_embeddings(l2_norm_watched)

labels_search_embeddings = labels_embedding_layer(input_search_hist)
l2_norm_searched = l2_norm_1(labels_search_embeddings)
avg_searched = avg_embeddings(l2_norm_searched)

labels_example_age_embeddings = labels_embedding_layer(input_example_age)
l2_norm_example_age = l2_norm_1(labels_example_age_embeddings)
avg_example_age = avg_embeddings(l2_norm_example_age)

labels_occupation_embeddings = labels_embedding_layer(input_occupation)
l2_norm_occupation = l2_norm_1(labels_occupation_embeddings)
avg__occupation = avg_embeddings(l2_norm_occupation)


print(avg_features)
print(avg_watched)
print(avg_searched)
print(avg_example_age)
print(input_occupation)

# 임베딩 벡터들 연결
concat_inputs = tf.keras.layers.Concatenate(axis=1)([avg_features,
                                                     avg_watched,
                                                     avg_searched,
                                                     avg_example_age,
#                                                      avg__occupation
                                                     ])
# Dense Layers
dense_1_features = dense_1(concat_inputs)
dense_1_relu = tf.keras.layers.ReLU(name='dense_1_relu')(dense_1_features)
dense_1_batch_norm = tf.keras.layers.BatchNormalization(name='dense_1_batch_norm')(dense_1_relu)

dense_2_features = dense_2(dense_1_relu)
dense_2_relu = tf.keras.layers.ReLU(name='dense_2_relu')(dense_2_features)
# dense_2_batch_norm = tf.keras.layers.BatchNormalization(name='dense_2_batch_norm')(dense_2_relu)

dense_3_features = dense_3(dense_2_relu)
dense_3_relu = tf.keras.layers.ReLU(name='dense_3_relu')(dense_3_features)
dense_3_batch_norm = tf.keras.layers.BatchNormalization(name='dense_3_batch_norm')(dense_3_relu)
outputs = dense_output(dense_3_batch_norm)

#Optimizer
optimiser = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

#--- prep model
model = tf.keras.models.Model(
    inputs=[input_watch_hist, 
            input_watch_hist_time, 
            input_search_hist,
            input_example_age,
#             input_occupation,
            ],
    outputs=[outputs]
)
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['acc'])

model.summary()

KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='aggregate_embeddings/PartitionedCall:0', description="created by layer 'aggregate_embeddings'")
KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='aggregate_embeddings/PartitionedCall:0', description="created by layer 'aggregate_embeddings'")
KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='aggregate_embeddings/PartitionedCall:0', description="created by layer 'aggregate_embeddings'")
KerasTensor(type_spec=TensorSpec(shape=(None, 16), dtype=tf.float32, name=None), name='aggregate_embeddings/PartitionedCall:0', description="created by layer 'aggregate_embeddings'")
KerasTensor(type_spec=TensorSpec(shape=(None, None), dtype=tf.float32, name='occupation'), name='occupation', description="created by layer 'occupation'")
Model: "model_3"
________________________________________________________________________________________________

In [65]:
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True,dpi=96)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


### Starts learning

In [66]:
train_data

Unnamed: 0,user,occupation,sex,movie_id,watch_hist_time,search_hist,example_age,predict_labels
1,1,1,F,"[2, 7]","[0.007390688617454417, 0.1644538856434767]","[0.0, 1.0]","[0.3599824772752164, 0.07994743182564888]",2
2,2,2,M,"[3, 8]","[0.17919100725792175, 0.2091520623119136]",[0.5],"[0.0, 1.0]",3
3,3,0,M,[8],[1.0],[1.0],[1.0],6
4,4,1,F,[4],[0.017923526287838557],[],[1.0],7
5,5,3,M,[6],[0.1614887590724022],[],[1.0],0


In [67]:
history = model.fit([tf.keras.preprocessing.sequence.pad_sequences(train_data['movie_id']),
           tf.keras.preprocessing.sequence.pad_sequences(train_data['watch_hist_time'], dtype=float),
           tf.keras.preprocessing.sequence.pad_sequences(train_data['search_hist'], dtype=float) + 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(train_data['example_age'], dtype=float),
#            tf.keras.preprocessing.sequence.pad_sequences(train_data['occupation'], dtype=float),
           ],train_data['predict_labels'].values,
           steps_per_epoch=1, epochs=50)




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [68]:
model.save("candidate_generation.h5")



In [69]:
pred = model.predict([tf.keras.preprocessing.sequence.pad_sequences(test_data['movie_id']),
           tf.keras.preprocessing.sequence.pad_sequences(test_data['watch_hist_time'], dtype=float),
           tf.keras.preprocessing.sequence.pad_sequences(test_data['search_hist'], dtype=float) + 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(test_data['example_age'], dtype=float)
           ])

In [70]:
pred

array([[0.10233228, 0.0751733 , 0.08530524, 0.1830142 , 0.08421379,
        0.06798429, 0.08304854, 0.1249233 , 0.06309732, 0.06720898,
        0.06369876],
       [0.06068251, 0.07617046, 0.18653351, 0.1722816 , 0.07637826,
        0.07691326, 0.07340123, 0.07524016, 0.06022383, 0.07139685,
        0.07077843],
       [0.2206241 , 0.05440961, 0.06060478, 0.06322867, 0.05689442,
        0.05362058, 0.07711288, 0.26216787, 0.04874825, 0.04907564,
        0.05351328],
       [0.2206241 , 0.05440961, 0.06060478, 0.06322867, 0.05689442,
        0.05362058, 0.07711288, 0.26216787, 0.04874825, 0.04907564,
        0.05351328]], dtype=float32)

In [71]:
# candidate generation: 
###### We extract the top-7 recommendation data for each user.
N = 6
k = np.sort((-pred).argsort()[:,:N])
print(k)
k = k.flatten()
k[k>data["movie"].max()]=0
k = np.unique(k)


[[0 2 3 4 6 7]
 [1 2 3 4 5 7]
 [0 2 3 4 6 7]
 [0 2 3 4 6 7]]


# 2. Ranking

### Load model and preprocess data

In [72]:
# load candidate_generation 
model = tf.keras.models.load_model(
    'candidate_generation.h5',
    custom_objects={
        'L2NormLayer':L2NormLayer,
        'MaskedEmbeddingsAggregatorLayer':MaskedEmbeddingsAggregatorLayer
    }
)

In [73]:
def get_genres(movies, genres):
	def get_all_genres(gs):
		active = [str(genres_encoded[genre]) for genre, g in zip(genres, gs) if g==1]
		if len(active) == 0:
			return '0'
		return ','.join((active))
	
	movies['all_genres'] = [
	get_all_genres(gs) for gs in zip(*[movies[genre] for genre in genres])]


In [103]:
movies

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,example_age
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1995,1.0
1,2,GoldenEye (2011),01-Jan-2011,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,2011,0.359982
2,3,Four Rooms (2020),01-Jan-2020,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,2020,0.0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1995,1.0
4,5,Copycat (1998),01-Jan-1998,,http://us.imdb.com/M/title-exact?Copycat%20(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,1998,0.879969
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1995,1.0
6,7,Twelve Monkeys (2018),01-Jan-2018,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,1,0,0,0,2018,0.079947
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,1995,1.0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1995,1.0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1996,0.957726


In [97]:
title2title_encoded

{'Richard III (1995)': 0,
 'Four Rooms (2020)': 1,
 'Dead Man Walking (1995)': 2,
 'Copycat (1998)': 3,
 'GoldenEye (2011)': 4,
 'Twelve Monkeys (2018)': 5,
 'Babe (1995)': 6,
 'Get Shorty (1995)': 7,
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 8,
 'Toy Story (1995)': 9}

In [75]:
movie_data = movies.set_index(['movie_id']).sort_index()
movie_data = movie_data.loc[k+1]
movie_data["title_d"] = movie_data["title"].map(title2title_encoded)

In [94]:
movie_data


Unnamed: 0_level_0,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,Comedy,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,example_age,title_d,all_genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,1995,1.0,9,345
2,GoldenEye (2011),01-Jan-2011,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,1,0,0,2011,0.359982,4,1216
3,Four Rooms (2020),01-Jan-2020,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,1,0,0,2020,0.0,1,16
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,1995,1.0,7,158
5,Copycat (1998),01-Jan-1998,,http://us.imdb.com/M/title-exact?Copycat%20(1998),0,0,0,0,0,0,...,0,0,0,1,0,0,1998,0.879969,3,6816
6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,0,...,0,0,0,0,0,0,1995,1.0,8,8
7,Twelve Monkeys (2018),01-Jan-2018,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,0,...,0,0,1,0,0,0,2018,0.079947,5,815
8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,1,...,0,0,0,0,0,0,1995,1.0,6,458


In [115]:

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'data/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

genres_encoded = {x: i for i, x in enumerate(genre_cols)}
get_genres(movie_data, genre_cols)

new_data = movie_data.merge(ratings, on='movie_id') # rating 추가

genre_occurences = new_data[genre_cols].sum().to_dict()


In [116]:

new_data = new_data[['movie_id', 'user_id', 'rating', 'unix_timestamp', 'all_genres', 'title_d']]
new_data['movie_type'] = np.where(new_data['rating'] >= 3, 'like', 'dislike') # 3보다 크면 like
new_data

Unnamed: 0,movie_id,user_id,rating,unix_timestamp,all_genres,title_d,movie_type
0,1,10,2,880606923,345,9,dislike
1,1,9,1,886397596,345,9,dislike
2,1,8,4,884182806,345,9,like
3,2,2,3,891717742,1216,4,like
4,2,7,5,876042340,1216,4,like
5,3,1,1,878887116,16,1,dislike
6,3,3,5,879781125,16,1,like
7,4,5,2,879372434,158,7,dislike
8,5,1,3,883603013,6816,3,like
9,6,6,3,891035994,8,8,like


In [79]:

movie_list = new_data.groupby(['user_id','movie_type'])['movie_id'].apply(list).reset_index()

movie_list

Unnamed: 0,user_id,movie_type,movie_id
0,1,dislike,[3]
1,1,like,[5]
2,2,like,"[2, 7]"
3,3,like,"[3, 8]"
4,4,dislike,[8]
5,5,dislike,[4]
6,6,like,[6]
7,7,like,"[2, 8]"
8,8,like,"[1, 7]"
9,9,dislike,[1]


In [121]:
genre_list = new_data.groupby(['user_id'])['all_genres'].unique().apply(list).reset_index()
genre_list['all_genres']=genre_list['all_genres'].apply(lambda x: list(set(','.join(x))) ) # 중복제거
genre_list['all_genres']=genre_list['all_genres'].apply(lambda x:[ x for x in x if x.isdigit() ])
genre_list

Unnamed: 0,user_id,all_genres
0,1,"[6, 1, 8]"
1,2,"[6, 5, 1, 2, 8]"
2,3,"[6, 5, 1, 4, 8]"
3,4,"[4, 5, 8]"
4,5,"[5, 1, 8]"
5,6,[8]
6,7,"[6, 2, 5, 1, 4, 8]"
7,8,"[3, 5, 1, 4, 8]"
8,9,"[4, 5, 3]"
9,10,"[4, 5, 3]"


In [80]:

genre_list = new_data.groupby(['user_id'])['all_genres'].unique().apply(list).reset_index()
genre_list['all_genres']=genre_list['all_genres'].apply(lambda x: list(set(','.join(x))) ) # 중복제거
genre_list['all_genres']=genre_list['all_genres'].apply(lambda x:[ x for x in x if x.isdigit() ])

new_data = normalize_col(new_data, 'unix_timestamp')
timestamp_list = new_data.groupby(['user_id'])['unix_timestamp'].unique().apply(list).reset_index()

title_list = new_data.groupby(['user_id'])['title_d'].apply(list).reset_index()
print(movie_list)
dataset = movie_list.pivot(index='user_id', columns='movie_type', values='movie_id').reset_index()
dataset.fillna(new_data["movie_id"].max()+1, inplace=True)

dataset['like'] =dataset['like'].apply(lambda x: x if type(x) is list else [])
dataset['dislike'] =dataset['dislike'].apply(lambda x: x if type(x) is list else [])

dataset = pd.merge(dataset, title_list, how='left')
dataset = pd.merge(dataset, genre_list, how='left')
dataset = pd.merge(dataset, timestamp_list, how='left')

dataset['predict_labels'] = dataset['like'].apply(lambda x: int(random.uniform(1,new_data["movie_id"].max()))) #label을 마지막 값으로..

dataset['like']=dataset['like'].apply(lambda x: [new_data["movie_id"].max()+1] if x == [] else x)
dataset['dislike']=dataset['dislike'].apply(lambda x: [new_data["movie_id"].max()+1] if x == [] else x)
train_data=dataset[(dataset.user_id >= 1)&
                                  (dataset.user_id <= 5)]
test_data=dataset[(dataset.user_id >= 6)&
                                  (dataset.user_id <= 9)]

    user_id movie_type movie_id
0         1    dislike      [3]
1         1       like      [5]
2         2       like   [2, 7]
3         3       like   [3, 8]
4         4    dislike      [8]
5         5    dislike      [4]
6         6       like      [6]
7         7       like   [2, 8]
8         8       like   [1, 7]
9         9    dislike      [1]
10       10    dislike      [1]


In [81]:
new_data

Unnamed: 0,movie_id,user_id,rating,unix_timestamp,all_genres,title_d,movie_type
0,1,10,2,0.291194,345,9,dislike
1,1,9,1,0.660605,345,9,dislike
2,1,8,4,0.519315,345,9,like
3,2,2,3,1.0,1216,4,like
4,2,7,5,0.0,1216,4,like
5,3,1,1,0.18148,16,1,dislike
6,3,3,5,0.238513,16,1,like
7,4,5,2,0.212441,158,7,dislike
8,5,1,3,0.482327,6816,3,like
9,6,6,3,0.956508,8,8,like


In [82]:
train_data

Unnamed: 0,user_id,dislike,like,title_d,all_genres,unix_timestamp,predict_labels
0,1,[3],[5],"[1, 3]","[6, 1, 8]","[0.18148025805016038, 0.4823272155954916]",5
1,2,[9],"[2, 7]","[4, 5]","[6, 5, 1, 2, 8]","[1.0, 0.9943047712588169]",6
2,3,[9],"[3, 8]","[1, 6]","[6, 5, 1, 4, 8]","[0.238512862381456, 0.6559625711672339]",5
3,4,[8],[9],[6],"[4, 5, 8]",[0.22311734014859716],2
4,5,[4],[9],[7],"[5, 1, 8]",[0.2124407399567807],1


In [83]:
test_data

Unnamed: 0,user_id,dislike,like,title_d,all_genres,unix_timestamp,predict_labels
5,6,[9],[6],[8],[8],[0.9565084200073466],4
6,7,[9],"[2, 8]","[4, 6]","[6, 2, 5, 1, 4, 8]","[0.0, 0.21964208637201138]",6
7,8,[9],"[1, 7]","[9, 5]","[3, 5, 1, 4, 8]","[0.5193146561727731, 0.7694933118780622]",2
8,9,[1],[9],[9],"[4, 5, 3]",[0.6606054505013651],6


### Hyperparameter Definition

In [84]:
EMBEDDING_DIMS = 16
DENSE_UNITS = 64
DROPOUT_PCT = 0.0
ALPHA = 0.0
NUM_CLASSES=new_data["movie_id"].max() + 3
LEARNING_RATE = 0.003

### model

In [85]:
#---inputs
import tensorflow as tf
import datetime
import os
input_title = tf.keras.Input(shape=(None, ), name='movie_name')
inp_video_liked = tf.keras.layers.Input(shape=(None,), name='like')
inp_video_disliked = tf.keras.layers.Input(shape=(None,), name='dislike')
input_genre = tf.keras.Input(shape=(None, ), name='genre')
input_timestamp = tf.keras.Input(shape=(None, ), name='timestamp')


#--- layers
features_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='features_embeddings')
labels_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='labels_embeddings')

avg_embeddings = MaskedEmbeddingsAggregatorLayer(agg_mode='mean', name='aggregate_embeddings')

dense_1 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_1')
dense_2 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_2')
dense_3 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_3')
l2_norm_1 = L2NormLayer(name='l2_norm_1')

dense_output = tf.keras.layers.Dense(NUM_CLASSES, activation=tf.nn.softmax, name='dense_output')

#--- features
features_embeddings = features_embedding_layer(input_title)
l2_norm_features = l2_norm_1(features_embeddings)
avg_features = avg_embeddings(l2_norm_features)

labels_liked_embeddings = labels_embedding_layer(inp_video_liked)
l2_norm_liked = l2_norm_1(labels_liked_embeddings)
avg_liked = avg_embeddings(l2_norm_liked)

labels_disliked_embeddings = labels_embedding_layer(inp_video_disliked)
l2_norm_disliked = l2_norm_1(labels_disliked_embeddings)
avg_disliked = avg_embeddings(l2_norm_disliked)

labels_genre_embeddings = labels_embedding_layer(input_genre)
l2_norm_genre = l2_norm_1(labels_genre_embeddings)
avg_genre = avg_embeddings(l2_norm_genre)

labels_timestamp_embeddings = labels_embedding_layer(input_timestamp)
l2_norm_timestamp = l2_norm_1(labels_timestamp_embeddings)
avg_timestamp = avg_embeddings(l2_norm_timestamp)


# 임베딩 벡터들 연결
concat_inputs = tf.keras.layers.Concatenate(axis=1)([avg_features,
                                                     avg_liked,
                                                     avg_disliked,
                                                     avg_genre,
                                                     avg_timestamp
                                                     ])
# Dense Layers
dense_1_features = dense_1(concat_inputs)
dense_1_relu = tf.keras.layers.ReLU(name='dense_1_relu')(dense_1_features)
dense_1_batch_norm = tf.keras.layers.BatchNormalization(name='dense_1_batch_norm')(dense_1_relu)

dense_2_features = dense_2(dense_1_relu)
dense_2_relu = tf.keras.layers.ReLU(name='dense_2_relu')(dense_2_features)
# dense_2_batch_norm = tf.keras.layers.BatchNormalization(name='dense_2_batch_norm')(dense_2_relu)

dense_3_features = dense_3(dense_2_relu)
dense_3_relu = tf.keras.layers.ReLU(name='dense_3_relu')(dense_3_features)
dense_3_batch_norm = tf.keras.layers.BatchNormalization(name='dense_3_batch_norm')(dense_3_relu)
outputs = dense_output(dense_3_batch_norm)

#Optimizer
optimiser = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

#--- prep model
model = tf.keras.models.Model(
    inputs=[input_title, 
            inp_video_liked, 
            inp_video_disliked,
            input_genre,
            input_timestamp,
            ],
    outputs=[outputs]
)
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['acc'])

# model.summary()

### 학습

In [86]:
history = model.fit([tf.keras.preprocessing.sequence.pad_sequences(train_data['title_d'])+ 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(train_data['like'])+ 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(train_data['dislike'])+ 1e-10,
            tf.keras.preprocessing.sequence.pad_sequences(train_data['all_genres'])+ 1e-10,
            tf.keras.preprocessing.sequence.pad_sequences(train_data['unix_timestamp'], dtype=float) + 1e-10,
           ],train_data['predict_labels'].values,
           steps_per_epoch=1, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [87]:
results = model.evaluate([tf.keras.preprocessing.sequence.pad_sequences(test_data['title_d'])+ 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(test_data['like'])+ 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(test_data['dislike'])+ 1e-10,
            tf.keras.preprocessing.sequence.pad_sequences(test_data['all_genres'])+ 1e-10,
            tf.keras.preprocessing.sequence.pad_sequences(test_data['unix_timestamp'], dtype=float) + 1e-10,
           ], test_data['predict_labels'].values+ 1e-10, verbose=1
        )



In [88]:
pred = model.predict([tf.keras.preprocessing.sequence.pad_sequences(test_data['title_d'])+ 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(test_data['like'])+ 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(test_data['dislike'])+ 1e-10,
            tf.keras.preprocessing.sequence.pad_sequences(test_data['all_genres'])+ 1e-10,
            tf.keras.preprocessing.sequence.pad_sequences(test_data['unix_timestamp'], dtype=float) + 1e-10
           ])



In [89]:
pred

array([[0.07901784, 0.1603995 , 0.07573444, 0.06338791, 0.05739753,
        0.13206178, 0.13167237, 0.06098158, 0.09296114, 0.07373927,
        0.07264671],
       [0.07035191, 0.08454088, 0.06060715, 0.07051102, 0.05437139,
        0.2213649 , 0.16551219, 0.06623129, 0.06649683, 0.07739711,
        0.06261533],
       [0.07932898, 0.16062401, 0.06789388, 0.07522492, 0.04725219,
        0.09017488, 0.21734384, 0.06344321, 0.06228464, 0.08504935,
        0.05138007],
       [0.07466831, 0.17336749, 0.36826903, 0.05787953, 0.0490586 ,
        0.02649532, 0.0360181 , 0.04705316, 0.06241624, 0.06110505,
        0.04366915]], dtype=float32)

In [90]:
# ranking
###### 각 user당 top-3개의 추천 데이터를 뽑아낸다.
N = 3
k = np.sort((-pred).argsort()[:,:N])
k[k>new_data["movie_id"].max()]=0
print(k)

[[1 5 6]
 [1 5 6]
 [1 5 6]
 [0 1 2]]
