In [1]:
# automatically reload edited modules
%load_ext autoreload
%autoreload 2
%matplotlib inline

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

DATASET_DIR = './dataset/'

---
# Data Load

In [2]:
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
movie_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
              'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
              'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western']
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

#------------------
user_data = pd.read_csv(DATASET_DIR+'u.user', sep='|', names=users_cols)
movie_data = pd.read_csv(DATASET_DIR+'u.item', sep='|', names=movie_cols, encoding='latin')

#------------------
# train
rating_data = pd.read_csv(DATASET_DIR+'u1.base', sep='\t', names=rating_cols)
# test
test_rating_data = pd.read_csv(DATASET_DIR+'u1.test', sep='\t', names=rating_cols)

### Movie Data

In [3]:
movie_data.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### User Data

In [4]:
user_data.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Rating Data

In [5]:
rating_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [6]:
test_rating_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


___
# 모델에 입력할 Input Data 만들기
   
- data_per_user   
유저당 사용할 Target 데이터 갯수   

- wathed_data_size   
입력에 사용할 유저가 본 영화 갯수 

- negative_size   
입력에 사용할 negative_sample 갯수

In [7]:
data_per_user = 20
wathed_data_size = 10
negative_size = 5 # big: 2-5 / small: 5-20

# -------------------------
'''
marge user data and rating data.
user 데이터와 rating 데이터 병합.

Since it is a system that recommends movies for users, movies with scores of less than 3 stars are excluded.
유저에게 적합한 영화를 추천하는 시스템 이므로 3점 미만 점수를 준 영화는 제외한다.
'''
train_dataset = pd.merge(user_data[['user_id','age','sex']], rating_data[['user_id','movie_id','rating']][rating_data.rating >= 3], on='user_id')

'''
Select sample as many as 'data_per_user', delete user less than 'data_per_user'.
'data_per_user' 만큼 선택, 'data_per_user' 미만 개수 user 삭제.
'''
train_dataset = train_dataset.groupby('user_id', group_keys=False).apply(
    lambda x : x.sample(n=data_per_user).reset_index(drop=True)
    if len(x) >= data_per_user
    else x.reset_index(drop=True))

'''
Select watched movies as many as 'wathed_data_size'.
본 영화를 'wathed_data_size' 만큼 선택.
'''
train_dataset['watched_movies'] = train_dataset['user_id'].apply(
    lambda x : rating_data[rating_data.user_id == x]['movie_id'].sample(n=wathed_data_size).tolist()
    if len(rating_data[rating_data.user_id == x]['movie_id'])>= wathed_data_size
    else rating_data[rating_data.user_id == x]['movie_id'].tolist())
    #else rating_data[rating_data.user_id == x]['movie_id'].sample(n=wathed_data_size, replace=True).tolist())

'''
negative sampling

!!! It's not an accurate negative sampling because it's just random, needs to be modified !!! 
!!! 단순히 랜덤으로 뽑았으므로 정확한 negative sampling 아님, 수정 필요 !!!
'''
train_dataset['negative_samples'] = np.random.choice(len(movie_data)+1, (len(train_dataset),negative_size)).tolist() # +1 : 마지막 영화 ID도 포함

train_dataset

Unnamed: 0,user_id,age,sex,movie_id,rating,watched_movies,negative_samples
0,1,24,M,109,5,"[57, 137, 63, 165, 94, 35, 205, 71, 75, 43]","[1524, 359, 426, 1022, 449]"
1,1,24,M,58,4,"[195, 165, 79, 149, 46, 75, 237, 172, 18, 127]","[460, 678, 1329, 1445, 631]"
2,1,24,M,5,3,"[147, 22, 119, 220, 116, 136, 141, 167, 168, 239]","[997, 1177, 1421, 910, 32]"
3,1,24,M,216,5,"[138, 251, 181, 144, 4, 3, 162, 244, 119, 94]","[315, 1321, 1291, 1134, 1527]"
4,1,24,M,181,5,"[26, 48, 18, 63, 139, 55, 40, 109, 199, 217]","[907, 1471, 25, 1276, 652]"
...,...,...,...,...,...,...,...
15,943,22,M,98,5,"[2, 232, 443, 831, 318, 186, 68, 427, 12, 739]","[757, 307, 885, 170, 1564]"
16,943,22,M,427,4,"[121, 182, 24, 111, 161, 151, 526, 419, 117, 403]","[18, 162, 619, 1428, 910]"
17,943,22,M,94,4,"[96, 281, 831, 415, 763, 427, 124, 1011, 785, 22]","[299, 1328, 527, 1382, 616]"
18,943,22,M,928,5,"[282, 194, 475, 2, 31, 174, 41, 431, 232, 53]","[664, 642, 491, 805, 1479]"


In [8]:
# Shuffle
# 데이터 뒤섞기
train_dataset = train_dataset.sample(frac=1).reset_index(drop=True)
train_dataset.head()

Unnamed: 0,user_id,age,sex,movie_id,rating,watched_movies,negative_samples
0,440,30,M,283,5,"[751, 213, 921, 86, 988, 329, 300, 313, 283, 258]","[968, 1302, 57, 766, 1255]"
1,622,25,M,386,3,"[1039, 169, 1203, 250, 69, 542, 1216, 730, 373...","[244, 305, 635, 406, 501]"
2,590,50,M,150,5,"[1129, 275, 547, 740, 1331, 1009, 13, 754, 15,...","[15, 941, 1479, 1121, 490]"
3,354,29,F,286,4,"[89, 65, 1039, 638, 154, 896, 181, 10, 811, 1017]","[892, 981, 470, 205, 847]"
4,37,23,M,841,3,"[121, 210, 68, 833, 96, 568, 665, 230, 127, 578]","[1212, 395, 504, 1520, 1090]"


In [9]:
# Dataframe 형태에서 모델 입력에 맞게 array 형태로 변환
temp_1 = train_dataset['movie_id'].to_numpy().reshape(-1,1)
temp_2 = np.array(train_dataset['negative_samples'].tolist())
sample_data = np.append(temp_1, temp_2, axis=-1) # -> [Target, Negative_samples]

#watched_movies_data = np.array(train_dataset['watched_movies'].tolist())
watched_movies_data = tf.keras.preprocessing.sequence.pad_sequences(train_dataset['watched_movies'])

gender_data = train_dataset['sex'].apply(lambda x: 0 if x == 'M' else 1).to_numpy().reshape(-1,1)

age_data = train_dataset['age'].apply(lambda x: (x - train_dataset['age'].min())/(train_dataset['age'].max() - train_dataset['age'].min())).to_numpy().reshape(-1,1)

print(sample_data.shape, watched_movies_data.shape, gender_data.shape, age_data.shape)

(17601, 6) (17601, 10) (17601, 1) (17601, 1)


___
# Model 구성

#### Full Model
![full_model](./image/full_model.png)
   
#### Candidate Model
![candidate_model](image/candidate_model.png)

In [11]:
EMBEDDING_DIMS = 16 # 임베딩 크기
DENSE_UNITS = 64    # 은닉층 크기
MOVIE_NUM = movie_data['movie_id'].max() #1682
LEARNING_RATE = 0.003
BATCH_SIZE = 10

#-------------------------#
# embedding average layer
class Avg_Embedding(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Avg_Embedding, self).__init__(**kwargs)

    def call(self, input):
        return tf.reduce_mean(input, axis=1)
    
#-------------------------#
# Input layers
input_watched_movies = tf.keras.Input(shape=(None, ), name='watched_movies')
input_age = tf.keras.layers.Input(shape=(1), name='age')
input_gender = tf.keras.layers.Input(shape=(1), name='gender')
input_samples = tf.keras.Input(shape=(None, ), name='samples')

# Embedding layers (영화 기록을 임베딩 하기 위한 embedding layer, 검색 기록은 없으니 생략)
features_embedding_layer = tf.keras.layers.Embedding(input_dim = MOVIE_NUM+1, output_dim = EMBEDDING_DIMS, mask_zero=True, name='features_embedding')
average_embedding_layer = Avg_Embedding(name='features_embedding_average')

# Dense layers
dense_1 = tf.keras.layers.Dense(DENSE_UNITS, activation='relu', name='dense_1')
dense_2 = tf.keras.layers.Dense(EMBEDDING_DIMS, activation='relu', name='dense_2')

#-------------------------#
# Model connection
watched_movies_embedding = features_embedding_layer(input_watched_movies)
sample_movies_embedding = features_embedding_layer(input_samples)

average_embedding = average_embedding_layer(watched_movies_embedding)
concat_features = tf.keras.layers.concatenate([average_embedding, input_age, input_gender], axis=1, name='concatenate_features')
dense_1_out = dense_1(concat_features)
dense_2_out = dense_2(dense_1_out)
# 
dot_product = tf.keras.layers.dot([dense_2_out, sample_movies_embedding], axes=(1,2), name='dot_product')
output = tf.keras.layers.Activation('softmax', name = 'class_probabilities')(dot_product)

model = tf.keras.Model(inputs=[input_watched_movies, input_age, input_gender, input_samples], outputs=[output])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='sparse_categorical_crossentropy', metrics=['acc'])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 watched_movies (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 features_embedding (Embedding)  (None, None, 16)    26928       ['watched_movies[0][0]',         
                                                                  'samples[0][0]']                
                                                                                                  
 features_embedding_average (Av  (None, 16)          0           ['features_embedding[0][0]']     
 g_Embedding)                                                                                     
                                                                                            

___
# Train

In [12]:
history = model.fit([watched_movies_data, age_data, gender_data, sample_data],
                    np.zeros(len(train_dataset),dtype=int),
                    batch_size=BATCH_SIZE,
                    epochs=50)

Epoch 1/50


2022-06-06 19:13:10.784867: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


___
# Test

In [13]:
# 유저 id를 입력하면 입력 데이터를 만들어주는 함수
def pick_user_data_input(user_id, watched_sample_num = wathed_data_size):
    if len(rating_data[rating_data.user_id == user_id]) >= watched_sample_num:
        watched_movies_data = rating_data[rating_data.user_id == user_id]['movie_id'].sample(n=watched_sample_num).to_numpy().reshape(-1,watched_sample_num)
    else:
        #watched_movies_data = rating_data[rating_data.user_id == user_id]['movie_id'].sample(n=watched_sample_num, replace=True).to_numpy().reshape(-1,watched_sample_num)
        watched_movies_data = tf.keras.preprocessing.sequence.pad_sequences(rating_data[rating_data.user_id == user_id]['movie_id'].to_numpy().reshape(1,-1), maxlen=watched_sample_num)

    age_data = user_data[user_data.user_id == user_id]['age'].apply(lambda x: (x - user_data['age'].min())/(user_data['age'].max() - user_data['age'].min())).to_numpy().reshape(-1,1)
    gender_data = user_data[user_data.user_id == user_id]['sex'].apply(lambda x : 0 if x=='M' else 1).to_numpy().reshape(-1,1)

    return [watched_movies_data, age_data, gender_data]

# 유저 임베딩 vector
get_output = tf.keras.backend.function([model.get_layer('watched_movies').input,
                                        model.get_layer('age').input,
                                        model.get_layer('gender').input], [model.layers[-3].output])

# 영화 임베딩 vector
embedding_table = model.get_layer('features_embedding').get_weights()[0]
print(embedding_table.shape)

(1683, 16)


### Movie Recommendation

In [15]:
pick = 1 # 유저 ID
top_N_size = 50 # 최대 50개 추천

a = pick_user_data_input(pick)
pred = get_output(a)[0]
result = np.dot(pred, embedding_table.transpose())[0]
rank = sorted(range(len(result)), key=lambda k: result[k], reverse=True) # 영화 추천 순위

print(f"Movie Recommendation for user {pick} (movie_ID): {rank[:10]}")

Movie Recommendation for user 1 (movie_ID): [53, 1064, 1111, 510, 1488, 239, 230, 1157, 281, 162]


### mean average precision

`model prediction VS Random pick`

In [16]:
# mean average precision 측정

mAP_N = 0
display_freq = 10

for top_N in range(1, top_N_size+1):
    True_positive = test_rating_data[(test_rating_data.user_id == pick) & (test_rating_data.movie_id.isin(rank[:top_N]))]
    positive = test_rating_data[test_rating_data.user_id == pick]

    random_pick = np.random.choice(len(movie_data), top_N)
    Random_positive = test_rating_data[(test_rating_data.user_id == pick) & (test_rating_data.movie_id.isin(random_pick))]
    
    if top_N % display_freq == 0 or top_N == 1:
        print(f'top N  = {top_N} -------------------\n')

        print(f'model  = Rank {top_N} Recall    : {len(True_positive)/len(positive)} ({len(True_positive)}/{len(positive)})')
        print(f'model  = Rank {top_N} Precision : {len(True_positive)/top_N} ({len(True_positive)}/{top_N})\n')


        print(f'random = Rank {top_N} Recall    : {len(Random_positive)/len(positive)} ({len(Random_positive)}/{len(positive)})')
        print(f'random = Rank {top_N} Precision : {len(Random_positive)/top_N} ({len(Random_positive)}/{top_N})\n')
    
    mAP_N += len(True_positive)/top_N
    
print(f'mAP@{top_N_size} = {mAP_N/top_N_size}')

top N  = 1 -------------------

model  = Rank 1 Recall    : 0.0072992700729927005 (1/137)
model  = Rank 1 Precision : 1.0 (1/1)

random = Rank 1 Recall    : 0.0 (0/137)
random = Rank 1 Precision : 0.0 (0/1)

top N  = 10 -------------------

model  = Rank 10 Recall    : 0.014598540145985401 (2/137)
model  = Rank 10 Precision : 0.2 (2/10)

random = Rank 10 Recall    : 0.0 (0/137)
random = Rank 10 Precision : 0.0 (0/10)

top N  = 20 -------------------

model  = Rank 20 Recall    : 0.014598540145985401 (2/137)
model  = Rank 20 Precision : 0.1 (2/20)

random = Rank 20 Recall    : 0.0072992700729927005 (1/137)
random = Rank 20 Precision : 0.05 (1/20)

top N  = 30 -------------------

model  = Rank 30 Recall    : 0.029197080291970802 (4/137)
model  = Rank 30 Precision : 0.13333333333333333 (4/30)

random = Rank 30 Recall    : 0.0364963503649635 (5/137)
random = Rank 30 Precision : 0.16666666666666666 (5/30)

top N  = 40 -------------------

model  = Rank 40 Recall    : 0.058394160583941604 (

### 모든 user에 대해 MAP(mean average precision)의 평균 측정

In [18]:
top_N_size = 50
display_freq = 50
AVG_mAP = 0

for pick in range(1, test_rating_data.user_id.max()+1):
    mAP_N = 0
    pred = get_output(pick_user_data_input(pick))[0]
    result = np.dot(pred, embedding_table.transpose())[0]
    rank = sorted(range(len(result)), key=lambda k: result[k], reverse=True)
    
    for top_N in range(1, top_N_size+1):
        True_positive = test_rating_data[(test_rating_data.user_id == pick) & (test_rating_data.movie_id.isin(rank[:top_N]))]
        positive = test_rating_data[test_rating_data.user_id == pick]

        mAP_N += len(True_positive)/top_N
        
    AVG_mAP += mAP_N/top_N_size
    if pick % display_freq == 0 : print(f'AVG_mAP@{top_N_size} = {AVG_mAP/pick} ({pick}/{test_rating_data.user_id.max()})')
        
print(f'최종 : AVG_mAP@{top_N_size} = {AVG_mAP/test_rating_data.user_id.max()}')

AVG_mAP@50 = 0.14774015325555956 (50/462)
AVG_mAP@50 = 0.14651703451085207 (100/462)
AVG_mAP@50 = 0.14080320414689043 (150/462)
AVG_mAP@50 = 0.1386486893241169 (200/462)
AVG_mAP@50 = 0.13491874545034896 (250/462)
AVG_mAP@50 = 0.13819096102581413 (300/462)
AVG_mAP@50 = 0.13818576585674347 (350/462)
AVG_mAP@50 = 0.13215568231662916 (400/462)
AVG_mAP@50 = 0.12283793241821864 (450/462)
최종 : AVG_mAP@50 = 0.12002009124594983
