In [1]:
# automatically reload edited modules
%load_ext autoreload
%autoreload 2
%matplotlib inline

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

DATASET_DIR = './dataset/'
# DATASET_DIR = '/home/esdl/tensorflow/DATA_SET/call_history/'

### Data load

In [3]:
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
movie_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
              'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
              'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western']
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

user_data = pd.read_csv(DATASET_DIR+'u.user', sep='|', names=users_cols)
movie_data = pd.read_csv(DATASET_DIR+'u.item', sep='|', names=movie_cols, encoding='latin')

# user_data = pd.read_csv(DATASET_DIR+'1_1446.CSV')
# user_data = pd.read_csv(DATASET_DIR+'11_500.CSV', encoding='latin')

# user_data

#------------------
# train
rating_data = pd.read_csv(DATASET_DIR+'u1.base', sep='\t', names=rating_cols)
# test
test_rating_data = pd.read_csv(DATASET_DIR+'u1.test', sep='\t', names=rating_cols)

In [4]:
movie_data.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
user_data.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
rating_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [6]:
test_rating_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


### 모델에 입력할 Input_data 만들기
   
- data_per_user   
유저당 사용할 Target 데이터 갯수   

- wathed_data_size   
입력에 사용할 유저가 본 영화 갯수 

- negative_size   
입력에 사용할 negative_sample 갯수

In [7]:
data_per_user = 20#20
wathed_data_size = 10#5
negative_size = 5#2 # big 2-5 small 5-20

# -------------------------

train_dataset = pd.merge(user_data[['user_id','age','sex']], rating_data[['user_id','movie_id','rating']][rating_data.rating >= 3], on='user_id')

train_dataset = train_dataset.groupby('user_id', group_keys=False).apply(
    lambda x : x.sample(n=data_per_user).reset_index(drop=True)
    if len(x) >= data_per_user
    else x.reset_index(drop=True))

train_dataset['watched_movies'] = train_dataset['user_id'].apply(
    lambda x : rating_data[rating_data.user_id == x]['movie_id'].sample(n=wathed_data_size).tolist()
    if len(rating_data[rating_data.user_id == x]['movie_id'])>= wathed_data_size
    else rating_data[rating_data.user_id == x]['movie_id'].tolist())
    #else rating_data[rating_data.user_id == x]['movie_id'].sample(n=wathed_data_size, replace=True).tolist())

train_dataset['negative_samples'] = np.random.choice(len(movie_data)+1, (len(train_dataset),negative_size)).tolist() # +1 : 마지막 영화 ID도 포함

train_dataset

Unnamed: 0,user_id,age,sex,movie_id,rating,watched_movies,negative_samples
0,1,24,M,153,3,"[162, 127, 138, 246, 240, 251, 123, 59, 34, 13]","[1079, 335, 1672, 1596, 851]"
1,1,24,M,239,4,"[269, 57, 111, 48, 168, 127, 116, 197, 249, 156]","[1371, 1184, 966, 249, 539]"
2,1,24,M,176,5,"[149, 88, 19, 131, 115, 110, 83, 211, 137, 271]","[1385, 711, 729, 863, 1637]"
3,1,24,M,111,5,"[231, 237, 99, 37, 88, 181, 178, 109, 152, 34]","[1674, 970, 462, 1286, 403]"
4,1,24,M,79,4,"[198, 182, 11, 48, 105, 99, 87, 166, 110, 71]","[1397, 1510, 151, 1311, 628]"
...,...,...,...,...,...,...,...
15,943,22,M,431,4,"[559, 42, 31, 202, 193, 796, 721, 541, 399, 228]","[1089, 149, 1478, 471, 728]"
16,943,22,M,27,4,"[186, 470, 475, 391, 69, 943, 127, 485, 1044, 22]","[1286, 607, 964, 1501, 1594]"
17,943,22,M,188,4,"[546, 485, 356, 825, 367, 94, 386, 475, 585, 38]","[748, 1372, 1116, 1365, 538]"
18,943,22,M,816,4,"[468, 31, 2, 139, 100, 816, 281, 840, 12, 625]","[879, 1292, 404, 807, 44]"


In [8]:
# 데이터 뒤섞기
train_dataset = train_dataset.sample(frac=1).reset_index(drop=True)
train_dataset.head()

Unnamed: 0,user_id,age,sex,movie_id,rating,watched_movies,negative_samples
0,323,21,M,1048,3,"[268, 294, 292, 295, 651, 319, 847, 479, 1048,...","[854, 782, 287, 70, 1245]"
1,256,35,F,977,4,"[147, 1033, 100, 151, 583, 123, 591, 405, 1047...","[1452, 1218, 1434, 130, 1549]"
2,124,34,M,1,3,"[117, 174, 144, 98, 1, 168, 28, 474, 7, 172]","[1425, 256, 1280, 1284, 77]"
3,567,24,M,178,4,"[506, 209, 478, 657, 1, 23, 1019, 607, 475, 636]","[1517, 292, 1073, 1370, 1182]"
4,351,61,M,748,4,"[895, 311, 312, 307, 341, 289, 678, 300, 245, ...","[1314, 1029, 1041, 1303, 1296]"


In [9]:
# Dataframe 형태에서 모델 입력에 맞게 array 형태로 변환

temp_1 = train_dataset['movie_id'].to_numpy().reshape(-1,1)
temp_2 = np.array(train_dataset['negative_samples'].tolist())

sample_data = np.append(temp_1, temp_2, axis=-1)
#watched_movies_data = np.array(train_dataset['watched_movies'].tolist())
watched_movies_data = tf.keras.preprocessing.sequence.pad_sequences(train_dataset['watched_movies'])
gender_data = train_dataset['sex'].apply(lambda x: 0 if x == 'M' else 1).to_numpy().reshape(-1,1)
age_data = train_dataset['age'].apply(lambda x: (x - train_dataset['age'].min())/(train_dataset['age'].max() - train_dataset['age'].min())).to_numpy().reshape(-1,1)

print(sample_data.shape, watched_movies_data.shape, gender_data.shape, age_data.shape)

(17601, 6) (17601, 10) (17601, 1) (17601, 1)


### Model 구성
---
#### 전체 모델
![full_model](./full_model.PNG)
   
---
#### 현재 구현한 candidate model
![candidate_model](./candidate_model.PNG)

In [10]:
EMBEDDING_DIMS = 16 # 임베딩 크기
DENSE_UNITS = 64    # 은닉층 크기
MOVIE_NUM = movie_data['movie_id'].max() #1682
LEARNING_RATE = 0.003
BATCH_SIZE = 10

#-------------------------#

class Avg_Embedding(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(Avg_Embedding, self).__init__(**kwargs)

    def call(self, input):
        return tf.reduce_mean(input, axis=1)
    
#-------------------------#
input_watched_movies = tf.keras.Input(shape=(None, ), name='watched_movies')
input_age = tf.keras.layers.Input(shape=(1), name='age')
input_gender = tf.keras.layers.Input(shape=(1), name='gender')
input_samples = tf.keras.Input(shape=(None, ), name='samples')

features_embedding_layer = tf.keras.layers.Embedding(input_dim = MOVIE_NUM+1, output_dim = EMBEDDING_DIMS, mask_zero=True, name='features_embedding')
average_embedding_layer = Avg_Embedding(name='features_embedding_average')
dense_1 = tf.keras.layers.Dense(DENSE_UNITS, activation='relu', name='dense_1')
dense_2 = tf.keras.layers.Dense(EMBEDDING_DIMS, activation='relu', name='dense_2')

#-------------------------#
watched_movies_embedding = features_embedding_layer(input_watched_movies)
sample_movies_embedding = features_embedding_layer(input_samples)

average_embedding = average_embedding_layer(watched_movies_embedding)
concat_features = tf.keras.layers.concatenate([average_embedding, input_age, input_gender], axis=1, name='concatenate_features')
dense_1_out = dense_1(concat_features)
dense_2_out = dense_2(dense_1_out)
dot_product = tf.keras.layers.dot([dense_2_out, sample_movies_embedding], axes=(1,2), name='dot_product')
output = tf.keras.layers.Activation('softmax', name = 'class_probabilities')(dot_product)

model = tf.keras.Model(inputs=[input_watched_movies, input_age, input_gender, input_samples], outputs=[output])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='sparse_categorical_crossentropy', metrics=['acc'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
watched_movies (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
features_embedding (Embedding)  (None, None, 16)     26928       watched_movies[0][0]             
                                                                 samples[0][0]                    
__________________________________________________________________________________________________
features_embedding_average (Avg (None, 16)           0           features_embedding[0][0]         
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                        

### Train !!!

In [11]:
history = model.fit([watched_movies_data, age_data, gender_data, sample_data],
                    np.zeros(len(train_dataset),dtype=int),
                    batch_size=BATCH_SIZE,
                    epochs=50)

Train on 17601 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Test dataset으로 테스트하기

In [12]:
# 유저 id를 입력하면 입력 데이터를 만들어주는 함수
def pick_user_data_input(user_id, watched_sample_num = wathed_data_size):
    if len(rating_data[rating_data.user_id == user_id]) >= watched_sample_num:
        watched_movies_data = rating_data[rating_data.user_id == user_id]['movie_id'].sample(n=watched_sample_num).to_numpy().reshape(-1,watched_sample_num)
    else:
        #watched_movies_data = rating_data[rating_data.user_id == user_id]['movie_id'].sample(n=watched_sample_num, replace=True).to_numpy().reshape(-1,watched_sample_num)
        watched_movies_data = tf.keras.preprocessing.sequence.pad_sequences(rating_data[rating_data.user_id == user_id]['movie_id'].to_numpy().reshape(1,-1), maxlen=watched_sample_num)

    age_data = user_data[user_data.user_id == user_id]['age'].apply(lambda x: (x - user_data['age'].min())/(user_data['age'].max() - user_data['age'].min())).to_numpy().reshape(-1,1)
    gender_data = user_data[user_data.user_id == user_id]['sex'].apply(lambda x : 0 if x=='M' else 1).to_numpy().reshape(-1,1)

    return [watched_movies_data, age_data, gender_data]

# 유저 임베딩 vector
get_output = tf.keras.backend.function([model.get_layer('watched_movies').input,
                                        model.get_layer('age').input,
                                        model.get_layer('gender').input], [model.layers[-3].output])

# 영화 임베딩 vector
embedding_table = model.get_layer('features_embedding').get_weights()[0]
print(embedding_table.shape)

(1683, 16)


In [13]:
pick = 1 # 유저 ID
top_N_size = 50 # 최대 50개 추천

a = pick_user_data_input(pick)
pred = get_output(a)[0]
result = np.dot(pred, embedding_table.transpose())[0]
rank = sorted(range(len(result)), key=lambda k: result[k], reverse=True) # 영화 추천 순위

print(f"영화 추천 순위 (movie_ID): {rank[:10]}")

영화 추천 순위 (movie_ID): [9, 294, 56, 313, 13, 286, 195, 300, 258, 318]


In [14]:
# mean average precision 측정

mAP_N = 0

display_freq = 10
for top_N in range(1, top_N_size+1):
    True_positive = test_rating_data[(test_rating_data.user_id == pick) & (test_rating_data.movie_id.isin(rank[:top_N]))]
    positive = test_rating_data[test_rating_data.user_id == pick]

    random_pick = np.random.choice(len(movie_data), top_N)
    Random_positive = test_rating_data[(test_rating_data.user_id == pick) & (test_rating_data.movie_id.isin(random_pick))]
    
    if top_N % display_freq == 0 or top_N == 1:
        print(f'top N  = {top_N} -------------------\n')

        print(f'model  = Rank {top_N} Recall    : {len(True_positive)/len(positive)} ({len(True_positive)}/{len(positive)})')
        print(f'model  = Rank {top_N} Precision : {len(True_positive)/top_N} ({len(True_positive)}/{top_N})\n')


        print(f'random = Rank {top_N} Recall    : {len(Random_positive)/len(positive)} ({len(Random_positive)}/{len(positive)})')
        print(f'random = Rank {top_N} Precision : {len(Random_positive)/top_N} ({len(Random_positive)}/{top_N})\n')
    
    mAP_N += len(True_positive)/top_N
    
print(f'mAP@{top_N_size} = {mAP_N/top_N_size}')

top N  = 1 -------------------

model  = Rank 1 Recall    : 0.0 (0/137)
model  = Rank 1 Precision : 0.0 (0/1)

random = Rank 1 Recall    : 0.0 (0/137)
random = Rank 1 Precision : 0.0 (0/1)

top N  = 10 -------------------

model  = Rank 10 Recall    : 0.014598540145985401 (2/137)
model  = Rank 10 Precision : 0.2 (2/10)

random = Rank 10 Recall    : 0.0072992700729927005 (1/137)
random = Rank 10 Precision : 0.1 (1/10)

top N  = 20 -------------------

model  = Rank 20 Recall    : 0.043795620437956206 (6/137)
model  = Rank 20 Precision : 0.3 (6/20)

random = Rank 20 Recall    : 0.014598540145985401 (2/137)
random = Rank 20 Precision : 0.1 (2/20)

top N  = 30 -------------------

model  = Rank 30 Recall    : 0.051094890510948905 (7/137)
model  = Rank 30 Precision : 0.23333333333333334 (7/30)

random = Rank 30 Recall    : 0.0072992700729927005 (1/137)
random = Rank 30 Precision : 0.03333333333333333 (1/30)

top N  = 40 -------------------

model  = Rank 40 Recall    : 0.06569343065693431 (

### test_dataset 안에 있는 모든 user에 대해 MAP(mean average precision)의 평균 측정

In [15]:
top_N_size = 50

display_freq = 50

AVG_mAP = 0

for pick in range(1, test_rating_data.user_id.max()+1):
    mAP_N = 0
    pred = get_output(pick_user_data_input(pick))[0]
    result = np.dot(pred, embedding_table.transpose())[0]
    rank = sorted(range(len(result)), key=lambda k: result[k], reverse=True)
    
    for top_N in range(1, top_N_size+1):
        True_positive = test_rating_data[(test_rating_data.user_id == pick) & (test_rating_data.movie_id.isin(rank[:top_N]))]
        positive = test_rating_data[test_rating_data.user_id == pick]

        mAP_N += len(True_positive)/top_N
        
    AVG_mAP += mAP_N/top_N_size
    if pick % display_freq == 0 : print(f'AVG_mAP@{top_N_size} = {AVG_mAP/pick} ({pick}/{test_rating_data.user_id.max()})')
        
print(f'최종 : AVG_mAP@{top_N_size} = {AVG_mAP/test_rating_data.user_id.max()}')

AVG_mAP@50 = 0.16509437998054832 (50/462)
AVG_mAP@50 = 0.1629235672837778 (100/462)
AVG_mAP@50 = 0.16163059269753716 (150/462)
AVG_mAP@50 = 0.1561595192096597 (200/462)
AVG_mAP@50 = 0.14928770418441442 (250/462)
AVG_mAP@50 = 0.15340997635780057 (300/462)
AVG_mAP@50 = 0.15375456983592564 (350/462)
AVG_mAP@50 = 0.14594485262490445 (400/462)
AVG_mAP@50 = 0.1354843246928966 (450/462)
최종 : AVG_mAP@50 = 0.13219091011714082
