In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import cornac
from cornac.eval_methods import RatioSplit
from cornac.hyperopt import Continuous, Discrete, RandomSearch
from tqdm import tqdm

import warnings
warnings.filterwarnings(action='ignore')
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
# 추천할 top k개의 장소
TOP_K = 10

In [3]:
data_df = pd.read_csv('../data/score_board.csv')

In [4]:
data_df

Unnamed: 0,u_id,p_id,rating
0,*2,121,4.5
1,-채여니영화평-,287,4.0
2,-채여니영화평-,414,3.5
3,-채여니영화평-,271,3.5
4,-채여니영화평-,264,5.0
5,-채여니영화평-,606,3.5
6,-채여니영화평-,262,3.5
7,-채여니영화평-,526,5.0
8,-채여니영화평-,45,4.0
9,-채여니영화평-,255,5.0


In [5]:
data_df['rating'].value_counts()

4.0      6287
3.5      5430
3.0      4748
5.0      3055
4.5      2846
2.5      2555
2.0      2150
1.5       896
1.0       855
0.5       738
보고싶어요     121
보는중         2
Name: rating, dtype: int64

In [6]:
data_df = data_df[data_df['rating']!='보고싶어요']
data_df = data_df[data_df['rating']!='보는중']
data_df['rating'] = data_df['rating'].astype('float')
data_df = data_df.rename({'u_id':'userID','p_id':'itemID'},axis=1)
data_df = data_df.groupby('userID').filter(lambda x : len(x)>= 10).copy()
data_df = data_df.fillna(5)

In [7]:
data = [tuple(x) for x in data_df.values]
data

[('-채여니영화평-', 287, 4.0),
 ('-채여니영화평-', 414, 3.5),
 ('-채여니영화평-', 271, 3.5),
 ('-채여니영화평-', 264, 5.0),
 ('-채여니영화평-', 606, 3.5),
 ('-채여니영화평-', 262, 3.5),
 ('-채여니영화평-', 526, 5.0),
 ('-채여니영화평-', 45, 4.0),
 ('-채여니영화평-', 255, 5.0),
 ('-채여니영화평-', 250, 4.5),
 ('-채여니영화평-', 37, 4.5),
 ('-채여니영화평-', 645, 3.0),
 ('-채여니영화평-', 248, 4.5),
 ('-채여니영화평-', 239, 3.5),
 ('-채여니영화평-', 49, 3.5),
 ('-채여니영화평-', 236, 3.5),
 ('-채여니영화평-', 643, 5.0),
 ('-채여니영화평-', 234, 3.5),
 ('-채여니영화평-', 540, 2.0),
 ('-채여니영화평-', 543, 5.0),
 ('-채여니영화평-', 227, 3.0),
 ('-채여니영화평-', 53, 3.5),
 ('-채여니영화평-', 82, 3.5),
 ('-채여니영화평-', 223, 4.0),
 ('-채여니영화평-', 295, 3.5),
 ('-채여니영화평-', 35, 3.5),
 ('-채여니영화평-', 405, 3.5),
 ('-채여니영화평-', 404, 4.5),
 ('-채여니영화평-', 619, 4.0),
 ('-채여니영화평-', 455, 4.0),
 ('-채여니영화평-', 456, 3.5),
 ('-채여니영화평-', 98, 4.5),
 ('-채여니영화평-', 462, 2.5),
 ('-채여니영화평-', 616, 3.0),
 ('-채여니영화평-', 615, 5.0),
 ('-채여니영화평-', 477, 3.0),
 ('-채여니영화평-', 651, 2.0),
 ('-채여니영화평-', 22, 3.5),
 ('-채여니영화평-', 614, 3.5),
 ('-채여니영화평-', 329, 4.0),
 ('-채여니영

In [8]:
# Instantiate evaluation metrics
ndcg = cornac.metrics.NDCG(k=TOP_K)
pre = cornac.metrics.Precision(k=TOP_K)
rec = cornac.metrics.Recall(k=TOP_K)
fm = cornac.metrics.FMeasure(k=TOP_K)

#Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=data,
    test_size=0.1,
    val_size=0.1,
    rating_threshold=1.0,
    seed=123,
    verbose=True,
)

neumf = cornac.models.NeuMF(
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    lr=0.01,
    num_neg=50,
    seed=123,
    early_stopping = {'min_delta':0.0, 'patience':5}
)

# RandomSearch
rs_neumf = RandomSearch(
    model = neumf,
    space=[
        Discrete("num_epochs", [50, 100, 150, 200]),
        Discrete("num_factors", [4, 8]),
        Discrete("batch_size", [128, 256, 512])
    ],
    metric = fm,
    eval_method = ratio_split
)

# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split,
    models=[rs_neumf],
    metrics=[ndcg, pre, rec, fm],
).run()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 553
Number of items = 670
Number of ratings = 15429
Max rating = 5.0
Min rating = 0.5
Global mean = 3.4
---
Test data:
Number of users = 480
Number of items = 629
Number of ratings = 1932
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 472
Number of items = 624
Number of ratings = 1932
---
Total users = 553
Total items = 670

[RandomSearch_NeuMF] Training started!
Evaluating: {'batch_size': 512, 'num_epochs': 100, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Early stopping:
- best epoch = 22, stopped epoch = 27
- best monitored value = 0.111646 (delta = -0.002216)

Evaluating: {'batch_size': 512, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 22, stopped epoch = 27
- best monitored value = 0.111646 (delta = -0.002216)

Evaluating: {'batch_size': 512, 'num_epochs': 100, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Early stopping:
- best epoch = 8, stopped epoch = 13
- best monitored value = 0.112632 (delta = -0.007952)

Evaluating: {'batch_size': 512, 'num_epochs': 200, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 8, stopped epoch = 13
- best monitored value = 0.112632 (delta = -0.007952)

Evaluating: {'batch_size': 512, 'num_epochs': 100, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Early stopping:
- best epoch = 22, stopped epoch = 27
- best monitored value = 0.111646 (delta = -0.002216)

Evaluating: {'batch_size': 256, 'num_epochs': 150, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Early stopping:
- best epoch = 8, stopped epoch = 13
- best monitored value = 0.110611 (delta = -0.003041)

Evaluating: {'batch_size': 256, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 23, stopped epoch = 28
- best monitored value = 0.110135 (delta = -0.002242)

Evaluating: {'batch_size': 128, 'num_epochs': 200, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 4, stopped epoch = 9
- best monitored value = 0.118926 (delta = -0.007085)

Evaluating: {'batch_size': 512, 'num_epochs': 100, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Early stopping:
- best epoch = 22, stopped epoch = 27
- best monitored value = 0.111646 (delta = -0.002216)

Evaluating: {'batch_size': 128, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 16, stopped epoch = 21
- best monitored value = 0.118016 (delta = -0.005796)

Best parameter settings: {'batch_size': 512, 'num_epochs': 100, 'num_factors': 8}
F1@10 = 0.0216

[RandomSearch_NeuMF] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Ranking', max=480.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Ranking', max=472.0, style=ProgressStyle(description_widt…



VALIDATION:
...
                   |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Time (s)
------------------ + ------ + ------- + ------------ + --------- + --------
RandomSearch_NeuMF | 0.0216 |  0.0294 |       0.0153 |    0.0485 |   0.4348

TEST:
...
                   |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
------------------ + ------ + ------- + ------------ + --------- + --------- + --------
RandomSearch_NeuMF | 0.0166 |  0.0225 |       0.0117 |    0.0422 | 2807.3405 |   0.4269



In [9]:
print('Random search: ', rs_neumf.best_params)

Random search:  {'batch_size': 512, 'num_epochs': 100, 'num_factors': 8}


In [10]:
ratio_split = RatioSplit(
    data=data,
    test_size=0.2,
    rating_threshold=1.0,
    seed=123,
    verbose=True,
)

neumf = cornac.models.NeuMF(
    num_factors = rs_neumf.best_params['num_factors'],
    layers=[32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_epochs = rs_neumf.best_params['num_epochs'],
    lr=0.01,
    num_neg=50,
    batch_size = rs_neumf.best_params['batch_size'],
    seed=123,
    early_stopping = {'min_delta':0.0, 'patience':5}
)

cornac.Experiment(
    eval_method=ratio_split,
    models=[neumf],
    metrics=[ndcg, pre, rec, fm],
).run()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 553
Number of items = 670
Number of ratings = 15429
Max rating = 5.0
Min rating = 0.5
Global mean = 3.4
---
Test data:
Number of users = 533
Number of items = 660
Number of ratings = 3864
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 553
Total items = 670

[NeuMF] Training started!


HBox(children=(FloatProgress(value=0.0), HTML(value='')))



[NeuMF] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Ranking', max=533.0, style=ProgressStyle(description_widt…



TEST:
...
      |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------- + ------------ + --------- + --------- + --------
NeuMF | 0.0260 |  0.0286 |       0.0224 |    0.0411 | 1296.6767 |   0.4338



In [34]:
all_prediction = pd.DataFrame(columns=['u_id','p_id','ncf_score'])

for u_idx,u_id in tqdm(enumerate(data_df['userID'].unique())):
    tmp = pd.DataFrame(neumf.rank(u_idx)).T
    tmp['u_id'] = u_id
    tmp = pd.DataFrame(tmp,columns=['u_id',0,1])
    tmp.rename(columns={0:'p_id',1:'ncf_score'}, inplace=True)
    all_prediction = all_prediction.append(tmp)

553it [00:15, 35.38it/s]


In [35]:
all_prediction

Unnamed: 0,u_id,p_id,ncf_score
0,-채여니영화평-,356.0,0.056195
1,-채여니영화평-,611.0,0.009988
2,-채여니영화평-,627.0,0.000031
3,-채여니영화평-,174.0,0.001474
4,-채여니영화평-,363.0,0.000002
5,-채여니영화평-,106.0,0.003653
6,-채여니영화평-,620.0,0.016419
7,-채여니영화평-,593.0,0.010590
8,-채여니영화평-,42.0,0.000460
9,-채여니영화평-,653.0,0.000121


In [39]:
all_prediction.to_csv('../data/predictions/ncf_scores.scv',index=False)