In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import cornac
from cornac.eval_methods import RatioSplit
from cornac.hyperopt import Continuous, Discrete, RandomSearch
from tqdm import tqdm

import warnings
warnings.filterwarnings(action='ignore')
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
# 추천할 top k개의 장소
TOP_K = 10

In [3]:
data_df = pd.read_csv('../data/score_board.csv')

In [4]:
data_df

Unnamed: 0,u_id,p_id,rating
0,*2,121,4.5
1,-채여니영화평-,287,4.0
2,-채여니영화평-,414,3.5
3,-채여니영화평-,271,3.5
4,-채여니영화평-,264,5.0
5,-채여니영화평-,606,3.5
6,-채여니영화평-,262,3.5
7,-채여니영화평-,526,5.0
8,-채여니영화평-,45,4.0
9,-채여니영화평-,255,5.0


In [5]:
data_df['rating'].value_counts()

5.0      8945
4.0      7493
3.5      5976
3.0      5160
4.5      4153
2.5      2712
2.0      2330
0.5      1425
1.0      1049
1.5       953
보고싶어요     121
보는중         2
Name: rating, dtype: int64

In [6]:
data_df = data_df[data_df['rating']!='보고싶어요']
data_df = data_df[data_df['rating']!='보는중']
data_df['rating'] = data_df['rating'].astype('float')
data_df = data_df.rename({'u_id':'userID','p_id':'itemID'},axis=1)
# data_df = data_df.groupby('userID').filter(lambda x : len(x)>= 10).copy()
data_df = data_df.fillna(5)

In [7]:
data = [tuple(x) for x in data_df.values]
data

[('*2', 121, 4.5),
 ('-채여니영화평-', 287, 4.0),
 ('-채여니영화평-', 414, 3.5),
 ('-채여니영화평-', 271, 3.5),
 ('-채여니영화평-', 264, 5.0),
 ('-채여니영화평-', 606, 3.5),
 ('-채여니영화평-', 262, 3.5),
 ('-채여니영화평-', 526, 5.0),
 ('-채여니영화평-', 45, 4.0),
 ('-채여니영화평-', 255, 5.0),
 ('-채여니영화평-', 250, 4.5),
 ('-채여니영화평-', 37, 4.5),
 ('-채여니영화평-', 645, 3.0),
 ('-채여니영화평-', 248, 4.5),
 ('-채여니영화평-', 239, 3.5),
 ('-채여니영화평-', 49, 3.5),
 ('-채여니영화평-', 236, 3.5),
 ('-채여니영화평-', 643, 5.0),
 ('-채여니영화평-', 234, 3.5),
 ('-채여니영화평-', 540, 2.0),
 ('-채여니영화평-', 543, 5.0),
 ('-채여니영화평-', 227, 3.0),
 ('-채여니영화평-', 53, 3.5),
 ('-채여니영화평-', 82, 3.5),
 ('-채여니영화평-', 223, 4.0),
 ('-채여니영화평-', 295, 3.5),
 ('-채여니영화평-', 35, 3.5),
 ('-채여니영화평-', 405, 3.5),
 ('-채여니영화평-', 404, 4.5),
 ('-채여니영화평-', 619, 4.0),
 ('-채여니영화평-', 455, 4.0),
 ('-채여니영화평-', 456, 3.5),
 ('-채여니영화평-', 98, 4.5),
 ('-채여니영화평-', 462, 2.5),
 ('-채여니영화평-', 616, 3.0),
 ('-채여니영화평-', 615, 5.0),
 ('-채여니영화평-', 477, 3.0),
 ('-채여니영화평-', 651, 2.0),
 ('-채여니영화평-', 22, 3.5),
 ('-채여니영화평-', 614, 3.5),
 ('-채여니영화평-', 

In [8]:
# Instantiate evaluation metrics
ndcg = cornac.metrics.NDCG(k=TOP_K)
pre = cornac.metrics.Precision(k=TOP_K)
rec = cornac.metrics.Recall(k=TOP_K)
fm = cornac.metrics.FMeasure(k=TOP_K)

#Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=data,
    test_size=0.1,
    val_size=0.1,
    rating_threshold=1.0,
    seed=123,
    verbose=True,
)

neumf = cornac.models.NeuMF(
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_neg=50,
    seed=123,
    early_stopping = {'min_delta':0.0, 'patience':5}
)

# RandomSearch
rs_neumf = RandomSearch(
    model = neumf,
    space=[
        Discrete("num_epochs", [50, 100, 150, 200]),
        Discrete("num_factors", [4, 8]),
        Discrete("batch_size", [128, 256, 512]),
        Continuous("lr", 0.001, 0.01)
    ],
    metric = fm,
    eval_method = ratio_split
)

# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split,
    models=[rs_neumf],
    metrics=[ndcg, pre, rec, fm],
).run()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 12469
Number of items = 800
Number of ratings = 32196
Max rating = 5.0
Min rating = 0.5
Global mean = 3.6
---
Test data:
Number of users = 1264
Number of items = 760
Number of ratings = 2847
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 1240
Number of items = 759
Number of ratings = 2818
---
Total users = 12469
Total items = 800

[RandomSearch_NeuMF] Training started!
Evaluating: {'batch_size': 512, 'lr': 0.007416597884709045, 'num_epochs': 150, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Early stopping:
- best epoch = 12, stopped epoch = 17
- best monitored value = 0.098750 (delta = -0.003996)

Evaluating: {'batch_size': 512, 'lr': 0.005961832921746022, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 12, stopped epoch = 17
- best monitored value = 0.100038 (delta = -0.004424)

Evaluating: {'batch_size': 256, 'lr': 0.00982687778546154, 'num_epochs': 50, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 13, stopped epoch = 18
- best monitored value = 0.097793 (delta = -0.001206)

Evaluating: {'batch_size': 512, 'lr': 0.002259556863673461, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 31, stopped epoch = 36
- best monitored value = 0.099552 (delta = -0.003360)

Evaluating: {'batch_size': 128, 'lr': 0.0075614473664563754, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 5, stopped epoch = 10
- best monitored value = 0.098331 (delta = -0.003969)

Evaluating: {'batch_size': 256, 'lr': 0.007252796594667422, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 8, stopped epoch = 13
- best monitored value = 0.094604 (delta = -0.001961)

Evaluating: {'batch_size': 256, 'lr': 0.0026424255740815, 'num_epochs': 150, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Early stopping:
- best epoch = 15, stopped epoch = 20
- best monitored value = 0.096609 (delta = -0.000966)

Evaluating: {'batch_size': 512, 'lr': 0.0057864482838717955, 'num_epochs': 50, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 7, stopped epoch = 12
- best monitored value = 0.098679 (delta = -0.003821)

Evaluating: {'batch_size': 128, 'lr': 0.005428762991131081, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 7, stopped epoch = 12
- best monitored value = 0.101144 (delta = -0.001528)

Evaluating: {'batch_size': 128, 'lr': 0.007501990443131995, 'num_epochs': 100, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Early stopping:
- best epoch = 4, stopped epoch = 9
- best monitored value = 0.099420 (delta = -0.006338)

Best parameter settings: {'batch_size': 128, 'lr': 0.007501990443131995, 'num_epochs': 100, 'num_factors': 8}
F1@10 = 0.0163

[RandomSearch_NeuMF] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Ranking', max=1264.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Ranking', max=1240.0, style=ProgressStyle(description_wid…



VALIDATION:
...
                   |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Time (s)
------------------ + ------ + ------- + ------------ + --------- + --------
RandomSearch_NeuMF | 0.0163 |  0.0283 |       0.0105 |    0.0598 |   1.0382

TEST:
...
                   |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
------------------ + ------ + ------- + ------------ + --------- + --------- + --------
RandomSearch_NeuMF | 0.0150 |  0.0283 |       0.0098 |    0.0535 | 4831.9023 |   1.0951



In [9]:
print('Random search: ', rs_neumf.best_params)

Random search:  {'batch_size': 128, 'lr': 0.007501990443131995, 'num_epochs': 100, 'num_factors': 8}


In [10]:
ratio_split = RatioSplit(
    data=data,
    test_size=0,
    rating_threshold=1.0,
    verbose=True,
)

neumf = cornac.models.NeuMF(
    num_factors = rs_neumf.best_params['num_factors'],
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_epochs = rs_neumf.best_params['num_epochs'],
    lr=rs_neumf.best_params['lr'],
    num_neg=50,
    batch_size = rs_neumf.best_params['batch_size'],
    seed=123,
    early_stopping = {'min_delta':0.0, 'patience':5}
)

cornac.Experiment(
    eval_method=ratio_split,
    models=[neumf],
    metrics=[ndcg, pre, rec, fm],
).run()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 14795
Number of items = 800
Number of ratings = 40232
Max rating = 5.0
Min rating = 0.5
Global mean = 3.6
---
Test data:
Number of users = 14795
Number of items = 800
Number of ratings = 40232
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 14795
Total items = 800

[NeuMF] Training started!


HBox(children=(FloatProgress(value=0.0), HTML(value='')))



[NeuMF] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Ranking', max=14795.0, style=ProgressStyle(description_wi…



TEST:
...
      |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------- + ------------ + --------- + --------- + --------
NeuMF | 0.2460 |  0.9805 |       0.1698 |    0.9687 | 3663.9332 |  11.5212



In [12]:
pred_user = pd.read_csv('../../common/data/survey_result.csv')
pred_user = set(pred_user['u_id'])

In [13]:
all_prediction = pd.DataFrame(columns=['u_id','p_id','ncf_score'])

for u_idx, u_id in tqdm(enumerate(data_df['userID'].unique())):
    if u_id in pred_user:
        tmp = pd.DataFrame(neumf.rank(u_idx)).T
        tmp['u_id'] = u_id
        tmp = pd.DataFrame(tmp,columns=['u_id',0,1])
        tmp.rename(columns={0:'p_id',1:'ncf_score'}, inplace=True)
        all_prediction = all_prediction.append(tmp)

14795it [00:00, 17597.39it/s]


In [14]:
all_prediction

Unnamed: 0,u_id,p_id,ncf_score
0,lily,529.0,5.586952e-03
1,lily,522.0,6.249547e-05
2,lily,758.0,0.000000e+00
3,lily,438.0,2.086163e-07
4,lily,279.0,1.043427e-02
5,lily,788.0,0.000000e+00
6,lily,266.0,5.364418e-07
7,lily,593.0,0.000000e+00
8,lily,578.0,6.079674e-06
9,lily,735.0,4.678965e-06


In [15]:
all_prediction.to_csv('../data/predictions/ncf_scores.csv',index=False)