In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import cornac
import cornac.eval_methods
from cornac.eval_methods import RatioSplit, base_method
from cornac.hyperopt import Continuous, Discrete, RandomSearch
from tqdm import tqdm

import warnings
warnings.filterwarnings(action='ignore')
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
# 추천할 top k개의 장소
TOP_K = 10

In [3]:
data_df = pd.read_csv('../data/score_board.csv', index_col =0)

In [4]:
data_df['rating'] =data_df['rating'].astype('float')
data_df = data_df.rename({'u_id':'userID','p_id':'itemID'},axis=1)
#data_df = data_df.groupby('userID').filter(lambda x : len(x)>= 10).copy()
data_df = data_df.fillna(5)

In [5]:
data_df

Unnamed: 0,userID,itemID,rating
0,0,3375,3.0
1,0,2778,5.0
2,0,612,5.0
3,0,3949,5.0
4,0,1679,1.0
5,0,4805,5.0
6,0,287,3.0
7,0,3601,5.0
8,0,3526,5.0
9,0,3601,5.0


In [6]:
data = [tuple(x) for x in data_df.values]
data

[('0', 3375, 3.0),
 ('0', 2778, 5.0),
 ('0', 612, 5.0),
 ('0', 3949, 5.0),
 ('0', 1679, 1.0),
 ('0', 4805, 5.0),
 ('0', 287, 3.0),
 ('0', 3601, 5.0),
 ('0', 3526, 5.0),
 ('0', 3601, 5.0),
 ('0', 3357, 5.0),
 ('0', 104, 5.0),
 ('0', 2307, 5.0),
 ('0', 4345, 5.0),
 ('0', 3075, 5.0),
 ('0', 1269, 5.0),
 ('0', 1714, 5.0),
 ('0', 479, 5.0),
 ('0', 1186, 5.0),
 ('0', 3388, 5.0),
 ('0', 5068, 5.0),
 ('0', 5264, 3.0),
 ('0', 1123, 5.0),
 ('0', 3734, 5.0),
 ('0', 1904, 3.0),
 ('0', 4843, 5.0),
 ('0', 616, 5.0),
 ('0', 1092, 5.0),
 ('0', 3529, 5.0),
 ('0', 1254, 5.0),
 ('0', 3118, 5.0),
 ('1', 3612, 5.0),
 ('2', 3441, 5.0),
 ('2', 1737, 5.0),
 ('3', 5334, 5.0),
 ('4', 1127, 5.0),
 ('4', 1846, 5.0),
 ('4', 3415, 5.0),
 ('4', 1666, 5.0),
 ('4', 208, 5.0),
 ('4', 1846, 5.0),
 ('4', 709, 3.0),
 ('4', 177, 5.0),
 ('4', 3415, 5.0),
 ('4', 1846, 5.0),
 ('4', 1368, 5.0),
 ('4', 2382, 5.0),
 ('4', 5153, 5.0),
 ('4', 3761, 5.0),
 ('4', 3761, 5.0),
 ('4', 2910, 5.0),
 ('4', 2091, 5.0),
 ('4', 177, 5.0),
 (

In [7]:
# Instantiate evaluation metrics
ndcg = cornac.metrics.NDCG(k=TOP_K)
pre = cornac.metrics.Precision(k=TOP_K)
rec = cornac.metrics.Recall(k=TOP_K)
fm = cornac.metrics.FMeasure(k=TOP_K)

In [8]:

#Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=data,
    test_size=0.1,
    val_size=0.1,
    rating_threshold=1.0,
    seed=123,
    verbose=True,
)

neumf = cornac.models.NeuMF(
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_neg=50,
    seed=123,
    early_stopping = {'min_delta':0.0, 'patience':5}
)

# RandomSearch
rs_neumf = RandomSearch(
    model = neumf,
    space=[
        Discrete("num_epochs", [50, 100, 150, 200]),
        Discrete("num_factors", [4, 8]),
        Discrete("batch_size", [128, 256, 512]),
        Continuous("lr", 0.001, 0.01)
    ],
    metric = fm,
    eval_method = ratio_split
)

# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split,
    models=[rs_neumf],
    metrics=[ndcg, pre, rec, fm],
).run()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 19705
Number of items = 5318
Number of ratings = 81784
Max rating = 5.0
Min rating = 1.0
Global mean = 4.2
---
Test data:
Number of users = 4275
Number of items = 2920
Number of ratings = 9116
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 4285
Number of items = 2990
Number of ratings = 9114
---
Total users = 19705
Total items = 5318

[RandomSearch_NeuMF] Training started!
Evaluating: {'batch_size': 512, 'lr': 0.007416597884709045, 'num_epochs': 150, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Early stopping:
- best epoch = 16, stopped epoch = 21
- best monitored value = 0.070255 (delta = -0.003408)

Evaluating: {'batch_size': 512, 'lr': 0.005961832921746022, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 18, stopped epoch = 23
- best monitored value = 0.067369 (delta = -0.003243)

Evaluating: {'batch_size': 256, 'lr': 0.00982687778546154, 'num_epochs': 50, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 11, stopped epoch = 16
- best monitored value = 0.077132 (delta = -0.006213)

Evaluating: {'batch_size': 512, 'lr': 0.002259556863673461, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 27, stopped epoch = 32
- best monitored value = 0.067367 (delta = -0.001496)

Evaluating: {'batch_size': 128, 'lr': 0.0075614473664563754, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 7, stopped epoch = 12
- best monitored value = 0.063121 (delta = -0.002053)

Evaluating: {'batch_size': 256, 'lr': 0.007252796594667422, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 9, stopped epoch = 14
- best monitored value = 0.066140 (delta = -0.004099)

Evaluating: {'batch_size': 256, 'lr': 0.0026424255740815, 'num_epochs': 150, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Early stopping:
- best epoch = 17, stopped epoch = 22
- best monitored value = 0.068132 (delta = -0.002724)

Evaluating: {'batch_size': 512, 'lr': 0.0057864482838717955, 'num_epochs': 50, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 11, stopped epoch = 16
- best monitored value = 0.077841 (delta = -0.002226)

Evaluating: {'batch_size': 128, 'lr': 0.005428762991131081, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 9, stopped epoch = 14
- best monitored value = 0.065643 (delta = -0.003613)

Evaluating: {'batch_size': 128, 'lr': 0.007501990443131995, 'num_epochs': 100, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Early stopping:
- best epoch = 7, stopped epoch = 12
- best monitored value = 0.074966 (delta = -0.006543)

Best parameter settings: {'batch_size': 512, 'lr': 0.0057864482838717955, 'num_epochs': 50, 'num_factors': 8}
F1@10 = 0.0150

[RandomSearch_NeuMF] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Ranking', max=4275.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Ranking', max=4285.0, style=ProgressStyle(description_wid…



VALIDATION:
...
                   |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Time (s)
------------------ + ------ + ------- + ------------ + --------- + --------
RandomSearch_NeuMF | 0.0150 |  0.0314 |       0.0098 |    0.0535 |  11.1492

TEST:
...
                   |  F1@10 | NDCG@10 | Precision@10 | Recall@10 |  Train (s) | Test (s)
------------------ + ------ + ------- + ------------ + --------- + ---------- + --------
RandomSearch_NeuMF | 0.0149 |  0.0313 |       0.0097 |    0.0540 | 14024.7585 |  11.2788



In [9]:
print('Random search: ', rs_neumf.best_params)

Random search:  {'batch_size': 512, 'lr': 0.0057864482838717955, 'num_epochs': 50, 'num_factors': 8}


In [10]:
ratio_split = RatioSplit(
    data=data,
    test_size=0,
    rating_threshold=1.0,
    verbose=True,
)

neumf = cornac.models.NeuMF(
    num_factors = rs_neumf.best_params['num_factors'],
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_epochs = rs_neumf.best_params['num_epochs'],
    lr=rs_neumf.best_params['lr'],
    num_neg=50,
    batch_size = rs_neumf.best_params['batch_size'],
    seed=123,
    early_stopping = {'min_delta':0.0, 'patience':5}
)

cornac.Experiment(
    eval_method=ratio_split,
    models=[neumf],
    metrics=[ndcg, pre, rec, fm],
).run()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 22173
Number of items = 5397
Number of ratings = 101595
Max rating = 5.0
Min rating = 0.0
Global mean = 4.2
---
Test data:
Number of users = 22173
Number of items = 5397
Number of ratings = 101595
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 22173
Total items = 5397

[NeuMF] Training started!


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



[NeuMF] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Ranking', max=22173.0, style=ProgressStyle(description_wi…



TEST:
...
      |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------- + ------------ + --------- + --------- + --------
NeuMF | 0.3108 |  0.9176 |       0.2429 |    0.9110 | 3908.2797 |  51.3327



In [11]:
pred_user = pd.read_csv('../../common/data/survey_result.csv')
pred_user = set(pred_user['u_id'])

In [12]:
all_prediction = pd.DataFrame(columns=['u_id','p_id','ncf_score'])

for u_idx, u_id in tqdm(enumerate(data_df['userID'].unique())):
    if u_id in pred_user:
        tmp = pd.DataFrame(neumf.rank(u_idx)).T
        tmp['u_id'] = u_id
        tmp = pd.DataFrame(tmp,columns=['u_id',0,1])
        tmp.rename(columns={0:'p_id',1:'ncf_score'}, inplace=True)
        all_prediction = all_prediction.append(tmp)

22173it [00:04, 4898.06it/s] 


In [13]:
all_prediction

Unnamed: 0,u_id,p_id,ncf_score
0,dhyeok1996,53.0,2.205372e-06
1,dhyeok1996,1200.0,1.928508e-04
2,dhyeok1996,130.0,5.960464e-08
3,dhyeok1996,22.0,1.336336e-04
4,dhyeok1996,1341.0,5.662441e-07
5,dhyeok1996,906.0,2.782643e-04
6,dhyeok1996,2519.0,1.353025e-05
7,dhyeok1996,1105.0,4.139245e-04
8,dhyeok1996,485.0,0.000000e+00
9,dhyeok1996,525.0,3.571719e-03


In [14]:
all_prediction.to_csv('../data/predictions/ncf_scores.scv',index=False)