# NCF Cornac

In [2]:
import pandas as pd
import cornac
from cornac.eval_methods import RatioSplit
from cornac.hyperopt import Continuous, Discrete, RandomSearch
from tqdm import tqdm
import numpy as np
import tensorflow as tf

In [3]:
df = pd.read_csv('../data/ncf_data.csv')
df

Unnamed: 0,p_id,u_id,u_rate
0,f239,batoo2000,5.0
1,f239,Woongs Lee,5.0
2,f239,박우석,5.0
3,f239,EOS,5.0
4,f239,ㅎㅈㅊ,4.0
...,...,...,...
43014,572,쥬,
43015,595,쥬,
43016,213,쥬,
43017,685,쥬,


In [4]:
df.rename(columns = {'p_id' : 'itemID', 'u_id' : 'userID', 'u_rate' : 'rating'}, inplace = True)

In [5]:
ls=[]
for i in range (0,len(df)):
    new_tuple = (df['userID'][i], df['itemID'][i], df['rating'][i])
    ls.append(new_tuple)

In [6]:
ls

[('batoo2000', 'f239', 5.0),
 ('Woongs Lee', 'f239', 5.0),
 ('박우석', 'f239', 5.0),
 ('EOS', 'f239', 5.0),
 ('ㅎㅈㅊ', 'f239', 4.0),
 ('한효선', 'b209', 4.0),
 ('백지현', 'e810', 3.0),
 ('uni', 'e810', 1.0),
 ('Hannah-Gahee U♥', 'a211', 5.0),
 ('김세미', 'a211', 5.0),
 ('쑨꿍', 'a211', 5.0),
 ('제주벼', 'a211', 2.0),
 ('혜진', 'a211', 5.0),
 ('L', 'a211', 2.0),
 ('작은배', 'a211', 1.0),
 ('jw', 'a211', 1.0),
 ('정원', 'a211', 5.0),
 ('꿀벌', 'a211', 1.0),
 ('바지우', 'a211', 1.0),
 ('fjhndklhvnkl', 'a211', 5.0),
 ('eodeoddl', 'a211', 5.0),
 ('나상준', 'a211', 5.0),
 ('문수', 'a211', 5.0),
 ('ㅎㅎ', 'a211', 5.0),
 ('김진영', 'a211', 5.0),
 ('이동훈', 'a211', 5.0),
 ('서수진', 'a211', 1.0),
 ('안영준', 'a211', 1.0),
 ('ryu', 'a211', 1.0),
 ('볼링마니아', 'a211', 1.0),
 ('시닝', 'a211', 1.0),
 ('zjffltmxj', 'a211', 1.0),
 ('영원관세법인', 'a211', 1.0),
 ('이정근', 'a211', 5.0),
 ('카카카', 'a211', 5.0),
 ('비호', 'a211', 1.0),
 ('자의식', 'a211', 3.0),
 ('쥴리', 'b44', 3.0),
 ('2020년 화이팅!', 'b44', 5.0),
 ('DAAS', 'b44', 5.0),
 ('김가현', 'f20', 5.0),
 ('태찌', 'f20', 

In [7]:
ncf = cornac.models.recommender.Recommender('NeuMF')
ncf.early_stop

<bound method Recommender.early_stop of <cornac.models.recommender.Recommender object at 0x63cd894e0>>

In [8]:
TOP_K=10
# Instantiate evaluation metrics
ndcg = cornac.metrics.NDCG(k=TOP_K)
pre = cornac.metrics.Precision(k=TOP_K)
rec = cornac.metrics.Recall(k=TOP_K)
fm = cornac.metrics.FMeasure(k=TOP_K)

In [41]:

#Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=ls,
    test_size=0.1,
    val_size=0.1,
    rating_threshold=1.0,
    seed=123,
    verbose=True,
)

neumf = cornac.models.NeuMF(
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_neg=50,
    seed=123,
    early_stopping = {'min_delta':0.0, 'patience':5}
)

# RandomSearch
rs_neumf = RandomSearch(
    model = neumf,
    space=[
        Discrete("num_epochs", [50, 100, 150, 200]),
        Discrete("num_factors", [4, 8]),
        Discrete("batch_size", [128, 256, 512]),
        Continuous("lr", 0.001, 0.01)
    ],
    metric = fm,
    eval_method = ratio_split
)

# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split,
    models=[rs_neumf],
    metrics=[ndcg, pre, rec, fm],
).run()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 17464
Number of items = 1181
Number of ratings = 33140
Max rating = nan
Min rating = nan
Global mean = nan
---
Test data:
Number of users = 2024
Number of items = 287
Number of ratings = 2699
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 2045
Number of items = 297
Number of ratings = 2750
---
Total users = 17464
Total items = 1181

[RandomSearch_NeuMF] Training started!
Evaluating: {'batch_size': 512, 'lr': 0.007416597884709045, 'num_epochs': 150, 'num_factors': 4}




HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Early stopping:
- best epoch = 3, stopped epoch = 8
- best monitored value = 0.346823 (delta = -0.034753)

Evaluating: {'batch_size': 512, 'lr': 0.005961832921746022, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 3, stopped epoch = 8
- best monitored value = 0.349044 (delta = -0.027440)

Evaluating: {'batch_size': 256, 'lr': 0.00982687778546154, 'num_epochs': 50, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 1, stopped epoch = 6
- best monitored value = 0.343887 (delta = -0.071136)

Evaluating: {'batch_size': 512, 'lr': 0.002259556863673461, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 9, stopped epoch = 14
- best monitored value = 0.360591 (delta = -0.013357)

Evaluating: {'batch_size': 128, 'lr': 0.0075614473664563754, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 2, stopped epoch = 7
- best monitored value = 0.353092 (delta = -0.057804)

Evaluating: {'batch_size': 256, 'lr': 0.007252796594667422, 'num_epochs': 50, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 2, stopped epoch = 7
- best monitored value = 0.349636 (delta = -0.055599)

Evaluating: {'batch_size': 256, 'lr': 0.0026424255740815, 'num_epochs': 150, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Early stopping:
- best epoch = 6, stopped epoch = 11
- best monitored value = 0.359535 (delta = -0.048412)

Evaluating: {'batch_size': 512, 'lr': 0.0057864482838717955, 'num_epochs': 50, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Early stopping:
- best epoch = 2, stopped epoch = 7
- best monitored value = 0.347066 (delta = -0.054204)

Evaluating: {'batch_size': 128, 'lr': 0.005428762991131081, 'num_epochs': 200, 'num_factors': 4}


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Early stopping:
- best epoch = 2, stopped epoch = 7
- best monitored value = 0.354218 (delta = -0.062706)

Evaluating: {'batch_size': 128, 'lr': 0.007501990443131995, 'num_epochs': 100, 'num_factors': 8}


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Early stopping:
- best epoch = 1, stopped epoch = 6
- best monitored value = 0.351266 (delta = -0.076875)

Best parameter settings: {'batch_size': 512, 'lr': 0.002259556863673461, 'num_epochs': 50, 'num_factors': 4}
F1@10 = 0.0956

[RandomSearch_NeuMF] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Ranking', max=2024.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Ranking', max=2045.0, style=ProgressStyle(description_wid…



VALIDATION:
...
                   |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Time (s)
------------------ + ------ + ------- + ------------ + --------- + --------
RandomSearch_NeuMF | 0.0956 |  0.2456 |       0.0549 |    0.4451 |   2.2722

TEST:
...
                   |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
------------------ + ------ + ------- + ------------ + --------- + --------- + --------
RandomSearch_NeuMF | 0.0970 |  0.2482 |       0.0556 |    0.4532 | 3795.3764 |   2.3491



In [12]:
# print('Random search: ', rs_neumf.best_params)
rs_neumf=dict()
rs_neumf['best_params'] = {'batch_size': 512, 'lr': 0.002259556863673461, 'num_epochs': 50, 'num_factors': 4}

In [17]:
rs_neumf['best_params']

{'batch_size': 512,
 'lr': 0.002259556863673461,
 'num_epochs': 50,
 'num_factors': 4}

In [19]:

ratio_split = RatioSplit(
    data=ls,
    test_size=0.2,
    rating_threshold=1.0,
    verbose=True,
)

neumf = cornac.models.NeuMF(
    num_factors = rs_neumf['best_params']['num_factors'],
    layers=[64, 32, 16, 8],
    act_fn="tanh",
    learner="adam",
    num_epochs = rs_neumf['best_params']['num_epochs'],
    lr=rs_neumf['best_params']['lr'],
    num_neg=50,
    batch_size = rs_neumf['best_params']['batch_size'],
    seed=123,
    early_stopping = {'min_delta':0.0, 'patience':5}
)

cornac.Experiment(
    eval_method=ratio_split,
    models=[neumf],
    metrics=[ndcg, pre, rec, fm],
).run()


rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 17438
Number of items = 1173
Number of ratings = 33140
Max rating = nan
Min rating = nan
Global mean = nan
---
Test data:
Number of users = 3352
Number of items = 397
Number of ratings = 5399
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 17438
Total items = 1173

[NeuMF] Training started!




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



[NeuMF] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Ranking', max=3352.0, style=ProgressStyle(description_wid…



TEST:
...
      |  F1@10 | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------- + ------------ + --------- + --------- + --------
NeuMF | 0.0787 |  0.1687 |       0.0468 |    0.3327 | 2099.7345 |   3.7428



In [20]:
pred_user = pd.read_csv('../../common/data/survey_result.csv')
pred_user = set(pred_user['u_id'])

all_prediction = pd.DataFrame(columns=['u_id','p_id','ncf_score'])

for u_idx, u_id in tqdm(enumerate(df['userID'].unique())):
    if u_id in pred_user:
        tmp = pd.DataFrame(neumf.rank(u_idx)).T
        tmp['u_id'] = u_id
        tmp = pd.DataFrame(tmp,columns=['u_id',0,1])
        tmp.rename(columns={0:'p_id',1:'ncf_score'}, inplace=True)
        all_prediction = all_prediction.append(tmp)

20246it [00:01, 11058.54it/s] 


In [47]:
all_prediction.reset_index(drop=True,inplace=True)

In [48]:
all_prediction

Unnamed: 0,u_id,p_id,ncf_score
0,ryu,15.0,0.000168
1,ryu,23.0,0.510812
2,ryu,22.0,0.001500
3,ryu,1.0,0.001480
4,ryu,18.0,0.000039
...,...,...,...
37795,쥬,484.0,0.000012
37796,쥬,255.0,0.000017
37797,쥬,516.0,0.000019
37798,쥬,415.0,0.000099


In [23]:
place_dict=dict()
for p_idx, p_id in tqdm(enumerate(df['itemID'].unique())):
    place_dict[float(p_idx)] = p_id

1260it [00:00, 931738.90it/s]


In [49]:
for i in range (0,len(all_prediction)):
    all_prediction['p_id'][i] = place_dict[all_prediction['p_id'][i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [50]:
all_prediction

Unnamed: 0,u_id,p_id,ncf_score
0,ryu,g311,0.000168
1,ryu,f230,0.510812
2,ryu,c114,0.001500
3,ryu,b209,0.001480
4,ryu,c113,0.000039
...,...,...,...
37795,쥬,f233,0.000012
37796,쥬,e20,0.000017
37797,쥬,f261,0.000019
37798,쥬,c165,0.000099


In [25]:
place_dict

{0.0: 'f239',
 1.0: 'b209',
 2.0: 'e810',
 3.0: 'a211',
 4.0: 'b44',
 5.0: 'f20',
 6.0: 'g112',
 7.0: 'b88',
 8.0: 'g327',
 9.0: 'g115',
 10.0: 'd15',
 11.0: 'f27',
 12.0: 'b43',
 13.0: 'a811',
 14.0: 'e210',
 15.0: 'g311',
 16.0: 'g123',
 17.0: 'f237',
 18.0: 'c113',
 19.0: 'f29',
 20.0: 'f16',
 21.0: 'd24',
 22.0: 'c114',
 23.0: 'f230',
 24.0: 'b72',
 25.0: 'g124',
 26.0: 'b2410',
 27.0: 'b86',
 28.0: 'b200',
 29.0: 'g1313',
 30.0: 'f42',
 31.0: 'b26',
 32.0: 'f264',
 33.0: 'g184',
 34.0: 'b19',
 35.0: 'f89',
 36.0: 'b253',
 37.0: 'e412',
 38.0: 'b21',
 39.0: 'c711',
 40.0: 'f263',
 41.0: 'f45',
 42.0: 'c147',
 43.0: 'g1314',
 44.0: 'c178',
 45.0: 'g141',
 46.0: 'f1712',
 47.0: 'b291',
 48.0: 'f73',
 49.0: 'b17',
 50.0: 'b28',
 51.0: 'b10',
 52.0: 'c176',
 53.0: 'c182',
 54.0: 'e611',
 55.0: 'g146',
 56.0: 'c149',
 57.0: 'g317',
 58.0: 'f17',
 59.0: 'f231',
 60.0: 'a810',
 61.0: 'b74',
 62.0: 'f10',
 63.0: 'd22',
 64.0: 'c112',
 65.0: 'e211',
 66.0: 'b80',
 67.0: 'f19',
 68.0: 'f1912

In [10]:
all_prediction

Unnamed: 0,u_id,p_id,ncf_score
0,ryu,19.0,0.000000e+00
1,ryu,35.0,2.682209e-07
2,ryu,268.0,3.665686e-06
3,ryu,58.0,0.000000e+00
4,ryu,2.0,6.258488e-07
...,...,...,...
1255,쥬,839.0,0.000000e+00
1256,쥬,840.0,0.000000e+00
1257,쥬,841.0,0.000000e+00
1258,쥬,842.0,0.000000e+00


In [51]:
all_prediction.to_csv('../data/ncf_scores2.csv',index=False)