# Recommeneation Algorithm

**History**

In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import pandas as pd
import numpy as np
from py2neo import Graph, Node, Relationship, Subgraph, NodeMatcher
from graphdatascience import GraphDataScience
from tqdm import tqdm
import json
import joblib
pd.set_option('display.max_rows', 100)

In [2]:
#neo4j DB info
#7473/tcp, 0.0.0.0:7471->7474/tcp, 0.0.0.0:7681->7687/tcp
db_info = {
    #'uri' : "bolt://10.109.6.14:7681",
    'uri' : "bolt://localhost:7681",
    'user' : "neo4j",
    'password' : "admin",
    }
graph_db = Graph(db_info['uri'], user=db_info['user'], password=db_info['password'])
gds = GraphDataScience(db_info['uri'], auth=(db_info['user'], db_info['password']))


## **協同過濾 (User-based)**
- 選擇user: userid:debby
- 透過已預先運算的similar relationship, 篩選top k`相似user`
- `相似user`喜歡的文章計次排序

In [141]:
def collaborative_filtering(userid = 'debby', recom_qty=20):
    #喜歡的測試資料
    cypher = f'''match (n:User {{userid:'{userid}'}})-[:LIKE_TEST]->(d:Doc) return d.titleno'''
    cursor = graph_db.run(cypher)
    df = cursor.to_data_frame()
    ans_df = df['d.titleno'].value_counts().to_frame().reset_index()
    ans_df.columns = ['titleno','cnt']
    test_titleno = ans_df['titleno'].values
    
    #喜歡的訓練資料
    cypher = f'''match (n:User {{userid:'{userid}'}})-[:LIKE]->(d:Doc) return d.titleno'''
    cursor = graph_db.run(cypher)
    df = cursor.to_data_frame()
    ans_df = df['d.titleno'].value_counts().to_frame().reset_index()
    ans_df.columns = ['titleno','cnt']
    ans_titleno = ans_df['titleno'].values

    #有看過的
    cypher = f'''match (n:User {{userid:'{userid}'}})-[:DONT_LIKE]->(d:Doc) return d.titleno'''
    cursor = graph_db.run(cypher)
    df = cursor.to_data_frame()
    valid_df = df['d.titleno'].value_counts().to_frame().reset_index()
    valid_df.columns = ['titleno','cnt']
    valid_titleno = valid_df['titleno'].values

    #推薦
    cypher = f'''match (n:User {{userid:'{userid}'}})-[:SIMILAR_USER]->(n2)-[:LIKE]->(d:Doc) return d.titleno'''
    cursor = graph_db.run(cypher)
    df = cursor.to_data_frame()
    recomm_df = df['d.titleno'].value_counts().to_frame().reset_index()
    recomm_df.columns = ['titleno','cnt']
    #all_recomm = recomm_df['titleno'].values
    all_recomm = recomm_df['titleno'].values
    recomm_df = recomm_df[recomm_df['cnt']>1]
    recomm_titleno = recomm_df[:recom_qty]['titleno'].values
    #print(userid,len(recomm_titleno))
    #計分
    ret_dict = precision_score(valid_titleno, ans_titleno, test_titleno, recomm_titleno, all_recomm)
    return ret_dict
    
def precision_score(valid_titleno, ans_titleno, test_titleno, recomm_titleno, all_recomm):
    #-- prcision --
    ret_dict = {}
    pass_cnt = 0
    fail_cnt = 0
    invalid_cnt = 0
    training_cnt = 0
    for r in recomm_titleno:
        if r in test_titleno:
            pass_cnt+=1
            continue
        if r in ans_titleno:
            training_cnt+=1
            continue            
        if (r not in test_titleno) & (r not in valid_titleno):
            invalid_cnt+=1
            continue
        if (r not in test_titleno) & (r in valid_titleno):
            #print('FAIL',r)
            fail_cnt+=1
            continue        
        print('!!!!!!!!其他不預期狀況~~~~~~',r)
    ret_dict['pass']=pass_cnt
    ret_dict['precision_fail']=fail_cnt
    ret_dict['invalid (沒看過數量)']=invalid_cnt
    ret_dict['training (預測的是訓練資料)']=training_cnt    
    ret_dict['Precision'] = pass_cnt/(pass_cnt+fail_cnt) if (pass_cnt+fail_cnt)>0 else '--' 
    
    #-- recall --
    test_cnt = len(test_titleno)
    ret_dict['test data (測試資料數量)'] = test_cnt
    ret_dict['Recall'] = recall_passCnt/test_cnt
    
    if True:
        rank_list = []
        for t in test_titleno:
            if t in all_recomm:
                rank = np.argwhere(all_recomm==t)[0][0]
                rank_list.append(rank)
            else:
                rank_list.append(-1)
        ret_dict['rank']=rank_list
    return ret_dict


In [143]:
#prediction & 計算precision

cypher = f'''match(n:User) return n.userid'''
cursor = graph_db.run(cypher)
df = cursor.to_data_frame()
userid_list = df['n.userid'].values
ret_list = []
for useid in userid_list:
    #precision, recall, pass_cnt, fail_cnt, invalid_cnt, training_cnt, recall_passCnt, test_cnt = collaborative_filtering(useid, recom_qty=25)
    ret_dict = collaborative_filtering(useid, recom_qty=25)
    ret_dict['userid']=useid
    ret_list.append(ret_dict)
    #ret_list.append([useid, precision, pass_cnt, fail_cnt, invalid_cnt, training_cnt, recall, recall_passCnt, test_cnt])
    
precision_df = pd.DataFrame(ret_list)
#precision_df.columns=['useid','precision','pass_cnt','fail_cnt','沒看過數量','訓練資料','recall','recall_passCnt','test_cnt']
precision_all = precision_df['pass'].sum()/(precision_df['precision_fail'].sum()+precision_df['pass'].sum())    
print('precision:', precision_all)
recall_all = precision_df['pass'].sum()/precision_df['test data (測試資料數量)'].sum()    
print('recall:', recall_all)

precision_df[precision_df['Precision']!='--'].sort_values(by='Precision', ascending=False)


precision: 0.3235294117647059
recall: 0.0042985541227041815


Unnamed: 0,pass,precision_fail,invalid (沒看過數量),training (預測的是訓練資料),Precision,test data (測試資料數量),Recall,rank,userid
0,1,0,15,1,1.0,19,0.0,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 14, -1, -...",509716024
70,1,0,23,1,1.0,17,0.0,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 762, -1, ...",idaw03
10,1,0,24,0,1.0,42,0.0,"[-1, -1, 518, 607, -1, -1, -1, -1, -1, 872, -1...",JW
66,1,0,4,1,1.0,76,0.0,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",andrew0717
18,1,0,20,4,1.0,141,0.0,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",kenny
25,1,0,6,2,1.0,85,0.0,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -...",Test
44,1,0,8,0,1.0,66,0.0,"[-1, 108, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...",fxn
15,2,1,22,0,0.666667,103,0.0,"[-1, -1, -1, -1, -1, -1, 901, -1, -1, -1, -1, ...",bird
26,1,1,15,0,0.5,10,0.0,"[-1, -1, 15, -1, -1, -1, -1, -1, -1, -1]",finley
8,1,3,20,0,0.25,46,0.0,"[-1, -1, -1, 82, 84, 23, -1, -1, -1, -1, -1, -...",hhug_y


## **協同過濾 (Item-based)**
- 選擇user
- 篩選出user喜歡的document
- 透過已預先運算的similar relationship, 篩選top k`相似document`
- `相似document`計次排序

In [194]:
def collaborative_itembase(userid = 'debby', recom_qty=20):
    #喜歡的測試資料
    cypher = f'''match (n:User {{userid:'{userid}'}})-[:LIKE_TEST]->(d:Doc) return d.titleno'''
    cursor = graph_db.run(cypher)
    df = cursor.to_data_frame()
    ans_df = df['d.titleno'].value_counts().to_frame().reset_index()
    ans_df.columns = ['titleno','cnt']
    test_titleno = ans_df['titleno'].values
    
    #喜歡的訓練資料
    cypher = f'''match (n:User {{userid:'{userid}'}})-[:LIKE]->(d:Doc) return d.titleno'''
    cursor = graph_db.run(cypher)
    df = cursor.to_data_frame()
    ans_df = df['d.titleno'].value_counts().to_frame().reset_index()
    ans_df.columns = ['titleno','cnt']
    ans_titleno = ans_df['titleno'].values

    #有看過的
    cypher = f'''match (n:User {{userid:'{userid}'}})-[:DONT_LIKE]->(d:Doc) return d.titleno'''
    cursor = graph_db.run(cypher)
    df = cursor.to_data_frame()
    valid_df = df['d.titleno'].value_counts().to_frame().reset_index()
    valid_df.columns = ['titleno','cnt']
    valid_titleno = valid_df['titleno'].values

    #推薦
    cypher = f'''MATCH (u:User {{userid:'{userid}'}})-[:LIKE]->(d:Doc)-[r:SIMILAR_DOC]->(d2:Doc) RETURN d2.titleno as titleno, r.score as score ORDER BY score DESC  LIMIT 100
'''
    cursor = graph_db.run(cypher)
    df = cursor.to_data_frame()
    if False:
        recomm_df = df['titleno'].value_counts().to_frame().reset_index()
        recomm_df.columns = ['titleno','cnt']
        all_recomm = recomm_df['titleno'].values
        recomm_df = recomm_df[recomm_df['cnt']>1]
        recomm_titleno = recomm_df['titleno'].values[:recom_qty]
    else:
        df['cnt']=1
        recomm_df = df.groupby('titleno').agg({'cnt':'count','score':'mean'})
        recomm_df.sort_values(['cnt','score'], inplace=True)
        recomm_df=recomm_df.reset_index()
        all_recomm = recomm_df['titleno'].values
        recomm_titleno = recomm_df['titleno'].values[:recom_qty]
    
    #計分
    print('\n\n')
    print(test_titleno, recomm_titleno)
    ret_dict = precision_score(valid_titleno, ans_titleno, test_titleno, recomm_titleno, all_recomm)
    return ret_dict
    

In [None]:
#prediction & 計算precision

cypher = f'''match(n:User) return n.userid'''
cursor = graph_db.run(cypher)
df = cursor.to_data_frame()
userid_list = df['n.userid'].values
ret_list = []
for useid in userid_list:
    ret_dict = collaborative_itembase(useid, recom_qty=20)
    ret_dict['userid']=useid
    ret_list.append(ret_dict)
    
precision_df = pd.DataFrame(ret_list)
#precision_df.columns=['useid','precision','pass_cnt','fail_cnt','沒看過數量']
precision_all = precision_df['pass'].sum()/(precision_df['precision_fail'].sum()+precision_df['pass'].sum())    
print('precision:', precision_all)
recall_all = precision_df['pass'].sum()/precision_df['test data (測試資料數量)'].sum()    
print('recall:', recall_all)

precision_df[precision_df['Precision']!='--'].sort_values(by='Precision', ascending=False)




[ 4540 24229 23604 24317  1472 23361 20642 25058 24498 24242 26174 23300
 21277 26867 20496 23027 24709 24465 23371] [  492 23135 22622 24544 24314 22668 25372 26322  4745 26360 25985 21109
 25595 22887   404 25116 25585 22204 20698 23301]



[22668 24284 27598   126   772 23216] [23532 26620   241 23654 21209 21350 24286 24781 27273 24937  1002 24414
 24202  3914 27229 20712 23670 21344 24429 25521]



[   57 26554    24 25223 23376 20970 26077  1626 25753 23452 22692 23248
 21023 24151 21943 20712 25308 22771   131 23613 25789 25459 23291 23902
 26285 25780 24933 23454 22277 20429 23848 25888 18664 26366 20863 23718
 27535 22371 22299 26358  1659 23658 24864  3965 22547  1551 26599 26491
 20721 21323 26732  1507 25890 22087 24788 24127] [27595  1493 27552 21344  3840  1595 21046  1584 24616 21206  1710   163
 27399 22347 23654  1010 22253 26266  1501  1546]



[  200 26604 24006 25016 26726 23769 21253 24856 25341 20490 21049 20535
 21324  1539 23690 26602 23314 24445 22667   489 




[  401 23100 20431 22440   260 22698 25299 23515 25143 24172 21202 25337
 21073 24581   501 23354 20494 23207 23703    69 22596 23366 24436 22094
 23412  1577 22194 25735 18678 23945    10 20629 25757 18523 25441 26728
 24067 22873 23749 24255 24630    35 20973 20593 22308 21664 21333 22131
 25538 22432 18681 20572 25618 24475 24604 22321 23261 21141 22396 25765
 21123 24052 22067 20709 25989 25483 25355 20978 21235  3956 22569 20428
   502 25568 25896 21979 23570 20923 21112 22389 25741 22499 22216 26316
 20705 22047 22720 25573 20604 24544 23783 26232 23656 23622   258 24343
 24548 24983 26016 20449 18517 21218 22187 21949 23173 23949 24468 24168
 23302 20926 24147 26878 27349 26724 22318 27269 24301 22429 22294 24101
 21994 23518 22288 24583  4740 23221   267 23143 20762  1702 23655   174
  1722 25101 24196   227 20623 27502 26399 25102 25321 25397 26885 20854
 24042 23961 26847 21315 20594 22995 25231 22690 25386 26344 22072 21169
 23359 22928  1718   790  1539  3887 23595 20979