# Recommendation Algorithm

In [116]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import pandas as pd
import numpy as np
from py2neo import Graph, Node, Relationship, Subgraph, NodeMatcher
from graphdatascience import GraphDataScience
from tqdm import tqdm
import json
import joblib
pd.set_option('display.max_rows', 100)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [117]:
# Neo4j connection settings shared by the py2neo and GDS clients below.
# Port note kept from the original author (looks like a container port
# mapping; the bolt URI below uses host port 7681 -- TODO confirm):
#7473/tcp, 0.0.0.0:7471->7474/tcp, 0.0.0.0:7681->7687/tcp
# NOTE(review): credentials are hard-coded in the notebook; consider loading
# them from the environment before sharing this file.
db_info = {
    'uri' : "bolt://10.109.6.14:7681",
    'user' : "neo4j",
    'password' : "admin",
    }
# py2neo handle used by the recommendation functions to run Cypher queries.
graph_db = Graph(db_info['uri'], user=db_info['user'], password=db_info['password'])
# Graph Data Science client opened against the same database.
gds = GraphDataScience(db_info['uri'], auth=(db_info['user'], db_info['password']))


## **協同過濾 (User-based)**
- 選擇user: userid:debby
- 透過已預先運算的similar relationship, 篩選top k`相似user`
- `相似user`喜歡的文章計次排序

In [121]:
def collaborative_filtering(userid = 'debby', recom_qty=20):
    """Score user-based collaborative-filtering recommendations for one user.

    Three Cypher queries are run against ``graph_db``:

    * docs the user ``LIKE``s (treated as the correct answers),
    * docs the user ``DONT_LIKE``s (labelled "seen" by the original author),
    * docs ``LIKE``d by users reachable through ``SIMILAR_USER``.

    Candidate docs are ranked by how many times they occur in the third
    result; docs occurring only once are dropped and the first ``recom_qty``
    remaining docs are scored with ``precision_score``.

    Args:
        userid: Value of the ``userid`` property of the ``User`` node.
        recom_qty: Maximum number of recommended docs to score.

    Returns:
        ``(precision, pass_cnt, fail_cnt, invalid_cnt)`` exactly as returned
        by ``precision_score``.
    """
    def _titleno_counts(cypher, column='d.titleno'):
        # Bind userid as a query parameter instead of interpolating it into
        # the Cypher text, so ids containing quotes cannot break or alter
        # the query.
        df = graph_db.run(cypher, userid=userid).to_data_frame()
        # A query with no rows may yield a frame without the column; treat
        # that as "no docs" rather than raising KeyError.
        if column not in df.columns:
            return pd.Series(dtype='int64')
        # value_counts() is sorted by descending frequency and its index
        # holds the distinct titleno values.
        return df[column].value_counts()

    # Liked docs.
    ans_titleno = _titleno_counts(
        '''match (n:User {userid: $userid})-[:LIKE]->(d:Doc) return d.titleno'''
    ).index.values

    # Seen docs (DONT_LIKE edges).
    valid_titleno = _titleno_counts(
        '''match (n:User {userid: $userid})-[:DONT_LIKE]->(d:Doc) return d.titleno'''
    ).index.values

    # Recommendation candidates: docs liked by similar users.
    recomm_counts = _titleno_counts(
        '''match (n:User {userid: $userid})-[:SIMILAR_USER]->(n2)-[:LIKE]->(d:Doc) return d.titleno'''
    )
    # Keep only docs that occur more than once, then take the top recom_qty.
    recomm_titleno = recomm_counts[recomm_counts > 1].index.values[:recom_qty]

    # Scoring.
    return precision_score(valid_titleno, ans_titleno, recomm_titleno)
    
def precision_score(valid_titleno, ans_titleno, recomm_titleno):
    """Classify each recommended doc and compute precision.

    Each entry of ``recomm_titleno`` (duplicates are counted individually)
    falls into exactly one bucket:

    * pass    -- it is in ``ans_titleno``;
    * fail    -- it is not in ``ans_titleno`` but is in ``valid_titleno``;
    * invalid -- it is in neither collection.

    Args:
        valid_titleno: Iterable of hashable doc ids used for the fail bucket.
        ans_titleno: Iterable of hashable doc ids used for the pass bucket.
        recomm_titleno: Iterable of recommended doc ids to classify.

    Returns:
        ``(precision, pass_cnt, fail_cnt, invalid_cnt)`` where ``precision``
        is ``pass_cnt / (pass_cnt + fail_cnt)``, or the string ``'--'`` when
        there is no pass or fail at all (callers filter on this sentinel).
    """
    # Build the lookup sets once so each membership test is O(1) instead of
    # a linear scan per recommended doc.
    liked = set(ans_titleno)
    seen = set(valid_titleno)

    pass_cnt = 0
    fail_cnt = 0
    invalid_cnt = 0
    for r in recomm_titleno:
        if r in liked:
            pass_cnt += 1
        elif r in seen:
            fail_cnt += 1
        else:
            # The three branches are exhaustive, so no further fallback
            # case is needed.
            invalid_cnt += 1

    judged = pass_cnt + fail_cnt
    precision = pass_cnt / judged if judged > 0 else '--'
    return precision, pass_cnt, fail_cnt, invalid_cnt
    

In [119]:
# Prediction & precision computation (user-based collaborative filtering).

# Fetch the userid property of every User node.
cypher = f'''match(n:User) return n.userid'''
cursor = graph_db.run(cypher)
df = cursor.to_data_frame()
userid_list = df['n.userid'].values
# One row per user: [userid, precision, pass_cnt, fail_cnt, invalid_cnt].
ret_list = []
for useid in userid_list:
    precision, pass_cnt, fail_cnt, invalid_cnt = collaborative_filtering(useid, recom_qty=25)
    ret_list.append([useid, precision, pass_cnt, fail_cnt, invalid_cnt])
    #print(f'precision:{precision}, (pass_cnt:{pass_cnt}, fail_cnt:{fail_cnt}, invalid_cnt:{invalid_cnt})')    
    
precision_df = pd.DataFrame(ret_list)
# The last column holds invalid_cnt; its label means "number of unseen docs".
precision_df.columns=['useid','precision','pass_cnt','fail_cnt','沒看過數量']
# Overall precision pools the counts of all users: sum(pass) / (sum(pass) + sum(fail)).
precision_all = precision_df['pass_cnt'].sum()/(precision_df['fail_cnt'].sum()+precision_df['pass_cnt'].sum())    
print('precision:', precision_all)

# Cell display value: drop users whose precision is the '--' placeholder
# (returned by precision_score when pass_cnt + fail_cnt == 0), then sort by
# precision, highest first.
precision_df[precision_df['precision']!='--'].sort_values(by='precision', ascending=False)

precision: 0.5625


Unnamed: 0,useid,precision,pass_cnt,fail_cnt,沒看過數量
0,509716024,1.0,1,0,14
27,p12345,1.0,3,0,22
32,athuar,1.0,1,0,23
36,1833,1.0,2,0,8
39,eileenrain,1.0,2,0,10
42,roy1023,1.0,2,0,23
45,luckie,1.0,2,0,23
46,ArthurChu,1.0,1,0,24
52,martychen,1.0,2,0,21
56,joe,1.0,1,0,24


## **協同過濾 (Item-based)**
- 選擇user
- 篩選出user喜歡的document
- 透過已預先運算的similar relationship, 篩選top k`相似document`
- `相似document`計次排序

In [123]:
def collaborative_itembase(userid = 'debby', recom_qty=20):
    """Score item-based collaborative-filtering recommendations for one user.

    Three Cypher queries are run against ``graph_db``:

    * docs the user ``LIKE``s (treated as the correct answers),
    * docs the user ``DONT_LIKE``s (labelled "seen" by the original author),
    * docs reachable from the liked docs through ``SIMILAR_DOC``, limited to
      the 100 rows with the highest relationship ``score``.

    Candidate docs are ranked by how many times they occur in those rows;
    docs occurring only once are dropped and the first ``recom_qty``
    remaining docs are scored with ``precision_score``.

    Args:
        userid: Value of the ``userid`` property of the ``User`` node.
        recom_qty: Maximum number of recommended docs to score.

    Returns:
        ``(precision, pass_cnt, fail_cnt, invalid_cnt)`` exactly as returned
        by ``precision_score``.
    """
    def _titleno_counts(cypher, column):
        # Bind userid as a query parameter instead of interpolating it into
        # the Cypher text, so ids containing quotes cannot break or alter
        # the query.
        df = graph_db.run(cypher, userid=userid).to_data_frame()
        # A query with no rows may yield a frame without the column; treat
        # that as "no docs" rather than raising KeyError.
        if column not in df.columns:
            return pd.Series(dtype='int64')
        # value_counts() is sorted by descending frequency and its index
        # holds the distinct titleno values.
        return df[column].value_counts()

    # Liked docs.
    ans_titleno = _titleno_counts(
        '''match (n:User {userid: $userid})-[:LIKE]->(d:Doc) return d.titleno''',
        'd.titleno',
    ).index.values

    # Seen docs (DONT_LIKE edges).
    valid_titleno = _titleno_counts(
        '''match (n:User {userid: $userid})-[:DONT_LIKE]->(d:Doc) return d.titleno''',
        'd.titleno',
    ).index.values

    # Recommendation candidates: docs similar to the ones the user liked.
    recomm_counts = _titleno_counts(
        '''MATCH (u:User {userid: $userid})-[:LIKE]->(d:Doc)-[r:SIMILAR_DOC]->(d2:Doc) RETURN d2.titleno as titleno, r.score as score ORDER BY score DESC  LIMIT 100
''',
        'titleno',
    )
    # Keep only docs that occur more than once, then take the top recom_qty.
    recomm_titleno = recomm_counts[recomm_counts > 1].index.values[:recom_qty]

    # Scoring.
    return precision_score(valid_titleno, ans_titleno, recomm_titleno)
    

In [124]:
# Prediction & precision computation (item-based collaborative filtering).

# Fetch the userid property of every User node.
cypher = f'''match(n:User) return n.userid'''
cursor = graph_db.run(cypher)
df = cursor.to_data_frame()
userid_list = df['n.userid'].values
# One row per user: [userid, precision, pass_cnt, fail_cnt, invalid_cnt].
ret_list = []
for useid in userid_list:
    precision, pass_cnt, fail_cnt, invalid_cnt = collaborative_itembase(useid, recom_qty=25)
    ret_list.append([useid, precision, pass_cnt, fail_cnt, invalid_cnt])
    
precision_df = pd.DataFrame(ret_list)
# The last column holds invalid_cnt; its label means "number of unseen docs".
precision_df.columns=['useid','precision','pass_cnt','fail_cnt','沒看過數量']
# Overall precision pools the counts of all users: sum(pass) / (sum(pass) + sum(fail)).
precision_all = precision_df['pass_cnt'].sum()/(precision_df['fail_cnt'].sum()+precision_df['pass_cnt'].sum())    
print('precision:', precision_all)

# Cell display value: drop users whose precision is the '--' placeholder
# (returned by precision_score when pass_cnt + fail_cnt == 0), then sort by
# precision, highest first.
precision_df[precision_df['precision']!='--'].sort_values(by='precision', ascending=False)

precision: 0.9477124183006536


Unnamed: 0,useid,precision,pass_cnt,fail_cnt,沒看過數量
0,509716024,1.0,3,0,6
34,K,1.0,1,0,5
1,mw00330,1.0,1,0,3
39,eileenrain,1.0,6,0,3
42,roy1023,1.0,1,0,4
44,fxn,1.0,1,0,7
45,luckie,1.0,2,0,2
46,ArthurChu,1.0,1,0,8
48,mmyu,1.0,6,0,4
49,eightno8,1.0,6,0,6
