# Net2Graph

**History**
- 0723: 
    - 從user click中sample 30％當作測試資料, 並將relationship更改為LIKE_TEST
    - LIKE relationship新增score property

In [52]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import pandas as pd
import numpy as np
from py2neo import Graph, Node, Relationship, Subgraph, NodeMatcher
from tqdm import tqdm
import json
from keyword_extraction import Keyword_Extractor
import joblib

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Preprocess

**User Click**

In [53]:
# Load the raw user-click records. JSON text is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform default,
# which can fail on non-UTF-8 locales when titles contain non-ASCII characters.
with open('data/all_item_user_distinct_fake_.json', encoding='utf-8') as f:
    user_click = json.load(f)

# Map each titleno to its title; if a titleno occurs in several records the
# last record wins.
titl_dict = {d['titleno']: d['title'] for d in user_click}

In [54]:
# Hold out 30% of each user's clicked items as test data.
# NOTE: sample() is called without random_state, so the split differs between runs.
df_userClick = pd.DataFrame(user_click)
# Filter the clicked rows once instead of rescanning the whole frame per user.
df_clicked = df_userClick[df_userClick['clicked'] == 1]
# userid -> array of the titleno values sampled as that user's test items.
testdata_dict = {}
for userid in tqdm(df_userClick['userid'].unique()):
    df_test = df_clicked[df_clicked['userid'] == userid].sample(frac=0.3)
    testdata_dict[userid] = df_test['titleno'].values


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [00:00<00:00, 417.05it/s]


**Document**

In [55]:
# Load the per-document table; the following cells read its 'titleno' and
# 'title_sum' columns.
doc_df = pd.read_csv('data/all_item_sum.csv')

In [56]:
# Attach each document's title (raises KeyError if a titleno is missing from titl_dict).
doc_df['title'] = doc_df['titleno'].map(lambda x: titl_dict[x])
# The summary is whatever follows the title inside title_sum. maxsplit=1 keeps
# the whole remainder even when the title text appears again in the summary;
# an unbounded split would cut the summary off at the second occurrence.
doc_df['summary'] = [
    title_sum.split(title, 1)[1]
    for title_sum, title in zip(doc_df['title_sum'], doc_df['title'])
]
# Rebuild the text as "<title>。<summary>".
doc_df['title_sum_new'] = [
    '。'.join([title, summary])
    for title, summary in zip(doc_df['title'], doc_df['summary'])
]

**keyword**

In [57]:
# Set to True to re-run keyword extraction and overwrite the cached pickle;
# when False the previously saved result is loaded instead.
REBUILD_KEYWORD_CACHE = False

kwExtractor = Keyword_Extractor()
if REBUILD_KEYWORD_CACHE:
    # case_list maps titleno -> {'title_sum': ..., 'keywords': [{'keyword', 'score'}, ...]}
    case_list = {}
    for r in tqdm(doc_df.itertuples()):
        # Keyword extraction, requesting top_n=5 from the extractor.
        keywords = kwExtractor.keyword_extract(r.title_sum, top_n=5)
        keyword_df = pd.DataFrame(keywords['keywords'], columns=['keyword', 'score', 'embedding'])
        case_list[r.titleno] = {
            'title_sum': r.title_sum,
            # 'records' is the full orient name; the abbreviated 'record' is
            # rejected by pandas >= 2.0. The embedding column is not stored.
            'keywords': keyword_df[['keyword', 'score']].to_dict('records'),
        }
    joblib.dump(case_list, 'data/case_list.pkl')
else:
    # NOTE: joblib.load unpickles the file; only load caches produced locally.
    case_list = joblib.load('data/case_list.pkl')

## Import to Neo4j DB

docker run \
    --name aaneo4j_ida \
    -p7471:7474 -p7681:7687 \
    -d \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4J_dbms_security_procedures_unrestricted=apoc.*,gds.* \
    -e NEO4J_dbms_security_procedures_allowlist=gds.*,apoc.* \
    -v /Users/jayhsu/work/ws/neo4j/recommendation_system/data:/data \
    -v /Users/jayhsu/work/ws/neo4j/recommendation_system/logs:/logs \
    -v /Users/jayhsu/work/ws/neo4j/recommendation_system/import:/var/lib/neo4j/import \
    -v /Users/jayhsu/work/ws/neo4j/recommendation_system/plugins:/plugins \
    --env NEO4J_dbms_memory_heap_max__size=2g \
    --env NEO4J_dbms_memory_pagecache_size=2g \
    --env NEO4J_AUTH=neo4j/admin \
    neo4j:latest

In [58]:
def get_relation_count():
    """Return a DataFrame with the graph's node and relationship counts.

    Runs a Cypher query on the module-level ``graph_db`` that matches every
    node taking part in at least one relationship, and reports the distinct
    nodes as ``nodeCount`` and the distinct relationships as ``relCount``.
    """
    query = '''MATCH (n)-[r]-() RETURN COUNT(distinct n) as nodeCount, COUNT (distinct r) as relCount'''
    return graph_db.run(query).to_data_frame()

def del_all_graph():
    """Delete every node, together with its relationships, via ``graph_db``."""
    delete_everything = '''MATCH (i)
    DETACH DELETE i
    '''
    graph_db.run(delete_everything)


In [59]:
# Neo4j connection settings.
# Container port mapping: 7473/tcp, 0.0.0.0:7471->7474/tcp, 0.0.0.0:7681->7687/tcp
# Alternative remote endpoint: "bolt://10.109.6.14:7681"
db_info = dict(
    uri="bolt://localhost:7681",
    user="neo4j",
    password="admin",
)
graph_db = Graph(
    db_info['uri'],
    user=db_info['user'],
    password=db_info['password'],
)

In [60]:
# Import documents and their keywords: a Doc node merged on titleno, a Keyword
# node merged on keyword, and a HAS_KEYWORD relationship carrying the score.
for titleno, case_dict in tqdm(case_list.items()):
    node_doc = Node('Doc', titleno=titleno, title_sum=case_dict['title_sum'])
    graph_db.merge(node_doc, 'Doc', 'titleno')
    for kw in case_dict['keywords']:
        keyword, score = kw['keyword'], kw['score']
        node_keyword = Node('Keyword', keyword=keyword)
        graph_db.merge(node_keyword, 'Keyword', 'keyword')
        graph_db.merge(Relationship(node_doc, 'HAS_KEYWORD', node_keyword, score=score))
    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6570/6570 [08:42<00:00, 12.58it/s]


In [61]:
# Import user clicks: a User node merged on userno and one relationship per
# click record. Clicked items held out in testdata_dict become LIKE_TEST,
# other clicked items LIKE, and non-clicked items DONT_LIKE.
matcher = NodeMatcher(graph_db)
for c in tqdm(user_click):
    userno, userid = c['userno'], c['userid']
    titleno, clicked = c['titleno'], c['clicked']
    node_user = Node('User', userno=userno, userid=userid)
    # NOTE(review): .first() presumably yields None when no Doc has this
    # titleno; that case is not handled here -- confirm against the data.
    node_doc = matcher.match("Doc", titleno=titleno).first()
    graph_db.merge(node_user, 'User', 'userno')
    if clicked != 1:
        rel_type, score = 'DONT_LIKE', 0
    elif titleno in testdata_dict[userid]:
        rel_type, score = 'LIKE_TEST', 1
    else:
        rel_type, score = 'LIKE', 1
    rel_click = Relationship(node_user, rel_type, node_doc, clicked=clicked, score=score)
    graph_db.merge(rel_click)
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21829/21829 [05:40<00:00, 64.18it/s]


In [63]:
get_relation_count()

Unnamed: 0,nodeCount,relCount
0,17738,54678
