# In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
########################################################################
# 
# Copyright (c) 2018 Yida, Personal. All Rights Reserved
# 
########################################################################

"""
File: wash_json_one_affi.py
Author: yida_915(yida_915@163.com)
Date: 2018/08/28 22:10:47
"""

import sys
import json
from neomodel import config

# config.DATABASE_URL = 'bolt://neo4j:neo4j@localhost:7687'  # default

from neomodel import db

# Point neomodel's shared `db` object at the Neo4j instance on localhost.
# NOTE(review): the Neo4j username and password are embedded in this URL and
# are therefore stored with the source; consider loading them from the
# environment or a local, untracked config file instead.
db.set_connection('bolt://neo4j:237200zw@localhost:7687')

from neomodel import (config, StructuredNode, StringProperty, IntegerProperty,
                      FloatProperty, UniqueIdProperty, RelationshipTo, RelationshipFrom,
                      ArrayProperty,
                      StructuredRel, DateTimeProperty)
from neomodel import cardinality
import neomodel


class WorkInRel(StructuredRel):
    """Properties carried by a WORK_IN relationship (Author -> Affiliation)."""

    # Credibility: how confident we are that the author belongs to the
    # affiliation at the other end of this relationship.
    # When a paper lists more than one affiliation (n > 1), each author is
    # given a credibility of 1/n for the link; the default of 1 covers the
    # single-affiliation case.
    rel_credibility = FloatProperty(default=1)


class Affiliation(StructuredNode):
    """Graph node for an institution, identified by its unique `name`."""

    name = StringProperty(unique_index=True, required=True)
    # One affiliation can have many authors; this is the incoming side of
    # the WORK_IN relationship declared on Author.affiliation.
    has_authors = RelationshipFrom('Author', 'WORK_IN', model=WorkInRel)


class Author(StructuredNode):
    """Graph node for a paper author and the relationships leaving it."""

    uid = UniqueIdProperty()
    # NOTE(review): `name` carries a unique index, yet the comments below
    # discuss distinct authors sharing a name -- confirm that collapsing
    # homonyms into one node is the intended behavior.
    name = StringProperty(unique_index=True)
    age = IntegerProperty(index=True, default=0)

    # Because of name collisions, one author (name) may belong to several
    # affiliations.
    affiliation = RelationshipTo('Affiliation', 'WORK_IN', model=WorkInRel)
    # NOTE(review): this relationship type is 'Publish', while
    # Paper.published_by looks for 'Publish_BY'; if the two are meant to be
    # the same edge, the type names need to agree -- confirm.
    publish = RelationshipTo('Paper', 'Publish')
    # An author collaborates with multiple other authors.
    # NOTE(review): OneOrMore requires at least one co-author per author;
    # presumably this is a problem for single-author papers -- confirm.
    co_authors = RelationshipTo('Author', 'CO_AUTHOR', cardinality=cardinality.OneOrMore)
    # Author-name disambiguation: compare the similarity of the abstract
    # keywords; if it is high, treat the two records as the same author.
    # Also check whether the affiliation matches that of an existing author.


class Literature(StructuredNode):
    """Graph node for a publication venue, identified by its unique `name`."""

    name = StringProperty(unique_index=True, required=True)

    # Incoming side of the Publish_AT relationship declared on
    # Paper.published_at.
    has_papers = RelationshipFrom('Paper', 'Publish_AT')
    # Chinese Library Classification (CLC) number.
    clc_type = StringProperty()

# Element types for the array-valued properties on Paper.
# StringProperty's first positional parameter is `choices`; passing "" there
# yields an empty (but non-None) set of allowed values, which makes every
# keyword / author name fail validation with "Invalid choice". Construct the
# element properties without arguments so any string is accepted.
paper_keyword = StringProperty()
paper_author_name = StringProperty()


class Paper(StructuredNode):
    """Graph node for a single paper, identified by its unique `title`."""

    title = StringProperty(unique_index=True, required=True)

    abstract = StringProperty(default="")
    # Lists of plain strings stored directly on the node.
    keywords = ArrayProperty(paper_keyword)
    authors = ArrayProperty(paper_author_name)

    # Outgoing side of the Publish_AT relationship; Literature.has_papers is
    # the matching incoming side.
    published_at = RelationshipTo('Literature', 'Publish_AT')
    # NOTE(review): this looks for incoming 'Publish_BY' relationships, while
    # Author.publish creates 'Publish' relationships; if the two are meant to
    # be the same edge, the type names need to agree -- confirm.
    published_by = RelationshipFrom('Author', 'Publish_BY')


# Read the dump line by line: every non-blank line is parsed as one JSON
# object and flattened into a dict with lower-case keys.
data_arr = []
with open("paper_top200.txt") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Nothing left after stripping whitespace: skip the line.
            continue

        paper = json.loads(line)
        # The entries are evaluated top to bottom, so the raw fields are read
        # in the order listed here.
        data = {
            'title': paper['Title'],
            'authors': paper['Authors'],
            'affiliations': paper['AuthorsAffiliations'],
            'subjects': paper['Subjects'],
            'abstract': paper['Abstract'],
            # The year is kept as a string.
            'year': str(paper['Year']),
            'literature': paper['Literature'],
            'clc': paper['CLC'],
        }
        data_arr.append(data)

import pandas as pd

# Build a DataFrame from the collected records and show its first rows.
paper_df = pd.DataFrame(data_arr)


print(paper_df.head())
