In [1]:
from neomodel import config, db, StructuredNode, StringProperty, IntegerProperty, UniqueIdProperty, RelationshipTo

import os

# Connection settings for the Neo4j instance on localhost:7687.
# Credentials are taken from the environment when set, so real passwords do
# not have to be written into the notebook; the fallbacks are the original
# development values.
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'test')

# Build the bolt URL once and use it for both the neomodel config and the
# explicit connection call so the two can never drift apart.
database_url = f'bolt://{user}:{password}@localhost:7687'
config.DATABASE_URL = database_url
db.set_connection(database_url)

In [2]:
class Author(StructuredNode):
    """Graph node for an author; `name` is mandatory and uniquely indexed."""

    name = StringProperty(required=True, unique_index=True)
    
class Publisher(StructuredNode):
    """Graph node for a publisher; `name` is mandatory and uniquely indexed."""

    name = StringProperty(required=True, unique_index=True)
    
class Year(StructuredNode):
    """Graph node for a year; the integer `year` is mandatory and uniquely indexed."""

    year = IntegerProperty(required=True, unique_index=True)

class Location(StructuredNode):
    """Graph node for a location; `name` is mandatory and uniquely indexed."""

    name = StringProperty(required=True, unique_index=True)
    
class Fragment(StructuredNode):
    """Graph node for a fragment; `cluster_id` is mandatory and uniquely indexed."""

    cluster_id = StringProperty(required=True, unique_index=True)

class Text(StructuredNode):
    """Graph node for a text, keyed by a mandatory, uniquely indexed `text_id`.

    `title` is an optional string. The remaining attributes are outgoing
    relationships to Year, Author, Publisher, Location and Fragment nodes.
    """

    # Scalar properties stored on the node itself.
    text_id = StringProperty(required=True, unique_index=True)
    title = StringProperty()

    # Outgoing relationships: (target node class, relationship type).
    year = RelationshipTo(Year, 'PUBLICATION_YEAR')
    author = RelationshipTo(Author, 'HAS_AUTHOR')
    publisher = RelationshipTo(Publisher, 'HAS_PUBLISHER')
    location = RelationshipTo(Location, 'PUBLISHED_IN')
    fragment = RelationshipTo(Fragment, 'USES_FRAGMENT')
    

In [3]:
import json
# Load the parsed text records from disk.
# The file is decoded as UTF-8 explicitly so the result does not depend on
# the platform's default encoding. There is no empty pre-initialisation: if
# the load fails the cell raises instead of leaving an empty `texts` behind.
with open('../data/test_parse.json', encoding='utf-8') as f:
    texts = json.load(f)


In [4]:
import sys
import time

def _get_or_create(node_cls, **props):
    """Return the `node_cls` node matching `props`, saving a new one if none exists.

    Only the class's own `DoesNotExist` error triggers creation; any other
    failure (for example a connection problem) propagates instead of being
    mistaken for a missing node.
    """
    try:
        return node_cls.nodes.get(**props)
    except node_cls.DoesNotExist:
        return node_cls(**props).save()


def _parse_year(date):
    """Join the numeric characters of `date` and convert the result to int.

    Raises ValueError when nothing convertible is left (e.g. 'M.DC.LXXII.').
    """
    # NOTE(review): every numeric character is kept, so a value containing two
    # years would be fused into one large number -- confirm this is acceptable.
    return int("".join(ch for ch in date if ch.isnumeric()))


# Create one Text node per record and link it to its locations, authors,
# publishers and publication year, creating those nodes on first use.
start = time.time()

for i, text in enumerate(texts):
    db_text = Text(text_id=text['id'], title=text['title']).save()

    # All list-valued fields are optional; a missing key means "no links".
    for loc in text.get('pubplace', []):
        db_text.location.connect(_get_or_create(Location, name=loc))

    for author in text.get('author', []):
        db_text.author.connect(_get_or_create(Author, name=author))

    for publisher in text.get('publisher', []):
        db_text.publisher.connect(_get_or_create(Publisher, name=publisher))

    if text.get('date', ''):
        try:
            year = _parse_year(text['date'])
        except (ValueError, TypeError):
            # Best-effort: an unparseable date is reported and the text is
            # kept without a year link. Database errors are not caught here.
            print('year failded', text)
        else:
            db_text.year.connect(_get_or_create(Year, year=year))

    # Report timing every 1000 records and restart the checkpoint timer.
    if i % 1000 == 999:
        print(f'Time since last checkpoint {time.time() - start} Progress at {i + 1} / {len(texts)}')
        start = time.time()

Time since last checkpoint 58.80594062805176 Progress at 1000 / 60327
Time since last checkpoint 59.67611646652222 Progress at 2000 / 60327
Time since last checkpoint 62.22266340255737 Progress at 3000 / 60327
year failded {'id': 'A66970', 'title': "The Roman-church's devotions vindicated from Doctour Stillingfleet's mis-representation by O.N. a Catholick.", 'author': ['N. O.', 'R. H.', 'Cressy', 'Serenus'], 'pubplace': [], 'publisher': [], 'date': 'M.DC.LXXII.'}
year failded {'id': 'A65465', 'title': "The pious communicant rightly prepar'd, or, A discourse concerning the Blessed Sacrament wherein the nature of it is described, our obligation to frequent communion enforced, and directions given for due preparation for it, behaviour at, and after it, and profiting by it : with prayers and hymns, suited to the several parts of that holy office : to which is added, a short discourse of baptism / by Samuel Wesley ...", 'author': ['Wesley, Samuel'], 'pubplace': ['London'], 'publisher': ['Ch

In [18]:
# Map each text id to its publication year. Texts without a date get no
# entry; texts whose date cannot be converted are stored with ''.
byid = {}

for text in texts:
    date = text.get('date', '')
    if not date:
        continue
    try:
        # Keep only the numeric characters of the date and read them as one int.
        byid[text['id']] = int("".join(ch for ch in date if ch.isnumeric()))
    except (ValueError, TypeError):
        # Only conversion failures are treated as "no usable year".
        byid[text['id']] = ''

In [3]:
import json
# Load the parsed cluster data from disk.
# The file is decoded as UTF-8 explicitly so the result does not depend on
# the platform's default encoding. There is no list pre-initialisation: the
# loaded value is accessed with `.keys()` later in this notebook, and a failed
# load should raise rather than leave a misleading empty list behind.
with open('../data/test_cluster_parse.json', encoding='utf-8') as f:
    clusters = json.load(f)


In [None]:
# Link every known text to the fragment clusters it appears in.
# Way too slow: this needs to be turned into CSV imports instead.

import time

start = time.time()
# The first 10000 cluster ids are skipped.
# NOTE(review): presumably they were imported by an earlier run -- confirm.
# The progress figure below counts from the slice start, not from cluster 0.
for i, cluster in enumerate(list(clusters.keys())[10000:]):
    # Get-or-create the fragment; only a genuine "not found" triggers creation.
    try:
        fragment_node = Fragment.nodes.get(cluster_id=cluster)
    except Fragment.DoesNotExist:
        fragment_node = Fragment(cluster_id=cluster).save()

    for text in clusters[cluster]:
        try:
            db_text = Text.nodes.get(text_id=text['text_id'])
        except Text.DoesNotExist:
            # Best-effort: texts that were never imported are skipped.
            continue
        # connect() writes the relationship itself, so no extra save() follows.
        db_text.fragment.connect(fragment_node)

    # NOTE(review): this fires at i = 999, 100999, ... -- confirm whether a
    # period of 1000 or 100000 was intended.
    if i % 100000 == 999:
        print(f'Time passed since last checkpoint {time.time() - start}, progress at {i + 1} / {len(clusters)}')
        # Restart the timer so the next message really is "since last checkpoint".
        start = time.time()
        

Time passed since last checkpoint 109.93622469902039, progress at 1000 / 3707553


In [4]:
# pandas >= 1.0 exposes json_normalize at the top level; the pandas.io.json
# location is the older, deprecated spelling and is kept only as a fallback
# for old installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Flatten the cluster data into a DataFrame and write it out as CSV.
# NOTE(review): `clusters` is accessed with `.keys()` elsewhere in this
# notebook; normalizing a single dict presumably yields one very wide row --
# confirm this is the CSV layout that is wanted.
df = json_normalize(clusters)
df.to_csv('test.csv')

In [None]:
class Tester(StructuredNode):
    # NOTE(review): RelationshipTo() is called without arguments here, while
    # the other relationships in this notebook pass a target node class and a
    # relationship type. This looks like an unfinished scratch cell -- confirm
    # the intended target before running it.
    texts = RelationshipTo()