In [2760]:
from rdflib import Graph, URIRef, Literal, BNode, plugin, Namespace
#from rdflib.serializer import Serializer
from rdflib.plugin import register, Serializer
import numpy as np
from collections import defaultdict, OrderedDict
import pandas as pd
import json
import requests
from io import StringIO
import re
from urllib.request import urlopen

# Register the JSON-LD serializer plugin with rdflib so that
# Graph.serialize(format='json-ld') calls further down can resolve the format.
register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')

In [2761]:
class Schema():
    """Load tabular metadata-block definitions and expose them as an RDF graph.

    A metadata block (a tab-separated table with at least ``name`` and
    ``parent`` columns) is parsed into a pandas DataFrame, stored in
    ``datadict`` under its block name, and can then be turned into an rdflib
    graph in which every row becomes a blank node hanging off the block root
    or off its parent field.
    """

    def __init__(self, debug=False):
        # Kept for callers that want to inspect the flag; the methods take
        # their own DEBUG arguments.
        self.debug = debug
        # Values of the second TSV column that mark lines to drop while
        # parsing a metadata block (see parse_metadata_schema).
        self.forbidden = ["subject", "language", "authorIdentifierScheme", "contributorType", "publicationIDType", "DatasetField"]
        self.datadict = {}          # block name -> DataFrame
        self.g = Graph()            # current RDF graph
        self.thisRef = 'https://dataverse.org/schema'
        self.RootRef = ''           # set by to_graph: "<thisRef>/<schemaname>/"
        self.mappings = {}          # optional short name -> full URI overrides used by SetRef
        self.locator = {}           # field URI -> blank node created for it in to_graph
        self.language = 'en'
        self.CompoundNodes = []     # blank-node ids referenced from other nodes
        self.CompoundValues = {}    # blank-node id -> JSON-LD node object
        self.Vertices = {}          # non-blank node id -> JSON-LD node object
        self.termURIs = {}          # field name -> termURI from the loaded blocks
        self.defaultlanguage = ''   # when set, literals are tagged with this language
        self.serializeJSON = {}     # filled by CompoundElements

    def loadfile(self, filename):
        """Read a CSV, TSV or JSON resource into a DataFrame.

        The format is chosen from the ``.csv`` / ``.tsv`` / ``.json`` extension
        found in *filename* (case-insensitive). CSV and TSV are handed to
        ``pandas.read_csv`` directly; JSON is fetched with ``urlopen`` when
        *filename* carries a URL scheme and read from the local file system
        otherwise.

        Returns the DataFrame, or ``False`` when no known extension is found.
        """
        if re.search(r'\.csv', filename, re.IGNORECASE):
            return pd.read_csv(filename)
        if re.search(r'\.tsv', filename, re.IGNORECASE):
            return pd.read_csv(filename, sep="\t")
        if re.search(r'\.json', filename, re.IGNORECASE):
            if re.match(r'[a-z][a-z0-9+.\-]*://', filename, re.IGNORECASE):
                # Close the HTTP response deterministically.
                with urlopen(filename) as response:
                    json_data = response.read().decode('utf-8', 'replace')
            else:
                with open(filename, encoding='utf-8', errors='replace') as handle:
                    json_data = handle.read()
            # Recent pandas deprecates passing literal JSON text; wrap it in a buffer.
            return pd.read_json(StringIO(json_data))
        return False

    def emptyGraph(self):
        """Replace the current graph with a new empty one and return it."""
        self.g = Graph()
        return self.g

    def load_metadata_schema(self, schemaURL, schemablock=False):
        """Download a metadata block TSV and register it under *schemablock*.

        Raises ``requests.HTTPError`` when the server answers with an error
        status, instead of trying to parse the error page as a table.

        Returns ``self.datadict`` (all blocks loaded so far).
        """
        response = requests.get(schemaURL, timeout=60)
        response.raise_for_status()
        return self.parse_metadata_schema(response.text, schemablock)

    def parse_metadata_schema(self, text, schemablock=False):
        """Parse metadata block TSV *text* and register it under *schemablock*.

        The first two lines of *text* are skipped. Every remaining line whose
        second tab-separated field is listed in ``self.forbidden``, or that has
        fewer than two fields, is dropped. The rest is parsed as a TSV table
        whose first kept line is the header. ``name`` -> ``termURI`` pairs with
        a non-missing URI are merged into ``self.termURIs``.

        *schemablock* defaults to ``'default'`` when falsy.
        Returns ``self.datadict``.
        """
        keynameID = 1
        if not schemablock:
            schemablock = 'default'

        kept = []
        for line in text.split('\n')[2:]:
            elements = line.split('\t')
            if len(elements) <= keynameID:
                continue
            if elements[keynameID] in self.forbidden:
                continue
            kept.append(line + "\n")

        data = self._read_tsv(''.join(kept))
        self.datadict[schemablock] = data

        # Mappings for termURIs
        if 'termURI' in data.columns:
            for name, uri in zip(data['name'], data['termURI']):
                if pd.notna(uri):
                    self.termURIs[name] = uri

        return self.datadict

    @staticmethod
    def _read_tsv(text):
        """Parse TSV text, warning about (and skipping) malformed lines."""
        try:
            return pd.read_csv(StringIO(text), sep="\t", on_bad_lines='warn')
        except TypeError:
            # pandas < 1.3 does not know on_bad_lines; use the older spelling.
            return pd.read_csv(StringIO(text), sep="\t", error_bad_lines=False)

    def RemoveRef(self, valueURL):
        """Strip the current root reference and any ``<``/``>`` from *valueURL*."""
        valueURL = valueURL.replace(self.RootRef, '')
        valueURL = valueURL.replace('<', '')
        valueURL = valueURL.replace('>', '')
        return valueURL

    def SetTermURI(self, value):
        """Return the termURI known for *value*, else fall back to SetRef."""
        if value in self.termURIs:
            return self.termURIs[value]
        return self.SetRef(value)

    def SetRef(self, value):
        """Build the reference URI for *value*.

        ``#`` and spaces are removed first. A loaded semantic mapping wins;
        otherwise the cleaned value is appended to the current root reference.
        """
        value = value.replace('#', '')
        value = value.replace(' ', '')
        if value in self.mappings:
            return self.mappings[value]
        return "%s%s" % (self.RootRef, value)

    def to_graph(self, schemaname=False, filename = False, DEBUG=False):
        """Rebuild ``self.g`` from the block stored under *schemaname*.

        Every row becomes a blank node. Rows without a parent are attached to
        the block root, the others to the blank node of their parent field
        (which must appear earlier in the table). Each non-empty cell, except
        the one in the last column, is added as a literal under a
        ``schema_<column>`` predicate.

        When *filename* is truthy the graph is also written to
        ``/tmp/<schemaname>.nt`` (N3) and ``/tmp/<schemaname>.json-ld``; the
        value of *filename* itself is not used in the paths.

        Returns the graph, or ``None`` when *schemaname* has not been loaded.
        Raises ``KeyError`` when a row names a parent that was not added yet.
        """
        self.RootRef = "%s/%s/" % (self.thisRef, schemaname)

        if schemaname not in self.datadict:
            return None

        self.g = self.emptyGraph()
        ns1 = Namespace(self.RootRef)
        self.g.bind(schemaname, ns1)
        skos = Namespace('http://www.w3.org/2004/02/skos/core#')
        self.g.bind('skos', skos)

        frame = self.datadict[schemaname]
        # Empty cells become '' so they are falsy and skipped below.
        frame.fillna('', inplace=True)
        predicates = [URIRef(self.SetRef("schema_%s" % name)) for name in frame.columns]
        staRoot = URIRef(self.RootRef)

        for row in range(0, frame['name'].size):
            record = frame.loc[row]
            staID = BNode()
            nodename = self.SetRef(record['name'])
            parentname = self.SetRef(record['parent'])

            if DEBUG:
                print(nodename)
            if parentname != self.RootRef:
                try:
                    staParent = self.locator[parentname]
                except KeyError:
                    raise KeyError(
                        "parent %r of %r has not been added to the graph yet"
                        % (parentname, nodename)
                    ) from None
                self.g.add((staParent, URIRef(nodename), staID))
                self.g.add((staParent, skos['broader'], URIRef(nodename)))
            else:
                self.g.add((staRoot, URIRef(nodename), staID))
            self.locator[nodename] = staID

            # zip stops one short of the predicates: the last column is not exported.
            for predicate, item in zip(predicates, record.values[:-1]):
                if not item:
                    continue
                if self.defaultlanguage:
                    self.g.add((staID, predicate, Literal(item, lang=self.defaultlanguage)))
                else:
                    self.g.add((staID, predicate, Literal(item)))

        # Save to files
        if filename:
            self.g.serialize(format='n3', destination="/tmp/%s.nt" % schemaname)
            self.g.serialize(format='json-ld', auto_compact=True, use_rdf_type=True, destination="/tmp/%s.json-ld" % schemaname)
        return self.g

    def isNode(self, pNode):
        """Return *pNode* when its string form starts with ``_:N``, else ``False``."""
        if pNode and str(pNode)[:3] == '_:N':
            return pNode
        return False

    def CompoundElements(self, jsongraph, DEBUG=None):
        """Index a list of JSON-LD node objects and resolve blank-node references.

        First pass: every ``{'@id': ...}`` entry pointing at a blank node is
        recorded in ``CompoundNodes`` / ``CompoundValues``; each node object is
        then filed under ``CompoundValues`` (blank nodes) or ``Vertices``
        (everything else). Second pass: for every vertex property whose first
        entry is a reference, ``serializeJSON[key]`` becomes a list holding
        the referenced node object (empty if unknown); other properties are
        copied as they are.

        State accumulates across calls. Returns the id of the last blank node
        that was resolved, or ``None``; the assembled document is available as
        ``self.serializeJSON``.
        """
        for node in jsongraph:
            rootNodeID = None
            for key, values in node.items():
                if key == '@id':
                    if DEBUG:
                        print("KEY %s / %s" % (self.isNode(values), values))
                    rootNodeID = values
                    continue
                if not isinstance(values, list):
                    continue
                for entry in values:
                    # Only reference objects matter; plain strings (e.g. @type) are skipped.
                    if not (isinstance(entry, dict) and '@id' in entry):
                        continue
                    nodeID = entry['@id']
                    if self.isNode(nodeID):
                        self.CompoundNodes.append(nodeID)
                        self.CompoundValues[nodeID] = entry
                        if DEBUG:
                            print("\t%s\n" % nodeID)
            if self.isNode(rootNodeID):
                self.CompoundValues[rootNodeID] = node
            else:
                self.Vertices[rootNodeID] = node

        randomNode = None
        for node in self.Vertices.values():
            for key, values in node.items():
                first = values[0] if isinstance(values, list) and values else None
                if isinstance(first, dict) and '@id' in first:
                    nodeID = first['@id']
                    if DEBUG:
                        print("%s %s" % (key, nodeID))
                    extra = []
                    if nodeID in self.CompoundValues:
                        extra.append(self.CompoundValues[nodeID])
                        randomNode = nodeID
                    self.serializeJSON[key] = extra
                else:
                    self.serializeJSON[key] = values
        return randomNode

    def Lookup(self, fieldname=None, NESTED=None, DEBUG=None):
        """Describe the node(s) reached through the predicate for *fieldname*.

        For every object reached via that predicate in this instance's graph,
        its outgoing triples are inspected. With *NESTED* falsy, each predicate
        maps to ``{'uri': object, 'loc': predicate}``. With *NESTED* truthy,
        only predicates whose N3 form contains neither ``schema_`` nor ``skos``
        are kept, each mapped to a dict with ``loc``, ``nested``, ``labels``
        (a plain Lookup of that sub-field) and ``short``.

        Returns a dict; empty when nothing matches.
        """
        lookup = {}
        field_predicate = URIRef(self.SetRef(fieldname))
        for s, p, o in self.g.triples((None, field_predicate, None)):
            for s1, p1, o1 in self.g.triples((o, None, None)):
                if NESTED:
                    if not re.search('schema_|skos', p1.n3()):
                        info = {}
                        info['loc'] = o1
                        info['nested'] = 'True'
                        info['labels'] = self.Lookup(self.RemoveRef(p1.n3()))
                        info['short'] = self.RemoveRef(p1.n3())
                        lookup[p1.n3()] = info
                else:
                    info = {}
                    if DEBUG:
                        print("%s %s %s" % (s1, p1, o1))
                    info['uri'] = o1
                    info['loc'] = p1
                    lookup[str(p1)] = info
        return lookup

    def Overview(self, subfield=None, condition=None, DEBUG=None):
        """List the fields whose *subfield* property equals ``Literal(condition)``.

        When *subfield* is falsy the predicate is a wildcard, so any property
        with that literal value matches. For every matching subject, each value
        under a predicate containing ``name`` becomes a key mapped to
        ``{'uri': SetRef(value), 'loc': subject}``.

        The predicate being searched for is always printed.
        """
        overview = {}
        lookup_term = None
        if subfield:
            lookup_term = URIRef(self.SetRef(subfield))
        print(lookup_term)
        for s, p, o in self.g.triples((None, lookup_term, Literal(condition))):
            for s1, p1, o1 in self.g.triples((s, None, None)):
                if re.search('name', p1):
                    info = {}
                    if DEBUG:
                        print("S %s %s" % (s1, o1))
                    info['uri'] = self.SetRef(o1)
                    info['loc'] = s1
                    overview[str(o1)] = info
        return overview

In [2762]:
# Create the shared Schema instance used by the rest of the notebook and load
# the example CSV into a DataFrame.
schema = Schema()
csvfile = 'https://raw.githubusercontent.com/Dans-labs/common-migrations/master/core/resources/examples/example.csv'
dataset = schema.loadfile(csvfile)
# Bare expression: display the loaded DataFrame as the cell output.
dataset

Unnamed: 0,title,identifier,sid,state,permission,organization,lat,lng,firstYear,lastYear,taxon,objectType,elementType,investigator,language,category,UserLastname,userInitials,userEmail,userId
0,"CH, Co. Amsterdam",Klc House,dccd:4742,PUBLISHED,values,Quebec Univ,12.456,-9.098,8768,9887,Curques,House,,Lian McDonald,en,built in plug,Blue,K. M.,k.blue@example.uk,King


In [2763]:
# Load a JSON example through the same loader; loadfile picks the parser from
# the ".json" extension in the URL.
jsonfile = 'https://raw.githubusercontent.com/vega/vega/master/docs/data/population.json'
dataset2 = schema.loadfile(jsonfile)

In [2764]:
# Load the tab-separated example. The original cell passed `jsonfile` here, so
# the TSV was never read and dataset3 was just a second copy of dataset2.
tsvfile = 'https://raw.githubusercontent.com/datasciencelabs/2019/master/shiny/population.tsv'
dataset3 = schema.loadfile(tsvfile)

In [2765]:
# Load two metadata blocks. Each call stores the parsed table under the given
# block name and returns the dict holding every block loaded so far.
schemaURL = 'https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/citation.tsv'
schemapd = schema.load_metadata_schema(schemaURL, 'citation')
schemaURL = 'https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/biomedical.tsv'
schemapd = schema.load_metadata_schema(schemaURL, 'biomedical')
#schemapd['citation'].head()

# Collect field name -> termURI for the citation block, skipping fields without
# a URI. pd.notna is used because an identity test against np.nan only works
# when the missing value happens to be that exact float object.
termURIs = {}
if 'termURI' in schemapd['citation'].columns:
    termURIs = {
        name: uri
        for name, uri in zip(schemapd['citation']['name'], schemapd['citation']['termURI'])
        if pd.notna(uri)
    }
termURIs

b'Skipping line 71: expected 17 fields, saw 18\n'


{'title': 'http://purl.org/dc/terms/title',
 'alternativeTitle': 'http://purl.org/dc/terms/alternative',
 'alternativeURL': 'https://schema.org/distribution',
 'author': 'http://purl.org/dc/terms/creator',
 'authorIdentifier': 'http://purl.org/spar/datacite/AgentIdentifier',
 'publication': 'http://purl.org/dc/terms/isReferencedBy',
 'publicationCitation': 'http://purl.org/dc/terms/bibliographicCitation',
 'publicationIDNumber': 'http://purl.org/spar/datacite/ResourceIdentifier',
 'publicationURL': 'https://schema.org/distribution',
 'contributor': 'http://purl.org/dc/terms/contributor',
 'grantNumber': 'https://schema.org/sponsor',
 'dateOfDeposit': 'http://purl.org/dc/terms/dateSubmitted',
 'timePeriodCovered': 'https://schema.org/temporalCoverage',
 'kindOfData': 'http://rdf-vocabulary.ddialliance.org/discovery#kindOfData',
 'software': 'https://www.w3.org/TR/prov-o/#wasGeneratedBy',
 'relatedDatasets': 'http://purl.org/dc/terms/relation',
 'otherReferences': 'http://purl.org/dc/ter

In [2766]:
# Build the RDF graph for the citation block; a truthy `filename` also makes
# to_graph write the graph to /tmp as N3 and JSON-LD.
schema.to_graph('citation', filename='citation')

<Graph identifier=Nc327a15395d3484b96a29630959760a6 (<class 'rdflib.graph.Graph'>)>

In [2767]:
# Switch for the optional cells that rebuild the graph from other blocks;
# left False so that schema.g keeps the citation graph.
OTHER = False
if OTHER:
    # to_graph starts from a fresh graph itself, so this reset is not strictly needed.
    schema.emptyGraph()
    schema.to_graph('biomedical', filename='biomedical')

In [2768]:
# Optional: load the social science block and build its graph instead
# (only runs when OTHER was set to True in the previous cell).
if OTHER:
    socialURL = "https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/social_science.tsv"
    schemapd = schema.load_metadata_schema(socialURL, 'socialsciences')
    schema.emptyGraph() 
    schema.to_graph('socialsciences', filename='socialsciences')

In [2769]:
# Round-trip the current graph through JSON-LD and let the schema index the
# resulting node objects.
jsonld = schema.CompoundElements(
    json.loads(schema.g.serialize(format='json-ld'))
)

# Print every triple that mentions "Title"; a triple is printed once for each
# of its terms that matches.
for triple in schema.g:
    statement = list(triple)
    matching_terms = [term for term in statement if 'Title' in str(term)]
    for _ in matching_terms:
        print(statement)

# For every field flagged as required, print the properties whose predicate
# contains "name" together with their values.
required = URIRef('https://dataverse.org/schema/citation/schema_required')
for node, _, _ in schema.g.triples((None, required, Literal('True'))):
    for _, prop, value in schema.g.triples((node, None, None)):
        if 'name' in prop:
            print("S %s %s" % (prop, value))

[rdflib.term.URIRef('https://dataverse.org/schema/citation/'), rdflib.term.URIRef('https://dataverse.org/schema/citation/alternativeTitle'), rdflib.term.BNode('Nc3822b0ebd6a4e29a576d7e0ddbbd4ea')]
[rdflib.term.BNode('Nc3822b0ebd6a4e29a576d7e0ddbbd4ea'), rdflib.term.URIRef('https://dataverse.org/schema/citation/schema_name'), rdflib.term.Literal('alternativeTitle')]
[rdflib.term.BNode('Nf7d8196518c04b1d840a9b905269f4a6'), rdflib.term.URIRef('https://dataverse.org/schema/citation/schema_title'), rdflib.term.Literal('Title')]
[rdflib.term.BNode('Nc3822b0ebd6a4e29a576d7e0ddbbd4ea'), rdflib.term.URIRef('https://dataverse.org/schema/citation/schema_title'), rdflib.term.Literal('Alternative Title')]
S https://dataverse.org/schema/citation/schema_name authorName
S https://dataverse.org/schema/citation/schema_name datasetContact
S https://dataverse.org/schema/citation/schema_name datasetContactEmail
S https://dataverse.org/schema/citation/schema_name dsDescriptionValue
S https://dataverse.org/s

In [2770]:
# Fields whose schema_required property is the literal 'True', keyed by field
# name. Overview also prints the predicate URI it searched for.
fields = schema.Overview('schema_required', 'True')
fields

https://dataverse.org/schema/citation/schema_required


{'authorName': {'uri': 'https://dataverse.org/schema/citation/authorName',
  'loc': rdflib.term.BNode('N27eeb9bbccc54f22bcf46676f2cd12b7')},
 'datasetContact': {'uri': 'https://dataverse.org/schema/citation/datasetContact',
  'loc': rdflib.term.BNode('Ne765fa6aa12a46509ee49a7c545ff2bf')},
 'datasetContactEmail': {'uri': 'https://dataverse.org/schema/citation/datasetContactEmail',
  'loc': rdflib.term.BNode('Nc05950df769b4a63936e14b586fa0b0e')},
 'dsDescriptionValue': {'uri': 'https://dataverse.org/schema/citation/dsDescriptionValue',
  'loc': rdflib.term.BNode('N733847a32648425099acb2a06f480653')},
 'producerName': {'uri': 'https://dataverse.org/schema/citation/producerName',
  'loc': rdflib.term.BNode('N4a4875843b614961b22708501868f2a8')},
 'title': {'uri': 'https://dataverse.org/schema/citation/title',
  'loc': rdflib.term.BNode('Nf7d8196518c04b1d840a9b905269f4a6')},
 'author': {'uri': 'https://dataverse.org/schema/citation/author',
  'loc': rdflib.term.BNode('N37de6f47037a4a64911356

In [2771]:
# An empty subfield makes Overview match any property whose value is the
# literal 'True'. Print each such field and remember the nested description of
# the last field that has sub-fields.
for field in schema.Overview('', 'True'):
    nested = schema.Lookup(field, NESTED=True)
    if nested:
        # Reuse the result instead of running the identical lookup a second time.
        nested_fields = nested
    print(field)

None
distributor
author
productionDate
publicationIDNumber
relatedMaterial
timePeriodCoveredStart
timePeriodCoveredEnd
keywordValue
seriesName
authorAffiliation
grantNumber
contributorName
contributor
otherReferences
dateOfDeposit
dsDescription
producerName
authorName
dsDescriptionValue
datasetContact
dsDescriptionDate
topicClassValue
keywordVocabularyURI
authorIdentifier
kindOfData
producer
distributorName
datasetContactEmail
relatedDatasets
grantNumberAgency
title
grantNumberValue
datasetContactAffiliation
keyword
publication
keywordVocabulary
otherId
topicClassification
softwareName
timePeriodCovered
publicationCitation
software
distributionDate
datasetContactName
dateOfCollection
notesText


In [2772]:
# Two nested lookups; only the value of the last expression is echoed as the
# cell output, so the 'keyword' result is computed but not shown.
schema.Lookup('keyword', NESTED=True)
schema.Lookup('dsDescriptionValue', NESTED=True)

{}

In [2777]:
# Rename the dataset columns to schema field names where a mapping exists;
# unmapped columns keep their original name.
dataset_fields = dataset.columns
mappings = { 'investigator': 'authorName', 'organization': 'affiliation', 'taxon': 'dsDescription', "category": "keyword" }
newcolumns = [mappings.get(fieldname, fieldname) for fieldname in dataset_fields]

# A second Schema instance is used only as a holder for the dataset graph;
# URIs and lookups still come from `schema`, which has the citation block loaded.
datasetg = Schema()
root = schema.SetRef('citation')

print(root)
dataset.columns = newcolumns
for i in dataset.index:
    data = dataset.loc[i]
    for col, field in enumerate(dataset.columns):
        # Positional access must be explicit: integer keys on a label-indexed
        # Series are deprecated in recent pandas.
        value = data.iloc[col]
        print("[%s] %s %s"% (col, field, value))
        if not schema.Lookup(fieldname=field):
            # Column is not a field of the loaded schema: nothing to export.
            continue

        print(field)
        fieldURI = schema.SetTermURI(field)
        staID = BNode()
        datasetg.g.add((URIRef(root), URIRef(fieldURI), staID))

        # One nested lookup per field; an empty result means a simple field.
        nested = schema.Lookup(field, NESTED=True)
        nested_fields = nested if nested else None
        if nested:
            datasetg.g.add((URIRef(root), URIRef(schema.SetRef(field)), Literal(value)))
            for nf in nested_fields:
                print("%s %s\n" % (field, nf))
                # Keys are N3-formatted ("<uri>"); drop the angle brackets.
                subfield = nf.replace('<', '').replace('>', '')
                datasetg.g.add((URIRef(schema.SetRef(field)), URIRef(subfield), Literal(value)))
        else:
            datasetg.g.add((URIRef(root), URIRef(fieldURI), Literal(value)))

schemaname = 'dataset'

# Note: the first file holds N3 even though it is named ".nt".
datasetg.g.serialize(format='n3', destination="/tmp/%s.nt" % schemaname)
datasetg.g.serialize(format='json-ld', auto_compact=True, use_rdf_type=True, destination="/tmp/%s.jsonld" % schemaname)

https://dataverse.org/schema/citation/citation
[0] title CH, Co. Amsterdam
title
[1] identifier Klc House
[2] sid dccd:4742
[3] state PUBLISHED
[4] permission values
[5] affiliation Quebec Univ
[6] lat 12.456
[7] lng -9.097999999999999
[8] firstYear 8768
[9] lastYear 9887
[10] dsDescription Curques
dsDescription
dsDescription <https://dataverse.org/schema/citation/dsDescriptionDate>

dsDescription <https://dataverse.org/schema/citation/dsDescriptionValue>

[11] objectType House
[12] elementType nan
[13] authorName Lian McDonald
authorName
[14] language en
[15] keyword built in plug
keyword
keyword <https://dataverse.org/schema/citation/keywordVocabulary>

keyword <https://dataverse.org/schema/citation/keywordVocabularyURI>

keyword <https://dataverse.org/schema/citation/keywordValue>

[16] UserLastname Blue
[17] userInitials K. M.
[18] userEmail k.blue@example.uk
[19] userId King
