In [197]:
from rdflib import Graph, URIRef, Literal, BNode, plugin, Namespace
from rdflib.serializer import Serializer
from rdflib.plugin import register, Serializer
import numpy as np
from collections import defaultdict, OrderedDict
import pandas as pd
import json
import requests
from io import StringIO
import re
from urllib.request import urlopen

register('json-ld', Serializer, 'rdflib_jsonld.serializer', 'JsonLDSerializer')

In [198]:
class Schema():
    def __init__(self, debug=False):
        self.forbidden = ["subject", "language", "authorIdentifierScheme", "contributorType", "publicationIDType", "DatasetField"]
        self.forbidden = ["language", "authorIdentifierScheme", "contributorType", "publicationIDType", "DatasetField"]
        self.datadict = {}
        self.g = Graph()
        self.thisRef = 'https://dataverse.org/schema'
        self.RootRef = ''
        self.mappings = {}
        self.locator = {}
        self.language = 'en'
        self.CompoundNodes = []
        self.CompoundValues = {}
        self.Vertices = {}
        self.termURIs = {}
        self.defaultlanguage = ''
        self.serializeJSON = {}
        self.metadataframe = None
        #self.forbidden = {}
        
    def loadfile(self, filename):
        data = False
        if re.search('.csv', filename, re.IGNORECASE):
            data = pd.read_csv(filename)
        if re.search('.tsv', filename, re.IGNORECASE):            
            data = pd.read_csv(filename,sep="\t")
        elif re.search('.json', filename, re.IGNORECASE):
            response = urlopen(filename)
            json_data = response.read().decode('utf-8', 'replace')            
            data = pd.read_json(json_data) #son_normalize(json.loads(json_data))
            #OR data = pd.read_csv(filename)
        return data
        
    def emptyGraph(self):
        self.g = Graph()
        return self.g
    
    def load_metadata_schema(self, schemaURL, schemablock=False):
        keynameID = 1        
        if not schemablock:
            schemablock = 'default'
        schema = requests.get(schemaURL).text.split('\n')
        #schema = pd.read_csv(StringIO(rawschema))
        if schemablock == 'citation':
            schema = schema[2:]
        else:
            schema = schema[2:]
        dataschema = []
    
        for i in range(0, len(schema)):
            item = schema[i]
            elements = item.split('\t')
            #print(elements)
            try:            
                if elements[keynameID] not in self.forbidden:
                    if i == len(schema):
                        dataschema.append(item)
                    else:
                        dataschema.append(item + "\n")
            except:
                skip = elements
                
        #print(dataschema)
        schemaIO = StringIO(''.join(dataschema))        
        data = pd.read_csv(schemaIO, sep="\t", error_bad_lines=False)
        print(data.columns)
        #data = data.drop(data[data['fieldType'] == np.nan])
        if 'fieldType' in data.columns:
            data = data[data['fieldType'].notna()]
        if ' fieldType' in data.columns:
            data = data[data[' fieldType'].notna()]
            
        self.datadict[schemablock] = data

        # Mappings for termURIs
        if 'termURI' in data.columns:
            for i in data[['name','termURI']].index:
                if data.loc[i]['termURI'] is not np.nan:                    
                    self.termURIs[data.loc[i]['name']] = data.loc[i]['termURI']   
        self.metadataframe = data
                    
        return self.datadict

    def RemoveRef(self, valueURL):
        valueURL = valueURL.replace(self.RootRef, '')
        valueURL = valueURL.replace('<', '')
        valueURL = valueURL.replace('>', '')
        return valueURL
    
    def SetTermURI(self, value):
        if value in self.termURIs:
            return self.termURIs[value]
        else:
            return self.SetRef(value)
        
    def SetRef(self, value):
        # Set references with loaded semantic mappings
        value = value.replace('#','')
        value = value.replace(' ','')
        if value in self.mappings:
            RefURL = self.mappings[value]
        else:
            RefURL = "%s%s" % (self.RootRef, value)
            
        return RefURL 
    
    def to_graph(self, schemaname=False, filename = False, DEBUG=False):
        self.RootRef = "%s/%s/" % (self.thisRef, schemaname)
        
        if schemaname not in self.datadict:
            return

        self.g = self.emptyGraph()
        ns1 = Namespace(self.RootRef)
        self.g.bind(schemaname, ns1)
        skos = Namespace('http://www.w3.org/2004/02/skos/core#')
        self.g.bind('skos', skos)
        
        self.datadict[schemaname].fillna('', inplace=True)
        tmpnames = self.datadict[schemaname].columns
        names = []
        for name in tmpnames:
            newname = "schema_%s" % name
            names.append(newname)
        staRoot = URIRef(self.RootRef)
        
        for row in range(0, self.datadict[schemaname]['name'].size):              
        #for row in range(0, 50):
            staID = BNode()
            nodename = self.SetRef(self.datadict[schemaname].loc[row]['name'])
            parentname = self.SetRef(self.datadict[schemaname].loc[row]['parent'])

            if DEBUG:
                print(nodename)
            if parentname != self.RootRef: #like 'https://dataverse.org/schema/citation/':
                staParent = self.locator[parentname]
                self.g.add((staParent, URIRef(nodename), staID))
                self.g.add((staParent, skos['broader'], URIRef(nodename)))
                self.g.add((URIRef(parentname), skos['narrower'], URIRef(nodename)))
                self.g.add((URIRef(nodename), skos['broader'], URIRef(parentname)))
                self.g.add((URIRef(nodename), skos['altLabel'], Literal(self.datadict[schemaname].loc[row]['name'])))
                self.g.add((URIRef(nodename), skos['prefLabel'], Literal(self.datadict[schemaname].loc[row]['title'])))
                if self.datadict[schemaname].loc[row]['termURI']:
                    self.g.add((URIRef(nodename), skos['exactMatch'], Literal(self.datadict[schemaname].loc[row]['termURI'])))

                self.locator[nodename] = staID      
            else:
                self.g.add((staRoot, URIRef(nodename), staID))
                if self.datadict[schemaname].loc[row]['termURI']:
                    self.g.add((URIRef(nodename), skos['exactMatch'], Literal(self.datadict[schemaname].loc[row]['termURI'])))
                
                #self.g.add((staRoot, str(self.datadict[schemaname].loc[row]['name']), staID)) #vty
                #self.g.add((URIRef(nodename), skos['narrower'], URIRef(parentname)))
                self.locator[nodename] = staID
            
            statement = staID
            for i in range(0, self.datadict[schemaname].loc[row].size-1):                                
                item = self.datadict[schemaname].loc[row].values[i]
                if item:
                    if self.defaultlanguage:
                        self.g.add((statement, URIRef(self.SetRef(names[i])), Literal(item, lang=self.defaultlanguage)))
                    else:
                        self.g.add((statement, URIRef(self.SetRef(names[i])), Literal(item)))
                #self.g.add((statement, URIRef(self.SetRef(names[i])), Literal("%s NL" % item, lang='nl')))
        
        # Save to files
        if filename:
            self.g.serialize(format='n3', destination="/tmp/%s.nt" % schemaname)
            #self.g.serialize(format='json-ld', auto_compact=True, use_rdf_type=True, destination="/tmp/%s.json-ld" % schemaname)
        return self.g            

    def isNode(self, pNode): 
        if pNode:
            checkNode = str(pNode)[:3]                
            if checkNode == '_:N':  
                return pNode
            else:
                return False
        return False
                        
    def CompoundElements(self, jsongraph, DEBUG=None):
        for compoundkey in jsongraph:
            #isEdge = False
            rootNodeID = None
            for key in compoundkey:  
                if key == '@id':
                    if DEBUG:
                        print("KEY %s / %s" % (self.isNode(compoundkey[key]), compoundkey[key]))
                    rootNodeID = compoundkey[key]
                for i in range(0, len(compoundkey[key])):
                    if '@id' in compoundkey[key][i]:
                        nodeID = compoundkey[key][i]['@id']
                        if self.isNode(nodeID):                            
                            self.CompoundNodes.append(nodeID) 
                            self.CompoundValues[nodeID] = compoundkey[key][i]
                            cv = nodeID
                            if DEBUG:
                                print("\t%s\n" % compoundkey[key][i]['@id']) 
            if self.isNode(rootNodeID):
                self.CompoundValues[rootNodeID] = compoundkey
            else:
                self.Vertices[rootNodeID] = compoundkey
                #print("%s => %s\n" % (self.isNode(rootNodeID), compoundkey))
        randomNode = None
        for rootNodeID in self.Vertices:
            #print("%s => %s\n" % (self.isNode(rootNodeID), compoundkey))
            compoundkey = self.Vertices[rootNodeID]            
            for key in compoundkey:         
                newfields = {}
                if '@id' in compoundkey[key][0]:
                    nodeID = compoundkey[key][0]['@id']
                    if DEBUG:
                        print("%s %s" % (key, nodeID))
                    extra = []
                    if nodeID in self.CompoundValues:
                        #self.serializeJSON[key] = self.CompoundValues[nodeID]
                        extra.append(self.CompoundValues[nodeID])
                        randomNode = nodeID
                        #extra['nodeID'] = nodeID
                        x = False
                    self.serializeJSON[key] = extra
                else:                    
                    self.serializeJSON[key] = compoundkey[key]
        #print(self.CompoundValues[cv])
        return randomNode
        return self.serializeJSON

    def Info(self, fieldname=None, NESTED=None, DEBUG=None):
        triples = []
        rootname = None
        for s,p,o in schema.g.triples((URIRef(self.SetRef(fieldname)),None, None)):    
            rootname = s
            triple = [s, p, o]
            triples.append(triple)
        return triples
    
    def Relations(self, fieldname=None, NESTED=None, relation=None, DEBUG=None):
        roots = {}
        triples = []
        if 'http' in fieldname:
            searchfield = URIRef(fieldname)
        else:
            searchfield = URIRef(self.SetRef(fieldname))

        for s,p,o in schema.g.triples((searchfield,None, None)):    
            if DEBUG:
                print("[DEBUG] %s %s %s\n" % (s,p,o))
            for t in [s,p,o]:
                print(Literal(t))
                if relation in str(Literal(t)):
                    triples.append([str(s), str(p), str(o)])
        return triples
    
    def Lookup(self, fieldname=None, NESTED=None, DEBUG=None):
        lookup = {}
        for s,p,o in schema.g.triples((None, URIRef(self.SetRef(fieldname)),None)):    
            for s1,p1,o1 in schema.g.triples((o, None, None)):
            #if re.search('http', o1):
                if NESTED:
                    if not re.search('schema_|skos', p1.n3()):
                        info = {}
                        info['loc'] = o1
                        info['nested'] = 'True'
                        info['labels'] = self.Lookup(self.RemoveRef(p1.n3()))
                        info['short'] = self.RemoveRef(p1.n3())
                        lookup[p1.n3()] = info
                else:
                    info = {}
                    if DEBUG:
                        print("%s %s %s" % (s1,p1,o1))        
                    info['uri'] = o1
                    info['loc'] = p1
                    lookup[str(p1)] = info
        return lookup
    
    def Overview(self, subfield=None, condition=None, DEBUG=None):
        overview = {}
        lookup_term = None
        if subfield:
            lookup_term = URIRef(self.SetRef(subfield))
        print(lookup_term)
        for s,p,o in schema.g.triples((None, lookup_term, Literal(condition))):
            for s1,p1,o1 in schema.g.triples((s, None, None)):        
                if re.search('name', p1):
                    info = {}
                    if DEBUG:
                        print("S %s %s" % (s1, o1))    
                    info['uri'] = self.SetRef(o1)
                    info['loc'] = s1
                    overview[str(o1)] = info
        return overview
    
    def Hierarchy(self, fieldname):
        #rootfield = schema.Info(fieldname, NESTED=True)  
        hierarchy = {}
        internalfields = []
        root = self.Relations(fieldname, NESTED=True, relation='#broader')        
        if root:    
            # field has top relations
            nested = schema.Relations(root[0][2], NESTED=True, relation='#narrow') 
            hierarchy['root'] = root[0][2]
            for n in nested:
                nestedkey = "%sValue" % (field)
                internalfields.append(n[2])   
            hierarchy['fields'] = internalfields
        if not internalfields:
            # fields with internal relations            
            nested = schema.Relations(fieldname, NESTED=True, relation='#narrow') 
            for n in nested:
                hierarchy['root'] = n[0]
                nestedkey = "%sValue" % (field)
                internalfields.append(n[2])     
            hierarchy['fields'] = internalfields
        if not internalfields:
            root = self.Relations(fieldname, NESTED=True, relation='#altLabel')
        return hierarchy

In [199]:
schema = Schema()
csvfile = 'https://raw.githubusercontent.com/Dans-labs/common-migrations/master/core/resources/examples/example.csv'
dataset = schema.loadfile(csvfile)
dataset

Unnamed: 0,title,identifier,sid,state,permission,organization,lat,lng,firstYear,lastYear,taxon,objectType,elementType,investigator,language,category,UserLastname,userInitials,userEmail,userId
0,"CH, Co. Amsterdam",Klc House,dccd:4742,PUBLISHED,values,Quebec Univ,12.456,-9.098,8768,9887,Curques,House,,Lian McDonald,en,built in plug,Blue,K. M.,k.blue@example.uk,King


In [200]:
jsonfile = 'https://raw.githubusercontent.com/vega/vega/master/docs/data/population.json'
dataset2 = schema.loadfile(jsonfile)

In [201]:
tsvfile = 'https://raw.githubusercontent.com/datasciencelabs/2019/master/shiny/population.tsv'
dataset3 = schema.loadfile(jsonfile)

In [202]:
schemaURL = 'https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/citation.tsv'
schemapd = schema.load_metadata_schema(schemaURL, 'citation')
schema.metadataframe

Index(['#datasetField', 'name', 'title', 'description', 'watermark',
       ' fieldType', 'displayOrder', 'displayFormat', 'advancedSearchField',
       'allowControlledVocabulary', 'allowmultiples', 'facetable',
       'displayoncreate', 'required', 'parent', 'metadatablock_id', 'termURI'],
      dtype='object')


b'Skipping line 72: expected 17 fields, saw 18\nSkipping line 85: expected 17 fields, saw 22\n'


Unnamed: 0,#datasetField,name,title,description,watermark,fieldType,displayOrder,displayFormat,advancedSearchField,allowControlledVocabulary,allowmultiples,facetable,displayoncreate,required,parent,metadatablock_id,termURI
0,,title,Title,Full title by which the Dataset is known.,Enter title...,text,0.0,,True,False,False,False,True,True,,citation,http://purl.org/dc/terms/title
1,,subtitle,Subtitle,A secondary title used to amplify or state cer...,,text,1.0,,False,False,False,False,False,False,,citation,
2,,alternativeTitle,Alternative Title,A title by which the work is commonly referred...,,text,2.0,,False,False,False,False,False,False,,citation,http://purl.org/dc/terms/alternative
3,,alternativeURL,Alternative URL,"A URL where the dataset can be viewed, such as...","Enter full URL, starting with http://",url,3.0,"<a href=""#VALUE"" target=""_blank"">#VALUE</a>",False,False,False,False,False,False,,citation,https://schema.org/distribution
4,,otherId,Other ID,Another unique identifier that identifies this...,,none,4.0,:,False,False,True,False,False,False,,citation,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,,relatedDatasets,Related Datasets,"Any Datasets that are related to this Dataset,...",,textbox,72.0,,False,False,True,False,False,False,,citation,http://purl.org/dc/terms/relation
69,,otherReferences,Other References,Any references that would serve as background ...,,text,73.0,,False,False,True,False,False,False,,citation,http://purl.org/dc/terms/references
70,,originOfSources,Origin of Sources,"For historical materials, information about th...",,textbox,75.0,,False,False,False,False,False,False,,citation,
71,,characteristicOfSources,Characteristic of Sources Noted,Assessment of characteristics and source mater...,,textbox,76.0,,False,False,False,False,False,False,,citation,


In [203]:
schemaURL = 'https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/biomedical.tsv'
schemapd = schema.load_metadata_schema(schemaURL, 'biomedical')
#schemapd['citation'].head()
termURIs = {}
if 'termURI' in schemapd['citation'].columns:
    for i in schemapd['citation'][['name','termURI']].index:
        if schemapd['citation'].loc[i]['termURI'] is not np.nan:
            #print(schemapd['citation'].loc[i]['termURI'])      
            if schemapd['citation'].loc[i]['termURI']:
                termURIs[schemapd['citation'].loc[i]['name']] = schemapd['citation'].loc[i]['termURI']
                termURIs[schema.SetRef(schemapd['citation'].loc[i]['name'])] = schemapd['citation'].loc[i]['termURI']
termURIs

Index(['#datasetField', 'name', 'title', 'description', 'watermark',
       'fieldType', 'displayOrder', 'displayFormat', 'advancedSearchField',
       'allowControlledVocabulary', 'allowmultiples', 'facetable',
       'displayoncreate', 'required', 'parent', 'metadatablock_id'],
      dtype='object')


{'title': 'http://purl.org/dc/terms/title',
 'alternativeTitle': 'http://purl.org/dc/terms/alternative',
 'alternativeURL': 'https://schema.org/distribution',
 'author': 'http://purl.org/dc/terms/creator',
 'authorIdentifier': 'http://purl.org/spar/datacite/AgentIdentifier',
 'subject': 'http://purl.org/dc/terms/subject',
 'publication': 'http://purl.org/dc/terms/isReferencedBy',
 'publicationCitation': 'http://purl.org/dc/terms/bibliographicCitation',
 'publicationIDNumber': 'http://purl.org/spar/datacite/ResourceIdentifier',
 'publicationURL': 'https://schema.org/distribution',
 'contributor': 'http://purl.org/dc/terms/contributor',
 'grantNumber': 'https://schema.org/sponsor',
 'dateOfDeposit': 'http://purl.org/dc/terms/dateSubmitted',
 'timePeriodCovered': 'https://schema.org/temporalCoverage',
 'kindOfData': 'http://rdf-vocabulary.ddialliance.org/discovery#kindOfData',
 'software': 'https://www.w3.org/TR/prov-o/#wasGeneratedBy',
 'relatedDatasets': 'http://purl.org/dc/terms/relati

In [204]:
schema.to_graph('citation', filename='citation')

<Graph identifier=N969e41c5cd7741eea0479acbe042a535 (<class 'rdflib.graph.Graph'>)>

In [205]:
OTHER = False
if OTHER:
    schema.emptyGraph()
    schema.to_graph('biomedical', filename='biomedical')

In [206]:
if OTHER:
    socialURL = "https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/social_science.tsv"
    schemapd = schema.load_metadata_schema(socialURL, 'socialsciences')
    schema.emptyGraph() 
    schema.to_graph('socialsciences', filename='socialsciences')

In [207]:
#jsonld = schema.CompoundElements(json.loads(schema.g.serialize(format='json-ld')))
#print(json.dumps(jsonld, indent=2))
#with open('/tmp/data.json', 'w', encoding='utf-8') as f:
    #json.dump(jsonld, f, ensure_ascii=False, indent=4)

for subj, pred, obj in schema.g:
    localstatements = [ subj, pred, obj ] 
    for item in localstatements:
        if re.search('Title', str(item)):
            print(localstatements)
#jsonld = jsonld[2:]
#print(jsonld)
#for subj in schema.g.subjects(URIRef('https://dataverse.org/schema/citation/schema_required'), Literal('True')):    
#    print(schema.g.value(subject=subj, object=Literal('True')))
for s,p,o in schema.g.triples((None, URIRef('https://dataverse.org/schema/citation/schema_required'), Literal('True'))):
    for s1,p1,o1 in schema.g.triples((s, None, None)):
        #if re.search('http', o1):
        if re.search('name', p1):
            print("S %s %s" % (p1, o1))
    #for s1,p1,o1 in schema.g.triples((None, None, s)):
    #    if re.search('http', o1):
    #        print(o1)

[rdflib.term.BNode('N52d41060e1d84712b3c12b0f1457648d'), rdflib.term.URIRef('https://dataverse.org/schema/citation/schema_name'), rdflib.term.Literal('alternativeTitle')]
[rdflib.term.URIRef('https://dataverse.org/schema/citation/'), rdflib.term.URIRef('https://dataverse.org/schema/citation/alternativeTitle'), rdflib.term.BNode('N52d41060e1d84712b3c12b0f1457648d')]
[rdflib.term.BNode('N52d41060e1d84712b3c12b0f1457648d'), rdflib.term.URIRef('https://dataverse.org/schema/citation/schema_title'), rdflib.term.Literal('Alternative Title')]
[rdflib.term.URIRef('https://dataverse.org/schema/citation/alternativeTitle'), rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#exactMatch'), rdflib.term.Literal('http://purl.org/dc/terms/alternative')]
[rdflib.term.BNode('Nc49394c469324142b55e33048fe118ab'), rdflib.term.URIRef('https://dataverse.org/schema/citation/schema_title'), rdflib.term.Literal('Title')]
S https://dataverse.org/schema/citation/schema_name title
S https://dataverse.org/schema

In [208]:
fields = schema.Overview('schema_required', 'True')
fields

https://dataverse.org/schema/citation/schema_required


{'title': {'uri': 'https://dataverse.org/schema/citation/title',
  'loc': rdflib.term.BNode('Nc49394c469324142b55e33048fe118ab')},
 'author': {'uri': 'https://dataverse.org/schema/citation/author',
  'loc': rdflib.term.BNode('Nbb9eca11bfe54758a82014968080c58f')},
 'authorName': {'uri': 'https://dataverse.org/schema/citation/authorName',
  'loc': rdflib.term.BNode('Ne1fd820a9ea4453fa337b200d65c0360')},
 'datasetContact': {'uri': 'https://dataverse.org/schema/citation/datasetContact',
  'loc': rdflib.term.BNode('N25bbc07fc49b425aadf0023709d4a3b9')},
 'datasetContactEmail': {'uri': 'https://dataverse.org/schema/citation/datasetContactEmail',
  'loc': rdflib.term.BNode('N9d5e8d4813f848f08563617c16ab6006')},
 'dsDescription': {'uri': 'https://dataverse.org/schema/citation/dsDescription',
  'loc': rdflib.term.BNode('N881cb09ca94745aba1a7918b0fe04a59')},
 'dsDescriptionValue': {'uri': 'https://dataverse.org/schema/citation/dsDescriptionValue',
  'loc': rdflib.term.BNode('N469aca309c324ab09fa9

In [209]:
for field in schema.Overview('', 'True'):
    nested = schema.Lookup(field, NESTED=True)
    if nested:
        nested_fields = schema.Lookup(field, NESTED=True)    
        #print("\t%s %s\n" % (nested, 'nested'))
    print(field)
#schema.Lookup(fieldname='keyword', NESTED=True)        
#schema.Lookup(fieldname='title')

None
title
otherId
author
authorName
authorAffiliation
authorIdentifier
datasetContact
datasetContactName
datasetContactAffiliation
datasetContactEmail
dsDescription
dsDescriptionValue
dsDescriptionDate
subject
keyword
keywordValue
keywordVocabulary
keywordVocabularyURI
topicClassification
topicClassValue
publication
publicationCitation
publicationIDNumber
notesText
producer
producerName
productionDate
contributor
contributorName
grantNumber
grantNumberAgency
grantNumberValue
distributor
distributorName
distributionDate
dateOfDeposit
timePeriodCovered
timePeriodCoveredStart
timePeriodCoveredEnd
dateOfCollection
kindOfData
seriesName
software
softwareName
relatedMaterial
relatedDatasets
otherReferences


In [210]:
schema.Lookup('keyword', NESTED=True)
schema.Lookup('dsDescriptionValue', NESTED=True)

{}

In [211]:
dataset_fields = dataset.columns
mappings = { 'investigator': 'authorName', 'organization': 'affiliation', 'taxon': 'dsDescription', "category": "keyword" }
newcolumns = []
for fieldname in dataset_fields:
    if fieldname in mappings:
        newcolumns.append(mappings[fieldname])
    else:
        newcolumns.append(fieldname)

datasetg = Schema()        
root = schema.SetRef('citation') 
staID = BNode()
#datasetg.g.add((URIRef(schema.RootRef), URIRef(root), staID)) 
#root = schema.thisRef

print(root)
dataset.columns = newcolumns        
metadata = {}
for i in dataset.index:
    data = dataset.loc[i]
    for col in range(0, len(data)):
        field = dataset.columns[col]
        print("Metadata [%s] %s %s => %s"% (col, dataset.columns[col], data[col], field))        
        
        if schema.Lookup(fieldname=field):
            print(field)               
            fieldURI = schema.SetTermURI(field)
            staID = BNode()
            datasetg.g.add((URIRef(root), URIRef(fieldURI), staID))
            if field in termURIs:
                metadata[termURIs[field]] = data[col]
            else:
                metadata[str(schema.SetRef(field))] = data[col]
            
            #datasetg.g.add((URIRef(root), URIRef(fieldURI)))             
            #datasetg.g.add((URIRef(root), URIRef(fieldURI), Literal(data[col])))            
            nested = schema.Lookup(field, NESTED=True)
            nested_fields = None

            if nested:
                nestedkey = "%sValue" % (field)
                datasetg.g.add((URIRef(root), URIRef(schema.SetRef(field)), Literal(data[col])))
                staID = URIRef(schema.thisRef)
                #datasetg.g.add((URIRef(root), URIRef(fieldURI)))              
                nested_fields = schema.Lookup(field, NESTED=True)  
                nested_metadata = {}
                for nf in nested_fields:
                    print("[D] %s %s (%s)\n" % (schema.SetRef(field), nf, schema.SetRef(nestedkey)))                    
                    #nested_metadata[nf] = 'A'
                    subfield = nf
                    subfield = subfield.replace('<','')
                    subfield = subfield.replace('>','')
                    if subfield == schema.SetRef(nestedkey):
                        nested_metadata[subfield] = data[col]
                    else:
                        nested_metadata[subfield] = "" #data[col]
                    datasetg.g.add((URIRef(schema.SetRef(field)), URIRef(subfield), Literal(data[col])))
                if nested_metadata:
                    if field in termURIs:
                        metadata[termURIs[field]] = nested_metadata
                    else:
                        metadata[str(schema.SetRef(field))] = nested_metadata
            else:
                skip = 2
                datasetg.g.add((URIRef(root), URIRef(fieldURI), Literal(data[col])))
 
schemaname = 'dataset'

datasetg.g.serialize(format='n3', destination="/tmp/%s.nt" % schemaname)                    
#datasetg.g.serialize(format='json-ld', auto_compact=True, use_rdf_type=True, destination="/tmp/%s.jsonld" % schemaname)                    

https://dataverse.org/schema/citation/citation
Metadata [0] title CH, Co. Amsterdam => title
title
Metadata [1] identifier Klc House => identifier
Metadata [2] sid dccd:4742 => sid
Metadata [3] state PUBLISHED => state
Metadata [4] permission values => permission
Metadata [5] affiliation Quebec Univ => affiliation
Metadata [6] lat 12.456 => lat
Metadata [7] lng -9.097999999999999 => lng
Metadata [8] firstYear 8768 => firstYear
Metadata [9] lastYear 9887 => lastYear
Metadata [10] dsDescription Curques => dsDescription
dsDescription
[D] https://dataverse.org/schema/citation/dsDescription <https://dataverse.org/schema/citation/dsDescriptionValue> (https://dataverse.org/schema/citation/dsDescriptionValue)

[D] https://dataverse.org/schema/citation/dsDescription <https://dataverse.org/schema/citation/dsDescriptionDate> (https://dataverse.org/schema/citation/dsDescriptionValue)

Metadata [11] objectType House => objectType
Metadata [12] elementType nan => elementType
Metadata [13] authorName

<Graph identifier=N45ed8c3c034940e88b4c6e4939aed674 (<class 'rdflib.graph.Graph'>)>

In [212]:
fieldname = 'datasetContact'
default = {}
default['https://dataverse.org/schema/citation/datasetContactName'] = 'Slava'
default['https://dataverse.org/schema/citation/datasetContactAffiliation'] = 'DANS'
default['https://dataverse.org/schema/citation/datasetContactEmail'] = 'slava@dans.knaw.nl'
cfields = schema.Hierarchy(fieldname)
if cfields:
    intmetadata = {}
    for field in cfields['fields']:    
        intmetadata[field] = default[field]
    metadata[cfields['root']] = intmetadata
#metadata[schema.SetRef('datasetContactEmail')] = 'me@email.me'
print(json.dumps(metadata, indent=4))
#fields

https://dataverse.org/schema/citation/datasetContact
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/datasetContactName
https://dataverse.org/schema/citation/datasetContact
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/datasetContactAffiliation
https://dataverse.org/schema/citation/datasetContact
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/datasetContactEmail
https://dataverse.org/schema/citation/datasetContact
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/datasetContactName
https://dataverse.org/schema/citation/datasetContact
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/datasetContactAffiliation
https://dataverse.org/schema/citation/datasetContact
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/datasetContactEmail
{
    "http://purl.org/dc/terms/title": "CH, Co. Amste

In [213]:
fieldname = 'author'
default = {}
default['https://dataverse.org/schema/citation/authorName'] = 'Slava'
default['https://dataverse.org/schema/citation/authorAffiliation'] = 'DANS'
default['https://dataverse.org/schema/citation/datasetContactEmail'] = 'slava@dans.knaw.nl'
cfields = schema.Hierarchy(fieldname)
print("Root %s" % cfields['root'])
if cfields:
    intmetadata = {}
    for field in cfields['fields']: 
        if field in default:
            print("Field %s" % field)
            intmetadata[field] = default[field]
    metadata[cfields['root']] = intmetadata
print(json.dumps(metadata, indent=4))

https://dataverse.org/schema/citation/author
http://www.w3.org/2004/02/skos/core#exactMatch
http://purl.org/dc/terms/creator
https://dataverse.org/schema/citation/author
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/authorName
https://dataverse.org/schema/citation/author
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/authorAffiliation
https://dataverse.org/schema/citation/author
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/authorIdentifier
https://dataverse.org/schema/citation/author
http://www.w3.org/2004/02/skos/core#exactMatch
http://purl.org/dc/terms/creator
https://dataverse.org/schema/citation/author
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/authorName
https://dataverse.org/schema/citation/author
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/authorAffiliation
https://dataverse.org/schema/citati

In [214]:
fieldname = 'subject'
cfields = schema.Hierarchy(fieldname)
print(cfields)
#if not 'fields' in cfields:
metadata["https://dataverse.org/schema/citation/%s" % fieldname] = "Arts and Humanities"
print(json.dumps(metadata, indent=4))

https://dataverse.org/schema/citation/subject
http://www.w3.org/2004/02/skos/core#exactMatch
http://purl.org/dc/terms/subject
https://dataverse.org/schema/citation/subject
http://www.w3.org/2004/02/skos/core#exactMatch
http://purl.org/dc/terms/subject
https://dataverse.org/schema/citation/subject
http://www.w3.org/2004/02/skos/core#exactMatch
http://purl.org/dc/terms/subject
{'fields': []}
{
    "http://purl.org/dc/terms/title": "CH, Co. Amsterdam",
    "https://dataverse.org/schema/citation/dsDescription": {
        "https://dataverse.org/schema/citation/dsDescriptionValue": "Curques",
        "https://dataverse.org/schema/citation/dsDescriptionDate": ""
    },
    "https://dataverse.org/schema/citation/authorName": "Lian McDonald",
    "https://dataverse.org/schema/citation/keyword": {
        "https://dataverse.org/schema/citation/keywordValue": "built in plug",
        "https://dataverse.org/schema/citation/keywordVocabulary": "",
        "https://dataverse.org/schema/citation/keyw

In [216]:
fieldname='datasetContactName'
fieldname='https://dataverse.org/schema/citation/datasetContactEmail'
fieldname= 'grantNumber'
if fieldname:
    rootfield = schema.Info(fieldname, NESTED=True)    
    root = schema.Relations(fieldname, NESTED=True, relation='#broader')        
    print("Root %s" % root)
    #rootfield = 'datasetContact'
    if root:    
        nested = schema.Relations(root[0][2], NESTED=True, relation='#narrow') 
        for n in nested:
            nestedkey = "%sValue" % (field)
            for i in n:
                print(i)

https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#exactMatch
https://schema.org/sponsor
https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/grantNumberAgency
https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/grantNumberValue
Root []


In [217]:
fieldname='datasetContactName'
fieldname= 'grantNumber'
fields = schema.Hierarchy(fieldname)
fields

https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#exactMatch
https://schema.org/sponsor
https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/grantNumberAgency
https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/grantNumberValue
https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#exactMatch
https://schema.org/sponsor
https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/grantNumberAgency
https://dataverse.org/schema/citation/grantNumber
http://www.w3.org/2004/02/skos/core#narrower
https://dataverse.org/schema/citation/grantNumberValue


{'root': 'https://dataverse.org/schema/citation/grantNumber',
 'fields': ['https://dataverse.org/schema/citation/grantNumberAgency',
  'https://dataverse.org/schema/citation/grantNumberValue']}

In [218]:
fieldname = 'authorIdentifier'
#fieldname = 'title'
alts = schema.Relations(fieldname, NESTED=True, relation='#exactMatch') 
alts



https://dataverse.org/schema/citation/authorIdentifier
http://www.w3.org/2004/02/skos/core#broader
https://dataverse.org/schema/citation/author
https://dataverse.org/schema/citation/authorIdentifier
http://www.w3.org/2004/02/skos/core#altLabel
authorIdentifier
https://dataverse.org/schema/citation/authorIdentifier
http://www.w3.org/2004/02/skos/core#prefLabel
Identifier
https://dataverse.org/schema/citation/authorIdentifier
http://www.w3.org/2004/02/skos/core#exactMatch
http://purl.org/spar/datacite/AgentIdentifier


[['https://dataverse.org/schema/citation/authorIdentifier',
  'http://www.w3.org/2004/02/skos/core#exactMatch',
  'http://purl.org/spar/datacite/AgentIdentifier']]