In [565]:
from rdflib import Graph, URIRef, Literal, BNode, plugin, Namespace
from rdflib.serializer import Serializer
from collections import defaultdict, OrderedDict
import pandas as pd
import json
import requests
from io import StringIO

In [566]:
class Schema():
    """Load Dataverse metadata block TSV files and turn them into RDF graphs.

    Each loaded block is kept as a pandas DataFrame in ``self.datadict`` under
    its block name; ``to_graph`` converts one stored block into an rdflib Graph
    held in ``self.g``.
    """

    def __init__(self, debug=False):
        """Create an empty schema registry.

        Args:
            debug: Accepted for backward compatibility; not used by the class.
        """
        # Rows whose second TSV column equals one of these values are dropped
        # while loading (presumably the controlled-vocabulary section of the
        # metadata block files -- TODO confirm against the TSV format).
        self.forbidden = ["subject", "language", "authorIdentifierScheme", "contributorType", "publicationIDType", "DatasetField"]
        # Block name -> DataFrame parsed from the block's TSV.
        self.datadict = {}
        self.g = Graph()
        # Base URI from which every block's root reference is derived.
        self.thisRef = 'https://dataverse.org/schema'
        # Root URI of the block most recently passed to to_graph().
        self.RootRef = ''
        # Optional overrides: cleaned field name -> full URI (see SetRef).
        self.mappings = {}
        # Node URI -> blank node created for it; used to attach children.
        self.locator = {}

    def emptyGraph(self):
        """Replace ``self.g`` with a fresh, empty Graph and return it."""
        self.g = Graph()
        return self.g

    def load_metadata_schema(self, schemaURL, schemablock=False):
        """Download a metadata block TSV and store it as a DataFrame.

        The first two lines of the downloaded text are discarded, so the third
        line becomes the header row. Lines with fewer than two tab-separated
        fields, and lines whose second field is listed in ``self.forbidden``,
        are removed before the text is handed to pandas.

        Args:
            schemaURL: URL of the tab-separated metadata block file.
            schemablock: Key under which the DataFrame is stored; a falsy
                value stores it under ``'default'``.

        Returns:
            ``self.datadict``, i.e. the dict of *all* blocks loaded so far,
            not just the one loaded by this call.
        """
        keynameID = 1  # index of the column holding the field name
        if not schemablock:
            schemablock = 'default'

        # Drop the two leading lines regardless of the block name.
        lines = requests.get(schemaURL).text.split('\n')[2:]

        kept = []
        for line in lines:
            fields = line.split('\t')
            # Lines too short to have a name column (e.g. the empty string
            # after a trailing newline) carry no field definition.
            if len(fields) <= keynameID:
                continue
            # Use the instance's list; a bare ``forbidden`` only resolved when
            # a same-named global happened to exist in the kernel.
            if fields[keynameID] in self.forbidden:
                continue
            kept.append(line)

        schemaIO = StringIO('\n'.join(kept))
        try:
            # pandas >= 1.3: skip malformed rows but still warn about them.
            data = pd.read_csv(schemaIO, sep="\t", on_bad_lines='warn')
        except TypeError:
            # Older pandas does not know ``on_bad_lines``; use the legacy flag.
            schemaIO.seek(0)
            data = pd.read_csv(schemaIO, sep="\t", error_bad_lines=False)
        self.datadict[schemablock] = data
        return self.datadict

    def SetRef(self, value):
        """Return the URI for a field or column name.

        ``#`` and space characters are stripped from ``value``. If the cleaned
        name has an entry in ``self.mappings`` that URI is returned; otherwise
        the name is appended to ``self.RootRef``.
        """
        value = value.replace('#', '').replace(' ', '')
        if value in self.mappings:
            return self.mappings[value]
        return "%s%s" % (self.RootRef, value)

    def to_graph(self, schemaname=False, filename = False, DEBUG=False):
        """Build an RDF graph from a previously loaded metadata block.

        Every row becomes a blank node linked from its parent's blank node (or
        from the block root when the row has no parent) by a predicate derived
        from the row's ``name``. The row's column values are attached to the
        blank node as literals.

        Args:
            schemaname: Key of the block in ``self.datadict``.
            filename: Only its truthiness is used. When truthy the graph is
                written to ``/tmp/<schemaname>.nt`` and
                ``/tmp/<schemaname>.json-ld``.
            DEBUG: When truthy, print each node URI while building.

        Returns:
            The populated Graph (also stored in ``self.g``), or ``None`` when
            ``schemaname`` has not been loaded.

        Raises:
            KeyError: If a row names a parent that has not been registered in
                ``self.locator`` by an earlier row or an earlier call.
        """
        # RootRef is updated even when the block turns out to be unknown.
        self.RootRef = "%s/%s/" % (self.thisRef, schemaname)

        if schemaname not in self.datadict:
            return

        self.g = self.emptyGraph()
        self.g.bind(schemaname, Namespace(self.RootRef))

        frame = self.datadict[schemaname]
        # Empty cells become '' so SetRef() can treat every value as a string;
        # this intentionally updates the stored DataFrame in place.
        frame.fillna('', inplace=True)
        names = frame.columns
        staRoot = URIRef(self.RootRef)

        for row in range(0, frame['name'].size):
            # Look the row up once instead of once per column.
            record = frame.loc[row]
            staID = BNode()
            nodename = self.SetRef(record['name'])
            parentname = self.SetRef(record['parent'])

            if DEBUG:
                print(nodename)

            # An empty parent resolves to RootRef itself, i.e. a top-level field.
            if parentname != self.RootRef:
                staParent = self.locator[parentname]
            else:
                staParent = staRoot
            self.g.add((staParent, URIRef(nodename), staID))
            self.locator[nodename] = staID

            values = record.values
            # NOTE(review): the last column is deliberately left out here, as
            # in the original loop bound -- confirm whether that is intended.
            for i in range(0, record.size - 1):
                self.g.add((staID, URIRef(self.SetRef(names[i])), Literal(values[i])))

        # Save to files
        if filename:
            # NOTE(review): N3 output is written to a ``.nt`` path -- confirm
            # whether N-Triples was intended.
            self.g.serialize(format='n3', destination="/tmp/%s.nt" % schemaname)
            self.g.serialize(format='json-ld', destination="/tmp/%s.json-ld" % schemaname)
        return self.g

In [567]:
# Download the citation and biomedical metadata block TSVs and register both
# on a single Schema instance under their block names.
schemaURL = 'https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/citation.tsv'
schema = Schema()
schemapd = schema.load_metadata_schema(schemaURL, 'citation')
schemaURL = 'https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/biomedical.tsv'
# load_metadata_schema() returns the dict of every block loaded so far, so
# after this call schemapd holds both 'citation' and 'biomedical'.
schemapd = schema.load_metadata_schema(schemaURL, 'biomedical')
# Preview the first rows of the parsed citation block.
schemapd['citation'].head()

b'Skipping line 71: expected 17 fields, saw 18\n'


Unnamed: 0,#datasetField,name,title,description,watermark,fieldType,displayOrder,displayFormat,advancedSearchField,allowControlledVocabulary,allowmultiples,facetable,displayoncreate,required,parent,metadatablock_id,termURI
0,,title,Title,Full title by which the Dataset is known.,Enter title...,text,0,,True,False,False,False,True,True,,citation,http://purl.org/dc/terms/title
1,,subtitle,Subtitle,A secondary title used to amplify or state cer...,,text,1,,False,False,False,False,False,False,,citation,
2,,alternativeTitle,Alternative Title,A title by which the work is commonly referred...,,text,2,,False,False,False,False,False,False,,citation,http://purl.org/dc/terms/alternative
3,,alternativeURL,Alternative URL,"A URL where the dataset can be viewed, such as...","Enter full URL, starting with http://",url,3,"<a href=""#VALUE"" target=""_blank"">#VALUE</a>",False,False,False,False,False,False,,citation,https://schema.org/distribution
4,,otherId,Other ID,Another unique identifier that identifies this...,,none,4,:,False,False,True,False,False,False,,citation,


In [568]:
# Build the RDF graph for the 'citation' block; the returned Graph is the cell output.
schema.to_graph('citation')

<Graph identifier=N49b76bf7ab5844c9a264ba69a0b6f812 (<class 'rdflib.graph.Graph'>)>

In [569]:
# Build the RDF graph for the 'biomedical' block.
# to_graph() already replaces schema.g with a fresh Graph, so the explicit
# emptyGraph() call is redundant and kept only as a visible reset.
schema.emptyGraph()
schema.to_graph('biomedical')

<Graph identifier=N36136cff20f04dfe9c0713e600081f10 (<class 'rdflib.graph.Graph'>)>

In [570]:
# Load the social science metadata block under the key 'socialsciences',
# build its graph and, because a truthy filename is passed, write it to
# /tmp/socialsciences.nt and /tmp/socialsciences.json-ld.
socialURL = "https://raw.githubusercontent.com/IQSS/dataverse/develop/scripts/api/data/metadatablocks/social_science.tsv"
schemapd = schema.load_metadata_schema(socialURL, 'socialsciences')
# Redundant with the reset to_graph() performs itself.
schema.emptyGraph() 
schema.to_graph('socialsciences', filename='socialsciences')

<Graph identifier=Nc28c9c06c680454abd8952364f1e6832 (<class 'rdflib.graph.Graph'>)>