In [29]:
import json, codecs, csv

In [30]:
# Load the exported web-page records from the CSV.
# The `with` block guarantees the file handle is closed once the rows have
# been read, even if parsing raises.
with codecs.open('../CSV/web-pages-export.csv', 'r', encoding='utf-8') as infile:
    webpage_data = csv.reader(infile, delimiter=',')

    # Skip the header row. The `None` default keeps an empty file from
    # raising; in that case `webpages` is simply an empty list.
    next(webpage_data, None)

    # Materialize the remaining rows (each a list of column strings) so the
    # later cells can iterate over them after the file is closed.
    webpages = list(webpage_data)

In [31]:
# Establish the data model: a JSON-LD document with an '@context' that maps
# the short keys used on each record to vocabulary terms, and an (initially
# empty) '@graph' list that the processing cell appends records to.
data = {
    '@context': {
        '@base': 'https://catalog.digitallatin.org',

        # Namespace prefixes.
        # 'dcterms' must point at the DCMI Metadata Terms namespace; it was
        # previously bound to the same URI as 'rdf', so every 'dcterms:*'
        # term below expanded into the RDF syntax namespace.
        'dcterms': 'http://purl.org/dc/terms/',
        'frbr': 'http://vocab.org/frbr/core#',
        'madsrdf': 'http://www.loc.gov/mads/rdf/v1#',
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'schema': 'http://schema.org/',

        # Record keys -> vocabulary terms.
        'Title': 'dcterms:title',
        'Author': 'dcterms:creator',
        'SourceEdition': 'dcterms:source',
        'SourceURI': 'dcterms:URI',
        'Publisher': 'dcterms:publisher',
        'Repository': 'schema:WebSite',
        'AccessDate': 'dcterms:date',
        'Rights': 'dcterms:rights',
        'DLLid': 'dcterms:identifier',

        # 'References' is declared with '@type': '@id'.
        'References': {
            '@id': 'dcterms:references',
            '@type': '@id',
        },
        'DLL Author': 'frbr:Creator',
        'DLL Work': 'frbr:exemplarOf',
    },
    '@graph': [],
}

In [32]:
# Iterate over each item and process it.
def _split_multi(raw):
    """Normalize a CSV cell that may hold several ';'-separated values.

    Returns:
        ''            if the cell is empty,
        list of str   if the cell contains at least one ';' (the pieces of
                      ``raw.split(';')``, returned as-is and NOT re-joined),
        str           the cell unchanged otherwise.
    """
    if not raw:
        return ''
    if ';' in raw:
        # NOTE(review): pieces are not stripped, so "A; B" yields ['A', ' B'].
        # Confirm whether the export ever puts spaces around the ';'.
        return raw.split(';')
    return raw


for row in webpages:
    # Columns are addressed by position in the export.
    # NOTE(review): column 1 was previously read into `dll_id` but never
    # written to the record, even though the context declares a 'DLLid' key.
    # Confirm whether the identifier should be emitted.
    dll_link = row[0]
    title = row[2]

    # Author(s): the Author column may hold several ';'-separated values.
    author = _split_multi(row[3])

    source_edition = row[4]
    source_uri = row[5]
    publisher = row[6]
    repository = row[7]
    access_date = row[8]
    rights = row[9]

    # Reference(s) to DLL Author(s) and DLL Work(s), handled the same way as
    # the Author column.
    dll_author = _split_multi(row[10])
    dll_work = _split_multi(row[11])

    # Make the JSON object.
    page = {
        '@id': dll_link,
        'Title': title,
        'Author': author,
        'SourceEdition': source_edition,
        'SourceURI': source_uri,
        'Publisher': publisher,
        'Repository': repository,
        'AccessDate': access_date,
        'Rights': rights,
        'References': {
            'DLL Author': dll_author,
            'DLL Work': dll_work,
        },
    }

    # Add the item to the graph.
    data['@graph'].append(page)

In [33]:
# Write the results to a file for uploading to the git repo.
# Open in 'w' (truncate) mode rather than 'a': appending would add a second
# JSON document after the existing one each time this cell is re-run, leaving
# a file that no longer parses as JSON.
with codecs.open('../JSON/webpages.json', 'w', encoding='utf-8') as outfile:
    # ensure_ascii=False writes non-ASCII characters as-is (UTF-8 encoded by
    # the file object); sort_keys/indent make the output stable across runs.
    json.dump(data, outfile, sort_keys=True, indent=4, ensure_ascii=False)