# Importing JSON Metadata from an ORACC Project
by Niek Veldhuis

February 2017

In [1]:
import pandas as pd
import json
import urllib.request

# ORACC Metadata
[ORACC](http://oracc.org) data and metadata may be retrieved in JSON format. The various JSON files and their contents are discussed by Steve Tinney in the [Open Data](http://oracc.org/doc/opendata/) page.

The code below will pull the file `manifest.json` from the project of your choice and display the list of JSON files available for that project. The file `catalogue.json` contains the metadata. This file is pulled from the site and formatted into a Pandas DataFrame.

The code below loads the data directly from the site. Alternatively, one may download the file first and read a local version of the file. Note that reading a local file requires the function `json.load()` (for files), whereas loading directly from the site requires `json.loads()` (for strings). The argument `strict=False` takes care of a control character that may be found in [ORACC](http://oracc.org) catalogs.

The library `urllib.request` requires Python 3. The equivalent in Python 2 is `urllib2`.

In [2]:
# Ask for the project abbreviation and lower-case it. Surrounding whitespace
# is stripped so that it cannot leak into the URLs and the output file name
# that are built from this value further down.
project = input("Project abbreviation: ").strip().lower()

Project abbreviation: dcclt


In [3]:
# Fetch the manifest of the chosen project and parse it into a Python object.
url = 'http://oracc.museum.upenn.edu/' + project + '/manifest.json'
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    d = response.read().decode('utf-8')
data = json.loads(d)
# Alternative: read a previously downloaded local copy instead of the site.
#filename = "../data/metadata/metadata.json"
#with open(filename, 'r') as f:
#    data=json.load(f)   #, strict=False)

# List of JSON Documents
Display a list of JSON documents available for this project

In [4]:
# Display the 'files' entry of the manifest loaded above.
data['files']
# Disabled alternative, kept for reference (presumably for inspecting the
# list as a DataFrame):
#df=pd.DataFrame(data['files'])
#list(df.columns.values)

['catalogue.json',
 'corpus.json',
 'gloss-akk-x-earakk.json',
 'gloss-akk-x-mbperi.json',
 'gloss-akk-x-midbab.json',
 'gloss-akk-x-oldbab.json',
 'gloss-akk-x-stdbab.json',
 'gloss-akk.json',
 'gloss-qpc.json',
 'gloss-qpn-x-celest.json',
 'gloss-qpn-x-divine.json',
 'gloss-qpn-x-ethnic.json',
 'gloss-qpn-x-lineage.json',
 'gloss-qpn-x-months.json',
 'gloss-qpn-x-object.json',
 'gloss-qpn-x-people.json',
 'gloss-qpn-x-places.json',
 'gloss-qpn-x-temple.json',
 'gloss-qpn-x-waters.json',
 'gloss-qpn.json',
 'gloss-sux-x-emesal.json',
 'gloss-sux.json',
 'gloss-uga.json',
 'gloss-xhu.json',
 'index-akk-x-earakk.json',
 'index-akk-x-mbperi.json',
 'index-akk-x-midbab.json',
 'index-akk-x-oldbab.json',
 'index-akk-x-stdbab.json',
 'index-akk.json',
 'index-cat.json',
 'index-lem.json',
 'index-qpc.json',
 'index-qpn-x-celest.json',
 'index-qpn-x-divine.json',
 'index-qpn-x-ethnic.json',
 'index-qpn-x-lineage.json',
 'index-qpn-x-months.json',
 'index-qpn-x-object.json',
 'index-qpn-x-peo

# Read Catalogue.json File

In [5]:
# Fetch the catalogue of the chosen project and parse it into a Python object.
url = 'http://oracc.museum.upenn.edu/' + project + '/catalogue.json'
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    d = response.read().decode('utf-8')
# strict=False lets the parser accept control characters inside JSON strings.
data = json.loads(d, strict=False)

# Transform to DataFrame
The JSON file is imported into a Pandas DataFrame. The line

> `list(df.columns.values)`

exposes a list of the available data elements.

In [6]:
# Build the frame from the 'members' entry and transpose it, so that each
# member becomes a row and each data element a column.
df = pd.DataFrame(data['members']).transpose()
# Show the names of the available data elements.
df.columns.tolist()

['accession_no',
 'acquisition_history',
 'archive',
 'ark',
 'atf_source',
 'atf_up',
 'author',
 'author_remarks',
 'cdli_collation',
 'cdli_comments',
 'citation',
 'collection',
 'collection_copyright',
 'condition_description',
 'created_by',
 'created_on',
 'credits',
 'date_of_origin',
 'dates_referenced',
 'designation',
 'electronic_publication',
 'excavation_no',
 'external_id',
 'findspot_remarks',
 'findspot_square',
 'genre',
 'google_earth_collection',
 'google_earth_provenience',
 'height',
 'join_information',
 'keywords',
 'language',
 'last_modified_by',
 'last_modified_on',
 'lineart_up',
 'material',
 'museum_no',
 'notes',
 'object_preservation',
 'object_remarks',
 'object_type',
 'other_names',
 'period',
 'period_remarks',
 'photo_up',
 'place',
 'primary_edition',
 'primary_publication',
 'provenience',
 'provenience_remarks',
 'publication_date',
 'publication_history',
 'published_collation',
 'seal_id',
 'seal_information',
 'series',
 'series_section',
 'st

# Select Relevant Fields
Select the fields you wish to preserve. Separate the fields with commas, as in

`designation, period, provenience` (this is the default).

Not all fields are available for each record. `Pandas` will note a missing value with `NaN` (Not a Number). The function `fillna()` changes this into the string `not entered`.

In [7]:
# Default selection, used when nothing usable is entered.
relevant_fields = ['designation', 'period', 'provenience']
fields = input("Enter the relevant fields: ").lower().replace(' ', '')
# Drop empty entries (e.g. from a trailing or doubled comma): an empty field
# name would raise a KeyError when the columns are selected from the frame.
selected = [field for field in fields.split(',') if field]
if selected:
    relevant_fields = selected

Enter the relevant fields: 


In [8]:
# Keep only the chosen columns and replace missing values with a readable
# placeholder string.
df = df.loc[:, relevant_fields].fillna('not entered')
df

Unnamed: 0,designation,period,provenience
P000001,"W 06435,a",Uruk III,Uruk
P000002,"W 06435,b",Uruk III,Uruk
P000003,"W 09123,d",Uruk IV,Uruk
P000004,"W 09169,d",Uruk IV,Uruk
P000005,"W 09206,k",Uruk IV,Uruk
P000006,"W 09656,h1",Uruk IV,Uruk
P000007,"W 09656,x",Uruk IV,Uruk
P000008,"W 11985,e",Uruk III,Uruk
P000009,"W 11985,f",Uruk III,Uruk
P000010,"W 11985,g",Uruk III,Uruk


# Save as Comma Separated Values file
Or use any other appropriate file format. Note that in theory `encoding = 'utf-8'` should not be necessary in Python 3. However, on Windows the default encoding may still cause problems, and thus the argument is included for safety.

In [9]:
# Hand pandas the path rather than an already-open text handle: the
# `encoding` argument is only honoured when pandas opens the file itself.
# With a handle from open(..., 'w') the platform default encoding is used
# (not UTF-8 on Windows), and Windows output gets doubled line endings.
df.to_csv('../data/metadata/' + project + '_meta.csv', sep=',', encoding='utf-8')