Skip to content

Commit

Permalink
Merge pull request #60 from ChalkLab/50-add-rruff-reader
Browse files Browse the repository at this point in the history
Adds RRUFF reader to scidatalib.io.rruff
  • Loading branch information
JohnsonDylan committed May 3, 2021
2 parents 7bf9f50 + 454388a commit 46867d1
Show file tree
Hide file tree
Showing 5 changed files with 2,762 additions and 9 deletions.
27 changes: 19 additions & 8 deletions scidatalib/io/jcamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,23 @@ def _reader(filehandle: str) -> dict:
return jcamp_dict


def _read_get_description(jcamp_dict: dict, keywords: List[str]) -> str:
    """
    Build the SciData description string from selected dictionary entries

    Each keyword that is present in the dictionary contributes one
    "KEYWORD: value" entry (keyword upper-cased); keywords missing from
    the dictionary are skipped. Entries keep the order of `keywords`.

    :param jcamp_dict: JCAMP-DX dictionary extracted from read
    :param keywords: Dictionary keys to include in the description
    :return: String for the description for SciData object
    """
    entries = [
        f'{keyword.upper()}: {jcamp_dict.get(keyword)}'
        for keyword in keywords
        if keyword in jcamp_dict
    ]
    return _DESCRIPTION_KEY_SPLIT_CHAR.join(entries)


def _read_get_graph_source_citation_section(jcamp_dict: dict) -> List[str]:
"""
Extract and translate from the JCAMP-DX dictionary the SciData JSON-LD
Expand Down Expand Up @@ -777,7 +794,7 @@ def _read_get_datagroup_subsection(jcamp_dict: dict) -> List[dict]:
"""
# Convert from JCAMP units -> SciData JSON-LD unitref
xunits = jcamp_dict.get("xunits", "")
xunitref = _XUNIT_MAP.get(xunits)
xunitref = _XUNIT_MAP.get(xunits, "")
yunitref = jcamp_dict.get("yunits", "")

# Values for attributes
Expand Down Expand Up @@ -969,20 +986,14 @@ def _read_translate_jcamp_to_scidata(jcamp_dict: dict) -> SciData:
scidata.starttime(f'{jcamp_date} {jcamp_time}')

# Description
description_lines = []
description_keywords = [
"jcamp-dx",
"class",
"cas registry no",
"sample description",
"xydata"
]
for key in description_keywords:
if key in jcamp_dict:
value = jcamp_dict.get(key)
description_lines.append(f'{key.upper()}: {value}')

description = _DESCRIPTION_KEY_SPLIT_CHAR.join(description_lines)
description = _read_get_description(jcamp_dict, description_keywords)
scidata.description(description)

# Authors
Expand Down
206 changes: 206 additions & 0 deletions scidatalib/io/rruff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
import numpy as np
from typing import TextIO

from scidatalib.scidata import SciData
from scidatalib.io import jcamp


# UID passed to the SciData constructor for objects built by this reader
_SCIDATA_UID = "scidata:rruff:jsonld"


def read_rruff(filename: str) -> "SciData":
    """
    Reader for RRUFF database files to SciData object

    RRUFF file format is a modified version of JCAMP, so the jcamp module
    is re-used for the line parsing.

    :param filename: Filename to read from for RRUFF files
    :return: SciData object read in from RRUFF file
    """
    # Only the parsing needs the open file; translate after it is closed
    with open(filename, "r") as fileobj:
        rruff_dict = _reader(fileobj)
    return _read_translate_rruff_to_scidata(rruff_dict)


def _reader(filehandle: TextIO) -> dict:
    """
    Parse an open RRUFF file into a dictionary

    Blank lines and lines starting with "$$" are ignored. Every other
    line is passed to the JCAMP header-line parser, and lines that do not
    start with "##" are also parsed as XY data pairs. The collected data
    is stored as numpy arrays under the 'x' and 'y' keys, multiplied by
    "xfactor" / "yfactor" when those keys are in the parsed dictionary.

    :param filehandle: RRUFF file to read from
    :return: Dictionary parsed from RRUFF file
    """
    parsed = {}
    x_raw = []
    y_raw = []
    for line in filehandle:
        # Nothing to parse on blank lines or "$$" comment lines
        if not line.strip() or line.startswith("$$"):
            continue

        parsed, _, _ = jcamp._read_parse_header_line(line, parsed)

        # "##" lines are header-only; everything else carries data
        if line.startswith("##"):
            continue

        pairs = jcamp._read_parse_dataset_line(line, jcamp._DATA_FORMAT_XYXY)
        x_raw.extend(pairs[::2])
        y_raw.extend(pairs[1::2])

    x_data = np.array(list(map(float, x_raw)))
    y_data = np.array(list(map(float, y_raw)))

    # Scale the data when the corresponding factor was parsed
    if "xfactor" in parsed:
        x_data = x_data * parsed["xfactor"]
    if "yfactor" in parsed:
        y_data = y_data * parsed["yfactor"]

    parsed['x'] = x_data
    parsed['y'] = y_data
    return parsed


def _read_get_aspects_section(rruff_dict: dict) -> dict:
"""
Extract and translate from the RRUFF dictionary the SciData JSON-LD
'aspects' sub-ection of the 'methodology' section
:param rruff_dict: RRUFF dictionary to extract aspects section from
:return: The 'aspects' section of SciData JSON-LD methodology
"""
aspects = []

# Measurement
measurement = {
"@id": "measurement/1/",
"@type": "cao:CAO_000152",
"techniqueType": "obo:CHMO_0000228",
"technique": "obo:CHMO_0000656",
"instrumentType": "raman spectrometer",
"instrument": "Unknown",
}

# Settings for measurement
settings = []
if "laser_wavelength" in rruff_dict:
wavelength = {
"@id": "setting/1",
"@type": "sdo:setting",
"quantity": "wavelength",
"property": "Laser Wavelength",
"value": {
"@id": "setting/1/value/",
"number": rruff_dict.get("laser_wavelength"),
"unitstr": "qudt:NanoM",
}
}
settings.append(wavelength)

measurement.update({"settings": settings})

aspects.append(measurement)
return aspects


def _read_get_facets_section(rruff_dict: dict) -> dict:
"""
Extract and translate from the RRUFF dictionary the SciData JSON-LD
'facets' sub-section of the 'system' section
:param rruff_dict: RRUFF dictionary to extract facets section from
:return: The 'facets' section of SciData JSON-LD from translation
"""

facets = []
material = {
"@id": "material",
"@type": ["sdo:facet", "sdo:material"],
"name": rruff_dict.get("names", ""),
"materialType": rruff_dict.get("ideal chemistry", ""),
}
facets.append(material)
return facets


def _read_translate_rruff_to_scidata(rruff_dict: dict) -> "SciData":
    """
    Main translation of RRUFF to SciData object

    Note that the "rruffid" entry of `rruff_dict` is rewritten in place
    with an "rruff:" prefix before being used as the graph UID.

    :param rruff_dict: RRUFF dictionary extracted from read
    :return: SciData object from translation
    """
    scidata = SciData(_SCIDATA_UID)

    # Add champ namespace for aspect techniqueType
    cao = {"cao": "http://champ-project.org/images/ontology/cao.owl#"}
    scidata.namespaces(cao)

    # Title
    scidata.title(rruff_dict.get("names", ""))
    scidata.publisher(rruff_dict.get("owner", ""))

    # Description
    description_keywords = [
        "description",
        "locality",
        "status",
    ]
    description = jcamp._read_get_description(rruff_dict, description_keywords)
    scidata.description(description)

    # UID
    # NOTE(review): this mutates the caller's dictionary, and a missing
    # "rruffid" key yields the UID "rruff:None" - confirm this is intended
    rruff_dict.update({"rruffid": f'rruff:{rruff_dict.get("rruffid")}'})
    scidata.graph_uid(rruff_dict.get('rruffid'))

    # Authors
    authors = []
    author_keywords = ["source"]
    for author_keyword in author_keywords:
        if author_keyword in rruff_dict:
            authors.append({
                "@id": "author/{}".format(len(authors) + 1),
                "@type": "dc:creator",
                "name": rruff_dict[author_keyword]
            })
    scidata.author(authors)

    # Sources / references
    # The first source is always the fixed journal-article citation
    sources = [{
        "@id": "source/1/",
        "@type": "dc:source",
        "citation": "Highlights in Mineralogical Crystallography 2015 1-30",
        "reftype": "journal article",
        "doi": "10.1515/9783110417104-003",
        "url": "https://doi.org/10.1515/9783110417104-003"
    }]

    # The database entry is added only when the file supplies a URL
    if "url" in rruff_dict:
        sources.append({
            "@id": f"source/{len(sources) + 1}",
            "@type": "dc:source",
            "citation": "RRUFF project database entry",
            "url": f'https://{rruff_dict.get("url")}',
        })
    scidata.sources(sources)

    # Discipline and sub-discipline
    scidata.discipline("w3i:Chemistry")
    scidata.subdiscipline("w3i:AnalyticalChemistry")

    # Methodology - aspects
    scidata.aspects(_read_get_aspects_section(rruff_dict))

    # System - facets
    scidata.facets(_read_get_facets_section(rruff_dict))

    # Dataset
    scidata.scope("material")
    datagroup = jcamp._read_get_datagroup_subsection(rruff_dict)
    scidata.datagroup([datagroup])

    # TODO: add the dataseries
    # Issue: https://github.com/ChalkLab/SciDataLib/issues/43

    return scidata
5 changes: 4 additions & 1 deletion scidatalib/scidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,10 @@ def tocdict(a):
""" get the @type entry from a dictionary """
for k, v in a.items():
if k == '@type':
self.meta['@graph']['toc'].append(v)
if isinstance(v, list):
self.meta['@graph']['toc'].extend(v)
else:
self.meta['@graph']['toc'].append(v)
if isinstance(v, list):
toclist(v)
if isinstance(v, dict):
Expand Down

0 comments on commit 46867d1

Please sign in to comment.