# Create CSDMS Code-Meta Files

The purpose of this notebook is to create code-meta json files for content described in the CSDMS model registry. These code-meta representations will serve as input to the FAIR evaluation framework.

This notebook uses data collected via the following URL structure:

`https://csdms.colorado.edu/csdms_wiki/index.php?title=Special:Browse&offset=0&dir=out&article=Model%3ATOPMODEL&group=hide&format=json`

In [1]:
import json
import pprint
from datetime import date

from pydantic.v1 import HttpUrl
from pydantic2_schemaorg.Person import Person
from pydantic2_schemaorg.DataFeed import DataFeed
from pydantic2_schemaorg.Organization import Organization
from pydantic2_schemaorg.CreativeWork import CreativeWork
from codemeticulous.codemeta.models import CodeMetaV3, VersionedLanguage

In [2]:
# Load the raw CSDMS metadata record for one model.
# Other records referenced previously from raw_model_metadata/:
# TOPMODEL.json, Dorado.json, SVELA.json
fname = 'FineSed3D.json'

with open(f'raw_model_metadata/{fname}', 'r') as f:
    # parse straight from the file handle
    dat = json.load(f)

Squash all properties so it's easier to access the data

All properties within the 'data' list are structured as:

```
{'property': <Property Name>, 'dataitem': [{'type': <Type>, 'item': <Value>}]}

for example:

{'property': 'City', 'dataitem': [{'type': 2, 'item': 'Lancaster'}]}
```

In [3]:
# Flatten the record into {property name: value}. A property with exactly one
# data item maps to that single stripped value; any other count maps to a list.
properties = {}
for entry in dat['data']:
    values = [data_item['item'].strip() for data_item in entry['dataitem']]
    properties[entry['property']] = values[0] if len(values) == 1 else values

Create Pydantic Representation of "Core" metadata. These are metadata that fit into the SchemaOrg CreativeWork class.

In [4]:
def as_list(obj):
    """Return ``obj`` unchanged if it is a list, otherwise wrap it in a new one-element list."""
    return obj if isinstance(obj, list) else [obj]

In [5]:
# Create the list of authors. The primary author is read from the un-prefixed
# properties (e.g. 'First_name') and co-authors from the matching 'Additional_*'
# properties; every property may hold a single value or a list of values.


def _author_values(primary_key, additional_key, total=None):
    """Collect one value per author for a primary/additional property pair.

    Args:
        primary_key: property holding the primary author's value(s).
        additional_key: property holding the co-authors' value(s); it may be
            absent from ``properties``.
        total: expected number of authors. When given and fewer additional
            values than remaining authors are present, the additional values
            are repeated in order so that every co-author gets one (e.g. a
            single shared institute is applied to all co-authors).

    Returns:
        A new list. The lists stored in ``properties`` are never modified, so
        re-running this cell does not make the values accumulate.

    Raises:
        KeyError: if ``primary_key`` is not in ``properties``.
    """
    # copy so that later extension never touches the list held by `properties`
    values = list(as_list(properties[primary_key]))
    if additional_key in properties:
        additional = list(as_list(properties[additional_key]))
        missing = 0 if total is None else total - len(values)
        if additional and len(additional) < missing:
            additional = [additional[i % len(additional)] for i in range(missing)]
        values += additional
    return values


# the total number of authors is determined by their email addresses
emails = _author_values('Email_address', 'Additional_email_address')
total_people = len(emails)

# names are expected to be provided once per author
first_names = _author_values('First_name', 'Additional_first_name')
last_names = _author_values('Last_name', 'Additional_last_name')

# address and institute values may be shared between co-authors
cities = _author_values('City', 'Additional_city', total_people)
countries = _author_values('Country', 'Additional_country', total_people)
institutes = _author_values('Institute', 'Additional_institute', total_people)

# build list of authors; indexing (rather than zip) keeps a missing value for
# any author visible as an IndexError instead of silently dropping that author
authors = []
for i in range(total_people):
    author = Person(
        givenName=first_names[i],
        familyName=last_names[i],
        email=emails[i],
        affiliation=Organization(
            address=f'{cities[i]}, {countries[i]}',
            name=institutes[i]
        )
    )
    authors.append(author)

In [6]:


# dateCreated: only a start year is read from the record, so use January 1st.
date_created = date(year=int(properties['Start_year_development']), month=1, day=1)

# Keywords combine: ModelDomain, Modelautophrases, and Model_keywords.
# Missing properties contribute nothing; an empty result is stored as None.
keywords = [
    keyword
    for key in ('ModelDomain', 'Modelautophrases', 'Model_keywords')
    for keyword in as_list(properties.get(key, []))
]
if not keywords:
    keywords = None

Creating Pydantic representation of Code-Meta fields. These are defined by Leslie Hsu in https://github.com/codemeta/codemeta/blob/d464a2891206a55c1146b4dd6b996b8fa733ceb1/crosswalks/csdms.csv. In the future, this will be replaced with the crosswalk that Irene is developing.

In [7]:
# core fields
# codeRepository and downloadUrl both point at the source web address.
# Both default to None so that they are always defined, even for records
# without a 'Source_web_address' property.
web_address = properties.get('Source_web_address', None)
codeRepository = None
downloadUrl = None
if web_address is not None:
    web_scheme = web_address.split(':')[0]
    codeRepository = HttpUrl(scheme=web_scheme, url=web_address)

    # downloadUrl
    downloadUrl = HttpUrl(scheme=web_scheme, url=web_address)

# programmingLanguage
# Copy the list first: as_list() hands back the very list stored in
# `properties`, and `+=` would otherwise extend it in place every time this
# cell is run.
plang = list(as_list(properties['Programming_language']))

if 'Program_language_other' in properties:
    plang += as_list(properties['Program_language_other'])
programmingLanguage = [VersionedLanguage(name=lang) for lang in plang]

# applicationCategory
applicationCategory = properties.get('Model_type')

# memoryRequirements: drop '-' characters and surrounding whitespace; a
# missing or effectively empty value is stored as None.
raw_memory = properties.get('Memory_requirements')
memoryRequirements = None
if raw_memory is not None:
    memoryRequirements = raw_memory.replace('-', '').strip() or None
    
# dateModified: only an end year is read from the record, so use January 1st
# of that year (the same convention used for dateCreated).
dateModified = None
if 'End_year_model_development' in properties:
    # property values were str.strip()-ed when loaded, so they are strings;
    # date() only accepts integers
    dateModified = date(int(properties['End_year_model_development']), 1, 1)

# operatingSystem
# Copy the list first: as_list() hands back the very list stored in
# `properties`, and `+=` would otherwise extend it in place every time this
# cell is run.
supported_platforms = list(as_list(properties['Supported_platforms']))
if 'Supported_platforms_other' in properties:  # not sure if this is used, but it's in Leslie's crosswalk
    supported_platforms += as_list(properties['Supported_platforms_other'])
operatingSystem = list(supported_platforms)


# url: link to the model's record in the CSDMS wiki.
# The model name is the record subject with its last 7 characters removed.
# NOTE(review): presumably that suffix is a fixed-width wiki marker - confirm.
model_name = dat['subject'][:-7]
record_url = f'https://csdms.colorado.edu/wiki/Model:{model_name}'
record_scheme = record_url.partition(':')[0]
url = HttpUrl(scheme=record_scheme, url=record_url)

# developmentStatus ### NOTE THIS IS DIFFERENT THAN IRENE'S MAPPING ###
developmentStatus = properties.get('DevelopmentCode')

# license
# TODO: The way we extract license needs to be improved.
# A value that looks like a URL is stored as a URL; anything else is treated
# as the license name. `license` defaults to None so that a record without a
# license does not fall through to Python's built-in `license` object.
license_value = properties.get('Program_license_type', None)
license = None
if license_value is not None:
    if license_value.startswith(('https:', 'http:')):
        license = HttpUrl(scheme=license_value.split(':')[0], url=license_value)
    else:
        license = CreativeWork(name=license_value)

# identifier
doi_identifier = properties.get('DOI_model', None)

# softwareVersion
software_version = properties.get('version', None)

# softwareHelp
# NOTE(review): this reads the same 'version' key as software_version above,
# which looks like a copy/paste slip - confirm which property records whether
# a manual is available. `helpAvailable` is not used by the code shown here.
manual_available_string = properties.get('version', 'No')
helpAvailable = manual_available_string != 'No'

# Only build the manual link when the record actually references a manual;
# otherwise softwareHelp stays None instead of failing on a missing property.
softwareHelp = None
model_manual = properties.get('Model_manual', None)
if model_manual is not None:
    # keep everything after the first ':' of the property value
    manual_file = ':'.join(model_manual.split(':')[1:])
    manual_url = f"https://csdms.colorado.edu/csdms_wiki/images/{manual_file}"

    softwareHelp = CreativeWork(name="Software Manual",
                                url=HttpUrl(scheme='https',
                                            url=manual_url.replace(' ', '_')
                                           )
                               )

# supportingData
# The test data property is used because it provides a URL; calibration data
# is a narrative.
testData = properties.get('Model_test_data')
supportingData = None
if testData is not None:
    # keep everything after the first ':' of the property value
    test_data_file = testData.partition(':')[2]
    test_data_url = f"https://csdms.colorado.edu/csdms_wiki/images/{test_data_file}"
    supportingData = DataFeed(url=test_data_url.replace(' ', '_'), name='Test Data')

# processorRequirements
# Cannot satisfy with CSDMS metadata.
# Nr-processors-implemented, Nr-of-distributed-processors, and Nr-shared-processors
# is empty for models in registry
#multiple_procs = properties.get('Multiple Processors', None)


# isAccessibleForFree
# could this be a combination of accessible url and open license?

# referencePublication
# Could be obtained by scraping the "references" section of the website, but
# these data are not available in the JSON. Irene suggests using "Describe_key_physical_parameters"
# However this field is not used consistently across records in the CSDMS registry.

# contributor  <--- need an example to test.
# Current_future_collaborators is used inconsistently, sometimes it's a statement inviting
# collaboration, and other times it's a list of active collaborators.


# encoding
# hasPart
# identifier
# isAccessibleForFree
# isPartOf
# relatedLink
# softwareRequirements
# provider
# ispartof
#sameas


- [x] name
- [x] dateCreated
- [x] author
- [x] description
- [x] keywords
- [x] codeRepository
- [x] programmingLanguage
- [x] applicationCategory
- [x] memoryRequirements
- [x] dateModified
- [x] operatingSystem
- [x] url
- [x] developmentStatus
- [x] downloadUrl
- [x] identifier
- [x] license
- [x] softwareHelp
- [x] softwareVersion
- [x] supportingData
- [ ] encoding
- [ ] contributor <- need to find an example in the metadata
- [ ] hasPart
- [ ] isAccessibleForFree
- [ ] isPartOf
- [ ] processorRequirements
- [ ] referencePublication
- [ ] relatedLink
- [ ] softwareRequirements



---
- [ ] ~address~
- [ ] ~affiliation~
- [ ] applicationSubCategory `?`
- [ ] buildInstructions `?`
- [ ] citation `?`
- [ ] ~codeIntegration~
- [ ] copyrightHolder `?`
- [ ] copyrightYear `?`
- [ ] datePublished `?`
- [ ] editor `?`
- [ ] ~email~
- [ ] ~embargoDate~
- [ ] endDate `?`
- [ ] ~familyName~
- [ ] ~fileFormat~
- [ ] fileSize `?`
- [ ] funder `?`
- [ ] funding `?`
- [ ] ~givenName~
- [ ] installUrl `?` 
- [ ] issueTracker `?`
- [ ] maintainer `?`
- [ ] permissions `?`
- [ ] ~position~
- [ ] producer `?`
- [ ] provider `?`
- [ ] publisher `?`
- [ ] readme `?`
- [ ] releaseNotes `?`
- [ ] ~roleName~
- [ ] ~runtimePlatform~  <-- Supported_platforms?
- [ ] sameAs `?`
- [ ] ~softwareSuggestions~
- [ ] sponsor `?`
- [ ] startDate `?`
- [ ] storageRequirements `?`
- [ ] targetProduct `?`
- [ ] ~version~

In [267]:
# Assemble the CodeMeta record from the values prepared in the cells above.
# Keyword names follow the camelCase CodeMeta terms, including softwareVersion.
meta = CodeMetaV3(
    name=dat['subject'][0:-7],
    dateCreated=date_created,
    author=authors,
    description=properties['Extended_model_description'],
    keywords=keywords,
    codeRepository=codeRepository,
    programmingLanguage=programmingLanguage,
    applicationCategory=applicationCategory,
    memoryRequirements=memoryRequirements,
    dateModified=dateModified,
    operatingSystem=operatingSystem,
    url=url,
    developmentStatus=developmentStatus,
    downloadUrl=downloadUrl,
    identifier=doi_identifier,
    softwareVersion=software_version,
    supportingData=supportingData,
)

# There appears to be an issue with the validate_creative_work function that
# I think is related to the "each_item=True" option. To overcome this, we'll
# add some fields after our class is instantiated to skip validation.
meta.license = license
meta.softwareHelp = softwareHelp

# round-trip through json so the record is printed with indentation
print(json.dumps(json.loads(meta.json()), indent=4))

{
    "@context": "https://w3id.org/codemeta/3.0",
    "@type": "SoftwareSourceCode",
    "name": "FineSed3D",
    "codeRepository": "https://github.com/Delta-function/cliffs-src",
    "programmingLanguage": [
        {
            "@type": "ComputerLanguage",
            "name": "Fortran77"
        }
    ],
    "applicationCategory": "Single",
    "operatingSystem": [
        "Unix",
        "Linux",
        "Mac OS",
        "Windows"
    ],
    "softwareHelp": {
        "@type": "CreativeWork",
        "url": "https://csdms.colorado.edu/csdms_wiki/images/FineSed3D_User_Manual.pdf",
        "name": "Software Manual"
    },
    "supportingData": {
        "@type": "DataFeed",
        "url": "https://csdms.colorado.edu/csdms_wiki/images/Case2Data.mat",
        "name": "Test Data"
    },
    "author": [
        {
            "@type": "Person",
            "givenName": "Zhen",
            "familyName": "Cheng",
            "affiliation": {
                "@type": "Organization",
       