## Specification DDE Metadata generator

This script ingests information from Sahar's automatically pushed list of recently updated files to create changes in the appropriate tables that are used by the DDE.

Specifically, this script will:
1. Generate a table from all the files that were updated and pushed to the Bioschemas-DDE repo as a result of Sahar's automation
2. Split the table by the individual file that each entry should update
3. Check whether the URL is already in the table; if so, ignore it
4. Of the remaining entries, check whether the name is already in the table
 * If it is, select the version with the higher number and update the table

In [1]:
import os
import json
import pandas as pd
import time
import requests

In [8]:
def convert_to_raw(githuburl):
    """Turn a GitHub web URL into its raw.githubusercontent.com equivalent.

    The host name is swapped and every ``blob/`` and ``tree/`` occurrence is
    removed from the string.
    """
    raw_url = githuburl.replace('github.com', 'raw.githubusercontent.com')
    for path_marker in ('blob/', 'tree/'):
        raw_url = raw_url.replace(path_marker, '')
    return raw_url

def load_parent_source():
    """Download the Bioschemas metadata mapping and return its parent columns.

    The CSV is read from the raw form of the ``metadata_mapping.csv`` URL
    below; only the ``profile``, ``TypeParent`` and ``ProfileParent`` columns
    are loaded.
    """
    mapping_page = 'https://github.com/BioSchemas/bioschemas.github.io/blob/profile-auto-generation/_data/metadata_mapping.csv'
    wanted_columns = ['profile','TypeParent','ProfileParent']
    raw_mapping_url = convert_to_raw(mapping_page)
    return pd.read_csv(raw_mapping_url, header=0, usecols=wanted_columns)

def lookup_parent(parent_source_options,classname,spectype):
    """Return the parent recorded for *classname* in the mapping table.

    The first row whose ``profile`` equals *classname* is used. Its
    ``ProfileParent`` value is returned when *spectype* is ``'Profile'``;
    for any other *spectype* the ``TypeParent`` value is returned.
    Raises IndexError when no row matches *classname*.
    """
    matching_rows = parent_source_options.loc[parent_source_options['profile']==classname]
    parent_column = 'ProfileParent' if spectype == 'Profile' else 'TypeParent'
    return matching_rows.iloc[0][parent_column]

def parse_info_from_json(url,parent_source_options):
    """Fetch a specification's JSON-LD and summarise it as one table row.

    The document is downloaded from the raw form of *url*. The first
    ``@graph`` entry whose ``@type`` is ``rdfs:Class`` supplies the namespace
    and class name (split from its ``@id`` on ``:``). The entry counts as a
    ``'Profile'`` when it has a ``$validation`` key and as a ``'Type'``
    otherwise.

    Returns a dict with the keys ``namespace``, ``name``, ``subClassOf``,
    ``type``, ``version`` and ``url``.
    """
    response = requests.get(convert_to_raw(url))
    document = json.loads(response.text)
    class_nodes = [node for node in document['@graph'] if node["@type"]=="rdfs:Class"]
    primary_class = class_nodes[0]
    id_parts = primary_class['@id'].split(':')
    namespace = id_parts[0]
    classname = id_parts[1]
    # A '$validation' key is what marks the class as a profile.
    spectype = 'Profile' if '$validation' in primary_class else 'Type'
    version = parse_version_from_url(url,classname)
    parent = lookup_parent(parent_source_options,classname,spectype)
    return {
        'namespace':namespace,
        'name':classname,
        'subClassOf':parent,
        'type':spectype,
        'version':version,
        'url':url,
    }

def generate_base_update_table(script_path):
    """Build one table describing every recently updated specification.

    Every file in ``<script_path>/latest-updated-profiles`` is read as a CSV
    with a header row. The ``url`` value of its first data row is handed to
    ``parse_info_from_json``, and the resulting dicts are collected.

    Returns a DataFrame with one row per file in that folder; the frame has
    no rows and no columns when the folder is empty.
    """
    updated_specs_folder = os.path.join(script_path,'latest-updated-profiles')
    latest_updates = os.listdir(updated_specs_folder)
    parent_source_options = load_parent_source()
    updated_list = []
    for eachfile in latest_updates:
        filepath = os.path.join(updated_specs_folder,eachfile)
        tmpdf = pd.read_csv(filepath, header=0)
        # Only the url is read here: the version is re-derived from the url
        # inside parse_info_from_json, so the file's own version column is unused.
        url = tmpdf.iloc[0]['url']
        updated_list.append(parse_info_from_json(url,parent_source_options))
    return pd.DataFrame(updated_list)


def parse_version_from_url(url,classname):
    """Derive the version string from the file name at the end of *url*.

    The last path segment has its JSON/JSON-LD extension removed, then the
    ``<classname>_v`` and ``<classname>_`` prefixes and the ``-type`` and
    ``-profile`` markers are removed, in that order. Whatever remains is
    returned as the version.
    """
    remainder = url.split('/')[-1]
    # Order matters: the longer '.jsonld' forms must go before '.json'.
    for extension in ('.JSONLD', '.jsonld', '.JSON', '.json'):
        remainder = remainder.replace(extension, '')
    for token in (classname+'_v', classname+'_', '-type', '-profile'):
        remainder = remainder.replace(token, '')
    return remainder


def check_for_updates(spec_updated_df,original_df):
    """Report whether *spec_updated_df* holds any URL missing from *original_df*.

    A URL counts as already present when it equals a URL in *original_df*,
    or equals one of those URLs with ``blob`` replaced by ``tree``.
    Returns True only when at least one updated URL is absent in both forms;
    an empty *spec_updated_df* always yields False.
    """
    if len(spec_updated_df) == 0:
        return False
    known_urls = original_df['url'].tolist()
    tree_aliases = [known.replace('blob','tree') for known in known_urls]
    not_listed = spec_updated_df.loc[~spec_updated_df['url'].isin(known_urls)]
    if len(not_listed) == 0:
        return False
    # Entries that only differ by the blob/tree spelling are not new either.
    truly_new = not_listed.loc[~not_listed['url'].isin(tree_aliases)]
    return len(truly_new) > 0


def parse_version_number(version_number):
    """Return the part of *version_number* that precedes the first ``-``.

    The whole string is returned unchanged when it contains no ``-``.
    """
    numeric_part, _, _ = version_number.partition('-')
    return numeric_part


def compare_versions(a,b):
    """Return whichever of the version strings *a* and *b* is the later one.

    Only the dotted numeric part before the first ``-`` is compared (so
    ``0.11-DRAFT`` is treated as ``0.11``). Components are compared
    numerically, left to right, and a missing component counts as zero, so
    ``1`` equals ``1.0`` and ``1.0.1`` is later than ``1.0.0``.

    *a* is expected to be the new version: it is returned when the two
    versions are numerically equal. The returned value is the original,
    unmodified argument.

    Raises ValueError when a numeric component is not an integer.
    """
    key_a = _version_key(a)
    key_b = _version_key(b)
    # Pad to a common length so that versions with differing numbers of
    # components compare component-wise instead of by tuple length.
    width = max(len(key_a), len(key_b))
    padded_a = key_a + (0,) * (width - len(key_a))
    padded_b = key_b + (0,) * (width - len(key_b))
    if padded_b > padded_a:
        return b
    ## The versions are the same or a is newer: pick the new version
    return a


def _version_key(version):
    """Return the numeric part of *version* as a tuple of ints."""
    numeric_part = version.split('-')[0]
    return tuple(int(piece) for piece in numeric_part.split('.'))


def update_spec_table(script_path,eachfile,spec_updated_df,test=False):
    """Merge updated specifications into one tab-separated table file.

    The table at ``<script_path>/<eachfile>`` is read, and nothing is written
    unless ``check_for_updates`` reports a URL that is not yet in it. When an
    update is needed, every class named in *spec_updated_df* is resolved:

    * a class absent from the table is added from *spec_updated_df*;
    * a class already present keeps whichever of the two entries
      ``compare_versions`` picks (the updated entry wins ties).

    Rows for classes not named in *spec_updated_df* are kept unchanged. The
    result is written back over the same file with the columns ordered
    namespace, name, subClassOf, type, version, url.

    Parameters:
        script_path: folder containing the table file.
        eachfile: file name of the table to update.
        spec_updated_df: rows describing the updated specifications.
        test: when truthy, print the classes considered and the table written.
    """
    table_path = os.path.join(script_path,eachfile)
    original_df = pd.read_csv(table_path,delimiter='\t',header=0,
                              usecols=['name','namespace','subClassOf','type','version','url'])
    if not check_for_updates(spec_updated_df,original_df):
        return
    classes_to_update = spec_updated_df['name'].to_list()
    if test:
        print('classes to update: ',classes_to_update)
    # Start from the rows that are not being touched, then add one piece per
    # class; everything is concatenated once at the end.
    pieces = [original_df.loc[~original_df['name'].isin(classes_to_update)]]
    for eachclass in classes_to_update:
        updateversiondf = spec_updated_df.loc[spec_updated_df['name']==eachclass]
        updateversion = updateversiondf.iloc[0]['version']
        oldversiondf = original_df.loc[original_df['name']==eachclass]
        if len(oldversiondf) == 0:
            # The class is new to this table.
            latestversion = updateversion
            pieces.append(updateversiondf)
        else:
            oldversion = oldversiondf.iloc[0]['version']
            latestversion = compare_versions(updateversion,oldversion)
            if latestversion == updateversion:
                pieces.append(updateversiondf)
            else:
                pieces.append(oldversiondf)
        print(eachclass, latestversion)
    newdf = pd.concat(pieces,ignore_index=True)
    ordereddf = newdf[['namespace','name','subClassOf','type','version','url']].copy()
    ordereddf.to_csv(table_path,sep='\t',header=True,index=False)
    if test:
        print('file to update: ',eachfile)
        print('content of update: ',ordereddf)


def update_tables(script_path,test=False):
    """Route every updated specification to the table file it belongs in.

    The update table from ``generate_base_update_table`` is split by the text
    of its ``version`` column and by its ``type`` column:

    * versions containing ``DEPRECATED``          -> ``deprecated.txt``
    * ``RELEASE`` (not deprecated), type Profile  -> ``profile_list.txt``
    * ``RELEASE`` (not deprecated), type Type     -> ``type_list.txt``
    * ``DRAFT`` (not deprecated), type Profile    -> ``draft_profile_list.txt``
    * ``DRAFT`` (not deprecated), type Type       -> ``draft_type_list.txt``

    Each subset is handed to ``update_spec_table`` for its file, in the order
    listed above.

    Parameters:
        script_path: folder holding the table files and the
            ``latest-updated-profiles`` folder.
        test: when truthy, print the intermediate tables and counts and pass
            test mode on to ``update_spec_table``.
    """
    verbose = bool(test)
    updated_df = generate_base_update_table(script_path)
    if verbose:
        print('updated df: ',updated_df)
    if updated_df.empty:
        # With no update files the frame has no 'version' column, so there is
        # nothing to classify and no table to touch.
        return

    version_text = updated_df['version'].astype(str)
    is_deprecated = version_text.str.contains('DEPRECATED')
    deprecated = updated_df.loc[is_deprecated]
    draftdf = updated_df.loc[version_text.str.contains('DRAFT') & ~is_deprecated]
    releasedf = updated_df.loc[version_text.str.contains('RELEASE') & ~is_deprecated]
    draft_profile = draftdf.loc[draftdf['type']=='Profile']
    draft_type = draftdf.loc[draftdf['type']=='Type']
    released_profile = releasedf.loc[releasedf['type']=='Profile']
    released_type = releasedf.loc[releasedf['type']=='Type']

    if verbose:
        counts = [
            ('number of deprecated: ',deprecated),
            ('number of draft: ',draftdf),
            ('number of released: ',releasedf),
            ('number of draft profiles: ',draft_profile),
            ('number of draft type: ',draft_type),
            ('number of released profile: ',released_profile),
            ('number of released type: ',released_type),
        ]
        for label, subset in counts:
            print(label,len(subset))

    # Table file -> the subset of updates that belongs in it.
    targets = [
        ('deprecated.txt',deprecated),
        ('profile_list.txt',released_profile),
        ('type_list.txt',released_type),
        ('draft_profile_list.txt',draft_profile),
        ('draft_type_list.txt',draft_type),
    ]
    for eachfile, spec_updated_df in targets:
        update_spec_table(script_path,eachfile,spec_updated_df,verbose)
                


In [9]:
# An empty script_path makes every os.path.join(script_path, ...) resolve
# relative to the current working directory.
script_path = ''
# The second argument switches on test mode, which prints the intermediate
# tables and counts.
update_tables(script_path,True)

updated df:                  namespace                    name  \
0    bioschemasdeprecated                  Beacon   
1              bioschemas           BioChemEntity   
2              bioschemas        BioChemStructure   
3              bioschemas               BioSample   
4              bioschemas       ChemicalSubstance   
5              bioschemas       ComputationalTool   
6                bh2022GH   ComputationalWorkflow   
7              bioschemas             DataCatalog   
8    bioschemasdeprecated              DataRecord   
9              bioschemas                 Dataset   
10       bioschemasdrafts                 Disease   
11             bioschemas                     DNA   
12             bioschemas                  Enzyme   
13             bioschemas                   Event   
14             bioschemas         FormalParameter   
15             bioschemas                    Gene   
16             bioschemas                 Journal   
17             bioschemas        

ProteinStructure 0.6-DRAFT
PublicationIssue 0.3-DRAFT
PublicationVolume 0.3-DRAFT
RNA 0.2-DRAFT
ScholarlyArticle 0.3-DRAFT
SemanticTextAnnotation 0.3-DRAFT
SequenceAnnotation 0.7-DRAFT
SequenceRange 0.2-DRAFT
Study 0.3-DRAFT
Taxon 0.8-DRAFT
TaxonName 0.2-DRAFT
file to update:  draft_profile_list.txt
content of update:             namespace                    name  \
0         bioschemas                  Course   
1         bioschemas          CourseInstance   
2   bioschemasdrafts                    Gene   
3         bioschemas        TrainingMaterial   
4         bioschemas       ChemicalSubstance   
5         bioschemas       ComputationalTool   
6           bh2022GH   ComputationalWorkflow   
7         bioschemas             DataCatalog   
8         bioschemas                 Dataset   
9   bioschemasdrafts                 Disease   
10        bioschemas                   Event   
11        bioschemas         FormalParameter   
12        bioschemas                 Journal   
13     

In [None]:
import os
import json
import pandas as pd
import time
import requests

from src.metatables import *

# Use the current working directory as the folder that holds the table files.
script_path = os.getcwd()
# Run the update with test mode left at its default (off).
update_tables(script_path)

## Deprecated functions -- do not use!

In [None]:
def compare_versions(a,b):
    """Deprecated string-based version comparison -- do not use.

    Returns *a* when it sorts after *b* as a plain string. Otherwise *a* is
    still returned when it contains one of the two-digit minor versions
    (``0.10`` to ``0.15``) and *b* contains one of the single-digit ones
    (``0.2`` to ``0.9``); in every other case *b* is returned.
    """
    two_digit_minors = ['0.10','0.11','0.12','0.13','0.14','0.15']
    one_digit_minors = ['0.2','0.3','0.4','0.5','0.6','0.7','0.8','0.9']
    if a > b:
        return a
    # String ordering puts '0.10' before '0.9', so that case is special-cased.
    if not any(marker in a for marker in two_digit_minors):
        return b
    if any(marker in b for marker in one_digit_minors):
        return a
    return b