In [1]:
import os

# Release date of the SNOMED CT International edition to process (ISO format).
date = "2020-03-09"
# The same date without dashes, as used in the archive and member file names.
date_sn = date.replace("-", "")

# Directory that holds the release zip. The default is the original hard-coded
# location; set the SNOMED_DATA_DIR environment variable to run elsewhere.
snomed_data_dir = os.environ.get("SNOMED_DATA_DIR", "/Users/matentzn/ws/snomed_test_data")
snomed_zip = "{}/SnomedCT_InternationalRF2_PRODUCTION_{}T120000Z.zip".format(snomed_data_dir, date_sn)

# Local file names the two map snapshots are extracted to.
snomed_mapping_extended = "mapping_extended.tsv"
snomed_mapping_simple = "mapping_simple.tsv"
# YAML file with refset/module metadata and the "snomed" datasource description.
snomed_config = "snomed_config.yaml"

In [2]:
import os
import shutil
import zipfile
import pandas as pd
import yaml
import json

In [3]:
# Locations of the two map refset snapshot files inside the release archive.
map_dir_in_zip = f"SnomedCT_InternationalRF2_PRODUCTION_{date_sn}T120000Z/Snapshot/Refset/Map"
snomed_mapping_in_zip = f"{map_dir_in_zip}/der2_iisssccRefset_ExtendedMapSnapshot_INT_{date_sn}.txt"
simple_snomed_mapping_in_zip = f"{map_dir_in_zip}/der2_sRefset_SimpleMapSnapshot_INT_{date_sn}.txt"

# (archive member, local target) pairs, copied in this order.
extraction_plan = [
    (snomed_mapping_in_zip, snomed_mapping_extended),
    (simple_snomed_mapping_in_zip, snomed_mapping_simple),
]

# Stream each member out of the zip into its local file without unpacking the whole archive.
with zipfile.ZipFile(snomed_zip) as z:
    for member_name, local_path in extraction_plan:
        with z.open(member_name) as zf, open(local_path, 'wb') as f:
            shutil.copyfileobj(zf, f)

In [35]:
# Load the two extracted, tab-separated snapshot files:
# dfe = extended map, dfs = simple map.
dfe, dfs = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (snomed_mapping_extended, snomed_mapping_simple)
)

In [36]:
# Preview the first rows of the extended map as loaded.
dfe.head()

Unnamed: 0,id,effectiveTime,active,moduleId,refsetId,referencedComponentId,mapGroup,mapPriority,mapRule,mapAdvice,mapTarget,correlationId,mapCategoryId
0,80005aeb-477c-53dc-9a5c-ce723ca264cb,20150731,1,449080006,447562003,254153009,1,1,True,ALWAYS Q79.8,Q79.8,447561005,447637006
1,80007e6a-7408-5b87-a1ec-70b212811410,20190731,1,449080006,447562003,16623961000119100,1,1,True,ALWAYS D61.1,D61.1,447561005,447637006
2,80009454-5531-5f78-b7c9-d288f2346d83,20190131,0,449080006,447562003,301327002,1,1,True,MAP SOURCE CONCEPT CANNOT BE CLASSIFIED WITH A...,,447561005,447638001
3,8000a5af-6962-5385-9227-4038d1f7b237,20150731,1,449080006,447562003,246951001,1,1,True,ALWAYS H10.8,H10.8,447561005,447637006
4,80010c58-f11f-572f-98f9-852cd301d5c0,20200131,0,449080006,447562003,291710005,2,1,True,ALWAYS X44 | POSSIBLE REQUIREMENT FOR PLACE OF...,X44,447561005,447637006


In [37]:
# Preview the first rows of the simple map as loaded.
dfs.head()

Unnamed: 0,id,effectiveTime,active,moduleId,refsetId,referencedComponentId,mapTarget
0,80001267-7451-550a-82f0-92cc3bdfe890,20020131,1,900000000000207008,900000000000497000,154938001,.E4D4
1,80001782-d79c-5b33-8679-c0c62beef6da,20020131,1,900000000000207008,900000000000497000,138614002,.13gX
2,8000241a-ed32-4339-876b-05fee677bda3,20180131,1,900000000000207008,900000000000497000,735755000,XUyL5
3,80002a2a-412f-59a4-b07c-6f194709c556,20020131,1,900000000000207008,900000000000497000,238194001,X40Ze
4,80004caa-f9ed-5ef8-a9fb-6c9e89e0b89d,20020131,1,900000000000207008,900000000000497000,181522009,7N72Y


In [38]:
## Processing extended mappings

In [39]:
# Load the metadata config. The file handle is opened in a `with` block so it is
# closed as soon as parsing finishes.
# NOTE(review): yaml.FullLoader is kept as in the original; yaml.safe_load is the
# safer choice if this file could ever come from an untrusted source.
with open(snomed_config, 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Print a summary of every config section except "snomed": entries with an "id"
# are shown as "id (label)", entries with only a "comment" are shown as that comment.
for key in config:
    if key == "snomed":
        continue
    print("------------------")
    print(key+":")
    for c in config[key]:
        if "id" in c:
            print("{} ({})".format(c['id'],c['label']))
        elif "comment" in c:
            print(c['comment'])

------------------
meta:
icd metadata codes: https://confluence.ihtsdotools.org/display/RMT/SNOMED+CT+Managed+Service+-+US+Edition+Mapping+SNOMED+CT+to+ICD-10-CM+Technical+Specifications+-+March+2020
Module codes: https://confluence.ihtsdotools.org/display/DOCEXTPG/4.2+Modules
------------------
mapCategoryId:
447638001 (The map source concept cannot be classified)
447639009 (The map is context dependent)
447640006 (The source concept is ambiguous)
447635003 (Guidance from NCHS is ambiguous)
447641005 (The source SNOMED CT concept is incompletely modeled)
------------------
moduleId:
449080006 (SNOMED CT to ICD-10 rule-based mapping module)
731000124108 (US National Library of Medicine maintained module)
22091000087100 (Canada Health Infoway Reference Set Module)
999000011000000103 (SNOMED CT United Kingdom clinical extension module)
------------------
refsetId:
447562003 (ICD-10 complex map reference set)
446608001 (ICD-O simple map reference set)
900000000000497000 (CTV3 simple map r

In [40]:
# Count rows per reference set in the extended map.
# NOTE(review): the original note read "check whether orphanet" - presumably a check
# for whether an Orphanet refset is present; confirm.
dfe['refsetId'].value_counts()

447562003    177845
Name: refsetId, dtype: int64

In [41]:
# Narrow the extended map down to the rows and columns needed for the export.
# Column reference: https://confluence.ihtsdotools.org/display/DOCRELFMT/5.2.10+Complex+and+Extended+Map+Reference+Sets
#
# Columns dropped by the selection at the end of this cell:
# id, effectiveTime, correlationId, mapGroup, mapPriority, mapRule,
# mapAdvice (human readable advice on how to interpret map)

# One boolean mask per filter criterion.
is_active = dfe['active'] == 1                      # only currently active mappings
is_unconditional = dfe['mapRule'] == "TRUE"         # only unconditional rules
is_classified = dfe['mapCategoryId'] == 447637006   # only "properly classified" mappings

# Report the row count before filtering and after each successive filter.
print(len(dfe))
print(int(is_active.sum()))
print(int((is_active & is_unconditional).sum()))
keep_rows = is_active & is_unconditional & is_classified
print(int(keep_rows.sum()))

relevant_colums = ['active','moduleId','refsetId', 'referencedComponentId','mapTarget','mapCategoryId']
dfe = dfe.loc[keep_rows, relevant_colums]
dfe.head()

177845
138328
138145
111729


Unnamed: 0,active,moduleId,refsetId,referencedComponentId,mapTarget,mapCategoryId
0,1,449080006,447562003,254153009,Q79.8,447637006
1,1,449080006,447562003,16623961000119100,D61.1,447637006
3,1,449080006,447562003,246951001,H10.8,447637006
5,1,449080006,447562003,472759003,I27.2,447637006
6,1,449080006,447562003,53597009,L56.2,447637006


In [42]:
# Rows per reference set that remain in the extended map after filtering.
dfe['refsetId'].value_counts()

447562003    111729
Name: refsetId, dtype: int64

In [43]:
# The simple map is left with all of its columns; the column narrowing below is disabled.
#dfs = dfs[relevant_colums]
dfs.head()

Unnamed: 0,id,effectiveTime,active,moduleId,refsetId,referencedComponentId,mapTarget
0,80001267-7451-550a-82f0-92cc3bdfe890,20020131,1,900000000000207008,900000000000497000,154938001,.E4D4
1,80001782-d79c-5b33-8679-c0c62beef6da,20020131,1,900000000000207008,900000000000497000,138614002,.13gX
2,8000241a-ed32-4339-876b-05fee677bda3,20180131,1,900000000000207008,900000000000497000,735755000,XUyL5
3,80002a2a-412f-59a4-b07c-6f194709c556,20020131,1,900000000000207008,900000000000497000,238194001,X40Ze
4,80004caa-f9ed-5ef8-a9fb-6c9e89e0b89d,20020131,1,900000000000207008,900000000000497000,181522009,7N72Y


In [44]:
# Rows per reference set in the simple map.
dfs['refsetId'].value_counts()

900000000000497000    474073
446608001              26868
Name: refsetId, dtype: int64

In [64]:
# "fromCurie","toCurie","datasourcePrefix","datasource","sourceType","scope","date"

# dfe: dataframe with at least the columns 'referencedComponentId' and 'mapTarget'
def prepare_ols_mapping_table(dfe,fromOID,toOID,config, ids='unspecified'):
    """Build the OxO mapping table for the rows of ``dfe`` that have a map target.

    Parameters
    ----------
    dfe : DataFrame with at least the columns 'referencedComponentId' and 'mapTarget'.
    fromOID : prefix put in front of every source id; also used for the
        'datasourcePrefix' and 'datasource' columns.
    toOID : prefix put in front of every map target.
    config, ids : accepted for signature parity with the export functions; not used here.

    Returns
    -------
    A new DataFrame with the columns fromCurie, toCurie, datasourcePrefix,
    datasource, sourceType, scope and date. ``dfe`` itself is not modified.
    (The original version built this table but never returned it.)
    """
    has_target = dfe['mapTarget'].notnull()
    # Explicit copy: the columns added below must not be written onto a view of the caller's frame.
    df_mapping = dfe.loc[has_target, ['referencedComponentId','mapTarget']].copy()
    df_mapping.columns = ['fromCurie','toCurie']
    df_mapping['fromCurie'] = "{}:".format(fromOID)+df_mapping['fromCurie'].astype(str)
    df_mapping['toCurie'] = "{}:".format(toOID)+df_mapping['toCurie'].astype(str)
    df_mapping['datasourcePrefix'] = fromOID
    df_mapping['datasource'] = fromOID
    df_mapping['sourceType'] = "ONTOLOGY"
    df_mapping['scope'] = "RELATED"
    df_mapping['date'] = date  # module-level release date set in the configuration cell
    return df_mapping
    

def prepare_ols_mapping(dfe,fromOID,toOID,config, export_to="table", ids='unspecified'):
    """Export the mappings in ``dfe`` that have a map target, as triples or as CSV.

    Parameters
    ----------
    dfe : DataFrame with at least the columns 'referencedComponentId' and 'mapTarget'.
    fromOID : prefix for the source ids (also written to 'datasourcePrefix'/'datasource').
    toOID : prefix for the map targets.
    config : mapping with a 'snomed' entry; only read when exporting CSV.
    export_to : "ttl" writes one triple per line to
        ``snomed_{ids}_mapping_oxo_{fromOID}_{toOID}.ttl``; any other value writes
        ``snomed_{ids}_mapping_oxo_{fromOID}_{toOID}.csv`` and
        ``ols_old_snomed_{ids}_mapping_oxo_{fromOID}_{toOID}.csv`` (the latter with the
        JSON-serialised ``config['snomed']`` in the 'datasource' column).
    ids : free-text tag that becomes part of the output file names.

    Returns nothing; all results are written to the current working directory.
    """
    has_target = dfe['mapTarget'].notnull()
    # Explicit copy so the column assignments below never touch a view of the caller's frame.
    df_mapping = dfe.loc[has_target, ['referencedComponentId','mapTarget']].copy()
    df_mapping.columns = ['fromCurie','toCurie']
    df_mapping['fromCurie'] = "{}:".format(fromOID)+df_mapping['fromCurie'].astype(str)
    df_mapping['toCurie'] = "{}:".format(toOID)+df_mapping['toCurie'].astype(str)
    df_mapping['datasourcePrefix'] = fromOID
    df_mapping['datasource'] = fromOID
    df_mapping['sourceType'] = "ONTOLOGY"
    df_mapping['scope'] = "RELATED"
    df_mapping['date'] = date  # module-level release date set in the configuration cell

    if export_to=="ttl":
        snomed_uri_prefix = "http://snomed.info/id/"
        dbxref = "http://www.geneontology.org/formats/oboInOwl#hasDbXref"

        with open(f'snomed_{ids}_mapping_oxo_{fromOID}_{toOID}.ttl', 'w') as f:
            # Iterate over the two columns directly; no per-row Series is needed.
            for from_curie, to_curie in zip(df_mapping['fromCurie'], df_mapping['toCurie']):
                uri = from_curie.replace(fromOID+":", snomed_uri_prefix)
                # Escape backslashes and double quotes so the quoted literal stays well-formed.
                literal = to_curie.replace("\\", "\\\\").replace('"', '\\"')
                f.write(f"<{uri}> <{dbxref}> \"{literal}\" . \n")
    else:
        df_mapping.to_csv(f'snomed_{ids}_mapping_oxo_{fromOID}_{toOID}.csv', index=None)
        # Second file: same rows, but 'datasource' carries the JSON datasource description.
        df_mapping['datasource'] = json.dumps(config['snomed'])
        df_mapping.to_csv(f'ols_old_snomed_{ids}_mapping_oxo_{fromOID}_{toOID}.csv', index=None)
    
def extract_all_mappings(df, config, export_to, ids='unspecified'):
    """Export the mappings of every reference set found in ``df``, one refset at a time.

    For each distinct 'refsetId' the target prefix is looked up in
    ``config['refsetId']`` (falling back to "ID" when no entry matches), the
    matching config entry, the refset id and the prefix are printed, and the
    rows of that refset are handed to ``prepare_ols_mapping`` with "SNOMEDCT"
    as the source prefix.
    """
    for current_refset in df['refsetId'].unique():
        target_prefix = "ID"  # fallback when the refset is not described in the config
        for entry in config['refsetId']:
            if entry['id'] != current_refset:
                continue
            print(entry)
            target_prefix = entry['curie_prefix']
        print(current_refset)
        print(target_prefix)
        refset_rows = df.loc[df['refsetId'] == current_refset]
        prepare_ols_mapping(refset_rows, "SNOMEDCT", target_prefix, config, export_to, ids)

#### Example of the datasource config (shown in YAML form; it is serialised with json.dumps when written):
"""  
  alternateIris: null
  name: "SNOMED CT International"
  source: "ONTOLOGY"
  idorgNamespace: ""
  alternatePrefix: 
    - "SNOMEDCT"
    - "SNOMED"
  prefix: "SNOMEDCT"
  licence: "https://www.nlm.nih.gov/healthit/snomedct/snomed_licensing.html"
  orcid: null
  versionInfo: ""
  preferredPrefix: "SNOMEDCT"
"""
    
def prepare_ols_mapping_legacy(dfe,fromOID,config):
    """Write an already-prefixed mapping table to 'snomed_mapping_oxo_old.csv'.

    This function used to be named ``prepare_ols_mapping``. Because it is defined
    after the six-parameter exporter of that name, it replaced that exporter and
    made the call in ``extract_all_mappings`` fail with a TypeError. It carries
    its own name now so both functions can coexist.

    Parameters
    ----------
    dfe : DataFrame with the columns 'fromCurie' and 'toCurie'.
    fromOID : value written to the 'datasourcePrefix' column.
    config : JSON-serialisable datasource description, written to 'datasource'.

    Returns nothing; the CSV is written to the current working directory.
    """
    # Explicit copy so the added columns never land on a view of the caller's frame.
    df_mapping = dfe[['fromCurie','toCurie']].copy()
    df_mapping['datasourcePrefix'] = fromOID
    df_mapping['datasource'] = json.dumps(config)
    df_mapping['sourceType'] = "ONTOLOGY"
    df_mapping['scope'] = "RELATED"
    df_mapping['date'] = date  # module-level release date set in the configuration cell
    df_mapping.to_csv('snomed_mapping_oxo_old.csv',index=None)

In [65]:
# Run the export for the simple map with export_to='ttl' and the file-name tag "simple".
extract_all_mappings(dfs, config, 'ttl', "simple")
# Alternative: pass 'table' as export_to to write CSV files instead.
#extract_all_mappings(dfs, config, 'table', "simple")

{'id': 900000000000497000, 'label': 'CTV3 simple map reference set', 'curie_prefix': 'CTV3'}
900000000000497000
CTV3
{'id': 446608001, 'label': 'ICD-O simple map reference set', 'curie_prefix': 'ICDO'}
446608001
ICDO


In [66]:
# Run the export for the filtered extended map with export_to='ttl' and the file-name tag "extended".
extract_all_mappings(dfe, config, 'ttl', "extended")
# Alternative: pass 'table' as export_to to write CSV files instead.
#extract_all_mappings(dfe, config, 'table', "extended")

{'id': 447562003, 'label': 'ICD-10 complex map reference set', 'curie_prefix': 'ICD10CM'}
447562003
ICD10CM
