In [128]:
date = "2020-03-09"
date_sn = date.replace("-","")
snomed_zip = "/Users/matentzn/ws/snomed_test_data/SnomedCT_InternationalRF2_PRODUCTION_{}T120000Z.zip".format(date_sn)
snomed_mapping_extended = "mapping_extended.tsv"
snomed_mapping_simple = "mapping_simple.tsv"
snomed_config = "snomed_config.yaml"

In [129]:
import os
import shutil
import zipfile
import pandas as pd
import yaml
import json

In [130]:
snomed_mapping_in_zip = "SnomedCT_InternationalRF2_PRODUCTION_{}T120000Z/Snapshot/Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_{}.txt".format(date_sn,date_sn)
simple_snomed_mapping_in_zip = "SnomedCT_InternationalRF2_PRODUCTION_{}T120000Z/Snapshot/Refset/Map/der2_sRefset_SimpleMapSnapshot_INT_{}.txt".format(date_sn,date_sn)

with zipfile.ZipFile(snomed_zip) as z:
    with z.open(snomed_mapping_in_zip) as zf, open(snomed_mapping_extended, 'wb') as f:
        shutil.copyfileobj(zf, f)
    with z.open(simple_snomed_mapping_in_zip) as zf, open(snomed_mapping_simple, 'wb') as f:
        shutil.copyfileobj(zf, f)

In [138]:
dfe = pd.read_csv(snomed_mapping_extended,sep="\t") 
dfs = pd.read_csv(snomed_mapping_simple,sep="\t") 

In [139]:
## Processing extended mappings

In [150]:
config = yaml.load(open(snomed_config, 'r'), Loader=yaml.FullLoader)
for key in config:
    if key!="snomed":
        print("------------------")
        print(key+":")
        for c in config.get(key):
            if "id" in c:
                print("{} ({})".format(c['id'],c['label']))
            elif "comment" in c:
                print(c['comment'])

------------------
meta:
icd metadata codes: https://confluence.ihtsdotools.org/display/RMT/SNOMED+CT+Managed+Service+-+US+Edition+Mapping+SNOMED+CT+to+ICD-10-CM+Technical+Specifications+-+March+2020
Module codes: https://confluence.ihtsdotools.org/display/DOCEXTPG/4.2+Modules
------------------
mapCategoryId:
447638001 (The map source concept cannot be classified)
447639009 (The map is context dependent)
447640006 (The source concept is ambiguous)
447635003 (Guidance from NCHS is ambiguous)
447641005 (The source SNOMED CT concept is incompletely modeled)
------------------
moduleId:
449080006 (SNOMED CT to ICD-10 rule-based mapping module)
731000124108 (US National Library of Medicine maintained module)
22091000087100 (Canada Health Infoway Reference Set Module)
999000011000000103 (SNOMED CT United Kingdom clinical extension module)
------------------
refsetId:
447562003 (ICD-10 complex map reference set)
446608001 (ICD-O simple map reference set)
900000000000497000 (CTV3 simple map r

In [141]:
# check whether orphanet
dfe['refsetId'].value_counts()

447562003    177845
Name: refsetId, dtype: int64

In [None]:


# Excluded columns:
# id, effectiveTime, correlationId, mapAdvice (human readable advice on how to interpret map), mapGroup

# Check metadata here: https://confluence.ihtsdotools.org/display/DOCRELFMT/5.2.10+Complex+and+Extended+Map+Reference+Sets
print(len(dfe))
dfe = dfe[dfe['active']==1] # only take into account currently active mappings
print(len(dfe))
dfe = dfe[dfe['mapRule']=="TRUE"] # only use unconditional 
print(len(dfe))
dfe = dfe[dfe['mapCategoryId']==447637006] # only use "properly classified" mappings
print(len(dfe))

relevant_colums = ['active','moduleId','refsetId', 'referencedComponentId','mapTarget','mapCategoryId']
dfe = dfe[relevant_colums]

In [None]:
dfs.head()

In [142]:
dfs['refsetId'].value_counts()

900000000000497000    474073
446608001              26868
Name: refsetId, dtype: int64

In [146]:
# "fromCurie","toCurie","datasourcePrefix","datasource","sourceType","scope","date"

# dfe: dataframe with 
def prepare_ols_mapping(dfe,fromOID,toOID,config):
    df_mapping = dfe[['referencedComponentId','mapTarget']]
    df_mapping=df_mapping[~df_mapping['mapTarget'].isnull()]
    df_mapping.columns = ['fromCurie','toCurie']
    df_mapping['fromCurie'] = "{}:".format(fromOID)+df_mapping['fromCurie'].astype(str)
    df_mapping['toCurie'] = "{}:".format(toOID)+df_mapping['toCurie'].astype(str)
    df_mapping['datasourcePrefix'] = fromOID
    df_mapping['datasource'] = fromOID
    df_mapping['sourceType'] = "ONTOLOGY"
    df_mapping['scope'] = "RELATED"
    df_mapping['date'] = date
    df_mapping.to_csv('snomed_mapping_oxo.csv')
    df_mapping['datasource'] = json.dumps(config['snomed'])
    df_mapping.to_csv('snomed_mapping_oxo_old.csv')


In [None]:
# Config is json blob like:
"""  
  alternateIris: null
  name: "SNOMED CT International"
  source: "ONTOLOGY"
  idorgNamespace: ""
  alternatePrefix: 
    - "SNOMEDCT"
    - "SNOMED"
  prefix: "SNOMEDCT"
  licence: "https://www.nlm.nih.gov/healthit/snomedct/snomed_licensing.html"
  orcid: null
  versionInfo: ""
  preferredPrefix: "SNOMEDCT"
"""
    
def prepare_ols_mapping(dfe,fromOID,config):
    df_mapping = dfe[['fromCurie','toCurie']]
    df_mapping['datasourcePrefix'] = fromOID
    df_mapping['datasource'] = json.dumps(config)
    df_mapping['sourceType'] = "ONTOLOGY"
    df_mapping['scope'] = "RELATED"
    df_mapping['date'] = date
    df_mapping.to_csv('snomed_mapping_oxo_old.csv')

In [147]:
prepare_ols_mapping(dfe,"SNOMEDCT","ICD10CM",config)
df_mapping[df_mapping['fromCurie']=="SNOMEDCT:26929004"].head()


Unnamed: 0,fromCurie,toCurie,datasourcePrefix,datasource,sourceType,scope,date
42741,SNOMEDCT:26929004,ICD10CM:G30.9,SNOMED,"{""alternateIris"": [], ""name"": ""SNOMED CT Inter...",ONTOLOGY,RELATED,20-02-29


In [151]:
for refsetid in dfs['refsetId'].unique():
    curie_prefix = "ID"
    for c in config['refsetId']:
        if c['id']==refsetid:
            print(c)
            curie_prefix = c['curie_prefix']
    print(refsetid)
    print(curie_prefix)
    dfs_ref = dfs[dfs['refsetId']==refsetid]
    prepare_ols_mapping(dfe,"SNOMEDCT",curie_prefix,config)

{'id': 900000000000497000, 'label': 'CTV3 simple map reference set', 'curie_prefix': 'CTV3'}
900000000000497000
CTV3
{'id': 446608001, 'label': 'ICD-O simple map reference set', 'curie_prefix': 'ICDO'}
446608001
ICDO
