# Map Obesity dataset into Phenopackets

In [2]:
import csv
import re

from datetime import datetime, date
from datetime import timedelta

from phenopackets import Individual, Sex, PhenotypicFeature, OntologyClass, Phenopacket
from phenopackets import Measurement, Quantity
from phenopackets import AgeRange, Age
from phenopackets import Diagnosis
from phenopackets import Disease, TimeElement
from phenopackets import Interpretation, GenomicInterpretation, VariantInterpretation
from phenopackets import VariationDescriptor, GeneDescriptor

## Map temporal data

In [None]:
# Module-level sample values.
# record_id is read by parse_erker_genetics and is reassigned per row by the
# CSV loop further down.
record_id = 1
# NOTE(review): the two SCT_* names are not referenced anywhere else in the
# visible code; presumably sample column values - confirm or remove.
SCT_412903 = 2004
SCT_583190 = 1

### In parentheses and in the `#` comments next to each function definition: the matching variable and column names in test_final.csv

# col 4: sct_422549004 // Patient_ID


def parse_erker_age(age):  # col 6: sct_184099003_y / Birthyear
    """Build a Phenopackets ``Age`` from a number of whole years.

    The value is rendered as the ISO 8601 duration ``P<years>Y``.

    NOTE(review): the source column is labelled "Birthyear". If it holds a
    calendar year rather than an age, the caller has to convert it to an age
    in years before calling this function - confirm against the data.

    Args:
        age: Age in years as an int, float or numeric string.

    Returns:
        An ``Age`` message, or ``None`` when ``age`` is missing, not numeric
        or negative.
    """
    if age is None:
        return None
    try:
        # Go through float() so that strings such as "12.0" are accepted.
        years = int(float(age))
    except (TypeError, ValueError, OverflowError):
        # Covers non-numeric text, NaN and infinities.
        return None
    if years < 0:
        return None
    return Age(iso8601duration=f"P{years}Y")

def parse_erker_sex(sex):  # col 7: sct_281053000 / Sex at birth
    """Translate the coded sex-at-birth value into a sex name.

    Returns 'FEMALE' for 'sct_248152002', 'MALE' for 'sct_248153007' and
    'UNKNOWN_SEX' for every other value.
    """
    known_codes = (
        ('sct_248152002', 'FEMALE'),
        ('sct_248153007', 'MALE'),
    )
    for code, sex_name in known_codes:
        if sex == code:
            return sex_name
    return 'UNKNOWN_SEX'

# Lower and upper bound (ISO 8601 durations) for each age-category code.
_AGE_RANGE_BOUNDS = {
    'sct_133931009': ('P0Y', 'P1Y'),
    'sct_410602000': ('P1Y', 'P6Y'),
    'sct_410600008': ('P6Y', 'P12Y'),
    'sct_133937008': ('P12Y', 'P18Y'),
    # NOTE(review): the original code only matched the eight-digit
    # 'sct_13393600', which looks like a truncated 'sct_133936004'.
    # Both spellings are accepted so existing inputs keep working - confirm
    # which one the data really contains.
    'sct_133936004': ('P18Y', 'P99Y'),
    'sct_13393600': ('P18Y', 'P99Y'),
}


def parse_erker_agerange(age_range):
    """Map an age-category code to a Phenopackets ``AgeRange``.

    Source: col 10, sct_410598002 (age category).

    Args:
        age_range: Coded age category, e.g. ``'sct_133931009'``.

    Returns:
        An ``AgeRange`` whose start and end come from ``_AGE_RANGE_BOUNDS``,
        or ``None`` (after printing the offending value) for an unknown code.
    """
    bounds = _AGE_RANGE_BOUNDS.get(age_range)
    if bounds is None:
        print(f'lnc_67162-8_X {age_range}')
        return None

    lower, upper = bounds
    return AgeRange(
        start=Age(iso8601duration=lower),
        end=Age(iso8601duration=upper),
    )

# col 11: 'sct_769681006': first contact (Erstkontakt)
## not represented within Phenopackets?

# HPO onset terms for the coded onset categories.
_ONSET_TERMS = {
    'sct_118189007': ('HP:0030674', 'Antenatal onset'),
    'sct_364586004': ('HP:0003577', 'Congenital onset'),
}


def _onset_int(value, default=None):
    """Return ``value`` as an int, or ``default`` when it is missing/invalid."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return default


def parse_erker_onset(onset, onset_year=None, onset_month=None, onset_day=None):
    """Map the coded onset to a Phenopackets ``TimeElement``.

    Args:
        onset: Onset code. ``'sct_118189007'`` and ``'sct_364586004'`` map to
            the HPO terms in ``_ONSET_TERMS``; ``'sct_424850005'`` means a
            dated clinical onset; anything else (including the "unknown"
            code ``'sct_261665006'``) yields ``None``.
        onset_year: Year of clinical onset (col 14, sct_424850005_y). Only
            used for the dated onset.
        onset_month: Month of clinical onset (col 15); defaults to 1 when
            missing.
        onset_day: Day of clinical onset (col 16); defaults to 1 when missing.

    Returns:
        A ``TimeElement`` holding either an ontology class or a timestamp, or
        ``None`` when the onset is unknown or the date cannot be built.
    """
    term = _ONSET_TERMS.get(onset)
    if term is not None:
        term_id, term_label = term
        return TimeElement(ontology_class=OntologyClass(id=term_id, label=term_label))

    if onset == 'sct_424850005':  # date of clinical onset
        year = _onset_int(onset_year)
        if year is None:
            return None
        try:
            onset_date = datetime(
                year, _onset_int(onset_month, 1), _onset_int(onset_day, 1)
            )
        except ValueError:
            # Out-of-range year, month or day.
            return None
        # A TimeElement is a one-of, so the dated onset carries only the
        # timestamp (the draft also listed the generic term HP:0003674).
        element = TimeElement()
        element.timestamp.FromDatetime(onset_date)
        return element

    return None

##col 17 & 18: 'sct_423493009_y ' : age of diagnosis
def parse_erker_datediagnosis(age_dg):
    """Turn an age at diagnosis into a Phenopackets ``Age``.

    ``age_dg`` must expose ``y`` and ``m`` attributes; they are rendered as
    the ISO 8601 duration ``P<y>Y<m>M``.
    """
    years, months = age_dg.y, age_dg.m
    duration = 'P{}Y{}M'.format(years, months)
    return Age(iso8601duration=duration)

## Map interpretation

In [None]:
# HPO onset terms for the coded diagnosis-onset categories.
_DISEASE_ONSET_TERMS = {
    "sct_118189007": ("HP:0030674", "Antenatal onset"),
    "sct_364586004": ("HP:0003577", "Congenital onset"),
}


def parse_disease(diagnosis, onset_diagnosis, age_dg=None):
    """Build a Phenopackets ``Disease`` from the diagnosis columns.

    Only "Obesity due to pro-opiomelanocortin deficiency" is recognised.

    Further codings of the same diagnosis listed in the draft, which a
    ``Disease`` (single ``term``) cannot carry directly:
      * ICD10:E66.8 "Other obesity"   (col 23, sct_439401001_icd10gm)
      * OMIM                          (col 33, sct_439401001_omim)
      * Alpha-ID                      (col 24, sct_439401001_alphaid)

    NOTE(review): verify that ORPHA:71529 (col 21, sct_439401001_orpha)
    really belongs to this label; the gene mapped elsewhere in this file is
    MC4R.

    Args:
        diagnosis: Diagnosis label.
        onset_diagnosis: Onset code: ``"sct_118189007"`` (antenatal),
            ``"sct_364586004"`` (congenital), ``"sct_424850005"`` (dated, see
            ``age_dg``) or ``"sct_261665006"`` (unknown).
        age_dg: Age at diagnosis with ``y``/``m`` attributes, as accepted by
            ``parse_erker_datediagnosis``. Only used for the dated onset.

    Returns:
        A ``Disease``, or ``None`` when the diagnosis is not recognised. The
        onset is left unset when it is unknown or cannot be derived.
    """
    if diagnosis != "Obesity due to pro-opiomelanocortin deficiency":
        return None

    disease = Disease(
        term=OntologyClass(
            id="ORPHA:71529",
            label="Obesity due to pro-opiomelanocortin deficiency",
        )
    )

    onset_term = _DISEASE_ONSET_TERMS.get(onset_diagnosis)
    if onset_term is not None:
        term_id, term_label = onset_term
        disease.onset.CopyFrom(
            TimeElement(ontology_class=OntologyClass(id=term_id, label=term_label))
        )
    elif onset_diagnosis == "sct_424850005" and age_dg is not None:
        # Dated onset: expressed as the age at diagnosis.
        disease.onset.CopyFrom(TimeElement(age=parse_erker_datediagnosis(age_dg)))

    return disease

## col 20: dg_sicherung (confirmation of the diagnosis)
## not represented within Phenopackets?

# GENO allelic-state terms for the coded zygosity (col 37-42: ln_55198_6).
_ALLELIC_STATES = {
    "sct_22061001": ("GENO:0000136", "homozygous"),
    "sct_14556007": ("GENO:0000135", "heterozygous"),
    "homoplasmy": ("GENO:0000602", "homoplasmic"),
    "heteroplasmy": ("GENO:0000603", "heteroplasmic"),
}


def parse_erker_genetics(zygosity, hgvs="", interpretation_id=None):
    """Build a Phenopackets ``Interpretation`` for one variant.

    The result nests Interpretation -> Diagnosis -> GenomicInterpretation ->
    VariantInterpretation -> VariationDescriptor, with the zygosity stored as
    the descriptor's allelic state.

    NOTE(review): the interpretation status of the genomic interpretation
    and the diagnosis' disease term are left unset - fill them in once the
    source columns are decided.

    Args:
        zygosity: Zygosity code; see ``_ALLELIC_STATES`` for accepted values.
        hgvs: HGVS expression of the variant (col 27-29: ln_48004_6_1.._3).
            When empty, no expression is attached.
        interpretation_id: Id for the interpretation and its subject.
            Defaults to the module-level ``record_id`` (col 1).

    Returns:
        An ``Interpretation`` with progress status SOLVED, or ``None`` when
        the zygosity is ``"sct_1220561009"`` or any other unmapped value.
    """
    allelic_state = _ALLELIC_STATES.get(zygosity)
    if allelic_state is None:
        return None

    if interpretation_id is None:
        interpretation_id = record_id
    subject_id = str(interpretation_id)  # protobuf string fields reject ints

    state_id, state_label = allelic_state
    descriptor = VariationDescriptor(
        allelic_state=OntologyClass(id=state_id, label=state_label)
    )
    if hgvs:
        descriptor.expressions.add(syntax="hgvs", value=hgvs)

    genomic_interpretation = GenomicInterpretation(
        subject_or_biosample_id=subject_id,
        variant_interpretation=VariantInterpretation(variation_descriptor=descriptor),
    )
    return Interpretation(
        id=subject_id,
        progress_status=Interpretation.SOLVED,
        diagnosis=Diagnosis(genomic_interpretations=[genomic_interpretation]),
    )
            
def parse_erker_hgnc(gen_hgnc, omim_id=None):  # col 30-32: ln_47999_8_hgnc_1, _2, _3 = HGNC Diagnosis
    """Build a Phenopackets ``GeneDescriptor`` for the coded gene.

    Only ``"HGNC:6932"`` (symbol MC4R) is mapped.

    Args:
        gen_hgnc: HGNC identifier of the gene.
        omim_id: Optional OMIM identifier (col 33, sct_439401001_omim) stored
            as an alternate id. Nothing is added when it is empty.

    Returns:
        A ``GeneDescriptor``, or ``None`` for any other gene identifier.
    """
    if gen_hgnc != "HGNC:6932":
        return None

    descriptor = GeneDescriptor(value_id="HGNC:6932", symbol="MC4R")
    if omim_id:
        # alternate_ids is a repeated field, so the value is appended.
        descriptor.alternate_ids.append(str(omim_id))
    return descriptor





# col 46: sct_8116006_1 / phenotype_1
def parse_erker_ph(ph_f):
    """Return the phenotypic feature "Obesity" (HP:0001513).

    Args:
        ph_f: Value of the phenotype column. It is not inspected yet; the
            same feature is returned for every input.
            TODO: map further phenotype codes once they are known.

    Returns:
        A ``PhenotypicFeature`` typed as HP:0001513 "Obesity".
    """
    return PhenotypicFeature(type=OntologyClass(id='HP:0001513', label='Obesity'))
 
        

            
            
# col 123: ethnicity
## not represented within Phenopackets?


# col 127 - 133: BMI parents & siblings:
## within pedigree? 
def parse_erker_fa(fa, mother, father, siblings):
    """Collect pedigree entries for the relatives flagged in col 127-133.

    A relative is included when its column equals ``"sct_373066001"``. Each
    entry is a plain dict; the keys are presumably usable as keyword
    arguments of the Phenopackets ``Pedigree.Person`` message - confirm
    before wiring it up.

    NOTE(review): the ``"___"`` values are the draft's placeholders for ids
    that still have to come from the data. Affected status, deceased status
    and age of the parents are open questions from the draft.

    Args:
        fa: Currently unused.
        mother: Value of sct_72705000_rd.
        father: Value of sct_66839005_rd.
        siblings: Value of sct_82101005_rd. Not mapped yet.

    Returns:
        A list with zero, one or two dicts (mother first, then father).
    """
    flagged = "sct_373066001"
    placeholder = "___"
    relatives = []

    if mother == flagged:
        relatives.append({
            "family_id": placeholder,
            "individual_id": placeholder,
            "paternal_id": placeholder,
            "maternal_id": "MOTHER",
            "sex": "FEMALE",
            "affected_status": "MISSING",
        })
    if father == flagged:
        relatives.append({
            "family_id": placeholder,
            "individual_id": placeholder,
            "paternal_id": "FATHER",
            "maternal_id": placeholder,
            "sex": "MALE",
            "affected_status": "MISSING",
        })
    # TODO: siblings (sct_82101005_rd) - representation still undecided.

    return relatives
            
            
# col 145ff: concomitant_medication_1 ff
# col 151ff: comorbidities_1 ff
# unclear whether or not to integrate 
    
    
def create_individual(individual_id, sex):
    """Build the Phenopackets ``Individual`` for one record.

    Args:
        individual_id: Identifier of the individual.
        sex: Coded sex-at-birth value, translated by ``parse_erker_sex``.

    Returns:
        An ``Individual`` with ``id`` and ``sex`` set.
    """
    # parse_erker_sex returns the enum value's name; Sex.Value turns the name
    # into the enum number expected by the message.
    sex_name = parse_erker_sex(sex)
    return Individual(id=individual_id, sex=Sex.Value(sex_name))
    
def create_phenotypic_features(col1, col2=None, col3=None):
    """Return the phenotypic features for one record (placeholder).

    ``col2`` and ``col3`` are optional so that callers holding a single
    column can already use the function.

    Args:
        col1: First source column value (currently unused).
        col2: Optional second source column value (currently unused).
        col3: Optional third source column value (currently unused).

    Returns:
        An empty list.
    """
    # TODO - implement a real function that returns phenotypic features
    return []

In [None]:

# The big picture, we assume each row corresponds to a phenopacket.

fpath = '/Users/adam/Documents/Adipositas/MC4R/mapping_phenos/test_final.csv'

# One Phenopacket per data row of the CSV file.
phenopackets = []
# newline='' lets the csv module handle line breaks itself; the column
# headers of this file contain embedded newlines inside quoted fields, so a
# plain line-by-line split on ',' would tear rows apart.
with open(fpath, newline='') as fh:
    reader = csv.reader(fh)
    next(reader, None)  # skip the header row
    for columns in reader:
        if len(columns) <= 13:
            # Blank or truncated row: the columns read below are missing.
            continue
        record_id = columns[0]
        # NOTE(review): the column comments in this file count from 1 and
        # list sex at birth as col 7, which would be index 6 - confirm.
        sex = columns[7]
        individual = create_individual(record_id, sex)

        # Only one phenotype column is wired up so far.
        pfs = create_phenotypic_features(columns[13], None, None)
        pp = Phenopacket(id=record_id, subject=individual, phenotypic_features=pfs)
        phenopackets.append(pp)


In [None]:
# kg = OntologyClass(id='UCUM:kg', label='kilogram')

# Age values are written as "<years>" or "<years>/<months>", e.g. '00/10'.
age_pt = re.compile(r'(?P<year>\d+)(/(?P<month>\d+))?')


def age_to_iso8601(age_col):
    """Convert an age value such as ``'1/10'`` to an ISO 8601 duration.

    Args:
        age_col: Age text in the form ``<years>`` or ``<years>/<months>``.

    Returns:
        ``'P<years>Y'`` or ``'P<years>Y<months>M'`` (e.g. ``'1/10'`` ->
        ``'P1Y10M'``), or ``None`` when the value is missing, is ``'0'`` or
        does not match ``age_pt``.
    """
    if age_col is None or age_col == '0':
        # TODO - handle; the meaning of a bare '0' is still undecided.
        return None

    found = age_pt.fullmatch(str(age_col).strip())
    if found is None:
        return None

    years = int(found.group('year'))
    months = found.group('month')
    if months is None:
        return f'P{years}Y'
    return f'P{years}Y{int(months)}M'


# Scratch checks of the pattern.
full = age_pt.match('00/10')
y = full.group('year')
semi = age_pt.match('0')

In [None]:
# create method with all the columns


# Age at each scheduled examination, keyed by the (weight, height) column
# headers of that examination. Header spellings are kept exactly as drafted.
_EXAM_AGES = {
    ("Weight-Birth\nkg", "Height-Birth\ncm"): 'P0Y0M',
    ("Weight - U6\nkg", "Height-U6\ncm"): 'P1Y0M',
    ("Weight-U7\nkg", "Height-U7\ncm"): 'P2Y0M',
    ("Weight-U7a\nkg", "Height-U7a\ncm"): 'P3Y0M',
    ("Weight-U8\nkg", "Height-U8\ncm"): 'P4Y0M',
    ("Weight-U9\nkg", "Height-U9\ncm"): 'P5Y0M',
    ("Weight- 6y\nkg", "Height- 6y\ncm"): 'P6Y0M',
    ("Weight-9y\nkg", "Height 9y\ncm"): 'P9Y0M',
    ("Weight-12y\nkg", "Height-12y\ncm"): 'P12Y0M',
    ("Weight-15y\nkg", "Height-15y\ncm"): 'P15Y0M',
    ("Weight-18y\nkg", "Height-18y\ncm"): 'P18Y0M',
    ("Weight -21y\nkg", "Height-21y\ncm"): 'P21Y0M',
}

# (assay id, assay label, unit id, unit label) per measured quantity.
# NOTE(review): the draft listed "UCUM:m" / "metres" for height in one place
# but used `cm` in the code and "cm" in the column headers; cm is used here.
_WEIGHT_SPEC = ('LOINC:3141-9', 'Body weight Measured', 'UCUM:kg', 'kilogram')
_HEIGHT_SPEC = ('LOINC:3137-7', 'Body height Measured', 'UCUM:cm', 'centimeter')


def _exam_age(weight_header, height_header):
    """Return the examination age for a pair of column headers, or None."""
    return _EXAM_AGES.get((weight_header, height_header))


def _measured_value(raw):
    """Return ``raw`` as a float, or None when it is missing or not numeric."""
    try:
        number = float(raw)
    except (TypeError, ValueError):
        return None
    if number != number:  # NaN marks a missing cell
        return None
    return number


def create_measurements(weight_col: str, height_col: str, age_col: str):
    """Build weight and height ``Measurement`` messages for one examination.

    Args:
        weight_col: Body weight in kg (number or numeric text).
        height_col: Body height in cm (number or numeric text).
        age_col: Age at the examination as an ISO 8601 duration (for example
            a value from ``_exam_age``). When empty, no observation time is
            set.

    Returns:
        A list with up to two ``Measurement`` messages, weight first. A
        quantity whose value is missing or not numeric is skipped.
    """
    measurements = []
    for raw, spec in ((weight_col, _WEIGHT_SPEC), (height_col, _HEIGHT_SPEC)):
        number = _measured_value(raw)
        if number is None:
            continue

        assay_id, assay_label, unit_id, unit_label = spec
        measurement = Measurement(assay=OntologyClass(id=assay_id, label=assay_label))
        # Measurement.value wraps the quantity; accessing the sub-message and
        # copying into it avoids constructing the wrapper by hand.
        measurement.value.quantity.CopyFrom(
            Quantity(unit=OntologyClass(id=unit_id, label=unit_label), value=number)
        )
        if age_col:
            measurement.time_observed.age.iso8601duration = age_col
        measurements.append(measurement)

    return measurements