In [1]:
import polars as pl # the same as pandas just faster
from loguru import logger

import configparser
from pathlib import Path
from datetime import datetime

from ERKER2Phenopackets.src.utils import write_files
from ERKER2Phenopackets.src.MC4R import map_mc4r2phenopackets
from ERKER2Phenopackets.src.utils import PolarsUtils

### Get path to notebook from config file

In [2]:
config = configparser.ConfigParser()
config.read('../../data/config/config.cfg')
real_data = False

if real_data:
    path = Path(config.get('Paths', 'mc4r_path')) 
else:
    path = Path(config.get('Paths', 'synth_data_path'))

phenopackets_out = Path(config.get('Paths', 'phenopackets_out'))

### Set up logger using loguru

In [3]:
cur_time = datetime.now().strftime("%Y-%m-%d-%H%M") # get cur time for unique dir name


logger.debug('test test  t')

[32m2023-09-11 12:57:42.288[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [34m[1mtest test  t[0m


## Set Creator Tag

In [4]:
created_by = config.get('Constants', 'creator_tag')

print(f'Creator tag: {created_by}')

Creator tag: P. Robinson, MD, D. Danis, PhD, A. Graefe, F. Rehburg


### Read data in

In [5]:
df = pl.read_csv(path)


df.head(5)

record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,sct_38866009,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
8,"""YD053""","""2023-09-05""",1995,"""sct_248153007""",,,"""sct_414408004""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2007-17-20""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2008.0,11.0,19.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
37,"""ER019""","""2023-09-05""",1992,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2015-14-18""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2014.0,5.0,5.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
66,"""LD045""","""2023-09-05""",1997,"""sct_248152002""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2001-7-16""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2018.0,7.0,7.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
35,"""AM034""","""2023-09-05""",1994,"""sct_248152002""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2008-6-14""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2009.0,7.0,10.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
88,"""NS054""","""2023-09-05""",1995,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2008-17-4""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2019.0,2.0,18.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Get some info about the data

#### Get number of rows and columns

In [6]:
print(f'Number of rows: {PolarsUtils.get_num_rows(df)}')
print(f'Number of cols: {PolarsUtils.get_num_cols(df)}')

Number of rows: 317
Number of cols: 50


In [7]:
df.describe()

describe,record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
str,f64,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""count""",50.0,"""50""","""50""",50.0,"""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""",50.0,50.0,50.0,"""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""",…,"""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50"""
"""null_count""",0.0,"""0""","""0""",0.0,"""0""","""50""","""50""","""5""","""0""","""50""","""50""","""0""","""0""","""50""","""50""","""0""","""0""","""50""","""0""","""50""","""50""","""50""","""0""",1.0,1.0,2.0,"""0""","""50""","""0""","""0""","""0""","""0""","""50""","""0""","""0""","""50""",…,"""50""","""50""","""0""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""50""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""50""","""0""","""50""","""50"""
"""mean""",44.86,,,1998.94,,,,,,,,,,,,,,,,,,,,2011.122449,5.857143,15.375,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""std""",29.624872,,,7.028833,,,,,,,,,,,,,,,,,,,,4.871827,3.272359,8.001662,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""min""",1.0,"""AA009""","""2023-09-05""",1988.0,"""sct_248152002""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_255398004""","""sct_1220561009…",,,"""2001-7-16""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2000.0,1.0,3.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
"""25%""",18.0,,,1995.0,,,,,,,,,,,,,,,,,,,,2008.0,3.0,9.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""50%""",47.0,,,1997.0,,,,,,,,,,,,,,,,,,,,2010.0,6.0,17.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""75%""",70.0,,,2002.0,,,,,,,,,,,,,,,,,,,,2015.0,9.0,21.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""max""",92.0,"""ZK036""","""2023-09-05""",2015.0,"""sct_248153007""",,,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2020-12-2""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2020.0,11.0,29.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Display unique values for each column

In [8]:
df.unique()

record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,sct_38866009,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
5,"""AK050""","""2023-09-05""",1988,"""sct_248152002""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_255398004""","""sct_1220561009…",,,"""2018-7-15""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2020.0,11.0,6.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
48,"""AA009""","""2023-09-05""",2013,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2019-6-15""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",,2.0,10.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
92,"""AM030""","""2023-09-05""",2014,"""sct_248152002""",,,,"""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2006-1-2""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2014.0,7.0,29.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
80,"""AS002""","""2023-09-05""",2001,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2003-2-30""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2009.0,7.0,26.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
88,"""DM090""","""2023-09-05""",2015,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2008-15-22""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2007.0,1.0,18.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
75,"""TY097""","""2023-09-05""",1993,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2013-17-12""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2009.0,6.0,8.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
19,"""ZK036""","""2023-09-05""",2015,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2016-14-11""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2013.0,6.0,17.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
79,"""EG049""","""2023-09-05""",1995,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2015-9-31""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2008.0,6.0,15.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
55,"""AV020""","""2023-09-05""",1996,"""sct_248152002""",,,,"""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2006-14-18""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2001.0,3.0,21.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
32,"""AS014""","""2023-09-05""",1999,"""sct_248152002""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2017-11-10""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2010.0,2.0,9.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Get number of null values for each column

In [9]:
PolarsUtils.null_value_analysis(df, verbose=True)

There are 232/317 columns with only null values in the data
There are 260/317 columns with at least one null value in the data


variable,null_count,all_null
str,u32,bool
"""sct_263495000""",50,false
"""sct_315354004""",50,false
"""sct_364699009""",5,false
"""sct_399753006""",50,false
"""sct_420259009""",50,false
"""sct_16100001""",50,false
"""sct_16100001_s…",50,false
"""sct_276239002_…",50,false
"""sct_405795006_…",50,false
"""sct_405795006_…",50,false


## Preprocessing
- remove null colus
- clean up data
- add id col to data (col name `'mc4r_id'`)

### Remove null cols

In [10]:
df = PolarsUtils.drop_null_cols(df, remove_all_null=True, remove_any_null=False)
df.head(5)

Dropped 232 columns. 85 columns remaining.


record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_364699009,sct_278844005,sct_767023003,sct_184305005_rd,sct_769681006_center,sct_440377005,sct_424850005,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_icd10gm,sct_439401001_alphaid,sct_432213005,sct_263493007,sct_116694002,sct_246454002,sct_439401001_ver,sct_103330002,sct_439401001_val,sct_39154008_hpo,sct_406522009,ln_48007_9,ln_48007_9_mitoch,sct_439401001_gen_val,ln_ll4048_6,sct_439401001_omim_g_1,…,sct_55446002_str_2,ln_48004_6_2,ln_48005_3_2,ln_48018_6_2,ln_48002_0_2,ln_48019_4_2,ln_53037_8_2,ln_62374_4_3,ln_48004_6_3,ln_48018_6_3,ln_48019_4_3,ln_53037_8_3,sct_8116006_1,sct_8116006_1_date,sct_8116006_2,sct_8116006_2_date,sct_8116006_3,sct_8116006_3_date,sct_8116006_4,sct_8116006_4_date,sct_72705000_rd,sct_160430005,sct_66839005_rd,sct_160436004,sct_82101005_rd,sct_75226009_rd,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_21134002_class
i64,str,str,i64,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
8,"""YD053""","""2023-09-05""",1995,"""sct_248153007""","""sct_414408004""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2007-17-20""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2008.0,11.0,19.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2014-05-07""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""",…,,,,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""","""SO_1000002""",,,"""2008-01-13""","""HP:0025499""",,,"""2001-03-17""",,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
37,"""ER019""","""2023-09-05""",1992,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2015-14-18""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2014.0,5.0,5.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2015-02-24""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,,,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""",,,,"""2004-01-26""",,,"""HP:0025499""",,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
66,"""LD045""","""2023-09-05""",1997,"""sct_248152002""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2001-7-16""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2018.0,7.0,7.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2007-12-12""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,,,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""",,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
35,"""AM034""","""2023-09-05""",1994,"""sct_248152002""","""sct_14045001""","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2008-6-14""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2009.0,7.0,10.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2008-06-27""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,,,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""",,,,,"""HP:0025499""",,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
88,"""NS054""","""2023-09-05""",1995,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2008-17-4""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2019.0,2.0,18.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2013-04-10""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""",…,,"""NM_005912.3:c.…",,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""",,,"""HP:0025499""",,,,,"""2004-09-04""",,"""2002-01-12""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"


### Update id column
1. Drop old ID (has duplicate values)
2. Initialize new ID column

In [11]:
df.drop_in_place('record_id');  # entferne alte ID

In [12]:
df = PolarsUtils.add_id_col(df, id_col_name='mc4r_id', id_datatype = str)  # fuege neue ID hinzu
df.head(5)

mc4r_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_364699009,sct_278844005,sct_767023003,sct_184305005_rd,sct_769681006_center,sct_440377005,sct_424850005,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_icd10gm,sct_439401001_alphaid,sct_432213005,sct_263493007,sct_116694002,sct_246454002,sct_439401001_ver,sct_103330002,sct_439401001_val,sct_39154008_hpo,sct_406522009,ln_48007_9,ln_48007_9_mitoch,sct_439401001_gen_val,ln_ll4048_6,sct_439401001_omim_g_1,…,sct_55446002_str_2,ln_48004_6_2,ln_48005_3_2,ln_48018_6_2,ln_48002_0_2,ln_48019_4_2,ln_53037_8_2,ln_62374_4_3,ln_48004_6_3,ln_48018_6_3,ln_48019_4_3,ln_53037_8_3,sct_8116006_1,sct_8116006_1_date,sct_8116006_2,sct_8116006_2_date,sct_8116006_3,sct_8116006_3_date,sct_8116006_4,sct_8116006_4_date,sct_72705000_rd,sct_160430005,sct_66839005_rd,sct_160436004,sct_82101005_rd,sct_75226009_rd,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_21134002_class
str,str,str,i64,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""0""","""YD053""","""2023-09-05""",1995,"""sct_248153007""","""sct_414408004""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2007-17-20""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2008.0,11.0,19.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2014-05-07""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""",…,,,,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""","""SO_1000002""",,,"""2008-01-13""","""HP:0025499""",,,"""2001-03-17""",,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
"""1""","""ER019""","""2023-09-05""",1992,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2015-14-18""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2014.0,5.0,5.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2015-02-24""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,,,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""",,,,"""2004-01-26""",,,"""HP:0025499""",,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
"""2""","""LD045""","""2023-09-05""",1997,"""sct_248152002""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2001-7-16""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2018.0,7.0,7.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2007-12-12""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,,,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""",,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
"""3""","""AM034""","""2023-09-05""",1994,"""sct_248152002""","""sct_14045001""","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2008-6-14""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2009.0,7.0,10.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2008-06-27""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,,,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""",,,,,"""HP:0025499""",,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
"""4""","""NS054""","""2023-09-05""",1995,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2008-17-4""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2019.0,2.0,18.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2013-04-10""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""",…,,"""NM_005912.3:c.…",,"""HGNC:6932""",,,,"""ln_LA26806-2""",,"""HGNC:6932""",,,"""HP:0025499""",,,,,"""2004-09-04""",,"""2002-01-12""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"


### Parsing step

In [13]:
from ERKER2Phenopackets.src.MC4R.MappingDicts import phenotype_label_map_erker2phenopackets
from ERKER2Phenopackets.src.MC4R.MappingDicts import allele_label_map_erker2phenopackets
from ERKER2Phenopackets.src.MC4R import zygosity_map_erker2phenopackets, sex_map_erker2phenopackets
from ERKER2Phenopackets.src.MC4R.ParseMC4R import parse_date_of_diagnosis, parse_year_of_birth, \
parse_phenotyping_date, parse_omim

config = configparser.ConfigParser()
config.read('../../data/config/config.cfg')
no_mutation = config.get('NoValue', 'mutation')
no_phenotype = config.get('NoValue', 'phenotype')
no_date = config.get('NoValue', 'date')
no_omim = config.get('NoValue', 'omim')

# sct_184099003_y (year of birth)
df = PolarsUtils.map_col(df, map_from='sct_184099003_y', map_to='parsed_year_of_birth',\
    mapping=parse_year_of_birth)

# sct_281053000 (sex)
df = PolarsUtils.map_col(df, map_from='sct_281053000', map_to='parsed_sex',\
    mapping=sex_map_erker2phenopackets)

# sct_432213005 (date of diagnosis)
df = PolarsUtils.map_col(df, map_from='sct_432213005',\
    map_to='parsed_date_of_diagnosis' ,mapping=parse_date_of_diagnosis)
df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_diagnosis', no_date)

# # ln_48007_9 (zygosity)
df = PolarsUtils.map_col(df, map_from='ln_48007_9', map_to='parsed_zygosity',\
    mapping=zygosity_map_erker2phenopackets)
df = PolarsUtils.map_col(df, map_from='ln_48007_9', map_to='allele_label', \
                         mapping=allele_label_map_erker2phenopackets)


# sct_439401001_orpha (diagnosis (ORPHA))
# does not require mapping

# sct_439401001_omim_g_1, sct_439401001_omim_g_2, sct_439401001_omim_g_3 \
# (Primärdiagnose OMIM)
df = PolarsUtils.map_col(df, map_from='sct_439401001_omim_g_1',\
    map_to='parsed_omim_1' ,mapping=parse_omim)
df = PolarsUtils.fill_null_vals(df, 'parsed_omim_1', no_omim)
    
df = PolarsUtils.map_col(df, map_from='sct_439401001_omim_g_2',\
    map_to='parsed_omim_2' ,mapping=parse_omim)
df = PolarsUtils.fill_null_vals(df, 'parsed_omim_2', no_omim)


# ln_48005_3_1, ln_48005_3_2, ln_48005_3_3 (mutation p.HGVS)
df = PolarsUtils.fill_null_vals(df, 'ln_48005_3_1', no_mutation)
df = PolarsUtils.fill_null_vals(df, 'ln_48005_3_2', no_mutation)
if 'ln_48005_3_3' in df.columns:
    df = PolarsUtils.fill_null_vals(df, 'ln_48005_3_3', no_mutation)

# ln_48004_6_1, ln_48004_6_2, ln_48004_6_3 (mutation c.HGVS)
df = PolarsUtils.fill_null_vals(df, 'ln_48004_6_1', no_mutation)
df = PolarsUtils.fill_null_vals(df, 'ln_48004_6_2', no_mutation)
if 'ln_48004_6_3' in df.columns:
    df = PolarsUtils.fill_null_vals(df, 'ln_48004_6_3', no_mutation)

# ln_48018_6_1 (gene HGNC)
# does not require mapping

# sct_8116006_1, sct_8116006_2, sct_8116006_3, sct_8116006_4, sct_8116006_5 (phenotype\
  #classification
df = PolarsUtils.fill_null_vals(df, 'sct_8116006_1', no_phenotype)
df = PolarsUtils.fill_null_vals(df, 'sct_8116006_2', no_phenotype)
df = PolarsUtils.fill_null_vals(df, 'sct_8116006_3', no_phenotype)
df = PolarsUtils.fill_null_vals(df, 'sct_8116006_4', no_phenotype)
if 'sct_8116006_5' in df.columns:
    df = PolarsUtils.fill_null_vals(df, 'sct_8116006_5', no_phenotype)

# sct_8116006_1_date, sct_8116006_2_date, sct_8116006_3_date, sct_8116006_4_date, \
    # sct_8116006_5_date (dates of phenotype determination)
df = PolarsUtils.map_col(df, map_from='sct_8116006_1_date', map_to='parsed_date_of_phenotyping1', mapping=parse_phenotyping_date)
df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping1',no_date)

df = PolarsUtils.map_col(df, map_from='sct_8116006_2_date', map_to='parsed_date_of_phenotyping2', mapping=parse_phenotyping_date)
df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping2',no_date)

if 'sct_8116006_3_date' in df.columns:
    df = PolarsUtils.map_col(df, map_from='sct_8116006_3_date', map_to='parsed_date_of_phenotyping3', mapping=parse_phenotyping_date)
    df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping3',no_date)
    
if 'sct_8116006_4_date' in df.columns:
    df = PolarsUtils.map_col(df, map_from='sct_8116006_4_date', map_to='parsed_date_of_phenotyping4', mapping=parse_phenotyping_date)
    df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping4',no_date)
    
if 'sct_8116006_5_date' in df.columns:
    df = PolarsUtils.map_col(df, map_from='sct_8116006_5_date', map_to='parsed_date_of_phenotyping5', mapping=parse_phenotyping_date)  
    df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping5',no_date)  


# phenotype label
df = PolarsUtils.map_col(df, map_from='sct_8116006_1', map_to='parsed_phenotype_label1', mapping=phenotype_label_map_erker2phenopackets)
df = PolarsUtils.map_col(df, map_from='sct_8116006_2', map_to='parsed_phenotype_label2', mapping=phenotype_label_map_erker2phenopackets)
df = PolarsUtils.map_col(df, map_from='sct_8116006_3', map_to='parsed_phenotype_label3', mapping=phenotype_label_map_erker2phenopackets)
df = PolarsUtils.map_col(df, map_from='sct_8116006_4', map_to='parsed_phenotype_label4', mapping=phenotype_label_map_erker2phenopackets)
if 'sct_8116006_5' in df.columns:
    df = PolarsUtils.map_col(df, map_from='sct_8116006_5', map_to='parsed_phenotype_label5', mapping=phenotype_label_map_erker2phenopackets)

In [14]:
df.head()

mc4r_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_364699009,sct_278844005,sct_767023003,sct_184305005_rd,sct_769681006_center,sct_440377005,sct_424850005,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_icd10gm,sct_439401001_alphaid,sct_432213005,sct_263493007,sct_116694002,sct_246454002,sct_439401001_ver,sct_103330002,sct_439401001_val,sct_39154008_hpo,sct_406522009,ln_48007_9,ln_48007_9_mitoch,sct_439401001_gen_val,ln_ll4048_6,sct_439401001_omim_g_1,…,sct_8116006_2_date,sct_8116006_3,sct_8116006_3_date,sct_8116006_4,sct_8116006_4_date,sct_72705000_rd,sct_160430005,sct_66839005_rd,sct_160436004,sct_82101005_rd,sct_75226009_rd,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_21134002_class,parsed_year_of_birth,parsed_sex,parsed_date_of_diagnosis,parsed_zygosity,allele_label,parsed_omim_1,parsed_omim_2,parsed_date_of_phenotyping1,parsed_date_of_phenotyping2,parsed_date_of_phenotyping3,parsed_date_of_phenotyping4,parsed_phenotype_label1,parsed_phenotype_label2,parsed_phenotype_label3,parsed_phenotype_label4
str,str,str,i64,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""0""","""YD053""","""2023-09-05""",1995,"""sct_248153007""","""sct_414408004""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2007-17-20""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2008.0,11.0,19.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2014-05-07""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""",…,,"""NO_PHENOTYPE""","""2001-03-17""","""NO_PHENOTYPE""",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""1995-01-01T00:…","""MALE""","""2014-05-07T00:…","""GENO:0000135""","""heterozygous""","""NO_OMIM""","""NO_OMIM""","""2008-01-13T00:…","""NO_DATE""","""2001-03-17T00:…","""NO_DATE""",,"""Class I obesit…",,
"""1""","""ER019""","""2023-09-05""",1992,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2015-14-18""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2014.0,5.0,5.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2015-02-24""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,"""HP:0025499""",,"""NO_PHENOTYPE""",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""1992-01-01T00:…","""MALE""","""2015-02-24T00:…","""GENO:0000135""","""heterozygous""","""OMIM:155541.00…","""NO_OMIM""","""2004-01-26T00:…","""NO_DATE""","""NO_DATE""","""NO_DATE""",,,"""Class I obesit…",
"""2""","""LD045""","""2023-09-05""",1997,"""sct_248152002""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2001-7-16""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2018.0,7.0,7.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2007-12-12""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,"""NO_PHENOTYPE""",,"""NO_PHENOTYPE""",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""1997-01-01T00:…","""FEMALE""","""2007-12-12T00:…","""GENO:0000137""","""unspecified zy…","""OMIM:155541.00…","""NO_OMIM""","""NO_DATE""","""NO_DATE""","""NO_DATE""","""NO_DATE""",,,,
"""3""","""AM034""","""2023-09-05""",1994,"""sct_248152002""","""sct_14045001""","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2008-6-14""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2009.0,7.0,10.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2008-06-27""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""",…,,"""NO_PHENOTYPE""",,"""NO_PHENOTYPE""",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""1994-01-01T00:…","""FEMALE""","""2008-06-27T00:…","""GENO:0000135""","""heterozygous""","""OMIM:155541.00…","""NO_OMIM""","""NO_DATE""","""NO_DATE""","""NO_DATE""","""NO_DATE""",,"""Class I obesit…",,
"""4""","""NS054""","""2023-09-05""",1995,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2008-17-4""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2019.0,2.0,18.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2013-04-10""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""",…,,"""NO_PHENOTYPE""","""2004-09-04""","""NO_PHENOTYPE""","""2002-01-12""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""1995-01-01T00:…","""MALE""","""2013-04-10T00:…","""GENO:0000135""","""heterozygous""","""NO_OMIM""","""NO_OMIM""","""NO_DATE""","""NO_DATE""","""2004-09-04T00:…","""2002-01-12T00:…","""Class I obesit…",,,


## Map to phenopackets

In [15]:
print(cur_time)

2023-09-11-1257


In [16]:
from ERKER2Phenopackets.src.MC4R.MapMC4R import _map_chunk


phenopackets = _map_chunk(df, cur_time[:10]) #map_mc4r2phenopackets(df, cur_time)

[32m2023-09-11 12:57:43.136[0m | [1mINFO    [0m | [36msrc.MC4R.MapMC4R[0m:[36m_map_chunk[0m:[36m69[0m - [1mCurrently working on thread 10016[0m
[32m2023-09-11 12:57:43.140[0m | [34m[1mDEBUG   [0m | [36msrc.MC4R.MapMC4R[0m:[36m_map_chunk[0m:[36m75[0m - [34m[1m<class 'google.protobuf.timestamp_pb2.Timestamp'>[0m
[32m2023-09-11 12:57:43.145[0m | [34m[1mDEBUG   [0m | [36msrc.MC4R.MapMC4R[0m:[36m_map_chunk[0m:[36m89[0m - [34m[1m10016: ID: 0[0m
[32m2023-09-11 12:57:43.148[0m | [34m[1mDEBUG   [0m | [36msrc.MC4R.MapMC4R[0m:[36m_map_chunk[0m:[36m98[0m - [34m[1m10016: row["parsed_year_of_birth"]='1995-01-01T00:00:00.00Z'[0m
[32m2023-09-11 12:57:43.151[0m | [34m[1mDEBUG   [0m | [36msrc.MC4R.MapMC4R[0m:[36m_map_chunk[0m:[36m99[0m - [34m[1m10016: row["parsed_sex"]='MALE'[0m
[32m2023-09-11 12:57:43.154[0m | [34m[1mDEBUG   [0m | [36msrc.MC4R.MapMC4R[0m:[36m_map_disease[0m:[36m472[0m - [34m[1m2014-05-07T00:00:00.00Z[0m


## Write to json

In [17]:
phenopackets_out_dir = phenopackets_out / cur_time # create dir for output

write_files(phenopackets, phenopackets_out_dir)

Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Successfully wrote phenopacket to JSON ..\..\data\out\phenopackets\2023-09-11-1257
Succ