In [26]:
import polars as pl # the same as pandas just faster

import configparser
from pathlib import Path
from datetime import datetime
import sys

sys.path.append('../..') # for imports to work
from src.utils import write_files
from src.MC4R import map_mc4r2phenopackets
from src.utils import PolarsUtils

### Get path to notebook from config file

In [27]:
config = configparser.ConfigParser()
config.read('../../data/config/config.cfg')
real_data = True

if real_data:
    path = Path(config.get('Paths', 'mc4r_path')) 
else:
    path = Path(config.get('Paths', 'synth_data_path'))

phenopackets_out = Path(config.get('Paths', 'phenopackets_out'))

### Set Creator Tag

In [28]:
created_by = config.get('Creator', 'creator_tag')

print(f'Creator tag: {created_by}')

Creator tag: P. Robinson, MD, D. Danis, PhD, A. Graefe, F. Rehburg


### Read data in

In [29]:
df = pl.read_csv(path)
df.head(5)

record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,sct_439401001_bodysite,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_441898007_data,sct_441898007_eu,sct_441898007_int,sct_441898007_aff,sct_441898007_gen,sct_441898007_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
1,"""123AB""",,2004,,"""sct_248152002""",,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2009-15-11""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",2009.0,3.0,19.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
2,"""234BC""",,2001,,"""sct_248153007""",,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2011-10-21""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2009.0,7.0,24.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
3,"""345CD""",,2002,,"""sct_248152002""",,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2004-5-21""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",,,,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
4,"""456DE""",,2005,,"""sct_248152002""",,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2017-2-18""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",2016.0,14.0,19.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
5,"""567EF""",,1999,,"""sct_248153007""",,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""20011-6-14""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",2009.0,9.0,14.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Get some info about the data

#### Get number of rows and columns

In [30]:
print(f'Number of rows: {PolarsUtils.get_num_rows(df)}')
print(f'Number of cols: {PolarsUtils.get_num_cols(df)}')

Number of rows: 317
Number of cols: 5


In [31]:
df.describe()

describe,record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_441898007_data,sct_441898007_eu,sct_441898007_int,sct_441898007_aff,sct_441898007_gen,sct_441898007_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
str,f64,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""count""",5.0,"""5""","""5""",5.0,"""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""",5.0,5.0,5.0,"""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""",…,"""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5"""
"""null_count""",0.0,"""0""","""5""",0.0,"""5""","""0""","""5""","""0""","""0""","""5""","""5""","""0""","""0""","""5""","""5""","""0""","""0""","""5""","""0""","""5""","""5""","""5""","""0""",1.0,1.0,1.0,"""0""","""5""","""0""","""0""","""0""","""0""","""5""","""0""","""0""","""5""",…,"""5""","""5""","""0""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""5""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""5""","""0""","""5""","""5"""
"""mean""",3.0,,,2002.2,,,,,,,,,,,,,,,,,,,,2010.75,8.25,19.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""std""",1.581139,,,2.387467,,,,,,,,,,,,,,,,,,,,3.5,4.573474,4.082483,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""min""",1.0,"""123AB""",,1999.0,,"""sct_248152002""",,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""20011-6-14""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",2009.0,3.0,14.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
"""25%""",2.0,,,2001.0,,,,,,,,,,,,,,,,,,,,2009.0,7.0,19.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""50%""",3.0,,,2002.0,,,,,,,,,,,,,,,,,,,,2009.0,9.0,19.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""75%""",4.0,,,2004.0,,,,,,,,,,,,,,,,,,,,2016.0,14.0,24.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""max""",5.0,"""567EF""",,2005.0,,"""sct_248153007""",,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2017-2-18""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2016.0,14.0,24.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Display unique values for each column

In [32]:
df.unique()

record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,sct_439401001_bodysite,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_441898007_data,sct_441898007_eu,sct_441898007_int,sct_441898007_aff,sct_441898007_gen,sct_441898007_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
3,"""345CD""",,2002,,"""sct_248152002""",,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2004-5-21""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",,,,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
5,"""567EF""",,1999,,"""sct_248153007""",,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""20011-6-14""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",2009.0,9.0,14.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
1,"""123AB""",,2004,,"""sct_248152002""",,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2009-15-11""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",2009.0,3.0,19.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
2,"""234BC""",,2001,,"""sct_248153007""",,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2011-10-21""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2009.0,7.0,24.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
4,"""456DE""",,2005,,"""sct_248152002""",,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2017-2-18""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",2016.0,14.0,19.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Get number of null values for each column

In [33]:
PolarsUtils.null_value_analysis(df, verbose=True)

There are 244/317 columns with only null values in the data
There are 261/317 columns with at least one null value in the data


variable,null_count,all_null
str,u32,bool
"""sct_399423000""",5,false
"""sct_281053000""",5,false
"""sct_315354004""",5,false
"""sct_399753006""",5,false
"""sct_420259009""",5,false
"""sct_16100001""",5,false
"""sct_16100001_s…",5,false
"""sct_276239002_…",5,false
"""sct_405795006_…",5,false
"""sct_405795006_…",5,false


## Preprocessing
- remove null colus
- clean up data
- add id col to data (col name `'mc4r_id'`)

### Remove null cols

In [34]:
df = PolarsUtils.drop_null_cols(df, remove_all_null=True, remove_any_null=False)
df.head(5)

Dropped 244 columns. 73 columns remaining.


record_id,sct_422549004,sct_184099003_y,sct_263495000,sct_364699009,sct_278844005,sct_767023003,sct_184305005_rd,sct_769681006_center,sct_440377005,sct_424850005,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_icd10gm,sct_439401001_alphaid,sct_432213005,sct_263493007,sct_116694002,sct_246454002,sct_439401001_ver,sct_103330002,sct_439401001_val,sct_39154008_hpo,sct_406522009,ln_48007_9,ln_48007_9_mitoch,sct_439401001_gen_val,ln_ll4048_6,sct_439401001_omim_g_1,ln_62374_4_1,ln_48004_6_1,ln_48005_3_1,ln_48018_6_1,ln_48002_0_1,ln_48019_4_1,ln_53037_8_1,ln_93044_6_1,ln_62374_4_2,ln_48018_6_2,ln_62374_4_3,ln_48018_6_3,sct_8116006_1,sct_8116006_1_date,sct_8116006_2,sct_8116006_2_date,sct_8116006_3,sct_8116006_3_date,sct_8116006_4,sct_8116006_4_date,sct_72705000_rd,sct_160430005,sct_66839005_rd,sct_160436004,sct_82101005_rd,sct_75226009_rd,sct_309370004_research,sct_309370004_narse,sct_441898007_data,sct_441898007_eu,sct_441898007_int,sct_441898007_aff,sct_441898007_gen,sct_441898007_case,sct_123038009,sct_840566006,sct_21134002_class
i64,str,i64,str,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
1,"""123AB""",2004,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2009-15-11""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",2009.0,3.0,19.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2009-04-19""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""1.555.410.021""","""ln_LA26806-2""","""NM_005912.3:c.…","""NP_005903.2:p.…","""HGNC:6932""","""sct_1220561009…","""SO_1000002""","""ln_LA6675-8""","""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025499""","""2001-10-15""","""HP:0025499""","""2004-12-14""",,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
2,"""234BC""",2001,"""sct_248153007""","""sct_90027003""","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2011-10-21""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2009.0,7.0,24.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2012-05-16""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""1.555.410.024""","""ln_LA26806-2""",,,"""HGNC:6932""",,,,"""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025502""","""2005-10-21""","""HP:0025499""","""2008-09-21""",,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
3,"""345CD""",2002,"""sct_248152002""","""sct_90027003""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2004-5-21""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…",,"""ln_LA26806-2""","""NM_005912.3:c.…","""NP_005903.2:p.…","""HGNC:6932""","""sct_1220561009…","""SO_1000002""","""ln_LA26333-7""","""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025499""","""2003-05-22""",,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
4,"""456DE""",2005,"""sct_248152002""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2017-2-18""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",2016.0,14.0,19.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2019-10-05""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…",,"""ln_LA26806-2""",,,,,,,"""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025500""","""2001-05-19""","""HP:0025501""","""2002-04-19""","""HP:0025499""","""2005-09-10""",,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
5,"""567EF""",1999,"""sct_248153007""","""sct_372148003_…","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""20011-6-14""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",2009.0,9.0,14.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2011-05-07""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""1.555.410.024""","""ln_LA26806-2""","""NM_005912.3:c.…","""NP_005903.2:p.…","""HGNC:6932""","""sct_1220561009…","""SO_1000002""","""ln_LA6675-8""","""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025499""","""2005-02-15""","""HP:0025500""","""2006-02-24""","""HP:0025500""","""2007-11-15""","""HP:0025500""","""2008-11-12""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"


### Update id column
1. Drop old ID (has duplicate values)
2. Initialize new ID column

In [35]:
df.drop_in_place('record_id');

In [36]:
df = PolarsUtils.add_id_col(df, id_col_name='mc4r_id')
df.head(5)

mc4r_id,sct_422549004,sct_184099003_y,sct_263495000,sct_364699009,sct_278844005,sct_767023003,sct_184305005_rd,sct_769681006_center,sct_440377005,sct_424850005,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_icd10gm,sct_439401001_alphaid,sct_432213005,sct_263493007,sct_116694002,sct_246454002,sct_439401001_ver,sct_103330002,sct_439401001_val,sct_39154008_hpo,sct_406522009,ln_48007_9,ln_48007_9_mitoch,sct_439401001_gen_val,ln_ll4048_6,sct_439401001_omim_g_1,ln_62374_4_1,ln_48004_6_1,ln_48005_3_1,ln_48018_6_1,ln_48002_0_1,ln_48019_4_1,ln_53037_8_1,ln_93044_6_1,ln_62374_4_2,ln_48018_6_2,ln_62374_4_3,ln_48018_6_3,sct_8116006_1,sct_8116006_1_date,sct_8116006_2,sct_8116006_2_date,sct_8116006_3,sct_8116006_3_date,sct_8116006_4,sct_8116006_4_date,sct_72705000_rd,sct_160430005,sct_66839005_rd,sct_160436004,sct_82101005_rd,sct_75226009_rd,sct_309370004_research,sct_309370004_narse,sct_441898007_data,sct_441898007_eu,sct_441898007_int,sct_441898007_aff,sct_441898007_gen,sct_441898007_case,sct_123038009,sct_840566006,sct_21134002_class
i64,str,i64,str,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
0,"""123AB""",2004,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2009-15-11""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",2009.0,3.0,19.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2009-04-19""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""1.555.410.021""","""ln_LA26806-2""","""NM_005912.3:c.…","""NP_005903.2:p.…","""HGNC:6932""","""sct_1220561009…","""SO_1000002""","""ln_LA6675-8""","""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025499""","""2001-10-15""","""HP:0025499""","""2004-12-14""",,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
1,"""234BC""",2001,"""sct_248153007""","""sct_90027003""","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2011-10-21""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2009.0,7.0,24.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2012-05-16""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""1.555.410.024""","""ln_LA26806-2""",,,"""HGNC:6932""",,,,"""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025502""","""2005-10-21""","""HP:0025499""","""2008-09-21""",,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
2,"""345CD""",2002,"""sct_248152002""","""sct_90027003""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2004-5-21""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…",,"""ln_LA26806-2""","""NM_005912.3:c.…","""NP_005903.2:p.…","""HGNC:6932""","""sct_1220561009…","""SO_1000002""","""ln_LA26333-7""","""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025499""","""2003-05-22""",,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
3,"""456DE""",2005,"""sct_248152002""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2017-2-18""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",2016.0,14.0,19.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2019-10-05""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…",,"""ln_LA26806-2""",,,,,,,"""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025500""","""2001-05-19""","""HP:0025501""","""2002-04-19""","""HP:0025499""","""2005-09-10""",,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
4,"""567EF""",1999,"""sct_248153007""","""sct_372148003_…","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""20011-6-14""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",2009.0,9.0,14.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2011-05-07""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""1.555.410.024""","""ln_LA26806-2""","""NM_005912.3:c.…","""NP_005903.2:p.…","""HGNC:6932""","""sct_1220561009…","""SO_1000002""","""ln_LA6675-8""","""sct_1220561009…","""ln_LA26806-2""","""HGNC:6932""","""ln_LA26806-2""","""HGNC:6932""","""HP:0025499""","""2005-02-15""","""HP:0025500""","""2006-02-24""","""HP:0025500""","""2007-11-15""","""HP:0025500""","""2008-11-12""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"


### Parsing step

In [37]:
from src.MC4R import zygosity_map_erker2phenopackets, sex_map_erker2phenopackets
from src.MC4R.ParseMC4R import parse_date_of_diagnosis, parse_year_of_birth, parse_phenotyping_date

# sct_184099003_y (year of birth)
df = PolarsUtils.map_col(df, map_from='sct_184099003_y', map_to='parsed_year_of_birth',\
    mapping=parse_year_of_birth)

# sct_281053000 (sex)
df = PolarsUtils.map_col(df, map_from='sct_281053000', map_to='parsed_sex',\
    mapping=sex_map_erker2phenopackets)

# sct_432213005 (date of diagnosis)
df = PolarsUtils.map_col(df, map_from='sct_432213005',\
    map_to='parsed_date_of_diagnosis' ,mapping=parse_date_of_diagnosis)

# ln_48007_9 (zygosity)
df = PolarsUtils.map_col(df, map_from='ln_48007_9', map_to='parsed_zygosity',\
    mapping=zygosity_map_erker2phenopackets)

# sct_439401001_orpha (diagnosis (ORPHA))
# does not require mapping

# ln_81290_9_1, ln_81290_9_2, ln_81290_9_3 (mutation p.HGVS)
# does not require mapping

# ln_48004_6_1, ln_48004_6_2, ln_48004_6_3 (mutation c.HGVS)
# does not require mapping

# ln_48018_6_1, ln_48018_6_2, ln_48018_6_3 (gene HGNC)
# does not require mapping

# sct_8116006_1, sct_8116006_2, sct_8116006_3, sct_8116006_4, sct_8116006_5 (phenotype\
  #classification
# does not require mapping

# sct_8116006_1_date, sct_8116006_2_date, sct_8116006_3_date, sct_8116006_4_date, \
    # sct_8116006_5_date (dates of phenotype determination)
df = PolarsUtils.map_col(df, map_from='sct_8116006_1_date', map_to='parsed_dates_of_phenotyping1', mapping=parse_phenotyping_date)
df = PolarsUtils.map_col(df, map_from='sct_8116006_2_date', map_to='parsed_dates_of_phenotyping2', mapping=parse_phenotyping_date)
df = PolarsUtils.map_col(df, map_from='sct_8116006_3_date', map_to='parsed_dates_of_phenotyping3', mapping=parse_phenotyping_date)
df = PolarsUtils.map_col(df, map_from='sct_8116006_4_date', map_to='parsed_dates_of_phenotyping4', mapping=parse_phenotyping_date)
df = PolarsUtils.map_col(df, map_from='sct_8116006_5_date', map_to='parsed_dates_of_phenotyping5', mapping=parse_phenotyping_date)


ColumnNotFoundError: sct_281053000

Error originated just after this operation:
DF ["mc4r_id", "sct_422549004", "sct_184099003_y", "sct_263495000"]; PROJECT */74 COLUMNS; SELECTION: "None"

## Map to phenopackets

In [None]:
phenopackets = map_mc4r2phenopackets(df, created_by=created_by)

TypeError: Timestamp() takes no arguments

## Write to json

In [None]:
cur_time = datetime.now().strftime("%Y%m%d-%H%M%S")
phenopackets_out_dir = phenopackets_out / cur_time

write_files(phenopackets, phenopackets_out_dir)