In [52]:
import polars as pl # the same as pandas just faster
from loguru import logger

import configparser
from pathlib import Path
from datetime import datetime
import sys

sys.path.append('../..') # for imports to work
from src.utils import write_files
from src.MC4R import map_mc4r2phenopackets
from src.utils import PolarsUtils

### Set up logger using loguru

In [53]:
logger.remove()  
# Remove default logger (stdout)    
# TODO: activate: logger.add(log_file, level=log_level, rotation="1 week", retention="2 weeks")  # Log to a file     
# You can customize the log format as needed    
logger_format = (         
    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "       
    "<level>{level: <8}</level> | "        
    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "        
    "<level>{message}</level>"    
)
     
logger.add(lambda msg: print(msg, end=""), colorize=True, format=logger_format, level="DEBUG") #level="SUCCESS" # Log to stdout
# LOGGER LEVELS
# 1. trace
# 2. debug
# 3. info
# 4. success
# 5. warning
# 6. error
# 7. critical

5

### Get path to notebook from config file

In [54]:
config = configparser.ConfigParser()
config.read('../../data/config/config.cfg')
real_data = True

if real_data:
    path = Path(config.get('Paths', 'mc4r_path')) 
else:
    path = Path(config.get('Paths', 'synth_data_path'))

phenopackets_out = Path(config.get('Paths', 'phenopackets_out'))

### Set Creator Tag

In [55]:
created_by = config.get('Constants', 'creator_tag')

print(f'Creator tag: {created_by}')

Creator tag: P. Robinson, MD, D. Danis, PhD, A. Graefe, F. Rehburg


### Read data in

In [56]:
df = pl.read_csv(path)
df.head(5)

record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,sct_38866009,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
1,"""FA062""",,2001,"""sct_248153007""",,,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2007-13-24""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2008.0,1.0,23.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
2,"""YA074""",,2002,"""sct_248152002""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2011-10-21""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2011.0,4.0,18.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
3,"""SA099""",,2002,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2002-7-23""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",,,,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
4,"""EA102""",,2007,"""sct_248152002""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2018-9-26""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2018.0,10.0,17.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
5,"""AA070""",,2000,"""sct_248152002""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2009-4-17""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2009.0,7.0,5.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Get some info about the data

#### Get number of rows and columns

In [57]:
print(f'Number of rows: {PolarsUtils.get_num_rows(df)}')
print(f'Number of cols: {PolarsUtils.get_num_cols(df)}')

Number of rows: 317
Number of cols: 98


In [58]:
df.describe()

describe,record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
str,f64,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""count""",98.0,"""98""","""98""",98.0,"""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""",98.0,98.0,98.0,"""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""",…,"""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98"""
"""null_count""",0.0,"""0""","""98""",0.0,"""0""","""98""","""98""","""3""","""0""","""98""","""98""","""0""","""0""","""98""","""98""","""0""","""0""","""98""","""0""","""98""","""98""","""98""","""0""",5.0,5.0,5.0,"""0""","""98""","""0""","""0""","""0""","""0""","""98""","""0""","""0""","""98""",…,"""98""","""98""","""0""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""98""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""98""","""0""","""98""","""98"""
"""mean""",49.5,,,2000.102041,,,,,,,,,,,,,,,,,,,,2010.096774,5.645161,15.075269,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""std""",28.434134,,,7.157276,,,,,,,,,,,,,,,,,,,,5.039118,3.181622,8.106944,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""min""",1.0,"""AA009""",,1988.0,"""sct_248152002""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_255398004""","""sct_1220561009…",,,"""1988-10-21""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",2000.0,1.0,1.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
"""25%""",25.0,,,1995.0,,,,,,,,,,,,,,,,,,,,2007.0,3.0,8.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""50%""",50.0,,,1999.0,,,,,,,,,,,,,,,,,,,,2009.0,6.0,16.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""75%""",74.0,,,2005.0,,,,,,,,,,,,,,,,,,,,2014.0,8.0,21.0,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""max""",98.0,"""ZK036""",,2017.0,"""sct_248153007""",,,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2020-5-16""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2020.0,12.0,29.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Display unique values for each column

In [59]:
df.unique()

record_id,sct_422549004,sct_399423000,sct_184099003_y,sct_281053000,sct_263495000,sct_315354004,sct_364699009,sct_278844005,sct_399753006,sct_420259009,sct_767023003,sct_184305005_rd,sct_16100001,sct_16100001_source,sct_769681006_center,sct_440377005,sct_276239002_aim,sct_424850005,sct_405795006_y,sct_405795006_m,sct_405795006_d,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_412726003,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_orpha_sub,sct_439401001_icd10gm,sct_439401001_alphaid,sct_439401001_alphaidstr,sct_38866009,…,sct_82101005_10_age,sct_82101005_10_gender,sct_75226009_rd,sct_75226009_1_age,sct_75226009_1_gender,sct_75226009_2_age,sct_75226009_2_gender,sct_75226009_3_age,sct_75226009_3_gender,sct_75226009_4_age,sct_75226009_4_gender,sct_75226009_5_age,sct_75226009_5_gender,sct_75226009_6_age,sct_75226009_6_gender,sct_75226009_7_age,sct_75226009_7_gender,sct_75226009_8_age,sct_75226009_8_gender,sct_75226009_9_age,sct_75226009_9_gender,sct_75226009_10_age,sct_75226009_10_gender,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_840566006_specific,sct_21134002_class,sct_21134002_score,erker_v15_complete
i64,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
2,"""YA074""",,2002,"""sct_248152002""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2011-10-21""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2011.0,4.0,18.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
14,"""AB043""",,1999,"""sct_248152002""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2006-1-2""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2006.0,1.0,6.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
15,"""LB004""",,2000,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2008-17-4""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2009.0,5.0,19.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
19,"""HC046""",,1994,"""sct_248153007""",,,"""sct_90027003""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2007-10-10""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2007.0,11.0,5.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
20,"""NC061""",,1996,"""sct_248153007""",,,,"""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2006-14-29""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2007.0,3.0,12.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
23,"""LD045""",,1997,"""sct_248152002""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2007-12-5""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2007.0,12.0,12.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
36,"""HJ081""",,2009,"""sct_248152002""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2015-1-27""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2015.0,2.0,24.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
44,"""RK101""",,2008,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_263659003""","""sct_1220561009…",,,"""2020-12-2""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2020.0,12.0,3.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
52,"""RM084""",,2004,"""sct_248153007""",,,"""sct_14045001""","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2004-6-1""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_261665006""",,,,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,
57,"""NO075""",,1999,"""sct_248153007""",,,"""sct_372148003_…","""CS_MII_Person_…",,,"""sct_41847000""","""sct_1220561009…",,,"""2009-7-21""","""sct_1220561009…",,"""sct_261665006""",,,,"""sct_410672004""",2009.0,7.0,23.0,"""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""",,"""E66.8""","""sct_1220561009…",,,…,,,"""sct_1220561009…",,,,,,,,,,,,,,,,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…",,"""sct_1220561009…",,


#### Get number of null values for each column

In [60]:
PolarsUtils.null_value_analysis(df, verbose=True)

There are 230/317 columns with only null values in the data
There are 261/317 columns with at least one null value in the data


variable,null_count,all_null
str,u32,bool
"""sct_399423000""",98,false
"""sct_263495000""",98,false
"""sct_315354004""",98,false
"""sct_364699009""",3,false
"""sct_399753006""",98,false
"""sct_420259009""",98,false
"""sct_16100001""",98,false
"""sct_16100001_s…",98,false
"""sct_276239002_…",98,false
"""sct_405795006_…",98,false


## Preprocessing
- remove null colus
- clean up data
- add id col to data (col name `'mc4r_id'`)

### Remove null cols

In [61]:
df = PolarsUtils.drop_null_cols(df, remove_all_null=True, remove_any_null=False)
df.head(5)

Dropped 230 columns. 87 columns remaining.


record_id,sct_422549004,sct_184099003_y,sct_281053000,sct_364699009,sct_278844005,sct_767023003,sct_184305005_rd,sct_769681006_center,sct_440377005,sct_424850005,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_icd10gm,sct_439401001_alphaid,sct_432213005,sct_263493007,sct_116694002,sct_246454002,sct_439401001_ver,sct_103330002,sct_439401001_val,sct_39154008_hpo,sct_406522009,ln_48007_9,ln_48007_9_mitoch,sct_439401001_gen_val,ln_ll4048_6,sct_439401001_omim_g_1,sct_439401001_omim_g_2,…,ln_48018_6_2,ln_48002_0_2,ln_48019_4_2,ln_53037_8_2,ln_62374_4_3,sct_55446002_str_3,ln_48004_6_3,ln_48005_3_3,ln_48018_6_3,ln_48002_0_3,ln_48019_4_3,ln_53037_8_3,sct_8116006_1,sct_8116006_1_date,sct_8116006_2,sct_8116006_2_date,sct_8116006_3,sct_8116006_3_date,sct_8116006_4,sct_8116006_4_date,sct_72705000_rd,sct_160430005,sct_66839005_rd,sct_160436004,sct_82101005_rd,sct_75226009_rd,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_21134002_class
i64,str,i64,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
1,"""FA062""",2001,"""sct_248153007""","""sct_90027003""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2007-13-24""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2008.0,1.0,23.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2008-01-23""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025500""","""2003-06-24""","""HP:0025501""","""2001-08-24""","""HP:0025499""","""2006-10-24""",,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
2,"""YA074""",2002,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2011-10-21""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2011.0,4.0,18.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2011-04-18""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025499""","""2004-07-21""",,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
3,"""SA099""",2002,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2002-7-23""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025502""","""2006-09-23""","""HP:0025499""","""2007-10-23""",,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
4,"""EA102""",2007,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2018-9-26""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2018.0,10.0,17.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2018-10-17""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0021""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025499""","""2008-01-26""","""HP:0025500""","""2009-01-26""","""HP:0025500""","""2010-10-26""","""HP:0025500""","""2012-01-26""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
5,"""AA070""",2000,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2009-4-17""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2009.0,7.0,5.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2009-07-05""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025499""","""2002-11-17""","""HP:0025499""","""2005-02-17""",,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"


### Update id column
1. Drop old ID (has duplicate values)
2. Initialize new ID column

In [62]:
df.drop_in_place('record_id');  # entferne alte ID

In [63]:
df = PolarsUtils.add_id_col(df, id_col_name='mc4r_id')  # fuege neue ID hinzu
df.head(5)

mc4r_id,sct_422549004,sct_184099003_y,sct_281053000,sct_364699009,sct_278844005,sct_767023003,sct_184305005_rd,sct_769681006_center,sct_440377005,sct_424850005,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_icd10gm,sct_439401001_alphaid,sct_432213005,sct_263493007,sct_116694002,sct_246454002,sct_439401001_ver,sct_103330002,sct_439401001_val,sct_39154008_hpo,sct_406522009,ln_48007_9,ln_48007_9_mitoch,sct_439401001_gen_val,ln_ll4048_6,sct_439401001_omim_g_1,sct_439401001_omim_g_2,…,ln_48018_6_2,ln_48002_0_2,ln_48019_4_2,ln_53037_8_2,ln_62374_4_3,sct_55446002_str_3,ln_48004_6_3,ln_48005_3_3,ln_48018_6_3,ln_48002_0_3,ln_48019_4_3,ln_53037_8_3,sct_8116006_1,sct_8116006_1_date,sct_8116006_2,sct_8116006_2_date,sct_8116006_3,sct_8116006_3_date,sct_8116006_4,sct_8116006_4_date,sct_72705000_rd,sct_160430005,sct_66839005_rd,sct_160436004,sct_82101005_rd,sct_75226009_rd,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_21134002_class
i64,str,i64,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
0,"""FA062""",2001,"""sct_248153007""","""sct_90027003""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2007-13-24""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2008.0,1.0,23.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2008-01-23""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025500""","""2003-06-24""","""HP:0025501""","""2001-08-24""","""HP:0025499""","""2006-10-24""",,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
1,"""YA074""",2002,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2011-10-21""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2011.0,4.0,18.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2011-04-18""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025499""","""2004-07-21""",,,,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
2,"""SA099""",2002,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2002-7-23""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025502""","""2006-09-23""","""HP:0025499""","""2007-10-23""",,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
3,"""EA102""",2007,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2018-9-26""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2018.0,10.0,17.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2018-10-17""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0021""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025499""","""2008-01-26""","""HP:0025500""","""2009-01-26""","""HP:0025500""","""2010-10-26""","""HP:0025500""","""2012-01-26""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"
4,"""AA070""",2000,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2009-4-17""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2009.0,7.0,5.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2009-07-05""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""","""""nan""""",…,"""HGNC:6932""",,,,"""ln_LA26806-2""",,,,"""HGNC:6932""",,,,"""HP:0025499""","""2002-11-17""","""HP:0025499""","""2005-02-17""",,,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…"


### Parsing step

In [64]:
from src.MC4R.MappingDicts import phenotype_label_map_erker2phenopackets
from src.MC4R.MappingDicts import allele_label_map_erker2phenopackets
from src.MC4R import zygosity_map_erker2phenopackets, sex_map_erker2phenopackets
from src.MC4R.ParseMC4R import parse_date_of_diagnosis, parse_year_of_birth, \
parse_phenotyping_date, parse_omim

config = configparser.ConfigParser()
config.read('../../data/config/config.cfg')
no_mutation = config.get('NoValue', 'mutation')
no_phenotype = config.get('NoValue', 'phenotype')
no_date = config.get('NoValue', 'date')
no_omim = config.get('NoValue', 'omim')

# sct_184099003_y (year of birth)
df = PolarsUtils.map_col(df, map_from='sct_184099003_y', map_to='parsed_year_of_birth',\
    mapping=parse_year_of_birth)

# sct_281053000 (sex)
df = PolarsUtils.map_col(df, map_from='sct_281053000', map_to='parsed_sex',\
    mapping=sex_map_erker2phenopackets)

# sct_432213005 (date of diagnosis)
df = PolarsUtils.map_col(df, map_from='sct_432213005',\
    map_to='parsed_date_of_diagnosis' ,mapping=parse_date_of_diagnosis)
df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_diagnosis', no_date)

# # ln_48007_9 (zygosity)
df = PolarsUtils.map_col(df, map_from='ln_48007_9', map_to='parsed_zygosity',\
    mapping=zygosity_map_erker2phenopackets)
df = PolarsUtils.map_col(df, map_from='ln_48007_9', map_to='allele_label', \
                         mapping=allele_label_map_erker2phenopackets)


# sct_439401001_orpha (diagnosis (ORPHA))
# does not require mapping

# sct_439401001_omim_g_1, sct_439401001_omim_g_2, sct_439401001_omim_g_3 \
# (Primärdiagnose OMIM)
df = PolarsUtils.map_col(df, map_from='sct_439401001_omim_g_1',\
    map_to='parsed_omim_1' ,mapping=parse_omim)
df = PolarsUtils.fill_null_vals(df, 'parsed_omim_1', no_omim)
    
df = PolarsUtils.map_col(df, map_from='sct_439401001_omim_g_2',\
    map_to='parsed_omim_2' ,mapping=parse_omim)
df = PolarsUtils.fill_null_vals(df, 'parsed_omim_2', no_omim)


# ln_48005_3_1, ln_48005_3_2, ln_48005_3_3 (mutation p.HGVS)
df = PolarsUtils.fill_null_vals(df, 'ln_48005_3_1', no_mutation)
df = PolarsUtils.fill_null_vals(df, 'ln_48005_3_2', no_mutation)
df = PolarsUtils.fill_null_vals(df, 'ln_48005_3_3', no_mutation)

# ln_48004_6_1, ln_48004_6_2, ln_48004_6_3 (mutation c.HGVS)
df = PolarsUtils.fill_null_vals(df, 'ln_48004_6_1', no_mutation)
df = PolarsUtils.fill_null_vals(df, 'ln_48004_6_2', no_mutation)
df = PolarsUtils.fill_null_vals(df, 'ln_48004_6_3', no_mutation)

# ln_48018_6_1 (gene HGNC)
# does not require mapping

# sct_8116006_1, sct_8116006_2, sct_8116006_3, sct_8116006_4, sct_8116006_5 (phenotype\
  #classification
df = PolarsUtils.fill_null_vals(df, 'sct_8116006_1', no_phenotype)
df = PolarsUtils.fill_null_vals(df, 'sct_8116006_2', no_phenotype)
df = PolarsUtils.fill_null_vals(df, 'sct_8116006_3', no_phenotype)
df = PolarsUtils.fill_null_vals(df, 'sct_8116006_4', no_phenotype)
if 'sct_8116006_5' in df.columns:
    df = PolarsUtils.fill_null_vals(df, 'sct_8116006_5', no_phenotype)

# sct_8116006_1_date, sct_8116006_2_date, sct_8116006_3_date, sct_8116006_4_date, \
    # sct_8116006_5_date (dates of phenotype determination)
df = PolarsUtils.map_col(df, map_from='sct_8116006_1_date', map_to='parsed_date_of_phenotyping1', mapping=parse_phenotyping_date)
df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping1',no_date)

df = PolarsUtils.map_col(df, map_from='sct_8116006_2_date', map_to='parsed_date_of_phenotyping2', mapping=parse_phenotyping_date)
df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping2',no_date)

if 'sct_8116006_3_date' in df.columns:
    df = PolarsUtils.map_col(df, map_from='sct_8116006_3_date', map_to='parsed_date_of_phenotyping3', mapping=parse_phenotyping_date)
    df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping3',no_date)
    
if 'sct_8116006_4_date' in df.columns:
    df = PolarsUtils.map_col(df, map_from='sct_8116006_4_date', map_to='parsed_date_of_phenotyping4', mapping=parse_phenotyping_date)
    df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping4',no_date)
    
if 'sct_8116006_5_date' in df.columns:
    df = PolarsUtils.map_col(df, map_from='sct_8116006_5_date', map_to='parsed_date_of_phenotyping5', mapping=parse_phenotyping_date)  
    df = PolarsUtils.fill_null_vals(df, 'parsed_date_of_phenotyping5',no_date)  


# phenotype label
df = PolarsUtils.map_col(df, map_from='sct_8116006_1', map_to='parsed_phenotype_label1', mapping=phenotype_label_map_erker2phenopackets)
df = PolarsUtils.map_col(df, map_from='sct_8116006_2', map_to='parsed_phenotype_label2', mapping=phenotype_label_map_erker2phenopackets)
df = PolarsUtils.map_col(df, map_from='sct_8116006_3', map_to='parsed_phenotype_label3', mapping=phenotype_label_map_erker2phenopackets)
df = PolarsUtils.map_col(df, map_from='sct_8116006_4', map_to='parsed_phenotype_label4', mapping=phenotype_label_map_erker2phenopackets)
if 'sct_8116006_5' in df.columns:
    df = PolarsUtils.map_col(df, map_from='sct_8116006_5', map_to='parsed_phenotype_label5', mapping=phenotype_label_map_erker2phenopackets)

In [65]:
df.head()

mc4r_id,sct_422549004,sct_184099003_y,sct_281053000,sct_364699009,sct_278844005,sct_767023003,sct_184305005_rd,sct_769681006_center,sct_440377005,sct_424850005,sct_423493009,sct_432213005_y,sct_432213005_m,sct_432213005_d,sct_717800004,sct_64245008,sct_125679009_proposit,sct_842009,sct_439401001_orpha,sct_439401001_icd10gm,sct_439401001_alphaid,sct_432213005,sct_263493007,sct_116694002,sct_246454002,sct_439401001_ver,sct_103330002,sct_439401001_val,sct_39154008_hpo,sct_406522009,ln_48007_9,ln_48007_9_mitoch,sct_439401001_gen_val,ln_ll4048_6,sct_439401001_omim_g_1,sct_439401001_omim_g_2,…,sct_8116006_2_date,sct_8116006_3,sct_8116006_3_date,sct_8116006_4,sct_8116006_4_date,sct_72705000_rd,sct_160430005,sct_66839005_rd,sct_160436004,sct_82101005_rd,sct_75226009_rd,sct_309370004_research,sct_309370004_narse,sct_309370004_data,sct_309370004_eu,sct_309370004_int,sct_309370004_aff,sct_309370004_gen,sct_309370004_case,sct_123038009,sct_840566006,sct_21134002_class,parsed_year_of_birth,parsed_sex,parsed_date_of_diagnosis,parsed_zygosity,allele_label,parsed_omim_1,parsed_omim_2,parsed_date_of_phenotyping1,parsed_date_of_phenotyping2,parsed_date_of_phenotyping3,parsed_date_of_phenotyping4,parsed_phenotype_label1,parsed_phenotype_label2,parsed_phenotype_label3,parsed_phenotype_label4
i64,str,i64,str,str,str,str,str,str,str,str,str,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
0,"""FA062""",2001,"""sct_248153007""","""sct_90027003""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2007-13-24""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2008.0,1.0,23.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2008-01-23""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""","""""nan""""",…,"""2001-08-24""","""HP:0025499""","""2006-10-24""","""NO_PHENOTYPE""",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""2001-01-01T00:…","""MALE""","""2008-01-23T00:…","""GENO:0000135""","""heterozygous""","""OMIM:155541.00…","""NO_OMIM""","""2003-06-24T00:…","""2001-08-24T00:…","""2006-10-24T00:…","""NO_DATE""","""Class II obesi…","""Class III obes…","""Class I obesit…",
1,"""YA074""",2002,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2011-10-21""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2011.0,4.0,18.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2011-04-18""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0024""""","""""nan""""",…,,"""NO_PHENOTYPE""",,"""NO_PHENOTYPE""",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""2002-01-01T00:…","""FEMALE""","""2011-04-18T00:…","""GENO:0000135""","""heterozygous""","""OMIM:155541.00…","""NO_OMIM""","""2004-07-21T00:…","""NO_DATE""","""NO_DATE""","""NO_DATE""","""Class I obesit…",,,
2,"""SA099""",2002,"""sct_248153007""","""sct_14045001""","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2002-7-23""","""sct_1220561009…","""sct_261665006""","""sct_261665006""",,,,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""","""""nan""""",…,"""2007-10-23""","""NO_PHENOTYPE""",,"""NO_PHENOTYPE""",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""2002-01-01T00:…","""MALE""","""NO_DATE""","""GENO:0000137""","""unspecified zy…","""NO_OMIM""","""NO_OMIM""","""2006-09-23T00:…","""2007-10-23T00:…","""NO_DATE""","""NO_DATE""","""Overweight""","""Class I obesit…",,
3,"""EA102""",2007,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_263659003""","""sct_1220561009…","""2018-9-26""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2018.0,10.0,17.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_373067005""","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2018-10-17""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""155541.0021""""","""""nan""""",…,"""2009-01-26""","""HP:0025500""","""2010-10-26""","""HP:0025500""","""2012-01-26""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""2007-01-01T00:…","""FEMALE""","""2018-10-17T00:…","""GENO:0000135""","""heterozygous""","""OMIM:155541.00…","""NO_OMIM""","""2008-01-26T00:…","""2009-01-26T00:…","""2010-10-26T00:…","""2012-01-26T00:…","""Class I obesit…","""Class II obesi…","""Class II obesi…","""Class II obesi…"
4,"""AA070""",2000,"""sct_248152002""","""sct_372148003_…","""CS_MII_Person_…","""sct_41847000""","""sct_1220561009…","""2009-4-17""","""sct_1220561009…","""sct_261665006""","""sct_410672004""",2009.0,7.0,5.0,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""ORPHA:71529""","""E66.8""","""sct_1220561009…","""2009-07-05""","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""hl7_conditionv…","""sct_373067005""","""sct_439401001_…","""sct_373066001""","""sct_1220561009…","""ln_LA6706-1""","""sct_1220561009…","""sct_373066001""","""sct_1220561009…","""""nan""""","""""nan""""",…,"""2005-02-17""","""NO_PHENOTYPE""",,"""NO_PHENOTYPE""",,"""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""sct_1220561009…","""2000-01-01T00:…","""FEMALE""","""2009-07-05T00:…","""GENO:0000135""","""heterozygous""","""NO_OMIM""","""NO_OMIM""","""2002-11-17T00:…","""2005-02-17T00:…","""NO_DATE""","""NO_DATE""","""Class I obesit…","""Class I obesit…",,


## Map to phenopackets

In [66]:
from src.MC4R.MapMC4R import _map_chunk


phenopackets = _map_chunk(df) #map_mc4r2phenopackets(df)

TypeError: 0 has type int, but expected one of: bytes, unicode

## Write to json

In [None]:
cur_time = datetime.now().strftime("%Y%m%d-%H%M%S") # get cur time for unique dir name
phenopackets_out_dir = phenopackets_out / cur_time # create dir for output

write_files(phenopackets, phenopackets_out_dir)

NameError: name 'phenopackets' is not defined