In [14]:
import configparser
from pathlib import Path

# Project configuration file, given relative to the working directory.
config_file = Path('../../../data/config/config.cfg')

config = configparser.ConfigParser()
# ConfigParser.read() does not raise for a missing file; it returns the list of
# files it managed to parse. An empty list therefore means nothing was loaded,
# which would otherwise only surface later as a NoSectionError.
if not config.read(config_file):
    raise FileNotFoundError(f'Could not read config file: {config_file.resolve()}')

# The configured path is joined onto '..' and then made absolute against the
# working directory (an absolute configured path is used unchanged).
phenopackets_out_dir = ('..' / Path(config.get('Paths', 'phenopackets_out'))).resolve()

phenopackets_out_dir

WindowsPath('C:/Users/Surface/OneDrive/Documents/DataSpell/ERKER2Phenopackets/ERKER2Phenopackets/data/out/phenopackets')

# read in data from phenopackets

In [15]:
from ERKER2Phenopackets.src.utils.io import read_files

# Sub-directory of the phenopackets output folder that holds the example files.
example_phenopackets_dir = phenopackets_out_dir.joinpath(
    'example-phenopackets-from-synthetic-data'
)

# read_files is a project helper; presumably it returns the parsed phenopackets
# as an indexable, iterable collection -- confirm against its definition.
pps = read_files(example_phenopackets_dir)

# convert phenopackets to dataframe
Necessary fields:
- id
- zygosity
- cHGVS mutation
- Obesity class and timestamp

In [16]:
# Print the first loaded phenopacket to see which fields are available.
print(pps[0])

id: "0"
subject {
  id: "0"
  date_of_birth {
    seconds: 946684800
  }
  sex: FEMALE
  taxonomy {
    id: "NCBITaxon:9606"
    label: "Homo sapiens"
  }
}
phenotypic_features {
  type {
    id: "HP:0001513"
    label: "Obesity"
  }
  excluded: true
  onset {
    timestamp {
      seconds: 708652800
    }
  }
}
interpretations {
  id: "e0dd47b3-6e13-4507-84a7-5606bee17024"
  progress_status: SOLVED
  diagnosis {
    disease {
      id: "ORPHA:71529"
      label: "Obesity due to melanocortin 4 receptor deficiency"
    }
    genomic_interpretations {
      subject_or_biosample_id: "0"
      interpretation_status: CONTRIBUTORY
      variant_interpretation {
        variation_descriptor {
          id: "id:A"
          expressions {
            syntax: "hgvs"
            value: "NP_005903.2:p.(Val103Ile)"
          }
          expressions {
            syntax: "hgvs"
            value: "NM_005912.3:c.181G>T"
          }
          allelic_state {
            id: "GENO:0000135"
            

In [62]:
from typing import Optional, Tuple

from phenopackets import Phenopacket

def extract_fields(phenopacket: Phenopacket) -> Optional[Tuple[int, int, int, str]]:
    """Extract the tabular fields of interest from a single phenopacket.

    Only the first interpretation, and within it the first genomic
    interpretation, is looked at.

    :param phenopacket: phenopacket to read; its ``id`` must be an integer string
    :return: ``(id, date_of_birth, sex, zygosity)`` where ``id`` is the
        phenopacket id converted to ``int``, ``date_of_birth`` is the result of
        ``ToSeconds()`` on the subject's date of birth, ``sex`` is the value of
        ``subject.sex`` as stored in the phenopacket, and ``zygosity`` is the
        label of the variation descriptor's allelic state. ``None`` is returned
        when the phenopacket has no interpretation or the first interpretation
        has no genomic interpretation.
    :raises ValueError: if ``phenopacket.id`` cannot be converted to ``int``
    """
    pp_id = int(phenopacket.id)
    subject = phenopacket.subject
    dob = subject.date_of_birth.ToSeconds()
    sex = subject.sex

    try:
        # The two index operations are the only steps that raise IndexError,
        # so they are the only ones guarded.
        genomic_interpretation = phenopacket.interpretations[0].diagnosis.genomic_interpretations[0]
    except IndexError:
        return None

    zygosity = genomic_interpretation.variant_interpretation.variation_descriptor.allelic_state.label
    return pp_id, dob, sex, zygosity

In [63]:
# Manual walk-through of the field extraction on the first phenopacket.
phenopacket = pps[0]

# Identifier and subject attributes.
pp_id = int(phenopacket.id)
dob = phenopacket.subject.date_of_birth.ToSeconds()
sex = phenopacket.subject.sex

# Allelic-state label of the first genomic interpretation of the first
# interpretation.
zygosity = (
    phenopacket.interpretations[0]
    .diagnosis.genomic_interpretations[0]
    .variant_interpretation.variation_descriptor
    .allelic_state.label
)
zygosity

'heterozygous'

In [64]:
# Run the extraction on every phenopacket and print one result per line;
# a printed None marks a phenopacket extract_fields returned nothing for.
for pp in pps:
    fields = extract_fields(pp)
    print(fields)

(0, 946684800, 1, 'heterozygous')
(1, 915148800, 1, 'heterozygous')
None
None
(12, 757382400, 2, 'heterozygous')
(13, 757382400, 2, 'heterozygous')
None
None
None
(17, 1072915200, 1, 'homozygous')
(18, 1136073600, 1, 'heterozygous')
None
(2, 1009843200, 1, 'heterozygous')
(20, 1072915200, 2, 'heterozygous')
(21, 1136073600, 2, 'heterozygous')
(22, 694224000, 2, 'heterozygous')
(23, 1293840000, 1, 'heterozygous')
(24, 1072915200, 2, 'unspecified zygosity')
(25, 820454400, 2, 'heterozygous')
(26, 757382400, 1, 'heterozygous')
None
(28, 631152000, 1, 'heterozygous')
(29, 1483228800, 2, 'heterozygous')
(3, 820454400, 2, 'heterozygous')
(30, 725846400, 1, 'heterozygous')
(31, 915148800, 1, 'heterozygous')
(32, 1420070400, 2, 'heterozygous')
(33, 978307200, 1, 'heterozygous')
None
None
(36, 788918400, 1, 'homozygous')
(37, 1072915200, 1, 'heterozygous')
(38, 978307200, 1, 'heterozygous')
(39, 1356998400, 1, 'homozygous')
(4, 946684800, 2, 'heterozygous')
None
(41, 915148800, 1, 'heterozygous

In [None]:
import polars as pl

# extract_fields returns None for phenopackets it cannot extract from; drop
# those so every remaining record is a complete
# (id, date_of_birth, sex, zygosity) tuple.
data = [fields for fields in map(extract_fields, pps) if fields is not None]

# One column name per value in each record. The cHGVS mutation and the obesity
# class with its timestamp are not extracted yet, so they have no column here.
# TODO: add them once extract_fields returns them.
columns = ['id', 'date_of_birth', 'sex', 'zygosity']

# The class is spelled `DataFrame`. Each tuple is one row, so the orientation
# is stated explicitly instead of being inferred.
df = pl.DataFrame(data, schema=columns, orient='row')