In [14]:
import configparser
from pathlib import Path

# Load the project configuration.
# ConfigParser.read() silently skips files it cannot open, which would only
# surface later as a confusing NoSectionError. Opening the file explicitly makes
# a wrong path fail right here with FileNotFoundError.
config_path = Path('../../../data/config/config.cfg')
config = configparser.ConfigParser()
with config_path.open() as config_file:
    config.read_file(config_file)

# The configured output path is prefixed with '..' and then made absolute.
phenopackets_out_dir = ('..' / Path(config.get('Paths', 'phenopackets_out'))).resolve()

phenopackets_out_dir

WindowsPath('C:/Users/Surface/OneDrive/Documents/DataSpell/ERKER2Phenopackets/ERKER2Phenopackets/data/out/phenopackets')

# read in data from phenopackets

In [15]:
from ERKER2Phenopackets.src.utils.io import read_files

# Sub-directory of the phenopackets output folder that holds the example files.
example_phenopackets_dir = phenopackets_out_dir.joinpath(
    'example-phenopackets-from-synthetic-data'
)
# Presumably returns the parsed phenopackets as a sequence -- later cells index
# into `pps` and iterate over it. TODO confirm against `read_files`.
pps = read_files(example_phenopackets_dir)

# convert phenopackets to dataframe
Necessary fields:
- id
- zygosity
- cHGVS mutation
- Obesity class and timestamp

In [16]:
# Print the first phenopacket to inspect which fields are available.
print(pps[0])

id: "0"
subject {
  id: "0"
  date_of_birth {
    seconds: 946684800
  }
  sex: FEMALE
  taxonomy {
    id: "NCBITaxon:9606"
    label: "Homo sapiens"
  }
}
phenotypic_features {
  type {
    id: "HP:0001513"
    label: "Obesity"
  }
  excluded: true
  onset {
    timestamp {
      seconds: 708652800
    }
  }
}
interpretations {
  id: "e0dd47b3-6e13-4507-84a7-5606bee17024"
  progress_status: SOLVED
  diagnosis {
    disease {
      id: "ORPHA:71529"
      label: "Obesity due to melanocortin 4 receptor deficiency"
    }
    genomic_interpretations {
      subject_or_biosample_id: "0"
      interpretation_status: CONTRIBUTORY
      variant_interpretation {
        variation_descriptor {
          id: "id:A"
          expressions {
            syntax: "hgvs"
            value: "NP_005903.2:p.(Val103Ile)"
          }
          expressions {
            syntax: "hgvs"
            value: "NM_005912.3:c.181G>T"
          }
          allelic_state {
            id: "GENO:0000135"
            

In [83]:
from typing import Optional, Tuple

from phenopackets import Phenopacket

# Human-readable labels keyed by the integer value of `subject.sex`.
# Presumably these follow the Phenopacket schema's Sex enum
# (UNKNOWN_SEX, FEMALE, MALE, OTHER_SEX) -- confirm against the schema.
sex_map = {
    0: 'unknown',
    1: 'female',
    2: 'male',
    3: 'other',
}

def extract_fields(phenopacket: Phenopacket) -> Optional[Tuple[int, int, str, str, str]]:
    """Flatten one phenopacket into a tuple of scalar fields.

    Reads the subject's id, date of birth and sex, plus the zygosity label and
    the coding-DNA (``c.``) HGVS expression from the first genomic
    interpretation of the first interpretation.

    :param phenopacket: phenopacket to extract the fields from
    :return: ``(id, date_of_birth, sex, zygosity, c_hgvs)``, where
        ``date_of_birth`` is the result of ``ToSeconds()`` on the subject's
        date of birth and ``sex`` is looked up in ``sex_map``; ``None`` if the
        phenopacket has no interpretation, no genomic interpretation, or no
        ``c.`` HGVS expression
    :raises ValueError: if ``phenopacket.id`` cannot be converted to ``int``
    :raises KeyError: if the subject's sex value is not a key of ``sex_map``
    """
    pp_id = int(phenopacket.id)
    dob = phenopacket.subject.date_of_birth.ToSeconds()
    sex = sex_map[phenopacket.subject.sex]

    try:
        variation_descriptor = (
            phenopacket.interpretations[0]
            .diagnosis.genomic_interpretations[0]
            .variant_interpretation.variation_descriptor
        )
    except IndexError:
        # No interpretation / genomic interpretation recorded for this subject.
        return None

    # Select the coding-DNA expression by its content instead of relying on it
    # being the second entry: the order of `expressions` is not guaranteed, and
    # a descriptor may carry only a single expression.
    c_hgvs = next(
        (
            expression.value
            for expression in variation_descriptor.expressions
            if ':c.' in expression.value
        ),
        None,
    )
    if c_hgvs is None:
        return None

    zygosity = variation_descriptor.allelic_state.label
    return pp_id, dob, sex, zygosity, c_hgvs

In [84]:
# Scratch cell: drill into the first phenopacket's first genomic interpretation.
phenopacket = pps[0]
# NOTE(review): despite its name, `variation_descriptor` is bound to the
# `variant_interpretation` message here -- the chain stops one attribute earlier
# than in `extract_fields`. Confirm whether `.variation_descriptor` was intended.
variation_descriptor = (
    phenopacket.interpretations[0]
    .diagnosis.genomic_interpretations[0]
    .variant_interpretation
)

In [85]:
# Preview the extraction result for every phenopacket; `None` marks packets
# for which `extract_fields` found nothing to extract.
for pp in pps:
    row = extract_fields(pp)
    print(row)

(0, 946684800, 'female', 'heterozygous', 'NM_005912.3:c.181G>T')
(1, 915148800, 'female', 'heterozygous', 'NM_005912.3:c.307G>A')
None
None
(12, 757382400, 'male', 'heterozygous', 'NM_005912.3:c.307G>A')
(13, 757382400, 'male', 'heterozygous', 'NM_005912.3:c.307G>A')
None
None
None
(17, 1072915200, 'female', 'homozygous', 'NM_005912.3:c.253A>G')
(18, 1136073600, 'female', 'heterozygous', 'NM_005912.3:c.508A>G')
None
(2, 1009843200, 'female', 'heterozygous', 'NM_005912.3:c.821A>G')
(20, 1072915200, 'male', 'heterozygous', 'NM_005912.3:c.307G>A')
(21, 1136073600, 'male', 'heterozygous', 'NM_005912.3:c.751A>C')
(22, 694224000, 'male', 'heterozygous', 'NM_005912.3:c.268G>A')
(23, 1293840000, 'female', 'heterozygous', 'NM_005912.3:c.307G>A')
(24, 1072915200, 'male', 'unspecified zygosity', 'NM_005912.3:c.533C>T')
(25, 820454400, 'male', 'heterozygous', 'NM_005912.3:c.230C>T')
(26, 757382400, 'female', 'heterozygous', 'NM_005912.3:c.268G>A')
None
(28, 631152000, 'female', 'heterozygous', 'NM

In [86]:
import polars as pl

# Keep only the phenopackets for which extraction produced a row.
data = [row for row in map(extract_fields, pps) if row is not None]

data, len(data)

([(0, 946684800, 'female', 'heterozygous', 'NM_005912.3:c.181G>T'),
  (1, 915148800, 'female', 'heterozygous', 'NM_005912.3:c.307G>A'),
  (12, 757382400, 'male', 'heterozygous', 'NM_005912.3:c.307G>A'),
  (13, 757382400, 'male', 'heterozygous', 'NM_005912.3:c.307G>A'),
  (17, 1072915200, 'female', 'homozygous', 'NM_005912.3:c.253A>G'),
  (18, 1136073600, 'female', 'heterozygous', 'NM_005912.3:c.508A>G'),
  (2, 1009843200, 'female', 'heterozygous', 'NM_005912.3:c.821A>G'),
  (20, 1072915200, 'male', 'heterozygous', 'NM_005912.3:c.307G>A'),
  (21, 1136073600, 'male', 'heterozygous', 'NM_005912.3:c.751A>C'),
  (22, 694224000, 'male', 'heterozygous', 'NM_005912.3:c.268G>A'),
  (23, 1293840000, 'female', 'heterozygous', 'NM_005912.3:c.307G>A'),
  (24, 1072915200, 'male', 'unspecified zygosity', 'NM_005912.3:c.533C>T'),
  (25, 820454400, 'male', 'heterozygous', 'NM_005912.3:c.230C>T'),
  (26, 757382400, 'female', 'heterozygous', 'NM_005912.3:c.268G>A'),
  (28, 631152000, 'female', 'heterozyg

In [92]:
# Column names, in the order of the tuples returned by `extract_fields`.
columns = ['id', 'date_of_birth', 'sex', 'zygosity', 'c_hgvs']

# Build the frame directly from the row tuples. Transposing by hand and indexing
# the transposed list per column raises IndexError when `data` is empty;
# row orientation with an explicit schema also handles that case.
df = pl.DataFrame(data, schema=columns, orient='row')

In [93]:
# Preview the first rows of the extracted table.
df.head()

id,date_of_birth,sex,zygosity,c_hgvs
i64,i64,str,str,str
0,946684800,"""female""","""heterozygous""","""NM_005912.3:c.…"
1,915148800,"""female""","""heterozygous""","""NM_005912.3:c.…"
12,757382400,"""male""","""heterozygous""","""NM_005912.3:c.…"
13,757382400,"""male""","""heterozygous""","""NM_005912.3:c.…"
17,1072915200,"""female""","""homozygous""","""NM_005912.3:c.…"
