In [14]:
import tabula
from tabula.io import read_pdf
import pandas as pd

# Extract every table from the supplementary PDF (tabula lattice mode);
# header=None keeps positional integer column labels so they can be renamed below.
df = read_pdf(
    "12920_2018_372_MOESM1_ESM.pdf",
    pages='all',
    lattice=True,
    pandas_options={'header': None},
)

# Positional column index -> name used by the rest of the notebook.
column_names = {
    0: "indernal_id",
    1: "hpo_id",
    2: "disorder",
    3: "orphanet",
    4: "gene",
    5: "hgnc_id",
    6: "locus",
    7: "inheritant",
}
# Columns retained for the export, in output order.
kept_columns = ['indernal_id', 'hpo_id', 'disorder', 'gene', 'hgnc_id']

# Stack the first 24 extracted tables into a single frame and label the columns.
df_all = pd.concat(df[0:24]).rename(columns=column_names)

# Keep only rows that carry an HPO annotation.
has_hpo = df_all['hpo_id'].notna()
df_all = df_all[has_hpo]

# Drop rows whose gene cell is the literal 'Gene'
# (presumably the table header repeated on each page -- confirm against the PDF).
is_data_row = df_all['gene'] != 'Gene'
df_all = df_all[is_data_row]

df_all = df_all[kept_columns]

# Replace carriage returns embedded in the extracted cell text with spaces.
for column in kept_columns:
    df_all[column] = df_all[column].str.replace('\r', ' ')

In [16]:
import requests
import json
import pandas as pd
from pandas import json_normalize
from google.protobuf.json_format import Parse, MessageToJson
from google.protobuf.timestamp_pb2 import Timestamp
from phenopackets import Phenopacket, Individual, PhenotypicFeature, OntologyClass, GeneDescriptor, GenomicInterpretation, Diagnosis, Interpretation

In [15]:
# Show the cleaned table as the cell output.
df_all

Unnamed: 0,indernal_id,hpo_id,disorder,gene,hgnc_id
2,TCS001,"HP:0010301, HP:0000252, HP:0001263",Rubinstein-Taybi Syndrome 2,EP300,3373
3,TCS003,"HP:0001263, HP:0002650, HP:0001382, HP:0002079...",Coffin-Siris syndrome,SMARCB1,11103
4,TCS004,"HP:0001263, HP:0004322, HP:0100255",Alazami Syndrome,LARP7,24912
5,TCS005,HP:0007105,KAT6B-Related Disorder,KAT6B,17582
6,TCS006,"HP:0009372, HP:0001193, HP:0007598, HP:0001762",Type C Brachydactyly,GDF5,4220
...,...,...,...,...,...
1,TCS240,"HP:0005616, HP:0000646, HP:0001263, HP:0001999...",autosomal dominant pseudohypoparathyroidism ty...,GNAS,
2,TCS241,"HP:0001263, HP:0001999, HP:0001510, HP:0001382...",autosomal dominant SHORT syndrome [MIM: 269880],PIK3R1,
3,TCS242,"HP:0000062, HP:0001328","autosomal dominant sex- limited 46,XY sex reve...",NR5A1,
4,TCS243,"HP:0000365, HP:0000365, HP:0007703, HP:0000662...",autosomal dominant retinitis pigmentosa (type ...,NR2E3,


In [18]:
# Serialize the cleaned table to a JSON string with one object per row
# (orient='records'), then parse that string back into Python objects.
dj = df_all.to_json(orient = 'records')
parsed = json.loads(dj)
# Show the first record as the cell output.
parsed[0]

{'indernal_id': 'TCS001',
 'hpo_id': 'HP:0010301, HP:0000252, HP:0001263',
 'disorder': 'Rubinstein-Taybi Syndrome 2',
 'gene': 'EP300',
 'hgnc_id': '3373'}

In [23]:
# Build one example phenopacket from the first parsed record and write it to
# 'example.json'.
record = parsed[0]

subject = Individual(id=record['indernal_id'])

# 'hpo_id' is a comma-separated string such as 'HP:0010301, HP:0000252, ...'.
# Strip each token so the ontology IDs do not carry the space that follows the
# comma, and skip empty tokens. Iterating over all tokens replaces the previous
# hard-coded indices [0], [1], [2].
phenotypic_features = [
    PhenotypicFeature(type=OntologyClass(id=hpo_term.strip()))
    for hpo_term in record['hpo_id'].split(',')
    if hpo_term.strip()
]

gene_descriptor = GeneDescriptor(symbol=record['gene'])
genomic_interpretations = GenomicInterpretation(gene=gene_descriptor)
diagnosis = Diagnosis(genomic_interpretations=[genomic_interpretations])
interpretations = [Interpretation(diagnosis=diagnosis)]

phenopacket = Phenopacket(
    id=record['indernal_id'],
    subject=subject,
    phenotypic_features=phenotypic_features,
    interpretations=interpretations,
)

# Use a dedicated name for the serialized text: assigning it to `json` would
# rebind the imported `json` module and break `json.loads` when cells are re-run.
phenopacket_json = MessageToJson(phenopacket)

# Open the file only after the message was built and serialized, so a failure
# above does not leave an empty 'example.json' behind.
with open('example.json', 'w') as jsfile:
    jsfile.write(phenopacket_json)

In [25]:
def _build_phenopacket(record):
    """Build a Phenopacket message from one parsed table record.

    Args:
        record: dict with at least the keys 'indernal_id', 'hpo_id'
            (comma-separated HPO term IDs) and 'gene' (gene symbol).

    Returns:
        A Phenopacket whose id and subject id are the record's 'indernal_id',
        with one PhenotypicFeature per HPO term and a single Interpretation
        wrapping the gene symbol.
    """
    record_id = record['indernal_id']

    # Strip each token so the IDs do not carry the space that follows the comma
    # in the source string; skip empty tokens.
    features = [
        PhenotypicFeature(type=OntologyClass(id=hpo_term.strip()))
        for hpo_term in record['hpo_id'].split(',')
        if hpo_term.strip()
    ]

    gene_descriptor = GeneDescriptor(symbol=record['gene'])
    genomic_interpretation = GenomicInterpretation(gene=gene_descriptor)
    diagnosis = Diagnosis(genomic_interpretations=[genomic_interpretation])

    return Phenopacket(
        id=record_id,
        subject=Individual(id=record_id),
        phenotypic_features=features,
        interpretations=[Interpretation(diagnosis=diagnosis)],
    )


# Write one '<indernal_id>.json' phenopacket file per parsed record.
for record in parsed:
    phenopacket = _build_phenopacket(record)

    # Keep the serialized text in its own name: assigning it to `json` would
    # rebind the imported `json` module and break `json.loads` on re-run.
    phenopacket_json = MessageToJson(phenopacket)

    # Open the file only once the content exists, so a failure while building
    # the message does not leave an empty file behind.
    with open(record['indernal_id'] + '.json', 'w') as jsfile:
        jsfile.write(phenopacket_json)