# Fill ISA template ontology columns based on Phenopackets ontology terms

In [1]:
import os
# Root folder of the synthetic test data set
datadir = "../data"
# Variable-level ontology term mapping (a single CSV covering all variables)
phenotype_variables_file = os.path.join(datadir, "ontology_terms", "phenotype_variables.csv")
# Value-level ontology term mappings: parameter id -> CSV file with its allowed values
phenotype_values_files = {
    param_id: os.path.join(datadir, "ontology_terms", csv_name)
    for param_id, csv_name in (
        ("param23795", "ABO_blood_group_values.csv"),
        ("param23796", "Rh_blood_group_values.csv"),
        ("param18133", "smoking_values.csv"),
        ("param23797", "PhenotypicFeature_values.csv"),
    )
}

In [2]:
import pandas as pd
# Ontology term annotations for the data set variables
phenotype_variables = pd.read_csv(phenotype_variables_file)
# One value-mapping table per parameter id, read in the order the files are configured
phenotype_values = dict(zip(
    phenotype_values_files.keys(),
    map(pd.read_csv, phenotype_values_files.values()),
))

In [3]:
# Show the variable-level mapping table (data set column -> ontology term and unit term)
phenotype_variables

Unnamed: 0,column_name,original_column_label,column_label,phenopackets_building_block,part_of_phenopackets_building_block,Term URI,Term CURIE,Term label,Unit term URI,Unit term CURIE,Unit term label
0,ID Patient,PRIMARY KEY,id,id,Individual,,,,,,
1,param17377,Leucocytes,Leukocytes,Measurement,Biosample,https://loinc.org/26464-8/,LOINC:26464-8,Leukocytes [#/volume] in Blood,http://purl.obolibrary.org/obo/NCIT_C67249,NCIT:C67249,Billion Cells per Liter
2,param17374,Erythrocytes,Erythrocytes,Measurement,Biosample,https://loinc.org/26453-1/,LOINC:26453-1,Erythrocytes [#/volume] in Blood,http://purl.obolibrary.org/obo/NCIT_C67243,NCIT:C67243,Trillion Cells per Liter
3,param17371,Hemoglobin,Hemoglobin,Measurement,Biosample,https://loinc.org/718-7/,LOINC:718-7,Hemoglobin [Mass/volume] in Blood,http://purl.obolibrary.org/obo/UO_0000175,UO:0000175,gram per liter
4,param17401,Hematokrit,Hematocrit,Measurement,Biosample,https://loinc.org/20570-8/,LOINC:20570-8,Hematocrit [Volume Fraction] of Blood,http://purl.obolibrary.org/obo/UO_0000205,UO:0000205,volume per unit volume
5,param17395,Platelets,Platelets,Measurement,Biosample,https://loinc.org/26515-7/,LOINC:26515-7,Platelets [#/volume] in Blood,http://purl.obolibrary.org/obo/NCIT_C67249,NCIT:C67249,Billion Cells per Liter
6,param17383,MCV,MCV,Measurement,Biosample,https://loinc.org/787-2,LOINC:787-2,MCV [Entitic volume] by Automated count,http://purl.obolibrary.org/obo/UO_0000104,UO:0000104,femtoliter
7,param17386,MCH,MCH,Measurement,Biosample,https://loinc.org/785-6,LOINC:785-6,MCH [Entitic mass] by Automated count,http://purl.obolibrary.org/obo/UO_0000025,UO:0000025,picogram
8,param17389,MCHC,MCHC,Measurement,Biosample,https://loinc.org/786-4,LOINC:786-4,MCHC [Mass/volume] by Automated count,http://purl.obolibrary.org/obo/UO_0000175,UO:0000175,gram per liter
9,param17392,Lymphocytes,Lymphocytes,Measurement,Biosample,https://loinc.org/731-0,LOINC:731-0,Lymphocytes [#/volume] in Blood by Automated c...,http://purl.obolibrary.org/obo/UO_0000191,UO:0000191,fraction


## ISA Template from FAIR Data Station

In [4]:
# Location of the ISA template workbook. Set the ISA_TEMPLATE_FILE environment
# variable to point at your own copy; the fallback is the original author's
# machine-specific path and will not exist elsewhere. #TODO move the file next to the data
isa_file = os.environ.get(
    'ISA_TEMPLATE_FILE',
    'C:\\Users\\Z896216\\Documents\\EATRIS+\\FairDataStation\\eatris_isa.xlsx')
# An integer sheet_name is a zero-based sheet position, so 2 selects the third sheet.
isa = pd.read_excel(isa_file, sheet_name = 2)
isa

Unnamed: 0,observation unit identifier,observation unit name,observation unit description,study identifier,date of birth,place of birth,sex,age,smoker,smoker(ontology),...,neutrophils,lymphocytes,haematocrit,erythroctyes,hemoglobine,mcv,mch,mxd,platelets,mchc
0,CZC2546,Czech Cohort Invidiual 2546,Healthy individual,EATRIS-Plus-demo,,,Male,29,smoker,,...,,,,,,,,,,
1,CZC2547,Czech Cohort Invidiual 2547,Healthy individual,EATRIS-Plus-demo,,,Male,42,ex-smoker,,...,,,,,,,,,,
2,CZC2549,Czech Cohort Invidiua 2549,Healthy individual,EATRIS-Plus-demo,,,Male,33,smoker,,...,,,,,,,,,,


In [5]:
# List the column headers of the ISA sheet; columns such as 'smoker' are paired
# with a '<name>(ontology)' column that the cells below fill in.
isa.columns

Index(['observation unit identifier', 'observation unit name',
       'observation unit description', 'study identifier', 'date of birth',
       'place of birth', 'sex', 'age', 'smoker', 'smoker(ontology)',
       'blood type abo', 'blood type abo(ontology)', 'blood type rh',
       'blood type rh(ontology)', 'host height', 'weight',
       'host body-mass index', 'leukocytes', 'neutrophils', 'lymphocytes',
       'haematocrit', 'erythroctyes', 'hemoglobine', 'mcv', 'mch', 'mxd',
       'platelets', 'mchc'],
      dtype='object')

In [7]:
# Spot check of a single cell: the ontology column is still empty (NaN) before the mapping is applied
isa.at[1, 'blood type abo(ontology)']

nan

### Blood group

In [8]:
## Number of rows in the data frame (notebook-level variable; other cells may rely on it)
nrow = len(isa.index)


## Name of column to be replaced by ontology terms
col_name_text = 'blood type abo' # TODO
col_name_ont = 'blood type abo(ontology)' # TODO

## Mapping table: reported value -> ontology term
df_mapping = phenotype_values['param23795'] #TODO

## Case-insensitive lookup, built once instead of once per row.
## setdefault keeps the first row if a reported value is listed more than once.
uri_by_value = {}
for value, uri in zip(df_mapping['value_reported'], df_mapping['Term URI']):
    if isinstance(value, str):
        uri_by_value.setdefault(value.lower(), uri)

## Rows without a reported value are skipped, so their ontology cell stays empty
## instead of the cell failing on a missing value.
reported = isa[col_name_text]
has_value = reported.notna()
keys = reported[has_value].astype(str).str.lower()

## Stop with a readable message, before anything is written, when a reported
## value has no entry in the mapping table.
known = keys.isin(list(uri_by_value))
if not known.all():
    raise ValueError(
        f"No ontology term for {col_name_text!r} value(s): "
        f"{reported[has_value][~known].unique().tolist()}")

## Replace human readable values by ontology terms from phenopackets.
## The target column may be numeric (all NaN) before filling; cast it to object
## first, because writing strings into a numeric column is deprecated in recent pandas.
isa[col_name_ont] = isa[col_name_ont].astype(object)
isa.loc[has_value, col_name_ont] = keys.map(uri_by_value)

### Rh factor

In [9]:
# Note: df_mapping is the same object that is stored in phenotype_values, so the
# new column is also visible to every later lookup of 'param23796'.
df_mapping = phenotype_values['param23796']
# Spelled-out alternative label for each Rh symbol. It is derived from the
# reported symbol rather than assigned by position, so it does not depend on
# the row order or the number of rows in the CSV. # TODO
df_mapping['Alt label'] = df_mapping['value_reported'].map({'-': 'negative', '+': 'positive'})
df_mapping

Unnamed: 0,value_reported,Term URI,Term CURIE,Term label,Alt label
0,-,http://purl.obolibrary.org/obo/NCIT_C76252,NCIT:C76252,Rh Negative Blood Group,negative
1,+,http://purl.obolibrary.org/obo/NCIT_C76251,NCIT:C76251,Rh Positive Blood Group,positive


In [11]:
## Name of column to be replaced by ontology terms
col_name_text = 'blood type rh'
col_name_ont = 'blood type rh(ontology)'

## Mapping table: reported value / alternative label -> ontology term
df_mapping = phenotype_values['param23796'] #TODO

## Case-insensitive lookup, built once instead of once per row.
## 'value_reported' is entered first, so it takes precedence over 'Alt label';
## this replaces the former try/except fallback, which also hid unrelated errors.
uri_by_label = {}
for label_col in ('value_reported', 'Alt label'):
    if label_col not in df_mapping.columns:
        continue
    for label, uri in zip(df_mapping[label_col], df_mapping['Term URI']):
        if isinstance(label, str):
            uri_by_label.setdefault(label.lower(), uri)

## Rows without a reported value are skipped, so their ontology cell stays empty
## instead of the cell failing on a missing value.
reported = isa[col_name_text]
has_value = reported.notna()
keys = reported[has_value].astype(str).str.lower()

## Stop with a readable message, before anything is written, when a reported
## value matches neither the reported symbol nor the alternative label.
known = keys.isin(list(uri_by_label))
if not known.all():
    raise ValueError(
        f"No ontology term for {col_name_text!r} value(s): "
        f"{reported[has_value][~known].unique().tolist()}")

## Replace human readable values by ontology terms from phenopackets.
## The target column may be numeric (all NaN) before filling; cast it to object
## first, because writing strings into a numeric column is deprecated in recent pandas.
isa[col_name_ont] = isa[col_name_ont].astype(object)
isa.loc[has_value, col_name_ont] = keys.map(uri_by_label)

### Smoking Status

In [12]:
## Name of column to be replaced by ontology terms
col_name_text = 'smoker'
col_name_ont = 'smoker(ontology)'

## Mapping table: reported value -> ontology term
df_mapping = phenotype_values['param18133'] #TODO

## Case-insensitive lookup, built once instead of once per row.
## setdefault keeps the first row if a reported value is listed more than once.
uri_by_value = {}
for value, uri in zip(df_mapping['value_reported'], df_mapping['Term URI']):
    if isinstance(value, str):
        uri_by_value.setdefault(value.lower(), uri)

## Rows without a reported value are skipped, so their ontology cell stays empty
## instead of the cell failing on a missing value.
reported = isa[col_name_text]
has_value = reported.notna()
keys = reported[has_value].astype(str).str.lower()

## Stop with a readable message, before anything is written, when a reported
## value has no entry in the mapping table.
known = keys.isin(list(uri_by_value))
if not known.all():
    raise ValueError(
        f"No ontology term for {col_name_text!r} value(s): "
        f"{reported[has_value][~known].unique().tolist()}")

## Replace human readable values by ontology terms from phenopackets.
## The target column may be numeric (all NaN) before filling; cast it to object
## first, because writing strings into a numeric column is deprecated in recent pandas.
isa[col_name_ont] = isa[col_name_ont].astype(object)
isa.loc[has_value, col_name_ont] = keys.map(uri_by_value)

In [14]:
# Show the columns at positions 1-13 to check the filled '(ontology)' columns next to their source values
isa.iloc[:,1:14]

Unnamed: 0,observation unit name,observation unit description,study identifier,date of birth,place of birth,sex,age,smoker,smoker(ontology),blood type abo,blood type abo(ontology),blood type rh,blood type rh(ontology)
0,Czech Cohort Invidiual 2546,Healthy individual,EATRIS-Plus-demo,,,Male,29,smoker,http://purl.obolibrary.org/obo/NCIT_C67147,A,http://purl.obolibrary.org/obo/NCIT_C76246,positive,http://purl.obolibrary.org/obo/NCIT_C76251
1,Czech Cohort Invidiual 2547,Healthy individual,EATRIS-Plus-demo,,,Male,42,ex-smoker,http://purl.obolibrary.org/obo/NCIT_C67148,B,http://purl.obolibrary.org/obo/NCIT_C76247,positive,http://purl.obolibrary.org/obo/NCIT_C76251
2,Czech Cohort Invidiua 2549,Healthy individual,EATRIS-Plus-demo,,,Male,33,smoker,http://purl.obolibrary.org/obo/NCIT_C67147,AB,http://purl.obolibrary.org/obo/NCIT_C76248,positive,http://purl.obolibrary.org/obo/NCIT_C76251
