In [None]:
import os
import json
import obonet
import numpy as np
import pandas as pd
from pprint import pprint

# Directory holding the project's data files; the helper functions in this
# notebook store the cached HPO name -> id map (hpo_map.json) here.
# Set the KIDS_FIRST_DATA_DIR environment variable to point the notebook at a
# different location; when it is unset the original path is used unchanged.
DATA_DIR = os.environ.get(
    'KIDS_FIRST_DATA_DIR', '/Users/singhn4/Projects/kids_first/data')

In [None]:
# Helper functions
def read_json(filepath):
    """Load and return the JSON document stored at ``filepath``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so non-ASCII content is read the same way on every
    machine.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data, filepath):
    """Serialize ``data`` as JSON into ``filepath``, replacing any existing file.

    Keys are sorted and nesting is indented by four spaces; the separators
    carry no extra spaces, so a key and its value are joined by a bare ':'.
    """
    dump_options = {
        'sort_keys': True,
        'indent': 4,
        'separators': (',', ':'),
    }
    with open(filepath, 'w') as out_file:
        json.dump(data, out_file, **dump_options)

def read_or_create_hpo_map(filepath=None):
    """Return a dict mapping lower-cased HPO term names to their node ids.

    The map is cached on disk as JSON. When the cache file exists it is simply
    loaded; otherwise the ontology is fetched from the OBO URL, the map is
    built from the graph's nodes and written to ``filepath`` for later calls.

    Args:
        filepath: Location of the JSON cache. Defaults to ``hpo_map.json``
            inside ``DATA_DIR``.

    Returns:
        dict: lower-cased term name -> node id. If two nodes share a name, the
        one visited last wins.
    """
    if not filepath:
        filepath = os.path.join(DATA_DIR, 'hpo_map.json')

    # Cache hit: reuse the map built by an earlier run
    if os.path.isfile(filepath):
        return read_json(filepath)

    # Cache miss: build the map from the ontology graph
    url = 'http://purl.obolibrary.org/obo/hp.obo'
    graph = obonet.read_obo(url)
    name_to_id = {}
    for id_, data in graph.nodes(data=True):
        # A node without a 'name' attribute would raise KeyError on a plain
        # lookup and abort the whole build; skip such nodes instead.
        term_name = data.get('name')
        if isinstance(term_name, str):
            name_to_id[term_name.lower()] = id_

    # Make sure the target directory exists so the result of the download
    # is not lost to a failed write.
    cache_dir = os.path.dirname(filepath)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    write_json(name_to_id, filepath)

    return name_to_id

def get_term(name, name_to_id=None):
    """Try to resolve an hpo term id from a string.

    The lookup key is ``name`` lower-cased with underscores replaced by
    spaces.

    Args:
        name: Phenotype label to look up. Anything that is not a ``str``
            (for example NaN from an empty table cell) resolves to ``None``.
        name_to_id: Optional pre-loaded name -> id dict. When omitted, the
            map is obtained from ``read_or_create_hpo_map()``; pass it in
            when resolving many names to avoid re-reading the cache each time.

    Returns:
        The matching id, or ``None`` when there is no match.
    """
    # Non-string cells have no .lower(); treat them as unresolvable
    if not isinstance(name, str):
        return None

    # Read or create hpo map
    if name_to_id is None:
        name_to_id = read_or_create_hpo_map()

    return name_to_id.get(name.lower().replace('_', ' '))


def get_mapping(df, phenotype_col, name_to_id=None):
    """Given raw data, try to extract an initial phenotype -> hpo id mapping.

    Args:
        df: DataFrame holding the raw phenotype values.
        phenotype_col: Name of the column in ``df`` with the phenotype labels.
        name_to_id: Optional pre-loaded name -> id dict; loaded once via
            ``read_or_create_hpo_map()`` when omitted.

    Returns:
        dict: each distinct value of ``phenotype_col`` mapped to its resolved
        id, or to ``None`` if unresolved. Empty when the column is absent.
    """
    mapping = {}

    if phenotype_col not in df.columns.tolist():
        return mapping

    # Resolve each distinct value once; repeated rows yield the same entry
    phenotypes = df[phenotype_col].unique()

    # Load the map a single time for the whole frame instead of once per row
    if len(phenotypes) and name_to_id is None:
        name_to_id = read_or_create_hpo_map()

    for phenotype in phenotypes:
        mapping[phenotype] = get_term(phenotype, name_to_id=name_to_id)
    return mapping

def apply_mapping(df, mapping, phenotype_col='phenotype'):
    """Attach an ``hpo_id`` column to ``df`` using a phenotype -> id mapping.

    Args:
        df: DataFrame containing the phenotype labels.
        mapping: dict of phenotype label -> hpo id (or ``None``).
        phenotype_col: Column of ``df`` whose values are looked up in
            ``mapping``. Defaults to ``'phenotype'``.

    Returns:
        A new DataFrame: ``df`` merged with the mapping. The merge uses
        pandas' default inner join, so rows whose phenotype is not a key of
        ``mapping`` are not present in the result.
    """
    mapped_df = (
        pd.DataFrame.from_dict(mapping, orient='index')
        .rename(columns={0: 'hpo_id'})
    )

    # Merge original df with mapped df on the caller-selected column
    return pd.merge(df, mapped_df, left_on=phenotype_col, right_index=True)


def add_hpo_id_col(df, phenotype_col='phenotype'):
    """Return ``df`` with an ``hpo_id`` column resolved from ``phenotype_col``.

    If no mapping can be extracted (for instance the column is absent),
    ``df`` is returned unchanged.
    """
    mapping = get_mapping(df, phenotype_col)
    if mapping:
        # Pass the column through so a non-default name is merged correctly
        df = apply_mapping(df, mapping, phenotype_col=phenotype_col)
    return df

In [None]:
# Read phenotypes
# NOTE(review): this is an absolute path to one user's Desktop, so the cell
# only runs where that exact file exists — consider a configurable location.
df = pd.read_csv('/Users/singhn4/Desktop/phenotype_counts.csv')
# Preview the first rows of the loaded table
df.head()

In [None]:
# Get hpo terms
# Resolves the default 'phenotype' column and adds an 'hpo_id' column; if that
# column is absent the frame comes back unchanged.
# NOTE(review): rebinding `df` makes this cell non-idempotent — re-running it
# without re-running the read cell operates on the already-merged frame.
df = add_hpo_id_col(df)
df