# Old PhenoDigm evidence


There are a few things I need to check:

1. Whether a given mouse model can be mapped to different disease/target associations.
2. Whether the mouse and human phenotypes are the same for every piece of evidence based on a given model.

In [7]:
import glob
import gzip
import itertools
import json

import pandas as pd


# Gzipped file with the old PhenoDigm evidence; it is read line by line and
# every line is parsed as one JSON document.
phenodigm_file = '/Users/dsuveges/project_data/ot/evidence_input/20.11/phenodigm/phenodigm-29-10-2020.json.gz'


def _last_segment(identifier):
    """Return the part of ``identifier`` that follows its last '/'."""
    return identifier.split('/')[-1]


def _phenotype_ids(phenotypes):
    """Return the ``id`` value of every phenotype entry, in the original order."""
    return [phenotype['id'] for phenotype in phenotypes]


def _flatten_evidence(evidence):
    """Reduce one parsed evidence document to the flat record used in the table."""
    return {
        'target': _last_segment(evidence['target']['id']),
        'disease': _last_segment(evidence['disease']['id']),
        'disease_label': evidence['disease']['name'],
        'model_id': evidence['evidence']['biological_model']['model_id'],
        'mouse_phenotypes': _phenotype_ids(evidence['evidence']['biological_model']['phenotypes']),
        'human_phenotypes': _phenotype_ids(evidence['evidence']['disease_model_association']['human_phenotypes']),
    }


parsed_data = []
with gzip.open(phenodigm_file, 'r') as evidence_stream:
    # json.loads accepts the raw bytes yielded by the binary gzip stream.
    parsed_data.extend(_flatten_evidence(json.loads(line)) for line in evidence_stream)

print(len(parsed_data))
phenodigm_df = pd.DataFrame(parsed_data)

print(phenodigm_df.head())

687439
            target         disease  \
0  ENSG00000115977  Orphanet_79235   
1  ENSG00000115977    Orphanet_205   
2  ENSG00000115977     EFO_1000946   
3  ENSG00000115977     EFO_0003888   
4  ENSG00000115977  Orphanet_88616   

                                       disease_label               model_id  \
0                     Crigler-Najjar syndrome type 2  MGI:5766745#hom#early   
1                            Crigler-Najjar syndrome  MGI:5766745#hom#early   
2                        gastric mucosal hypertrophy  MGI:5766745#hom#early   
3           attention deficit hyperactivity disorder  MGI:5766745#hom#early   
4  Autosomal recessive non-syndromic intellectual...  MGI:5766745#hom#early   

                                    mouse_phenotypes          human_phenotypes  
0  [MP:0005567, MP:0003795, MP:0001399, MP:001188...              [HP:0008282]  
1  [MP:0005567, MP:0003795, MP:0001399, MP:001188...              [HP:0008282]  
2  [MP:0005567, MP:0003795, MP:0001399, MP:001

In [9]:
# Headline counts for the evidence table: rows, distinct target/disease pairs,
# and distinct values of the model, target and disease columns.
evidence_count = phenodigm_df.shape[0]
association_count = phenodigm_df[['target', 'disease']].drop_duplicates().shape[0]
model_count = phenodigm_df['model_id'].nunique(dropna=False)
target_count = phenodigm_df['target'].nunique(dropna=False)
disease_count = phenodigm_df['disease'].nunique(dropna=False)

print(f'Number of evidence: {evidence_count}')
print(f"Number of disease/target association: {association_count}")
print(f"Number of mouse models: {model_count}")
print(f"Number of unique targets: {target_count}")
print(f"Number of unique diseases: {disease_count}")

Number of evidence: 687439
Number of disease/target association: 376633
Number of mouse models: 21862
Number of unique targets: 8644
Number of unique diseases: 4693


In [34]:
def _count_distinct_lists(phenotype_lists):
    """Count the distinct lists held in a Series of lists.

    Lists are not hashable, so each one is serialised with ``json.dumps`` and
    the resulting strings are compared. Two lists holding the same IDs in a
    different order therefore count as distinct.
    """
    return phenotype_lists.map(json.dumps).nunique()


# For every mouse model: how many different mouse / human phenotype lists
# appear across the evidence rows that reference it.
xf = phenodigm_df.groupby('model_id').agg({
    'mouse_phenotypes': _count_distinct_lists,
    'human_phenotypes': _count_distinct_lists,
})

## Testing the new data

Let's see if the mouse model data downloaded from the Solr index looks the same as in the old evidence file.

In [38]:
%%bash

# Print the `model_phenotypes` field of the mouse_model solr dump records that
# mention model MGI:5766745#hom#early.
# `gzip -dc` replaces `gzcat`, which is not installed on many Linux systems;
# `grep -F` makes the model ID match as a literal string rather than a regex.
gzip -dc /Users/dsuveges/project_data/phenodigm_solr_dump/type.mouse_model/*gz \
    | grep -F "MGI:5766745#hom#early" \
    | jq '.model_phenotypes'

[
  "MP:0005567 decreased circulating total protein level",
  "MP:0002574 increased vertical activity",
  "MP:0003921 abnormal heart left ventricle morphology",
  "MP:0005635 decreased circulating bilirubin level",
  "MP:0002968 increased circulating alkaline phosphatase level",
  "MP:0009141 increased prepulse inhibition",
  "MP:0001469 abnormal contextual conditioning behavior",
  "MP:0002590 increased mean corpuscular volume",
  "MP:0002764 short tibia",
  "MP:0001399 hyperactivity",
  "MP:0005293 impaired glucose tolerance",
  "MP:0004738 abnormal auditory brainstem response",
  "MP:0005419 decreased circulating serum albumin level",
  "MP:0011110 preweaning lethality, incomplete penetrance",
  "MP:0003795 abnormal bone structure",
  "MP:0001489 decreased startle reflex"
]


In [43]:
# Number of distinct mouse phenotype IDs stored on the evidence row labelled 1,
# which must belong to model MGI:5766745#hom#early for the lookup to succeed.
model_rows = phenodigm_df[phenodigm_df['model_id'] == "MGI:5766745#hom#early"]
len(set(model_rows['mouse_phenotypes'].loc[1]))

20

In [57]:
# Do the phenotype IDs in the evidence overlap with those in the solr dump?
# Left side: mouse phenotype IDs of the evidence row labelled 1 for the model.
model_mask = phenodigm_df['model_id'] == "MGI:5766745#hom#early"
evidence_ids = pd.DataFrame({
    'evidence': phenodigm_df.loc[model_mask, 'mouse_phenotypes'][1],
})

# Right side: the MP identifiers listed for the same model in the solr dump.
source_ids = pd.DataFrame({
    'source': [
        "MP:0005567", "MP:0002574", "MP:0003921", "MP:0005635",
        "MP:0002968", "MP:0009141", "MP:0001469", "MP:0002590",
        "MP:0002764", "MP:0001399", "MP:0005293", "MP:0004738",
        "MP:0005419", "MP:0011110", "MP:0003795", "MP:0001489",
    ],
})

# The outer join keeps IDs found on only one side; `_merge` tells which side.
overlap = evidence_ids.merge(
    source_ids,
    how='outer',
    left_on='evidence',
    right_on='source',
    indicator=True,
)
overlap.sort_values('_merge')

Unnamed: 0,evidence,source,_merge
11,MP:0003313,,left_only
16,MP:0000558,,left_only
3,MP:0011888,,left_only
15,MP:0011960,,left_only
5,MP:0001569,,left_only
6,MP:0000202,,left_only
13,MP:0001486,,left_only
20,,MP:0005635,right_only
21,,MP:0002968,right_only
22,,MP:0001469,right_only


In [59]:
# Print the first record of every ontology_ontology solr dump file to inspect
# its structure. The files are discovered with a glob over the `00<digit>`
# suffix instead of a hard-coded count, so a missing part does not raise.
solr_dump_pattern = '/Users/dsuveges/project_data/phenodigm_solr_dump/type.ontology_ontology/IMPC_solr_dump.00[0-9].json.gz'
for dump_path in sorted(glob.glob(solr_dump_pattern)):
    with gzip.open(dump_path) as f:
        for row in f:
            data = json.loads(row)
            print(json.dumps(data, indent=2))
            # Only the first record of each file is needed.
            break

{
  "type": "ontology_ontology",
  "mp_id": "MP:0013681",
  "mp_term": "abnormal early germinal center B cell number",
  "hp_id": "HP:0032126",
  "hp_term": "Decreased proportion of unswitched memory B cells"
}


In [60]:
# Sanity check of what range(10) yields: the integers 0 through 9, one per line.
for c in range(0, 10, 1):
    print(str(c))

0
1
2
3
4
5
6
7
8
9


In [1]:
def label_runs(text, first_label=2, step=2):
    """Build a line of numeric labels aligned under the runs of ``text``.

    A run is a maximal stretch of identical consecutive characters. Each run
    gets the next label (``first_label``, ``first_label + step``, ...), left
    aligned and padded with spaces to the length of the run, so that printing
    the result below ``text`` puts every label under the start of its run.

    The run length is only a minimum width: a label with more digits than its
    run has characters is not truncated and shifts the labels that follow.

    Parameters
    ----------
    text : str
        The string whose runs are labelled. An empty string gives ``''``.
    first_label : int
        Label of the first run.
    step : int
        Increment between the labels of consecutive runs.

    Returns
    -------
    str
        The concatenated, padded labels (without a trailing newline).
    """
    padded_labels = []
    label = first_label
    for _, run in itertools.groupby(text):
        run_length = sum(1 for _ in run)
        padded_labels.append(f'{label:<{run_length}}')
        label += step
    return ''.join(padded_labels)


string = 'KKKKKKKEEEEEERRRRRRTTTTJJJJJJJTTTZZZZZZZZZZZRRRRR'

print(string)
print(label_runs(string))

KKKKKKKEEEEEERRRRRRTTTTJJJJJJJTTTZZZZZZZZZZZRRRRR
2      4     6     8   10     12 14         16   
