In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.spatial import distance
from scipy.stats import ttest_1samp

In [2]:
RAW = Path('data/raw/')
PROCESSED = Path('data/processed/')

BENCHMARKS = ['comorbidities', 'causative', 'ndf_rt']
SEED = 2020

## Preprocessing

In [58]:
def flatten_df(df, flat_col, col):
    df.dropna(inplace=True)
    df = pd.concat([pd.Series(row[col], row[flat_col].split(',')) 
                     for _, row in df.iterrows()]).reset_index()
    df['index'].replace('', np.nan, inplace=True)
    df.dropna(inplace=True)
    return df

In [73]:
comorb_dict = {'addison':'C0001403', 'heartdisease':'C0018799', 'obesity':'C0028754', 'prematurity':'C0151526', 
               'schizophrenia':'C0036341','type_1_diabetes':'C0011854','type_2_diabetes':'C0011860','autism':'C0004352'}
comorb = pd.read_csv(RAW/f'autism.txt', sep='\t')
comorb['CUI_head'] = [comorb_dict['autism']]*len(comorb)
comorb['String_head'] = ['autism']*len(comorb)

for dis in ['addison', 'heartdisease', 'obesity', 'prematurity', 'schizophrenia', 'type_1_diabetes', 'type_2_diabetes']:
    df = pd.read_csv(RAW/f'{dis}.txt', sep='\t')
    df['CUI_head'] = [comorb_dict[dis]]*len(df)
    df['String_head'] = [f'{dis}']*len(df)
    comorb = pd.concat([comorb, df])

comorb.rename(columns={'CUI':'CUI_tail', 'String':'String_tail'}, inplace=True)
comorb.drop(['Type'], axis=1, inplace=True)
comorb['Relationship'] = ['Comorbid Condition']*len(comorb)
comorb['Benchmark'] = ['comorbidities']*len(comorb)
comorb.to_csv(PROCESSED/'comorbidities.csv')
comorb

Unnamed: 0,CUI_tail,String_tail,CUI_head,String_head,Relationship,Benchmark
0,C0004352,Autistic Disorder,C0004352,autism,Comorbid Condition,comorbidities
1,C0004936,Mental disorders,C0004352,autism,Comorbid Condition,comorbidities
2,C0151898,Schizophrenic reaction,C0004352,autism,Comorbid Condition,comorbidities
3,C2603372,Infantile psychosis,C0004352,autism,Comorbid Condition,comorbidities
4,C0338984,Residual infantile autism,C0004352,autism,Comorbid Condition,comorbidities
...,...,...,...,...,...,...
46,C0860029,Vaginal yeast infection,C0011860,type_2_diabetes,Comorbid Condition,comorbidities
47,C1279309,Type II diabetes mellitus with mononeuropathy,C0011860,type_2_diabetes,Comorbid Condition,comorbidities
48,C1279310,Type II diabetes mellitus with polyneuropathy,C0011860,type_2_diabetes,Comorbid Condition,comorbidities
49,C1720557,Polyneuropathy associated with type II diabete...,C0011860,type_2_diabetes,Comorbid Condition,comorbidities


In [74]:
cause = pd.read_csv(RAW/f'causative_agent.txt', sep='\t')
cause['Relationship'] = ['causative_agent']*len(cause)

for i in ['cause_of', 'induces']:
    df = pd.read_csv(RAW/f'{i}.txt', sep='\t')
    df['Relationship'] = [f'{i}']*len(df)
    cause = pd.concat([cause, df])

cause.rename(columns={'CUI_Result':'CUI_tail','CUI_Cause':'CUI_head', 'Result':'String_tail', 'Cause':'String_head'}, 
             inplace=True)
cause['Benchmark'] = ['causative']*len(cause)
cause.to_csv(PROCESSED/'causative.csv')
cause

Unnamed: 0,CUI_head,CUI_tail,String_head,String_tail,Relationship,Benchmark
0,C0032143,C0572011,Tissue plasminogen activator preparation,Alteplase allergy,causative_agent,causative
1,C0024026,C0024025,Louping ill virus,Louping ill,causative_agent,causative
2,C0032144,C0573964,Plasminogen activator product,Urokinase overdose of undetermined intent,causative_agent,causative
3,C0032148,C3839797,Plasmodium,Malaria in mother complicating childbirth,causative_agent,causative
4,C0162421,C0553004,Thermoactinomyces,Thermoactinomyces species antibody,causative_agent,causative
...,...,...,...,...,...,...
640,C0981771,C0041657,"THIOPENTAL NA 2GM KIT SUSP,RTL",Unconscious,induces,causative
641,C0981770,C0041657,Thiopental sodium 500mg powder for injection s...,Unconscious,induces,causative
642,C0705766,C0011991,NIACIN 50MG/5ML ELIXIR,Diarrhea symptom,induces,causative
643,C0705766,C0016382,NIACIN 50MG/5ML ELIXIR,Face goes red,induces,causative


In [75]:
prevent = pd.read_csv(RAW/f'may_prevent.txt', sep='\t')
prevent = flatten_df(prevent, 'Condition', 'Treatment')
prevent['Relationship'] = ['May prevent']*len(prevent)
treat = pd.read_csv(RAW/f'may_treat.txt', sep='\t')
treat = flatten_df(treat, 'Condition', 'Treatment')
treat['Relationship'] = ['May treat']*len(treat)
drug = pd.concat([prevent, treat])
drug.rename({'index':'CUI_tail', 0:'CUI_head'}, inplace=True, axis=1)
drug['String_head'] = ['']*len(drug)
drug['String_tail'] = ['']*len(drug)
drug['Benchmark'] = ['ndf_rt']*len(drug)
drug.to_csv(PROCESSED/'ndf_rt.csv')
drug

Unnamed: 0,CUI_tail,CUI_head,Relationship,String_head,String_tail,Benchmark
0,C0151744,C0288672,May prevent,,,ndf_rt
2,C0022672,C0001047,May prevent,,,ndf_rt
3,C0032787,C0001047,May prevent,,,ndf_rt
5,C0080233,C0001134,May prevent,,,ndf_rt
6,C0011334,C0001134,May prevent,,,ndf_rt
...,...,...,...,...,...,...
7070,C0019521,C0014994,May treat,,,ndf_rt
7071,C0038218,C0014994,May treat,,,ndf_rt
7072,C0023048,C0014994,May treat,,,ndf_rt
7073,C0030193,C0014994,May treat,,,ndf_rt


In [189]:
sem_types = pd.read_csv(RAW/'sem_types.csv', encoding='latin-1', index_col=1)
sem_types.drop(['Unnamed: 0'], inplace=True, axis=1)
vec_exists = list(set(sem_types.index.values).intersection(set(cui_vecs.index.values)))
df = sem_types.loc[vec_exists]
df.to_csv(PROCESSED/'sem_types_filtered.csv')

In [210]:
all_benchmarks = pd.concat([comorb, cause, drug])
all_benchmarks = all_benchmarks[all_benchmarks['CUI_head'].isin(vec_exists)]
all_benchmarks = all_benchmarks[all_benchmarks['CUI_tail'].isin(vec_exists)]
all_benchmarks.to_csv(PROCESSED/'all_benchmarks.csv')

In [204]:
len(all_benchmarks)

15936

In [206]:
len(all_benchmarks)

5719

In [211]:
len(all_benchmarks)

5004

## Evaluation

In [None]:
# for a benchmark, loop over list of known pairs
# for each pair, check sem types of both and replace with random cuis 
# 

In [6]:
sem_types_filtered = pd.read_csv(PROCESSED/'sem_types_filtered.csv', index_col=0)
sem_types_filtered

Unnamed: 0_level_0,SemanticType,String
CUI,Unnamed: 1_level_1,Unnamed: 2_level_1
C2895257,Disease or Syndrome,anterior spinal artery compression syndrome ce...
C2905192,Injury or Poisoning,"Exposure to tanning bed, sequela"
C2872931,Injury or Poisoning,"Corrosion of unspecified degree of left knee, ..."
C1261365,Finding,Abnormal drug levels in respiratory organs and...
C2842404,Injury or Poisoning,Posterior subluxation of left sternoclavicular...
...,...,...
C0029387,Clinical Attribute,Osmolarity
C2833062,Injury or Poisoning,Laceration with foreign body of pharynx and ce...
C2895854,Disease or Syndrome,"Abscess of tendon sheath, left lower leg"
C2911302,Finding,Personal history of malignant neoplasm of soft...


In [7]:
cui_vecs = pd.read_csv(RAW/'cui2vec_pretrained.csv', index_col=0)
cui_vecs

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V491,V492,V493,V494,V495,V496,V497,V498,V499,V500
C0000052,-0.004071,0.002169,-4.466913e-17,-0.006110,-0.001187,0.000047,0.007681,-0.002564,0.012169,8.510987e-17,...,-0.000316,-0.049984,-0.022582,-0.041039,0.003112,0.025915,-0.065661,0.004741,-0.004502,-0.029053
C0000163,-0.008389,0.002486,-3.816392e-17,-0.004083,-0.002875,0.001207,0.005683,0.002054,0.007849,4.466913e-17,...,0.053730,0.048069,-0.058616,-0.013041,-0.005875,-0.000951,-0.007084,0.057521,-0.008791,0.001159
C0000167,-0.008328,0.002697,-5.811324e-17,-0.014372,0.004227,0.008176,0.007521,-0.000114,0.012576,5.160802e-17,...,0.031376,0.026515,-0.064476,0.040333,-0.005710,-0.027542,0.003274,0.062406,0.020109,-0.014616
C0000172,-0.008589,0.002666,-1.734723e-17,-0.001215,-0.001841,-0.001273,0.002268,-0.000551,0.003284,-1.734723e-17,...,0.007605,0.009991,-0.035895,-0.014936,0.003336,0.001816,-0.007257,0.014691,0.001276,0.001881
C0000215,-0.001324,-0.000040,-1.561251e-17,-0.002030,-0.000095,0.000728,0.003543,-0.001463,0.003100,4.856887e-17,...,-0.012399,-0.002617,-0.020168,0.027851,0.002107,-0.013557,0.019516,-0.000823,0.025063,-0.002061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0439491,-0.007018,0.004070,-4.683753e-17,-0.006886,0.006628,0.014713,0.011321,-0.012718,0.021551,3.263449e-16,...,-0.014483,0.013615,-0.008099,-0.002717,0.017188,0.001512,-0.004248,-0.015843,0.003936,-0.010559
C1233027,-0.002838,-0.000408,4.336809e-18,0.000034,0.003287,0.000240,0.001862,-0.000448,0.004030,9.194034e-17,...,-0.021149,0.009911,-0.026446,0.012468,0.011699,-0.006620,-0.000096,-0.016387,-0.001500,-0.001500
C0393676,-0.008120,0.005053,2.211772e-17,-0.001729,0.000976,-0.006364,0.010354,-0.006720,0.017869,1.435484e-16,...,0.003785,-0.029808,0.016210,-0.023907,0.016596,0.001208,-0.009797,0.024946,0.002593,0.000229
C0022275,-0.002968,0.001307,-1.344411e-17,0.000586,-0.000053,-0.000460,0.001939,-0.001210,0.007292,6.104058e-17,...,-0.009566,-0.003285,-0.004036,0.005388,0.008718,0.002527,-0.016883,0.005438,-0.005547,-0.005875


In [8]:
all_benchmarks = pd.read_csv(PROCESSED/'all_benchmarks.csv')

In [10]:
def sample_same_semantic(cui, n_samples):
    try:
        sem_list = sem_types_filtered.loc[cui]['SemanticType'].values
    except AttributeError:
        sem_list = [sem_types_filtered.loc[cui]['SemanticType']]
    df = sem_types_filtered[sem_types_filtered['SemanticType'].isin(sem_list)].drop([cui])
    samples = df.sample(n=n_samples, replace=True).index.values
    return samples

In [11]:
def run_benchmark(df, benchmark, n_samples=10000):
    bench_df = df[df['Benchmark']==benchmark].reset_index()
    relationships = np.zeros(len(bench_df))
    for i, row in bench_df.iterrows():
        real_sim = 1 - (distance.cosine(cui_vecs.loc[row['CUI_head']].values, cui_vecs.loc[row['CUI_tail']].values))
        head_samples = sample_same_semantic(row['CUI_head'], n_samples=n_samples)
        tail_samples = sample_same_semantic(row['CUI_tail'], n_samples=n_samples)
        null_sims= np.array([1-(distance.cosine(cui_vecs.loc[head_samples[sample]],cui_vecs.loc[tail_samples[sample]])) 
              for sample in range(n_samples)])  
        relationships[i] = (np.sum(null_sims>real_sim)/n_samples)<0.05
    return relationships                                                                                                                 

In [12]:
for bench in BENCHMARKS:
    r = run_benchmark(all_benchmarks, benchmark=bench)
    power = np.sum(r)/len(r)
    print(f'Power of benchmark {bench} is {power}')

Power of benchmark comorbidities is 0.45604395604395603


In [13]:
for bench in BENCHMARKS[1:]:
    r = run_benchmark(all_benchmarks, benchmark=bench)
    power = np.sum(r)/len(r)
    print(f'Power of benchmark {bench} is {power}')

Power of benchmark causative is 0.5180327868852459
Power of benchmark ndf_rt is 0.7562541509851671
