# load knowledge graphs and show infos

In [1]:
import pandas as pd

file = pd.read_csv('kg.csv')
file.head(10)

  file = pd.read_csv('kg.csv')


Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
0,protein_protein,ppi,0,9796,gene/protein,PHYHIP,NCBI,8889,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,1,7918,gene/protein,GPANK1,NCBI,2798,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,2,8233,gene/protein,ZRSR2,NCBI,5646,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,3,4899,gene/protein,NRF1,NCBI,11592,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,4,5297,gene/protein,PI4KA,NCBI,2122,8601,gene/protein,RGS20,NCBI
5,protein_protein,ppi,5,6564,gene/protein,SLC15A1,NCBI,2352,8933,gene/protein,RTL8C,NCBI
6,protein_protein,ppi,6,8668,gene/protein,EIF3I,NCBI,5164,22976,gene/protein,PAXIP1,NCBI
7,protein_protein,ppi,7,10826,gene/protein,FAXDC2,NCBI,3934,345274,gene/protein,SLC10A6,NCBI
8,protein_protein,ppi,8,4489,gene/protein,MT1A,NCBI,1785,7157,gene/protein,TP53,NCBI
9,protein_protein,ppi,9,6272,gene/protein,SORT1,NCBI,13895,54873,gene/protein,PALMD,NCBI


# infos of kg.csv

In [2]:
file['x_type'].value_counts()

x_type
drug                  2805696
gene/protein          2631229
anatomy               1566154
disease                341244
effect/phenotype       257096
biological_process     252202
molecular_function      96723
cellular_component      93102
pathway                 47716
exposure                 9336
Name: count, dtype: int64

In [30]:
file.columns

Index(['relation', 'display_relation', 'x_index', 'x_id', 'x_type', 'x_name',
       'x_source', 'y_index', 'y_id', 'y_type', 'y_name', 'y_source'],
      dtype='object')

# select entries associated with chexpert label

In [18]:
chexpert_labels = ['Atelectasis', 'Cardiomegaly', 'Consolidation',
       'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
       'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other',
       'Pneumonia', 'Pneumothorax']

kg_with_chex_label=file[(file['x_name'].isin(chexpert_labels))|
                        (file['y_name'].isin(chexpert_labels))].drop_duplicates()

# additionally, we want anatomy and phenotype knowledges as well
kg_related = file[(file['x_type']== 'anatomy' )|
                  (file['x_type']== 'effect/phenotype')].drop_duplicates().sample(10000)

In [19]:
kg_with_chex_label= pd.concat([kg_with_chex_label,kg_related])

kg_with_chex_label

Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
3062597,phenotype_protein,associated with,1114,26,gene/protein,AOC1,NCBI,84339,1640,effect/phenotype,Cardiomegaly,HPO
3062598,phenotype_protein,associated with,6309,147,gene/protein,ADRA1B,NCBI,84339,1640,effect/phenotype,Cardiomegaly,HPO
3062599,phenotype_protein,associated with,2472,150,gene/protein,ADRA2A,NCBI,84339,1640,effect/phenotype,Cardiomegaly,HPO
3062600,phenotype_protein,associated with,5752,152,gene/protein,ADRA2C,NCBI,84339,1640,effect/phenotype,Cardiomegaly,HPO
3062601,phenotype_protein,associated with,2959,156,gene/protein,GRK2,NCBI,84339,1640,effect/phenotype,Cardiomegaly,HPO
...,...,...,...,...,...,...,...,...,...,...,...,...
6802237,anatomy_protein_present,expression present,65076,2661,anatomy,superior frontal gyrus,UBERON,4571,79953,gene/protein,SYNDIG1,NCBI
7513074,anatomy_protein_present,expression present,64520,2081,anatomy,cardiac atrium,UBERON,7817,8314,gene/protein,BAP1,NCBI
7982974,anatomy_protein_present,expression present,64787,2358,anatomy,peritoneum,UBERON,79583,646912,gene/protein,RPL7P52,NCBI
7070716,anatomy_protein_present,expression present,64334,1893,anatomy,telencephalon,UBERON,4256,745,gene/protein,MYRF,NCBI


# define spo triples

spo stands for Subject Predicate Object triples.

In [26]:
class spo_df(pd.DataFrame):
    def __init__(self) -> None:
        super().__init__()
        self.insert(0,'object',pd.Series(dtype='str'))
        self.insert(0,'predicate',pd.Series(dtype='str'))
        self.insert(0,'subject',pd.Series(dtype='str'))


def add_entry_by_row(df,spo_entries:list):
    entries_df = pd.DataFrame(spo_entries,columns=df.columns)
    return pd.concat([df,entries_df])

def add_entry_by_column(df,subjects, predicates, objects):
    entries_df = pd.DataFrame({'subject':subjects,'predicate':predicates,'object':objects})
    return  pd.concat([df,entries_df])
        

## instantiate spo and add entries

In [33]:
chepert_spo = spo_df()

chepert_spo = add_entry_by_column(chepert_spo,
                    subjects = kg_with_chex_label['x_name'], 
                    predicates = 'is',
                    objects= kg_with_chex_label['x_type'])

chepert_spo = add_entry_by_column(chepert_spo,
                    subjects = kg_with_chex_label['y_name'], 
                    predicates = 'is',
                    objects= kg_with_chex_label['y_type'])

chepert_spo = add_entry_by_column(chepert_spo,
                    subjects = kg_with_chex_label['x_name'], 
                    predicates = kg_with_chex_label['display_relation'],
                    objects= kg_with_chex_label['y_name'])

# save

In [34]:
chepert_spo.to_csv('./kgs/CheXpert_KG.spo',index= False, sep='\t')

# another spo dataset

In [None]:
kg_Atelectasis = file[(file['x_name'] == 'Atelectasis') | (file['y_name'] == 'Atelectasis')]
kg_Atelectasis

In [None]:
file4= file[(file['relation']=="drug_effect" ) |
            (file['relation']=="disease_phenotype_positive") |
            (file['relation']=="phenotype_phenotype")]

In [None]:
values = kg_Atelectasis.x_name.value_counts()

In [None]:
kg_Atelectasis= pd.concat([kg_Atelectasis,
    file[(file['x_name']=='Linear atelectasis') |
         (file['y_name']=='Linear atelectasis')]])

In [None]:
kg_Atelectasis[kg_Atelectasis['y_name']=='Linear atelectasis']

In [None]:
anatomy_3000 = file[file['x_type']=='anatomy'].sample(3000)

In [None]:
ate_ana_sample = pd.concat([kg_Atelectasis, anatomy_3000])

In [None]:
ate_ana_sample['Predicate'] = ate_ana_sample['relation'] + '_' +ate_ana_sample['display_relation']

In [None]:
ate_ana_sample2= ate_ana_sample[['x_name','Predicate','y_name']]

In [None]:
ate_ana_sample = pd

In [None]:
# remove all '_' 
for serie_name in ate_ana_sample2.keys():
    ate_ana_sample2.loc[:,serie_name] = ate_ana_sample2.loc[:,serie_name].str.replace(' ','_')
    
ate_ana_sample2

In [None]:
ate_ana_sample2.to_csv('kg_anatomy3kAndAtelectasis.spo',index= False, sep='\t')