### Uniprot

In [None]:
import requests
from io import StringIO
import pandas as pd

def fetch_uniprot_data(params):
    url = "https://rest.uniprot.org/uniprotkb/stream"

    response = requests.get(url, params=params)

    if response.ok:
        tsv_data = StringIO(response.text)
        df = pd.read_csv(tsv_data, sep='\t')
        return df
    else:
        print("Failed to fetch data:", response.status_code)
        print(response.text)
        return None
# change the parameters to fetch the data
# load protein information
params = {
        'fields': 'accession,cc_disease',
        'format': 'tsv',
        'query': '(model_organism:9606) AND (reviewed:true)',
        'sort': 'organism_name asc'
    }

df_uniprot = fetch_uniprot_data(params)
if df_uniprot is not None:
    print(df_uniprot)
else:
    print("No data retrieved.")
#save the data to a CSV file
df_uniprot.to_csv('uniprot_protein_disease.csv', index=False)

In [2]:
# Download Link: API
# Download Date: 2025-03-21
# Download Version: 2025-03-21
import re
import pandas as pd

df_uniprot = pd.read_csv('uniprot_protein_disease.csv', sep=',')
df_uniprot = df_uniprot.dropna(subset=['Involvement in disease'])

def extract_brackets(text):
    matches = re.findall(r'\[([^\[\]]*)\]', text)
    return matches[0] if matches else None

df_uniprot['OMIM'] = df_uniprot['Involvement in disease'].apply(extract_brackets)
df_uniprot = df_uniprot.drop(columns=['Involvement in disease'])
df_uniprot = df_uniprot.dropna(subset=['OMIM'])
df_uniprot['OMIM'] = df_uniprot['OMIM'].str.replace('MIM:', '')
df_uniprot['OMIM'] = df_uniprot['OMIM'].str.split(';')
df_uniprot = df_uniprot.explode('OMIM')
df_uniprot

Unnamed: 0,Entry,OMIM
244,A0A1B0GTQ4,619941
251,A0A1B0GTW7,619702
344,A0A1W2PR82,618662
420,A0AVF1,619534
444,A0PJY2,616030
...,...,...
20394,Q9Y6X0,269150
20402,Q9Y6X9,616688
20403,Q9Y6Y0,618969
20404,Q9Y6Y1,614756


### DISEASES

In [3]:
# Download Link: https://download.jensenlab.org/human_disease_knowledge_full.tsv
# Download Date: 2025-03-21
# Download Version: 2025-03-15
import pandas as pd

column_name = ['protein', 'name', 'disease', 'disease name', 'source database','evidence type','confidence score']
df_diseases = pd.read_csv('human_disease_knowledge_full.tsv', sep='\t', names=column_name)

# filter df_diseases['protein'] to only contain ENSEMBL IDs
df_diseases = df_diseases[df_diseases['protein'].str.contains('ENSP')]
df_diseases_filter = df_diseases[['protein', 'disease']].drop_duplicates()
df_diseases_filter

Unnamed: 0,protein,disease
9,ENSP00000001146,DOID:0080001
10,ENSP00000001146,DOID:0080006
11,ENSP00000001146,DOID:11971
12,ENSP00000001146,DOID:17
13,ENSP00000001146,DOID:1934
...,...,...
96305,ENSP00000501180,ICD10:C80
96306,ENSP00000501180,ICD10:D
96307,ENSP00000501180,ICD10:D4
96308,ENSP00000501180,ICD10:D48


### HPO

In [4]:
# Download Link: https://hpo.jax.org/data/annotations
# Download Date: 2025-03-21
# Download Version: unknown

df_hpo = pd.read_csv('genes_to_disease.txt', sep='\t')
df_hpo = df_hpo[['ncbi_gene_id', 'disease_id']]
df_hpo = df_hpo.drop_duplicates()
df_hpo = df_hpo.dropna()
df_hpo

Unnamed: 0,ncbi_gene_id,disease_id
0,NCBIGene:64170,OMIM:212050
1,NCBIGene:51256,OMIM:248000
2,NCBIGene:28981,OMIM:617895
3,NCBIGene:8216,OMIM:616564
4,NCBIGene:6505,OMIM:615232
...,...,...
15588,NCBIGene:55901,ORPHA:231160
15589,NCBIGene:7049,ORPHA:231160
15590,NCBIGene:1281,ORPHA:231160
15591,NCBIGene:83854,ORPHA:231160


In [5]:
df_hpo['ncbi_gene_id'] = df_hpo['ncbi_gene_id'].replace('NCBIGene:', '', regex=True)
df_hpo[['disease_database', 'disease_id_number']] = df_hpo['disease_id'].str.split(':', expand=True)
df_hpo.drop(columns=['disease_id'], inplace=True)
df_hpo_omim = df_hpo[df_hpo['disease_database'] == 'OMIM']
df_hpo_omim.drop(columns=['disease_database'], inplace=True)
df_hpo_omim

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hpo_omim.drop(columns=['disease_database'], inplace=True)


Unnamed: 0,ncbi_gene_id,disease_id_number
0,64170,212050
1,51256,248000
2,28981,617895
3,8216,616564
4,6505,615232
...,...,...
7478,5297,619708
7479,673,163950
7480,5604,163950
7481,5781,163950


### DisGeNet

In [6]:
# Download Link: API
# Download Date: 2025-03-21
# Download Version: 2025-03-21

df_disgenet = pd.read_csv('DisGeNet_Gene_Disease_UMLS.csv')
df_disgenet['umls_code'] = df_disgenet['umls_code'].replace('UMLS_', '', regex=True)
df_disgenet

Unnamed: 0,gene_id,gene_symbol,umls_code
0,1,A1BG,C0036341
1,1,A1BG,C0019209
2,2,A2M,C0002395
3,2,A2M,C0024121
4,2,A2M,C0011581
...,...,...,...
91479,132090497,LOC132090497,C3279775
91480,132090498,LOC132090498,C3279775
91481,132090521,LOC132090521,C4285231
91482,132090595,LOC132090595,C4225343


In [7]:
df_disgenet_protein_disease = df_disgenet[['gene_id', 'umls_code']]
df_disgenet_protein_disease.drop_duplicates(inplace=True)
df_disgenet_protein_disease

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_disgenet_protein_disease.drop_duplicates(inplace=True)


Unnamed: 0,gene_id,umls_code
0,1,C0036341
1,1,C0019209
2,2,C0002395
3,2,C0024121
4,2,C0011581
...,...,...
91479,132090497,C3279775
91480,132090498,C3279775
91481,132090521,C4285231
91482,132090595,C4225343


### BioMedgraphica ID

In [8]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_protein = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein' / 'BioMedGraphica_Protein.csv'
target_dir_disease = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Disease' / 'BioMedGraphica_Disease.csv'
biomedgraphica_protein = pd.read_csv(target_dir_protein, dtype=str)
biomedgraphica_disease = pd.read_csv(target_dir_disease, dtype=str)

### UniProt Mapping

UniProt ID

In [9]:
uniprot_individualid = biomedgraphica_protein[['Uniprot_ID', 'BioMedGraphica_ID']]
uniprot_individualid.dropna(subset=['Uniprot_ID'], inplace=True)
uniprot_individualid = uniprot_individualid.assign(Uniprot_ID=uniprot_individualid['Uniprot_ID'].str.split(';')).explode('Uniprot_ID')
uniprot_to_individualid = uniprot_individualid.groupby('Uniprot_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_uniprot['From_ID'] = df_uniprot['Entry'].map(uniprot_to_individualid)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_individualid.dropna(subset=['Uniprot_ID'], inplace=True)


OMIM ID

In [10]:
omim_individualid = biomedgraphica_disease[['OMIM_ID', 'BioMedGraphica_ID']]
omim_individualid.dropna(subset=['OMIM_ID'], inplace=True)
omim_individualid = omim_individualid.assign(OMIM_ID=omim_individualid['OMIM_ID'].str.split(';')).explode('OMIM_ID')
omim_to_individualid = omim_individualid.groupby('OMIM_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_uniprot['To_ID'] = df_uniprot['OMIM'].map(omim_to_individualid)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  omim_individualid.dropna(subset=['OMIM_ID'], inplace=True)


Uniprot Protein-Disease

In [11]:
uniprot_protein_disease = df_uniprot[['From_ID', 'To_ID']]
uniprot_protein_disease.dropna(subset=['From_ID'], inplace=True)
uniprot_protein_disease.dropna(subset=['To_ID'], inplace=True)
uniprot_protein_disease.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4784 entries, 244 to 20408
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   From_ID  4784 non-null   object
 1   To_ID    4784 non-null   object
dtypes: object(2)
memory usage: 112.1+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_protein_disease.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_protein_disease.dropna(subset=['To_ID'], inplace=True)


In [12]:
uniprot_protein_disease['From_ID'] = uniprot_protein_disease['From_ID'].str.split(';')
uniprot_protein_disease['To_ID'] = uniprot_protein_disease['To_ID'].str.split(';')

uniprot_protein_disease = uniprot_protein_disease.explode('From_ID')
uniprot_protein_disease = uniprot_protein_disease.explode('To_ID')
uniprot_protein_disease.drop_duplicates(inplace=True)
uniprot_protein_disease.reset_index(drop=True, inplace=True)
uniprot_protein_disease

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_protein_disease['From_ID'] = uniprot_protein_disease['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uniprot_protein_disease['To_ID'] = uniprot_protein_disease['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PT003616,BMG_DS072594
1,BMG_PT003629,BMG_DS072482
2,BMG_PT004067,BMG_DS068410
3,BMG_PT007034,BMG_DS071340
4,BMG_PT007277,BMG_DS058366
...,...,...
5568,BMG_PT100448,BMG_DS080706
5569,BMG_PT100449,BMG_DS069728
5570,BMG_PT100450,BMG_DS055956
5571,BMG_PT100462,BMG_DS028482


### DISEASES Mapping

Ensembl ID

In [13]:
ensembl_individualid = biomedgraphica_protein[['Ensembl_Protein_ID', 'BioMedGraphica_ID']]
ensembl_individualid.dropna(subset=['Ensembl_Protein_ID'], inplace=True)

ensembl_to_individualid = ensembl_individualid.groupby('Ensembl_Protein_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_diseases_filter['From_ID'] = df_diseases_filter['protein'].map(ensembl_to_individualid)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ensembl_individualid.dropna(subset=['Ensembl_Protein_ID'], inplace=True)


DO ID

In [14]:
do_individualid = biomedgraphica_disease[['DO_ID', 'BioMedGraphica_ID']]
do_individualid.dropna(subset=['DO_ID'], inplace=True)
do_individualid = do_individualid.assign(DO_ID=do_individualid['DO_ID'].str.split(';')).explode('DO_ID')
do_to_individualid = do_individualid.groupby('DO_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_diseases_filter['To_ID_DO'] = df_diseases_filter['disease'].map(do_to_individualid)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  do_individualid.dropna(subset=['DO_ID'], inplace=True)


ICD10 ID

In [15]:
icd10_individualid = biomedgraphica_disease[['ICD10_ID', 'BioMedGraphica_ID']]
icd10_individualid.dropna(subset=['ICD10_ID'], inplace=True)
icd10_individualid = icd10_individualid.assign(ICD10_ID=icd10_individualid['ICD10_ID'].str.split(';')).explode('ICD10_ID')
icd10_to_individualid = icd10_individualid.groupby('ICD10_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_diseases_filter['disease'] = df_diseases_filter['disease'].replace('ICD10:', '', regex=True)
df_diseases_filter['To_ID_ICD10'] = df_diseases_filter['disease'].map(icd10_to_individualid)
df_diseases_filter

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  icd10_individualid.dropna(subset=['ICD10_ID'], inplace=True)


Unnamed: 0,protein,disease,From_ID,To_ID_DO,To_ID_ICD10
9,ENSP00000001146,DOID:0080001,,BMG_DS000383,
10,ENSP00000001146,DOID:0080006,,,
11,ENSP00000001146,DOID:11971,,BMG_DS003103,
12,ENSP00000001146,DOID:17,,BMG_DS002064,
13,ENSP00000001146,DOID:1934,,BMG_DS000909,
...,...,...,...,...,...
96305,ENSP00000501180,C80,,,
96306,ENSP00000501180,D,,,
96307,ENSP00000501180,D4,,,
96308,ENSP00000501180,D48,,,


In [16]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

df_diseases_filter = merge_string_columns(df_diseases_filter, ['To_ID_DO', 'To_ID_ICD10'], 'To_ID')
df_diseases_filter.replace('', pd.NA, inplace=True)
df_diseases_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69896 entries, 9 to 96309
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   protein  69896 non-null  object
 1   disease  69896 non-null  object
 2   From_ID  23538 non-null  object
 3   To_ID    32714 non-null  object
dtypes: object(4)
memory usage: 2.7+ MB


In [17]:
diseases_protein_disease = df_diseases_filter[['From_ID', 'To_ID']]
diseases_protein_disease.dropna(subset=['From_ID'], inplace=True)
diseases_protein_disease.dropna(subset=['To_ID'], inplace=True)
diseases_protein_disease.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11108 entries, 89 to 96083
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   From_ID  11108 non-null  object
 1   To_ID    11108 non-null  object
dtypes: object(2)
memory usage: 260.3+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases_protein_disease.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases_protein_disease.dropna(subset=['To_ID'], inplace=True)


In [18]:
diseases_protein_disease['From_ID'] = diseases_protein_disease['From_ID'].str.split(';')
diseases_protein_disease['To_ID'] = diseases_protein_disease['To_ID'].str.split(';')

diseases_protein_disease = diseases_protein_disease.explode('From_ID')
diseases_protein_disease = diseases_protein_disease.explode('To_ID')
diseases_protein_disease.drop_duplicates(inplace=True)
diseases_protein_disease.reset_index(drop=True, inplace=True)

diseases_protein_disease

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases_protein_disease['From_ID'] = diseases_protein_disease['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diseases_protein_disease['To_ID'] = diseases_protein_disease['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PT041395,BMG_DS002005
1,BMG_PT041395,BMG_DS000127
2,BMG_PT041395,BMG_DS031527
3,BMG_PT041395,BMG_DS024549
4,BMG_PT041395,BMG_DS000523
...,...,...
12382,BMG_PT159347,BMG_DS056411
12383,BMG_PT159347,BMG_DS003101
12384,BMG_PT159347,BMG_DS000603
12385,BMG_PT159347,BMG_DS000858


### HPO Mapping

NCBI Gene ID

In [19]:
ncbi_individual = biomedgraphica_protein[['NCBI_Gene_ID', 'BioMedGraphica_ID']]
ncbi_individual.dropna(subset=['NCBI_Gene_ID'], inplace=True)
ncbi_individual = ncbi_individual.assign(NCBI_Gene_ID=ncbi_individual['NCBI_Gene_ID'].str.split(';')).explode('NCBI_Gene_ID')
ncbi_to_individual = ncbi_individual.groupby('NCBI_Gene_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_hpo_omim['From_ID'] = df_hpo_omim['ncbi_gene_id'].map(ncbi_to_individual)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ncbi_individual.dropna(subset=['NCBI_Gene_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hpo_omim['From_ID'] = df_hpo_omim['ncbi_gene_id'].map(ncbi_to_individual)


OMIM ID

In [20]:
omim_individual = biomedgraphica_disease[['OMIM_ID', 'BioMedGraphica_ID']]
omim_individual.dropna(subset=['OMIM_ID'], inplace=True)
omim_individual = omim_individual.assign(OMIM_ID=omim_individual['OMIM_ID'].str.split(';')).explode('OMIM_ID')
omim_to_individual = omim_individual.groupby('OMIM_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_hpo_omim['To_ID'] = df_hpo_omim['disease_id_number'].astype(str).map(omim_to_individual)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  omim_individual.dropna(subset=['OMIM_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hpo_omim['To_ID'] = df_hpo_omim['disease_id_number'].astype(str).map(omim_to_individual)


In [21]:
hpo_protein_disease = df_hpo_omim[['From_ID', 'To_ID']]
hpo_protein_disease.dropna(subset=['From_ID'], inplace=True)
hpo_protein_disease.dropna(subset=['To_ID'], inplace=True)
hpo_protein_disease

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_protein_disease.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_protein_disease.dropna(subset=['To_ID'], inplace=True)


Unnamed: 0,From_ID,To_ID
0,BMG_PT088456,BMG_DS040360
1,BMG_PT094497;BMG_PT108327;BMG_PT145971,BMG_DS056872
2,BMG_PT078492;BMG_PT131086,BMG_DS065541
3,BMG_PT073254,BMG_DS060167
4,BMG_PT040769,BMG_DS056936;BMG_DS080612
...,...,...
7478,BMG_PT040734,BMG_DS072485
7479,BMG_PT039160;BMG_PT101364;BMG_PT117859;BMG_PT1...,BMG_DS065281
7480,BMG_PT042652;BMG_PT165800;BMG_PT167291,BMG_DS065281
7481,BMG_PT042959,BMG_DS065281


In [22]:
hpo_protein_disease['From_ID'] = hpo_protein_disease['From_ID'].str.split(';')
hpo_protein_disease['To_ID'] = hpo_protein_disease['To_ID'].str.split(';')

hpo_protein_disease = hpo_protein_disease.explode('From_ID')
hpo_protein_disease = hpo_protein_disease.explode('To_ID')
hpo_protein_disease.drop_duplicates(inplace=True)
hpo_protein_disease

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_protein_disease['From_ID'] = hpo_protein_disease['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hpo_protein_disease['To_ID'] = hpo_protein_disease['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PT088456,BMG_DS040360
1,BMG_PT094497,BMG_DS056872
1,BMG_PT108327,BMG_DS056872
1,BMG_PT145971,BMG_DS056872
2,BMG_PT078492,BMG_DS065541
...,...,...
7480,BMG_PT165800,BMG_DS065281
7480,BMG_PT167291,BMG_DS065281
7481,BMG_PT042959,BMG_DS065281
7482,BMG_PT058218,BMG_DS040546


### DisGeNet Mapping

UMLS ID

In [23]:
umls_individual = biomedgraphica_disease[['UMLS_ID', 'BioMedGraphica_ID']]
umls_individual.dropna(subset=['UMLS_ID'], inplace=True)
umls_to_individual = umls_individual.groupby('UMLS_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

df_disgenet_protein_disease['From_ID'] = df_disgenet_protein_disease['gene_id'].astype(str).map(ncbi_to_individual)
df_disgenet_protein_disease['To_ID'] = df_disgenet_protein_disease['umls_code'].map(umls_to_individual)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  umls_individual.dropna(subset=['UMLS_ID'], inplace=True)


In [24]:
disgenet_protein_disease = df_disgenet_protein_disease[['From_ID', 'To_ID']]
disgenet_protein_disease.dropna(subset=['From_ID'], inplace=True)
disgenet_protein_disease.dropna(subset=['To_ID'], inplace=True)
disgenet_protein_disease.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70968 entries, 0 to 89413
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   From_ID  70968 non-null  object
 1   To_ID    70968 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disgenet_protein_disease.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disgenet_protein_disease.dropna(subset=['To_ID'], inplace=True)


In [25]:
disgenet_protein_disease['From_ID'] = disgenet_protein_disease['From_ID'].str.split(';')
disgenet_protein_disease['To_ID'] = disgenet_protein_disease['To_ID'].str.split(';')

disgenet_protein_disease = disgenet_protein_disease.explode('From_ID')
disgenet_protein_disease = disgenet_protein_disease.explode('To_ID')
disgenet_protein_disease.drop_duplicates(inplace=True)

disgenet_protein_disease

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disgenet_protein_disease['From_ID'] = disgenet_protein_disease['From_ID'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disgenet_protein_disease['To_ID'] = disgenet_protein_disease['To_ID'].str.split(';')


Unnamed: 0,From_ID,To_ID
0,BMG_PT037915,BMG_DS002896
2,BMG_PT037614,BMG_DS000127
3,BMG_PT037614,BMG_DS001865
4,BMG_PT037614,BMG_DS000799
5,BMG_PT037614,BMG_DS002055
...,...,...
89409,BMG_PT157454,BMG_DS040210
89410,BMG_PT157454,BMG_DS029621
89411,BMG_PT172849,BMG_DS028298
89412,BMG_PT172849,BMG_DS040210


### Protein-Disease Relation

In [26]:
uniprot_protein_disease['source1'] = 'Uniprot'
diseases_protein_disease['source2'] = 'DISEASES'
hpo_protein_disease['source3'] = 'HPO'
disgenet_protein_disease['source4'] = 'DisGeNet'

protein_disease = pd.merge(uniprot_protein_disease, diseases_protein_disease, on=['From_ID', 'To_ID'], how='outer')
protein_disease = pd.merge(protein_disease, hpo_protein_disease, on=['From_ID', 'To_ID'], how='outer')
protein_disease = pd.merge(protein_disease, disgenet_protein_disease, on=['From_ID', 'To_ID'], how='outer')
protein_disease

Unnamed: 0,From_ID,To_ID,source1,source2,source3,source4
0,BMG_PT001139,BMG_DS001673,,,,DisGeNet
1,BMG_PT002497,BMG_DS000490,,,,DisGeNet
2,BMG_PT002497,BMG_DS000565,,,,DisGeNet
3,BMG_PT002497,BMG_DS028100,,,,DisGeNet
4,BMG_PT003590,BMG_DS060265,,,,DisGeNet
...,...,...,...,...,...,...
143389,BMG_PT173589,BMG_DS064137,,,HPO,DisGeNet
143390,BMG_PT173589,BMG_DS065345,,,HPO,DisGeNet
143391,BMG_PT173595,BMG_DS002896,,,,DisGeNet
143392,BMG_PT173596,BMG_DS002896,,,,DisGeNet


In [27]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

protein_disease = merge_string_columns(protein_disease, ['source1', 'source2', 'source3', 'source4'], 'Source')
protein_disease

Unnamed: 0,From_ID,To_ID,Source
0,BMG_PT001139,BMG_DS001673,DisGeNet
1,BMG_PT002497,BMG_DS000490,DisGeNet
2,BMG_PT002497,BMG_DS000565,DisGeNet
3,BMG_PT002497,BMG_DS028100,DisGeNet
4,BMG_PT003590,BMG_DS060265,DisGeNet
...,...,...,...
143389,BMG_PT173589,BMG_DS064137,DisGeNet;HPO
143390,BMG_PT173589,BMG_DS065345,DisGeNet;HPO
143391,BMG_PT173595,BMG_DS002896,DisGeNet
143392,BMG_PT173596,BMG_DS002896,DisGeNet


In [28]:
protein_disease['Type'] = 'Protein-Disease'

max_length = len(str(len(protein_disease)))
protein_disease['BioMedGraphica_ID'] = ['BMG_ED_PTDS' + str(i).zfill(max_length) for i in range(1, len(protein_disease) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in protein_disease.columns if col != 'BioMedGraphica_ID']  # re-order columns
protein_disease = protein_disease[columns]
protein_disease

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
0,BMG_ED_PTDS000001,BMG_PT001139,BMG_DS001673,DisGeNet,Protein-Disease
1,BMG_ED_PTDS000002,BMG_PT002497,BMG_DS000490,DisGeNet,Protein-Disease
2,BMG_ED_PTDS000003,BMG_PT002497,BMG_DS000565,DisGeNet,Protein-Disease
3,BMG_ED_PTDS000004,BMG_PT002497,BMG_DS028100,DisGeNet,Protein-Disease
4,BMG_ED_PTDS000005,BMG_PT003590,BMG_DS060265,DisGeNet,Protein-Disease
...,...,...,...,...,...
143389,BMG_ED_PTDS143390,BMG_PT173589,BMG_DS064137,DisGeNet;HPO,Protein-Disease
143390,BMG_ED_PTDS143391,BMG_PT173589,BMG_DS065345,DisGeNet;HPO,Protein-Disease
143391,BMG_ED_PTDS143392,BMG_PT173595,BMG_DS002896,DisGeNet,Protein-Disease
143392,BMG_ED_PTDS143393,BMG_PT173596,BMG_DS002896,DisGeNet,Protein-Disease


In [29]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Protein-Disease'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_Disease.csv'
protein_disease.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Protein-Disease\BioMedGraphica_Protein_Disease.csv
