# BioMedGraphica Protein

## 1. Data Access
### Direct Download Links  
**RefSeq**: Can be downloaded directly via the link without the need for registration. [Link1](https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_refseq_uniprotkb_collab.gz); [Link2](https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz)

### Ensembl API

In [3]:
import pandas as pd
from pybiomart import Server

def list_attributes():
    server = Server(host='http://www.ensembl.org')
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    attributes = dataset.list_attributes()
    return attributes

attributes = list_attributes()

def fetch_ensembl_data(attributes):
    server = Server(host='http://www.ensembl.org')
    #https://www.ensembl.org/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL
    #this link shows that hsapiens_gene_ensembl is the GRCh38.p14
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    
    response = dataset.query(attributes)
    
    return response

attributes=['ensembl_peptide_id', 'ensembl_peptide_id_version','uniprotswissprot', 'refseq_peptide', 'entrezgene_id', 'external_gene_name']
df_ensembl = fetch_ensembl_data(attributes)
df_ensembl.to_csv('ensembl_protein.csv', index=False)

### UniProt API

In [2]:
import requests
from io import StringIO
import pandas as pd

def fetch_uniprot_data(params):
    url = "https://rest.uniprot.org/uniprotkb/stream"

    response = requests.get(url, params=params)

    if response.ok:
        tsv_data = StringIO(response.text)
        df = pd.read_csv(tsv_data, sep='\t')
        return df
    else:
        print("Failed to fetch data:", response.status_code)
        print(response.text)
        return None
# change the parameters to fetch the data
# load protein information
params = {
        'fields': 'accession,protein_name,gene_primary,xref_ensembl_full,xref_geneid',
        'format': 'tsv',
        'query': '(model_organism:9606) AND (reviewed:true)',
        'sort': 'organism_name asc'
    }

df_uniprot = fetch_uniprot_data(params)
if df_uniprot is not None:
    print(df_uniprot)
else:
    print("No data retrieved.")
#save the data to a CSV file
df_uniprot.to_csv('uniprot.csv', index=False)

            Entry                                      Protein names  \
0      A0A024R1R8        Translation machinery-associated protein 7B   
1      A0A024RBG1  Diphosphoinositol polyphosphate phosphohydrola...   
2      A0A075B6H7  Probable non-functional immunoglobulin kappa v...   
3      A0A075B6H8  Probable non-functional immunoglobulin kappa v...   
4      A0A075B6H9                Immunoglobulin lambda variable 4-69   
...           ...                                                ...   
20412      U3KPV4  Alpha-1,3-galactosyltransferase 2 (EC 2.4.1.87...   
20413      W5XKT8  Sperm acrosome membrane-associated protein 6 (...   
20414      W6CW81  Pyrin domain-containing protein 5 (Pyrin domai...   
20415      X6R8D5  Putative uncharacterized protein CIMIP3 (Cilia...   
20416      X6R8R1                                  Synaptotagmin-15B   

      Gene Names (primary)                                            Ensembl  \
0                    TMA7B  ENST00000424496.3; ENSP000

In [3]:
import requests
from io import StringIO

def fetch_uniprot_data(params):
    url = "https://rest.uniprot.org/uniprotkb/stream"

    response = requests.get(url, params=params)

    if response.ok:
        tsv_data = StringIO(response.text)
        df = pd.read_csv(tsv_data, sep='\t')
        return df
    else:
        print("Failed to fetch data:", response.status_code)
        print(response.text)
        return None
    
# change the parameters to fetch the data
# load description of proteins
params = {
        'fields': 'accession,cc_function',
        'format': 'tsv',
        'query': '(model_organism:9606) AND (reviewed:true)',
        'sort': 'organism_name asc'
    }

df_uniprot = fetch_uniprot_data(params)
if df_uniprot is not None:
    print(df_uniprot)
else:
    print("No data retrieved.")
#save the data to a CSV file
df_uniprot.to_csv('uniprot_protein_description.csv', index=False)

            Entry                                      Function [CC]
0      A0A024R1R8                                                NaN
1      A0A024RBG1  FUNCTION: Cleaves a beta-phosphate from the di...
2      A0A075B6H7  FUNCTION: Probable non-functional open reading...
3      A0A075B6H8  FUNCTION: Probable non-functional open reading...
4      A0A075B6H9  FUNCTION: V region of the variable domain of i...
...           ...                                                ...
20412      U3KPV4  FUNCTION: Synthesizes the galactose-alpha(1,3)...
20413      W5XKT8  FUNCTION: Sperm protein required for fusion of...
20414      W6CW81  FUNCTION: Functions as an inhibitor of DNA vir...
20415      X6R8D5                                                NaN
20416      X6R8R1                                                NaN

[20417 rows x 2 columns]


### RefSeq Data Pre-Process

In [None]:
import pandas as pd

def filter_data(input_file, output_file):

    df = pd.read_csv(input_file)
    
    filtered_df = df[(df['NCBI_tax_id'] == 9606) & (df['UniProtKB_tax_id'] == 9606)]
    
    filtered_df.to_csv(output_file, index=False)

# replace the input_file and output_file with the path of the files in your system
filter_data('gene_refseq_uniprotkb_collab', 'refseq_uniprot_human.csv')

## 2. Load Data

### 2.1 Ensembl data

In [8]:
import pandas as pd

df_ensembl = pd.read_csv('ensembl_protein.csv', dtype=str)

df_ensembl_merge = df_ensembl.groupby('Protein stable ID version').agg({
    'Protein stable ID': lambda x: ';'.join(x.dropna().unique()),
    'Gene name': lambda x: ';'.join(x.dropna().unique()),
    'UniProtKB/Swiss-Prot ID': lambda x: ';'.join(x.dropna().unique()),
    'RefSeq peptide ID': lambda x: ';'.join(x.dropna().unique()),
    'NCBI gene (formerly Entrezgene) ID': lambda x: ';'.join(x.dropna().unique())
})
df_ensembl_merge.replace('', pd.NA, inplace=True)
df_ensembl_merge.reset_index(inplace=True)
df_ensembl_merge

Unnamed: 0,Protein stable ID version,Protein stable ID,Gene name,UniProtKB/Swiss-Prot ID,RefSeq peptide ID,NCBI gene (formerly Entrezgene) ID
0,ENSP00000000233.5,ENSP00000000233,ARF5,P84085,NP_001653,381.0
1,ENSP00000000412.3,ENSP00000000412,M6PR,P20645,NP_002346;NP_001401249;NP_001401261;NP_0014012...,4074.0
2,ENSP00000000442.6,ENSP00000000442,ESRRA,P11474,NP_004442,2101.0
3,ENSP00000001008.4,ENSP00000001008,FKBP4,Q02790,NP_002005,2288.0
4,ENSP00000001146.2,ENSP00000001146,CYP26B1,Q9NR63,NP_063938,56603.0
...,...,...,...,...,...,...
123840,ENSP00000520928.1,ENSP00000520928,CSF2RA,,,1438.0
123841,ENSP00000520929.1,ENSP00000520929,CSF2RA,,NP_001366090;NP_001366093;NP_001366094,1438.0
123842,ENSP00000520930.1,ENSP00000520930,CSF2RA,,NP_001366088;NP_001366092;NP_001366091,1438.0
123843,ENSP00000520931.1,ENSP00000520931,GTPBP6,,,8225.0


### 2.2 UniProt data

In [2]:
import pandas as pd

In [3]:
df_uniprot = pd.read_csv('uniprot.csv', dtype=str)
df_uniprot.rename(columns={'Gene Names (primary)': 'Gene Names'}, inplace=True)
df_uniprot

Unnamed: 0,Entry,Protein names,Gene Names,Ensembl,GeneID
0,A0A024R1R8,Translation machinery-associated protein 7B,TMA7B,ENST00000424496.3; ENSP00000491117.1; ENSG0000...,
1,A0A024RBG1,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,ENST00000322209.5; ENSP00000492425.1; ENSG0000...,11163;
2,A0A075B6H7,Probable non-functional immunoglobulin kappa v...,IGKV3-7,ENST00000390247.2; ENSP00000374782.2; ENSG0000...,
3,A0A075B6H8,Probable non-functional immunoglobulin kappa v...,IGKV1D-42,ENST00000390278.3; ENSP00000374813.3; ENSG0000...,
4,A0A075B6H9,Immunoglobulin lambda variable 4-69,IGLV4-69,ENST00000390282.2; ENSP00000374817.2; ENSG0000...,
...,...,...,...,...,...
20412,U3KPV4,"Alpha-1,3-galactosyltransferase 2 (EC 2.4.1.87...",A3GALT2,ENST00000442999.3; ENSP00000475261.1; ENSG0000...,127550;
20413,W5XKT8,Sperm acrosome membrane-associated protein 6 (...,SPACA6,ENST00000637797.2; ENSP00000490829.1; ENSG0000...,147650;
20414,W6CW81,Pyrin domain-containing protein 5 (Pyrin domai...,PYDC5,ENST00000696987.1; ENSP00000513023.1; ENSG0000...,107181291;
20415,X6R8D5,Putative uncharacterized protein CIMIP3 (Cilia...,CIMIP3,ENST00000372963.4; ENSP00000362054.3; ENSG0000...,


#### Pre-processing

In [7]:
df_uniprot['GeneID'] = df_uniprot['GeneID'].str.replace(';', '', regex=False)

def process_ensembl(data):
    if pd.isna(data):
        return data  

    processed_parts = []
    groups = data.split(';')
    
    for entry in groups:
        entry = entry.strip()
        if 'ENSP' in entry:
            processed_parts.append(entry)
            
    return ' '.join(processed_parts)

df_uniprot['Ensembl(ENSP)'] = df_uniprot['Ensembl'].apply(process_ensembl)
df_uniprot.drop(columns=['Ensembl'], inplace=True)
df_uniprot

Unnamed: 0,Entry,Protein names,Gene Names,GeneID,Ensembl(ENSP)
0,A0A024R1R8,Translation machinery-associated protein 7B,TMA7B,,ENSP00000491117.1
1,A0A024RBG1,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,11163,ENSP00000492425.1
2,A0A075B6H7,Probable non-functional immunoglobulin kappa v...,IGKV3-7,,ENSP00000374782.2 ENSP00000487957.1
3,A0A075B6H8,Probable non-functional immunoglobulin kappa v...,IGKV1D-42,,ENSP00000374813.3
4,A0A075B6H9,Immunoglobulin lambda variable 4-69,IGLV4-69,,ENSP00000374817.2
...,...,...,...,...,...
20412,U3KPV4,"Alpha-1,3-galactosyltransferase 2 (EC 2.4.1.87...",A3GALT2,127550,ENSP00000475261.1
20413,W5XKT8,Sperm acrosome membrane-associated protein 6 (...,SPACA6,147650,ENSP00000490829.1
20414,W6CW81,Pyrin domain-containing protein 5 (Pyrin domai...,PYDC5,107181291,ENSP00000513023.1
20415,X6R8D5,Putative uncharacterized protein CIMIP3 (Cilia...,CIMIP3,,ENSP00000362054.3


In [8]:
df_uniprot_protein = df_uniprot.drop(['Ensembl(ENSP)'], axis=1).join(df_uniprot['Ensembl(ENSP)'].str.split(' ', expand=True).stack().reset_index(level=1, drop=True).rename('Ensembl(ENSP)'))

df_uniprot_protein_not_null = df_uniprot_protein[df_uniprot_protein['Ensembl(ENSP)'].notnull()]
df_uniprot_protein_null = df_uniprot_protein[df_uniprot_protein['Ensembl(ENSP)'].isnull()]

aggregated_data = df_uniprot_protein_not_null.groupby('Ensembl(ENSP)', dropna=True).agg({
    'Entry': lambda x: ';'.join(x.dropna().unique()),
    'Protein names': lambda x: ';'.join(x.dropna().unique()),
    'Gene Names': lambda x: ';'.join(x.dropna().unique()),
    'GeneID': lambda x: ';'.join(x.dropna().unique())
}).reset_index()

df_uniprot_protein_ensemblunique = pd.concat([aggregated_data, df_uniprot_protein_null], ignore_index=True)
df_uniprot_protein_ensemblunique

Unnamed: 0,Ensembl(ENSP),Entry,Protein names,Gene Names,GeneID
0,ENSP00000000233.5,P84085,ADP-ribosylation factor 5,ARF5,381
1,ENSP00000000412.3,P20645,Cation-dependent mannose-6-phosphate receptor ...,M6PR,4074
2,ENSP00000000442.6,P11474,Steroid hormone receptor ERR1 (Estrogen recept...,ESRRA,2101
3,ENSP00000001008.4,Q02790,Peptidyl-prolyl cis-trans isomerase FKBP4 (PPI...,FKBP4,2288
4,ENSP00000001146.2,Q9NR63,Cytochrome P450 26B1 (EC 1.14.13.-) (Cytochrom...,CYP26B1,56603
...,...,...,...,...,...
51848,,Q9Y6J3,SMAD5 antisense gene protein 1 (10.3 kDa proli...,SMAD5-AS1,
51849,,Q9Y6Z4,Putative uncharacterized protein KIF25-AS1 (KI...,KIF25-AS1,
51850,,Q9YNA8,Endogenous retrovirus group K member 19 Gag po...,ERVK-19,
51851,,S4R3P1,Humanin-like 13 (HN13) (MT-RNR2-like protein 13),MTRNR2L13,


### 2.3 RefSeq data

RefSeq and UniProt

In [12]:
df_refseq_uniprot_human = pd.read_csv('refseq_uniprot_human.csv')

df_refseq_uniprot_filter = df_refseq_uniprot_human[df_refseq_uniprot_human['method'] != 'similar']
df_refseq_uniprot_filter.reset_index(drop=True, inplace=True)
df_refseq_uniprot_filter = df_refseq_uniprot_filter.drop(columns=['NCBI_tax_id', 'UniProtKB_tax_id', 'method'])
df_refseq_uniprot_filter.rename(columns={'NCBI_protein_accession': 'RefSeq ID', 'UniProtKB_protein_accession': 'Uniprot ID'}, inplace=True)

df_refseq_uniprot_filter = df_refseq_uniprot_filter.groupby('Uniprot ID').agg({
    'RefSeq ID': lambda x: ';'.join(x.unique())
}).reset_index()
df_refseq_uniprot_filter

Unnamed: 0,Uniprot ID,RefSeq ID
0,A0A023HHK9,NP_085128.2
1,A0A023HHL0,NP_085128.2
2,A0A023IN41,NP_001186551.1
3,A0A023T695,XP_016868347;XP_016868348;XP_016868349;XP_0168...
4,A0A023T6R1,NP_060518.1
...,...,...
99771,X6RGR3,NP_001353609.1
99772,X6RLR1,NP_001268356.1
99773,X6RLX0,XP_016874551.1;XP_054227478.1
99774,X6RM00,XP_047284540.1;XP_054227509.1


RefSeq and Ensembl

In [13]:
df_refseq_ensembl_raw = pd.read_csv('gene2ensembl', sep='\t')
df_refseq_ensembl_human = df_refseq_ensembl_raw[df_refseq_ensembl_raw['#tax_id'] == 9606]
df_refseq_ensembl_human.replace('-', pd.NA, inplace=True)
#keep only protein data
df_refseq_ensembl = df_refseq_ensembl_human[['protein_accession.version', 'Ensembl_protein_identifier', 'GeneID']]
#drop rows with both column has "-"
df_refseq_ensembl = df_refseq_ensembl.dropna(subset=['protein_accession.version', 'Ensembl_protein_identifier'])
df_refseq_ensembl.rename(columns={'protein_accession.version': 'RefSeq ID', 'Ensembl_protein_identifier': 'Ensembl version'}, inplace=True)
df_refseq_ensembl

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_refseq_ensembl_human.replace('-', pd.NA, inplace=True)


Unnamed: 0,RefSeq ID,Ensembl version,GeneID
4267166,NP_570602.2,ENSP00000263100.2,1
4267167,NP_000005.3,ENSP00000323929.8,2
4267169,NP_000653.3,ENSP00000307218.4,9
4267170,NP_001153643.1,ENSP00000428270.1,9
4267171,NP_001153646.1,ENSP00000429341.1,9
...,...,...,...
4336078,NP_001381078.1,ENSP00000518845.1,128706666
4336079,NP_001007272.1,ENSP00000518853.1,128854680
4336093,NP_001410765.1,ENSP00000370908.2,131675794
4336094,NP_001380963.1,ENSP00000520497.1,131768270


In [14]:
df_refseq_ensembl['GeneID'] = df_refseq_ensembl['GeneID'].astype(str)
df_refseq_ensembl_merge = df_refseq_ensembl.groupby('Ensembl version').agg({
    'RefSeq ID': lambda x: ';'.join(x.unique()),
    'GeneID': lambda x: ';'.join(x.unique())
}).reset_index()
df_refseq_ensembl_merge.replace('', pd.NA, inplace=True)
df_refseq_ensembl_merge.reset_index(drop=True, inplace=True)
df_refseq_ensembl_merge

Unnamed: 0,Ensembl version,RefSeq ID,GeneID
0,ENSP00000000233.5,NP_001653.1,381
1,ENSP00000000412.3,NP_002346.1,4074
2,ENSP00000000442.6,NP_004442.3,2101
3,ENSP00000001008.4,NP_002005.1,2288
4,ENSP00000001146.2,NP_063938.1,56603
...,...,...,...
47218,ENSP00000520488.1,XP_006711723.1,9826
47219,ENSP00000520489.1,NP_001364347.1,9826
47220,ENSP00000520491.1,NP_001374386.1,26140
47221,ENSP00000520492.1,XP_047294981.1,55011


## 3. Merge Data

In [15]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

### 3.1 Ensembl and Uniprot

In [16]:
df_ensembl_uniprot = pd.merge(df_ensembl, df_uniprot_protein_ensemblunique, left_on='Protein stable ID version', right_on='Ensembl(ENSP)', how='outer')
df_ensembl_uniprot = merge_column(df_ensembl_uniprot, 'Protein stable ID version', 'Ensembl(ENSP)', 'Ensembl version')
df_ensembl_uniprot = merge_column(df_ensembl_uniprot, 'UniProtKB/Swiss-Prot ID', 'Entry', 'Uniprot ID')
df_ensembl_uniprot = merge_column(df_ensembl_uniprot, 'NCBI gene (formerly Entrezgene) ID', 'GeneID', 'NCBI_ID')
df_ensembl_uniprot = merge_string_columns(df_ensembl_uniprot, ['Gene name', 'Gene Names'], 'Gene_Name')
df_ensembl_uniprot

Unnamed: 0,Protein stable ID,RefSeq peptide ID,Protein names,Ensembl version,Uniprot ID,NCBI_ID,Gene_Name
0,ENSP00000000233,NP_001653,ADP-ribosylation factor 5,ENSP00000000233.5,P84085,381.0,ARF5
0,ENSP00000000233,NP_001653,ADP-ribosylation factor 5,ENSP00000000233.5,P84085,381,ARF5
1,ENSP00000000412,NP_002346,Cation-dependent mannose-6-phosphate receptor ...,ENSP00000000412.3,P20645,4074.0,M6PR
1,ENSP00000000412,NP_002346,Cation-dependent mannose-6-phosphate receptor ...,ENSP00000000412.3,P20645,4074,M6PR
2,ENSP00000000412,NP_001401249,Cation-dependent mannose-6-phosphate receptor ...,ENSP00000000412.3,P20645,4074.0,M6PR
...,...,...,...,...,...,...,...
158711,,,SMAD5 antisense gene protein 1 (10.3 kDa proli...,,Q9Y6J3,,SMAD5-AS1
158712,,,Putative uncharacterized protein KIF25-AS1 (KI...,,Q9Y6Z4,,KIF25-AS1
158713,,,Endogenous retrovirus group K member 19 Gag po...,,Q9YNA8,,ERVK-19
158714,,,Humanin-like 13 (HN13) (MT-RNR2-like protein 13),,S4R3P1,,MTRNR2L13


### 3.2 Add RefSeq

Uniprot-RefSeq

In [17]:
df_ensembl_uniprot_refseq = df_ensembl_uniprot.copy()
df_ensembl_uniprot_refseq = pd.merge(df_ensembl_uniprot_refseq, df_refseq_uniprot_filter, left_on='Uniprot ID', right_on='Uniprot ID', how='outer')
df_ensembl_uniprot_refseq = merge_string_columns(df_ensembl_uniprot_refseq, ['RefSeq ID','RefSeq peptide ID'], 'RefSeq_ID')
df_ensembl_uniprot_refseq.replace('', pd.NA, inplace=True)
df_ensembl_uniprot_refseq

Unnamed: 0,Protein stable ID,Protein names,Ensembl version,Uniprot ID,NCBI_ID,Gene_Name,RefSeq_ID
0,,,,A0A023HHK9,,,NP_085128.2
1,,,,A0A023HHL0,,,NP_085128.2
2,,,,A0A023IN41,,,NP_001186551.1
3,,,,A0A023T695,,,XP_016868350;XP_016868348;XP_054215426.1;XP_01...
4,,,,A0A023T6R1,,,NP_060518.1
...,...,...,...,...,...,...,...
310175,ENSP00000520930,,ENSP00000520930.1,,1438.0,CSF2RA,NP_001366088
310176,ENSP00000520930,,ENSP00000520930.1,,1438.0,CSF2RA,NP_001366092
310177,ENSP00000520930,,ENSP00000520930.1,,1438.0,CSF2RA,NP_001366091
310178,ENSP00000520931,,ENSP00000520931.1,,8225.0,GTPBP6,


Ensembl-RefSeq

In [18]:
df_ensembl_uniprot_refseq = pd.merge(df_ensembl_uniprot_refseq, df_refseq_ensembl_merge, on='Ensembl version', how='outer')
df_ensembl_uniprot_refseq = merge_column(df_ensembl_uniprot_refseq, 'NCBI_ID', 'GeneID', 'NCBI ID')
df_ensembl_uniprot_refseq = merge_string_columns(df_ensembl_uniprot_refseq, ['RefSeq ID','RefSeq_ID'], 'RefSeq')
df_ensembl_uniprot_refseq.replace('', pd.NA, inplace=True)
df_ensembl_uniprot_refseq

Unnamed: 0,Protein stable ID,Protein names,Ensembl version,Uniprot ID,Gene_Name,NCBI ID,RefSeq
0,ENSP00000000233,ADP-ribosylation factor 5,ENSP00000000233.5,P84085,ARF5,381.0,NP_001653.1;NP_001653
0,ENSP00000000233,ADP-ribosylation factor 5,ENSP00000000233.5,P84085,ARF5,381,NP_001653.1;NP_001653
2,ENSP00000000412,Cation-dependent mannose-6-phosphate receptor ...,ENSP00000000412.3,P20645,M6PR,4074.0,NP_001401262.1;NP_001401261.1;XP_047284806.1;N...
2,ENSP00000000412,Cation-dependent mannose-6-phosphate receptor ...,ENSP00000000412.3,P20645,M6PR,4074,NP_001401262.1;NP_001401261.1;XP_047284806.1;N...
4,ENSP00000000412,Cation-dependent mannose-6-phosphate receptor ...,ENSP00000000412.3,P20645,M6PR,4074.0,NP_001401262.1;NP_001401249;NP_001401261.1;XP_...
...,...,...,...,...,...,...,...
310190,,,,X6RGR3,,,NP_001353609.1
310191,,,,X6RLR1,,,NP_001268356.1
310192,,,,X6RLX0,,,XP_016874551.1;XP_054227478.1
310193,,,,X6RM00,,,XP_054227509.1;XP_047284540.1


### 3.3 Deep Data Cleaning

In [19]:
# merge the Uniprot ID
Uniprot_not_null = df_ensembl_uniprot_refseq[df_ensembl_uniprot_refseq['Uniprot ID'].notnull()]
Uniprot_null = df_ensembl_uniprot_refseq[df_ensembl_uniprot_refseq['Uniprot ID'].isnull()]

merge_uniprot = Uniprot_not_null.groupby('Uniprot ID').agg(
    lambda x: ';'.join(sorted(set(
        v.strip() for i in x.dropna().astype(str) for v in i.split(';')
    )))
)

merge_uniprot.reset_index(inplace=True)
merge_uniprot.replace('', pd.NA, inplace=True)
merge_uniprot

Unnamed: 0,Uniprot ID,Protein stable ID,Protein names,Ensembl version,Gene_Name,NCBI ID,RefSeq
0,A0A023HHK9,,,,,,NP_085128.2
1,A0A023HHL0,,,,,,NP_085128.2
2,A0A023IN41,,,,,,NP_001186551.1
3,A0A023T695,,,,,,XP_016868347;XP_016868348;XP_016868349;XP_0168...
4,A0A023T6R1,,,,,,NP_060518.1
...,...,...,...,...,...,...,...
100974,X6RGR3,,,,,,NP_001353609.1
100975,X6RLR1,,,,,,NP_001268356.1
100976,X6RLX0,,,,,,XP_016874551.1;XP_054227478.1
100977,X6RM00,,,,,,XP_047284540.1;XP_054227509.1


In [20]:
# For those rows missing 'UniProt ID', we will group by 'Ensembl version' and apply aggregation
ensembl_not_null = Uniprot_null[Uniprot_null['Ensembl version'].notnull()]
ensembl_null = Uniprot_null[Uniprot_null['Ensembl version'].isnull()]

merge_ensembl = ensembl_not_null.groupby('Ensembl version').agg(
    # For each group, split values by semicolons, remove duplicates, sort, and join them back with semicolons
    lambda x: ';'.join(sorted(set(
        # Split values by semicolon and strip extra spaces for each non-null value
        v.strip() for i in x.dropna().astype(str) for v in i.split(';')
    )))
)

merge_ensembl.reset_index(inplace=True)
merge_ensembl.replace('', pd.NA, inplace=True)
ensembl_grouped = pd.concat([merge_uniprot, merge_ensembl, ensembl_null], ignore_index=True)
ensembl_grouped

Unnamed: 0,Uniprot ID,Protein stable ID,Protein names,Ensembl version,Gene_Name,NCBI ID,RefSeq
0,A0A023HHK9,,,,,,NP_085128.2
1,A0A023HHL0,,,,,,NP_085128.2
2,A0A023IN41,,,,,,NP_001186551.1
3,A0A023T695,,,,,,XP_016868347;XP_016868348;XP_016868349;XP_0168...
4,A0A023T6R1,,,,,,NP_060518.1
...,...,...,...,...,...,...,...
173973,,ENSP00000520928,,ENSP00000520928.1,CSF2RA,1438.0,
173974,,ENSP00000520929,,ENSP00000520929.1,CSF2RA,1438.0,NP_001366090;NP_001366093;NP_001366094
173975,,ENSP00000520930,,ENSP00000520930.1,CSF2RA,1438.0,NP_001366088;NP_001366091;NP_001366092
173976,,ENSP00000520931,,ENSP00000520931.1,GTPBP6,8225.0,


## 4. BioMedGraphica ID

In [22]:
biomedgraphica_protein = ensembl_grouped.sort_values(by=['Uniprot ID', 'Ensembl version', 'Protein stable ID', 'RefSeq', 'NCBI ID'], na_position='last')
biomedgraphica_protein = biomedgraphica_protein.reset_index(drop=True)

max_length = len(str(len(biomedgraphica_protein)))
biomedgraphica_protein['BioMedGraphica_ID'] = ['BMG_PT' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_protein) + 1)]
biomedgraphica_protein = biomedgraphica_protein.rename(columns={'Protein stable ID': 'Ensembl_Protein_ID', 'Ensembl version': 'Ensembl_Protein_ID_Version', 'Protein names': 'UniProt_Name',
                                                          'Uniprot ID': 'Uniprot_ID', 'RefSeq': 'RefSeq_ID', 'NCBI ID':'NCBI_Gene_ID', 'Gene_Name':'HGNC_Symbol'})
column_order = ['BioMedGraphica_ID', 'Uniprot_ID', 'Ensembl_Protein_ID', 'Ensembl_Protein_ID_Version', 'RefSeq_ID', 'NCBI_Gene_ID', 'UniProt_Name', 'HGNC_Symbol']
biomedgraphica_protein = biomedgraphica_protein[column_order]
biomedgraphica_protein

Unnamed: 0,BioMedGraphica_ID,Uniprot_ID,Ensembl_Protein_ID,Ensembl_Protein_ID_Version,RefSeq_ID,NCBI_Gene_ID,UniProt_Name,HGNC_Symbol
0,BMG_PT000001,A0A023HHK9,,,NP_085128.2,,,
1,BMG_PT000002,A0A023HHL0,,,NP_085128.2,,,
2,BMG_PT000003,A0A023IN41,,,NP_001186551.1,,,
3,BMG_PT000004,A0A023T695,,,XP_016868347;XP_016868348;XP_016868349;XP_0168...,,,
4,BMG_PT000005,A0A023T6R1,,,NP_060518.1,,,
...,...,...,...,...,...,...,...,...
173973,BMG_PT173974,,ENSP00000520928,ENSP00000520928.1,,1438.0,,CSF2RA
173974,BMG_PT173975,,ENSP00000520929,ENSP00000520929.1,NP_001366090;NP_001366093;NP_001366094,1438.0,,CSF2RA
173975,BMG_PT173976,,ENSP00000520930,ENSP00000520930.1,NP_001366088;NP_001366091;NP_001366092,1438.0,,CSF2RA
173976,BMG_PT173977,,ENSP00000520931,ENSP00000520931.1,,8225.0,,GTPBP6


In [23]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein.csv'
biomedgraphica_protein.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Protein\BioMedGraphica_Protein.csv


## 5. Description

In [2]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein' / 'BioMedGraphica_Protein.csv'
biomedgraphica_protein = pd.read_csv(target_dir, dtype=str)

### 5.1 From UniProt

In [18]:
import requests
from io import StringIO

def fetch_uniprot_data(params):
    url = "https://rest.uniprot.org/uniprotkb/stream"

    response = requests.get(url, params=params)

    if response.ok:
        tsv_data = StringIO(response.text)
        df = pd.read_csv(tsv_data, sep='\t')
        return df
    else:
        print("Failed to fetch data:", response.status_code)
        print(response.text)
        return None
    
# change the parameters to fetch the data
# load description of proteins
params = {
        'fields': 'accession,cc_function',
        'format': 'tsv',
        'query': '(model_organism:9606) AND (reviewed:true)',
        'sort': 'organism_name asc'
    }

df_uniprot = fetch_uniprot_data(params)
if df_uniprot is not None:
    print(df_uniprot)
else:
    print("No data retrieved.")
#save the data to a CSV file
df_uniprot.to_csv('uniprot_protein_description.csv', index=False)

            Entry                                      Function [CC]
0      A0A024R1R8                                                NaN
1      A0A024RBG1  FUNCTION: Cleaves a beta-phosphate from the di...
2      A0A075B6H7  FUNCTION: Probable non-functional open reading...
3      A0A075B6H8  FUNCTION: Probable non-functional open reading...
4      A0A075B6H9  FUNCTION: V region of the variable domain of i...
...           ...                                                ...
20412      U3KPV4  FUNCTION: Synthesizes the galactose-alpha(1,3)...
20413      W5XKT8  FUNCTION: Sperm protein required for fusion of...
20414      W6CW81  FUNCTION: Functions as an inhibitor of DNA vir...
20415      X6R8D5                                                NaN
20416      X6R8R1                                                NaN

[20417 rows x 2 columns]


In [3]:
import pandas as pd

def clean_function(text):
    if isinstance(text, str) and 'FUNCTION:' in text:
        return text.replace('FUNCTION:', '').strip()
    return text

uniprot_description = pd.read_csv('uniprot_protein_description.csv')
uniprot_description = uniprot_description.dropna(subset=['Function [CC]'])

uniprot_description['Function [CC]'] = uniprot_description['Function [CC]'].apply(clean_function)
uniprot_description

Unnamed: 0,Entry,Function [CC]
1,A0A024RBG1,Cleaves a beta-phosphate from the diphosphate ...
2,A0A075B6H7,Probable non-functional open reading frame (OR...
3,A0A075B6H8,Probable non-functional open reading frame (OR...
4,A0A075B6H9,V region of the variable domain of immunoglobu...
5,A0A075B6I0,V region of the variable domain of immunoglobu...
...,...,...
20410,S4R3P1,Plays a role as a neuroprotective and antiapop...
20411,S4R3Y5,Plays a role as a neuroprotective and antiapop...
20412,U3KPV4,"Synthesizes the galactose-alpha(1,3)-galactose..."
20413,W5XKT8,Sperm protein required for fusion of sperm wit...


In [4]:
bmg_uniprot = biomedgraphica_protein[['BioMedGraphica_ID','Uniprot_ID']]

protein_description_uniprot = pd.merge(bmg_uniprot, uniprot_description, left_on='Uniprot_ID', right_on='Entry', how='left')
protein_description_uniprot.drop(columns=['Entry', 'Uniprot_ID'], inplace=True)
protein_description_uniprot.rename(columns={'Function [CC]':'UniProt'}, inplace=True)
protein_description_uniprot

Unnamed: 0,BioMedGraphica_ID,UniProt
0,BMG_PT000001,
1,BMG_PT000002,
2,BMG_PT000003,
3,BMG_PT000004,
4,BMG_PT000005,
...,...,...
173973,BMG_PT173974,
173974,BMG_PT173975,
173975,BMG_PT173976,
173976,BMG_PT173977,


In [5]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_Description.csv'
protein_description_uniprot.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Protein\BioMedGraphica_Protein_Description.csv


### 5.2 Combined Description

In [6]:
comb_description = protein_description_uniprot.copy()

# add the column name at the beginning of the string
# first, we need to get the column names
column_names = comb_description.columns.tolist()
column_names = [col for col in column_names if col != 'BioMedGraphica_ID']
# then we can apply the function to each column
for col in column_names:
    comb_description[col] = comb_description[col].apply(lambda x: ' | '.join([f"{col}: {i}" for i in x.split(' | ')]) if pd.notna(x) else x)

# now we can merge the columns into one
comb_description['Description'] = comb_description[column_names].apply(lambda x: ' | '.join(x.dropna()), axis=1)
comb_description = comb_description[['BioMedGraphica_ID', 'Description']]
comb_description

Unnamed: 0,BioMedGraphica_ID,Description
0,BMG_PT000001,
1,BMG_PT000002,
2,BMG_PT000003,
3,BMG_PT000004,
4,BMG_PT000005,
...,...,...
173973,BMG_PT173974,
173974,BMG_PT173975,
173975,BMG_PT173976,
173976,BMG_PT173977,


In [7]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_Description_Combined.csv'
comb_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Protein\BioMedGraphica_Protein_Description_Combined.csv


## 6. File Generation

In [19]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein' / 'BioMedGraphica_Protein.csv'
biomedgraphica_protein = pd.read_csv(target_dir, dtype=str)

### 6.1 BioChem

In [22]:
import pandas as pd

protein_seq = pd.read_csv('combined_sequences.csv')
protein_seq = protein_seq[['protein_id', 'protein_sequence']].drop_duplicates()
bmg_protein = biomedgraphica_protein[['Ensembl_Protein_ID', 'BioMedGraphica_ID']]
bmg_protein['Ensembl_Protein_ID'] = bmg_protein['Ensembl_Protein_ID'].str.split(';')
bmg_protein = bmg_protein.explode('Ensembl_Protein_ID')
bmg_protein_seq = pd.merge(bmg_protein, protein_seq, left_on='Ensembl_Protein_ID', right_on='protein_id', how='left')
bmg_protein_seq.drop(columns=['protein_id', 'Ensembl_Protein_ID'], inplace=True)
bmg_protein_seq

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmg_protein['Ensembl_Protein_ID'] = bmg_protein['Ensembl_Protein_ID'].str.split(';')


Unnamed: 0,BioMedGraphica_ID,protein_sequence
0,BMG_PT000001,
1,BMG_PT000002,
2,BMG_PT000003,
3,BMG_PT000004,
4,BMG_PT000005,
...,...,...
205481,BMG_PT173974,
205482,BMG_PT173975,
205483,BMG_PT173976,
205484,BMG_PT173977,


In [26]:
bmg_protein_seq = bmg_protein_seq.groupby('BioMedGraphica_ID').agg({
    'protein_sequence': lambda x: ' | '.join(x.dropna().unique())
}).reset_index().replace('', pd.NA)
bmg_protein_seq

Unnamed: 0,BioMedGraphica_ID,protein_sequence
0,BMG_PT000001,
1,BMG_PT000002,
2,BMG_PT000003,
3,BMG_PT000004,
4,BMG_PT000005,
...,...,...
173973,BMG_PT173974,
173974,BMG_PT173975,
173975,BMG_PT173976,
173976,BMG_PT173977,


In [27]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_BioChem.csv'
bmg_protein_seq.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Protein\BioMedGraphica_Protein_BioChem.csv


### 6.2 Name and ID

GUI Name

In [11]:
def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

gui_name = biomedgraphica_protein.copy()
gui_name = merge_string_columns(gui_name, ['UniProt_Name', 'HGNC_Symbol'], 'Protein_Name_List')
gui_name = gui_name[['BioMedGraphica_ID', 'Protein_Name_List']]
gui_name

Unnamed: 0,BioMedGraphica_ID,Protein_Name_List
0,BMG_PT000001,
1,BMG_PT000002,
2,BMG_PT000003,
3,BMG_PT000004,
4,BMG_PT000005,
...,...,...
173973,BMG_PT173974,CSF2RA
173974,BMG_PT173975,CSF2RA
173975,BMG_PT173976,CSF2RA
173976,BMG_PT173977,GTPBP6


In [12]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_GUI_Name.csv'
gui_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Protein\BioMedGraphica_Protein_GUI_Name.csv


LLM Name and ID

In [13]:
llm_name_id = biomedgraphica_protein.copy()

llm_name_id['Uniprot_ID'] = llm_name_id['Uniprot_ID'].apply(
    lambda x: ' | '.join(f"UniProt ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['RefSeq_ID'] = llm_name_id['RefSeq_ID'].apply(
    lambda x: ' | '.join(f"RefSeq ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

llm_name_id['NCBI_Gene_ID'] = llm_name_id['NCBI_Gene_ID'].apply(
    lambda x: ' | '.join(f"NCBI Gene ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

column_order = ['BioMedGraphica_ID', 'UniProt_Name', 'HGNC_Symbol', 'Uniprot_ID', 'Ensembl_Protein_ID', 'Ensembl_Protein_ID_Version', 'NCBI_Gene_ID', 'RefSeq_ID']
llm_name_id = llm_name_id[column_order] 
llm_name_id

Unnamed: 0,BioMedGraphica_ID,UniProt_Name,HGNC_Symbol,Uniprot_ID,Ensembl_Protein_ID,Ensembl_Protein_ID_Version,NCBI_Gene_ID,RefSeq_ID
0,BMG_PT000001,,,UniProt ID:A0A023HHK9,,,,RefSeq ID:NP_085128.2
1,BMG_PT000002,,,UniProt ID:A0A023HHL0,,,,RefSeq ID:NP_085128.2
2,BMG_PT000003,,,UniProt ID:A0A023IN41,,,,RefSeq ID:NP_001186551.1
3,BMG_PT000004,,,UniProt ID:A0A023T695,,,,RefSeq ID:XP_016868347 | RefSeq ID:XP_01686834...
4,BMG_PT000005,,,UniProt ID:A0A023T6R1,,,,RefSeq ID:NP_060518.1
...,...,...,...,...,...,...,...,...
173973,BMG_PT173974,,CSF2RA,,ENSP00000520928,ENSP00000520928.1,NCBI Gene ID:1438.0,
173974,BMG_PT173975,,CSF2RA,,ENSP00000520929,ENSP00000520929.1,NCBI Gene ID:1438.0,RefSeq ID:NP_001366090 | RefSeq ID:NP_00136609...
173975,BMG_PT173976,,CSF2RA,,ENSP00000520930,ENSP00000520930.1,NCBI Gene ID:1438.0,RefSeq ID:NP_001366088 | RefSeq ID:NP_00136609...
173976,BMG_PT173977,,GTPBP6,,ENSP00000520931,ENSP00000520931.1,NCBI Gene ID:8225.0,


In [14]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_LLM_Name_ID.csv'
llm_name_id.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Protein\BioMedGraphica_Protein_LLM_Name_ID.csv


LLM Name and ID Combined

In [15]:
llm_combined = llm_name_id.copy()

def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

llm_combined = merge_string_columns(llm_combined, llm_combined.columns[llm_combined.columns != 'BioMedGraphica_ID'], 'Names_and_IDs')
llm_combined

Unnamed: 0,BioMedGraphica_ID,Names_and_IDs
0,BMG_PT000001,UniProt ID:A0A023HHK9 | RefSeq ID:NP_085128.2
1,BMG_PT000002,RefSeq ID:NP_085128.2 | UniProt ID:A0A023HHL0
2,BMG_PT000003,UniProt ID:A0A023IN41 | RefSeq ID:NP_001186551.1
3,BMG_PT000004,RefSeq ID:XP_016868348 | RefSeq ID:XP_05421542...
4,BMG_PT000005,UniProt ID:A0A023T6R1 | RefSeq ID:NP_060518.1
...,...,...
173973,BMG_PT173974,ENSP00000520928.1 | ENSP00000520928 | NCBI Gen...
173974,BMG_PT173975,RefSeq ID:NP_001366093 | NCBI Gene ID:1438.0 |...
173975,BMG_PT173976,NCBI Gene ID:1438.0 | CSF2RA | RefSeq ID:NP_00...
173976,BMG_PT173977,GTPBP6 | ENSP00000520931.1 | NCBI Gene ID:8225...


In [16]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_LLM_Name_ID_Combined.csv'
llm_combined.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Protein\BioMedGraphica_Protein_LLM_Name_ID_Combined.csv


Display Name

In [17]:
display_name = biomedgraphica_protein.copy()

display_name['BMG_Protein_Name'] = display_name['HGNC_Symbol'].fillna(display_name['UniProt_Name'])
display_name = display_name[['BioMedGraphica_ID', 'BMG_Protein_Name']]
display_name

Unnamed: 0,BioMedGraphica_ID,BMG_Protein_Name
0,BMG_PT000001,
1,BMG_PT000002,
2,BMG_PT000003,
3,BMG_PT000004,
4,BMG_PT000005,
...,...,...
173973,BMG_PT173974,CSF2RA
173974,BMG_PT173975,CSF2RA
173975,BMG_PT173976,CSF2RA
173976,BMG_PT173977,GTPBP6


In [18]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Protein'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Protein_Display_Name.csv'
display_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Protein\BioMedGraphica_Protein_Display_Name.csv
