### Ensembl

In [1]:
import pandas as pd
from pybiomart import Server

# List all available attributes
def list_attributes():
    server = Server(host='http://www.ensembl.org')
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    attributes = dataset.list_attributes()
    return attributes

attributes = list_attributes()

def fetch_ensembl_data(attributes):
    server = Server(host='http://www.ensembl.org')
    #https://www.ensembl.org/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL
    #this link shows that hsapiens_gene_ensembl is the GRCh38.p14
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    
    response = dataset.query(attributes)
    
    return response

attributes=['ensembl_gene_id', 'ensembl_transcript_id']
data_ensembl = fetch_ensembl_data(attributes)
data_ensembl.to_csv('gene_transcript.csv', index=False)

In [1]:
# Download Link: API
# Download Date: 2025-03-21
# Download Version: 2025-03-21
import pandas as pd

df_ensembl_gene_transcript = pd.read_csv('gene_transcript.csv')
df_ensembl_gene_transcript

Unnamed: 0,Gene stable ID,Transcript stable ID
0,ENSG00000210049,ENST00000387314
1,ENSG00000211459,ENST00000389680
2,ENSG00000210077,ENST00000387342
3,ENSG00000210082,ENST00000387347
4,ENSG00000209082,ENST00000386347
...,...,...
412029,ENSG00000241860,ENST00000831127
412030,ENSG00000241860,ENST00000466430
412031,ENSG00000241860,ENST00000477740
412032,ENSG00000241860,ENST00000471248


### RefSeq

In [2]:
# Download Link: https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
# Download Date: 2025-03-21
# Download Version: 2025-03-21
df_refseq_gene_transcript = pd.read_csv('LRG_RefSeqGene.txt', sep='\t')
df_refseq_gene_transcript = df_refseq_gene_transcript[['RSG','RNA']]
df_refseq_gene_transcript

Unnamed: 0,RSG,RNA
0,NG_029916.1,NM_014576.4
1,NG_029916.1,NM_138932.3
2,NG_029916.1,NM_138933.3
3,NG_029916.1,NM_001198818.2
4,NG_029916.1,NM_001198819.2
...,...,...
33416,NG_053150.1,NM_020928.2
33417,NG_033939.1,NM_007057.4
33418,NG_033939.1,NM_032997.3
33419,NG_033939.1,NM_001005413.1


### BioMedGraphica ID

In [3]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir_gene = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene' / 'BioMedGraphica_Gene.csv'
target_dir_transcript = grandjson_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript' / 'BioMedGraphica_Transcript.csv'
biomedgraphica_gene = pd.read_csv(target_dir_gene, dtype=str)
biomedgraphica_transcript = pd.read_csv(target_dir_transcript, dtype=str)

### Ensembl Mapping

In [4]:
ensembl_trans = biomedgraphica_transcript[['Ensembl_Transcript_ID', 'BioMedGraphica_ID']]
ensembl_trans.dropna(subset = ['Ensembl_Transcript_ID'], inplace=True)
ensembl_trans = ensembl_trans.assign(Ensembl_Transcript_ID=biomedgraphica_transcript['Ensembl_Transcript_ID'].str.split(';')).explode('Ensembl_Transcript_ID')

ensembl_transcript_to_individualid = ensembl_trans.groupby('Ensembl_Transcript_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ensembl_trans.dropna(subset = ['Ensembl_Transcript_ID'], inplace=True)


In [5]:
ensembl_gene = biomedgraphica_gene[['Ensembl_Gene_ID', 'BioMedGraphica_ID']]
ensembl_gene.dropna(subset = ['Ensembl_Gene_ID'], inplace=True)
ensembl_gene = ensembl_gene.assign(Ensembl_Gene_ID=biomedgraphica_gene['Ensembl_Gene_ID'].str.split(';')).explode('Ensembl_Gene_ID')

ensembl_gene_to_individualid = ensembl_gene.groupby('Ensembl_Gene_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ensembl_gene.dropna(subset = ['Ensembl_Gene_ID'], inplace=True)


In [6]:
df_ensembl_gene_transcript['To_ID'] = df_ensembl_gene_transcript['Transcript stable ID'].map(ensembl_transcript_to_individualid)
df_ensembl_gene_transcript['From_ID'] = df_ensembl_gene_transcript['Gene stable ID'].map(ensembl_gene_to_individualid)
df_ensembl_gene_transcript

Unnamed: 0,Gene stable ID,Transcript stable ID,To_ID,From_ID
0,ENSG00000210049,ENST00000387314,BMG_TS029225,BMG_GN176455
1,ENSG00000211459,ENST00000389680,BMG_TS029457,BMG_GN176448
2,ENSG00000210077,ENST00000387342,BMG_TS029226,BMG_GN176470
3,ENSG00000210082,ENST00000387347,BMG_TS029227,BMG_GN176450
4,ENSG00000209082,ENST00000386347,BMG_TS029216,BMG_GN176460
...,...,...,...,...
412029,ENSG00000241860,ENST00000831127,BMG_TS392883,BMG_GN195027
412030,ENSG00000241860,ENST00000466430,BMG_TS077697,BMG_GN195027
412031,ENSG00000241860,ENST00000477740,BMG_TS087551,BMG_GN195027
412032,ENSG00000241860,ENST00000471248,BMG_TS081907,BMG_GN195027


In [7]:
df_ensembl_gene_transcript = df_ensembl_gene_transcript[['From_ID','To_ID']]
df_ensembl_gene_transcript = df_ensembl_gene_transcript.dropna(subset=['From_ID'])
df_ensembl_gene_transcript = df_ensembl_gene_transcript.dropna(subset=['To_ID'])
df_ensembl_gene_transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412034 entries, 0 to 412033
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   From_ID  412034 non-null  object
 1   To_ID    412034 non-null  object
dtypes: object(2)
memory usage: 6.3+ MB


In [8]:
df_ensembl_gene_transcript['From_ID'] = df_ensembl_gene_transcript['From_ID'].apply(lambda x: x.split(';'))
df_ensembl_gene_transcript['To_ID'] = df_ensembl_gene_transcript['To_ID'].apply(lambda x: x.split(';'))

df_ensembl_gene_transcript = df_ensembl_gene_transcript.explode('From_ID')
df_ensembl_gene_transcript = df_ensembl_gene_transcript.explode('To_ID')

df_ensembl_gene_transcript.drop_duplicates(inplace=True)
df_ensembl_gene_transcript

Unnamed: 0,From_ID,To_ID
0,BMG_GN176455,BMG_TS029225
1,BMG_GN176448,BMG_TS029457
2,BMG_GN176470,BMG_TS029226
3,BMG_GN176450,BMG_TS029227
4,BMG_GN176460,BMG_TS029216
...,...,...
412029,BMG_GN195027,BMG_TS392883
412030,BMG_GN195027,BMG_TS077697
412031,BMG_GN195027,BMG_TS087551
412032,BMG_GN195027,BMG_TS081907


### RefSeq Mapping

In [9]:
refseq_gene = biomedgraphica_gene[['RefSeq_ID', 'BioMedGraphica_ID']]
refseq_gene.dropna(subset = ['RefSeq_ID'], inplace=True)
refseq_gene = refseq_gene.assign(RefSeq_ID=biomedgraphica_gene['RefSeq_ID'].str.split(';')).explode('RefSeq_ID')

refseq_gene_to_individualid = refseq_gene.groupby('RefSeq_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refseq_gene.dropna(subset = ['RefSeq_ID'], inplace=True)


In [10]:
refseq_trans = biomedgraphica_transcript[['RefSeq_ID', 'BioMedGraphica_ID']]
refseq_trans.dropna(subset = ['RefSeq_ID'], inplace=True)
refseq_trans = refseq_trans.assign(RefSeq_ID=biomedgraphica_transcript['RefSeq_ID'].str.split(';')).explode('RefSeq_ID')

refseq_trans_to_individual = refseq_trans.groupby('RefSeq_ID')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refseq_trans.dropna(subset = ['RefSeq_ID'], inplace=True)


In [11]:
df_refseq_gene_transcript['From_ID'] = df_refseq_gene_transcript['RSG'].map(refseq_gene_to_individualid)
df_refseq_gene_transcript['To_ID'] = df_refseq_gene_transcript['RNA'].map(refseq_trans_to_individual)
df_refseq_gene_transcript

Unnamed: 0,RSG,RNA,From_ID,To_ID
0,NG_029916.1,NM_014576.4,BMG_GN171396,BMG_TS023854
1,NG_029916.1,NM_138932.3,BMG_GN171396,
2,NG_029916.1,NM_138933.3,BMG_GN171396,
3,NG_029916.1,NM_001198818.2,BMG_GN171396,
4,NG_029916.1,NM_001198819.2,BMG_GN171396,
...,...,...,...,...
33416,NG_053150.1,NM_020928.2,BMG_GN180221,BMG_TS002391
33417,NG_033939.1,NM_007057.4,BMG_GN026200,BMG_TS023820
33418,NG_033939.1,NM_032997.3,BMG_GN026200,
33419,NG_033939.1,NM_001005413.1,BMG_GN026200,


In [12]:
refseq_gene_transcript = df_refseq_gene_transcript[['From_ID','To_ID']]
refseq_gene_transcript.dropna(subset=['From_ID'], inplace=True)
refseq_gene_transcript.dropna(subset=['To_ID'], inplace = True)
refseq_gene_transcript

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refseq_gene_transcript.dropna(subset=['From_ID'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refseq_gene_transcript.dropna(subset=['To_ID'], inplace = True)


Unnamed: 0,From_ID,To_ID
0,BMG_GN171396,BMG_TS023854
11,BMG_GN166569,BMG_TS009928
12,BMG_GN164978,BMG_TS007200
16,BMG_GN177970,BMG_TS214111
17,BMG_GN186187,BMG_TS000417
...,...,...
33411,BMG_GN186535,BMG_TS008342
33413,BMG_GN178523,BMG_TS141990
33416,BMG_GN180221,BMG_TS002391
33417,BMG_GN026200,BMG_TS023820


In [13]:
# Split individual IDs by semicolon and explode into multiple rows
final_df = refseq_gene_transcript.copy()

# Split and explode individualid_RSG
final_df['From_ID'] = final_df['From_ID'].str.split(';')
final_df = final_df.explode('From_ID').reset_index(drop=True)

# Split and explode individualid_RNA
final_df['To_ID'] = final_df['To_ID'].str.split(';')
final_df = final_df.explode('To_ID').reset_index(drop=True)

final_df

Unnamed: 0,From_ID,To_ID
0,BMG_GN171396,BMG_TS023854
1,BMG_GN166569,BMG_TS009928
2,BMG_GN164978,BMG_TS007200
3,BMG_GN177970,BMG_TS214111
4,BMG_GN186187,BMG_TS000417
...,...,...
6996,BMG_GN186535,BMG_TS008342
6997,BMG_GN178523,BMG_TS141990
6998,BMG_GN180221,BMG_TS002391
6999,BMG_GN026200,BMG_TS023820


### Gene-Transcript Relation

In [14]:
biomedgraphica_ensembl_gene_transcript = df_ensembl_gene_transcript.copy()
biomedgraphica_ensembl_gene_transcript['source1'] = 'Ensembl'

biomedgraphica_refseq_gene_transcript = final_df.copy()
biomedgraphica_refseq_gene_transcript['source2'] = 'RefSeq'

biomedgraphica_gene_transcript = pd.merge(biomedgraphica_ensembl_gene_transcript, biomedgraphica_refseq_gene_transcript, how='outer')
biomedgraphica_gene_transcript

Unnamed: 0,From_ID,To_ID,source1,source2
0,BMG_GN000001,BMG_TS003701,Ensembl,
1,BMG_GN000001,BMG_TS179160,Ensembl,
2,BMG_GN000001,BMG_TS180263,Ensembl,
3,BMG_GN000001,BMG_TS181076,Ensembl,
4,BMG_GN000001,BMG_TS182582,Ensembl,
...,...,...,...,...
427805,BMG_GN230311,BMG_TS412217,Ensembl,
427806,BMG_GN230312,BMG_TS412299,Ensembl,
427807,BMG_GN230312,BMG_TS412300,Ensembl,
427808,BMG_GN230312,BMG_TS412301,Ensembl,


In [15]:
def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

biomedgraphica_gene_transcript = merge_string_columns(biomedgraphica_gene_transcript, ['source1', 'source2'], 'Source')
biomedgraphica_gene_transcript

Unnamed: 0,From_ID,To_ID,Source
0,BMG_GN000001,BMG_TS003701,Ensembl
1,BMG_GN000001,BMG_TS179160,Ensembl
2,BMG_GN000001,BMG_TS180263,Ensembl
3,BMG_GN000001,BMG_TS181076,Ensembl
4,BMG_GN000001,BMG_TS182582,Ensembl
...,...,...,...
427805,BMG_GN230311,BMG_TS412217,Ensembl
427806,BMG_GN230312,BMG_TS412299,Ensembl
427807,BMG_GN230312,BMG_TS412300,Ensembl
427808,BMG_GN230312,BMG_TS412301,Ensembl


In [16]:
max_length = len(str(len(biomedgraphica_gene_transcript)))
biomedgraphica_gene_transcript['Type'] = 'Gene-Transcript'
biomedgraphica_gene_transcript['BioMedGraphica_ID'] = ['BMG_ED_GNTS' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_gene_transcript) + 1)]
columns = ['BioMedGraphica_ID'] + [col for col in biomedgraphica_gene_transcript.columns if col != 'BioMedGraphica_ID']  # re-order columns
biomedgraphica_gene_transcript = biomedgraphica_gene_transcript[columns]
biomedgraphica_gene_transcript

Unnamed: 0,BioMedGraphica_ID,From_ID,To_ID,Source,Type
0,BMG_ED_GNTS000001,BMG_GN000001,BMG_TS003701,Ensembl,Gene-Transcript
1,BMG_ED_GNTS000002,BMG_GN000001,BMG_TS179160,Ensembl,Gene-Transcript
2,BMG_ED_GNTS000003,BMG_GN000001,BMG_TS180263,Ensembl,Gene-Transcript
3,BMG_ED_GNTS000004,BMG_GN000001,BMG_TS181076,Ensembl,Gene-Transcript
4,BMG_ED_GNTS000005,BMG_GN000001,BMG_TS182582,Ensembl,Gene-Transcript
...,...,...,...,...,...
427805,BMG_ED_GNTS427806,BMG_GN230311,BMG_TS412217,Ensembl,Gene-Transcript
427806,BMG_ED_GNTS427807,BMG_GN230312,BMG_TS412299,Ensembl,Gene-Transcript
427807,BMG_ED_GNTS427808,BMG_GN230312,BMG_TS412300,Ensembl,Gene-Transcript
427808,BMG_ED_GNTS427809,BMG_GN230312,BMG_TS412301,Ensembl,Gene-Transcript


In [17]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Relation' / 'Gene-Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_Transcript.csv'
biomedgraphica_gene_transcript.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Relation\Gene-Transcript\BioMedGraphica_Gene_Transcript.csv
