# BioMedGraphica Gene

## 1. Data Access

### Direct Download Links  
**OMIM**: Can be downloaded directly via the link without the need for registration. [Link](https://omim.org/static/omim/data/mim2gene.txt)  
**HGNC**: Can be downloaded directly via the link without the need for registration. [Link](https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_pub_chrom_map&col=gd_pub_ensembl_id&col=gd_pub_eg_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit)  
**NCBI**: Can be downloaded directly via the link without the need for registration. [Link1](https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz); [Link2](https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz)  
**RefSeq**: Can be downloaded directly via the link without the need for registration. [Link](https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz)

### Ensembl API

In [2]:
import pandas as pd
from pybiomart import Server

# List all available attributes
def list_attributes():
    server = Server(host='http://www.ensembl.org')
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    attributes = dataset.list_attributes()
    return attributes

attributes = list_attributes()

def fetch_ensembl_data(attributes):
    server = Server(host='http://www.ensembl.org')
    #https://www.ensembl.org/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL
    #this link shows that hsapiens_gene_ensembl is the GRCh38.p14
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    
    response = dataset.query(attributes)
    
    return response

attributes=['ensembl_gene_id', 'ensembl_gene_id_version','start_position', 'end_position','gene_biotype', 'hgnc_id', 'hgnc_symbol']
data_ensembl = fetch_ensembl_data(attributes)
data_ensembl.to_csv('gene_ensembl.csv', index=False)

### RefSeq Data Pre-Process

In [None]:
# filter human data
def filter_data(input_file, output_file):

    df = pd.read_csv(input_file)
    
    filtered_df = df[(df['NCBI_tax_id'] == 9606) & (df['UniProtKB_tax_id'] == 9606)]
    
    filtered_df.to_csv(output_file, index=False)

# replace the input_file and output_file with the path of the files in your system
filter_data('gene2refseq', 'refseq_geneid_human.csv')

### NCBI Gene Data Pre-Process

In [None]:
ncbi_info = pd.read_csv('gene_info', delimiter='\t')
# filter out only human entries
ncbi_info_human = ncbi_info[ncbi_info['#tax_id'] == 9606].reset_index(drop=True)
ncbi_info_human.to_csv('gene_info_human.csv', index=False)

## 2. Load data

### 2.1 Load Ensembl Data

In [4]:
import pandas as pd

df_ensembl = pd.read_csv('gene_ensembl.csv')
# Check the data types
df_ensembl['Gene start (bp)'] = df_ensembl['Gene start (bp)'].astype(str)
df_ensembl['Gene end (bp)'] = df_ensembl['Gene end (bp)'].astype(str)

df_ensembl_merge = df_ensembl.groupby('Gene stable ID').agg({
    'Gene stable ID version': lambda x: ';'.join(x.dropna().unique()),
    'Gene start (bp)': lambda x: ';'.join(x.dropna().unique()),
    'Gene end (bp)': lambda x: ';'.join(x.dropna().unique()),
    'Gene type': lambda x: ';'.join(x.dropna().unique()),
    'HGNC ID': lambda x: ';'.join(x.dropna().unique()),
    'HGNC symbol': lambda x: ';'.join(x.dropna().unique())
}).reset_index()
df_ensembl_merge.replace('', pd.NA, inplace=True)
df_ensembl_merge

Unnamed: 0,Gene stable ID,Gene stable ID version,Gene start (bp),Gene end (bp),Gene type,HGNC ID,HGNC symbol
0,ENSG00000000003,ENSG00000000003.16,100627108,100639991,protein_coding,HGNC:11858,TSPAN6
1,ENSG00000000005,ENSG00000000005.6,100584936,100599885,protein_coding,HGNC:17757,TNMD
2,ENSG00000000419,ENSG00000000419.14,50934867,50959140,protein_coding,HGNC:3005,DPM1
3,ENSG00000000457,ENSG00000000457.14,169849631,169894267,protein_coding,HGNC:19285,SCYL3
4,ENSG00000000460,ENSG00000000460.17,169662007,169854080,protein_coding,HGNC:25565,FIRRM
...,...,...,...,...,...,...,...
86397,ENSG00000310553,ENSG00000310553.1,267677,276210,lncRNA,,
86398,ENSG00000310554,ENSG00000310554.1,278946,281825,lncRNA,,
86399,ENSG00000310555,ENSG00000310555.1,57138638,57145842,lncRNA,,
86400,ENSG00000310556,ENSG00000310556.1,1169151,1179160,lncRNA,,


### 2.2 Load OMIM Data

In [6]:
df_OMIM = pd.read_csv('mim2gene.txt', sep='\t', comment='#', header=None, dtype=str)
df_OMIM.columns = ['MIM Number', 'MIM Entry Type', 'Entrez Gene ID', 'HGNC symbol', 'Ensembl']

# Filter out only gene entries
df_OMIM = df_OMIM[df_OMIM['MIM Entry Type'] == 'gene'].reset_index(drop=True)
df_OMIM_gene = df_OMIM[['MIM Number', 'Entrez Gene ID', 'HGNC symbol']]
df_OMIM_gene

Unnamed: 0,MIM Number,Entrez Gene ID,HGNC symbol
0,100640,216,ALDH1A1
1,100660,218,ALDH3A1
2,100670,219,ALDH1B1
3,100678,39,ACAT2
4,100690,1134,CHRNA1
...,...,...,...
17458,621073,254122,SNX32
17459,621074,91748,MIDEAS
17460,621075,120224,TMEM45B
17461,621076,221294,NT5DC1


Add Chrom number

In [9]:
omim_chrom = pd.read_csv('genemap2.txt', sep='\t', comment='#', header=None)
omim_chrom.columns = ['Chromosome', 'Genomic Position Start', 'Genomic Position End', 'Cyto Location', 'Computed Cyto Location', 'MIM Number', 'Gene/Locus And Other Related Symbols', 'Gene Name', 'Approved Gene Symbol', 'Entrez Gene ID', 'Ensembl Gene ID', 'Comments', 'Phenotypes', 'Mouse Gene Symbol/ID']
omim_chrom_filter = omim_chrom[['MIM Number', 'Chromosome']]
omim_chrom_filter.drop_duplicates(inplace=True)
omim_chrom_filter.reset_index(drop=True, inplace=True)
omim_chrom_filter

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  omim_chrom_filter.drop_duplicates(inplace=True)


Unnamed: 0,MIM Number,Chromosome
0,612367,chr1
1,606788,chr1
2,605462,chr1
3,606928,chr1
4,618815,chr1
...,...,...
18620,400016,chrY
18621,400036,chrY
18622,475000,chrY
18623,400043,chrY


Combine OMIM data

In [10]:
omim_chrom_filter['MIM Number'] = omim_chrom_filter['MIM Number'].astype(str)
omim_final = pd.merge(df_OMIM_gene, omim_chrom_filter, on='MIM Number', how='left')

df_OMIM_gene_symbol_unique = omim_final.groupby('Entrez Gene ID').agg({
    'MIM Number': lambda x: ';'.join(x.dropna().unique()), 
    'HGNC symbol': lambda x: ';'.join(x.dropna().unique()),
    'Chromosome': lambda x: ';'.join(map(str, x.dropna().unique())) if x.notna().any() else ''
}).reset_index()
df_OMIM_gene_symbol_unique['Chromosome'] = df_OMIM_gene_symbol_unique['Chromosome'].str.replace('chr', '').astype(str)
df_OMIM_gene_symbol_unique.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  omim_chrom_filter['MIM Number'] = omim_chrom_filter['MIM Number'].astype(str)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17360 entries, 0 to 17359
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Entrez Gene ID  17360 non-null  object
 1   MIM Number      17360 non-null  object
 2   HGNC symbol     17360 non-null  object
 3   Chromosome      17360 non-null  object
dtypes: object(4)
memory usage: 542.6+ KB


### 2.3 Load HGNC Data

In [12]:
df_HGNC = pd.read_csv('HGNC_Custom.txt', delimiter='\t')
df_HGNC['NCBI gene ID'] = df_HGNC['NCBI gene ID'].fillna(-1).astype(int).replace(-1, pd.NA)
df_HGNC

Unnamed: 0,HGNC ID,Approved symbol,Approved name,Chromosome,Ensembl gene ID,NCBI gene ID
0,HGNC:5,A1BG,alpha-1-B glycoprotein,19,ENSG00000121410,1
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,19,ENSG00000268895,503538
2,HGNC:24086,A1CF,APOBEC1 complementation factor,10,ENSG00000148584,29974
3,HGNC:7,A2M,alpha-2-macroglobulin,12,ENSG00000175899,2
4,HGNC:27057,A2M-AS1,A2M antisense RNA 1,12,ENSG00000245105,144571
...,...,...,...,...,...,...
43911,HGNC:55597,FAM13B-AS1,FAM13B antisense RNA 1,5,ENSG00000246323,100130172
43912,HGNC:19371,FAM13C,family with sequence similarity 13 member C,10,ENSG00000148541,220965
43913,HGNC:23015,FAM20A,FAM20A golgi associated secretory pathway pseu...,17,ENSG00000108950,54757
43914,HGNC:23017,FAM20B,FAM20B glycosaminoglycan xylosylkinase,1,ENSG00000116199,9917


In [15]:
# Merge rows with the same 'HGNC ID' and concatenate unique values with semicolons
def merge_rows(df, group_by_col):
    # Group rows and concatenate unique values with semicolons, excluding empty Ensembl gene ID for merging
    df_grouped = df[df[group_by_col].notna()].groupby(group_by_col).agg(lambda x: ';'.join(map(str, x.dropna().unique()))).reset_index()
    
    # Get rows where Ensembl gene ID is NaN or empty
    df_empty = df[df[group_by_col].isna()]
    
    # Combine grouped and empty rows
    df_combined = pd.concat([df_grouped, df_empty], ignore_index=True)

    return df_combined

df_HGNC_ensembl_unique = merge_rows(df_HGNC, 'Ensembl gene ID')
df_HGNC_ensembl_unique.replace('', pd.NA, inplace=True)
df_HGNC_ensembl_unique

Unnamed: 0,Ensembl gene ID,HGNC ID,Approved symbol,Approved name,Chromosome,NCBI gene ID
0,ENSG00000000003,HGNC:11858,TSPAN6,tetraspanin 6,X,7105
1,ENSG00000000005,HGNC:17757,TNMD,tenomodulin,X,64102
2,ENSG00000000419,HGNC:3005,DPM1,dolichyl-phosphate mannosyltransferase subunit...,20,8813
3,ENSG00000000457,HGNC:19285,SCYL3,SCY1 like pseudokinase 3,1,57147
4,ENSG00000000460,HGNC:25565,FIRRM,FIGNL1 interacting regulator of recombination ...,1,55732
...,...,...,...,...,...,...
43910,,HGNC:3463,ESAT,esterase activator,14,2096
43911,,HGNC:3464,ESB3,esterase B3,16,2097
43912,,HGNC:23868,FAM8A7P,"family with sequence similarity 8 member A7, p...",Y,386725
43913,,HGNC:23870,FAM8A9P,"family with sequence similarity 8 member A9, p...",Y,386727


### 2.4 Load NCBI Data

General NCBI Info

In [None]:
ncbi_info_human = pd.read_csv('gene_info_human.csv')
# Filter out only necessary columns
ncbi_info_filter = ncbi_info_human[['GeneID', 'Symbol', 'description', 'chromosome']]
ncbi_info_filter = ncbi_info_filter.replace('-', pd.NA)
ncbi_info_filter['chromosome'] = ncbi_info_filter['chromosome'].str.split('|')
ncbi_info_filter = ncbi_info_filter.explode('chromosome').reset_index(drop=True)
# Merge rows with the same 'GeneID' and concatenate unique values with semicolons
ncbi_info_filter_unique = merge_rows(ncbi_info_filter, 'GeneID')
ncbi_info_filter_unique

NCBI and Ensembl Mapping Relationship

In [14]:
# load the data and filter out only human entries
df_NCBI = pd.read_csv('gene2ensembl', delimiter='\t')
df_NCBI_gene = df_NCBI[df_NCBI['#tax_id'] == 9606].reset_index(drop=True)
df_NCBI_gene = df_NCBI_gene[['GeneID', 'Ensembl_gene_identifier']]
df_NCBI_gene.drop_duplicates(inplace=True)

df_NCBI_gene = pd.merge(df_NCBI_gene, ncbi_info_filter_unique, left_on='GeneID', right_on='GeneID', how='left')
df_NCBI_gene['GeneID'] = df_NCBI_gene['GeneID'].astype(str)
df_NCBI_gene = df_NCBI_gene.rename(columns={'chromosome': 'Chromosome'})

df_NCBI_gene = df_NCBI_gene.groupby('Ensembl_gene_identifier').agg({
    'GeneID': lambda x: ';'.join(map(str, x.unique())),
    'Symbol': lambda x: ';'.join(map(str, x.unique())),
    'description': lambda x: ';'.join(map(str, x.unique())),
    'Chromosome': lambda x: ';'.join(map(str, x.unique()))
}).reset_index()
df_NCBI_gene

Unnamed: 0,Ensembl_gene_identifier,GeneID,Symbol,description,Chromosome
0,ENSG00000000003,7105,TSPAN6,tetraspanin 6,X
1,ENSG00000000005,64102,TNMD,tenomodulin,X
2,ENSG00000000419,8813,DPM1,dolichyl-phosphate mannosyltransferase subunit...,20
3,ENSG00000000457,57147,SCYL3,SCY1 like pseudokinase 3,1
4,ENSG00000000460,55732,FIRRM,FIGNL1 interacting regulator of recombination ...,1
...,...,...,...,...,...
38184,ENSG00000310526,653635,WASH7P,"WASP family homolog 7, pseudogene",1
38185,ENSG00000310527,100996442,WASH9P,"WAS protein family homolog 9, pseudogene",1
38186,ENSG00000310533,100101440,PMS2P7,"PMS1 homolog 2, mismatch repair system compone...",7
38187,ENSG00000310537,5380,PMS2P2,"PMS1 homolog 2, mismatch repair system compone...",7


### 2.5 Load RefSeq Data

In [3]:
df_refseq = pd.read_csv('refseq_geneid_human.csv')

df_refseq_gene = df_refseq[['GeneID', 'status', 'genomic_nucleotide_accession.version', 'Symbol']]
df_refseq_gene = df_refseq_gene.reset_index(drop=True)

df_refseq_gene_reviewed = df_refseq_gene[(df_refseq_gene['status'] == 'REVIEWED') | (df_refseq_gene['status'] == 'MODEL')]
df_refseq_gene_reviewed.drop(columns=['status'], inplace=True)
df_refseq_gene_reviewed.drop_duplicates(inplace=True)
df_refseq_gene_reviewed = df_refseq_gene_reviewed.reset_index(drop=True)
# Make sure that the GeneIDs are unique
df_refseq_gene_merge = df_refseq_gene_reviewed.copy()
df_refseq_gene_merge = df_refseq_gene_merge[df_refseq_gene_merge['genomic_nucleotide_accession.version'] != '-']
df_refseq_gene_merge.info()

  df_refseq = pd.read_csv('refseq_geneid_human.csv')


<class 'pandas.core.frame.DataFrame'>
Index: 461410 entries, 0 to 461429
Data columns (total 3 columns):
 #   Column                                Non-Null Count   Dtype 
---  ------                                --------------   ----- 
 0   GeneID                                461410 non-null  int64 
 1   genomic_nucleotide_accession.version  461410 non-null  object
 2   Symbol                                461410 non-null  object
dtypes: int64(1), object(2)
memory usage: 14.1+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_refseq_gene_reviewed.drop(columns=['status'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_refseq_gene_reviewed.drop_duplicates(inplace=True)


Add Chrom number

In [16]:
refseq_chrom_human = ncbi_info_human
refseq_chrom_human = refseq_chrom_human[['GeneID', 'chromosome']].drop_duplicates().reset_index(drop=True)
refseq_chrom_human

Unnamed: 0,GeneID,chromosome
0,1,19
1,2,12
2,3,12
3,9,8
4,10,8
...,...,...
193335,139281665,19
193336,139281666,8
193337,139281667,12
193338,139281668,1


Combine RefSeq data

In [17]:
df_refseq_gene_merge = pd.merge(df_refseq_gene_merge, refseq_chrom_human, left_on='GeneID', right_on='GeneID', how='left')
df_refseq_gene_merge = df_refseq_gene_merge.groupby('GeneID').agg({
        'genomic_nucleotide_accession.version': lambda x: ';'.join(map(str, x.unique())),
        'chromosome': lambda x: ';'.join(map(str, x.dropna().unique())),
    }).reset_index()
df_refseq_gene_merge['GeneID'] = df_refseq_gene_merge['GeneID'].astype(str)
df_refseq_gene_merge.rename(columns={'chromosome': 'Chromosome'}, inplace=True)
df_refseq_gene_merge

Unnamed: 0,GeneID,genomic_nucleotide_accession.version,Chromosome
0,1,NC_000019.10;NC_060943.1,19
1,2,NC_000012.12;NC_060936.1;NG_011717.2,12
2,9,NC_000008.11;NC_060932.1;NG_012245.2,8
3,10,NC_000008.11;NC_060932.1;NG_012246.1,8
4,12,NG_012879.1;NC_000014.9;NC_060938.1,14
...,...,...,...
160841,132211114,NC_000018.10;NC_060942.1;NG_231517.1,18
160842,132211115,NC_000022.11;NC_060946.1;NG_231518.1;NW_003315...,22
160843,133206433,NC_000001.11;NC_060925.1;NG_242261.1,1
160844,133206434,NC_000001.11;NC_060925.1;NG_242262.1,1


## 3. Merge Data

In [18]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

def merge_string_columns(df, columns, merge_name, separator='; '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

### 3.1 Ensembl + HGNC

In [19]:
df_ensembl_hgnc = pd.merge(df_ensembl, df_HGNC_ensembl_unique, left_on='Gene stable ID', right_on = 'Ensembl gene ID', how='outer')
df_ensembl_hgnc_v1 = merge_column(df_ensembl_hgnc, 'Gene stable ID', 'Ensembl gene ID', 'Ensembl ID')
df_ensembl_hgnc_v1.rename(columns={'Gene stable ID version': 'Ensembl ID version'}, inplace=True)
df_ensembl_hgnc_v2 = merge_column(df_ensembl_hgnc_v1, 'HGNC ID_x', 'HGNC ID_y', 'HGNC ID')
df_ensembl_hgnc_v2 = merge_column(df_ensembl_hgnc_v2, 'Approved symbol', 'HGNC symbol', 'HGNC_symbol')
df_ensembl_hgnc_v2

Unnamed: 0,Ensembl ID version,Gene start (bp),Gene end (bp),Gene type,Approved name,Chromosome,NCBI gene ID,Ensembl ID,HGNC ID,HGNC_symbol
0,ENSG00000000003.16,100627108,100639991,protein_coding,tetraspanin 6,X,7105,ENSG00000000003,HGNC:11858,TSPAN6
1,ENSG00000000005.6,100584936,100599885,protein_coding,tenomodulin,X,64102,ENSG00000000005,HGNC:17757,TNMD
2,ENSG00000000419.14,50934867,50959140,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,20,8813,ENSG00000000419,HGNC:3005,DPM1
3,ENSG00000000457.14,169849631,169894267,protein_coding,SCY1 like pseudokinase 3,1,57147,ENSG00000000457,HGNC:19285,SCYL3
4,ENSG00000000460.17,169662007,169854080,protein_coding,FIGNL1 interacting regulator of recombination ...,1,55732,ENSG00000000460,HGNC:25565,FIRRM
...,...,...,...,...,...,...,...,...,...,...
89134,,,,,esterase activator,14,2096,,HGNC:3463,ESAT
89135,,,,,esterase B3,16,2097,,HGNC:3464,ESB3
89136,,,,,"family with sequence similarity 8 member A7, p...",Y,386725,,HGNC:23868,FAM8A7P
89137,,,,,"family with sequence similarity 8 member A9, p...",Y,386727,,HGNC:23870,FAM8A9P


### 3.2 Add NCBI data

In [20]:
df_ensembl_hgnc_ncbi = df_ensembl_hgnc_v2.copy()
df_NCBI_gene_v1 = df_NCBI_gene.rename(columns={'Ensembl_gene_identifier': 'Ensembl ID'})

df_ensembl_hgnc_ncbi_v1 = pd.merge(df_ensembl_hgnc_ncbi, df_NCBI_gene_v1, on='Ensembl ID', how='outer')
df_ensembl_hgnc_ncbi_v2 = merge_column(df_ensembl_hgnc_ncbi_v1, 'GeneID', 'NCBI gene ID', 'NCBI_ID')
df_ensembl_hgnc_ncbi_v2 = merge_column(df_ensembl_hgnc_ncbi_v2, 'Symbol', 'HGNC_symbol', 'HGNC Symbol')
df_ensembl_hgnc_ncbi_v2 = merge_string_columns(df_ensembl_hgnc_ncbi_v2, ['description', 'Approved name'], 'Name')
df_ensembl_hgnc_ncbi_v2 = merge_string_columns(df_ensembl_hgnc_ncbi_v2, ['Chromosome_x', 'Chromosome_y'], 'Chromosome')
df_ensembl_hgnc_ncbi_v2.reset_index(drop=True, inplace=True)
df_ensembl_hgnc_ncbi_v2

Unnamed: 0,Ensembl ID version,Gene start (bp),Gene end (bp),Gene type,Ensembl ID,HGNC ID,NCBI_ID,HGNC Symbol,Name,Chromosome
0,ENSG00000000003.16,100627108,100639991,protein_coding,ENSG00000000003,HGNC:11858,7105,TSPAN6,tetraspanin 6,X
1,ENSG00000000005.6,100584936,100599885,protein_coding,ENSG00000000005,HGNC:17757,64102,TNMD,tenomodulin,X
2,ENSG00000000419.14,50934867,50959140,protein_coding,ENSG00000000419,HGNC:3005,8813,DPM1,dolichyl-phosphate mannosyltransferase subunit...,20
3,ENSG00000000457.14,169849631,169894267,protein_coding,ENSG00000000457,HGNC:19285,57147,SCYL3,SCY1 like pseudokinase 3,1
4,ENSG00000000460.17,169662007,169854080,protein_coding,ENSG00000000460,HGNC:25565,55732,FIRRM,FIGNL1 interacting regulator of recombination ...,1
...,...,...,...,...,...,...,...,...,...,...
90220,,,,,,HGNC:3463,2096,ESAT,esterase activator,14
90221,,,,,,HGNC:3464,2097,ESB3,esterase B3,16
90222,,,,,,HGNC:23868,386725,FAM8A7P,"family with sequence similarity 8 member A7, p...",Y
90223,,,,,,HGNC:23870,386727,FAM8A9P,"family with sequence similarity 8 member A9, p...",Y


### 3.3 Add RefSeq data

In [21]:
df_ensembl_hgnc_ncbi_refseq = df_ensembl_hgnc_ncbi_v2.copy()
df_refseq_gene_merge.rename(columns={'GeneID': 'NCBI_ID'}, inplace=True)

df_ensembl_hgnc_ncbi_refseq_v1 = pd.merge(df_ensembl_hgnc_ncbi_refseq, df_refseq_gene_merge, on='NCBI_ID', how='outer')
df_ensembl_hgnc_ncbi_refseq_v1.rename(columns={'genomic_nucleotide_accession.version': 'RefSeq ID'}, inplace=True)
df_ensembl_hgnc_ncbi_refseq_v2 = merge_string_columns(df_ensembl_hgnc_ncbi_refseq_v1, ['Chromosome_x','Chromosome_y'], 'Chromosome')
df_ensembl_hgnc_ncbi_refseq_v2

Unnamed: 0,Ensembl ID version,Gene start (bp),Gene end (bp),Gene type,Ensembl ID,HGNC ID,NCBI_ID,HGNC Symbol,Name,RefSeq ID,Chromosome
0,ENSG00000121410.12,58345178,58353492,protein_coding,ENSG00000121410,HGNC:5,1,A1BG,alpha-1-B glycoprotein,NC_000019.10;NC_060943.1,19
1,ENSG00000156006.5,18391282,18401218,protein_coding,ENSG00000156006,HGNC:7646,10,NAT2,N-acetyltransferase 2,NC_000008.11;NC_060932.1;NG_012246.1,8
2,ENSG00000196839.14,44584896,44652252,protein_coding,ENSG00000196839,HGNC:186,100,ADA,adenosine deaminase,NG_007385.1;NC_000020.11;NC_060944.1,20
3,ENSG00000170558.10,27932879,28177946,protein_coding,ENSG00000170558,HGNC:1759,1000,CDH2,cadherin 2,NC_000018.10;NC_060942.1;NG_011959.2,18
4,ENSG00000117020.19,243488233,243851079,protein_coding,ENSG00000117020,HGNC:393,10000,AKT3,AKT serine/threonine kinase 3,NC_000001.11;NT_187519.1;NC_060925.1;NG_029764.2,1
...,...,...,...,...,...,...,...,...,...,...,...
232142,,,,,,HGNC:37114,,DDX11L15,DEAD/H-box helicase 11 like 15 (pseudogene),,X
232143,,,,,,HGNC:3142,,EBVM1,Epstein Barr virus modification site 1,,11
232144,,,,,,HGNC:53421,,ERVH-8,"endogenous retrovirus group H member 8, envelope",,2
232145,,,,,,HGNC:53422,,ERVH-9,"endogenous retrovirus group H member 9, envelope",,3


### 3.4 Add OMIM data

In [22]:
df_ensembl_hgnc_ncbi_refseq_omim = df_ensembl_hgnc_ncbi_refseq_v2.copy()
df_OMIM_gene_symbol_unique.rename(columns={'Entrez Gene ID': 'NCBI_ID'}, inplace=True)
df_ensembl_hgnc_ncbi_refseq_omim = pd.merge(df_ensembl_hgnc_ncbi_refseq_omim, df_OMIM_gene_symbol_unique, on='NCBI_ID', how='outer')

df_ensembl_hgnc_ncbi_refseq_omim_v1 = merge_column(df_ensembl_hgnc_ncbi_refseq_omim, 'HGNC Symbol', 'HGNC symbol', 'HGNC_Symbol')
df_ensembl_hgnc_ncbi_refseq_omim_v1 = merge_string_columns(df_ensembl_hgnc_ncbi_refseq_omim_v1, ['Chromosome_x', 'Chromosome_y'], 'Chromosome')
df_ensembl_hgnc_ncbi_refseq_omim_v1

Unnamed: 0,Ensembl ID version,Gene start (bp),Gene end (bp),Gene type,Ensembl ID,HGNC ID,NCBI_ID,Name,RefSeq ID,MIM Number,HGNC_Symbol,Chromosome
0,ENSG00000121410.12,58345178,58353492,protein_coding,ENSG00000121410,HGNC:5,1,alpha-1-B glycoprotein,NC_000019.10;NC_060943.1,138670,A1BG,19
1,ENSG00000156006.5,18391282,18401218,protein_coding,ENSG00000156006,HGNC:7646,10,N-acetyltransferase 2,NC_000008.11;NC_060932.1;NG_012246.1,612182,NAT2,8
2,ENSG00000196839.14,44584896,44652252,protein_coding,ENSG00000196839,HGNC:186,100,adenosine deaminase,NG_007385.1;NC_000020.11;NC_060944.1,608958,ADA,20
3,ENSG00000170558.10,27932879,28177946,protein_coding,ENSG00000170558,HGNC:1759,1000,cadherin 2,NC_000018.10;NC_060942.1;NG_011959.2,114020,CDH2,18
4,ENSG00000117020.19,243488233,243851079,protein_coding,ENSG00000117020,HGNC:393,10000,AKT serine/threonine kinase 3,NC_000001.11;NT_187519.1;NC_060925.1;NG_029764.2,611223,AKT3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
232173,,,,,,HGNC:37114,,DEAD/H-box helicase 11 like 15 (pseudogene),,,DDX11L15,X
232174,,,,,,HGNC:3142,,Epstein Barr virus modification site 1,,,EBVM1,11
232175,,,,,,HGNC:53421,,"endogenous retrovirus group H member 8, envelope",,,ERVH-8,2
232176,,,,,,HGNC:53422,,"endogenous retrovirus group H member 9, envelope",,,ERVH-9,3


### 3.5 Combine NCBI Column

In [23]:
df_ncbi_not_null = df_ensembl_hgnc_ncbi_refseq_omim_v1[df_ensembl_hgnc_ncbi_refseq_omim_v1['NCBI_ID'].notnull()]
df_ncbi_null = df_ensembl_hgnc_ncbi_refseq_omim_v1[df_ensembl_hgnc_ncbi_refseq_omim_v1['NCBI_ID'].isnull()]
# Split the NCBI_ID column by semicolon and explode the rows
df_ncbi_not_null['NCBI_ID'] = df_ncbi_not_null['NCBI_ID'].str.split(';')
df_ncbi_not_null = df_ncbi_not_null.explode('NCBI_ID').reset_index(drop=True)
df_ncbi_not_null

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ncbi_not_null['NCBI_ID'] = df_ncbi_not_null['NCBI_ID'].str.split(';')


Unnamed: 0,Ensembl ID version,Gene start (bp),Gene end (bp),Gene type,Ensembl ID,HGNC ID,NCBI_ID,Name,RefSeq ID,MIM Number,HGNC_Symbol,Chromosome
0,ENSG00000121410.12,58345178,58353492,protein_coding,ENSG00000121410,HGNC:5,1,alpha-1-B glycoprotein,NC_000019.10;NC_060943.1,138670,A1BG,19
1,ENSG00000156006.5,18391282,18401218,protein_coding,ENSG00000156006,HGNC:7646,10,N-acetyltransferase 2,NC_000008.11;NC_060932.1;NG_012246.1,612182,NAT2,8
2,ENSG00000196839.14,44584896,44652252,protein_coding,ENSG00000196839,HGNC:186,100,adenosine deaminase,NG_007385.1;NC_000020.11;NC_060944.1,608958,ADA,20
3,ENSG00000170558.10,27932879,28177946,protein_coding,ENSG00000170558,HGNC:1759,1000,cadherin 2,NC_000018.10;NC_060942.1;NG_011959.2,114020,CDH2,18
4,ENSG00000117020.19,243488233,243851079,protein_coding,ENSG00000117020,HGNC:393,10000,AKT serine/threonine kinase 3,NC_000001.11;NT_187519.1;NC_060925.1;NG_029764.2,611223,AKT3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
191801,ENSG00000159197.5,34180729,34371381,protein_coding,ENSG00000159197,HGNC:6242,9992,potassium voltage-gated channel subfamily E re...,NG_008804.1;NC_000021.9;NC_060945.1,603796,KCNE2,21
191802,ENSG00000070413.21,19036282,19122454,protein_coding,ENSG00000070413,HGNC:2845,9993,DiGeorge syndrome critical region gene 2,NG_021333.2;NC_000022.11;NC_060946.1,600594,DGCR2,22
191803,ENSG00000118412.13,89829894,89874436,protein_coding,ENSG00000118412,HGNC:1510,9994,caspase 8 associated protein 2,NW_017363815.1;NC_060930.1,606880,CASP8AP2,6
191804,ENSG00000234402.1,105546610,105549694,transcribed_unprocessed_pseudogene,ENSG00000234402,HGNC:3324,9995,"ETS transcription factor ELK2B, pseudogene",,,ELK2BP,14


In [24]:
# Merge rows with the same 'NCBI_ID' and concatenate unique values with semicolons
merged_df = df_ncbi_not_null.groupby('NCBI_ID').agg(lambda x: ';'.join(sorted(set(x.dropna().astype(str)))))
merged_df.reset_index(inplace=True)
# Combine the merged and null rows
df_gene = pd.concat([merged_df, df_ncbi_null], ignore_index=True)
df_gene.replace('', pd.NA, inplace=True)
df_gene

Unnamed: 0,NCBI_ID,Ensembl ID version,Gene start (bp),Gene end (bp),Gene type,Ensembl ID,HGNC ID,Name,RefSeq ID,MIM Number,HGNC_Symbol,Chromosome
0,1,ENSG00000121410.12,58345178,58353492,protein_coding,ENSG00000121410,HGNC:5,alpha-1-B glycoprotein,NC_000019.10;NC_060943.1,138670,A1BG,19
1,10,ENSG00000156006.5,18391282,18401218,protein_coding,ENSG00000156006,HGNC:7646,N-acetyltransferase 2,NC_000008.11;NC_060932.1;NG_012246.1,612182,NAT2,8
2,100,ENSG00000196839.14,44584896,44652252,protein_coding,ENSG00000196839,HGNC:186,adenosine deaminase,NG_007385.1;NC_000020.11;NC_060944.1,608958,ADA,20
3,1000,ENSG00000170558.10,27932879,28177946,protein_coding,ENSG00000170558,HGNC:1759,cadherin 2,NC_000018.10;NC_060942.1;NG_011959.2,114020,CDH2,18
4,10000,ENSG00000117020.19,243488233,243851079,protein_coding,ENSG00000117020,HGNC:393,AKT serine/threonine kinase 3,NC_000001.11;NT_187519.1;NC_060925.1;NG_029764.2,611223,AKT3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
230353,,,,,,,HGNC:37114,DEAD/H-box helicase 11 like 15 (pseudogene),,,DDX11L15,X
230354,,,,,,,HGNC:3142,Epstein Barr virus modification site 1,,,EBVM1,11
230355,,,,,,,HGNC:53421,"endogenous retrovirus group H member 8, envelope",,,ERVH-8,2
230356,,,,,,,HGNC:53422,"endogenous retrovirus group H member 9, envelope",,,ERVH-9,3


## 4. BioMedgraphica ID

In [26]:
# Create a copy of df_gene to avoid modifying the original data
biomedgraphica_gene = df_gene.copy()

# Sort the biomedgraphica_gene DataFrame based on multiple fields in order:
# NCBI_ID, Ensembl ID, Ensembl ID version, HGNC_Symbol, and MIM Number.
# 'na_position' set to 'last' places NaN values at the end of the sorted columns
biomedgraphica_gene = biomedgraphica_gene.sort_values(by=['NCBI_ID', 'Ensembl ID', 'Ensembl ID version', 'HGNC_Symbol', 'MIM Number'], na_position='last')

biomedgraphica_gene.reset_index(drop=True, inplace=True)

# Generate a unique BioMedGraphica_ID for each row in the format 'BMG_GN' + a zero-padded number of max_length digits
max_length = len(str(len(biomedgraphica_gene)))
biomedgraphica_gene['BioMedGraphica_ID'] = ['BMG_GN' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_gene) + 1)]
biomedgraphica_gene = biomedgraphica_gene.rename(columns={'Ensembl ID': 'Ensembl_Gene_ID', 'Ensembl ID version': 'Ensembl_Gene_ID_Version', 
                                                    'MIM Number': 'OMIM_ID', 'RefSeq ID': 'RefSeq_ID','HGNC ID': 'HGNC_ID', 'NCBI_ID':'NCBI_Gene_ID',
                                                    'Gene start (bp)': 'Gene_Start', 'Gene end (bp)': 'Gene_End', 'Gene type': 'Gene_Type', 'Name': 'Gene_Name'})
columns = ['BioMedGraphica_ID'] + [col for col in biomedgraphica_gene.columns if col != 'BioMedGraphica_ID']  # re-order columns

biomedgraphica_gene = biomedgraphica_gene[columns]

# Moving the 'Chromosome' column after 'Gene_End'
cols = biomedgraphica_gene.columns.tolist()
chromosome_index = cols.index("Chromosome")
gene_end_index = cols.index("Gene_End")

# Reordering the columns to move Chromosome after Gene_End
cols.insert(gene_end_index + 1, cols.pop(chromosome_index))
biomedgraphica_gene = biomedgraphica_gene[cols]
biomedgraphica_gene

Unnamed: 0,BioMedGraphica_ID,NCBI_Gene_ID,Ensembl_Gene_ID_Version,Gene_Start,Gene_End,Chromosome,Gene_Type,Ensembl_Gene_ID,HGNC_ID,Gene_Name,RefSeq_ID,OMIM_ID,HGNC_Symbol
0,BMG_GN000001,1,ENSG00000121410.12,58345178,58353492,19,protein_coding,ENSG00000121410,HGNC:5,alpha-1-B glycoprotein,NC_000019.10;NC_060943.1,138670,A1BG
1,BMG_GN000002,10,ENSG00000156006.5,18391282,18401218,8,protein_coding,ENSG00000156006,HGNC:7646,N-acetyltransferase 2,NC_000008.11;NC_060932.1;NG_012246.1,612182,NAT2
2,BMG_GN000003,100,ENSG00000196839.14,44584896,44652252,20,protein_coding,ENSG00000196839,HGNC:186,adenosine deaminase,NG_007385.1;NC_000020.11;NC_060944.1,608958,ADA
3,BMG_GN000004,1000,ENSG00000170558.10,27932879,28177946,18,protein_coding,ENSG00000170558,HGNC:1759,cadherin 2,NC_000018.10;NC_060942.1;NG_011959.2,114020,CDH2
4,BMG_GN000005,10000,ENSG00000117020.19,243488233,243851079,1,protein_coding,ENSG00000117020,HGNC:393,AKT serine/threonine kinase 3,NC_000001.11;NT_187519.1;NC_060925.1;NG_029764.2,611223,AKT3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
230353,BMG_GN230354,,,,,1,,,HGNC:54204,tRNA-Glu (TTC) 8-2,,,TRE-TTC8-2
230354,BMG_GN230355,,,,,1,,,HGNC:54203,tRNA-Asn (GTT) 15-2,,,TRN-GTT15-2
230355,BMG_GN230356,,,,,1,,,HGNC:54205,tRNA-Asn (GTT) 19-2,,,TRN-GTT19-2
230356,BMG_GN230357,,,,,21,,,HGNC:54206,tRNA-Gln (CTG) 8-3,,,TRQ-CTG8-3


In [27]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene.csv'
biomedgraphica_gene.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Folder D:\RA\BMG\BioMedGraphica\Entity\Gene has been created.
Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene.csv


## 5. File Generation

In [1]:
import pandas as pd
from pathlib import Path
import os

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene' / 'BioMedGraphica_Gene.csv'
biomedgraphica_gene = pd.read_csv(target_dir, dtype=str)

### 5.1 BioChem

In [2]:
import pandas as pd

combine_seq = pd.read_csv('combined_sequences.csv',dtype=str)
combine_seq_gene = combine_seq[['gene_id','dna_sequence']]

In [3]:
BMG_gene_ensembl = biomedgraphica_gene[['BioMedGraphica_ID', 'Ensembl_Gene_ID']]
BMG_gene_seq = pd.merge(BMG_gene_ensembl, combine_seq_gene, left_on='Ensembl_Gene_ID', right_on='gene_id', how='left')
BMG_gene_seq = BMG_gene_seq[['BioMedGraphica_ID', 'dna_sequence']]
BMG_gene_seq

Unnamed: 0,BioMedGraphica_ID,dna_sequence
0,BMG_GN000001,ATTGCTGCAGACGCTCACCCCAGACACTCACTGCACCGGAGTGAGC...
1,BMG_GN000002,ACTTTATTACAGACCTTGGAAGCAAGAGGATTGCATTCAGCCTAGT...
2,BMG_GN000003,AGCTCCAGCCTTCCTCGCCTCCTTTCACTCCCAGCTCCCTGGAGTC...
3,BMG_GN000004,GGGGAGAGCGGCGGCGGCTCGCCCAGGTCGCGCAGCGGAGGCCGAG...
4,BMG_GN000005,ATTGGGCACCGCCCACTTCGTGGGCTTCCAGGTGCGAGCCCTCGCG...
...,...,...
230353,BMG_GN230354,
230354,BMG_GN230355,
230355,BMG_GN230356,
230356,BMG_GN230357,


In [4]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_BioChem.csv'
BMG_gene_seq.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene_BioChem.csv


### 5.2 Position

In [5]:
BMG_gene_position = biomedgraphica_gene[['BioMedGraphica_ID', 'HGNC_Symbol', 'Gene_Name','Gene_Type', 'Chromosome', 'Gene_Start', 'Gene_End']]

In [6]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_Position.csv'
BMG_gene_position.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene_Position.csv


### 5.3 Name and ID

GUI Name

In [7]:
def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

gui_name = biomedgraphica_gene.copy()
gui_name = merge_string_columns(gui_name, ['Gene_Name', 'HGNC_Symbol'], 'Gene_Name_List')
gui_name = gui_name[['BioMedGraphica_ID', 'Gene_Name_List']]
gui_name

Unnamed: 0,BioMedGraphica_ID,Gene_Name_List
0,BMG_GN000001,alpha-1-B glycoprotein | A1BG
1,BMG_GN000002,N-acetyltransferase 2 | NAT2
2,BMG_GN000003,ADA | adenosine deaminase
3,BMG_GN000004,cadherin 2 | CDH2
4,BMG_GN000005,AKT serine/threonine kinase 3 | AKT3
...,...,...
230353,BMG_GN230354,TRE-TTC8-2 | tRNA-Glu (TTC) 8-2
230354,BMG_GN230355,TRN-GTT15-2 | tRNA-Asn (GTT) 15-2
230355,BMG_GN230356,TRN-GTT19-2 | tRNA-Asn (GTT) 19-2
230356,BMG_GN230357,TRQ-CTG8-3 | tRNA-Gln (CTG) 8-3


In [8]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_GUI_Name.csv'
gui_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene_GUI_Name.csv


LLM Name and ID

In [9]:
llm_name_id = biomedgraphica_gene.copy()
llm_name_id.drop(columns=['Gene_Start', 'Gene_End', 'Chromosome', 'Gene_Type'], inplace=True)

llm_name_id['NCBI_Gene_ID'] = llm_name_id['NCBI_Gene_ID'].apply(
    lambda x: ' | '.join(f"NCBI Gene ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)
llm_name_id['RefSeq_ID'] = llm_name_id['RefSeq_ID'].apply(
    lambda x: ' | '.join(f"RefSeq ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)
llm_name_id['OMIM_ID'] = llm_name_id['OMIM_ID'].apply(
    lambda x: ' | '.join(f"OMIM ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

column_order = ['BioMedGraphica_ID', 'HGNC_Symbol', 'Gene_Name', 
                'NCBI_Gene_ID', 'Ensembl_Gene_ID', 'HGNC_ID', 'RefSeq_ID', 'OMIM_ID']

llm_name_id = llm_name_id[column_order]
llm_name_id

Unnamed: 0,BioMedGraphica_ID,HGNC_Symbol,Gene_Name,NCBI_Gene_ID,Ensembl_Gene_ID,HGNC_ID,RefSeq_ID,OMIM_ID
0,BMG_GN000001,A1BG,alpha-1-B glycoprotein,NCBI Gene ID:1,ENSG00000121410,HGNC:5,RefSeq ID:NC_000019.10 | RefSeq ID:NC_060943.1,OMIM ID:138670
1,BMG_GN000002,NAT2,N-acetyltransferase 2,NCBI Gene ID:10,ENSG00000156006,HGNC:7646,RefSeq ID:NC_000008.11 | RefSeq ID:NC_060932.1...,OMIM ID:612182
2,BMG_GN000003,ADA,adenosine deaminase,NCBI Gene ID:100,ENSG00000196839,HGNC:186,RefSeq ID:NG_007385.1 | RefSeq ID:NC_000020.11...,OMIM ID:608958
3,BMG_GN000004,CDH2,cadherin 2,NCBI Gene ID:1000,ENSG00000170558,HGNC:1759,RefSeq ID:NC_000018.10 | RefSeq ID:NC_060942.1...,OMIM ID:114020
4,BMG_GN000005,AKT3,AKT serine/threonine kinase 3,NCBI Gene ID:10000,ENSG00000117020,HGNC:393,RefSeq ID:NC_000001.11 | RefSeq ID:NT_187519.1...,OMIM ID:611223
...,...,...,...,...,...,...,...,...
230353,BMG_GN230354,TRE-TTC8-2,tRNA-Glu (TTC) 8-2,,,HGNC:54204,,
230354,BMG_GN230355,TRN-GTT15-2,tRNA-Asn (GTT) 15-2,,,HGNC:54203,,
230355,BMG_GN230356,TRN-GTT19-2,tRNA-Asn (GTT) 19-2,,,HGNC:54205,,
230356,BMG_GN230357,TRQ-CTG8-3,tRNA-Gln (CTG) 8-3,,,HGNC:54206,,


In [10]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_LLM_Name_ID.csv'
llm_name_id.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene_LLM_Name_ID.csv


LLM Name and ID Combined

In [11]:
llm_combined = llm_name_id.copy()

def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

llm_combined = merge_string_columns(llm_combined, ['HGNC_Symbol', 'Gene_Name', 'NCBI_Gene_ID', 
                'Ensembl_Gene_ID', 'HGNC_ID', 'RefSeq_ID', 'OMIM_ID'], 'Names_and_IDs')
llm_combined

Unnamed: 0,BioMedGraphica_ID,Names_and_IDs
0,BMG_GN000001,RefSeq ID:NC_000019.10 | HGNC:5 | A1BG | RefSe...
1,BMG_GN000002,RefSeq ID:NG_012246.1 | NCBI Gene ID:10 | NAT2...
2,BMG_GN000003,ENSG00000196839 | RefSeq ID:NC_060944.1 | HGNC...
3,BMG_GN000004,RefSeq ID:NG_011959.2 | NCBI Gene ID:1000 | ca...
4,BMG_GN000005,AKT3 | RefSeq ID:NC_000001.11 | RefSeq ID:NT_1...
...,...,...
230353,BMG_GN230354,HGNC:54204 | TRE-TTC8-2 | tRNA-Glu (TTC) 8-2
230354,BMG_GN230355,HGNC:54203 | TRN-GTT15-2 | tRNA-Asn (GTT) 15-2
230355,BMG_GN230356,HGNC:54205 | TRN-GTT19-2 | tRNA-Asn (GTT) 19-2
230356,BMG_GN230357,HGNC:54206 | TRQ-CTG8-3 | tRNA-Gln (CTG) 8-3


In [12]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_LLM_Name_ID_Combined.csv'
llm_combined.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene_LLM_Name_ID_Combined.csv


Display Name

In [13]:
display_name = biomedgraphica_gene.copy()

display_name['BMG_Gene_Name'] = display_name['HGNC_Symbol'].fillna(display_name['Gene_Name'])
display_name = display_name[['BioMedGraphica_ID', 'BMG_Gene_Name']]
display_name

Unnamed: 0,BioMedGraphica_ID,BMG_Gene_Name
0,BMG_GN000001,A1BG
1,BMG_GN000002,NAT2
2,BMG_GN000003,ADA
3,BMG_GN000004,CDH2
4,BMG_GN000005,AKT3
...,...,...
230353,BMG_GN230354,TRE-TTC8-2
230354,BMG_GN230355,TRN-GTT15-2
230355,BMG_GN230356,TRN-GTT19-2
230356,BMG_GN230357,TRQ-CTG8-3


In [14]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_Display_Name.csv'
display_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene_Display_Name.csv


## 6. Description

In [15]:
import pandas as pd
import os
from pathlib import Path

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene' / 'BioMedGraphica_Gene.csv'
biomedgraphica_gene = pd.read_csv(target_dir, dtype=str)

### 6.1 From NCBI Gene

In [16]:
# Using free Perl script geneDocSum.pl provided by NCBI to fetch the description data. 

gene_description = pd.read_csv('ncbi_human_gene_description.txt', delimiter='\t', encoding='UTF-16')
gene_description = gene_description[['geneId', 'Summary']]
gene_description.dropna(subset=['Summary'], inplace=True)
gene_description.dropna(subset=['geneId'], inplace=True)
gene_description.replace('&gt;','', inplace=True)
gene_description['geneId'] = gene_description['geneId'].astype(str)

gene_description

Unnamed: 0,geneId,Summary
0,7157,This gene encodes a tumor suppressor protein c...
1,1956,The protein encoded by this gene is a transmem...
2,348,The protein encoded by this gene is a major ap...
3,7124,This gene encodes a multifunctional proinflamm...
4,3569,This gene encodes a cytokine that functions in...
...,...,...
289192,138,DISCONTINUED: This record has been withdrawn b...
289193,129,DISCONTINUED: This record has been withdrawn b...
289194,85,DISCONTINUED: This record was withdrawn by the...
289195,84,DISCONTINUED: This record was withdrawn by the...


In [17]:
BMG_gene = biomedgraphica_gene[['BioMedGraphica_ID', 'NCBI_Gene_ID']]
BMG_gene['NCBI_Gene_ID'] = BMG_gene['NCBI_Gene_ID'].astype(str).str.replace(r'\.0$', '', regex=True).replace('nan', pd.NA)

BMG_gene_description_ncbi = pd.merge(BMG_gene, gene_description, left_on='NCBI_Gene_ID', right_on='geneId', how='left')
BMG_gene_description_ncbi.drop(columns=['geneId','NCBI_Gene_ID'], inplace=True)
BMG_gene_description_ncbi.rename(columns={'Summary': 'NCBI Gene'}, inplace=True)
BMG_gene_description_ncbi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BMG_gene['NCBI_Gene_ID'] = BMG_gene['NCBI_Gene_ID'].astype(str).str.replace(r'\.0$', '', regex=True).replace('nan', pd.NA)


Unnamed: 0,BioMedGraphica_ID,NCBI Gene
0,BMG_GN000001,The protein encoded by this gene is a plasma g...
1,BMG_GN000002,This gene encodes an enzyme that functions to ...
2,BMG_GN000003,This gene encodes an enzyme that catalyzes the...
3,BMG_GN000004,This gene encodes a classical cadherin and mem...
4,BMG_GN000005,The protein encoded by this gene is a member o...
...,...,...
230353,BMG_GN230354,
230354,BMG_GN230355,
230355,BMG_GN230356,
230356,BMG_GN230357,


### 6.2 From Ensembl

In [18]:
import pandas as pd
from pybiomart import Server

# List all available attributes
def list_attributes():
    server = Server(host='http://www.ensembl.org')
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    attributes = dataset.list_attributes()
    return attributes

attributes = list_attributes()

def fetch_ensembl_data(attributes):
    server = Server(host='http://www.ensembl.org')
    #https://www.ensembl.org/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL
    #this link shows that hsapiens_gene_ensembl is the GRCh38.p14
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    
    response = dataset.query(attributes)
    
    return response

attributes=['ensembl_gene_id', 'description']
data_ensembl = fetch_ensembl_data(attributes)
data_ensembl

Unnamed: 0,Gene stable ID,Gene description
0,ENSG00000210049,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...
1,ENSG00000211459,mitochondrially encoded 12S rRNA [Source:HGNC ...
2,ENSG00000210077,mitochondrially encoded tRNA-Val (GUN) [Source...
3,ENSG00000210082,mitochondrially encoded 16S rRNA [Source:HGNC ...
4,ENSG00000209082,mitochondrially encoded tRNA-Leu (UUA/G) 1 [So...
...,...,...
86397,ENSG00000235358,SCMH1 divergent transcript [Source:HGNC Symbol...
86398,ENSG00000228067,long intergenic non-protein coding RNA 1740 [S...
86399,ENSG00000293271,SLC44A3 antisense RNA 1 [Source:HGNC Symbol;Ac...
86400,ENSG00000310526,"WASP family homolog 7, pseudogene [Source:HGNC..."


In [19]:
BMG_gene = biomedgraphica_gene[['BioMedGraphica_ID', 'Ensembl_Gene_ID']]

BMG_gene_description_ensembl = pd.merge(BMG_gene, data_ensembl, left_on='Ensembl_Gene_ID', right_on='Gene stable ID', how='left')
BMG_gene_description_ensembl.drop(columns=['Ensembl_Gene_ID', 'Gene stable ID'], inplace=True)
BMG_gene_description_ensembl.rename(columns={'Gene description': 'Ensembl'}, inplace=True)
BMG_gene_description_ensembl

Unnamed: 0,BioMedGraphica_ID,Ensembl
0,BMG_GN000001,alpha-1-B glycoprotein [Source:HGNC Symbol;Acc...
1,BMG_GN000002,N-acetyltransferase 2 [Source:HGNC Symbol;Acc:...
2,BMG_GN000003,adenosine deaminase [Source:HGNC Symbol;Acc:HG...
3,BMG_GN000004,cadherin 2 [Source:HGNC Symbol;Acc:HGNC:1759]
4,BMG_GN000005,AKT serine/threonine kinase 3 [Source:HGNC Sym...
...,...,...
230353,BMG_GN230354,
230354,BMG_GN230355,
230355,BMG_GN230356,
230356,BMG_GN230357,


In [20]:
bmg_description = pd.merge(BMG_gene_description_ncbi, BMG_gene_description_ensembl, on='BioMedGraphica_ID', how='outer')
bmg_description

Unnamed: 0,BioMedGraphica_ID,NCBI Gene,Ensembl
0,BMG_GN000001,The protein encoded by this gene is a plasma g...,alpha-1-B glycoprotein [Source:HGNC Symbol;Acc...
1,BMG_GN000002,This gene encodes an enzyme that functions to ...,N-acetyltransferase 2 [Source:HGNC Symbol;Acc:...
2,BMG_GN000003,This gene encodes an enzyme that catalyzes the...,adenosine deaminase [Source:HGNC Symbol;Acc:HG...
3,BMG_GN000004,This gene encodes a classical cadherin and mem...,cadherin 2 [Source:HGNC Symbol;Acc:HGNC:1759]
4,BMG_GN000005,The protein encoded by this gene is a member o...,AKT serine/threonine kinase 3 [Source:HGNC Sym...
...,...,...,...
230353,BMG_GN230354,,
230354,BMG_GN230355,,
230355,BMG_GN230356,,
230356,BMG_GN230357,,


In [21]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_Description.csv'
bmg_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene_Description.csv


### 6.3 Combined Description

In [22]:
comb_description = bmg_description.copy()

# add the column name at the beginning of the string
# first, we need to get the column names
column_names = comb_description.columns.tolist()
column_names = [col for col in column_names if col != 'BioMedGraphica_ID']
# then we can apply the function to each column
for col in column_names:
    comb_description[col] = comb_description[col].apply(lambda x: ' | '.join([f"{col}: {i}" for i in x.split(' | ')]) if pd.notna(x) else x)

# now we can merge the columns into one
comb_description['Description'] = comb_description[column_names].apply(lambda x: ' | '.join(x.dropna()), axis=1)
comb_description = comb_description[['BioMedGraphica_ID', 'Description']]
comb_description

Unnamed: 0,BioMedGraphica_ID,Description
0,BMG_GN000001,NCBI Gene: The protein encoded by this gene is...
1,BMG_GN000002,NCBI Gene: This gene encodes an enzyme that fu...
2,BMG_GN000003,NCBI Gene: This gene encodes an enzyme that ca...
3,BMG_GN000004,NCBI Gene: This gene encodes a classical cadhe...
4,BMG_GN000005,NCBI Gene: The protein encoded by this gene is...
...,...,...
230353,BMG_GN230354,
230354,BMG_GN230355,
230355,BMG_GN230356,
230356,BMG_GN230357,


In [23]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Gene'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Gene_Description_Combined.csv'
comb_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Gene\BioMedGraphica_Gene_Description_Combined.csv
