# BioMedGraphica Transcript

## 1. Data Access

### Direct Download Links  
**RefSeq**: Can be downloaded directly via the link without the need for registration. [Link](https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/MANE.GRCh38.v1.3.summary.txt.gz)  
**RNAcentral**: Can be downloaded directly via the link without the need for registration. [Link](https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/ensembl.tsv)  

### Ensembl API

In [None]:
import pandas as pd
from pybiomart import Server

def list_attributes():
    server = Server(host='http://www.ensembl.org')
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    attributes = dataset.list_attributes()
    return attributes

attributes = list_attributes()

def fetch_ensembl_data(attributes):
    server = Server(host='http://www.ensembl.org')
    #https://www.ensembl.org/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL
    #this link shows that hsapiens_gene_ensembl is the GRCh38.p14
    dataset = server.marts['ENSEMBL_MART_ENSEMBL'].datasets['hsapiens_gene_ensembl']
    
    response = dataset.query(attributes)
    
    return response

# Load Ensembl data
attributes=['ensembl_transcript_id' ,'ensembl_transcript_id_version', 'ensembl_gene_id', 'external_gene_name', 'external_transcript_name', 'transcript_biotype',
            'transcript_start','transcript_end','transcription_start_site','transcript_length','chromosome_name',
            'refseq_mrna', 'refseq_ncrna', 'transcript_mane_select']
data_ensembl = fetch_ensembl_data(attributes)
data_ensembl.to_csv('ensembl_transcript.csv', index=False)


In [4]:
# Load Ensembl Description
attributes=['ensembl_transcript_id' ,'ensembl_transcript_id_version', 'description']
description = fetch_ensembl_data(attributes)
description.to_csv('ensembl_transcript_description.csv', index=False)

## 2. Load Data

### 2.1 Ensembl data

In [1]:
import pandas as pd

df_ensembl = pd.read_csv('ensembl_transcript.csv',dtype=str)
df_ensembl

Unnamed: 0,Transcript stable ID,Transcript stable ID version,Gene stable ID,Gene name,Transcript name,Transcript type,Transcript start (bp),Transcript end (bp),Transcription start site (TSS),Transcript length (including UTRs and CDS),Chromosome/scaffold name,RefSeq mRNA ID,RefSeq ncRNA ID,RefSeq match transcript (MANE Select)
0,ENST00000387314,ENST00000387314.1,ENSG00000210049,MT-TF,MT-TF-201,Mt_tRNA,577,647,577,71,MT,,,
1,ENST00000389680,ENST00000389680.2,ENSG00000211459,MT-RNR1,MT-RNR1-201,Mt_rRNA,648,1601,648,954,MT,,,
2,ENST00000387342,ENST00000387342.1,ENSG00000210077,MT-TV,MT-TV-201,Mt_tRNA,1602,1670,1602,69,MT,,,
3,ENST00000387347,ENST00000387347.2,ENSG00000210082,MT-RNR2,MT-RNR2-201,Mt_rRNA,1671,3229,1671,1559,MT,,,
4,ENST00000386347,ENST00000386347.1,ENSG00000209082,MT-TL1,MT-TL1-201,Mt_tRNA,3230,3304,3230,75,MT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451954,ENST00000831127,ENST00000831127.1,ENSG00000241860,,,lncRNA,177311,181130,181130,521,1,,,
451955,ENST00000466430,ENST00000466430.5,ENSG00000241860,,,lncRNA,89295,120932,120932,2748,1,,,
451956,ENST00000477740,ENST00000477740.5,ENSG00000241860,,,lncRNA,92230,129217,129217,491,1,,,
451957,ENST00000471248,ENST00000471248.1,ENSG00000241860,,,lncRNA,110953,129173,129173,629,1,,,


### 2.2 RefSeq data

In [5]:
# https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/MANE.GRCh38.v1.4.summary.txt.gz 
df_MANE = pd.read_csv('MANE.GRCh38.v1.4.summary.txt', delimiter='\t')
df_refseq = df_MANE[['Ensembl_nuc', 'RefSeq_nuc']]
df_refseq

Unnamed: 0,Ensembl_nuc,RefSeq_nuc
0,ENST00000263100.8,NM_130786.4
1,ENST00000318602.12,NM_000014.6
2,ENST00000307719.9,NM_000662.8
3,ENST00000286479.4,NM_000015.3
4,ENST00000393078.5,NM_001085.5
...,...,...
19399,ENST00000713549.1,NM_001394149.2
19400,ENST00000713560.1,NM_001007271.3
19401,ENST00000646820.1,NR_185500.1
19402,ENST00000381497.7,NM_001423836.2


### 2.3 RNAcentral data

In [7]:
# https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/ensembl.tsv 
df_rnacentral = pd.read_csv('ensembl.tsv', sep='\t', header=None)
df_rnacentral.columns = ['RNAcentral', 'Ensembl', 'Transcript stable ID', 'type', 'RNA type','ENSG']
df_rnacentral_human = df_rnacentral[df_rnacentral['type'] == 9606]
df_rnacentral_human.reset_index(drop = True, inplace=True)

df_rnacentral_human_filter = df_rnacentral_human.copy()
df_rnacentral_human_filter.drop(columns=['type', 'Ensembl', 'RNA type', 'ENSG'], inplace=True)
df_rnacentral_human_filter

Unnamed: 0,RNAcentral,Transcript stable ID
0,URS0000000055,ENST00000585414
1,URS00000000C9,ENST00000514011
2,URS00000000FD,ENST00000448543
3,URS0000000344,ENST00000633884
4,URS0000000351,ENST00000452009
...,...,...
66784,URS0002869A10,ENST00000631613
66785,URS0002869A38,ENST00000692138
66786,URS0002869A99,ENST00000469426
66787,URS0002869AE1,ENST00000519537


## 3. Merge Data

In [8]:
# check duplicates inside the dataframe
def merge_column(df, column1, column2, new_column):
    df[column1] = df[column1].fillna('')
    df[column2] = df[column2].fillna('')
    df[new_column] = df.apply(lambda row: f"{row[column1]} {row[column2]}".strip(), axis=1)

    expanded_rows = df[new_column].str.split(expand=True).stack().reset_index(level=1, drop=True)
    expanded_rows.name = new_column

    df = df.drop(columns=[new_column]).join(expanded_rows)
    df.drop(columns=[column1, column2], inplace=True)
    df.drop_duplicates(inplace=True)
    
    return df

def merge_string_columns(df, columns, merge_name, separator=';'):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

### 3.1 Ensembl + RefSeq

In [9]:
df_refseq.rename(columns={'Ensembl_nuc': 'Transcript stable ID version'}, inplace=True)
df_ensembl_refseq = pd.merge(df_ensembl, df_refseq, how='outer')

df_ensembl_refseq_v1 = merge_string_columns(df_ensembl_refseq, ['RefSeq mRNA ID', 'RefSeq ncRNA ID', 'RefSeq match transcript (MANE Select)', 'RefSeq_nuc'], 'Refseq')
df_ensembl_refseq_v1['Refseq'].replace('', pd.NA, inplace=True)
df_ensembl_refseq_v1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_refseq.rename(columns={'Ensembl_nuc': 'Transcript stable ID version'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_ensembl_refseq_v1['Refseq'].replace('', pd.NA, inplace=True)


Unnamed: 0,Transcript stable ID,Transcript stable ID version,Gene stable ID,Gene name,Transcript name,Transcript type,Transcript start (bp),Transcript end (bp),Transcription start site (TSS),Transcript length (including UTRs and CDS),Chromosome/scaffold name,Refseq
0,ENST00000000233,ENST00000000233.10,ENSG00000004059,ARF5,ARF5-201,protein_coding,127588411,127591700,127588411,1032,7,NM_001662;NM_001662.4
1,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_002355;NM_002355.4
2,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_002355.4;NM_001414320
3,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_001414332;NM_002355.4
4,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_001414333;NM_002355.4
...,...,...,...,...,...,...,...,...,...,...,...,...
451978,ENST00000850841,ENST00000850841.1,ENSG00000292360,LINC03112,LINC03112-261,lncRNA,2566027,2609185,2609185,1434,Y,
451979,ENST00000850842,ENST00000850842.1,ENSG00000292360,LINC03112,LINC03112-262,lncRNA,2566024,2609205,2609205,1525,Y,
451980,ENST00000850843,ENST00000850843.1,ENSG00000292362,CD99P1,CD99P1-312,lncRNA,2609381,2658943,2609381,1082,Y,
451981,,ENST00000850890.1,,,,,,,,,,NR_186063.1


### 3.2 Add RNAcentral

In [10]:
df_ensembl_refseq_rnacentral = df_ensembl_refseq_v1.copy()
df_ensembl_refseq_rnacentral = pd.merge(df_ensembl_refseq_rnacentral, df_rnacentral_human_filter, how='outer')

# Completion of the missing transcript type
transcript_type_map = df_rnacentral.set_index('Transcript stable ID')['RNA type'].to_dict()
df_ensembl_refseq_rnacentral['Transcript type'] = df_ensembl_refseq_rnacentral['Transcript type'].fillna(df_ensembl_refseq_rnacentral['Transcript stable ID'].map(transcript_type_map))
df_ensembl_refseq_rnacentral

Unnamed: 0,Transcript stable ID,Transcript stable ID version,Gene stable ID,Gene name,Transcript name,Transcript type,Transcript start (bp),Transcript end (bp),Transcription start site (TSS),Transcript length (including UTRs and CDS),Chromosome/scaffold name,Refseq,RNAcentral
0,ENST00000000233,ENST00000000233.10,ENSG00000004059,ARF5,ARF5-201,protein_coding,127588411,127591700,127588411,1032,7,NM_001662;NM_001662.4,
1,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_002355;NM_002355.4,
2,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_002355.4;NM_001414320,
3,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_001414332;NM_002355.4,
4,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_001414333;NM_002355.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
452246,,ENST00000667305.2,,,,,,,,,,NR_190187.1,
452247,,ENST00000669203.2,,,,,,,,,,NR_186187.1,
452248,,ENST00000718005.2,,,,,,,,,,NR_197229.1,
452249,,ENST00000850890.1,,,,,,,,,,NR_186063.1,


### 3.3 Combine Ensembl ID Column

In [11]:
df_ensembl_unique = df_ensembl_refseq_rnacentral.copy()
df_ensembl_unique['Transcript stable ID version copy'] = df_ensembl_unique['Transcript stable ID version']
df_ensembl_unique['Transcript stable ID version copy'] = df_ensembl_unique['Transcript stable ID version copy'].fillna(df_ensembl_unique['Transcript stable ID'])

grouped_data = df_ensembl_unique.groupby('Transcript stable ID version copy').agg(lambda x: ';'.join(sorted(set(x.dropna())))).reset_index()
grouped_data.drop(columns=['Transcript stable ID version copy'], inplace=True)
grouped_data.replace('', pd.NA, inplace=True)
grouped_data

Unnamed: 0,Transcript stable ID,Transcript stable ID version,Gene stable ID,Gene name,Transcript name,Transcript type,Transcript start (bp),Transcript end (bp),Transcription start site (TSS),Transcript length (including UTRs and CDS),Chromosome/scaffold name,Refseq,RNAcentral
0,ENST00000000233,ENST00000000233.10,ENSG00000004059,ARF5,ARF5-201,protein_coding,127588411,127591700,127588411,1032,7,NM_001662;NM_001662.4,
1,ENST00000000412,ENST00000000412.8,ENSG00000003056,M6PR,M6PR-201,protein_coding,8940361,8949645,8949645,2450,12,NM_001414331;NM_002355.4;NM_001414332;NM_00235...,
2,ENST00000000442,ENST00000000442.11,ENSG00000173153,ESRRA,ESRRA-201,protein_coding,64305524,64316743,64305524,2274,11,NM_004451;NM_004451.5,
3,ENST00000001008,ENST00000001008.6,ENSG00000004478,FKBP4,FKBP4-201,protein_coding,2794970,2805423,2794970,3715,12,NM_002014;NM_002014.4,
4,ENST00000001146,ENST00000001146.7,ENSG00000003137,CYP26B1,CYP26B1-201,protein_coding,72129238,72147862,72147862,4556,2,NM_019885.4;NM_019885,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
412321,ENST00000850841,ENST00000850841.1,ENSG00000292360,LINC03112,LINC03112-261,lncRNA,2566027,2609185,2609185,1434,Y,,
412322,ENST00000850842,ENST00000850842.1,ENSG00000292360,LINC03112,LINC03112-262,lncRNA,2566024,2609205,2609205,1525,Y,,
412323,ENST00000850843,ENST00000850843.1,ENSG00000292362,CD99P1,CD99P1-312,lncRNA,2609381,2658943,2609381,1082,Y,,
412324,,ENST00000850890.1,,,,,,,,,,NR_186063.1,


## 4. BioMedGraphica ID

In [12]:
biomedgraphica_transcript = grouped_data.copy()
max_length = len(str(len(biomedgraphica_transcript)))
biomedgraphica_transcript['BioMedGraphica_ID'] = ['BMG_TS' + str(i).zfill(max_length) for i in range(1, len(biomedgraphica_transcript) + 1)]
biomedgraphica_transcript = biomedgraphica_transcript.rename(columns={'Transcript stable ID': 'Ensembl_Transcript_ID', 'Transcript stable ID version': 'Ensembl_Transcript_ID_Version',
                                                                'Gene stable ID': 'Ensembl_Gene_ID', 'Gene name': 'HGNC_Symbol', 'Transcript type': 'Transcript_Type', 'Transcript name': 'Transcript_Name',
                                                                'Refseq': 'RefSeq_ID', 'RNAcentral': 'RNACentral_ID',
                                                                'Transcript start (bp)': 'Transcript_Start', 'Transcript end (bp)': 'Transcript_End', 'Transcription start site (TSS)': 'Transcription_Start_Site',
                                                                'Transcript length (including UTRs and CDS)': 'Transcript_Length', 'Chromosome/scaffold name': 'Chromosome'})
columns = ['BioMedGraphica_ID'] + [col for col in biomedgraphica_transcript.columns if col != 'BioMedGraphica_ID']  # re-order columns
biomedgraphica_transcript = biomedgraphica_transcript[columns]
biomedgraphica_transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412326 entries, 0 to 412325
Data columns (total 14 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   BioMedGraphica_ID              412326 non-null  object
 1   Ensembl_Transcript_ID          412302 non-null  object
 2   Ensembl_Transcript_ID_Version  412058 non-null  object
 3   Ensembl_Gene_ID                412034 non-null  object
 4   HGNC_Symbol                    297276 non-null  object
 5   Transcript_Name                297276 non-null  object
 6   Transcript_Type                412302 non-null  object
 7   Transcript_Start               412034 non-null  object
 8   Transcript_End                 412034 non-null  object
 9   Transcription_Start_Site       412034 non-null  object
 10  Transcript_Length              412034 non-null  object
 11  Chromosome                     412034 non-null  object
 12  RefSeq_ID                      63543 non-nul

In [13]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript.csv'
biomedgraphica_transcript.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Folder D:\RA\BMG\BioMedGraphica\Entity\Transcript has been created.
Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript.csv


## 5. File Generation

In [1]:
import pandas as pd
from pathlib import Path
import os

current_working_dir = Path(os.getcwd()).resolve()
grandparent_dir = current_working_dir.parent.parent.parent
target_dir = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript' / 'BioMedGraphica_Transcript.csv'
biomedgraphica_transcript = pd.read_csv(target_dir, dtype=str)

### 5.1 BioChem

In [2]:
import pandas as pd

combine_seq = pd.read_csv('combined_sequences.csv', dtype=str)
combine_seq_trans = combine_seq[['gene_id','transcript_sequence']]

BMG_gene_ensembl = biomedgraphica_transcript[['BioMedGraphica_ID', 'Ensembl_Gene_ID']]

BMG_transcript_seq = pd.merge(BMG_gene_ensembl, combine_seq_trans, left_on='Ensembl_Gene_ID', right_on='gene_id', how='left')
BMG_transcript_seq = BMG_transcript_seq[['BioMedGraphica_ID', 'transcript_sequence']]
BMG_transcript_seq

Unnamed: 0,BioMedGraphica_ID,transcript_sequence
0,BMG_TS000001,CUGCUGCUGCUGCGCCCCAUCCCCCCGCGGCCGGCCAGUUCCAGCC...
1,BMG_TS000002,AGAGUGGGGCACAGCGAGGCGCUAGGGGGAACGCUGGCCUCUGAAA...
2,BMG_TS000003,GUCAGCUGGAGGAAGCGGAGUAGGAAGCGGCCGCGAUGUCCUUUUG...
3,BMG_TS000004,CCUACCCCAGCUCUCGCGCCGCGUGCAGAGGUGCUCAAGCCUCCUC...
4,BMG_TS000005,ACAGCCAAUCCCCCGAGCGGCCGCCAACAUGCUCUUUGAGGGCUUG...
...,...,...
412321,BMG_TS412322,
412322,BMG_TS412323,
412323,BMG_TS412324,
412324,BMG_TS412325,


In [3]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript_BioChem.csv'
BMG_transcript_seq.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript_BioChem.csv


### 5.2 Position

In [4]:
BMG_trans_position = biomedgraphica_transcript[['BioMedGraphica_ID', 'Transcript_Name','HGNC_Symbol',
                                                'Transcript_Type', 'Chromosome', 'Transcription_Start_Site', 
                                                'Transcript_Start', 'Transcript_End', 'Transcript_Length']]

In [5]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript_Position.csv'
BMG_trans_position.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript_Position.csv


### 5.3 Name and ID

GUI Name

In [6]:
def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

gui_name = biomedgraphica_transcript.copy()
gui_name = merge_string_columns(gui_name, ['Transcript_Name', 'HGNC_Symbol'], 'Transcript_Name_List')
gui_name = gui_name[['BioMedGraphica_ID','Transcript_Name_List']]
gui_name

Unnamed: 0,BioMedGraphica_ID,Transcript_Name_List
0,BMG_TS000001,ARF5-201 | ARF5
1,BMG_TS000002,M6PR-201 | M6PR
2,BMG_TS000003,ESRRA | ESRRA-201
3,BMG_TS000004,FKBP4 | FKBP4-201
4,BMG_TS000005,CYP26B1-201 | CYP26B1
...,...,...
412321,BMG_TS412322,LINC03112 | LINC03112-261
412322,BMG_TS412323,LINC03112 | LINC03112-262
412323,BMG_TS412324,CD99P1 | CD99P1-312
412324,BMG_TS412325,


In [7]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript_GUI_Name.csv'
gui_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript_GUI_Name.csv


LLM Name and ID

In [8]:
llm_name_id = biomedgraphica_transcript.copy()
llm_name_id.drop(columns=['Transcript_Type', 'Transcript_Start', 'Transcript_End', 'Transcript_Length', 'Chromosome', 'Transcription_Start_Site'], inplace=True)

llm_name_id['RefSeq_ID'] = llm_name_id['RefSeq_ID'].apply(
    lambda x: ' | '.join(f"RefSeq ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)
llm_name_id['RNACentral_ID'] = llm_name_id['RNACentral_ID'].apply(
    lambda x: ' | '.join(f"RNAcentral ID:{id}" for id in x.split(';')) if pd.notna(x) and x != '' else x
)

column_order = ['BioMedGraphica_ID', 'Transcript_Name', 'HGNC_Symbol', 'Ensembl_Transcript_ID', 'Ensembl_Transcript_ID_Version', 'Ensembl_Gene_ID', 'RefSeq_ID', 'RNACentral_ID']
llm_name_id = llm_name_id[column_order]
llm_name_id

Unnamed: 0,BioMedGraphica_ID,Transcript_Name,HGNC_Symbol,Ensembl_Transcript_ID,Ensembl_Transcript_ID_Version,Ensembl_Gene_ID,RefSeq_ID,RNACentral_ID
0,BMG_TS000001,ARF5-201,ARF5,ENST00000000233,ENST00000000233.10,ENSG00000004059,RefSeq ID:NM_001662 | RefSeq ID:NM_001662.4,
1,BMG_TS000002,M6PR-201,M6PR,ENST00000000412,ENST00000000412.8,ENSG00000003056,RefSeq ID:NM_001414331 | RefSeq ID:NM_002355.4...,
2,BMG_TS000003,ESRRA-201,ESRRA,ENST00000000442,ENST00000000442.11,ENSG00000173153,RefSeq ID:NM_004451 | RefSeq ID:NM_004451.5,
3,BMG_TS000004,FKBP4-201,FKBP4,ENST00000001008,ENST00000001008.6,ENSG00000004478,RefSeq ID:NM_002014 | RefSeq ID:NM_002014.4,
4,BMG_TS000005,CYP26B1-201,CYP26B1,ENST00000001146,ENST00000001146.7,ENSG00000003137,RefSeq ID:NM_019885.4 | RefSeq ID:NM_019885,
...,...,...,...,...,...,...,...,...
412321,BMG_TS412322,LINC03112-261,LINC03112,ENST00000850841,ENST00000850841.1,ENSG00000292360,,
412322,BMG_TS412323,LINC03112-262,LINC03112,ENST00000850842,ENST00000850842.1,ENSG00000292360,,
412323,BMG_TS412324,CD99P1-312,CD99P1,ENST00000850843,ENST00000850843.1,ENSG00000292362,,
412324,BMG_TS412325,,,,ENST00000850890.1,,RefSeq ID:NR_186063.1,


In [9]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript_LLM_Name_ID.csv'
llm_name_id.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript_LLM_Name_ID.csv


LLM Name and ID Combined

In [10]:
llm_combined = llm_name_id.copy()

def merge_string_columns(df, columns, merge_name, separator=' | '):
    def merge_strings(row):
        combined = set()
        for column in columns:
            if pd.notnull(row[column]):
                combined.update(row[column].split(separator))
        return separator.join(combined)
    
    # Apply the function to each row and create a new column
    combined_column_name = merge_name
    df[combined_column_name] = df.apply(merge_strings, axis=1)
    df.drop(columns=columns, inplace=True)
    
    return df

llm_combined = merge_string_columns(llm_combined, ['Ensembl_Transcript_ID', 'Ensembl_Transcript_ID_Version', 'Ensembl_Gene_ID', 
                'HGNC_Symbol', 'Transcript_Name', 'RefSeq_ID', 'RNACentral_ID'], 'Names_and_IDs')
llm_combined

Unnamed: 0,BioMedGraphica_ID,Names_and_IDs
0,BMG_TS000001,ARF5 | RefSeq ID:NM_001662 | RefSeq ID:NM_0016...
1,BMG_TS000002,M6PR-201 | RefSeq ID:NM_001414333 | RefSeq ID:...
2,BMG_TS000003,RefSeq ID:NM_004451 | ENST00000000442 | ESRRA ...
3,BMG_TS000004,FKBP4-201 | FKBP4 | ENST00000001008 | RefSeq I...
4,BMG_TS000005,ENST00000001146.7 | CYP26B1-201 | RefSeq ID:NM...
...,...,...
412321,BMG_TS412322,ENSG00000292360 | LINC03112-261 | ENST00000850...
412322,BMG_TS412323,ENST00000850842.1 | ENST00000850842 | ENSG0000...
412323,BMG_TS412324,CD99P1 | ENST00000850843.1 | ENST00000850843 |...
412324,BMG_TS412325,RefSeq ID:NR_186063.1 | ENST00000850890.1


In [11]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript_LLM_Name_ID_Combined.csv'
llm_combined.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript_LLM_Name_ID_Combined.csv


Display Name

In [12]:
display_name = biomedgraphica_transcript.copy()

display_name['BMG_Transcript_Name'] = display_name['Transcript_Name'].fillna(display_name['HGNC_Symbol'])
display_name = display_name[['BioMedGraphica_ID', 'BMG_Transcript_Name']]
display_name

Unnamed: 0,BioMedGraphica_ID,BMG_Transcript_Name
0,BMG_TS000001,ARF5-201
1,BMG_TS000002,M6PR-201
2,BMG_TS000003,ESRRA-201
3,BMG_TS000004,FKBP4-201
4,BMG_TS000005,CYP26B1-201
...,...,...
412321,BMG_TS412322,LINC03112-261
412322,BMG_TS412323,LINC03112-262
412323,BMG_TS412324,CD99P1-312
412324,BMG_TS412325,


In [13]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript_Display_Name.csv'
display_name.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript_Display_Name.csv


## 6. Description

### 6.1 From Ensembl

In [14]:
transcript_description = pd.read_csv('ensembl_transcript_description.csv')
transcript_description = transcript_description[['Transcript stable ID version', 'Gene description']]
transcript_description.dropna(subset=['Gene description'], inplace=True)
transcript_description.dropna(subset=['Transcript stable ID version'], inplace=True)
transcript_description

Unnamed: 0,Transcript stable ID version,Gene description
0,ENST00000387314.1,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...
1,ENST00000389680.2,mitochondrially encoded 12S rRNA [Source:HGNC ...
2,ENST00000387342.1,mitochondrially encoded tRNA-Val (GUN) [Source...
3,ENST00000387347.2,mitochondrially encoded 16S rRNA [Source:HGNC ...
4,ENST00000386347.1,mitochondrially encoded tRNA-Leu (UUA/G) 1 [So...
...,...,...
412029,ENST00000831127.1,novel transcript
412030,ENST00000466430.5,novel transcript
412031,ENST00000477740.5,novel transcript
412032,ENST00000471248.1,novel transcript


In [15]:
BMG_transcript = biomedgraphica_transcript[['BioMedGraphica_ID', 'Ensembl_Transcript_ID_Version']]
BMG_transcript.dropna(subset=['Ensembl_Transcript_ID_Version'], inplace=True)

ensembl_individual = BMG_transcript.assign(Ensembl_Transcript_ID=BMG_transcript['Ensembl_Transcript_ID_Version'].str.split(';')).explode('Ensembl_Transcript_ID_Version')
transcript_to_individual = ensembl_individual.groupby('Ensembl_Transcript_ID_Version')['BioMedGraphica_ID'].apply(lambda x: ';'.join(x.dropna().unique())).to_dict()

transcript_description['BioMedGraphica_ID'] = transcript_description['Transcript stable ID version'].map(transcript_to_individual)
transcript_description

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BMG_transcript.dropna(subset=['Ensembl_Transcript_ID_Version'], inplace=True)


Unnamed: 0,Transcript stable ID version,Gene description,BioMedGraphica_ID
0,ENST00000387314.1,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...,BMG_TS029225
1,ENST00000389680.2,mitochondrially encoded 12S rRNA [Source:HGNC ...,BMG_TS029457
2,ENST00000387342.1,mitochondrially encoded tRNA-Val (GUN) [Source...,BMG_TS029226
3,ENST00000387347.2,mitochondrially encoded 16S rRNA [Source:HGNC ...,BMG_TS029227
4,ENST00000386347.1,mitochondrially encoded tRNA-Leu (UUA/G) 1 [So...,BMG_TS029216
...,...,...,...
412029,ENST00000831127.1,novel transcript,BMG_TS392883
412030,ENST00000466430.5,novel transcript,BMG_TS077697
412031,ENST00000477740.5,novel transcript,BMG_TS087551
412032,ENST00000471248.1,novel transcript,BMG_TS081907


In [16]:
BMG_transcript_description = biomedgraphica_transcript[['BioMedGraphica_ID']]
BMG_transcript_description = pd.merge(BMG_transcript_description, transcript_description, how='left')
BMG_transcript_description.drop(columns=['Transcript stable ID version'], inplace=True)
BMG_transcript_description.rename(columns={'Gene description': 'Ensembl'}, inplace=True)
BMG_transcript_description

Unnamed: 0,BioMedGraphica_ID,Ensembl
0,BMG_TS000001,ADP ribosylation factor 5 [Source:HGNC Symbol;...
1,BMG_TS000002,"mannose-6-phosphate receptor, cation dependent..."
2,BMG_TS000003,estrogen related receptor alpha [Source:HGNC S...
3,BMG_TS000004,FKBP prolyl isomerase 4 [Source:HGNC Symbol;Ac...
4,BMG_TS000005,cytochrome P450 family 26 subfamily B member 1...
...,...,...
412321,BMG_TS412322,long intergenic non-protein coding RNA 3112 [S...
412322,BMG_TS412323,long intergenic non-protein coding RNA 3112 [S...
412323,BMG_TS412324,CD99 molecule pseudogene 1 [Source:HGNC Symbol...
412324,BMG_TS412325,


In [17]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript_Description.csv'
BMG_transcript_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript_Description.csv


### 6.2 Combined Description

In [18]:
comb_description = BMG_transcript_description.copy()

# add the column name at the beginning of the string
# first, we need to get the column names
column_names = comb_description.columns.tolist()
column_names = [col for col in column_names if col != 'BioMedGraphica_ID']
# then we can apply the function to each column
for col in column_names:
    comb_description[col] = comb_description[col].apply(lambda x: ' | '.join([f"{col}: {i}" for i in x.split(' | ')]) if pd.notna(x) else x)

# now we can merge the columns into one
comb_description['Description'] = comb_description[column_names].apply(lambda x: ' | '.join(x.dropna()), axis=1)
comb_description = comb_description[['BioMedGraphica_ID', 'Description']]
comb_description

Unnamed: 0,BioMedGraphica_ID,Description
0,BMG_TS000001,Ensembl: ADP ribosylation factor 5 [Source:HGN...
1,BMG_TS000002,"Ensembl: mannose-6-phosphate receptor, cation ..."
2,BMG_TS000003,Ensembl: estrogen related receptor alpha [Sour...
3,BMG_TS000004,Ensembl: FKBP prolyl isomerase 4 [Source:HGNC ...
4,BMG_TS000005,Ensembl: cytochrome P450 family 26 subfamily B...
...,...,...
412321,BMG_TS412322,Ensembl: long intergenic non-protein coding RN...
412322,BMG_TS412323,Ensembl: long intergenic non-protein coding RN...
412323,BMG_TS412324,Ensembl: CD99 molecule pseudogene 1 [Source:HG...
412324,BMG_TS412325,


In [19]:
import os
from pathlib import Path

# get the current working directory
current_working_dir = Path(os.getcwd()).resolve()

# get the output directory
grandparent_dir = current_working_dir.parent.parent.parent

target_folder = grandparent_dir / 'BioMedGraphica' / 'Entity' / 'Transcript'
if not target_folder.exists():
    target_folder.mkdir(parents=True)
    print(f"Folder {target_folder} has been created.")

output_file_path = target_folder / 'BioMedGraphica_Transcript_Description_Combined.csv'
comb_description.to_csv(output_file_path, index=False)
print(f"Data has been saved to {output_file_path}")

Data has been saved to D:\RA\BMG\BioMedGraphica\Entity\Transcript\BioMedGraphica_Transcript_Description_Combined.csv
