## 1. Protein Node Description

### 1.1 Protein Entity

In [None]:
import pandas as pd

# Raw gene table and its free-text descriptions; everything is read as strings.
bmgc_gene_df = pd.read_csv(
    './data/BioMedGraphica-Conn/Entity/Gene/BioMedGraphica_Conn_Gene.csv',
    dtype=str,
)
bmgc_gene_desc_df = pd.read_csv(
    './data/BioMedGraphica-Conn/Entity/Gene/BioMedGraphica_Conn_Gene_Description.csv',
    dtype=str,
).drop(columns=['BioMedGraphica_ID'])

# Columns that identify a gene record; duplicates are detected on these.
gene_identity_columns = [
    'HGNC_Symbol', 'Gene_Name', 'Gene_Type', 'Chromosome', 'Gene_Start', 'Gene_End',
    'Ensembl_Gene_ID', 'Ensembl_Gene_ID_Version', 'Ensembl',
]
# Full set of columns kept for the text description (identity columns + NCBI fields).
gene_llm_columns = gene_identity_columns + ['NCBI_Gene_ID', 'NCBI Gene']

bmgc_gene_llm_df = (
    pd.merge(bmgc_gene_df, bmgc_gene_desc_df, how='left', on='BioMedGraphica_Conn_ID')
    [gene_llm_columns]
    # rows without an HGNC symbol cannot be named in the text, so they are removed
    .dropna(subset=['HGNC_Symbol'])
    .drop_duplicates(subset=gene_identity_columns)
    .reset_index(drop=True)
)
display(bmgc_gene_llm_df)


In [None]:
# Count how often each HGNC symbol occurs and keep only symbols that occur exactly once,
# so that every remaining row is the single record for its symbol.
symbol_occurrences = bmgc_gene_llm_df['HGNC_Symbol'].value_counts()
hgnc_symbol_counts = symbol_occurrences[symbol_occurrences == 1]
hgnc_symbol_unique_list = list(hgnc_symbol_counts.index)

is_unambiguous_symbol = bmgc_gene_llm_df['HGNC_Symbol'].isin(hgnc_symbol_unique_list)
bmgc_gene_llm_df = (
    bmgc_gene_llm_df.loc[is_unambiguous_symbol]
    .reset_index(drop=True)
    # 'NCBI Gene' contains a space; give it the same underscore style as the other columns
    .rename(columns={'NCBI Gene': 'NCBI_Gene'})
)
display(bmgc_gene_llm_df)

### 1.2 Protein Relation

In [None]:
# Relation table, read as strings; later cells filter it by its 'Type' column.
bmgc_relation_df = pd.read_csv(
    './data/BioMedGraphica-Conn/Relation/BioMedGraphica_Conn_Relation.csv',
    dtype=str,
)

In [None]:
# Protein-protein relations only, restricted to the four columns used downstream.
ppi_relation_columns = ['BMGC_From_ID', 'BMGC_To_ID', 'Source', 'Type']
is_protein_protein = bmgc_relation_df['Type'] == 'Protein-Protein'
bmgc_relation_ppi_df = (
    bmgc_relation_df.loc[is_protein_protein, ppi_relation_columns]
    .reset_index(drop=True)
)
display(bmgc_relation_ppi_df)

### 1.3 Map the gene with protein

In [None]:
# Entity table, read as strings; later cells split it by its 'Type' column.
bmgc_df = pd.read_csv(
    './data/BioMedGraphica-Conn/Entity/BioMedGraphica_Conn_Entity.csv',
    dtype=str,
)

In [None]:
# Promoter -> gene -> transcript -> protein form one chain; split the entity table
# into one independent frame per link of that chain.
entity_type = bmgc_df['Type']
promoter_entity_df = bmgc_df.loc[entity_type == 'Promoter'].copy()
gene_entity_df = bmgc_df.loc[entity_type == 'Gene'].copy()
transcript_entity_df = bmgc_df.loc[entity_type == 'Transcript'].copy()
protein_entity_df = bmgc_df.loc[entity_type == 'Protein'].copy()

# Report missing values per relation column before the relations are used for joins.
print("Null values in bmgc_relation_df:")
print(bmgc_relation_df.isnull().sum())

# One independent frame per relation type along the same chain.
relation_type = bmgc_relation_df['Type']
promoter_gene_relation_df = bmgc_relation_df.loc[relation_type == 'Promoter-Gene'].copy()
gene_transcript_relation_df = bmgc_relation_df.loc[relation_type == 'Gene-Transcript'].copy()
transcript_protein_relation_df = bmgc_relation_df.loc[relation_type == 'Transcript-Protein'].copy()

In [None]:
# Walk the Gene -> Transcript -> Protein chain so every protein ID can be traced back to a gene.
gene_transcript_entity_df = pd.merge(gene_entity_df, gene_transcript_relation_df[['BMGC_From_ID', 'BMGC_To_ID']], left_on='BioMedGraphica_Conn_ID', right_on='BMGC_From_ID', how='outer')
gene_transcript_protein_entity_df = pd.merge(gene_transcript_entity_df, transcript_protein_relation_df[['BMGC_From_ID', 'BMGC_To_ID']], left_on='BMGC_To_ID', right_on='BMGC_From_ID', how='outer')
# After the second merge the "_x" columns hold the gene->transcript edge and the "_y" columns
# the transcript->protein edge; keep only rows where both edges are present.
gene_transcript_protein_entity_df = gene_transcript_protein_entity_df.dropna(subset=['BMGC_From_ID_x', 'BMGC_To_ID_x', 'BMGC_From_ID_y', 'BMGC_To_ID_y']).reset_index(drop=True)
# Keep only the gene ID and the protein ID (the transcript is just the bridge) and rename them
# to ['BMGC_GN_ID', 'BMGC_PT_ID'].
gene_transcript_protein_entity_df = gene_transcript_protein_entity_df[['BioMedGraphica_Conn_ID', 'BMGC_To_ID_y']].rename(columns={'BioMedGraphica_Conn_ID': 'BMGC_GN_ID', 'BMGC_To_ID_y': 'BMGC_PT_ID'}).sort_values(by='BMGC_GN_ID').reset_index(drop=True)
# Several transcripts can link the same gene/protein pair; collapse those repeats.
gene_transcript_protein_entity_df = gene_transcript_protein_entity_df.drop_duplicates().reset_index(drop=True)
# Attach the HGNC symbol of each gene. A separate lookup frame is used instead of overwriting
# `bmgc_gene_df`, so the cell that merges the full gene table can still be re-run afterwards.
gene_hgnc_lookup_df = bmgc_gene_df[['BioMedGraphica_Conn_ID', 'HGNC_Symbol']]
gene_transcript_protein_hgnc_entity_df = pd.merge(gene_transcript_protein_entity_df, gene_hgnc_lookup_df, left_on='BMGC_GN_ID', right_on='BioMedGraphica_Conn_ID', how='left')
# Genes without a symbol are labelled 'Unknown' so downstream cells can filter them out.
gene_transcript_protein_hgnc_entity_df['HGNC_Symbol'] = gene_transcript_protein_hgnc_entity_df['HGNC_Symbol'].fillna('Unknown')
# Only keep the columns ['BMGC_PT_ID', 'HGNC_Symbol']
gene_transcript_protein_hgnc_entity_df = gene_transcript_protein_hgnc_entity_df[['BMGC_PT_ID', 'HGNC_Symbol']]
# Build the BMGC_PT_ID -> HGNC_Symbol dictionary. When a protein is reachable from more than one
# gene, the last row wins; 'Unknown' rows are inserted first so that a real symbol is never
# replaced by the 'Unknown' placeholder just because of row order.
symbol_is_unknown = gene_transcript_protein_hgnc_entity_df['HGNC_Symbol'] == 'Unknown'
unknown_symbol_rows = gene_transcript_protein_hgnc_entity_df[symbol_is_unknown]
known_symbol_rows = gene_transcript_protein_hgnc_entity_df[~symbol_is_unknown]
protein_hgnc_map_dict = dict(zip(unknown_symbol_rows['BMGC_PT_ID'], unknown_symbol_rows['HGNC_Symbol']))
protein_hgnc_map_dict.update(zip(known_symbol_rows['BMGC_PT_ID'], known_symbol_rows['HGNC_Symbol']))


In [None]:
# Translate both endpoints of every protein-protein relation from protein ID to HGNC symbol.
bmgc_relation_ppi_map_df = bmgc_relation_ppi_df.copy()
bmgc_relation_ppi_map_df['BMGC_From_ID'] = bmgc_relation_ppi_map_df['BMGC_From_ID'].map(protein_hgnc_map_dict)
bmgc_relation_ppi_map_df['BMGC_To_ID'] = bmgc_relation_ppi_map_df['BMGC_To_ID'].map(protein_hgnc_map_dict)
# Series.map() returns NaN for protein IDs that are missing from the dictionary. Those rows have
# no gene symbol either, and would otherwise pass the 'Unknown' filter below and later be
# stringified into the literal text 'nan' inside the related-gene lists.
bmgc_relation_ppi_map_df = bmgc_relation_ppi_map_df.dropna(subset=['BMGC_From_ID', 'BMGC_To_ID'])
# Drop the rows where the BMGC_From_ID or BMGC_To_ID is 'Unknown'
bmgc_relation_ppi_map_df = bmgc_relation_ppi_map_df[(bmgc_relation_ppi_map_df['BMGC_From_ID'] != 'Unknown') & (bmgc_relation_ppi_map_df['BMGC_To_ID'] != 'Unknown')].reset_index(drop=True)
# Sanity check: no 'Unknown' placeholder may remain in either endpoint column
unknown_from_id = bmgc_relation_ppi_map_df[bmgc_relation_ppi_map_df['BMGC_From_ID'] == 'Unknown']
unknown_to_id = bmgc_relation_ppi_map_df[bmgc_relation_ppi_map_df['BMGC_To_ID'] == 'Unknown']
# Print the number of Unknown entries in BMGC_From_ID and BMGC_To_ID
print(f"Number of Unknown entries in BMGC_From_ID: {len(unknown_from_id)}")
print(f"Number of Unknown entries in BMGC_To_ID: {len(unknown_to_id)}")
display(bmgc_relation_ppi_map_df)

In [None]:
# Make the relation symmetric: add every edge again with its two endpoints swapped.
swap_endpoints = {'BMGC_From_ID': 'BMGC_To_ID', 'BMGC_To_ID': 'BMGC_From_ID'}
bmgc_relation_ppi_map_df_reversed = bmgc_relation_ppi_map_df.rename(columns=swap_endpoints)
bmgc_relation_ppi_map_concat_df = pd.concat(
    [bmgc_relation_ppi_map_df, bmgc_relation_ppi_map_df_reversed],
    ignore_index=True,
)
display(bmgc_relation_ppi_map_concat_df)
# An ordered (from, to) pair may now occur more than once; keep its first occurrence only.
bmgc_relation_ppi_map_concat_df = (
    bmgc_relation_ppi_map_concat_df
    .drop_duplicates(subset=['BMGC_From_ID', 'BMGC_To_ID'])
    .reset_index(drop=True)
)
display(bmgc_relation_ppi_map_concat_df)

In [None]:
def _semicolons_to_commas(partners):
    """Return the partner list as strings with every ';' replaced by ', '."""
    return [str(partner).replace(';', ', ') for partner in partners]


# Collect, for each source symbol, the list of symbols it is related to
# (Source and Type are not needed for the grouping).
ppi_symbol_edges = bmgc_relation_ppi_map_concat_df.drop(columns=['Source', 'Type'])
bmgc_relation_ppi_map_concat_group_df = (
    ppi_symbol_edges.groupby('BMGC_From_ID')['BMGC_To_ID'].apply(list).reset_index()
)
# Normalize any ';' inside a partner entry to ', '.
bmgc_relation_ppi_map_concat_group_df['BMGC_To_ID'] = (
    bmgc_relation_ppi_map_concat_group_df['BMGC_To_ID'].map(_semicolons_to_commas)
)
display(bmgc_relation_ppi_map_concat_group_df)

In [None]:
# Attach the related-gene list to each gene record; genes without relations keep NaN.
bmgc_gene_llm_relation_df = (
    bmgc_gene_llm_df
    .merge(
        bmgc_relation_ppi_map_concat_group_df,
        how='left',
        left_on='HGNC_Symbol',
        right_on='BMGC_From_ID',
    )
    # the join key from the right-hand side duplicates HGNC_Symbol
    .drop(columns=['BMGC_From_ID'])
)
display(bmgc_gene_llm_relation_df)

In [None]:
# Reference template of the sentence generated for one gene record. The placeholders are
# column names of bmgc_gene_llm_relation_df. This literal is only displayed as the cell
# output; it is not assigned to a name.
{
"text": "{HGNC_Symbol} short for {Gene_Name} is a {Gene_Type} gene located on Chromosome {Chromosome} from {Gene_Start} to {Gene_End}. The Ensembl Gene ID is {Ensembl_Gene_ID}, also {Ensembl_Gene_ID_Version} and the NCBI Gene ID is {NCBI_Gene_ID}. In details, {Gene_Name} has the NCBI Gene description with {NCBI_Gene}. Also, it has the Ensembl description with {Ensembl}. Aside from that, {HGNC_Symbol} is related to the following genes: {BMGC_To_ID}.",
}

In [None]:
import json
import os

def create_text_description(row):
    """Render one gene record as a single English sentence block.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys 'HGNC_Symbol',
            'Gene_Name', 'Gene_Type', 'Chromosome', 'Gene_Start', 'Gene_End',
            'Ensembl_Gene_ID', 'Ensembl_Gene_ID_Version', 'NCBI_Gene_ID', 'NCBI_Gene'
            and 'Ensembl'. An optional 'BMGC_To_ID' entry is used only when it is a
            non-empty list of strings.

    Returns:
        The description text. Clauses whose fields are missing (NaN/None) are left out,
        and the text always ends with a period.
    """
    def present(column):
        return pd.notna(row[column])

    # The symbol is always written; every later clause is optional.
    pieces = [f"{row['HGNC_Symbol']}"]

    if present('Gene_Name'):
        pieces.append(f" short for {row['Gene_Name']}")

    pieces.append(f" is a {row['Gene_Type']} gene" if present('Gene_Type') else " is a gene")

    # Genomic location; the coordinate range is only written when both ends are known.
    if present('Chromosome'):
        pieces.append(f" located on Chromosome {row['Chromosome']}")
        if present('Gene_Start') and present('Gene_End'):
            pieces.append(f" from {row['Gene_Start']} to {row['Gene_End']}")

    # Identifiers.
    if present('Ensembl_Gene_ID'):
        pieces.append(f". The Ensembl Gene ID is {row['Ensembl_Gene_ID']}")
        if present('Ensembl_Gene_ID_Version'):
            pieces.append(f", also {row['Ensembl_Gene_ID_Version']}")

    if present('NCBI_Gene_ID'):
        pieces.append(f" and the NCBI Gene ID is {row['NCBI_Gene_ID']}")

    # Free-text descriptions are only written when the gene has a name to attach them to.
    if present('Gene_Name'):
        if present('NCBI_Gene'):
            pieces.append(f". In details, {row['Gene_Name']} has the NCBI Gene description with {row['NCBI_Gene']}")
        if present('Ensembl'):
            pieces.append(f". Also, it has the Ensembl description with {row['Ensembl']}")

    # Related genes: anything other than a non-empty list (missing key, None, NaN) is skipped.
    partners = row['BMGC_To_ID'] if 'BMGC_To_ID' in row else None
    if isinstance(partners, list) and len(partners) > 0:
        pieces.append(
            f". Aside from that, {row['HGNC_Symbol']} is related to the following genes: {', '.join(partners)}"
        )

    sentence = ''.join(pieces)
    return sentence if sentence.endswith('.') else sentence + "."

# Folder that receives the pretraining corpora; created on first use.
output_dir = './data/TargetPretrain'
os.makedirs(output_dir, exist_ok=True)

# One JSON object per line, each holding the text description of a single gene.
output_path = os.path.join(output_dir, 'gene_relation_description.jsonl')
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps({"text": create_text_description(gene_row)}, ensure_ascii=False) + '\n'
        for _, gene_row in bmgc_gene_llm_relation_df.iterrows()
    )

print(f"JSONL file created with {len(bmgc_gene_llm_relation_df)} gene descriptions at {os.path.abspath(output_path)}")

## 2. Disease Node Description

### 2.1 Disease Entity

In [None]:
# Disease table plus its description and display-name companions, all read as strings.
bmgc_disease_df = pd.read_csv(
    './data/BioMedGraphica-Conn/Entity/Disease/BioMedGraphica_Conn_Disease.csv',
    dtype=str,
)
bmgc_disease_desc_df = pd.read_csv(
    './data/BioMedGraphica-Conn/Entity/Disease/BioMedGraphica_Conn_Disease_Description.csv',
    dtype=str,
).drop(columns=['BioMedGraphica_ID'])
bmgc_disease_display_df = pd.read_csv(
    './data/BioMedGraphica-Conn/Entity/Disease/BioMedGraphica_Conn_Disease_Display_Name.csv',
    dtype=str,
).drop(columns=['BioMedGraphica_ID'])

# Columns kept for the disease text: join key, display name, per-vocabulary names/IDs,
# and the free-text description columns.
disease_llm_columns = [
    'BioMedGraphica_Conn_ID', 'BMG_Disease_Name',
    'MONDO_Name', 'MONDO_ID', 'UMLS_Name', 'UMLS_ID', 'DO_Name', 'DO_ID',
    'SNOMEDCT_Name', 'SNOMEDCT_ID', 'MeSH_Name', 'MeSH_ID',
    'ICD11_Title', 'ICD11_ID', 'ICD10_ID', 'OMIM_ID',
    'MONDO', 'MeSH', 'NCI', 'SNOMEDCT_US', 'ORPHANET', 'HPO',
]
bmgc_disease_llm_df = (
    bmgc_disease_df
    .merge(bmgc_disease_desc_df, how='left', on='BioMedGraphica_Conn_ID')
    .merge(bmgc_disease_display_df, how='left', on='BioMedGraphica_Conn_ID')
    [disease_llm_columns]
)
display(bmgc_disease_llm_df.head(2))

### 2.2 Disease Protein Relation

In [None]:
# Keep the relations that connect a protein and a disease, in either direction
bmgc_relation_disease_df = bmgc_relation_df[bmgc_relation_df['Type'].isin(['Protein-Disease', 'Disease-Protein'])].copy()
# Keep the columns ['BMGC_From_ID', 'BMGC_To_ID', 'Source', 'Type']
bmgc_relation_disease_df = bmgc_relation_disease_df[['BMGC_From_ID', 'BMGC_To_ID', 'Source', 'Type']].reset_index(drop=True)
# Orient every edge as protein -> disease. The relation types in this notebook are named
# '<From>-<To>' (e.g. 'Gene-Transcript' is joined on the gene in BMGC_From_ID), so a
# 'Disease-Protein' row carries the disease in BMGC_From_ID; swap its endpoints, otherwise the
# protein lookup below is applied to a disease ID and the relation is silently lost.
disease_is_source = bmgc_relation_disease_df['Type'] == 'Disease-Protein'
protein_side_ids = bmgc_relation_disease_df['BMGC_From_ID'].where(~disease_is_source, bmgc_relation_disease_df['BMGC_To_ID'])
disease_side_ids = bmgc_relation_disease_df['BMGC_To_ID'].where(~disease_is_source, bmgc_relation_disease_df['BMGC_From_ID'])
bmgc_relation_disease_df['BMGC_From_ID'] = protein_side_ids
bmgc_relation_disease_df['BMGC_To_ID'] = disease_side_ids
# Map the protein ID in BMGC_From_ID to its HGNC symbol
bmgc_relation_disease_df['BMGC_From_ID'] = bmgc_relation_disease_df['BMGC_From_ID'].map(protein_hgnc_map_dict)
# Series.map() returns NaN for proteins that are missing from the dictionary; they have no
# symbol to report, so drop them together with the explicit 'Unknown' placeholder.
bmgc_relation_disease_df = bmgc_relation_disease_df.dropna(subset=['BMGC_From_ID', 'BMGC_To_ID'])
bmgc_relation_disease_df = bmgc_relation_disease_df[bmgc_relation_disease_df['BMGC_From_ID'] != 'Unknown']
# Several proteins or sources can yield the same (gene, disease) pair; keep each pair once so a
# gene is not listed repeatedly for one disease (same rule as for the protein-protein pairs).
bmgc_relation_disease_df = bmgc_relation_disease_df.drop_duplicates(subset=['BMGC_From_ID', 'BMGC_To_ID']).reset_index(drop=True)
display(bmgc_relation_disease_df)

In [None]:
# Collect, for each disease ID (BMGC_To_ID), the list of gene symbols related to it.
# Source and Type are dropped first because they play no role in the grouping.
bmgc_relation_disease_group_df = (
    bmgc_relation_disease_df
    .drop(columns=['Source', 'Type'])
    .groupby('BMGC_To_ID')['BMGC_From_ID']
    .apply(list)
    .reset_index()
)
display(bmgc_relation_disease_group_df)

In [None]:
# Attach the related-gene list to each disease record; diseases without relations keep NaN.
bmgc_disease_llm_relation_df = (
    bmgc_disease_llm_df
    .merge(
        bmgc_relation_disease_group_df,
        how='left',
        left_on='BioMedGraphica_Conn_ID',
        right_on='BMGC_To_ID',
    )
    # the join key from the right-hand side duplicates BioMedGraphica_Conn_ID
    .drop(columns=['BMGC_To_ID'])
)
display(bmgc_disease_llm_relation_df)

In [None]:
def _upper_first(text):
    """Upper-case only the first character and leave the rest of the string untouched.

    str.capitalize() is deliberately avoided: it also lower-cases every following
    character, which would turn acronyms such as 'HIV' into 'Hiv'.
    """
    return text[:1].upper() + text[1:]


def _tidy_disease_name(name):
    """Capitalize the initial letter(s) of a disease name and remove its commas.

    Missing values are returned unchanged. For a name containing ', ', every
    comma-separated part gets an upper-case first letter; otherwise only the first
    letter of the whole name does. The comma-separated parts are then stripped and
    joined with single spaces, e.g. 'HIV infection, acute' -> 'HIV infection Acute'.
    """
    if pd.isna(name):
        return name
    name = str(name)
    # Step 1: upper-case the initial letter of each part (or of the whole name).
    if ', ' in name:
        name = ', '.join(_upper_first(part.strip()) for part in name.split(','))
    else:
        name = _upper_first(name)
    # Step 2: drop the commas, keeping the text of every part.
    parts = [part.strip() for part in name.split(',')]
    parts[0] = _upper_first(parts[0])
    return ' '.join(parts)


# For column "BMG_Disease_Name": capitalize initial letters and remove commas while
# preserving the original capitalization of the remaining characters; NaN stays NaN.
bmgc_disease_llm_relation_df['BMG_Disease_Name'] = bmgc_disease_llm_relation_df['BMG_Disease_Name'].apply(_tidy_disease_name)

# Replace the ' | ' separator with ' or ' in the name columns that might contain it.
# Non-string values (NaN) are passed through to avoid an AttributeError.
for col in ['BMG_Disease_Name', 'MONDO_Name', 'UMLS_Name', 'DO_Name', 'SNOMEDCT_Name', 'MeSH_Name', 'ICD11_Title']:
    bmgc_disease_llm_relation_df[col] = bmgc_disease_llm_relation_df[col].apply(
        lambda x: x.replace(' | ', ' or ') if pd.notna(x) and isinstance(x, str) else x
    )

display(bmgc_disease_llm_relation_df)

In [None]:
# Reference template of the text generated for one disease record. The placeholders are
# column names of bmgc_disease_llm_relation_df. This literal is only displayed as the cell
# output; it is not assigned to a name.
{
    "text": "{BMG_Disease_Name} is a disease, which has been recorded in MONDO with MONDO Name {MONDO_Name} and {MONDO_ID}. It is also recorded in UMLS with UMLS ID {UMLS_ID} and name with {UMLS_Name}. In addition, it is recorded in Disease Ontology (DO) with {DO_ID} and name with {DO_Name}. The disease is also recorded in SNOMEDCT with SNOMEDCT ID with {SNOMEDCT_ID} and name with {SNOMEDCT_Name}. It is also recorded in MeSH with MeSH ID {MeSH_ID} and {MeSH_Name}. The disease is also recorded in ICD11 with {ICD11_ID}, named as {ICD11_Title} and ICD10 with {ICD10_ID}. The disease is also recorded in OMIM with OMIM ID {OMIM_ID}. In details, the disease {BMG_Disease_Name} has the MONDO description with: {MONDO} {BMG_Disease_Name} also has MeSH description with: {MeSH} NCI description with: {NCI} SNOMEDCT_US description with: {SNOMEDCT_US} ORPHANET description with: {ORPHANET} and HPO description with: {HPO} Aside from that, {BMG_Disease_Name} is related to the following genes: {BMGC_From_ID}.",
}

In [None]:
import json
import os
import pandas as pd
import numpy as np

# One entry per vocabulary that is described by a pair of fields:
# (intro, first_key, first_format, second_key, second_format, second_format_when_first_missing).
# The last format keeps the sentence grammatical when only the second field is available
# (e.g. "recorded in UMLS with name X" instead of "recorded in UMLS and name with X").
_DISEASE_RECORD_CLAUSES = (
    (", which has been recorded in MONDO", 'MONDO_Name', " with MONDO Name {}", 'MONDO_ID', " and {}", " with {}"),
    (". It is also recorded in UMLS", 'UMLS_ID', " with UMLS ID {}", 'UMLS_Name', " and name with {}", " with name {}"),
    (". In addition, it is recorded in Disease Ontology (DO)", 'DO_ID', " with {}", 'DO_Name', " and name with {}", " with name {}"),
    (". The disease is also recorded in SNOMEDCT", 'SNOMEDCT_ID', " with SNOMEDCT ID with {}", 'SNOMEDCT_Name', " and name with {}", " with name {}"),
    (". It is also recorded in MeSH", 'MeSH_ID', " with MeSH ID {}", 'MeSH_Name', " and {}", " with name {}"),
)


# Function to create text description for diseases following the specified template
def create_disease_description(row):
    """Render one disease record as an English text description.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the disease name columns,
            the per-vocabulary ID/name columns, the free-text description columns
            ('MONDO', 'MeSH', 'NCI', 'SNOMEDCT_US', 'ORPHANET', 'HPO') and, optionally,
            'BMGC_From_ID' with a list of related gene symbols.

    Returns:
        The description text ending with a period, or None when no disease name is
        available in 'BMG_Disease_Name', 'MONDO_Name', 'UMLS_Name', 'DO_Name',
        'SNOMEDCT_Name' or 'MeSH_Name' (checked in that order).
    """
    def has(column):
        return pd.notna(row[column])

    # Use the display name when present, otherwise the first available vocabulary name.
    name_sources = ('BMG_Disease_Name', 'MONDO_Name', 'UMLS_Name', 'DO_Name', 'SNOMEDCT_Name', 'MeSH_Name')
    disease_name = next((row[source] for source in name_sources if has(source)), None)
    if disease_name is None:
        return None

    text = f"{disease_name} is a disease"

    # Vocabularies described by an ID/name pair; a clause is written when either is present.
    for intro, first_key, first_fmt, second_key, second_fmt, second_alone_fmt in _DISEASE_RECORD_CLAUSES:
        first_present = has(first_key)
        second_present = has(second_key)
        if not (first_present or second_present):
            continue
        text += intro
        if first_present:
            text += first_fmt.format(row[first_key])
        if second_present:
            chosen_fmt = second_fmt if first_present else second_alone_fmt
            text += chosen_fmt.format(row[second_key])

    # ICD11 (ID and/or title) and ICD10 share one sentence.
    has_icd11 = has('ICD11_ID') or has('ICD11_Title')
    if has_icd11 or has('ICD10_ID'):
        text += ". The disease is also recorded"
        if has_icd11:
            text += " in ICD11"
            if has('ICD11_ID'):
                text += f" with {row['ICD11_ID']}"
            if has('ICD11_Title'):
                text += f", named as {row['ICD11_Title']}"
        if has('ICD10_ID'):
            # "and" continues the ICD11 part; without it the sentence needs its own "in"
            text += " and" if has_icd11 else " in"
            text += f" ICD10 with {row['ICD10_ID']}"

    if has('OMIM_ID'):
        text += f". The disease is also recorded in OMIM with OMIM ID {row['OMIM_ID']}"

    # Free-text descriptions, in a fixed order, joined by single spaces.
    descriptions = []
    if has('MONDO'):
        descriptions.append(f"the disease {disease_name} has the MONDO description with: {row['MONDO']}")
    if has('MeSH'):
        descriptions.append(f"{disease_name} also has MeSH description with: {row['MeSH']}")
    for source in ('NCI', 'SNOMEDCT_US', 'ORPHANET', 'HPO'):
        if has(source):
            descriptions.append(f"{source} description with: {row[source]}")
    if descriptions:
        text += ". In details, " + " ".join(descriptions)

    # Related genes: only a non-empty list is used, and missing entries inside it are skipped.
    related = row['BMGC_From_ID'] if 'BMGC_From_ID' in row else None
    if isinstance(related, list) and len(related) > 0:
        valid_genes = [str(gene) for gene in related if pd.notna(gene)]
        if valid_genes:
            text += f". Aside from that, {disease_name} is related to the following genes: {', '.join(valid_genes)}"

    if not text.endswith('.'):
        text += "."

    return text

# The disease corpus is written next to the gene corpus (output_dir is set by the gene export cell).
disease_output_path = os.path.join(output_dir, 'disease_relation_description.jsonl')

# Rows for which no description can be built (no usable disease name) are counted, not written.
written_count = 0
skipped_count = 0

with open(disease_output_path, 'w', encoding='utf-8') as f:
    for _, disease_row in bmgc_disease_llm_relation_df.iterrows():
        description = create_disease_description(disease_row)
        if description is None:
            skipped_count += 1
            continue
        f.write(json.dumps({"text": description}, ensure_ascii=False) + '\n')
        written_count += 1

print(f"JSONL file created with {written_count} disease descriptions at {os.path.abspath(disease_output_path)}")
print(f"Skipped {skipped_count} entries due to missing disease names")

## 3. Mixed pretraining data

In [None]:
import json
import random
import os

def _read_jsonl(path):
    """Return the JSON objects stored one per line in the UTF-8 file at `path`."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [json.loads(line) for line in handle]


# Load the two corpora written by the gene and disease export cells.
gene_file_path = './data/TargetPretrain/gene_relation_description.jsonl'
gene_data = _read_jsonl(gene_file_path)

disease_file_path = './data/TargetPretrain/disease_relation_description.jsonl'
disease_data = _read_jsonl(disease_file_path)

# Concatenate, then shuffle with a fixed seed so the mixed order is reproducible.
combined_data = gene_data + disease_data
random.seed(42)
random.shuffle(combined_data)

# Write the mixed corpus, again one JSON object per line.
output_path = './data/TargetPretrain/mixed_description.jsonl'
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in combined_data)

print(f"Mixed data file created with {len(combined_data)} total descriptions")
print(f"- {len(gene_data)} gene descriptions")
print(f"- {len(disease_data)} disease descriptions")
print(f"Output saved to: {os.path.abspath(output_path)}")