### MSK-CancerKG Heterogenous PyG dataframe
#### Author: Thahmina A. Ali
#### Date: January 18, 2024

In [2]:
!pip install pandas torch_geometric



In [3]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData

import networkx as nx
import matplotlib.pyplot as plt


df_patients = pd.read_csv('patient.tsv', sep='\t')

# Replace "UNKNOWN" with NaN for age and then impute
df_patients['age'] = pd.to_numeric(df_patients['age'], errors='coerce')
median_age = df_patients['age'].median()
df_patients['age'].fillna(median_age, inplace=True)

# Convert 'gender', 'race', and 'survival_status' to category type and include "UNKNOWN"
df_patients['gender'] = df_patients['gender'].astype('category')
df_patients['race'] = df_patients['race'].astype('category')
df_patients['survival_status'] = df_patients['survival_status'].astype('category')

# One-hot encode the categorical variables, including "UNKNOWN"
categorical_features = pd.get_dummies(df_patients[['gender', 'race', 'survival_status']])

# Normalize the 'age' column and convert to a tensor
normalized_age = (df_patients['age'] - df_patients['age'].mean()) / df_patients['age'].std()
age_tensor = torch.tensor(normalized_age.values, dtype=torch.float).unsqueeze(1)

# Concatenate the one-hot encoded categorical features and the age tensor along the second dimension (columns)
patient_features_tensor = torch.cat([torch.tensor(categorical_features.values, dtype=torch.float), age_tensor], dim=1)

# Store the feature names for later reference
feature_names = ['age'] + list(categorical_features.columns)

# Create a HeteroData object and add the patient node features
hetero_data = HeteroData()
hetero_data['patient'].x = patient_features_tensor

# Display the HeteroData object
print(hetero_data)


HeteroData(
  patient={ x=[164751, 24] }
)


In [4]:
df_patients

Unnamed: 0,patient,gender,age,survival_status,race
0,GENIE-DFCI-003875,Male,60.0,DECEASED,WHITE
1,GENIE-DFCI-106693,Female,60.0,Not Applicable,WHITE
2,GENIE-DUKE-P470,Female,60.0,DECEASED,BLACK OR AFRICAN AMERICAN
3,GENIE-COLU-1156,Male,60.0,Not Applicable,BLACK OR AFRICAN AMERICAN
4,GENIE-DFCI-004985,Female,60.0,Not Applicable,WHITE
...,...,...,...,...,...
164746,P-0091436,Female,60.0,LIVING,OTHER
164747,P-0092079,Female,60.0,LIVING,WHITE
164748,P-0091336,Female,60.0,LIVING,ASIAN-FAR EAST/INDIAN SUBCONT
164749,P-0089206,Male,60.0,LIVING,WHITE


In [5]:
patient_features_tensor

tensor([[0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0072],
        [1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0072],
        [1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0072],
        ...,
        [1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0072],
        [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0072],
        [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0072]])

In [6]:
print("Feature names")
for name in feature_names:
    print(name)

Feature names
age
gender_Female
gender_Male
gender_Other
gender_UNKNOWN
race_ASIAN-FAR EAST/INDIAN SUBCONT
race_BLACK OR AFRICAN AMERICAN
race_NATIVE AMERICAN-AM IND/ALASKA
race_NATIVE HAWAIIAN OR PACIFIC ISL
race_NO VALUE ENTERED
race_Not Applicable
race_Not collected
race_OTHER
race_PT REFUSED TO ANSWER
race_Pacific Islander
race_UNKNOWN
race_WHITE
survival_status_CENSORED
survival_status_DECEASED
survival_status_LIVING
survival_status_Not Applicable
survival_status_Not Collected
survival_status_Not Released
survival_status_UNKNOWN


In [7]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Load your data
df_alteration = pd.read_csv('alteration.tsv', sep='\t')
df_alteration_class = pd.read_csv('patient_alteration_class.tsv', sep='\t')
df_alteration_status = pd.read_csv('patient_alteration_status.tsv', sep='\t')

# One-hot encode the relation column in alteration class and status DataFrames
df_alteration_class_onehot = pd.get_dummies(df_alteration_class, columns=['relation'])
df_alteration_status_onehot = pd.get_dummies(df_alteration_status, columns=['relation'])

# Drop the 'patient' column since it's no longer needed after one-hot encoding
df_alteration_class_onehot.drop('patient', axis=1, inplace=True)
df_alteration_status_onehot.drop('patient', axis=1, inplace=True)

# Group by hgvsp and aggregate with max function to get binary encoding
df_alteration_class_grouped = df_alteration_class_onehot.groupby('hgvsp').max()
df_alteration_status_grouped = df_alteration_status_onehot.groupby('hgvsp').max()

# join operation should not cause an error since 'patient' is not present
df_alteration_features = df_alteration.set_index('hgvsp').join(
    df_alteration_class_grouped, how='left').join(
    df_alteration_status_grouped, how='left').fillna(0)

# Convert the features DataFrame to a tensor
alteration_features_tensor = torch.tensor(df_alteration_features.values, dtype=torch.float)

# Add the alteration node features to the HeteroData object
hetero_data['alteration'].x = alteration_features_tensor

# List the features and their encodings
alteration_feature_names = df_alteration_features.columns.tolist()

print("Alteration features and their encodings:")
for name in alteration_feature_names:
    print(name)

print(hetero_data['alteration'])


Alteration features and their encodings:
relation_3'Flank
relation_3'UTR
relation_5'Flank
relation_5'UTR
relation_Frame_Shift_Del
relation_Frame_Shift_Ins
relation_In_Frame_Del
relation_In_Frame_Ins
relation_Intron
relation_Missense_Mutation
relation_Nonsense_Mutation
relation_Nonstop_Mutation
relation_RNA
relation_Silent
relation_Splice_Region
relation_Splice_Site
relation_Translation_Start_Site
relation_GERMLINE
relation_SOMATIC
relation_UNKNOWN
{'x': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])}


In [8]:
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] }
)


In [10]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Load the data
df_patient_cancer_type = pd.read_csv('patient_cancer_type.tsv', sep='\t')
df_patient_metastasis = pd.read_csv('patient_metastasis.tsv', sep='\t')
df_cancer_type = pd.read_csv('cancer_type.tsv', sep='\t', header=None, names=['cancerType'])

# Ensure 'patient' columns are strings
df_patient_cancer_type['patient'] = df_patient_cancer_type['patient'].astype(str)
df_patient_metastasis['patient'] = df_patient_metastasis['patient'].astype(str)

# One-hot encode the metastasis column
metastasis_onehot = pd.get_dummies(df_patient_metastasis[['patient', 'metastasis']], columns=['metastasis'])

# Aggregate metastasis features for each patient
df_patient_metastasis_features = metastasis_onehot.groupby('patient').max()

# Merge the patient metastasis features with the patient to cancer type mapping
df_patient_cancer_metastasis = pd.merge(df_patient_cancer_type, df_patient_metastasis_features, on='patient', how='left').fillna(0)

# Drop unnecessary columns
df_patient_cancer_metastasis.drop(['patient', 'relation'], axis=1, inplace=True)

# Aggregate the metastasis features for each cancer type
df_cancer_metastasis_features = df_patient_cancer_metastasis.groupby('cancer_type').max()

# Ensure the cancer types from df_cancer_type are included in the final feature matrix
df_cancer_metastasis_features = df_cancer_metastasis_features.reindex(df_cancer_type['cancerType'].values, fill_value=0)

# Convert the features DataFrame to a tensor
cancer_type_features_tensor = torch.tensor(df_cancer_metastasis_features.values, dtype=torch.float32)

#  add the cancer type node features

hetero_data['cancer_type'].x = cancer_type_features_tensor

# List the features and their encodings
cancer_type_feature_names = df_cancer_metastasis_features.columns.tolist()

print("Cancer type features and their encodings:")
for name in cancer_type_feature_names:
    print(name)


# Display the HeteroData object (for the cancer_type node type)
print(hetero_data['cancer_type'])


Cancer type features and their encodings:
metastasis_Distant organ metastasis
metastasis_Local recurrence
metastasis_Lymph node metastasis
metastasis_Metastasis site unspecified
metastasis_Not Collected
metastasis_Not applicable or hematologic malignancy
metastasis_Not otherwise specified
metastasis_Primary tumor
{'x': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 0., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 0.,  ..., 1., 0., 1.],
        [1., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.]])}


In [11]:
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] }
)


In [12]:
def load_data(file_name):
    return pd.read_csv(file_name, sep='\t', names=['name'])

# Load data for each node type
df_anatomical_region = load_data('anatomical_region.tsv')
df_biological_process = load_data('biological_process.tsv')
df_cell_line = load_data('cell_line.tsv')
df_cellular_component = load_data('cellular_component.tsv')
df_drug = load_data('drug.tsv')
df_exposure = load_data('exposure.tsv')
df_gene = load_data('gene.tsv')
df_molecular_function = load_data('molecular_function.tsv')
df_pathway = load_data('pathway.tsv')
df_phenotype = load_data('phenotype.tsv')

hetero_data['anatomical_region'].x = torch.zeros(len(df_anatomical_region), 1)
hetero_data['biological_process'].x = torch.zeros(len(df_biological_process), 1)
hetero_data['cell_line'].x = torch.zeros(len(df_cell_line), 1)
hetero_data['cellular_component'].x = torch.zeros(len(df_cellular_component), 1)
hetero_data['drug'].x = torch.zeros(len(df_drug), 1)
hetero_data['exposure'].x = torch.zeros(len(df_exposure), 1)
hetero_data['gene'].x = torch.zeros(len(df_gene), 1)
hetero_data['molecular_function'].x = torch.zeros(len(df_molecular_function), 1)
hetero_data['pathway'].x = torch.zeros(len(df_pathway), 1)
hetero_data['phenotype'].x = torch.zeros(len(df_phenotype), 1)

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] }
)


In [13]:
print("Node types:", hetero_data.node_types)

Node types: ['patient', 'alteration', 'cancer_type', 'anatomical_region', 'biological_process', 'cell_line', 'cellular_component', 'drug', 'exposure', 'gene', 'molecular_function', 'pathway', 'phenotype']


In [14]:
# Load the patient to alteration relationship data
df_patient_alteration = pd.read_csv('patient_alteration_relation-oncogenicity.tsv', sep='\t')

# Extract unique identifiers from the DataFrame
unique_patients = df_patient_alteration['patient'].unique()
unique_alterations = df_patient_alteration['hgvsp'].unique()

# Create mappings based on order of appearance
patient_mapping = {pid: i for i, pid in enumerate(unique_patients)}
alteration_mapping = {hid: i for i, hid in enumerate(unique_alterations)}

# Map the patient and hgvsp identifiers to indices
patient_indices = df_patient_alteration['patient'].map(patient_mapping)
alteration_indices = df_patient_alteration['hgvsp'].map(alteration_mapping)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([patient_indices.tolist(), alteration_indices.tolist()], dtype=torch.long)

# One-hot encode the oncogenicity column for edge attributes
oncogenicity_onehot = pd.get_dummies(df_patient_alteration['oncogenicity'])
edge_attr = torch.tensor(oncogenicity_onehot.values, dtype=torch.float)  # Corrected this line

# Add edges and edge attributes to the HeteroData object
hetero_data['patient', 'has_alteration', 'alteration'].edge_index = edge_index
hetero_data['patient', 'has_alteration', 'alteration'].edge_attr = edge_attr

print(hetero_data)


HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  }
)


In [15]:
# One-hot encode the oncogenicity column and get the feature names
oncogenicity_onehot = pd.get_dummies(df_patient_alteration['oncogenicity'])
feature_names = oncogenicity_onehot.columns

# Print the feature names
print("Feature names:", feature_names.tolist())

# Access and print the first few rows of the edge attributes along with their feature names
edge_attributes = hetero_data['patient', 'has_alteration', 'alteration'].edge_attr[:5]
print("First few rows of edge attributes with feature names:")
for row in edge_attributes:
    # Create a dictionary of feature name to value for each row
    attribute_dict = {feature: value.item() for feature, value in zip(feature_names, row)}
    print(attribute_dict)

Feature names: ['Inconclusive', 'Likely Neutral', 'Likely Oncogenic', 'Oncogenic', 'Resistance', 'Unknown']
First few rows of edge attributes with feature names:
{'Inconclusive': 0.0, 'Likely Neutral': 0.0, 'Likely Oncogenic': 1.0, 'Oncogenic': 0.0, 'Resistance': 0.0, 'Unknown': 0.0}
{'Inconclusive': 0.0, 'Likely Neutral': 0.0, 'Likely Oncogenic': 0.0, 'Oncogenic': 0.0, 'Resistance': 0.0, 'Unknown': 1.0}
{'Inconclusive': 0.0, 'Likely Neutral': 0.0, 'Likely Oncogenic': 0.0, 'Oncogenic': 0.0, 'Resistance': 0.0, 'Unknown': 1.0}
{'Inconclusive': 0.0, 'Likely Neutral': 0.0, 'Likely Oncogenic': 1.0, 'Oncogenic': 0.0, 'Resistance': 0.0, 'Unknown': 0.0}
{'Inconclusive': 0.0, 'Likely Neutral': 0.0, 'Likely Oncogenic': 0.0, 'Oncogenic': 0.0, 'Resistance': 0.0, 'Unknown': 1.0}


In [16]:
# Load the patient to cancer type relationship data
df_patient_cancer_type = pd.read_csv('patient_cancer_type.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for patients and cancer types
unique_patients = df_patient_cancer_type['patient'].unique()
unique_cancer_types = df_patient_cancer_type['cancer_type'].unique()

# Create mappings based on order of appearance
patient_mapping = {pid: i for i, pid in enumerate(unique_patients)}
cancer_type_mapping = {cid: i for i, cid in enumerate(unique_cancer_types)}

# Map the patient and cancer type identifiers to indices
patient_indices = df_patient_cancer_type['patient'].map(patient_mapping)
cancer_type_indices = df_patient_cancer_type['cancer_type'].map(cancer_type_mapping)

# Create edge index tensor (2 x num_edges)
edge_index_patient_cancer_type = torch.tensor([patient_indices.tolist(), cancer_type_indices.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df_patient_cancer_type['relation'])
edge_attr_patient_cancer_type = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['patient', 'has_cancer_type', 'cancer_type'].edge_index = edge_index_patient_cancer_type
hetero_data['patient', 'has_cancer_type', 'cancer_type'].edge_attr = edge_attr_patient_cancer_type

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  }
)


In [17]:
# Load the alteration to gene relationship data
df_alteration_gene = pd.read_csv('alteration_gene.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
unique_alterations = df_alteration_gene['hgvsp'].unique()
unique_genes = df_alteration_gene['gene'].unique()

# Create mappings based on order of appearance
alteration_mapping = {hid: i for i, hid in enumerate(unique_alterations)}
gene_mapping = {gid: i for i, gid in enumerate(unique_genes)}

# Map the alteration and gene identifiers to indices
alteration_indices = df_alteration_gene['hgvsp'].map(alteration_mapping)
gene_indices = df_alteration_gene['gene'].map(gene_mapping)

# Create edge index tensor (2 x num_edges)
edge_index_alteration_gene = torch.tensor([alteration_indices.tolist(), gene_indices.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df_alteration_gene['relation'])
edge_attr_alteration_gene = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['alteration', 'mutated_in_gene', 'gene'].edge_index = edge_index_alteration_gene
hetero_data['alteration', 'mutated_in_gene', 'gene'].edge_attr = edge_attr_alteration_gene

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  }
)


In [18]:
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  }
)


In [19]:
df = pd.read_csv('gene_biological_process.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['gene'].unique()
nodeB = df['biological_process'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['gene'].map(mapping1)
indices2 = df['biological_process'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['gene', 'interacts_with', 'biological_process'].edge_index = edge_index
hetero_data['gene', 'interacts_with', 'biological_process'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  }
)


In [20]:
df = pd.read_csv('gene_cellular_component.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['gene'].unique()
nodeB = df['cellular_component'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['gene'].map(mapping1)
indices2 = df['cellular_component'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['gene', 'interacts_with', 'cellular_component'].edge_index = edge_index
hetero_data['gene', 'interacts_with', 'cellular_component'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  }
)


In [21]:
df = pd.read_csv('gene_molecular_function.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['gene'].unique()
nodeB = df['molecular_function'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['gene'].map(mapping1)
indices2 = df['molecular_function'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['gene', 'interacts_with', 'molecular_function'].edge_index = edge_index
hetero_data['gene', 'interacts_with', 'molecular_function'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [22]:
df = pd.read_csv('gene_exposure.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['gene'].unique()
nodeB = df['exposure'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['gene'].map(mapping1)
indices2 = df['exposure'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['gene', 'interacts_with', 'exposure'].edge_index = edge_index
hetero_data['gene', 'interacts_with', 'exposure'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [23]:
df = pd.read_csv('molecular_function_exposure.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['molecular_function'].unique()
nodeB = df['exposure'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['molecular_function'].map(mapping1)
indices2 = df['exposure'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['molecular_function', 'interacts_with', 'exposure'].edge_index = edge_index
hetero_data['molecular_function', 'interacts_with', 'exposure'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [24]:
df = pd.read_csv('cellular_component_exposure.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['cellular_component'].unique()
nodeB = df['exposure'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['cellular_component'].map(mapping1)
indices2 = df['exposure'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['cellular_component', 'interacts_with', 'exposure'].edge_index = edge_index
hetero_data['cellular_component', 'interacts_with', 'exposure'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [25]:
df = pd.read_csv('biological_process_exposure.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['biological_process'].unique()
nodeB = df['exposure'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['biological_process'].map(mapping1)
indices2 = df['exposure'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['biological_process', 'interacts_with', 'exposure'].edge_index = edge_index
hetero_data['biological_process', 'interacts_with', 'exposure'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [26]:
df = pd.read_csv('gene_pathway.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['gene'].unique()
nodeB = df['pathway'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['gene'].map(mapping1)
indices2 = df['pathway'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['gene', 'interacts_with', 'pathway'].edge_index = edge_index
hetero_data['gene', 'interacts_with', 'pathway'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [27]:
df = pd.read_csv('gene_phenotype.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['gene'].unique()
nodeB = df['phenotype'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['gene'].map(mapping1)
indices2 = df['phenotype'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

# Optionally, one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['gene', 'associated_with', 'phenotype'].edge_index = edge_index
hetero_data['gene', 'associated_with', 'phenotype'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [28]:
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [29]:
df_gene_cell_line = pd.read_csv('gene_cell_line.tsv', sep='\t')

In [30]:
# Extract unique identifiers from the DataFrame for genes and cell lines
unique_genes = df_gene_cell_line['gene'].unique()
unique_cell_lines = df_gene_cell_line['cell_line'].unique()

# Create mappings based on order of appearance
gene_mapping = {gene: i for i, gene in enumerate(unique_genes)}
cell_line_mapping = {cell_line: i for i, cell_line in enumerate(unique_cell_lines)}

# Map the gene and cell line identifiers to indices
gene_indices = df_gene_cell_line['gene'].map(gene_mapping)
cell_line_indices = df_gene_cell_line['cell_line'].map(cell_line_mapping)

# Create edge index tensor (2 x num_edges)
edge_index_gene_cell_line = torch.tensor([gene_indices.tolist(), cell_line_indices.tolist()], dtype=torch.long)

# One-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df_gene_cell_line['relation'])
edge_attr_gene_cell_line = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['gene', 'in', 'cell_line'].edge_index = edge_index_gene_cell_line
hetero_data['gene', 'in', 'cell_line'].edge_attr = edge_attr_gene_cell_line

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [31]:
# One-hot encode the relation column and get the feature names
relation_onehot = pd.get_dummies(df_gene_cell_line['relation'])
feature_names = relation_onehot.columns

# Print the feature names
print("Feature names for edge attributes:", feature_names.tolist())

# Access and print the first few rows of the edge attributes
edge_attributes = hetero_data['gene', 'in', 'cell_line'].edge_attr[:5]
print("First few rows of edge attributes:")

# Print the edge attributes along with their feature names
for row in edge_attributes:
    attribute_dict = {feature: value.item() for feature, value in zip(feature_names, row)}
    print(attribute_dict)


Feature names for edge attributes: ['CLOSE_TO_NON_ESSENTIAL_IN', 'CLOSE_TO_PAN_ESSENTIAL_IN', 'SHOWS_DEPLETION_IN', 'SHOWS_STRONG_KILLING_IN']
First few rows of edge attributes:
{'CLOSE_TO_NON_ESSENTIAL_IN': 0.0, 'CLOSE_TO_PAN_ESSENTIAL_IN': 0.0, 'SHOWS_DEPLETION_IN': 0.0, 'SHOWS_STRONG_KILLING_IN': 1.0}
{'CLOSE_TO_NON_ESSENTIAL_IN': 0.0, 'CLOSE_TO_PAN_ESSENTIAL_IN': 0.0, 'SHOWS_DEPLETION_IN': 0.0, 'SHOWS_STRONG_KILLING_IN': 1.0}
{'CLOSE_TO_NON_ESSENTIAL_IN': 0.0, 'CLOSE_TO_PAN_ESSENTIAL_IN': 0.0, 'SHOWS_DEPLETION_IN': 0.0, 'SHOWS_STRONG_KILLING_IN': 1.0}
{'CLOSE_TO_NON_ESSENTIAL_IN': 0.0, 'CLOSE_TO_PAN_ESSENTIAL_IN': 0.0, 'SHOWS_DEPLETION_IN': 0.0, 'SHOWS_STRONG_KILLING_IN': 1.0}
{'CLOSE_TO_NON_ESSENTIAL_IN': 0.0, 'CLOSE_TO_PAN_ESSENTIAL_IN': 0.0, 'SHOWS_DEPLETION_IN': 0.0, 'SHOWS_STRONG_KILLING_IN': 1.0}


In [32]:
df_gene_anatomical_region = pd.read_csv('gene_anatomical_region.tsv', sep='\t')

unique_genes = df_gene_anatomical_region['gene'].unique()
unique_anatomical_regions = df_gene_anatomical_region['anatomical_region'].unique()

gene_mapping = {gene: i for i, gene in enumerate(unique_genes)}
anatomical_region_mapping = {region: i for i, region in enumerate(unique_anatomical_regions)}

gene_indices = df_gene_anatomical_region['gene'].map(gene_mapping)
anatomical_region_indices = df_gene_anatomical_region['anatomical_region'].map(anatomical_region_mapping)

edge_index_gene_anatomical_region = torch.tensor([gene_indices.tolist(), anatomical_region_indices.tolist()], dtype=torch.long)

expression_onehot = pd.get_dummies(df_gene_anatomical_region['expression'])
edge_attr_gene_anatomical_region = torch.tensor(expression_onehot.values, dtype=torch.float)

hetero_data['gene', 'expressed_in', 'anatomical_region'].edge_index = edge_index_gene_anatomical_region
hetero_data['gene', 'expressed_in', 'anatomical_region'].edge_attr = edge_attr_gene_anatomical_region

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [33]:
df = pd.read_csv('drug_phenotype.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for alterations (hgvsp) and genes
nodeA = df['drug'].unique()
nodeB = df['phenotype'].unique()

# Create mappings based on order of appearance
mapping1 = {hid: i for i, hid in enumerate(nodeA)}
mapping2 = {gid: i for i, gid in enumerate(nodeB)}

# Map the alteration and gene identifiers to indices
indices1 = df['drug'].map(mapping1)
indices2 = df['phenotype'].map(mapping2)

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([indices1.tolist(), indices2.tolist()], dtype=torch.long)

#  one-hot encode the relation column for edge attributes
relation_onehot = pd.get_dummies(df['relation'])
edge_attr = torch.tensor(relation_onehot.values, dtype=torch.float)

# Add edges and edge attributes to the HeteroData object
hetero_data['drug', 'associated_with', 'phenotype'].edge_index = edge_index
hetero_data['drug', 'associated_with', 'phenotype'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [34]:
# Load the combined gene-drug relationship data
df_gene_drug = pd.read_csv('gene_is-a_drug.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for genes and drugs
unique_genes = df_gene_drug['gene'].unique()
unique_drugs = df_gene_drug['drug'].unique()

# Create mappings based on order of appearance
gene_to_index = {gene: i for i, gene in enumerate(unique_genes)}
drug_to_index = {drug: i for i, drug in enumerate(unique_drugs)}

# Map the gene and drug identifiers to indices
gene_indices = df_gene_drug['gene'].apply(lambda x: gene_to_index[x])
drug_indices = df_gene_drug['drug'].apply(lambda x: drug_to_index[x])

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([gene_indices.values, drug_indices.values], dtype=torch.long)

# One-hot encode the relation column for edge attributes
relation_types = df_gene_drug['relation'].unique()
relation_to_index = {relation: i for i, relation in enumerate(relation_types)}
relation_indices = df_gene_drug['relation'].apply(lambda x: relation_to_index[x])
edge_attr = torch.zeros(len(relation_indices), len(relation_types))
edge_attr[range(len(relation_indices)), relation_indices] = 1


# Add edges and edge attributes to the HeteroData object
hetero_data['gene', 'is_a', 'drug'].edge_index = edge_index
hetero_data['gene', 'is_a', 'drug'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

  edge_index = torch.tensor([gene_indices.values, drug_indices.values], dtype=torch.long)


In [35]:
# Load the drug to gene relationship data
df_drug_gene = pd.read_csv('drug_is-a_gene.tsv', sep='\t')

# Extract unique identifiers from the DataFrame for drugs and genes
unique_drugs = df_drug_gene['drug'].unique()
unique_genes = df_drug_gene['gene'].unique()

# Create mappings based on order of appearance
drug_to_index = {drug: i for i, drug in enumerate(unique_drugs)}
gene_to_index = {gene: i for i, gene in enumerate(unique_genes)}

# Map the drug and gene identifiers to indices
drug_indices = df_drug_gene['drug'].apply(lambda x: drug_to_index[x])
gene_indices = df_drug_gene['gene'].apply(lambda x: gene_to_index[x])

# Create edge index tensor (2 x num_edges)
edge_index = torch.tensor([drug_indices.values, gene_indices.values], dtype=torch.long)

# One-hot encode the relation column for edge attributes
relation_types = df_drug_gene['relation'].unique()
relation_to_index = {relation: i for i, relation in enumerate(relation_types)}
relation_indices = df_drug_gene['relation'].apply(lambda x: relation_to_index[x])
edge_attr = torch.zeros(len(relation_indices), len(relation_types))
edge_attr[range(len(relation_indices)), relation_indices] = 1


# Add edges and edge attributes to the HeteroData object
hetero_data['drug', 'is_a', 'gene'].edge_index = edge_index
hetero_data['drug', 'is_a', 'gene'].edge_attr = edge_attr

# Display the HeteroData object
print(hetero_data)


HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [36]:
# Load the patient to drug relationship data
df_patient_drug = pd.read_csv('patient_drug.tsv', sep='\t')

# Load the patient treatment data
df_patient_treatment = pd.read_csv('patient_treatment.tsv', sep='\t')

# Merge the patient_drug and patient_treatment dataframes on the 'patient' column
df_patient_drug_treatment = pd.merge(df_patient_drug, df_patient_treatment, on='patient', how='left')

# Handle NaN values in the treatment column by filling them with a placeholder
df_patient_drug_treatment['treatment'].fillna('No_Treatment', inplace=True)

# Extract unique identifiers from the DataFrame for patients and drugs
unique_patients = df_patient_drug_treatment['patient'].unique()
unique_drugs = df_patient_drug_treatment['drug'].unique()
unique_treatments = df_patient_drug_treatment['treatment'].unique()

# Create mappings based on order of appearance
patient_to_index = {patient: i for i, patient in enumerate(unique_patients)}
drug_to_index = {drug: i for i, drug in enumerate(unique_drugs)}
treatment_to_index = {treatment: i for i, treatment in enumerate(unique_treatments)}
patient_indices = df_patient_drug_treatment['patient'].map(patient_to_index)
drug_indices = df_patient_drug_treatment['drug'].map(drug_to_index)
edge_index = torch.tensor([patient_indices.values, drug_indices.values], dtype=torch.long)

treatment_indices = df_patient_drug_treatment['treatment'].map(treatment_to_index)
edge_attr = torch.zeros(len(df_patient_drug_treatment), len(unique_treatments))
edge_attr[torch.arange(len(df_patient_drug_treatment)), treatment_indices] = 1

hetero_data['patient', 'treated_with', 'drug'].edge_index = edge_index
hetero_data['patient', 'treated_with', 'drug'].edge_attr = edge_attr

In [37]:
print(hetero_data)

HeteroData(
  patient={ x=[164751, 24] },
  alteration={ x=[314702, 20] },
  cancer_type={ x=[854, 8] },
  anatomical_region={ x=[164, 1] },
  biological_process={ x=[28643, 1] },
  cell_line={ x=[1096, 1] },
  cellular_component={ x=[4177, 1] },
  drug={ x=[8646, 1] },
  exposure={ x=[151, 1] },
  gene={ x=[19209, 1] },
  molecular_function={ x=[11170, 1] },
  pathway={ x=[2499, 1] },
  phenotype={ x=[15312, 1] },
  (patient, has_alteration, alteration)={
    edge_index=[2, 849307],
    edge_attr=[849307, 6],
  },
  (patient, has_cancer_type, cancer_type)={
    edge_index=[2, 325723],
    edge_attr=[325723, 1],
  },
  (alteration, mutated_in_gene, gene)={
    edge_index=[2, 666003],
    edge_attr=[666003, 1],
  },
  (gene, interacts_with, biological_process)={
    edge_index=[2, 145020],
    edge_attr=[145020, 1],
  },
  (gene, interacts_with, cellular_component)={
    edge_index=[2, 83586],
    edge_attr=[83586, 1],
  },
  (gene, interacts_with, molecular_function)={
    edge_index=[

In [38]:
import torch

file_path = '/userfs/PyG/msk_cancer-kg.pt'  
torch.save(hetero_data, file_path)

In [39]:
print("Node types:", hetero_data.node_types)
print("Edge types:", hetero_data.edge_types)

Node types: ['patient', 'alteration', 'cancer_type', 'anatomical_region', 'biological_process', 'cell_line', 'cellular_component', 'drug', 'exposure', 'gene', 'molecular_function', 'pathway', 'phenotype']
Edge types: [('patient', 'has_alteration', 'alteration'), ('patient', 'has_cancer_type', 'cancer_type'), ('alteration', 'mutated_in_gene', 'gene'), ('gene', 'interacts_with', 'biological_process'), ('gene', 'interacts_with', 'cellular_component'), ('gene', 'interacts_with', 'molecular_function'), ('gene', 'interacts_with', 'exposure'), ('molecular_function', 'interacts_with', 'exposure'), ('cellular_component', 'interacts_with', 'exposure'), ('biological_process', 'interacts_with', 'exposure'), ('gene', 'interacts_with', 'pathway'), ('gene', 'associated_with', 'phenotype'), ('gene', 'in', 'cell_line'), ('gene', 'expressed_in', 'anatomical_region'), ('drug', 'associated_with', 'phenotype'), ('gene', 'is_a', 'drug'), ('drug', 'is_a', 'gene'), ('patient', 'treated_with', 'drug')]


In [40]:
import networkx as nx

# Initialize a new undirected graph
G = nx.Graph()

# Add nodes for each node type with a unique identifier
for node_type in hetero_data.node_types:
    # Add nodes to the graph, ensuring unique identifiers
    nodes = [(f"{node_type}_{i}", {"type": node_type}) for i in range(hetero_data[node_type].num_nodes)]
    G.add_nodes_from(nodes)

# Add edges for each edge type
for edge_type in hetero_data.edge_types:
    # Retrieve edge index tensor for the current edge type
    src_type, _, dst_type = edge_type
    edge_index = hetero_data[edge_type].edge_index.numpy()

    # Add edges to the graph, using the unique identifiers
    edges = [(f"{src_type}_{src}", f"{dst_type}_{dst}") for src, dst in edge_index.T]
    G.add_edges_from(edges)


In [41]:
nx.write_graphml(G, "msk_cancer-kg.graphml")

In [42]:
import networkx as nx

# Load the graph from a GraphML file
G = nx.read_graphml('msk_cancer-kg.graphml')

# Convert it to an edge list and save to a file
with open('edge_list.edg', 'w') as f:
    for edge in G.edges():
        f.write(f"{edge[0]} {edge[1]}\n")
