In [1]:
import pandas as pd
import numpy as np
import torch
import pickle
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold



In [2]:
def load_network(file_path):
    """
    Read a headerless, tab-separated edge list and derive its node table.
    :param file_path: Full pathname of the network file; the two columns are read as 'source' and 'target'
    :return: (net, nodes) -- net (class: pandas.DataFrame) holds one row per edge with columns
             'source' and 'target'; nodes (class: pandas.DataFrame) has a single 'nodes' column
             listing every distinct endpoint once, renumbered from 0
    """
    edges = pd.read_table(file_path, sep='\t', header=None, index_col=None,
                          names=['source', 'target'])
    # Stack all sources on top of all targets, then keep the first occurrence of each name.
    endpoints = pd.concat([edges['source'], edges['target']], ignore_index=True)
    unique_nodes = (
        endpoints.to_frame(name='nodes')
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return edges, unique_nodes

In [3]:
def build_customized_feature_matrix(feat_files, network_file, feature_tag="GE:"):
    """
    Build one feature matrix from one or more TSV feature files.

    Each file is read with its first column as the row index, the tables are
    placed side by side with pandas.concat (axis=1), and only the columns whose
    name contains ``feature_tag`` are returned.

    :param feat_files: List of feature file paths (TSV format)
    :param network_file: Path to the network file. It is not read by this
                         function; the parameter is kept so existing calls keep working.
    :param feature_tag: Substring a column name must contain to be kept.
                        Defaults to "GE:", the tag this function selected before
                        the parameter existed.
    :return: pandas.DataFrame holding the matching columns in their original order
             (zero columns if nothing matches)
    :raises ValueError: if feat_files yields no files
    """
    # Read every feature file; the first column becomes the row index.
    dataframes = [pd.read_csv(file, sep='\t', index_col=0) for file in feat_files]
    if not dataframes:
        raise ValueError("feat_files must contain at least one feature file")

    # Combine into one DataFrame, one block of columns per input file.
    full_data = pd.concat(dataframes, axis=1)

    # Keep only the columns carrying the requested tag (substring match).
    dataset_names = [col for col in full_data.columns if feature_tag in str(col)]
    return full_data[dataset_names]

In [4]:
def create_edge_index(network_file, net_features):
    """
    Translate the named edges of a network file into integer node indices for PyG.
    :param network_file: Path to the network file (read through load_network)
    :param net_features: Feature matrix whose row index holds the node names;
                         a node's id is its row position in this matrix
    :return: torch.LongTensor with two rows (source ids, target ids); every kept
             edge appears once in each direction
    """
    edges, _ = load_network(network_file)

    # Lookup table: node name -> row position in the feature matrix.
    id_lookup = pd.DataFrame({'name': net_features.index.values.tolist(),
                              'id': np.arange(net_features.shape[0])})

    # Two left joins attach the id of the source (id_x) and of the target (id_y).
    with_source = edges.merge(id_lookup, how='left', left_on='source', right_on='name')
    with_both = with_source.merge(id_lookup, how='left', left_on='target', right_on='name')

    # Edges with an endpoint missing from the feature matrix have a NaN id and are dropped.
    pairs = with_both[['id_x', 'id_y']].dropna().astype(int).values
    # Append the reversed pairs so the graph is undirected.
    symmetric = np.concatenate((pairs, pairs[:, ::-1]), axis=0)

    return torch.LongTensor(symmetric.T)

In [5]:
def generate_5CV_set(drivers,nondrivers,randseed):
    """
    Produce stratified 5-fold cross-validation splits of the labelled genes.
    :param drivers: List of canonical driver genes (positive samples, label 1)
    :param nondrivers: List of nondriver genes (negative samples, label 0)
    :param randseed: Random seed passed to StratifiedKFold
    :return: Dictionary with, for each fold n in 1..5, the keys 'train_n', 'test_n'
             (gene names) and 'train_label_n', 'test_label_n' (matching labels)
    """
    samples = drivers + nondrivers
    targets = np.hstack(([1]*len(drivers), [0]*len(nondrivers)))
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=randseed)

    folds = {}
    # The splitter yields positions into `samples`; map them back to names and labels.
    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(samples, targets), start=1):
        folds['train_%d' % fold_no] = [samples[pos] for pos in train_idx]
        folds['test_%d' % fold_no] = [samples[pos] for pos in test_idx]
        folds['train_label_%d' % fold_no] = [targets[pos] for pos in train_idx]
        folds['test_label_%d' % fold_no] = [targets[pos] for pos in test_idx]
    return folds

In [6]:
# Feature files to combine into the feature matrix. Only the gene-expression
# matrix is active; the commented-out list is a three-file alternative
# (mutation, expression, methylation) kept for reference.
# feat_file_lst = ['./preprocess_data/gene_mutation/MF_BLCA_mutation_matrix.tsv',
#                  './preprocess_data/gene_expression/GE_BLCA_expression_matrix.tsv',
#                  './preprocess_data/DNA_methylation/METH_BLCA_methylation_RATIO_mean.tsv']
feat_file_lst = ['./preprocess_data/gene_expression/GE_expression_matrix.tsv']

In [7]:
# Edge list of the network; create_edge_index reads it through load_network
# as a headerless, tab-separated source/target table.
network_file = './data/PathNet/PathNet.txt'

In [8]:
# Short names for the feature types in use; the commented-out list pairs with
# the three-file variant of feat_file_lst.
# NOTE(review): feat_name_lst is not referenced by any other cell shown here.
# feat_name_lst = ['mut','exp','methy']
feat_name_lst = ['exp']

In [9]:
# Concatenate the feature files into one matrix and keep only the "GE:" columns.
# network_file is passed to satisfy the signature; the function does not read it.
net_features = build_customized_feature_matrix(feat_file_lst, network_file)

In [10]:
# Preview the feature matrix: one row per gene, one "GE:" column per dataset.
print(net_features)

            GE: KIRC  GE: BRCA  GE: READ  GE: PRAD  GE: STAD  GE: HNSC  \
Unnamed: 0                                                               
STIM1       0.021826  0.078583  0.081009  0.024927  0.058403  0.054881   
TRPC1       0.035037  0.175396  0.127587  0.160771  0.025139  0.017627   
NOS1        0.317315  0.366842  0.362653  0.327616  0.386143  0.240406   
ATP2B4      0.058874  0.121747  0.254115  0.198962  0.000587  0.046899   
ABCC9       0.006580  0.252722  0.109012  0.148716  0.036229  0.008346   
...              ...       ...       ...       ...       ...       ...   
GPR153      0.021059  0.048254  0.010328  0.087304  0.032409  0.133440   
HDC         0.122908  0.159479  0.135696  0.030383  0.155934  0.015314   
CSMD1       0.131125  0.319950  0.000000  0.214714  0.129068  0.184721   
BHLHE22     0.001159  0.139090  0.046438  0.125946  0.038114  0.048209   
PROKR1      0.109166  0.559162  0.262671  0.278055  0.000000  0.000000   

            GE: LUAD  GE: THCA  GE: B

In [11]:
# Entries of the dataset dictionary assembled by this notebook:
#   feature      - the gene feature matrix, as a float tensor
#   node_name    - gene names, in the same row order as `feature`
#   edge_index   - graph edges for training the model
#   feature_name - column names of the feature matrix
#   label        - 1 for genes in the driver list, 0 for every other gene
#   mask         - True for genes in the driver or nondriver list; used when
#                  training a single model without cross-validation
#   split_set    - ten rounds of randomly generated 5-fold CV splits
# Only the first two entries are created here; later cells add the rest.
dataset = {
    'feature': torch.FloatTensor(np.array(net_features)),
    'node_name': net_features.index.values.tolist(),
}

In [12]:
# Create edge_index from the edges in the network file.
# create_edge_index already returns a LongTensor laid out as (2, num_edges):
# row 0 holds source ids, row 1 holds target ids. Store it as returned;
# transposing it again would give a (num_edges, 2) tensor instead.
edge_index = create_edge_index(network_file, net_features)
assert edge_index.dim() == 2 and edge_index.shape[0] == 2, edge_index.shape
dataset['edge_index'] = edge_index
dataset['feature_name'] = net_features.columns.values.tolist()

In [13]:
# Gene lists used for labelling. Each file is read as a single headerless,
# tab-separated column and flattened to a plain Python list.
d_lst = (
    pd.read_table("./data/796_drivers.txt", sep='\t', header=None, names=['driver'])
    .loc[:, 'driver']
    .tolist()
)
nd_lst = (
    pd.read_table("./data/2187_nondrivers.txt", sep='\t', header=None, names=['nondriver'])
    .loc[:, 'nondriver']
    .tolist()
)

In [14]:
# Per-node label and training mask, in dataset['node_name'] order.
# Sets make each membership test O(1) instead of scanning the gene lists.
driver_set = set(d_lst)
nondriver_set = set(nd_lst)
# 1 for genes in the driver list, 0 for everything else (nondrivers and unlabelled genes).
labels = [1 if g in driver_set else 0 for g in dataset['node_name']]
# True only for genes that appear in either list, i.e. genes with a known label.
mask = [g in driver_set or g in nondriver_set for g in dataset['node_name']]

In [15]:
dataset['label'] = torch.FloatTensor(labels)
dataset['mask'] = np.array(mask)

# Generate 10 rounds of 5-fold CV splits over the labelled genes found in the network.
# Membership is tested once per node for every fold, so the gene collections are
# turned into sets first; list lookups here would scan thousands of names per node.
node_names = dataset['node_name']
driver_set, nondriver_set = set(d_lst), set(nd_lst)
d_in_net = [g for g in node_names if g in driver_set]
nd_in_net = [g for g in node_names if g in nondriver_set]

k_sets_net = dict()
for k in range(10):
    # Seed formula left untouched so the generated splits stay the same as before.
    randseed = (k+1) % 100 + (k+1) * 5
    cv_splits = generate_5CV_set(d_in_net, nd_in_net, randseed)
    fold_masks = []
    for i in range(1, 6):
        train_genes = set(cv_splits[f'train_{i}'])
        test_genes = set(cv_splits[f'test_{i}'])
        # One (train_mask, test_mask) pair of boolean arrays per fold, aligned with node_names.
        fold_masks.append((np.array([g in train_genes for g in node_names]),
                           np.array([g in test_genes for g in node_names])))
    k_sets_net[k] = fold_masks

dataset['split_set'] = k_sets_net

# Save the dataset as pickle file, which can be used for training HGDC
# NOTE(review): absolute, machine-specific output path; a later cell reads the same location.
with open("/home/abhijeet/Desktop/Cancer_Mutation_Prediction/GNN_Transformers/Preprocessed_Dataset/Path_Net/GE_PathNet_dataset_ten_5CV.pkl", 'wb') as f:
    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

In [16]:
# Dump the whole dataset dictionary for a visual check.
# NOTE(review): this prints every entry in full, including the complete gene-name list.
print(dataset)

{'feature': tensor([[0.0218, 0.0786, 0.0810,  ..., 0.0304, 0.0910, 0.0173],
        [0.0350, 0.1754, 0.1276,  ..., 0.0662, 0.3309, 0.0425],
        [0.3173, 0.3668, 0.3627,  ..., 0.2257, 0.3389, 0.5127],
        ...,
        [0.1311, 0.3200, 0.0000,  ..., 0.0416, 0.3418, 0.2629],
        [0.0012, 0.1391, 0.0464,  ..., 0.0920, 0.0548, 0.1508],
        [0.1092, 0.5592, 0.2627,  ..., 0.0452, 0.0000, 0.0000]]), 'node_name': ['STIM1', 'TRPC1', 'NOS1', 'ATP2B4', 'ABCC9', 'KCNJ11', 'HADHA', 'HADHB', 'GTF2E2', 'GTF2E1', 'GTF2A2', 'GTF2A1', 'VANGL2', 'SCRIB', 'DVL2', 'PARD6A', 'GRB2', 'GAB1', 'EGFR', 'EGF', 'PIK3CA', 'PIK3R1', 'SRC', 'PXN', 'CLPS', 'PNLIP', 'GNB1', 'GNGT1', 'CNGA1', 'CNGB1', 'FNTB', 'FNTA', 'RAD50', 'MRE11', 'KPNA2', 'NBN', 'KAT5', 'ATM', 'BAZ1B', 'SMARCA5', 'UBE2I', 'SUMO1', 'UBE2V2', 'UBE2N', 'BARD1', 'BRCA1', 'SUMO2', 'MDC1', 'HERC2', 'ATRIP', 'ATR', 'TIMELESS', 'TIPIN', 'PPP4R2', 'PPP4C', 'RAD51C', 'XRCC3', 'EME2', 'MUS81', 'EME1', 'BRCA2', 'RAD51', 'ERCC4', 'ERCC1', 'XRCC1

In [17]:
# List which entries the dataset dictionary holds.
print(dataset.keys())

dict_keys(['feature', 'node_name', 'edge_index', 'feature_name', 'label', 'mask', 'split_set'])


In [18]:
# Sanity check: labels and features are both built per gene, so the two lengths should agree.
print(len(dataset['label']), len(dataset['feature']))

13627 13627


In [19]:
# Dimensions of the feature tensor built from net_features (rows, columns).
print(dataset['feature'].shape)

torch.Size([13627, 16])


In [23]:
import pickle

def load_pickle(file_path):
    """Open ``file_path`` in binary mode and return the object unpickled from it.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(file_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Compare the 'label' entry of two saved datasets and report where they differ.

def convert_to_tensor(labels):
    """Return ``labels`` as a torch tensor, or None when they cannot be compared numerically.

    Tensors are returned unchanged, numeric lists/tuples are converted, and
    string lists or any other type produce a diagnostic print and None.
    """
    if isinstance(labels, torch.Tensor):
        return labels  # Already a tensor
    elif isinstance(labels, (list, tuple)) and all(isinstance(i, (int, float)) for i in labels):
        return torch.tensor(labels)  # Convert numerical lists to tensors
    elif isinstance(labels, list) and all(isinstance(i, str) for i in labels):
        print("Labels contain strings. Unique values:", set(labels))  # Debugging info
        return None  # Return None since we can't directly compare
    else:
        print("Unexpected label format:", type(labels))
        return None

# Load both pickle files
file1 = "/home/abhijeet/Desktop/Cancer_Mutation_Prediction/GNN_Transformers/Preprocessed_Dataset/Path_Net/MF_PathNet_dataset_ten_5CV.pkl"
file2 = "/home/abhijeet/Desktop/Cancer_Mutation_Prediction/GNN_Transformers/Preprocessed_Dataset/Path_Net/GE_PathNet_dataset_ten_5CV.pkl"

data1 = load_pickle(file1)
data2 = load_pickle(file2)

# Extract labels
labels1 = data1.get('label', None)
labels2 = data2.get('label', None)

# Ensure both files contain the 'label' key
if labels1 is None or labels2 is None:
    print("One or both files do not contain the 'label' key.")
else:
    labels1 = convert_to_tensor(labels1)
    labels2 = convert_to_tensor(labels2)

    # Ensure both labels are tensors before comparison
    if labels1 is not None and labels2 is not None:
        if torch.equal(labels1, labels2):
            print("The labels match in both files.")
        elif labels1.shape != labels2.shape:
            # An element-wise comparison is undefined (and can raise) when the
            # shapes differ, so report the shapes instead of mismatch indices.
            print("The labels do not match.")
            print(f"Label shapes differ: {tuple(labels1.shape)} vs {tuple(labels2.shape)}")
        else:
            print("The labels do not match.")

            # Find mismatched indices
            mismatches = (labels1 != labels2).nonzero(as_tuple=True)[0]
            print(f"Number of mismatches: {len(mismatches)}")
            if len(mismatches) > 0:
                print(f"Mismatched indices (first 10): {mismatches[:10]}")  # Print first 10 mismatches

The labels match in both files.


In [24]:
# Walk the element-wise comparison and print the entry at every position that differs;
# nothing is printed when all labels agree.
for same in labels1 == labels2:
    if not same:
        print(same)