In [1]:
import os
import time
import pandas as pd
from typing import List, Tuple, Union
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import logging
import cProfile, pstats, io
import sys
import glob
import h5py
from deeprankcore.dataset import GraphDataset
from deeprankcore.domain import nodestorage as Nfeat
from deeprankcore.domain import edgestorage as Efeat

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: run identifiers, locations, experiment settings.
# ---------------------------------------------------------------------------
run_day = '230329'
run_day_data = '230329'
project_folder = '/projects/0/einf2380/'
project_folder_sample = '/home/cyulin/snellius_data_sample/'
data = 'pMHCI'
resolution = 'residue' # either 'residue' or 'atomic'
features = 'electrostatic'
protein_class = 'I'
target_data = 'BA'
resolution_data = 'residue' # either 'residue' or 'atomic'
cluster_dataset = None # 'cl_allele'# None # 'allele_type'
cluster_dataset_type = 'string' # None # 'string'
# train_clusters = [0, 1, 2, 3, 4, 7, 9]
# val_clusters = [5, 8]
test_clusters = ['C']

# Target/s (single definition of `target_dataset`; it used to be assigned twice)
target_group = 'target_values'
target_dataset = 'binary'
task = 'classif'

# Both roots share the same layout below them. Joining with os.path.join keeps the
# trailing '/' of the root folders from producing '//' in the resulting paths.
_features_subpath = os.path.join(
    'data', f'pMHC{protein_class}', 'features_output_folder', 'GNN',
    resolution_data, run_day_data)
folder_data = os.path.join(project_folder, _features_subpath)
# glob.glob returns a list: empty when the file is missing, one element otherwise.
input_data_path = glob.glob(os.path.join(folder_data, 'residue-3328373.hdf5'))
output_folder = os.path.join(project_folder_sample, _features_subpath)

# Loggers
_log = logging.getLogger('')
_log.setLevel(logging.INFO)

# Re-running this cell must not stack a second stdout handler on the root logger
# (every message would then be printed once per run), so reuse one if present.
sh = next(
    (handler for handler in _log.handlers
     if isinstance(handler, logging.StreamHandler)
     and getattr(handler, 'stream', None) is sys.stdout),
    None)
if sh is None:
    sh = logging.StreamHandler(sys.stdout)
    _log.addHandler(sh)
sh.setLevel(logging.INFO)

# Formatter reserved for the optional file handler below; it is not attached to `sh`.
formatter_fh = logging.Formatter('[%(asctime)s] - %(name)s - %(message)s',
                               datefmt='%a, %d %b %Y %H:%M:%S')
#fh = logging.FileHandler(os.path.join(exp_path, 'training.log'))
#fh.setLevel(logging.INFO)
#fh.setFormatter(formatter_fh)
#_log.addHandler(fh)
####################

In [3]:
# Features for which histograms are produced.
features_arr = ['distance']

#hdf5_pandas = os.path.join(output_folder, f'{resolution}_pandas.feather')
# Folder that receives the generated histogram images.
images_path = os.path.join(output_folder, 'images_new')
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race;
# it also fails loudly if the path exists but is not a directory.
os.makedirs(images_path, exist_ok=True)
    


In [4]:
def save_hist( # pylint: disable=too-many-arguments, too-many-branches, useless-suppression
        df,
        features: Union[str,List[str]],
        fname: str = 'features_hist.png',
        bins: Union[int,List[float],str] = 10,
        figsize: Tuple = (15, 15),
        log: bool = False
):
    """Save one histogram per matching column of ``df`` to a single image file.

    A column is selected when any of the requested feature names is a substring of
    the column name. Each selected column gets its own subplot, stacked vertically.

    Args:
        df: DataFrame whose cells are either scalars or ``numpy`` arrays. Array
            cells are concatenated into one flat array before plotting.
        features: Feature name, or a collection of feature names, to look for in
            the column names.
        fname: Path of the image file to write.
        bins: Passed through to ``Axes.hist``.
        figsize: Size of the figure, passed through to matplotlib.
        log: If True, plot the natural logarithm of the values. Zeros (whose log is
            ``-inf``) are plotted as 0. The mean and std shown in the axis label are
            always computed on the untransformed values.

    Raises:
        ValueError: If no column of ``df`` matches any of the requested features.
    """
    # Accept any collection of names, not only lists; a bare string is one feature.
    if isinstance(features, str):
        features = [features]

    # dict.fromkeys removes duplicates while keeping first-seen order, so a column
    # matched by several feature names is plotted only once.
    columns = list(dict.fromkeys(
        col for feat in features for col in df.columns.values.tolist() if feat in col))

    # Validate before doing any work or creating a figure.
    if not columns:
        raise ValueError("Please provide valid features names. They must be present in the current :class:`DeeprankDataset` children instance.")

    flat_values = {col: _flatten_column(df, col) for col in columns}

    # squeeze=False always yields a 2D array of axes, so one code path serves both
    # the single-feature and the multi-feature case.
    fig, axs = plt.subplots(len(columns), figsize=figsize, squeeze=False)
    try:
        for ax, col in zip(axs.ravel(), columns):
            raw = flat_values[col]
            mean = round(raw.mean(), 1)
            dev = round(raw.std(), 1)

            data = raw
            if log:
                # log(0) is handled explicitly on the next line, so silence only the
                # divide-by-zero warning; warnings for negative inputs stay visible.
                with np.errstate(divide='ignore'):
                    data = np.log(raw)
                data[data == -np.inf] = 0

            ax.hist(data, bins=bins)
            ax.set(xlabel=f'{col} (mean {mean}, std {dev})', ylabel='Count')

        fig.tight_layout()
        fig.savefig(fname)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)


def _flatten_column(df, column):
    """Return the values of ``df[column]`` as one array, concatenating array-valued cells."""
    values = df[column].values
    # The length guard keeps an empty frame from raising IndexError on values[0].
    if len(values) > 0 and isinstance(values[0], np.ndarray):
        return np.concatenate(values)
    return values

In [5]:
# NOTE(review): `hdf5_test` is a different file from the one loaded into `dataset`
# below -- confirm that later cells reading it inspect the intended file.
hdf5_test="train.hdf5"
#hdf5_files = glob.glob(os.path.join(output_folder, '*.hdf5'))

# `input_data_path` is a glob result, so it is an empty list when the file is missing.
# Fail here with a clear message instead of passing an empty list to GraphDataset.
if not input_data_path:
    raise FileNotFoundError(
        "No input HDF5 files were found; check the data folder and file name "
        "in the configuration cell.")

dataset = GraphDataset(
    hdf5_path = input_data_path,
    target = target_dataset
)
# Tabular view of the dataset, used for the un-standardized histograms.
df = dataset.hdf5_to_pandas()


Checking dataset Integrity...
Target classes set up to: [0, 1]
   ['/projects/0/einf2380//data/pMHCI/features_output_folder/GNN/residue/230329/residue-3328373.hdf5'] dataset                 : 100%|██████████| 1/1 [00:00<00:00, 332.91it/s, entry_name=residue-3328373.hdf5]


In [6]:
# One histogram image per requested feature, built from the raw (non-standardized) frame.
for feat in features_arr:
    hist_path = os.path.join(images_path, f'{feat}_nostandardize.png')
    save_hist(df, feat, hist_path, bins=10)
    _log.info('%s histogram saved', feat)

distance histogram saved


In [7]:
# Features that share the same settings: no transformation, standardization enabled.
_standardized_feature_names = [
    'bsa', 'res_depth', 'info_content', 'sasa', 'electrostatic', 'vanderwaals',
    'res_size', 'res_charge', 'hb_donors', 'hb_acceptors', 'hse',
    'irc_nonpolar_negative', 'irc_nonpolar_nonpolar', 'irc_nonpolar_polar',
    'irc_nonpolar_positive', 'irc_polar_polar', 'irc_polar_positive', 'irc_total',
    'irc_negative_positive', 'irc_positive_positive', 'irc_polar_negative',
    'irc_negative_negative', 'res_mass', 'res_pI', 'distance', 'pssm',
]

# Each feature gets its own settings dict, so editing one entry never affects another.
feat_notrans_dict = {
    name: {'Transformation': None, 'Standardization': True}
    for name in _standardized_feature_names
}

In [8]:
#hdf5_files = glob.glob(os.path.join(output_folder, '*.hdf5'))
# Build the dataset with the per-feature settings from `feat_notrans_dict`.
standardize_kwargs = {
    'hdf5_path': input_data_path,
    'target': target_dataset,
    'feat_trans_dict': feat_notrans_dict,
}
dataset_standardize = GraphDataset(**standardize_kwargs)


Checking dataset Integrity...
Target classes set up to: [0, 1]
   ['/projects/0/einf2380//data/pMHCI/features_output_folder/GNN/residue/230329/residue-3328373.hdf5'] dataset                 : 100%|██████████| 1/1 [00:00<00:00, 373.42it/s, entry_name=residue-3328373.hdf5]


In [None]:
#bsa = []
#for idx in range(len(dataset)):
    #x = dataset_standardize.get(idx,feat_notrans_dict)
    #bsa.append(x.x[:,0])
#df_standardize = dataset_standardize.hdf5_to_pandas()
#bsa = np.concatenate(bsa)

In [None]:
# NOTE(review): this opens `hdf5_test` ('train.hdf5'), not the file(s) in `input_data_path`
# that `dataset_standardize` was built from -- confirm it is the intended file.
with h5py.File(hdf5_test, 'r') as f5:
    # The first entry is used only to look up the shape of each stored feature.
    grp = f5[list(f5.keys())[0]]

    # Node features can be inspected the same way: read grp[f"{Nfeat.NODE}/{feat}"]
    # and take columns of `.x` instead of `.edge_attr`.

    # Running column index into `edge_attr`.
    # Assumes the columns follow the order of `test_edge_features` -- TODO confirm.
    tensor_idx = 0
    features_dict = {}
    test_edge_features = ['distance']  # edge features to check
    for feat in test_edge_features:
        # Only the shape is needed, so the stored values are not loaded into memory.
        feat_shape = grp[f"{Efeat.EDGE}/{feat}"].shape
        if len(feat_shape) == 1:  # features with only one channel
            arr = np.concatenate([
                dataset_standardize.get(entry_idx, feat_notrans_dict).edge_attr[:, tensor_idx]
                for entry_idx in range(len(dataset_standardize))
            ])
            features_dict[feat] = arr
            tensor_idx += 1

            # Statistics and label come from the feature being plotted, not from a
            # hard-coded key, so the cell stays correct when more features are added.
            mean = arr.mean()
            dev = arr.std()

            # plot histogram
            fig = plt.figure(figsize=(15, 15))
            ax = fig.add_subplot(111)
            ax.hist(arr, bins=10)
            ax.set(xlabel=f'{feat} (mean {mean}, std {dev})', ylabel='Count')
            fig.tight_layout()
            fig.savefig(os.path.join(images_path, f'{feat}_standardize.png'))
            plt.close(fig)
        elif len(feat_shape) > 1:
            # Multi-channel features are not plotted here; skip one column per channel
            # so the features that follow still read the right column.
            tensor_idx += feat_shape[1]