<a href="https://colab.research.google.com/github/Aniket25042003/Aniket25042003/blob/main/DeepPPISP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

__init__.py (generator)

In [None]:
from .data_generator import dataSet

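1. What : Imports the `dataSet` class from the sibling `data_generator` module (a relative import inside the `generator` package).
2. Why : Re-exporting `dataSet` from the package's `__init__.py` makes the class available directly from the package, so callers do not have to reference the `data_generator` module themselves.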

data_generator.py

In [None]:
#-*- encoding:utf8 -*-



import os
import time
import pickle
import torch as t
import numpy as np
from torch.utils import data


#my lib
from utils.config import DefaultConfig



# Creating PyTorch class to custom datasets
class dataSet(data.Dataset):
    """Dataset yielding whole-protein and sliding-window features per sample.

    Each entry of ``protein_list`` is a 6-tuple
    ``(count, id_idx, ii, dset, protein_id, seq_length)``. Only ``id_idx``
    (index of the protein inside the loaded feature lists), ``ii`` (centre
    position of the sliding window, also the position of the label) and
    ``seq_length`` are used by this class.
    """

    # Width of one residue's feature vector for each feature type; used for
    # the one-hot encoding and for zero padding.
    _SEQ_DIM = 20
    _PSSM_DIM = 20
    _DSSP_DIM = 9

    def __init__(self,window_size,sequences_file=None,pssm_file=None, dssp_file=None, label_file=None, protein_list_file=None):
        """Load the pickled feature lists and the sample index.

        Args:
            window_size: Number of positions taken on each side of the centre
                position when building the local feature window.
            sequences_file: Iterable of paths to pickled sequence lists.
            pssm_file: Iterable of paths to pickled PSSM lists.
            dssp_file: Iterable of paths to pickled DSSP lists.
            label_file: Iterable of paths to pickled label lists.
            protein_list_file: Path to the pickled list of sample tuples.

        The lists loaded from several files are concatenated in the order the
        files are given, so ``id_idx`` values must refer to that order.
        """
        super(dataSet,self).__init__()

        self.all_sequences = self._load_pickled_lists(sequences_file)
        self.all_pssm = self._load_pickled_lists(pssm_file)
        self.all_dssp = self._load_pickled_lists(dssp_file)
        self.all_label = self._load_pickled_lists(label_file)

        # NOTE: pickle can execute arbitrary code while loading; only point
        # this class at cache files you produced yourself.
        with open(protein_list_file, "rb") as list_label:
            self.protein_list = pickle.load(list_label)

        self.Config = DefaultConfig()
        self.max_seq_len = self.Config.max_sequence_length
        self.window_size = window_size

    @staticmethod
    def _load_pickled_lists(paths):
        """Unpickle every file in *paths* and concatenate the loaded lists."""
        merged = []
        for path in paths:
            # NOTE: see the pickle trust remark in __init__.
            with open(path, "rb") as fp:
                merged.extend(pickle.load(fp))
        return merged

    @classmethod
    def _one_hot(cls, acid_idx):
        """Return a one-hot list of length ``_SEQ_DIM`` with ``acid_idx`` set."""
        row = [0] * cls._SEQ_DIM
        row[acid_idx] = 1
        return row

    def _padded(self, rows, width):
        """Return a new list with exactly ``max_seq_len`` rows.

        Rows beyond ``max_seq_len`` are dropped and missing rows are filled
        with zero vectors of length *width*. A copy is always made so the
        data held on the instance is never modified.
        """
        padded = list(rows[:self.max_seq_len])
        padded.extend([0] * width for _ in range(self.max_seq_len - len(padded)))
        return padded

    def _local_window(self, id_idx, win_start, win_end, seq_length):
        """Build the flat feature list for positions ``win_start..win_end``.

        Positions before the start of the protein or after
        ``seq_length - 1`` contribute an all-zero vector; every other
        position contributes one-hot sequence + PSSM + DSSP values.
        """
        zero_row = [0] * (self._SEQ_DIM + self._PSSM_DIM + self._DSSP_DIM)
        valid_end = min(win_end, seq_length - 1)

        window = []
        for pos in range(win_start, win_end + 1):
            if pos < 0 or pos > valid_end:
                window.extend(zero_row)
                continue

            window.extend(self._one_hot(self.all_sequences[id_idx][pos]))
            window.extend(self.all_pssm[id_idx][pos])
            try:
                dssp_val = self.all_dssp[id_idx][pos]
            except IndexError:
                # The DSSP list can be shorter than the sequence; treat the
                # missing position as all zeros. Only IndexError is handled
                # so that unrelated failures are not silently hidden.
                dssp_val = [0] * self._DSSP_DIM
            window.extend(dssp_val)
        return window

    def __getitem__(self,index):
        """Return the features and label of sample *index*.

        Returns:
            A tuple ``(all_seq_features, all_pssm_features,
            all_dssp_features, local_features, label)`` of NumPy arrays:
            the first three have shape ``(1, max_seq_len, width)`` with
            widths 20, 20 and 9; ``local_features`` is one-dimensional and
            holds ``2 * window_size + 1`` concatenated per-position vectors;
            ``label`` is the float32 label of the centre position.
        """
        _, id_idx, ii, _, _, seq_length = self.protein_list[index]
        id_idx = int(id_idx)
        seq_length = int(seq_length)
        win_start = ii - self.window_size
        win_end = ii + self.window_size
        # Middle of the window, i.e. the centre position itself.
        label_idx = (win_start + win_end) // 2

        # Whole-protein features, truncated / zero padded to max_seq_len rows.
        one_hot_rows = [self._one_hot(acid_idx)
                        for acid_idx in self.all_sequences[id_idx][:self.max_seq_len]]
        all_seq_features = self._padded(one_hot_rows, self._SEQ_DIM)
        all_pssm_features = self._padded(self.all_pssm[id_idx], self._PSSM_DIM)
        all_dssp_features = self._padded(self.all_dssp[id_idx], self._DSSP_DIM)

        # Features of the residues around the centre position.
        local_features = self._local_window(id_idx, win_start, win_end, seq_length)

        label = np.array(self.all_label[id_idx][label_idx],dtype=np.float32)

        # Stack the row lists into 2-D arrays and prepend a singleton axis.
        all_seq_features = np.stack(all_seq_features)[np.newaxis,:,:]
        all_pssm_features = np.stack(all_pssm_features)[np.newaxis,:,:]
        all_dssp_features = np.stack(all_dssp_features)[np.newaxis,:,:]
        local_features = np.stack(local_features)

        return all_seq_features,all_pssm_features,all_dssp_features,local_features,label

    def __len__(self):
        """Return the number of entries in ``protein_list``."""
        return len(self.protein_list)

1. What : The 'dataSet' class in 'data_generator.py' is designed to load and process protein sequence data along with associated features (PSSM, DSSP) and labels. It provides functionality to retrieve specific items from the dataset, preprocess the data, and return it as NumPy arrays
2. Why : This custom dataset class can be used in conjunction with PyTorch's data loading utilities to efficiently handle and iterate over protein sequence data during training or evaluation of a machine learning model

__init__.py (config)

In [None]:
from utils import config as Configs

1. What : Imports a module named 'config' from the 'utils' package
2. Why : This allows the code to access the classes, functions, or variables defined in that module

config.py

In [None]:
#-*- encoding:utf8 -*-

# Creating DefaulfConfig class that inherits from the object class
class DefaultConfig(object):
    """Default settings shared by the data generator, the model and training.

    All values are plain class attributes, so they can be read from the class
    or from an instance and overridden per instance. ``cnn_chanel`` is not
    defined here; ``DeepPPI`` assigns it on its config object at construction
    time when a ratio is supplied.
    """

    # --- amino-acid encoding -------------------------------------------
    # Zero template with one slot per amino-acid letter.
    acid_one_hot = [0] * 20
    # Maps each amino-acid letter to its position in the one-hot vector.
    acid_idex = dict(zip("ACDEFGHIKLMNPQRSTVWY", range(20)))

    # --- cache locations -----------------------------------------------
    # Root against which the cache directories below are formatted.
    BASE_PATH = "../../"
    sequence_path = "{0}/data_cache/sequence_data".format(BASE_PATH)
    pssm_path = "{0}/data_cache/pssm_data".format(BASE_PATH)
    dssp_path = "{0}/data_cache/dssp_data".format(BASE_PATH)

    # --- input geometry ------------------------------------------------
    max_sequence_length = 500  # proteins are truncated / padded to this length
    windows_size = 3           # sliding-window size

    # Feature width per residue for each feature type.
    seq_dim = 20
    pssm_dim = 20
    dssp_dim = 9

    # --- model and training --------------------------------------------
    batch_size = 32
    kernels = [13, 15, 17]     # convolution kernel sizes
    dropout = 0.2              # dropout rate
    splite_rate = 0.9          # fraction of the training list used for training

1. What : 'DefaultConfig' class contains various attributes that define configuration settings for a protein sequence analysis task
2. Why : These configuration settings provide default values for various parameters and paths used in the protein sequence analysis task. By modifying these attributes, the behavior of the analysis can be adjusted to suit specific requirements or experimental conditions

__init__.py (deep_ppi)

In [None]:
from .deep_ppi import DeepPPI

1. What : Imports the `DeepPPI` class from the sibling `deep_ppi` module (a relative import inside the package).
2. Why : Re-exporting `DeepPPI` from the package's `__init__.py` makes the model class available directly from the package.

BasicModule.py

In [None]:
#-*- encoding:utf-8 -*-

import torch as t
import time

# Creating a class BasicModule that inherits from torch.nn.Module
class BasicModule(t.nn.Module):
    """``torch.nn.Module`` with convenience helpers to save and load weights."""

    def __init__(self):
        super(BasicModule,self).__init__() # Initialise the parent torch.nn.Module
        self.model_name = str(type(self))

    def load(self,path,map_location=None):
        """Load a state dict previously written by :meth:`save`.

        Args:
            path: File to read the state dict from.
            map_location: Optional device remapping forwarded to
                ``torch.load`` (for example ``"cpu"`` to load a checkpoint
                written on a GPU machine). ``None`` keeps ``torch.load``'s
                default behaviour.
        """
        self.load_state_dict(t.load(path, map_location=map_location))

    def save(self,name=None):
        """Write the module's state dict to *name* and return the file name.

        When *name* is omitted, a timestamp-based name of the form
        ``yymmdd_HH:MM:SS.pth`` is generated in the current directory.
        """
        if name is None:
            name = time.strftime("%y%m%d_%H:%M:%S.pth")

        t.save(self.state_dict(),name)
        return name

1. What : This BasicModule class provides basic functionality for loading and saving models.
2. Why : It can be inherited and extended to create custom neural network modules with specific architectures and functionalities

deep_ppi.py

In [None]:
#-*- encoding:utf8 -*-

import os
import time
import sys

import torch as t
from torch import nn
from torch.autograd import Variable


#from basic_module import BasicModule
from models.BasicModule import BasicModule

sys.path.append("../") # Appends the parent directory to the module search path (presumably so the `utils` package imported below resolves)
from utils.config import DefaultConfig
configs = DefaultConfig() # Module-level configuration instance read (and modified) by ConvsLayer and DeepPPI below

# Creates a ConvsLayer (Convolutional layer module) class that inherits from the BasicModule
class ConvsLayer(BasicModule):
    """Three parallel convolution branches applied to the same input.

    Each branch is ``Conv2d -> activation -> MaxPool2d``. The branches differ
    in kernel height (taken from ``configs.kernels``) and the first branch
    uses ``PReLU`` while the other two use ``ReLU``. All sizes are read from
    the module-level ``configs`` object at construction time.
    """

    def __init__(self,):

        super(ConvsLayer,self).__init__()

        self.kernels = configs.kernels
        n_filters = configs.cnn_chanel
        pool_height = configs.max_sequence_length
        # Every kernel spans the full feature width (sequence + DSSP + PSSM).
        feature_width = configs.seq_dim + configs.dssp_dim + configs.pssm_dim

        # (attribute name / conv module name, pooling module name, activation)
        # The activation is registered under the name "ReLU" in every branch.
        branch_specs = (
            ("conv1", "pooling1", nn.PReLU),
            ("conv2", "pooling2", nn.ReLU),
            ("conv3", "pooling3", nn.ReLU),
        )
        for position, (conv_name, pool_name, activation) in enumerate(branch_specs):
            kernel_height = self.kernels[position]
            branch = nn.Sequential()
            branch.add_module(
                conv_name,
                nn.Conv2d(
                    1,
                    n_filters,
                    # Pad along the sequence axis only.
                    padding=((kernel_height - 1) // 2, 0),
                    kernel_size=(kernel_height, feature_width),
                ),
            )
            branch.add_module("ReLU", activation())
            branch.add_module(
                pool_name,
                nn.MaxPool2d(kernel_size=(pool_height, 1), stride=1),
            )
            setattr(self, conv_name, branch)

    def forward(self,x):
        """Run *x* through the three branches and return one flat vector per sample."""
        branch_outputs = (self.conv1(x), self.conv2(x), self.conv3(x))
        # Concatenate the branch outputs along the channel dimension (dim 1).
        merged = t.cat(branch_outputs, 1)
        dims = merged.data.shape
        # Collapse channel and spatial dimensions into a single feature axis.
        return merged.view(dims[0], dims[1] * dims[2] * dims[3])

# Creates a class DeepPPI that inherits from the BasicModule class
class DeepPPI(BasicModule):
    """Network combining whole-protein (global) and window (local) features.

    The one-hot sequence goes through a dense embedding, is concatenated with
    the DSSP and PSSM features and fed to ``ConvsLayer``. The flattened
    convolution output is concatenated with the local window features and
    passed through two dense layers and a sigmoid output layer.
    """

    def __init__(self,class_nums,window_size,ratio=None):
        """Build the network.

        Args:
            class_nums: Size of the output layer.
            window_size: Number of positions on each side of the centre
                position in ``local_features``.
            ratio: Optional pair ``(a, b)``. When given, the number of
                convolution channels per branch is set to
                ``local_dim * a // (b * 3)`` and stored on the shared
                ``configs`` object.

        Raises:
            AttributeError: If *ratio* is not given and ``configs.cnn_chanel``
                has not been set beforehand.

        Note:
            This constructor writes ``kernels``, ``dropout`` and (with
            *ratio*) ``cnn_chanel`` on the module-level ``configs`` object,
            which ``ConvsLayer`` reads.
        """
        super(DeepPPI,self).__init__()
        configs.kernels = [13, 15, 17]
        self.dropout = configs.dropout = 0.2

        residue_dim = configs.pssm_dim + configs.dssp_dim + configs.seq_dim
        local_dim = (window_size*2+1) * residue_dim
        if ratio:
            configs.cnn_chanel = (local_dim*int(ratio[0]))//(int(ratio[1])*3)
        elif not hasattr(configs, "cnn_chanel"):
            # Fail before allocating the large embedding layer, and say why.
            raise AttributeError(
                "configs.cnn_chanel is not set: pass ratio=(global, local) to "
                "DeepPPI or assign configs.cnn_chanel before constructing it")
        input_dim = configs.cnn_chanel*3 + local_dim

        # Dense embedding over the flattened one-hot sequence.
        flat_seq_dim = configs.seq_dim*configs.max_sequence_length
        self.seq_layers = nn.Sequential()
        self.seq_layers.add_module("seq_embedding_layer",
        nn.Linear(flat_seq_dim,flat_seq_dim))
        self.seq_layers.add_module("seq_embedding_ReLU",
        nn.ReLU())

        # Convolution branches over the concatenated global features.
        self.multi_CNN = nn.Sequential()
        self.multi_CNN.add_module("layer_convs",
                               ConvsLayer())

        # Dense head over convolution output + local window features.
        self.DNN1 = nn.Sequential()
        self.DNN1.add_module("DNN_layer1",
                            nn.Linear(input_dim,1024))
        self.DNN1.add_module("ReLU1",
                            nn.ReLU())
        #self.dropout_layer = nn.Dropout(self.dropout)
        self.DNN2 = nn.Sequential()
        self.DNN2.add_module("DNN_layer2",
                            nn.Linear(1024,256))
        self.DNN2.add_module("ReLU2",
                            nn.ReLU())

        self.outLayer = nn.Sequential(
            nn.Linear(256, class_nums),
            nn.Sigmoid())

    def forward(self,seq,dssp,pssm,local_features):
        """Return the sigmoid output for a batch.

        *seq*, *dssp* and *pssm* must be 4-D tensors that can be concatenated
        along their last dimension; *local_features* must be 2-D so it can be
        concatenated with the flattened convolution output along dim 1.
        """
        shapes = seq.shape
        # Flatten, embed, then restore the original 4-D layout.
        features = seq.view(shapes[0],shapes[1]*shapes[2]*shapes[3])
        features = self.seq_layers(features)
        features = features.view(shapes[0],shapes[1],shapes[2],shapes[3])

        # Join the three feature types along the feature-width axis.
        features = t.cat((features,dssp,pssm),3)
        features = self.multi_CNN(features)
        # Append the local window features to the global representation.
        features = t.cat((features, local_features), 1)
        features = self.DNN1(features)
        #features =self.dropout_layer(features)
        features = self.DNN2(features)
        features = self.outLayer(features)

        return features

1. What : The 'ConvsLayer' class contains three parallel sequential branches (conv1, conv2, and conv3), each consisting of a convolutional module, an activation function, and a max pooling module. In the forward method, it takes an input tensor x, applies each of the three branches to it, concatenates their outputs along the channel dimension, and flattens the result. The 'DeepPPI' class contains several sequential containers (seq_layers, multi_CNN, DNN1, DNN2, and outLayer) to process the different types of input features. In the forward method, it takes four input tensors (seq, dssp, pssm, and local_features) and passes them through the model's layers to produce a final output tensor representing the predicted PPI.
2. Why : The purpose of these classes is to define the architecture and forward pass of the deep learning model used for PPI prediction. The ConvsLayer class provides the multi-kernel convolutional feature extractor over the whole-protein features, while the DeepPPI class combines it with the sequence embedding, the local window features, and fully connected layers to form the complete prediction model.

train.py

In [None]:
#-*- encoding:utf8 -*-

import os
import time


import pickle
import numpy as np
import torch
from torch.optim import lr_scheduler
from torch.nn.init import xavier_normal,xavier_normal_
from torch import nn
import torch.utils.data.sampler as sampler


from utils.config import DefaultConfig
from models.deep_ppi import DeepPPI
from generator import data_generator


from evaluation import compute_roc, compute_aupr, compute_mcc, micro_score,acc_score, compute_performance

# Shared configuration instance; train() reads splite_rate and demo() reads batch_size from it.
configs = DefaultConfig()
# Decision threshold applied to the model output in train_epoch(); train() overwrites it
# with the t_max of the best validation epoch.
THREADHOLD = 0.2

class AverageMeter(object):
    """
    Tracks the latest value of a quantity together with its weighted running mean.
    Copied from: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the running sum, the count and the mean."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record *val* as the latest value, weighted as *n* observations."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def weight_init(m):
    """Re-initialise the weight of Conv2d and Linear modules with Xavier-normal values.

    Intended for ``model.apply(weight_init)``; modules of any other type, and
    biases, are left untouched.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        xavier_normal_(m.weight.data)


def train_epoch(model, loader, optimizer, epoch, all_epochs, print_freq=100):
    """Train *model* for one pass over *loader*.

    Args:
        model: Module called as ``model(seq, dssp, pssm, local)``.
        loader: Iterable of ``(seq, pssm, dssp, local, label)`` batches.
        optimizer: Optimizer stepping the model's parameters.
        epoch: Zero-based index of the current epoch (used for logging only).
        all_epochs: Total number of epochs (used for logging only).
        print_freq: A progress line is printed every *print_freq* batches.

    Returns:
        ``(average batch time in seconds, average loss)``; the loss average
        is weighted by batch size.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()

    # Model on train mode
    model.train()

    # Inputs follow the same rule train() uses for the model: CUDA if available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    end = time.time()
    for batch_idx, (seq_data, pssm_data, dssp_data, local_data, label) in enumerate(loader):
        # Move the batch to the target device as float tensors.
        # `non_blocking` replaces the old `async` argument, which is a
        # reserved word (and therefore a SyntaxError) on Python 3.7+.
        seq_var, pssm_var, dssp_var, local_var, target_var = (
            tensor.to(device, non_blocking=True).float()
            for tensor in (seq_data, pssm_data, dssp_data, local_data, label))

        # compute output
        output = model(seq_var, dssp_var, pssm_var, local_var)
        shapes = output.data.shape
        output = output.view(shapes[0]*shapes[1])
        # The loss already lives on the inputs' device; forcing .cuda() here
        # would crash on CPU-only hosts.
        loss = torch.nn.functional.binary_cross_entropy(output, target_var)

        # measure accuracy and record loss
        batch_size = label.size(0)
        pred_out = output.ge(THREADHOLD)
        MiP, MiR, MiF, PNum, RNum = micro_score(pred_out.data.cpu().numpy(),
                                                target_var.data.cpu().numpy())
        losses.update(loss.item(), batch_size)

        # compute gradient and do the optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print stats
        if batch_idx % print_freq == 0:
            # Labels name the micro_score outputs they print (the previous
            # f_max/p_max/r_max/t_max labels did not match the values).
            res = '\t'.join([
                'Epoch: [%d/%d]' % (epoch + 1, all_epochs),
                'Iter: [%d/%d]' % (batch_idx + 1, len(loader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                'MiP:%.6f' % (MiP),
                'MiR:%.6f' % (MiR),
                'MiF:%.6f' % (MiF),
                'PNum:%.2f' % (PNum)])
            print(res)

    return batch_time.avg, losses.avg


def eval_epoch(model, loader, print_freq=10, is_test=True):
    """Evaluate *model* on *loader* and compute the summary metrics.

    Args:
        model: Module called as ``model(seq, dssp, pssm, local)``.
        loader: Iterable of ``(seq, pssm, dssp, local, label)`` batches.
        print_freq: A progress line is printed every *print_freq* batches.
        is_test: Selects the ``'Test'`` or ``'Valid'`` prefix of the
            progress lines; it has no other effect.

    Returns:
        ``(avg_batch_time, avg_loss, acc, f_max, p_max, r_max, auc, aupr,
        t_max, mcc)`` where the metrics come from the ``evaluation`` helpers
        applied to the predictions and labels of the whole loader.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()

    # Model on eval mode
    model.eval()

    # Inputs follow the same rule train() uses for the model: CUDA if available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    all_trues = []
    all_preds = []
    end = time.time()
    for batch_idx, (seq_data, pssm_data, dssp_data, local_data, label) in enumerate(loader):

        # No gradients are needed for evaluation; keeping the forward pass
        # inside no_grad avoids building an autograd graph for every batch.
        with torch.no_grad():
            # `non_blocking` replaces the old `async` argument, which is a
            # reserved word (and therefore a SyntaxError) on Python 3.7+.
            seq_var, pssm_var, dssp_var, local_var, target_var = (
                tensor.to(device, non_blocking=True).float()
                for tensor in (seq_data, pssm_data, dssp_data, local_data, label))

            # compute output
            output = model(seq_var, dssp_var, pssm_var, local_var)
            shapes = output.data.shape
            output = output.view(shapes[0]*shapes[1])

            # The loss already lives on the inputs' device; forcing .cuda()
            # here would crash on CPU-only hosts.
            loss = torch.nn.functional.binary_cross_entropy(output, target_var)

        # record loss
        batch_size = label.size(0)
        losses.update(loss.item(), batch_size)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print stats
        if batch_idx % print_freq == 0:
            res = '\t'.join([
                'Test' if is_test else 'Valid',
                'Iter: [%d/%d]' % (batch_idx + 1, len(loader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
            ])
            print(res)
        all_trues.append(label.numpy())
        all_preds.append(output.data.cpu().numpy())

    all_trues = np.concatenate(all_trues, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    auc = compute_roc(all_preds, all_trues)
    aupr = compute_aupr(all_preds, all_trues)
    f_max, p_max, r_max, t_max, predictions_max = compute_performance(all_preds,all_trues)
    acc_val = acc_score(predictions_max,all_trues)
    mcc = compute_mcc(predictions_max, all_trues)
    return batch_time.avg, losses.avg, acc_val, f_max, p_max, r_max, auc, aupr,t_max, mcc


def train(class_tag,model, train_data_set, save, n_epochs=3,
          batch_size=64, lr=0.001, wd=0.0001, momentum=0.9, seed=None, num=1,
          train_file=None):
    """Train *model* with a random train/validation split and early stopping.

    Args:
        class_tag: Unused; kept for interface compatibility.
        model: Network to train; moved to CUDA when available.
        train_data_set: Dataset both loaders draw from.
        save: Existing directory receiving ``DeepPPI_results.csv`` and
            ``DeepPPI_model.dat``.
        n_epochs: Maximum number of epochs.
        batch_size: Batch size of both loaders.
        lr: Learning rate passed to Adam.
        wd: Unused; kept for interface compatibility.
        momentum: Unused; kept for interface compatibility.
        seed: When given, seeds torch and numpy so that the initial state and
            the train/validation split are reproducible.
        num: Unused; kept for interface compatibility.
        train_file: Pickled list of dataset indices to split into training
            and validation parts.

    The first ``configs.splite_rate`` fraction of the shuffled indices is
    used for training and the rest for validation. After every epoch the
    validation metrics are printed and appended to the CSV file; the weights
    are saved whenever the validation F-value improves, and training stops
    after five consecutive epochs without improvement. The module-level
    ``THREADHOLD`` is updated to the threshold of the best epoch.
    """
    global THREADHOLD
    if seed is not None:
        torch.manual_seed(seed)
        # The split below is drawn with numpy, so seed it as well.
        np.random.seed(seed)

    # split data
    # NOTE: pickle can execute arbitrary code while loading; only use
    # trusted index files.
    with open(train_file,"rb") as fp:
        train_list = pickle.load(fp)

    samples_num = len(train_list)
    split_num = int(configs.splite_rate * samples_num)
    data_index = train_list
    np.random.shuffle(data_index)
    train_index = data_index[:split_num]
    eval_index = data_index[split_num:]
    train_samples = sampler.SubsetRandomSampler(train_index)
    eval_samples = sampler.SubsetRandomSampler(eval_index)

    # Data loaders
    train_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size,
                                               sampler=train_samples, pin_memory=(torch.cuda.is_available()),
                                               num_workers=5, drop_last=False)
    valid_loader = torch.utils.data.DataLoader(train_data_set, batch_size=batch_size,
                                              sampler=eval_samples, pin_memory=(torch.cuda.is_available()),
                                               num_workers=5, drop_last=False)
    # Model on cuda
    if torch.cuda.is_available():
        model = model.cuda()

    # Wrap model for multi-GPUs, if necessary
    model_wrapper = model

    # Optimizer (honours the `lr` argument; its default matches the value
    # that used to be hard-coded here)
    optimizer = torch.optim.Adam(model_wrapper.parameters(), lr=lr)

    # Start log
    with open(os.path.join(save, 'DeepPPI_results.csv'), 'w') as f:
        f.write('epoch,loss,acc,F_value, precision,recall,auc,aupr,mcc,threadhold\n')

        # Train model
        best_F = 0
        count = 0  # consecutive epochs without a new best F-value
        for epoch in range(n_epochs):
            _, train_loss = train_epoch(
                model=model_wrapper,
                loader=train_loader,
                optimizer=optimizer,
                epoch=epoch,
                all_epochs=n_epochs,
            )
            _, valid_loss, acc, f_max, p_max, r_max, auc, aupr,t_max,mcc= eval_epoch(
                model=model_wrapper,
                loader=valid_loader,
                is_test=False
            )

            print(
            'epoch:%03d,valid_loss:%0.5f\nacc:%0.6f,F_value:%0.6f, precision:%0.6f,recall:%0.6f,auc:%0.6f,aupr:%0.6f,mcc:%0.6f,threadhold:%0.6f\n' % (
                (epoch + 1), valid_loss, acc, f_max, p_max, r_max,auc, aupr,mcc,t_max))
            if f_max > best_F:
                count = 0
                best_F = f_max
                THREADHOLD = t_max
                print("new best F_value:{0}(threadhold:{1})".format(f_max, THREADHOLD))
                torch.save(model.state_dict(), os.path.join(save, 'DeepPPI_model.dat'))
            else:
                count += 1

            # Log results before the early-stopping check so the last epoch
            # is not missing from the CSV.
            f.write('%03d,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f\n' % (
                (epoch + 1), valid_loss, acc, f_max, p_max, r_max, auc, aupr,mcc,t_max))

            if count>=5:
                return None



def demo(train_data, save=None, train_num=1,
         ratio=None, window_size=3, splite_rate=0.1, efficient=True,
         epochs=10, seed=None, pretrained_result=None):
    """Build the training dataset and a fresh DeepPPI model, then train it.

    Args:
        train_data: Sequence of dataset keys (e.g. "dset186"); each key selects
            the cached sequence, DSSP, PSSM and label pickles under data_cache/.
            It is iterated several times, so it must not be a one-shot iterator.
        save: Forwarded to train() as ``save``.
        train_num: Forwarded to train() as ``num``.
        ratio: Forwarded to the DeepPPI constructor.
        window_size: Forwarded to both the dataset and the DeepPPI constructor.
        splite_rate: Accepted but not used by this function.
        efficient: Accepted but not used by this function.
        epochs: Forwarded to train() as ``n_epochs``.
        seed: Forwarded to train().
        pretrained_result: Accepted but not used by this function.
    """
    def cache_paths(template):
        # One cached pickle path per dataset key.
        return [template.format(key) for key in train_data]

    sequence_paths = cache_paths('data_cache/{0}_sequence_data.pkl')
    dssp_paths = cache_paths('data_cache/{0}_dssp_data.pkl')
    pssm_paths = cache_paths('data_cache/{0}_pssm_data.pkl')
    label_paths = cache_paths('data_cache/{0}_label.pkl')

    batch_size = configs.batch_size

    # The dataset is built over the full protein list; the training subset
    # is selected later through the list file handed to train().
    train_dataSet = data_generator.dataSet(
        window_size, sequence_paths, pssm_paths, dssp_paths, label_paths,
        'data_cache/all_dset_list.pkl')

    # Single-output model with freshly initialised weights.
    model = DeepPPI(1, window_size, ratio)
    model.apply(weight_init)

    train(train_data, model=model, train_data_set=train_dataSet, save=save,
          n_epochs=epochs, batch_size=batch_size, seed=seed, num=train_num,
          train_file='data_cache/training_list.pkl')
    print('Done!')

if __name__ == '__main__':
    # Script entry point: run demo() four times (train_num 1..4) on the same
    # dataset list, output directory and global:local ratio.
    train_data = ["dset186", "dset164", "dset72"]
    ratio_list = (2, 1)  # global:local
    path_dir = "./checkpoints/deep_ppi_saved_models"

    if not os.path.exists(path_dir):
        os.makedirs(path_dir)

    for run_index in (1, 2, 3, 4):
        demo(train_data, path_dir, run_index, ratio_list)


1. What & Why: This script trains the deep learning model for protein-protein interaction prediction on the combined training datasets listed in `train_data` (dset186, dset164 and dset72). It calls `demo` four times (`train_num` = 1 to 4), each time with the same global-to-local feature ratio, `ratio_list = (2, 1)`. Whenever the validation F-value improves, the model weights are saved as `DeepPPI_model.dat` in the specified directory (`path_dir`).

predict.py

In [None]:
#-*- encoding:utf8 -*-

import os
import time


import pickle
import numpy as np
import torch
from torch.optim import lr_scheduler
from torch.nn.init import xavier_normal,xavier_normal_
from torch import nn
import torch.utils.data.sampler as sampler


from utils.config import DefaultConfig
from models.deep_ppi import DeepPPI
from generator import data_generator


from evaluation import compute_roc, compute_aupr, compute_mcc, micro_score,acc_score, compute_performance

# Shared configuration object; predict() below reads configs.batch_size from it.
configs = DefaultConfig()
# Module-level threshold constant (0.2). Neither test() nor predict() in this script reads it.
THREADHOLD = 0.2

# Defines a class named AverageMeter that is used for computing and storing the average and current values. It includes methods for resetting the values and updating them
class AverageMeter(object):
    """
    Tracks the most recent value of a quantity together with its running,
    sample-weighted average.
    Copied from: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the average, the weighted sum and the sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the newest value, weighted as ``n`` samples, and refresh the average."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

# Defines a function named weight_init that initializes the weights of the model using the Xavier normal initialization method. It takes a model m as input and initializes the weights of convolutional and linear layers
def weight_init(m):
    """Re-initialise the weights of ``m`` in place with Xavier-normal values.

    Only ``nn.Conv2d`` and ``nn.Linear`` modules are touched; any other module
    is left unchanged. Biases are not modified. Intended to be passed to
    ``Module.apply``.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        xavier_normal_(m.weight.data)

# Defines a function named test that performs testing on the trained model
def test(model, loader,path_dir,pre_num=1):
    """Run ``model`` over ``loader``, print evaluation metrics and pickle the raw outputs.

    Args:
        model: Module invoked as ``model(seq, dssp, pssm, local)``.
        loader: Iterable yielding ``(seq_data, pssm_data, dssp_data,
            local_data, label)`` tensor batches.
        path_dir: Directory in which ``test_predict.pkl`` is written; the
            pickle holds a dict with keys ``"pred"`` and ``"label"``.
        pre_num: Unused; kept so existing callers keep working.

    Returns:
        None. Metrics are printed, predictions are written to disk.
    """
    # Model on eval mode
    model.eval()

    # Feed inputs on the device the model actually lives on, instead of
    # assuming CUDA whenever it happens to be available.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    result = []
    all_trues = []

    # The whole loop, forward pass included, runs without autograd bookkeeping.
    with torch.no_grad():
        for seq_data, pssm_data, dssp_data, local_data, label in loader:
            # `non_blocking` replaces the old `async` keyword argument, which
            # is a reserved word (and therefore a SyntaxError) on Python >= 3.7.
            seq_var = seq_data.to(device, non_blocking=True).float()
            pssm_var = pssm_data.to(device, non_blocking=True).float()
            dssp_var = dssp_data.to(device, non_blocking=True).float()
            local_var = local_data.to(device, non_blocking=True).float()

            # compute output (note the argument order: seq, dssp, pssm, local)
            output = model(seq_var, dssp_var, pssm_var, local_var)
            result.append(output.reshape(-1).cpu().numpy())
            all_trues.append(label.numpy())

    # Stack per-batch arrays into one prediction array and one label array
    all_trues = np.concatenate(all_trues, axis=0)
    all_preds = np.concatenate(result, axis=0)

    # Threshold-free metrics first, then the metrics at the best-F threshold
    auc = compute_roc(all_preds, all_trues)
    aupr = compute_aupr(all_preds, all_trues)
    f_max, p_max, r_max, t_max, predictions_max = compute_performance(all_preds,all_trues)
    acc = acc_score(predictions_max,all_trues)
    mcc = compute_mcc(predictions_max, all_trues)

    print(
        'acc:%0.6f,F_value:%0.6f, precision:%0.6f,recall:%0.6f,auc:%0.6f,aupr:%0.6f,mcc:%0.6f,threadhold:%0.6f\n' % (
        acc, f_max, p_max, r_max,auc, aupr,mcc,t_max))

    predict_result = {"pred": all_preds, "label": all_trues}
    result_file = "{0}/test_predict.pkl".format(path_dir)
    with open(result_file,"wb") as fp:
        pickle.dump(predict_result,fp)

# Defines a function named predict that performs prediction using a trained model
def predict(model_file,test_data,window_size,path_dir,ratio):
    """Load a trained DeepPPI checkpoint and evaluate it on the test split.

    Args:
        model_file: Path to a state dict written with ``torch.save``.
        test_data: Sequence of dataset keys (e.g. "dset186"); each key selects
            the cached sequence, DSSP, PSSM and label pickles under data_cache/.
        window_size: Forwarded to both the dataset and the DeepPPI constructor.
        path_dir: Output directory handed to ``test``.
        ratio: Forwarded to the DeepPPI constructor.
    """
    test_sequences_file = ['data_cache/{0}_sequence_data.pkl'.format(key) for key in test_data]
    test_dssp_file = ['data_cache/{0}_dssp_data.pkl'.format(key) for key in test_data]
    test_pssm_file = ['data_cache/{0}_pssm_data.pkl'.format(key) for key in test_data]
    test_label_file = ['data_cache/{0}_label.pkl'.format(key) for key in test_data]
    all_list_file = 'data_cache/all_dset_list.pkl'
    test_list_file = 'data_cache/testing_list.pkl'

    print(test_list_file)
    # parameters
    batch_size = configs.batch_size

    # Run on the GPU only when one is actually present, so the script also
    # works on CPU-only machines.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Datasets: built over the full protein list, then restricted to the
    # indices stored in the testing list.
    test_dataSet = data_generator.dataSet(window_size, test_sequences_file, test_pssm_file, test_dssp_file, test_label_file,
                                             all_list_file)
    with open(test_list_file,"rb") as fp:
        test_list = pickle.load(fp)

    # NOTE: the random sampler shuffles the test indices, so the saved
    # predictions are not in testing-list order (predictions and labels are
    # still aligned with each other).
    test_samples = sampler.SubsetRandomSampler(test_list)
    test_loader = torch.utils.data.DataLoader(test_dataSet, batch_size=batch_size,
                                              sampler=test_samples, pin_memory=use_cuda,
                                              num_workers=5, drop_last=False)

    # Models
    class_nums = 1
    model = DeepPPI(class_nums,window_size,ratio)
    # map_location lets a checkpoint saved from a GPU load on a CPU-only host.
    model.load_state_dict(torch.load(model_file, map_location=device))
    model = model.to(device)
    test(model, test_loader,path_dir)

    print('Done!')



if __name__ == '__main__':
    # Script entry point: run predict() with the checkpoint stored in path_dir.
    datas = ["dset186", "dset164", "dset72"]
    window_size = 3
    ratio_list = (2, 1)  # global:local
    path_dir = "./checkpoints/deep_ppi_saved_models"

    if not os.path.exists(path_dir):
        os.makedirs(path_dir)

    model_file = "{0}/DeepPPI_model.dat".format(path_dir)
    predict(model_file=model_file, test_data=datas, window_size=window_size,
            path_dir=path_dir, ratio=ratio_list)

1. What & Why : The code loads a trained DeepPPI model, performs testing on the test data, and evaluates the model's performance by calculating various evaluation metrics. It also saves the predicted results for further analysis.

evaluation.py

In [None]:
#-*- encoding:utf8 -*-
#!/usr/bin/env python
from __future__ import print_function
from __future__ import absolute_import

import os
import numpy as np
from collections import deque
import pickle

from sklearn.metrics import roc_curve, auc, matthews_corrcoef, precision_recall_curve,accuracy_score

# ROC - Receiver Operating Characteristic
# AUC - Area Under the Curve

def compute_roc(preds, labels):
    """Return the area under the ROC curve of ``preds`` scored against ``labels``.

    Both arrays are flattened before being handed to scikit-learn.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(labels.flatten(), preds.flatten())
    return auc(false_pos_rate, true_pos_rate)


def compute_aupr(preds, labels):
    """Return the area under the precision-recall curve of ``preds`` against ``labels``.

    Both arrays are flattened before being handed to scikit-learn; the area
    is integrated with recall on the x-axis and precision on the y-axis.
    """
    precision, recall, _thresholds = precision_recall_curve(labels.flatten(), preds.flatten())
    return auc(recall, precision)


def compute_mcc(preds, labels, threshold=0.5):
    """Return the Matthews correlation coefficient of thresholded predictions.

    Args:
        preds: NumPy array of scores, or of already-binarized 0/1 predictions.
        labels: NumPy array of ground-truth labels with the same number of
            elements as ``preds``.
        threshold: Entries of ``preds`` strictly greater than this value count
            as positive. Previously this argument was accepted but ignored;
            0/1 inputs are unaffected by any threshold in [0, 1), so existing
            callers that pass binarized predictions get the same result.

    Returns:
        The coefficient computed by ``sklearn.metrics.matthews_corrcoef`` on
        the flattened arrays.
    """
    # matthews_corrcoef expects class labels, not continuous scores, so
    # binarize first (same `>` convention as compute_performance).
    binary_preds = (preds > threshold).astype(np.float64)
    labels = labels.astype(np.float64)
    return matthews_corrcoef(labels.flatten(), binary_preds.flatten())


def compute_performance(preds, labels):
    """Scan thresholds 0.01 .. 0.99 and return the one with the highest F1 score.

    A prediction is positive when its score is strictly greater than the
    threshold. Ties keep the lowest threshold, because a later threshold only
    replaces the best one when its F1 is strictly larger.

    Args:
        preds: NumPy array of scores.
        labels: NumPy array of 0/1 ground-truth labels, same shape as ``preds``.

    Returns:
        Tuple ``(f_max, p_max, r_max, t_max, predictions_max)``: best F1, its
        precision, recall and threshold, and the int32 predictions at that
        threshold. If no threshold produces a non-zero true-positive count
        the result is ``(0, 0, 0, 0, None)``.
    """
    best_f, best_p, best_r, best_t = 0, 0, 0, 0
    best_predictions = None

    for step in range(1, 100):
        threshold = step / 100.0
        predictions = (preds > threshold).astype(np.int32)

        true_pos = np.sum(predictions * labels)
        # Without true positives neither precision nor recall is scored.
        if true_pos == 0:
            continue
        false_pos = np.sum(predictions) - true_pos
        false_neg = np.sum(labels) - true_pos

        p = true_pos / (1.0 * (true_pos + false_pos))
        r = true_pos / (1.0 * (true_pos + false_neg))
        if not (p + r > 0):
            continue

        f = 2 * p * r / (p + r)
        if best_f < f:
            best_f, best_p, best_r = f, p, r
            best_t = threshold
            best_predictions = predictions

    return best_f, best_p, best_r, best_t, best_predictions

# Calulates various evaluation metrics for a binary classification task
def micro_score(output, label):
    """Return micro-averaged precision, recall and F1 plus positive rates.

    Args:
        output: NumPy array of 0/1 predictions.
        label: NumPy array of 0/1 ground-truth labels, same shape as ``output``.

    Returns:
        Tuple ``(MiP, MiR, MiF, predicted_rate, label_rate)`` where the two
        rates are the sums of ``output`` and ``label`` divided by
        ``len(output)``. ``MiF`` is 0 when there are no true positives.
    """
    n_items = len(output)
    predicted_pos = np.sum(output)
    actual_pos = np.sum(label)
    true_pos = float(np.sum(output * label))

    # The 1e-12 floor avoids dividing by zero when nothing is predicted/labelled positive.
    micro_p = true_pos / max(predicted_pos, 1e-12)
    micro_r = true_pos / max(actual_pos, 1e-12)
    micro_f = 0 if true_pos == 0 else 2 * micro_p * micro_r / (micro_p + micro_r)

    return micro_p, micro_r, micro_f, predicted_pos / n_items, actual_pos / n_items

def acc_score(output,label):
    """Return scikit-learn's accuracy score of ``output`` against ``label``, both flattened."""
    y_true = label.flatten()
    y_pred = output.flatten()
    return accuracy_score(y_true, y_pred)

# Running this module as a script does nothing; it only provides the metric helpers above.
if __name__ == '__main__':
    pass

1. What : The code provides utility functions for evaluating the performance of binary classification models by computing metrics such as ROC-AUC, precision-recall AUC, Matthews correlation coefficient, F1 score, micro-precision, micro-recall, micro-F1 score, and accuracy.
2. Why : These metrics are commonly used to assess the quality of classification models.