In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import preprocess1 as pp
import matplotlib.pyplot as plt

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('The code uses a GPU!')
else:
    device = torch.device('cpu')
    print('The code uses a CPU...')

The code uses a CPU...


In [4]:
task = 'regression'
dataset = 'AP_penta_fd1'
radius = 1
arr = [i for i in range(0,5,1)]
(dataset,
     N_fingerprints) = pp.create_datasets(task, dataset, radius, device, arr)

pentapepsmi0_w.txt
pentapepsmi1_w.txt
pentapepsmi2_w.txt
pentapepsmi3_w.txt
pentapepsmi4_w.txt


In [5]:
#dataset_train = datasets_train
#print('# of training data samples:', len(dataset_train))
#print('# of development data samples:', len(dataset_dev))
#print('# of test data samples:', len(dataset_test))

dim=50
layer_hidden=6
layer_output=6
batch_train=32
batch_test=32

In [6]:
class MolecularGraphNeuralNetwork(nn.Module):
    def __init__(self, N_fingerprints, dim, layer_hidden, layer_output):
        super(MolecularGraphNeuralNetwork, self).__init__()
        self.embed_fingerprint = nn.Embedding(N_fingerprints, dim)
        self.W_fingerprint = nn.ModuleList([nn.Linear(dim, dim)
                                            for _ in range(layer_hidden)])
        self.W_output = nn.ModuleList([nn.Linear(dim, dim)
                                       for _ in range(layer_output)])
        if task == 'classification':
            self.W_property = nn.Linear(dim, 2)
        if task == 'regression':
            self.W_property = nn.Linear(dim, 1)

    def pad(self, matrices, pad_value):
        """Pad the list of matrices
        with a pad_value (e.g., 0) for batch processing.
        For example, given a list of matrices [A, B, C],
        we obtain a new matrix [A00, 0B0, 00C],
        where 0 is the zero (i.e., pad value) matrix.
        """
        shapes = [m.shape for m in matrices]
        M, N = sum([s[0] for s in shapes]), sum([s[1] for s in shapes])
        zeros = torch.FloatTensor(np.zeros((M, N))).to(device)
        pad_matrices = pad_value + zeros
        i, j = 0, 0
        for k, matrix in enumerate(matrices):
            m, n = shapes[k]
            pad_matrices[i:i+m, j:j+n] = matrix
            i += m
            j += n
        return pad_matrices

    def update(self, matrix, vectors, layer):
        hidden_vectors = torch.relu(self.W_fingerprint[layer](vectors))
        return hidden_vectors + torch.matmul(matrix, hidden_vectors)

    def sum(self, vectors, axis):
        sum_vectors = [torch.sum(v, 0) for v in torch.split(vectors, axis)]
        return torch.stack(sum_vectors)

    def mean(self, vectors, axis):
        mean_vectors = [torch.mean(v, 0) for v in torch.split(vectors, axis)]
        return torch.stack(mean_vectors)

    def gnn(self, inputs):

        """Cat or pad each input data for batch processing."""
        fingerprints, adjacencies, molecular_sizes = inputs
        fingerprints = torch.cat(fingerprints)
        adjacencies = self.pad(adjacencies, 0)

        """GNN layer (update the fingerprint vectors)."""
        fingerprint_vectors = self.embed_fingerprint(fingerprints)
        for l in range(layer_hidden):
            hs = self.update(adjacencies, fingerprint_vectors, l)
            fingerprint_vectors = F.normalize(hs, 2, 1)  # normalize.

        """Molecular vector by sum or mean of the fingerprint vectors."""
        molecular_vectors = self.sum(fingerprint_vectors, molecular_sizes)
        # molecular_vectors = self.mean(fingerprint_vectors, molecular_sizes)

        return molecular_vectors

    def mlp(self, vectors):
        """Classifier or regressor based on multilayer perceptron."""
        for l in range(layer_output):
            vectors = torch.relu(self.W_output[l](vectors))
        outputs = self.W_property(vectors)
        return outputs

    def forward_classifier(self, data_batch, train):

        inputs = data_batch[:-1]
        correct_labels = torch.cat(data_batch[-1])

        if train:
            molecular_vectors = self.gnn(inputs)
            predicted_scores = self.mlp(molecular_vectors)
            loss = F.cross_entropy(predicted_scores, correct_labels)
            return loss
        else:
            with torch.no_grad():
                molecular_vectors = self.gnn(inputs)
                predicted_scores = self.mlp(molecular_vectors)
            predicted_scores = predicted_scores.to('cpu').data.numpy()
            predicted_scores = [s[1] for s in predicted_scores]
            correct_labels = correct_labels.to('cpu').data.numpy()
            return predicted_scores, correct_labels

    def forward_regressor(self, data_batch, train):

        inputs = data_batch[:-1]
        correct_values = torch.cat(data_batch[-1])

        if train:
            molecular_vectors = self.gnn(inputs)
            predicted_values = self.mlp(molecular_vectors)
            loss = F.mse_loss(predicted_values, correct_values)
            return loss
        else:
            with torch.no_grad():
                molecular_vectors = self.gnn(inputs)
                predicted_values = self.mlp(molecular_vectors)
            predicted_values = predicted_values.to('cpu').data.numpy()
            correct_values = correct_values.to('cpu').data.numpy()
            molecular_vectors = molecular_vectors.to('cpu').data.numpy()
            predicted_values = np.concatenate(predicted_values)
            correct_values = np.concatenate(correct_values)
            molecular_vectors = np.concatenate(molecular_vectors)
            return predicted_values, correct_values

In [7]:
torch.manual_seed(1234)
model = MolecularGraphNeuralNetwork(
        38, dim, layer_hidden, layer_output).to(device)

print('# of model parameters:',
      sum([np.prod(p.size()) for p in model.parameters()]))

# of model parameters: 32551


In [8]:
model.load_state_dict(torch.load('BestModel_sim_tri.pt'))
model.eval()

  model.load_state_dict(torch.load('BestModel_sim_tri.pt'))


MolecularGraphNeuralNetwork(
  (embed_fingerprint): Embedding(38, 50)
  (W_fingerprint): ModuleList(
    (0-5): 6 x Linear(in_features=50, out_features=50, bias=True)
  )
  (W_output): ModuleList(
    (0-5): 6 x Linear(in_features=50, out_features=50, bias=True)
  )
  (W_property): Linear(in_features=50, out_features=1, bias=True)
)

In [9]:
N = len(dataset[0])
D = []
for j in range(len(arr)):
    V = []
    for i in range(0, N, batch_test):
        data_batch = list(zip(*dataset[j][i:i+batch_test]))
        predicted_scores, correct_scores = model.forward_regressor(
                                           data_batch, train=False)
        V.append(np.abs(predicted_scores-correct_scores))
    D.append(V)

In [12]:
len(D)

5

In [13]:
diff = []
for i in range(len(D)):
    for j in range(len(D[i])):
        for k in range(len(D[i][j])):
            diff.append(D[i][j][k])

In [14]:
max(diff)

4.1542997

In [15]:
len(diff)

800000

In [19]:
for i in range(0,5,1):
    pep = pd.read_csv('../dataset/regression/AP_penta_fd1/pentapepslc'+str(i)+'_w.txt', sep=' ',header=None)
    for j in range(i*len(pep), (i+1)*len(pep), 1):
        pep.loc[1, j-(i*len(pep))]=diff[j]
    pep.to_csv('../dataset/regression/AP_penta_fd1/penta'+str(i)+'_pred.csv')

  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
  pep.loc[1, j-(i*len(pep))]=diff[j]
 

: 

In [18]:
pep[1]

0         0
1         0
2         0
3         0
4         0
         ..
159995    0
159996    0
159997    0
159998    0
159999    0
Name: 1, Length: 160000, dtype: int64

In [47]:
# Step 1: Create bins of size 0.2 for the 'AP' column
bin_size = 0.2
pep['bins'] = pd.cut(pep['AP'], bins=np.arange(pep['AP'].min(), pep['AP'].max() + bin_size, bin_size))

# Step 2: Determine the number of bins and peptides per bin
total_peptides = 3000
unique_bins = pep['bins'].nunique()
peptides_per_bin = total_peptides // unique_bins

# Step 3: Sample equal number of peptides from each bin
sampled_pep = pep.groupby('bins').apply(lambda x: x.sample(n=min(len(x), peptides_per_bin), random_state=1)).reset_index(drop=True)

# Step 4: Adjust the total sample size if necessary
remaining = total_peptides - len(sampled_pep)

# If there are still remaining peptides to be sampled due to rounding
if remaining > 0:
    extra_samples = pep[~pep.index.isin(sampled_pep.index)].sample(n=remaining, random_state=1)
    sampled_pep = pd.concat([sampled_pep, extra_samples])

# Display the sampled dataframe
print(sampled_pep)


         Pep        AP        bins
0       SSSS  1.177884  (1.0, 1.2]
1       SSSG  1.126320  (1.0, 1.2]
2       GGGG  1.181096  (1.0, 1.2]
3       SASG  1.198624  (1.0, 1.2]
4       SDDG  1.344497  (1.2, 1.4]
...      ...       ...         ...
96055   MANS  1.882964  (1.8, 2.0]
4860    AMDA  1.755749  (1.6, 1.8]
134153  TSQF  2.048654  (2.0, 2.2]
138934  WQHP  2.227512  (2.2, 2.4]
70251   GSMK  1.762201  (1.6, 1.8]

[3000 rows x 3 columns]


  sampled_pep = pep.groupby('bins').apply(lambda x: x.sample(n=min(len(x), peptides_per_bin), random_state=1)).reset_index(drop=True)
  sampled_pep = pep.groupby('bins').apply(lambda x: x.sample(n=min(len(x), peptides_per_bin), random_state=1)).reset_index(drop=True)


In [51]:
sampled_pep.to_csv('../dataset/regression/AP_tetra_fd/pred.csv')