In [None]:
import math
import os
import random
from collections import defaultdict
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import torch
import torch.nn as nn
import torchvision
# NOTE(review): private pandas API that is not used in the visible cells — confirm it is still needed.
from pandas.core.dtypes.cast import maybe_box_datetimelike
from torch.nn.modules.container import Sequential
from torch.utils.data import random_split, TensorDataset, Dataset, DataLoader
from torchmetrics import PearsonCorrCoef, R2Score, MeanSquaredError
from torchvision import datasets
from tqdm import tqdm

from Model_and_trainer import CustomStructureDataset, First_CNN, ProgressPlotter, BaseTrainer


def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same value is handed, in order, to PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), NumPy's global generator and Python's
    built-in ``random`` module.

    Args:
        seed: Seed value passed unchanged to each library.
    """
    seeders = (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed)
    for apply_seed in seeders:
        apply_seed(seed)


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Using device:', device)

In [None]:
# Constants shared by both test sets.
# NOTE(review): mean/std are presumably the target statistics used during training — confirm.
mean = 7.3169
std = 2.15
batch_size = 1


def make_test_loader(csv_path, str_dir):
    """Create an evaluation dataset and its DataLoader.

    The dataset is switched out of training mode, has normalisation enabled
    with the notebook-level ``mean``/``std`` and carries no transform. The
    loader iterates it in order (no shuffling) with ``batch_size`` samples per
    batch and two worker processes.

    Args:
        csv_path: Path of the CSV file handed to ``CustomStructureDataset``.
        str_dir: Directory passed as ``str_dir`` to ``CustomStructureDataset``.

    Returns:
        A ``(dataset, loader)`` tuple.
    """
    dataset = CustomStructureDataset(csv_path, str_dir=str_dir)
    dataset.train = False
    dataset.normalize = True
    dataset.mean = mean
    dataset.std = std
    dataset.transform = None
    loader = DataLoader(dataset, shuffle=False, batch_size=batch_size, num_workers=2)
    return dataset, loader


test1_dataset, test1_loader = make_test_loader('Data/test_1.csv', 'Data/Dataset_t1')
test2_dataset, test2_loader = make_test_loader('Data/test_2.csv', 'Data/Dataset_t2')

In [None]:
# Load the fully pickled network (an nn.Module, not a state dict) saved by the training run.
# SECURITY: torch.load unpickles arbitrary Python objects — only load checkpoints you trust.
# NOTE(review): PyTorch >= 2.6 defaults to weights_only=True, which rejects fully pickled
# modules; pass weights_only=False there once the checkpoint is known to be trusted.
# map_location lets a checkpoint that was saved on a GPU be opened on a CPU-only machine.
model = torch.load('Result Model/best_model.pt', map_location=device)
model = model.double()
model = model.to(device)
# The notebook only runs inference: put dropout / batch-norm layers (if any) into
# evaluation behaviour. Harmless if the trainer's prediction helper does this itself.
model.eval()

In [None]:
# Evaluate the loaded model on test set 1.
# Only the trainer's prediction helper is used in this notebook, so the test loader is
# supplied for both dataloader arguments.
trainer = BaseTrainer(model=model, train_dataloader=test1_loader, test_dataloader=test1_loader)
trainer.mean, trainer.std = mean, std
y_test_pred, y_test_true = trainer.get_predictions(model=model, dl=test1_loader)

# Metric objects first, then the values computed from them.
pearson = PearsonCorrCoef()
mse = MeanSquaredError()
corr_coef = pearson(y_test_pred, y_test_true)
rmse = torch.sqrt(mse(y_test_pred, y_test_true))

print(
    'test 1',
    f'Pearson_corr: {round(corr_coef.item(), 2)}',
    f'RMSE: {round(rmse.item(), 3)}',
    sep='\n',
)

In [None]:
# Tabulate predictions against ground truth for test set 1.
# The explicit tensor -> NumPy conversion also works for CUDA or grad-tracking tensors,
# and ravel() guarantees exactly one value per row.
corr_df = pd.DataFrame({
    'Pred_pKD': torch.as_tensor(y_test_pred).detach().cpu().numpy().ravel(),
    'True_pKD': torch.as_tensor(y_test_true).detach().cpu().numpy().ravel(),
})
# Lift pandas' display truncation so complete tables are shown from here on.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Values are rounded to three decimals before the correlation is computed and plotted.
corr_df = corr_df.round(3)

# scipy.stats is imported explicitly with the other imports: a bare `import scipy`
# does not guarantee that the `stats` submodule is loaded.
pers = scipy.stats.pearsonr(corr_df['Pred_pKD'].values, corr_df['True_pKD'].values)
print(f'Pearson for test 1: {pers}')

In [None]:
# Predicted vs. true pKD for test set 1: scatter with regression fit plus marginal distributions.
# seaborn's set_context only applies keys that belong to its context definition (font and
# line sizes), so a 'figure.dpi' entry passed there is silently dropped. The resolution is
# therefore requested through matplotlib, scoped to this figure only.
sns.set_context(rc={'font.size': 12})
with plt.rc_context({'figure.dpi': 500}):
    # NOTE: despite its name, `fig` holds the seaborn JointGrid returned by jointplot.
    fig = sns.jointplot(data=corr_df, x='True_pKD', y='Pred_pKD', palette='Set2',
                        ylim=(2, 14), xlim=(2, 14), kind="reg")

In [None]:
# Evaluate the loaded model on test set 2, reusing the trainer created for test set 1.
y_test_pred, y_test_true = trainer.get_predictions(model=model, dl=test2_loader)

# Fresh metric objects, then the values computed from them.
pearson = PearsonCorrCoef()
mse = MeanSquaredError()
corr_coef = pearson(y_test_pred, y_test_true)
rmse = torch.sqrt(mse(y_test_pred, y_test_true))

print(
    'test 2',
    f'Pearson_corr: {round(corr_coef.item(), 2)}',
    f'RMSE: {round(rmse.item(), 3)}',
    sep='\n',
)

In [None]:
# Tabulate predictions against ground truth for test set 2.
# The explicit tensor -> NumPy conversion also works for CUDA or grad-tracking tensors,
# and ravel() guarantees exactly one value per row.
corr_df = pd.DataFrame({
    'Pred_pKD': torch.as_tensor(y_test_pred).detach().cpu().numpy().ravel(),
    'True_pKD': torch.as_tensor(y_test_true).detach().cpu().numpy().ravel(),
})
# Lift pandas' display truncation so complete tables are shown from here on.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Values are rounded to three decimals before the correlation is computed and plotted.
corr_df = corr_df.round(3)

# scipy.stats is imported explicitly with the other imports: a bare `import scipy`
# does not guarantee that the `stats` submodule is loaded.
pers = scipy.stats.pearsonr(corr_df['Pred_pKD'].values, corr_df['True_pKD'].values)
print(f'Pearson for test 2: {pers}')

In [None]:
# Predicted vs. true pKD for test set 2: scatter with regression fit plus marginal distributions.
# seaborn's set_context only applies keys that belong to its context definition (font and
# line sizes), so a 'figure.dpi' entry passed there is silently dropped. The resolution is
# therefore requested through matplotlib, scoped to this figure only.
sns.set_context(rc={'font.size': 12})
with plt.rc_context({'figure.dpi': 500}):
    # NOTE: despite its name, `fig` holds the seaborn JointGrid returned by jointplot.
    fig = sns.jointplot(data=corr_df, x='True_pKD', y='Pred_pKD', palette='Set2',
                        ylim=(2, 14), xlim=(2, 14), kind="reg")

In [None]:
# Feature importance: labels for the 10 input channels, listed by channel index.
# "pr1" / "pr2" presumably denote the two binding partners — TODO confirm.
#   0: hydrogen-bond acceptors pr1      + hydrogen-bond donors pr2
#   1: hydrogen-bond donors pr1         + hydrogen-bond acceptors pr2
#   2: hydrogen-bond acceptors pr1      + weak hydrogen-bond donors pr2
#   3: weak hydrogen-bond donors pr1    + hydrogen-bond acceptors pr2
#   4: positively charged atoms pr1     + negatively charged atoms pr2
#   5: negatively charged atoms pr1     + positively charged atoms pr2
#   6: hydrophobic atoms pr1            + hydrophobic atoms pr2
#   7: carbonyl carbons pr1             + carbonyl carbons pr2
#   8: carbonyl oxygens pr1             + carbonyl oxygens pr2
#   9: aromatic atoms pr1               + aromatic atoms pr2
# NOTE(review): channels 7 and 8 are described as "carbonyl" here but labelled
# "Carboxy" below — confirm which term is intended.

channels_names = [
    'HB_Ac1+Don2',
    'HB_Ac2+Don1',
    'HB_Ac1+Weak_Don2',
    'HB_Ac2+Weak_Don1',
    'Pos1+Neg2',
    'Pos2+Neg1',
    'Hph1+Hph2',
    'Carboxy_C1+Carboxy_C2',
    'Carboxy_O1+Carboxy_O2',
    'Arom1+Arom2',
]

In [None]:
# Weights of the first module in conv_stack, arranged with one column per input channel.
# The network loaded earlier in this notebook is bound to `model`; the name `best_model2`
# is never defined here and raised a NameError on a fresh kernel.
w0 = model.conv_stack[0].weight
# The weight tensor is 5-D; the permutation moves axis 1 to the end so that the reshape
# yields one column per channel.
# NOTE(review): assumes axis 1 is the input-channel axis (PyTorch conv layout) — confirm.
w0 = pd.DataFrame(
    np.transpose(w0.cpu().detach().numpy(), [0, 4, 2, 3, 1]).reshape((-1, len(channels_names))),
    columns=channels_names,
)

In [None]:
# Per channel: share of the extracted weights whose magnitude exceeds 0.001,
# shown from the largest share to the smallest.
diff = w0.abs().gt(0.001).mean()
diff.sort_values(ascending=False)

In [None]:
# Interquartile range (75th minus 25th percentile) of the weights per channel, widest first.
perc_diff = (w0.quantile(0.75) - w0.quantile(0.25)).sort_values(ascending=False)

# Box plot of the weight spread of every input channel (outlier markers hidden).
fig, ax = plt.subplots(figsize=(7, 6), dpi=300)

sns.boxplot(data=w0, fliersize=0, orient='h', ax=ax)
ax.set_xlim(-0.055, 0.055)
ax.set_xticks(np.arange(-0.04, 0.05, 0.02))
# Reversed limits (largest first) with one unit of padding at each end of the channel axis.
ax.set_ylim(len(w0.columns), -1)

fig.tight_layout()
# `fig` already is the matplotlib Figure, so save it directly instead of going through
# the version-dependent `Figure.figure` attribute.
fig.savefig("Result Model/model_feat_imp_1.png")