In [134]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, f1_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from xgboost import XGBClassifier
import warnings
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from itertools import product
from sklearn.metrics import classification_report
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import StandardScaler

# For reproducibility: seed NumPy's legacy global random number generator.
np.random.seed(42)

# Load the data.
# The CSV location can be overridden with the TESTED_MOLECULES_CSV environment
# variable so the notebook can be pointed at a copy of the file elsewhere;
# when the variable is not set, the original hard-coded location is used.
file_path = os.environ.get(
    "TESTED_MOLECULES_CSV",
    r"C:\Users\20201527\OneDrive - TU Eindhoven\Desktop\8CC00\Group assignement\tested_molecules.csv",
)
data = pd.read_csv(file_path)

data

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
0,C=C(C)c1nc(N)nc(N)n1,0,0
1,C=C(Cl)COc1ccc2c(C)cc(=O)oc2c1,0,0
2,C=CCNC(=O)CCCC(=O)NCC=C,0,0
3,C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21,0,0
4,C=CCn1cc(Cl)c(=O)n(CC=C)c1=O,0,0
...,...,...,...
1111,O=C1c2ccccc2[C@H](Nc2ccc3c(c2)OCCO3)N1Cc1ccco1,0,1
1112,O=S(=O)(Nc1cccc(-c2cn3ccsc3[nH+]2)c1)c1ccc(F)cc1,0,1
1113,Oc1c(C[NH+]2CCN(c3ccccn3)CC2)cc(Cl)c2cccnc12,0,1
1114,c1ccc(-c2csc(N3CCN(c4ccccn4)CC3)n2)cc1,0,1


In [135]:
# Function to calculate descriptors
def calculate_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors.descList`` for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    list
        One value per entry of ``Descriptors.descList``, in that order. A list
        of ``None`` of the same length is returned when ``smiles`` is not a
        string (e.g. a NaN coming from an empty CSV cell) or cannot be parsed
        into a molecule.
    """
    # A missing CSV cell reaches this function as a float NaN. Handing a
    # non-string to Chem.MolFromSmiles raises instead of returning None, so
    # such values are routed to the same "no molecule" path as an unparsable
    # SMILES string.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [None] * len(Descriptors.descList)
    return [func(mol) for _name, func in Descriptors.descList]

# Descriptor names, in the same order in which calculate_descriptors emits values
descriptor_names = [entry[0] for entry in Descriptors.descList]

# One list of descriptor values per SMILES string
descriptor_values = data['SMILES'].apply(calculate_descriptors)

# Assemble the per-molecule lists into a table with one column per descriptor
descriptor_df = pd.DataFrame(list(descriptor_values), columns=descriptor_names)


descriptor_df

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,5.313889,0.120833,5.313889,0.120833,0.592228,151.173,142.101,151.085795,58,0,...,0,0,0,0,0,0,0,0,0,0
1,11.238954,-0.366756,11.238954,0.225308,0.785414,250.681,239.593,250.039672,88,0,...,0,0,0,0,0,0,0,0,0,0
2,11.090706,-0.049610,11.090706,0.049610,0.581062,210.277,192.133,210.136828,84,0,...,0,0,0,0,0,0,0,0,0,0
3,11.892238,-0.457824,11.892238,0.076632,0.441090,232.239,220.143,232.084792,88,0,...,0,0,0,0,0,0,0,0,0,0
4,11.693580,-0.498260,11.693580,0.012315,0.720343,226.663,215.575,226.050905,80,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,12.955843,-0.290408,12.955843,0.016627,0.764433,362.385,344.241,362.126657,136,0,...,0,0,0,0,0,0,0,0,0,0
1112,12.983770,-3.772852,12.983770,0.009487,0.594812,374.442,361.338,374.042773,126,0,...,0,1,0,0,0,1,0,0,0,0
1113,10.579691,0.249681,10.579691,0.249681,0.753119,355.849,335.689,355.132015,128,0,...,0,0,0,0,0,0,0,0,0,0
1114,4.812249,0.983396,4.812249,0.983396,0.738254,322.437,304.293,322.125218,116,0,...,0,0,0,0,0,1,0,0,0,0


In [136]:
# Append both inhibition label columns from the raw data to the descriptor table
for target_column in ('PKM2_inhibition', 'ERK2_inhibition'):
    descriptor_df[target_column] = data[target_column]
descriptor_df

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,PKM2_inhibition,ERK2_inhibition
0,5.313889,0.120833,5.313889,0.120833,0.592228,151.173,142.101,151.085795,58,0,...,0,0,0,0,0,0,0,0,0,0
1,11.238954,-0.366756,11.238954,0.225308,0.785414,250.681,239.593,250.039672,88,0,...,0,0,0,0,0,0,0,0,0,0
2,11.090706,-0.049610,11.090706,0.049610,0.581062,210.277,192.133,210.136828,84,0,...,0,0,0,0,0,0,0,0,0,0
3,11.892238,-0.457824,11.892238,0.076632,0.441090,232.239,220.143,232.084792,88,0,...,0,0,0,0,0,0,0,0,0,0
4,11.693580,-0.498260,11.693580,0.012315,0.720343,226.663,215.575,226.050905,80,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,12.955843,-0.290408,12.955843,0.016627,0.764433,362.385,344.241,362.126657,136,0,...,0,0,0,0,0,0,0,0,0,1
1112,12.983770,-3.772852,12.983770,0.009487,0.594812,374.442,361.338,374.042773,126,0,...,0,0,0,1,0,0,0,0,0,1
1113,10.579691,0.249681,10.579691,0.249681,0.753119,355.849,335.689,355.132015,128,0,...,0,0,0,0,0,0,0,0,0,1
1114,4.812249,0.983396,4.812249,0.983396,0.738254,322.437,304.293,322.125218,116,0,...,0,0,0,1,0,0,0,0,0,1


In [137]:
# Split into train / validation

# Targets (y1, y2) are the two inhibition columns; every other column is a feature (X)
y1 = descriptor_df['PKM2_inhibition']
y2 = descriptor_df['ERK2_inhibition']
X = descriptor_df.drop(columns=['PKM2_inhibition', 'ERK2_inhibition'])

# Side-by-side label table so both targets are split together with X
y = pd.concat([y1, y2], axis=1)

# 50/50 split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.5, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Report the shapes of the resulting splits
for split_label, split_X, split_y in (
    ("Training set shape:", X_train, y_train),
    ("Validation set shape:", X_val, y_val),
):
    print(split_label, split_X.shape, split_y.shape)

Training set shape: (558, 208) (558, 2)
Validation set shape: (558, 208) (558, 2)


In [138]:
# Silence every warning from this point on
warnings.filterwarnings("ignore")

# Balance the training data for each target independently by random oversampling
ros_pkm2 = RandomOverSampler(random_state=42)
ros_erk2 = RandomOverSampler(random_state=42)
X_train_ros_pkm2, y_train_ros_pkm2 = ros_pkm2.fit_resample(X_train, y_train['PKM2_inhibition'])
X_train_ros_erk2, y_train_ros_erk2 = ros_erk2.fit_resample(X_train, y_train['ERK2_inhibition'])

# scale_pos_weight = (count of label 0) / (count of label 1) in the resampled labels.
# NOTE(review): the counts are taken AFTER oversampling, so this ratio is
# presumably 1.0 and has no effect -- confirm whether the pre-resampling
# counts were intended.
class_counts_pkm2 = y_train_ros_pkm2.value_counts()
class_counts_erk2 = y_train_ros_erk2.value_counts()
scale_pos_weight_pkm2 = class_counts_pkm2[0] / class_counts_pkm2[1]
scale_pos_weight_erk2 = class_counts_erk2[0] / class_counts_erk2[1]

# One XGBoost classifier per target, each fitted on that target's resampled data
model_pkm2 = XGBClassifier(scale_pos_weight=scale_pos_weight_pkm2, random_state=42)
model_pkm2.fit(X_train_ros_pkm2, y_train_ros_pkm2)

model_erk2 = XGBClassifier(scale_pos_weight=scale_pos_weight_erk2, random_state=42)
model_erk2.fit(X_train_ros_erk2, y_train_ros_erk2)

# Predictions on the (untouched, not resampled) validation features
y_pred_pkm2 = model_pkm2.predict(X_val)
y_pred_erk2 = model_erk2.predict(X_val)

# Validation labels as a 2-D array: column 0 is PKM2, column 1 is ERK2
y_val_binary = y_val.to_numpy()
y_true_pkm2 = y_val_binary[:, 0]
y_true_erk2 = y_val_binary[:, 1]

# Accuracy, precision, recall and F1 for each target, evaluated in that order
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
accuracy_pkm2, precision_pkm2, recall_pkm2, f1_pkm2 = [
    metric(y_true_pkm2, y_pred_pkm2) for metric in metric_functions
]
accuracy_erk2, precision_erk2, recall_erk2, f1_erk2 = [
    metric(y_true_erk2, y_pred_erk2) for metric in metric_functions
]

# Summary table: one row per target, one column per metric
results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'PKM2_inhibition': [accuracy_pkm2, precision_pkm2, recall_pkm2, f1_pkm2],
    'ERK2_inhibition': [accuracy_erk2, precision_erk2, recall_erk2, f1_erk2]
}).set_index('Metric').T

results

Metric,Accuracy,Precision,Recall,F1 Score
PKM2_inhibition,0.97491,0.333333,0.076923,0.125
ERK2_inhibition,0.948029,0.2,0.038462,0.064516


In [139]:
# Per-class precision / recall / F1 on the validation set, one report per target
for report_header, target_position, target_predictions in (
    ("Classification report for PKM2_inhibition:", 0, y_pred_pkm2),
    ("Classification report for ERK2_inhibition:", 1, y_pred_erk2),
):
    print(report_header)
    print(classification_report(y_val_binary[:, target_position], target_predictions))

# Place both prediction vectors side by side, one column per target
y_pred_combined = pd.DataFrame({
    'PKM2_inhibition': y_pred_pkm2,
    'ERK2_inhibition': y_pred_erk2,
})
print("Predicted binary values:\n", y_pred_combined)

Classification report for PKM2_inhibition:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       545
           1       0.33      0.08      0.12        13

    accuracy                           0.97       558
   macro avg       0.66      0.54      0.56       558
weighted avg       0.96      0.97      0.97       558

Classification report for ERK2_inhibition:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       532
           1       0.20      0.04      0.06        26

    accuracy                           0.95       558
   macro avg       0.58      0.52      0.52       558
weighted avg       0.92      0.95      0.93       558

Predicted binary values:
      PKM2_inhibition  ERK2_inhibition
0                  0                0
1                  0                0
2                  0                0
3                  0                0
4                  0                0
..    

In [140]:
pair_columns = ['PKM2_inhibition', 'ERK2_inhibition']

# Column-less frame whose MultiIndex enumerates every possible (PKM2, ERK2) pair
all_combinations = pd.DataFrame(
    list(product([0, 1], [0, 1])), columns=pair_columns
).set_index(pair_columns)

# How often each pair occurs in the predictions, indexed the same way
observed_counts = (
    y_pred_combined
    .value_counts()
    .reset_index(name='Count')
    .set_index(pair_columns)
)

# Left-join onto the full grid so that pairs never predicted appear with a
# count of 0, then turn the two index levels back into ordinary columns
sequence_counts = (
    all_combinations
    .join(observed_counts, how='left')
    .fillna(0)
    .astype(int)
    .reset_index()
)


sequence_counts

Unnamed: 0,PKM2_inhibition,ERK2_inhibition,Count
0,0,0,550
1,0,1,5
2,1,0,3
3,1,1,0
