In [396]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, f1_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from xgboost import XGBClassifier
import warnings
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from itertools import product
from sklearn.metrics import classification_report
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mordred import Calculator, descriptors

# Seed NumPy's legacy global random generator so that steps drawing from it
# give the same numbers on every run.
np.random.seed(42)

# Location of the input CSV. The path below only exists on the original
# author's machine; set the TESTED_MOLECULES_CSV environment variable to point
# the notebook at the file on any other machine. Without the variable the
# original path is used, so existing behaviour is unchanged.
file_path = os.environ.get(
    "TESTED_MOLECULES_CSV",
    r"C:\Users\20201527\OneDrive - TU Eindhoven\Desktop\8CC00\Group assignement\tested_molecules.csv",
)

# Load the table and drop the PKM2_inhibition / ERK2_inhibition columns;
# the cells below only read the SMILES column.
data = pd.read_csv(file_path).drop(columns=['PKM2_inhibition', 'ERK2_inhibition'])
data

Unnamed: 0,SMILES
0,C=C(C)c1nc(N)nc(N)n1
1,C=C(Cl)COc1ccc2c(C)cc(=O)oc2c1
2,C=CCNC(=O)CCCC(=O)NCC=C
3,C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21
4,C=CCn1cc(Cl)c(=O)n(CC=C)c1=O
...,...
1111,O=C1c2ccccc2[C@H](Nc2ccc3c(c2)OCCO3)N1Cc1ccco1
1112,O=S(=O)(Nc1cccc(-c2cn3ccsc3[nH+]2)c1)c1ccc(F)cc1
1113,Oc1c(C[NH+]2CCN(c3ccccn3)CC2)cc(Cl)c2cccnc12
1114,c1ccc(-c2csc(N3CCN(c4ccccn4)CC3)n2)cc1


In [397]:
# Convert a SMILES string into a Morgan fingerprint array of length nBits
def smiles_to_fingerprint(smiles, nBits=512):
    """Return the radius-2 Morgan fingerprint of ``smiles`` as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string handed to ``Chem.MolFromSmiles``.
    nBits : int, default 512
        Length of the fingerprint bit vector.

    Returns
    -------
    numpy.ndarray
        The fingerprint converted with ``np.array``. When no molecule can be
        built from ``smiles`` an all-zero array of shape ``(nBits,)`` is
        returned instead, so every input yields a vector of the same length.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        # Parsing failed: fall back to an all-zero fingerprint.
        return np.zeros((nBits,))
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=nBits)
    return np.array(bit_vector)

# Settings for the fingerprint PCA, kept in one place so the code, the comments
# and the printed message cannot drift apart.
FP_N_BITS = 512             # length of each Morgan fingerprint
FP_VARIANCE_TARGET = 0.40   # fraction of variance the retained components must reach

# Compute one fingerprint per SMILES and stack them into a single 2-D matrix
# (one row per molecule, one column per fingerprint bit).
fingerprints = data['SMILES'].apply(lambda x: smiles_to_fingerprint(x, nBits=FP_N_BITS))
fingerprints_matrix = np.stack(fingerprints.values)

# Standardize the fingerprint columns
scaler = StandardScaler()
scaled_fingerprints = scaler.fit_transform(fingerprints_matrix)

# Fit a PCA with all components to obtain the full explained-variance spectrum
pca = PCA()
pca.fit(scaled_fingerprints)

# Smallest number of leading components whose cumulative explained variance
# reaches FP_VARIANCE_TARGET: count the components that are still below the
# target and add the one that crosses it.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
num_components = (cumulative_variance < FP_VARIANCE_TARGET).sum() + 1

print(f"Number of components to retain {FP_VARIANCE_TARGET:.0%} variance: {num_components}")

# Refit the PCA with only the retained components and project the data
pca = PCA(n_components=num_components)
principal_components = pca.fit_transform(scaled_fingerprints)

# Create a DataFrame with the principal components (columns are labelled 0..k-1)
pca_df = pd.DataFrame(data=principal_components)
pca_df

Number of components to retain 40% variance: 63


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
0,1.157136,0.365255,-2.438733,-0.833015,0.869695,-1.341804,1.302786,1.267361,0.334519,-3.810940,...,-0.535106,0.572246,-0.986170,1.347112,0.179324,0.389461,-0.662910,0.614039,1.505357,-0.389502
1,-0.652357,2.598671,-0.125888,-1.928148,-0.403555,0.158467,0.262807,1.991545,1.091777,-0.677284,...,0.605024,0.077718,-1.593988,1.184204,-1.891184,-0.089445,-2.944632,2.244815,0.290019,0.542279
2,0.700065,-0.389961,-0.365425,0.050217,-0.094143,-4.048969,2.901671,3.091357,1.435485,-1.667670,...,-1.646889,0.317414,-1.243658,0.676291,-0.036529,0.063381,0.130300,0.594853,1.146155,0.077394
3,3.986348,0.309383,-1.080464,-0.492504,-0.190285,-1.123572,0.939277,-1.188212,-0.753285,-1.806597,...,1.221535,3.572960,0.844288,-0.319696,0.337165,1.070729,2.030025,-0.949288,-2.191535,-0.988216
4,3.833646,-0.795066,-0.676303,-0.487197,-1.528633,-1.641877,1.614382,0.975558,0.047651,-2.648286,...,0.022520,-0.313288,0.446328,-1.305621,-1.292248,2.712741,1.691159,-0.592565,1.486693,0.158991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,-1.418964,-3.352089,5.845459,0.902035,2.543198,3.105718,2.416636,-0.084173,-1.912838,-1.651505,...,-1.721105,1.363079,1.866441,0.428019,-2.924021,0.087867,-1.354250,0.893690,-1.049941,-2.323343
1112,-1.534316,-1.533664,-4.091269,0.590062,2.539767,-1.230094,-1.825278,-0.676675,-4.062593,-1.903572,...,2.648381,-1.304353,-0.094563,-1.837992,-0.511032,-0.546720,-0.894382,1.077080,-2.780355,-0.116294
1113,1.077740,-4.036203,1.082517,-1.650573,0.546616,1.927100,-1.054294,-2.592981,0.874873,4.929592,...,0.083845,-3.287780,1.758651,-0.281502,2.349578,-0.596967,0.402144,-2.042612,-0.203264,0.305794
1114,1.316859,-2.819870,-3.866747,1.087502,3.057116,2.094926,-2.485460,-3.565705,-1.869109,2.098178,...,-2.729949,-2.178345,1.138944,-1.862361,0.807499,1.984461,-1.157796,0.545566,-0.372917,0.114570


In [398]:
# Evaluate every descriptor listed in Descriptors.descList for one SMILES string
def calculate_descriptors(smiles):
    """Return the values of all ``Descriptors.descList`` entries for ``smiles``.

    Parameters
    ----------
    smiles : str
        SMILES string handed to ``Chem.MolFromSmiles``.

    Returns
    -------
    list
        One value per ``(name, function)`` pair in ``Descriptors.descList``,
        in list order. If no molecule can be built from ``smiles`` the list
        has the same length but contains only ``None``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return [compute(mol) for _, compute in Descriptors.descList]
    # Parsing failed: keep the row shape with placeholder values.
    return [None] * len(Descriptors.descList)

# Descriptor names, in the same order in which calculate_descriptors emits values
descriptor_names = [entry[0] for entry in Descriptors.descList]

# One list of descriptor values per molecule
descriptor_values = data['SMILES'].apply(calculate_descriptors)

# Tabulate the values: one row per molecule, one named column per descriptor
descriptor_df = pd.DataFrame(list(descriptor_values), columns=descriptor_names)

descriptor_df

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,5.313889,0.120833,5.313889,0.120833,0.592228,151.173,142.101,151.085795,58,0,...,0,0,0,0,0,0,0,0,0,0
1,11.238954,-0.366756,11.238954,0.225308,0.785414,250.681,239.593,250.039672,88,0,...,0,0,0,0,0,0,0,0,0,0
2,11.090706,-0.049610,11.090706,0.049610,0.581062,210.277,192.133,210.136828,84,0,...,0,0,0,0,0,0,0,0,0,0
3,11.892238,-0.457824,11.892238,0.076632,0.441090,232.239,220.143,232.084792,88,0,...,0,0,0,0,0,0,0,0,0,0
4,11.693580,-0.498260,11.693580,0.012315,0.720343,226.663,215.575,226.050905,80,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,12.955843,-0.290408,12.955843,0.016627,0.764433,362.385,344.241,362.126657,136,0,...,0,0,0,0,0,0,0,0,0,0
1112,12.983770,-3.772852,12.983770,0.009487,0.594812,374.442,361.338,374.042773,126,0,...,0,1,0,0,0,1,0,0,0,0
1113,10.579691,0.249681,10.579691,0.249681,0.753119,355.849,335.689,355.132015,128,0,...,0,0,0,0,0,0,0,0,0,0
1114,4.812249,0.983396,4.812249,0.983396,0.738254,322.437,304.293,322.125218,116,0,...,0,0,0,0,0,1,0,0,0,0


In [399]:
# Place the descriptor columns and the fingerprint principal components side by
# side, then turn every column label into a string (the PCA columns carry
# integer labels, the descriptor columns already have string names).
combined_df = pd.concat([descriptor_df, pca_df], axis=1).rename(columns=str)
combined_df

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,53,54,55,56,57,58,59,60,61,62
0,5.313889,0.120833,5.313889,0.120833,0.592228,151.173,142.101,151.085795,58,0,...,-0.535106,0.572246,-0.986170,1.347112,0.179324,0.389461,-0.662910,0.614039,1.505357,-0.389502
1,11.238954,-0.366756,11.238954,0.225308,0.785414,250.681,239.593,250.039672,88,0,...,0.605024,0.077718,-1.593988,1.184204,-1.891184,-0.089445,-2.944632,2.244815,0.290019,0.542279
2,11.090706,-0.049610,11.090706,0.049610,0.581062,210.277,192.133,210.136828,84,0,...,-1.646889,0.317414,-1.243658,0.676291,-0.036529,0.063381,0.130300,0.594853,1.146155,0.077394
3,11.892238,-0.457824,11.892238,0.076632,0.441090,232.239,220.143,232.084792,88,0,...,1.221535,3.572960,0.844288,-0.319696,0.337165,1.070729,2.030025,-0.949288,-2.191535,-0.988216
4,11.693580,-0.498260,11.693580,0.012315,0.720343,226.663,215.575,226.050905,80,0,...,0.022520,-0.313288,0.446328,-1.305621,-1.292248,2.712741,1.691159,-0.592565,1.486693,0.158991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,12.955843,-0.290408,12.955843,0.016627,0.764433,362.385,344.241,362.126657,136,0,...,-1.721105,1.363079,1.866441,0.428019,-2.924021,0.087867,-1.354250,0.893690,-1.049941,-2.323343
1112,12.983770,-3.772852,12.983770,0.009487,0.594812,374.442,361.338,374.042773,126,0,...,2.648381,-1.304353,-0.094563,-1.837992,-0.511032,-0.546720,-0.894382,1.077080,-2.780355,-0.116294
1113,10.579691,0.249681,10.579691,0.249681,0.753119,355.849,335.689,355.132015,128,0,...,0.083845,-3.287780,1.758651,-0.281502,2.349578,-0.596967,0.402144,-2.042612,-0.203264,0.305794
1114,4.812249,0.983396,4.812249,0.983396,0.738254,322.437,304.293,322.125218,116,0,...,-2.729949,-2.178345,1.138944,-1.862361,0.807499,1.984461,-1.157796,0.545566,-0.372917,0.114570


In [400]:
# Standardize descriptors and fingerprint components together
scaler = StandardScaler()
scaled_combined_data = scaler.fit_transform(combined_df)

# Fit a PCA with all components to get the full explained-variance spectrum
pca = PCA().fit(scaled_combined_data)

# Number of leading components needed for the cumulative explained variance to
# reach 90%: those still below the threshold plus the one that crosses it
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
num_components = np.sum(cumulative_variance < 0.90) + 1

print(f"Number of components to retain 90% variance: {num_components}")

# Refit with only the retained components and project the combined data
pca = PCA(n_components=num_components)
principal_components = pca.fit_transform(scaled_combined_data)

Number of components to retain 90% variance: 86


In [401]:
# Create a DataFrame with the retained principal components (one column each)
final_pca_df = pd.DataFrame(data=principal_components)
# Use string column labels. This line previously targeted `final_data`, a name
# that is not defined in any of the cells shown here, so it raised NameError on
# a fresh kernel; the frame built just above is the one that gets saved.
final_pca_df.columns = final_pca_df.columns.astype(str)

# Save the final PCA dataset (without the row index) to the working directory
final_pca_df.to_csv('final_pca_dataset.csv', index=False)

print("Final PCA dataset with 90% variance saved as 'final_pca_dataset.csv'")

Final PCA dataset with 90% variance saved as 'final_pca_dataset.csv'
