In [41]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import rdkit
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

In [42]:
#For reproducibility
np.random.seed(42)

#Load the data
file_path = r"tested_molecules.csv"
data = pd.read_csv(file_path)

data

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition
0,C=C(C)c1nc(N)nc(N)n1,0,0
1,C=C(Cl)COc1ccc2c(C)cc(=O)oc2c1,0,0
2,C=CCNC(=O)CCCC(=O)NCC=C,0,0
3,C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21,0,0
4,C=CCn1cc(Cl)c(=O)n(CC=C)c1=O,0,0
...,...,...,...
1111,O=C1c2ccccc2[C@H](Nc2ccc3c(c2)OCCO3)N1Cc1ccco1,0,1
1112,O=S(=O)(Nc1cccc(-c2cn3ccsc3[nH+]2)c1)c1ccc(F)cc1,0,1
1113,Oc1c(C[NH+]2CCN(c3ccccn3)CC2)cc(Cl)c2cccnc12,0,1
1114,c1ccc(-c2csc(N3CCN(c4ccccn4)CC3)n2)cc1,0,1


In [43]:
# Function to calculate descriptors
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(Descriptors.descList)
    return [func(mol) for name, func in Descriptors.descList]

# Calculate descriptors for all SMILES
descriptor_names = [name for name, func in Descriptors.descList]
descriptor_values = data['SMILES'].apply(calculate_descriptors)

# Create a DataFrame with descriptor values
descriptor_df = pd.DataFrame(descriptor_values.tolist(), columns=descriptor_names)

combined_df = pd.concat([data, descriptor_df], axis=1)
combined_df

Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C=C(C)c1nc(N)nc(N)n1,0,0,5.313889,0.120833,5.313889,0.120833,0.592228,151.173,142.101,...,0,0,0,0,0,0,0,0,0,0
1,C=C(Cl)COc1ccc2c(C)cc(=O)oc2c1,0,0,11.238954,-0.366756,11.238954,0.225308,0.785414,250.681,239.593,...,0,0,0,0,0,0,0,0,0,0
2,C=CCNC(=O)CCCC(=O)NCC=C,0,0,11.090706,-0.049610,11.090706,0.049610,0.581062,210.277,192.133,...,0,0,0,0,0,0,0,0,0,0
3,C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21,0,0,11.892238,-0.457824,11.892238,0.076632,0.441090,232.239,220.143,...,0,0,0,0,0,0,0,0,0,0
4,C=CCn1cc(Cl)c(=O)n(CC=C)c1=O,0,0,11.693580,-0.498260,11.693580,0.012315,0.720343,226.663,215.575,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1111,O=C1c2ccccc2[C@H](Nc2ccc3c(c2)OCCO3)N1Cc1ccco1,0,1,12.955843,-0.290408,12.955843,0.016627,0.764433,362.385,344.241,...,0,0,0,0,0,0,0,0,0,0
1112,O=S(=O)(Nc1cccc(-c2cn3ccsc3[nH+]2)c1)c1ccc(F)cc1,0,1,12.983770,-3.772852,12.983770,0.009487,0.594812,374.442,361.338,...,0,1,0,0,0,1,0,0,0,0
1113,Oc1c(C[NH+]2CCN(c3ccccn3)CC2)cc(Cl)c2cccnc12,0,1,10.579691,0.249681,10.579691,0.249681,0.753119,355.849,335.689,...,0,0,0,0,0,0,0,0,0,0
1114,c1ccc(-c2csc(N3CCN(c4ccccn4)CC3)n2)cc1,0,1,4.812249,0.983396,4.812249,0.983396,0.738254,322.437,304.293,...,0,0,0,0,0,1,0,0,0,0


In [44]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Assuming 'data' is your DataFrame
data = descriptor_df

# List of continuous and binary columns
continuous_features = ['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons']
binary_features = [col for col in data.columns if col.startswith('fr_')]

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', StandardScaler(), continuous_features),
        ('bin', 'passthrough', binary_features)
    ])

# Fit and transform the data
scaled_data = preprocessor.fit_transform(data)

# Convert scaled data back to DataFrame
scaled_df = pd.DataFrame(scaled_data, columns=continuous_features + binary_features)
scaled_df




Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,...,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,-2.886977,0.763336,-2.886977,-0.232493,-0.622140,...,0.0,0.0,0.0,0.0,0.0
1,-0.231561,0.445359,-0.231561,0.336260,0.675637,...,0.0,0.0,0.0,0.0,0.0
2,-0.298001,0.652183,-0.298001,-0.620226,-0.697156,...,0.0,0.0,0.0,0.0,0.0
3,0.061219,0.385970,0.061219,-0.473124,-1.637454,...,0.0,0.0,0.0,0.0,0.0
4,-0.027813,0.359600,-0.027813,-0.823262,0.238509,...,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1111,0.537890,0.495149,0.537890,-0.799789,0.534697,...,0.0,0.0,0.0,0.0,0.0
1112,0.550406,-1.775896,0.550406,-0.838657,-0.604784,...,1.0,0.0,0.0,0.0,0.0
1113,-0.527021,0.847363,-0.527021,0.468948,0.458685,...,0.0,0.0,0.0,0.0,0.0
1114,-3.111795,1.325849,-3.111795,4.463257,0.358828,...,1.0,0.0,0.0,0.0,0.0


In [49]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, precision_recall_curve
import joblib
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import numpy as np

# Load your dataset with molecular features and inhibition labels
data_with_labels = combined_df

# Define the list of continuous and binary feature columns
continuous_features = ['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons']
binary_features = [col for col in data_with_labels.columns if col.startswith('fr_')]

# Define the ColumnTransformer for scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', StandardScaler(), continuous_features),
        ('bin', 'passthrough', binary_features)
    ])

# Separate features and targets
X = data_with_labels.drop(columns=['SMILES'])  # Drop identifier column
y_PKM2 = data_with_labels['PKM2_inhibition']  # Target column for PKM2 inhibition
y_ERK2 = data_with_labels['ERK2_inhibition']  # Target column for ERK2 inhibition

# Fit and transform the features
X_scaled = preprocessor.fit_transform(X)

# Apply PCA
pca = PCA(n_components=10)  # Adjust the number of components as needed
X_pca = pca.fit_transform(X_scaled)

# Split the data into training and testing sets for PKM2
X_train_PKM2, X_test_PKM2, y_train_PKM2, y_test_PKM2 = train_test_split(X_pca, y_PKM2, test_size=0.2, random_state=42)

# Split the data into training and testing sets for ERK2
X_train_ERK2, X_test_ERK2, y_train_ERK2, y_test_ERK2 = train_test_split(X_pca, y_ERK2, test_size=0.2, random_state=42)

# Initialize and train an XGBClassifier for PKM2
model_PKM2 = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
model_PKM2.fit(X_train_PKM2, y_train_PKM2)

# Apply SMOTE to the ERK2 training data
smote = SMOTE(random_state=42)
X_train_ERK2_res, y_train_ERK2_res = smote.fit_resample(X_train_ERK2, y_train_ERK2)

# Hyperparameter tuning for ERK2 model
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [5, 10, 15]
}

xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
grid_search = GridSearchCV(xgb, param_grid, scoring='precision', cv=3)
grid_search.fit(X_train_ERK2_res, y_train_ERK2_res)

# Best parameters
best_params = grid_search.best_params_
print(best_params)

# Train with the best parameters
model_ERK2 = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
model_ERK2.fit(X_train_ERK2_res, y_train_ERK2_res)

# Predict on the test set for PKM2
y_pred_PKM2 = model_PKM2.predict(X_test_PKM2)

# Predict on the test set for ERK2
y_probs_ERK2 = model_ERK2.predict_proba(X_test_ERK2)[:, 1]

# Adjust threshold for higher precision
precision, recall, thresholds = precision_recall_curve(y_test_ERK2, y_probs_ERK2)

# Note that precision array length is greater than thresholds array length by one
precision_for_thresholds = precision[:-1]

# Select the threshold where precision is at least 0.7 (or your desired value)
desired_precision = 0.7
threshold_index = np.where(precision_for_thresholds >= desired_precision)[0][0]
threshold = thresholds[threshold_index]

# Apply the threshold to make final predictions for ERK2
y_pred_ERK2_adjusted = (y_probs_ERK2 >= threshold).astype(int)

# Evaluate the models
print("PKM2 Inhibition Classification Report:")
print(classification_report(y_test_PKM2, y_pred_PKM2))

print("ERK2 Inhibition Classification Report (Adjusted Threshold):")
print(classification_report(y_test_ERK2, y_pred_ERK2_adjusted))

# Save the preprocessor, PCA transformer, and models for later use
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(model_PKM2, 'model_PKM2.pkl')
joblib.dump(model_ERK2, 'model_ERK2.pkl')

# Load the new dataset
new_data = pd.read_csv('untested_molecules.csv')

# Function to calculate descriptors
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(Descriptors.descList)
    return [func(mol) for name, func in Descriptors.descList]

# Calculate descriptors for all SMILES
descriptor_names = [name for name, func in Descriptors.descList]
descriptor_values = new_data['SMILES'].apply(calculate_descriptors)

# Create a DataFrame with descriptor values
descriptor_df2 = pd.DataFrame(descriptor_values.tolist(), columns=descriptor_names)

combined_df3 = pd.concat([new_data, descriptor_df2], axis=1)

# Apply the same preprocessing and PCA to the new data
X_new = combined_df3.drop(columns=['SMILES'])  # Drop the identifier column
X_new_scaled = preprocessor.transform(X_new)
X_new_pca = pca.transform(X_new_scaled)

# Make predictions on the new data for PKM2
new_predictions_PKM2 = model_PKM2.predict(X_new_pca)

# Make predictions on the new data for ERK2 with adjusted threshold
new_predictions_ERK2_probs = model_ERK2.predict_proba(X_new_pca)[:, 1]
new_predictions_ERK2 = (new_predictions_ERK2_probs >= threshold).astype(int)

# Add the predictions to the new data DataFrame
new_data['PKM2_inhibition_prediction'] = new_predictions_PKM2
new_data['ERK2_inhibition_prediction'] = new_predictions_ERK2

# Print the first few rows of the new data with predictions
new_data.head()


{'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'scale_pos_weight': 5, 'subsample': 0.8}
PKM2 Inhibition Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       218
           1       0.50      0.17      0.25         6

    accuracy                           0.97       224
   macro avg       0.74      0.58      0.62       224
weighted avg       0.96      0.97      0.97       224

ERK2 Inhibition Classification Report (Adjusted Threshold):
              precision    recall  f1-score   support

           0       0.93      1.00      0.97       208
           1       1.00      0.06      0.12        16

    accuracy                           0.93       224
   macro avg       0.97      0.53      0.54       224
weighted avg       0.94      0.93      0.90       224



Unnamed: 0,SMILES,PKM2_inhibition,ERK2_inhibition,PKM2_inhibition_prediction,ERK2_inhibition_prediction
0,C[C@@H](Sc1nc(=O)cc(N)[nH]1)C(=O)NC1CCCCC1,,,0,0
1,O=C(CCN1C(=O)COc2ccccc21)NCc1cccs1,,,0,0
2,Cn1nnnc1SCC(=O)N1CC[NH+](Cc2ccccc2)CC1,,,0,0
3,CCOC(=O)CCP(=O)([O-])[C@@H](O)c1ccc(OC)cc1,,,0,0
4,C=CCNC(=O)c1cc(-c2ccccc2O)on1,,,0,0


In [34]:
df = pd.DataFrame(new_data)

# Export DataFrame to CSV
try:
    df.to_csv('outputgroupass.csv', index=False)  # Set index=False to exclude row numbers from the output
    print("CSV file exported successfully.")
except Exception as e:
    print("Error:", e)

CSV file exported successfully.


In [50]:
#ISN"T CORRECT YET
"DOESNT WORK YET"



import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, precision_recall_curve
import joblib
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import numpy as np

# Load your dataset with molecular features and inhibition labels
data_with_labels = combined_df

# Define the list of continuous and binary feature columns
continuous_features = ['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons']
binary_features = [col for col in data_with_labels.columns if col.startswith('fr_')]

# Fit an XGBoost classifier to get feature importances for PKM2
model_PKM2_importance = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
model_PKM2_importance.fit(X_train_PKM2, y_train_PKM2)

# Get feature importances
feature_importances_PKM2 = model_PKM2_importance.feature_importances_

# Select the top k most important features
k = 5  # Adjust this value as needed
top_k_indices_PKM2 = np.argsort(feature_importances_PKM2)[::-1][:k]
top_k_features_PKM2 = [continuous_features[i-1] for i in top_k_indices_PKM2]

# Define the ColumnTransformer for scaling with selected features
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', StandardScaler(), top_k_features_PKM2),
        ('bin', 'passthrough', binary_features)
    ])

# Separate features and targets
X = data_with_labels.drop(columns=['SMILES'])  # Drop identifier column
y_PKM2 = data_with_labels['PKM2_inhibition']  # Target column for PKM2 inhibition
y_ERK2 = data_with_labels['ERK2_inhibition']  # Target column for ERK2 inhibition

# Fit and transform the features with selected descriptors
X_scaled = preprocessor.fit_transform(X)

# Apply PCA
pca = PCA(n_components=10)  # Adjust the number of components as needed
X_pca = pca.fit_transform(X_scaled)

# Split the data into training and testing sets for PKM2
X_train_PKM2, X_test_PKM2, y_train_PKM2, y_test_PKM2 = train_test_split(X_pca, y_PKM2, test_size=0.2, random_state=42)

# Split the data into training and testing sets for ERK2
X_train_ERK2, X_test_ERK2, y_train_ERK2, y_test_ERK2 = train_test_split(X_pca, y_ERK2, test_size=0.2, random_state=42)

# Initialize and train an XGBClassifier for PKM2
model_PKM2 = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
model_PKM2.fit(X_train_PKM2, y_train_PKM2)

# Apply SMOTE to the ERK2 training data
smote = SMOTE(random_state=42)
X_train_ERK2_res, y_train_ERK2_res = smote.fit_resample(X_train_ERK2, y_train_ERK2)

# Hyperparameter tuning for ERK2 model
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [5, 10, 15]
}

xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
grid_search = GridSearchCV(xgb, param_grid, scoring='precision', cv=3)
grid_search.fit(X_train_ERK2_res, y_train_ERK2_res)

# Best parameters
best_params = grid_search.best_params_
print(best_params)

# Train with the best parameters
model_ERK2 = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
model_ERK2.fit(X_train_ERK2_res, y_train_ERK2_res)

# Predict on the test set for PKM2
y_pred_PKM2 = model_PKM2.predict(X_test_PKM2)

# Predict on the test set for ERK2
y_probs_ERK2 = model_ERK2.predict_proba(X_test_ERK2)[:, 1]

# Adjust threshold for higher precision
precision, recall, thresholds = precision_recall_curve(y_test_ERK2, y_probs_ERK2)

# Note that precision array length is greater than thresholds array length by one
precision_for_thresholds = precision[:-1]

# Select the threshold where precision is at least 0.7 (or your desired value)
desired_precision = 0.7
threshold_index = np.where(precision_for_thresholds >= desired_precision)[0][0]
threshold = thresholds[threshold_index]

# Apply the threshold to make final predictions for ERK2
y_pred_ERK2_adjusted = (y_probs_ERK2 >= threshold).astype(int)

# Evaluate the models
print("PKM2 Inhibition Classification Report:")
print(classification_report(y_test_PKM2, y_pred_PKM2))

print("ERK2 Inhibition Classification Report (Adjusted Threshold):")
print(classification_report(y_test_ERK2, y_pred_ERK2_adjusted))

# Save the preprocessor, PCA transformer, and models for later use
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(model_PKM2, 'model_PKM2.pkl')
joblib.dump(model_ERK2, 'model_ERK2.pkl')

# Load the new dataset
new_data = pd.read_csv('untested_molecules.csv')

# Function to calculate descriptors
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * len(Descriptors.descList)
    return [func(mol) for name, func in Descriptors.descList]

# Calculate descriptors for all SMILES
descriptor_names = [name for name, func in Descriptors.descList]
descriptor_values = new_data['SMILES'].apply(calculate_descriptors)

# Create a DataFrame with descriptor values
descriptor_df2 = pd.DataFrame(descriptor_values.tolist(), columns=descriptor_names)

combined_df3 = pd.concat([new_data, descriptor_df2], axis=1)

# Apply the same preprocessing and PCA to the new data
X_new = combined_df3.drop(columns=['SMILES'])  # Drop the identifier column
X_new_scaled = preprocessor.transform(X_new)
X_new_pca = pca.transform(X_new_scaled)

# Make predictions on the new data for PKM2
new_predictions_PKM2 = model_PKM2.predict(X_new_pca)

# Make predictions on the new data for ERK2 with adjusted threshold
new_predictions_ERK2_probs = model_ERK2.predict_proba(X_new_pca)[:, 1]
new_predictions_ERK2 = (new_predictions_ERK2_probs >= threshold).astype(int)

# Add the predictions to the new data DataFrame
new_data['PKM2_inhibition_prediction'] = new_predictions_PKM2
new_data['ERK2_inhibition_prediction'] = new_predictions_ERK2

# Print the first few rows of the new data with predictions
new_data.head()


{'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'scale_pos_weight': 5, 'subsample': 1.0}


IndexError: index 0 is out of bounds for axis 0 with size 0