In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np

In [43]:
#Load dataset

file1 = "/content/drive/MyDrive/bioactivity/beta_secretase1_bioactivity_data_pIC50_rdkit_descriptors.csv"
df = pd.read_csv(file1)
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,pIC50,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CHEMBL406146,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,6.38405,14.032746,-2.195009,14.032746,0.017379,0.041154,999.085,932.557,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL78946,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,8.69897,13.595406,-1.573089,13.595406,0.09781,0.042501,893.005,828.493,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL324109,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,6.337242,13.243577,-1.303772,13.243577,0.114162,0.077027,751.988,690.5,...,1,0,0,0,0,0,0,0,0,0
3,CHEMBL114147,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,5.045757,13.416202,-1.312338,13.416202,0.118038,0.09937,737.895,682.455,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL419949,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,5.251812,13.721715,-1.361064,13.721715,0.112353,0.074085,828.02,766.532,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Check for NaN values in the entire DataFrame
print("Checking for NaN values in df...")
print(df.isna().sum())

Checking for NaN values in df...
molecule_chembl_id    0
canonical_smiles      0
pIC50                 0
MaxEStateIndex        0
MinEStateIndex        0
                     ..
fr_thiazole           0
fr_thiocyan           0
fr_thiophene          0
fr_unbrch_alkane      0
fr_urea               0
Length: 211, dtype: int64


In [45]:
# Drop rows with any NaN values
df_cleaned = df.dropna()

In [60]:
# Select only numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# Find the column with the maximum value
max_values = numeric_df.max()  # Get the maximum value in each numeric column
column_with_max_value = max_values.idxmax()  # Get the name of the column with the largest value
largest_value = max_values.max()  # Get the largest value itself

print(f"The column '{column_with_max_value}' has the largest value: {largest_value}")

The column 'Ipc' has the largest value: 8.127863505038417e+84


In [61]:
# Drop the column with the largest value
df_cleaned = df_cleaned.drop(columns=[column_with_max_value])

print(f"Column '{column_with_max_value}' has been dropped.")
print(f"New DataFrame shape: {df_cleaned.shape}")

Column 'Ipc' has been dropped.
New DataFrame shape: (10567, 210)


In [63]:
# x, y data

x = df_cleaned.iloc[:,3:]
y = df_cleaned.pIC50

In [64]:
x.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,14.032746,-2.195009,14.032746,0.017379,0.041154,999.085,932.557,998.459678,390,0,...,0,0,0,0,0,0,0,0,0,0
1,13.595406,-1.573089,13.595406,0.09781,0.042501,893.005,828.493,892.454199,352,0,...,0,0,0,0,0,0,0,0,0,0
2,13.243577,-1.303772,13.243577,0.114162,0.077027,751.988,690.5,751.419,294,0,...,1,0,0,0,0,0,0,0,0,0
3,13.416202,-1.312338,13.416202,0.118038,0.09937,737.895,682.455,737.399978,290,0,...,0,0,0,0,0,0,0,0,0,0
4,13.721715,-1.361064,13.721715,0.112353,0.074085,828.02,766.532,827.446929,324,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
y.head()

Unnamed: 0,pIC50
0,6.38405
1,8.69897
2,6.337242
3,5.045757
4,5.251812


In [66]:
x.shape

(10567, 207)

# **Remove low variance features**

Features with low variance across samples are often not very informative in machine learning, as they do not vary much between data points. VarianceThreshold is a simple feature selection method that removes all features with variance below a certain threshold. Following code removes features from the dataset X that have a variance below 0.16.

In [68]:
from sklearn.feature_selection import VarianceThreshold

selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
x = selection.fit_transform(x)
x.shape

(10567, 130)

In [69]:
# Train, Test dataset splitting

# Perform data splitting using 80/20 ratio
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [70]:
# Initialize RandomForestRegressor
model = RandomForestRegressor()

In [71]:
# Perform 5-fold cross-validation
kf = KFold(n_splits=5)
cv_scores = cross_val_score(model, x, y, cv=kf, scoring='neg_mean_squared_error')

In [72]:
# Calculate average performance across folds
average_score = np.mean(cv_scores)
print(f"Average MSE across folds: {-average_score}")

Average MSE across folds: 1.1621318532010434


In [73]:
# Retrain the model on the entire dataset
model.fit(x, y)

In [74]:
# Save the final model trained on the entire dataset
joblib.dump(model, 'final_rf_model.pkl')

['final_rf_model.pkl']

In [75]:
! cp final_rf_model.pkl "/content/drive/MyDrive/bioactivity/"


In [76]:
! ls "/content/drive/MyDrive/bioactivity/"

'beta_secretase1_bioactivity_data (1).gsheet'		        bioactivity_data.csv
 beta_secretase1_bioactivity_data.gsheet		        bioactivity_preprocessed_data.csv
 beta_secretase1_bioactivity_data_pIC50.csv		        final_rf_model.pkl
 beta_secretase1_bioactivity_data_pIC50_rdkit_descriptors.csv
