In [1]:
#Importing libraries for DATA PREPARARION 1

import pandas as pd
import numpy as np

path = 'Datasets/general_preprocessed.csv'
data = pd.read_csv(path)
data.head()

Unnamed: 0,Compound_Id,Canonical_smiles,Activity
0,72810.0,C1=CC2=C(C3=C(C=CC=N3)C(=O)C2=O)N=C1,1
1,23618032.0,CN(C)C1=NC=NC2=C1N=CN2[C@H]3[C@@H]([C@@H]([C@H...,1
2,99927.0,CC1C(C(CC(O1)OC2CC(OC(C2O)C)OC3=CC4=CC5=C(C(=O...,1
3,254021.0,C1=CC=C2C(=C1)C3=C(C=C(C=C3)N)C(=O)C2=O,1
4,45280821.0,CCC(C)C1C(CC(=O)O[C@H](C(=O)[C@H](C(=O)NC(C(=O...,1


## **Molecular Descriptors**

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem

#function to calculate Morgan fps
def morgan_fps(data):
  fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048) for mol in data]  # Calculate Morgan fingerprints for each molecule
  fp_array = [np.array(fp) for fp in fps]   # Convert fingerprints to numpy array
  column_names = ['morgan_' + str(i) for i in range(len(fp_array[0]))]
  return pd.DataFrame(fp_array, columns = column_names)

#creating an array of mols
smiles_list = data['Canonical_smiles'].to_list()
mols = [Chem.MolFromSmiles(smi) for smi in smiles_list] # generates the 2D structures of molecular smiles
mols[0]

In [10]:
df_morganfps = morgan_fps(mols)
df_morganfps.head(10)

Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_morganfps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2338 entries, 0 to 2337
Columns: 2048 entries, morgan_0 to morgan_2047
dtypes: int32(2048)
memory usage: 18.3 MB


In [21]:
import sklearn

X = df_morganfps.copy()
y = data['Activity']

#Splitting our data into train and test data using train_test_split module
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.30, random_state=45)

ModuleNotFoundError: No module named 'sklearn'

In [11]:
#removal of columns with low variance

from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold = 0)
selector.fit(X_train, y_train)

mask = selector.get_support() #gets boolean mask of selected features
high_var_cols = X_train.columns[mask]
len(high_var_cols)

2038

In [18]:
X_train = X_train[high_var_cols]

In [13]:
#Handling of data imbalance using SMOTE technique

from imblearn.over_sampling import SMOTE

smt = SMOTE(random_state = 14)
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)
X_resampled

Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1880,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_test = X_test[high_var_cols]


## Model Building and Evaluation

**Random Forest Classifier**

In [36]:
# Random Forest model
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import GridSearchCV


rfc = RandomForestClassifier(random_state = 45)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [100, 300, 500, 700],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2', None]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

grid_search.fit(X_resampled, y_resampled)

print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

Best Parameters:  {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 500}
Best Accuracy:  0.807059060378118


In [39]:
rfc = RandomForestClassifier(random_state=45, criterion = 'entropy', max_features='sqrt', n_estimators = 500)
rfc.fit(X_resampled, y_resampled)

In [42]:
from sklearn.metrics import classification_report

y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       411
           1       0.78      0.82      0.80       291

    accuracy                           0.83       702
   macro avg       0.82      0.83      0.83       702
weighted avg       0.83      0.83      0.83       702



**Support vector machine model**

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


svm_classifier = SVC()

# Define the hyperparameter grid to search
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
}


g_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5)

# Fit the grid search to your training data
g_search.fit(X_resampled, y_resampled)

# Print the best hyperparameters and corresponding accuracy
print("Best Parameters: ", g_search.best_params_)
print("Best Accuracy: ", g_search.best_score_)

Best Parameters:  {'C': 10, 'kernel': 'rbf'}
Best Accuracy:  0.8220455443309442


In [16]:
svm_classifier = SVC(C = 10, kernel = 'rbf')
svm_classifier.fit(X_resampled, y_resampled)

In [17]:
from sklearn.metrics import classification_report

y_pred = svm_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87       411
           1       0.81      0.84      0.82       291

    accuracy                           0.85       702
   macro avg       0.84      0.85      0.85       702
weighted avg       0.85      0.85      0.85       702

