# PROJECT

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the SCARA5 target CSV into a DataFrame and display it.
# NOTE(review): the path is absolute and machine-specific; it must exist on the
# machine running this notebook.
raw_csv_path = "C:/Users/Ethel Enam Nyamador/Desktop/WACCBIP/Drug Discovery/Project/SCARA5 Target.csv"
data = pd.read_csv(raw_csv_path)
data

In [None]:
# List the column names of the loaded table
data.columns

In [None]:
# Keep only the compound ID, SMILES string and measured value columns
selected_columns = ["Molecule ChEMBL ID", "Smiles", "Standard Value"]
essential_features = data[selected_columns]
essential_features

In [None]:
# Drop rows with null values (NaN) in any of the essential features.
# The index is reset so row labels stay contiguous (0..n-1): later cells build
# new tables from these rows in order, and pandas aligns column assignments
# between frames by index label, not by position.
essential_features = essential_features.dropna().reset_index(drop=True)
essential_features

In [None]:
# Group compounds into activity classes by their "Standard Value":
# 1000 or more is "inactive", anything below is "active"
bioactivity_class = [
    "inactive" if float(value) >= 1000 else "active"
    for value in essential_features['Standard Value']
]

bioactivity_class

In [None]:
# Append the activity labels to the data as a new "Bioactivity" column
essential_features["Bioactivity"] = bioactivity_class
essential_features

In [None]:
import seaborn as sns
# Use seaborn's 'ticks' style for the plots below
sns.set(style='ticks')

In [None]:
# Bar chart of how many compounds fall into each bioactivity class
fig, ax = plt.subplots(figsize=(5, 5))

sns.countplot(x='Bioactivity', data=essential_features, edgecolor='blue', ax=ax)

axis_label_style = {'fontsize': 13, 'fontweight': 'bold'}
ax.set_title('Bar chart of Bioactivity class', fontsize=14, fontweight='bold')
ax.set_xlabel('Bioactivity class', **axis_label_style)
ax.set_ylabel('Frequency', **axis_label_style)

In [None]:
# Convert the categorical labels to numbers: 1 for "active", 0 for anything else
encoded_class = [
    1 if label == "active" else 0
    for label in essential_features.Bioactivity
]

In [None]:
# Store the numeric encoding next to the text labels as "Numerical class"
essential_features["Numerical class"] = encoded_class
essential_features

In [None]:
# Save the compound identifiers and their SMILES strings to disk, then show them
id_and_smiles_columns = ["Molecule ChEMBL ID", "Smiles"]
dd = essential_features[id_and_smiles_columns]
dd.to_csv("cleaned_data.csv", index=False)
dd

In [None]:
#Import rdkit
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [None]:
#Convert Smiles to Morgan fingerprints
def morgan_fpts(data, radius=2, n_bits=2048):
    """Compute Morgan fingerprint bit vectors for an iterable of SMILES strings.

    Parameters
    ----------
    data : iterable of str
        SMILES strings, one per molecule.
    radius : int, default 2
        Morgan fingerprint radius passed to RDKit.
    n_bits : int, default 2048
        Length of each fingerprint bit vector.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input SMILES and ``n_bits`` columns
        (shape ``(0, n_bits)`` when ``data`` is empty).

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed into a molecule.
    """
    Morgan_fpts = []
    for position, smiles in enumerate(data):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None; fail here with
            # the offending entry instead of an opaque error inside the
            # fingerprint call. Skipping the row silently would break the
            # one-row-per-input contract that callers rely on.
            raise ValueError(
                "Could not parse SMILES at position {}: {!r}".format(position, smiles)
            )
        fpts = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        Morgan_fpts.append(np.array(fpts))
    if not Morgan_fpts:
        # Keep the result two-dimensional so callers can still read shape[1]
        return np.empty((0, n_bits), dtype=int)
    return np.array(Morgan_fpts)

# Fingerprint every SMILES string and wrap the result in a DataFrame whose
# columns are named Col_0, Col_1, ...
Morgan_fpts = morgan_fpts(dd["Smiles"])
fingerprint_columns = list(map('Col_{}'.format, range(Morgan_fpts.shape[1])))
Morgan_fingerprints = pd.DataFrame(Morgan_fpts, columns=fingerprint_columns)
Morgan_fingerprints

In [None]:
# Save the fingerprint table to a CSV file (the row index is not written)
Morgan_fingerprints.to_csv("fingerprints.csv", index=False)

In [None]:
# Load the fingerprints back from the CSV file written above
fingerprints = pd.read_csv("fingerprints.csv")

In [None]:
# Merge fingerprints with bioactivity classes to form a single dataset.
# `fingerprints` was re-read from CSV, so it carries a fresh 0..n-1 index, while
# `essential_features` may still carry the row labels that survived dropna().
# Assigning the Series directly would align on those labels, putting classes on
# the wrong molecules and leaving NaN gaps, so the values are attached by
# position instead (pandas raises if the lengths differ).
class_labels = essential_features["Numerical class"].to_numpy()
fingerprints["Bioactivity"] = class_labels
whole_data = fingerprints
whole_data

In [None]:
# Save the combined fingerprints + class table to a CSV file (the row index is not written)
whole_data.to_csv("whole_data.csv", index=False)

In [None]:
# Drop every row that contains a null value (NaN) in any column
whole_data = whole_data.dropna()

In [None]:
# Separate the target column from the independent (fingerprint) features
target_column = "Bioactivity"
y = whole_data[target_column]
X = whole_data.drop(columns=[target_column])

In [None]:
# Preview the feature matrix
X

In [None]:
# Preview the target labels
y

In [None]:
#Model training using LogisticRegression
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Build the features and labels from the project dataset rather than a synthetic
# sample, so the score below describes the actual compounds. Deriving them here
# keeps this cell independent of whatever X and y were last assigned to.
X = whole_data.drop("Bioactivity", axis=1)
y = whole_data["Bioactivity"].astype(int)  # class labels as integers 0/1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)  # scaler statistics are learned from the training split only

pipe.score(X_test, y_test)  # test split is transformed with the training-split scaler

In [None]:
#Model training using StackingClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score

# Use the project fingerprints and activity labels rather than a synthetic
# sample. Deriving X and y here keeps this cell independent of whatever they
# were last assigned to.
X = whole_data.drop("Bioactivity", axis=1)
y = whole_data["Bioactivity"].astype(int)  # class labels as integers 0/1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train a standalone scaled Logistic Regression model
# (this pipeline is not one of the stacked estimators below)
lr_model = make_pipeline(StandardScaler(), LogisticRegression())
lr_model.fit(X_train, y_train)

# Define the base estimators with Logistic Regression and RandomForest
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('lr', LogisticRegression())
]

# Create a Stacking Classifier that combines Logistic Regression and RandomForest,
# with another Logistic Regression as the final estimator
stacking_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

# Fit the stacking classifier on the training data
stacking_clf.fit(X_train, y_train)

# Predict class labels for the held-out test split
y_pred = stacking_clf.predict(X_test)

# Calculate accuracy of the stacking classifier on the test split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Stacking Classifier:", accuracy)


In [None]:
# Show the predicted class labels for the test split
y_pred

In [None]:
#Model evaluation
#Import confusion matrix
from sklearn.metrics import confusion_matrix  

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Compute the confusion matrix from the true and predicted test labels
cm = confusion_matrix(y_test, y_pred)

# Create a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# Display the confusion matrix plot.
# The axes are created explicitly and passed in: without `ax`, disp.plot() makes
# its own figure, so a separately created plt.figure() would stay empty and its
# figsize would not apply to the matrix.
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title('Confusion Matrix for Stacking Classifier')
plt.show()


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score


In [None]:
# Threshold-based metrics are computed from the predicted class labels
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
balanced_acc = balanced_accuracy_score(y_test, y_pred)

# ROC AUC ranks examples by score, so it is given the predicted probability of
# the positive class (second column of predict_proba). With hard 0/1 labels it
# would collapse to the same value as balanced accuracy.
positive_class_scores = stacking_clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, positive_class_scores)

In [None]:
# Display the precision score
precision

In [None]:
# Display the recall score
recall

In [None]:
# Display the F1 score
f1

In [None]:
# Display the balanced accuracy score
balanced_acc

In [None]:
# Display the ROC AUC score
roc_auc