We will be comparing several ML algorithms for building regression models of acetylcholinesterase inhibitors.

## Import libraries

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor

## Load the dataset

In [None]:
# Read the bioactivity table from CSV into a DataFrame.
# Judging by the file name it holds pIC50 values plus PubChem fingerprint
# descriptors; the path is relative to the notebook's working directory.
csv_path = "acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv"
df = pd.read_csv(csv_path)

In [None]:
# Target: the pIC50 column. Features: every remaining column of df.
y = df["pIC50"]
x = df.drop(columns="pIC50")

## Data pre-processing

In [None]:
# Examine x dimension: (number of rows, number of feature columns)
# of the feature matrix before any feature filtering is applied.
x.shape

In [None]:
# Drop near-constant features.
# The cutoff p * (1 - p) with p = 0.8 is the variance of a Bernoulli variable
# that takes the same value in 80% of samples, so (assuming the features are
# binary fingerprint bits - TODO confirm) any column that is 0 or 1 in more
# than 80% of the rows is removed.
from sklearn.feature_selection import VarianceThreshold

selection = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
selection.fit(x)
# transform() returns only the retained columns; x is rebound to that result.
x = selection.transform(x)
x.shape

In [None]:
# Perform data splitting using 80/20 ratio (test_size=0.2 holds out 20%).
# A fixed random_state makes the shuffle reproducible, so the score tables and
# plots below do not change from one run of the notebook to the next.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## Compare ML algorithms

In [None]:
# Define and build the LazyRegressor, then benchmark its regressors.
#
# LazyRegressor.fit(X_train, X_test, y_train, y_test) fits each model on the
# first/third arguments and scores it on the second/fourth. It returns
# (scores, predictions) - NOT (train_scores, test_scores) - so unpacking a
# single call into `train, test` does not yield a training-set table.
# To get one table per split, run the benchmark twice and keep the score
# table (element 0) from each call:
#   * train: models scored on the data they were fitted on (80% subset)
#   * test:  models scored on the held-out data (20% subset)
# This doubles the run time of the cell.
clf = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
train = clf.fit(x_train, x_train, y_train, y_train)[0]
test = clf.fit(x_train, x_test, y_train, y_test)[0]

In [None]:
# Display the `train` score table produced by the LazyRegressor cell above
# (its index and the "R-Squared", "RMSE" and "Time Taken" columns feed the plots below).
# NOTE(review): which data split these scores describe depends on the
# evaluation arguments passed to clf.fit() when `train` was built - confirm
# against that cell before reading this as training-set performance.
train

In [None]:
# Performance table of the test set (20% subset):
# display the `test` table produced by the LazyRegressor cell above.
test

## Data visualization of model performance

In [None]:
# Horizontal bar chart of R-squared: one bar per row of `train`,
# labelled on the y-axis by the table's index.
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE: negative R-squared values are not clamped to 0 here; because the
# x-axis is limited to [0, 1], such bars extend out of view to the left.

# Tall, narrow canvas so every index label gets its own line.
plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=train, x="R-Squared", y=train.index)
ax.set(xlim=(0, 1))

In [None]:
# Horizontal bar chart of RMSE: one bar per row of `train`,
# labelled on the y-axis by the table's index.
import matplotlib.pyplot as plt
import seaborn as sns

# Tall, narrow canvas so every index label gets its own line.
plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=train, x="RMSE", y=train.index)
# Fixed 0-10 x-range; bars longer than 10 are cut off at the right edge.
ax.set(xlim=(0, 10))

In [None]:
# Horizontal bar chart of the "Time Taken" column: one bar per row of
# `train`, labelled on the y-axis by the table's index.
import matplotlib.pyplot as plt
import seaborn as sns

# Tall, narrow canvas so every index label gets its own line.
plt.figure(figsize=(5, 10))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=train, x="Time Taken", y=train.index)
# Fixed 0-10 x-range; bars longer than 10 are cut off at the right edge.
ax.set(xlim=(0, 10))