We will be building a regression model of acetylcholinesterase inhibitors using the random forest algorithm.

### Import libraries

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

### Load the dataset

In [None]:
# Read the bioactivity table (fingerprint columns plus a pIC50 column,
# judging by the file name and the columns used below) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv",
)

In [None]:
# Bare expression: in a notebook cell this renders a preview of the loaded table.
df

### Input features

The acetylcholinesterase dataset contains 881 input features and 1 output variable (pIC50 values).

In [None]:
# Feature matrix: every column of the table except the pIC50 target.
x = df.drop(columns=["pIC50"])

In [None]:
# Bare expression: in a notebook cell this renders a preview of the feature matrix.
x

### Output features

In [None]:
# Target vector: the pIC50 column of the table.
y = df["pIC50"]

In [None]:
# Bare expression: in a notebook cell this renders a preview of the target values.
y

### Examine the data dimension

In [None]:
# (rows, columns) of the feature matrix before any low-variance filtering.
x.shape

In [None]:
# Shape of the target vector; its length should match the row count of x.
y.shape

### Remove low variance features

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Drop features whose variance does not exceed 0.8 * (1 - 0.8).
# For 0/1 features that threshold is the variance of a Bernoulli variable
# with p = 0.8 (assumes the fingerprint columns are binary -- TODO confirm).
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split further down.
selection = VarianceThreshold(threshold=0.8 * (1 - 0.8))
selection.fit(x)
# x is rebound to the reduced feature matrix returned by the selector.
x = selection.transform(x)

In [None]:
# (rows, columns) of the feature matrix after the VarianceThreshold filter.
x.shape

### Data split (80/20 ratio)

In [None]:
# Hold out 20% of the samples for testing and train on the remaining 80%.
# random_state pins the shuffle so the partition (and therefore the reported
# score) is reproducible across runs; the notebook's only np.random.seed
# call sits in a later cell, after this split has already been drawn.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=100
)

In [None]:
# Shapes of the training features and training targets, shown as a tuple.
x_train.shape, y_train.shape

In [None]:
# Shapes of the held-out test features and test targets, shown as a tuple.
x_test.shape, y_test.shape

### Building a regression model using random forest

In [None]:
import numpy as np

# Seed NumPy's global RNG so the forest built below is repeatable.
np.random.seed(100)

# Train a 100-tree random forest on the training split; fit() returns the
# estimator itself, so construction and training can be chained.
model = RandomForestRegressor(n_estimators=100).fit(x_train, y_train)

# Score the fitted model on the held-out split (stored as r2).
r2 = model.score(x_test, y_test)

r2

In [None]:
# Predicted pIC50 for every sample in the held-out test split.
y_pred = model.predict(X=x_test)

### Scatter plot of experimental vs predicted pIC50 values

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# White background with seaborn's colour shorthand codes enabled.
sns.set(color_codes=True)
sns.set_style("white")

# Experimental values on the x-axis, predictions on the y-axis, with a fitted
# regression line. The vectors are passed by keyword because recent seaborn
# releases no longer accept them positionally.
ax = sns.regplot(x=y_test, y=y_pred, scatter_kws={"alpha": 0.4})
ax.set_xlabel("Experimental pIC50", fontsize="large", fontweight="bold")
ax.set_ylabel("Predicted pIC50", fontsize="large", fontweight="bold")
# Same range on both axes, on a square figure, so the two scales match.
ax.set_xlim(0, 12)
ax.set_ylim(0, 12)
ax.figure.set_size_inches(5, 5)

# Call show(); the bare attribute reference `plt.show` does nothing.
plt.show()