In [2]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
Collecting matplotlib!=3.6.1,>=3.1
  Downloading matplotlib-3.6.3-cp310-cp310-win_amd64.whl (7.2 MB)
Collecting cycler>=0.10
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting contourpy>=1.0.1
  Downloading contourpy-1.0.7-cp310-cp310-win_amd64.whl (162 kB)
Collecting fonttools>=4.22.0
  Downloading fonttools-4.38.0-py3-none-any.whl (965 kB)
Collecting pillow>=6.2.0
  Downloading Pillow-9.4.0-cp310-cp310-win_amd64.whl (2.5 MB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.4.4-cp310-cp310-win_amd64.whl (55 kB)
Installing collected packages: pillow, kiwisolver, fonttools, cycler, contourpy, matplotlib, seaborn
Successfully installed contourpy-1.0.7 cycler-0.11.0 fonttools-4.38.0 kiwisolver-1.4.4 matplotlib-3.6.3 pillow-9.4.0 seaborn-0.12.2
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Ervin\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.0-cp310-cp310-win_amd64.whl (8.2 MB)
Collecting scipy>=1.3.2
  Downloading scipy-1.10.0-cp310-cp310-win_amd64.whl (42.5 MB)
Collecting joblib>=1.1.1Note: you may need to restart the kernel to use updated packages.
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0


You should consider upgrading via the 'c:\Users\Ervin\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.



  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.0 scipy-1.10.0 threadpoolctl-3.1.0


In [2]:
#import libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
#Lipinski descriptors = set of simple molecular descriptors - quick overview of drop-like properties for molecule (global features)
#Pubchem fingerprints = describing local features of molecule - each molecule described by their atomic structure

In [4]:
#load our dataset
df = pd.read_csv('monkeypox_bioactivity_data_3class_pIC50_pubchem_fp.csv')

## **3. Input features**
The ***Acetylcholinesterase*** data set contains 881 input features and 1 output variable (pIC50 values).

In [5]:
#input features (X)
X = df.drop('pIC50', axis = 1)
X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#output features (Y)
Y=df.pIC50
Y

0     4.318759
1     4.657577
2     4.267606
3     3.455932
4     3.301030
5     3.522879
6     3.838632
7     3.119186
8     3.296709
9     6.494850
10    6.494850
11    6.494850
Name: pIC50, dtype: float64

In [7]:
X.shape
#12 compounds, 881 fingerprints)

(12, 881)

In [8]:
Y.shape

(12,)

In [9]:
#removing low variance features
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(0.8 * (1- 0.8)))
X = selection.fit_transform(X)

In [10]:
X.shape
#93 fingerprints remaining

(12, 93)

In [11]:
#splitting data (4:1 ratio)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
X_train.shape, Y_train.shape

((9, 93), (9,))

In [12]:
X_test.shape, Y_test.shape

((3, 93), (3,))

## **5. Building a Regression Model using Random Forest**

In [13]:
#Build regression model using random forest
import numpy as np
np.random.seed(100)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, Y_train)
r2 = model.score(X_test, Y_test)
r2

0.8972645071642512

In [14]:
#make prediction
#remember the scores we get are the pIC50, which is the half-maximal inhibitory concentration - (how much drug is needed to inhibit a biological process by half)
Y_pred = model.predict(X_test)
Y_pred

array([4.04342782, 6.33052791, 3.65281346])

In [16]:
Y_pred

array([4.04342782, 6.33052791, 3.65281346])

In [17]:
Y_test2 = Y_test.to_numpy()
Y_test2

array([4.65757732, 6.49485002, 3.30103   ])

## **Scatter Plot of Experimental vs Predicted pIC50 Values**

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

ax = sns.regplot(Y_test2, Y_pred, scatter_kws={'alpha':0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set_xlim(0, 12)
ax.set_ylim(0, 12)
ax.figure.set_size_inches(5, 5)
plt.show

TypeError: regplot() takes from 0 to 1 positional arguments but 2 positional arguments (and 1 keyword-only argument) were given