In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score, f1_score, mean_squared_error
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, ElasticNet

In [4]:
df = pd.read_csv('bioactivity_data_all3_pubchem.csv')

**The dataset has 881 features and 1 target variable (pIC50)**

In [5]:
X = df.drop('pIC50', axis=1)
X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4690,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4691,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4692,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4693,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
Y = df.pIC50
Y

Unnamed: 0,pIC50
0,6.124939
1,7.000000
2,4.301030
3,6.522879
4,6.096910
...,...
4690,5.612610
4691,5.595166
4692,5.419075
4693,5.460924


**Filter the dataset using Variance threshold**

In [7]:
#variance threshold kept at 0, to remove features. This removes any features with 0 variance (all identical feature values)
selection = VarianceThreshold(threshold=0)
X = selection.fit_transform(X)

In [8]:
X.shape

(4695, 629)

# **Train-test split**

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [10]:
X_train.shape, Y_train.shape

((3756, 629), (3756,))

# **Fitting a Random forest model on the data**

In [14]:
np.random.seed(150)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, Y_train)
r2 = model.score(X_test, Y_test)
print(f"SVM R-squared: {r2}")
Y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(Y_test, Y_pred)
print(f"Mean Squared Error: {mse}")

SVM R-squared: 0.5409360332882199
Mean Squared Error: 1.0859327863689763


# **Fitting a SVM model on the data**

In [42]:
# Support Vector Regression (SVM)
np.random.seed(150)
svm_regressor = SVR()
svm_regressor.fit(X_train, Y_train)
r2_svm = svm_regressor.score(X_test, Y_test)
print(f"SVM R-squared: {r2_svm}")

Y_pred_svm = svm_regressor.predict(X_test)
mse_svm = mean_squared_error(Y_test, Y_pred_svm)
print(f"SVM Mean Squared Error: {mse_svm}")

SVM R-squared: 0.4598074173361356
SVM Mean Squared Error: 1.24176641559561


# **Fitting a ExtraTrees regressor model on the data**

In [13]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Assuming you have X_train, X_test, Y_train, Y_test from your previous code

# Create and train the Extra Trees Regressor
np.random.seed(150)  # Setting a random seed for reproducibility.
et_regressor = ExtraTreesRegressor(n_estimators=100)
et_regressor.fit(X_train, Y_train)

# Make predictions
Y_pred_et = et_regressor.predict(X_test)

# Evaluate the model
r2_et = r2_score(Y_test, Y_pred_et)
print(f"Extra Trees R-squared: {r2_et}")

mse_et = mean_squared_error(Y_test, Y_pred_et)
print(f"Extra Trees Mean Squared Error: {mse_et}")

Extra Trees R-squared: 0.26980590381556446
Extra Trees Mean Squared Error: 1.7273011322136353


# **Fitting a Linear regressor model on the data**

In [17]:

linear_regressor = LinearRegression()
linear_regressor.fit(X_train, Y_train)

Y_pred_linear = linear_regressor.predict(X_test)

r2_linear = r2_score(Y_test, Y_pred_linear)
print(f"Linear Regression R-squared: {r2_linear}")

mse_linear = mean_squared_error(Y_test, Y_pred_linear)
print(f"Linear Regression Mean Squared Error: {mse_linear}")


Linear Regression R-squared: 0.3402653092582044
Linear Regression Mean Squared Error: 1.5606268035219517


From the above set of commonly used regressor models, we can clearly see that **Random forest** model has the better peformance with a relatively high R2 value (>0.5) and relatively low MSE value (1.08)