## Bioactivity KDTRee Prediction Model

### Importing Packages

In [7]:
import pandas as pd
import numpy as py
import sklearn
from sklearn.model_selection import train_test_split #Split the dataset into training and testing sets 
from sklearn.preprocessing import StandardScaler # Standardizing features by removing the mean
from sklearn.neighbors import KNeighborsRegressor # K-Nearest Neighbors model for KDTree
from sklearn.preprocessing import LabelEncoder # Encoding categorical labels into numerical values
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score # Used to evaluted precision of kdtree model

### Importing Bioactivity CSV Dataset

In [8]:
# Set our dataset equal to ds variable
ds = pd.read_csv('coronavirus_bioactivity_data_3class_pIC50_pubchem_fp.csv')

# View first 5 lines of dataset
ds.head()

### Feature Identification

In [9]:
# Setting X equal to the IC50 inhibitiion concentration
X = ds.drop('pIC50', axis=1)

# Setting Y equal to all other features
y = ds.pIC50


### Data Split

In [10]:
# Dividing up our dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Feature Scaling

#### Ensures that all features will contribute equally between the different data points

In [11]:
scaler = StandardScaler()

# Fit the scaler on the X data and transform it
X = scaler.fit_transform(X)

### Building Model and Training

In [12]:
# Define a K-Fold cross-validator using 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# kd tree boosting model using k-nearest regressor
kdtree = KNeighborsRegressor(n_neighbors=5, algorithm='kd_tree')

# Cross-validation using negative mean squared error scoring tool
neg_mean_squared_scores = cross_val_score(kdtree, X, y, cv=kf, scoring='neg_mean_squared_error')

# Ccross-validation using negative mean absolute error scoring tool
neg_mean_absolute_scores = cross_val_score(kdtree, X, y, cv=kf, scoring='neg_mean_absolute_error')

# Cross-validation using R-squared scoring tool
r2_scores = cross_val_score(kdtree, X, y, cv=kf, scoring='r2')


### Evaluating KDTree Model

In [13]:
# Display the mean values for the Negative Mean Squared Error over 5 folds
print("Mean Negative Mean Squared Error: ", abs(py.mean(neg_mean_squared_scores)))

# Display the mean values for the Negative Mean Absolute Error over 5 folds
print("Mean Negative Mean Absolute Error:", abs(py.mean(neg_mean_absolute_scores)))

# Display the mean values for the R-Squared over 5 folds
print("Mean R-Squared:", py.mean(r2_scores))

Mean Negative Mean Squared Error:  1.8385380319151061
Mean Negative Mean Absolute Error: 1.11250571846247
Mean R-Squared: 0.38660569782501375
