## Bioactivity Multi-Layer Perception Prediction Model

### Importing Packages

In [8]:
import pandas as pd
import numpy as py
import sklearn
from sklearn.model_selection import train_test_split #Split the dataset into training and testing sets 
from sklearn.preprocessing import StandardScaler # Standardizing features by removing the mean
from sklearn.neural_network import MLPRegressor # Importing the Multi-layer Perception regressor  model
from sklearn.preprocessing import LabelEncoder # Encoding categorical labels into numerical values
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score # Used to evaluted precision of MLP model

### Importing Bioactivity CSV Dataset

In [9]:
# Set our dataset equal to ds variable
ds = pd.read_csv('coronavirus_bioactivity_data_3class_pIC50_pubchem_fp.csv')

# View first 5 lines of dataset
ds.head()

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.142668
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.026872
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.869666
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.882397
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.69897


### Feature Identification

In [10]:
# Setting X equal to the IC50 inhibitiion concentration
X = ds.drop('pIC50', axis=1)

y = ds.pIC50


### Data Split

In [11]:
# Dividing up our dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Feature Scaling

#### Ensures that all features will contribute equally between the different data points

In [12]:
scaler = StandardScaler()

# Fit the scaler on the X data and transform it
X = scaler.fit_transform(X)

### Building Model and Training

In [13]:
# Define a K-Fold cross-validator using 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize MLP regressor
mlp = MLPRegressor()

# Train the MLP regressor model
mlp.fit(X_train, y_train)

# Cross-validation using negative mean squared error scoring tool
neg_mean_squared_scores = cross_val_score(mlp, X, y, cv=kf, scoring='neg_mean_squared_error')

# Ccross-validation using negative mean absolute error scoring tool
neg_mean_absolute_scores = cross_val_score(mlp, X, y, cv=kf, scoring='neg_mean_absolute_error')

# Cross-validation using R-squared scoring tool
r2_scores = cross_val_score(mlp, X, y, cv=kf, scoring='r2')




### Evaluating MLP Model

In [14]:
# Display the mean values for the Negative Mean Squared Error over 5 folds
print("Mean Negative Mean Squared Error: ", abs(py.mean(neg_mean_squared_scores)))

# Display the mean values for the Negative Mean Absolute Error over 5 folds
print("Mean Negative Mean Absolute Error:", abs(py.mean(neg_mean_absolute_scores)))

# Display the mean values for the R-Squared over 5 folds
print("Mean R-Squared:", py.mean(r2_scores))

Mean Negative Mean Squared Error:  10.839988177295858
Mean Negative Mean Absolute Error: 2.455105602540909
Mean R-Squared: -3.534873598299913
