## Objective
### Build a machine learning model to predict biological activity (pIC50) against BRD4.

## Data: 
### Compounds with known biological activity against BRD4. 

## Assay: 
### Fluorescence Polarization 

## Endpoint: 
### pIC50 

## Variables: 
### Physicochemical descriptors

## Method: 
### Linear Multivariate Regression

#### Data information: http://www.ippidb.cdithem.fr

In [None]:
#Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Load data

In [None]:
#Load the compound table from a CSV file (path is relative to the working directory)
Data = pd.read_csv("data/Bromodomain_Histone.csv")

In [None]:
#Preview the first rows of the loaded table
Data.head()

In [None]:
#List the distinct values found in the "Test name" column
Data["Test name"].unique()

In [None]:
#List the distinct values found in the "Activity type" column
Data["Activity type"].unique()

### Exploratory Data Analysis

In [None]:
#Statistical description of the numerical columns
Data.describe()

In [None]:
#Column names of the loaded table
Data.columns

In [None]:
#Data type of every column (used to tell numerical columns from the rest)
Data.dtypes

In [None]:
#Preview only the float64/int64 columns, i.e. the numerical descriptors
Data.select_dtypes(include=['float64', 'int64']).head()

In [None]:
#Create a DataFrame holding only the numerical (float64/int64) columns
numerical_data  = Data.select_dtypes(include=['float64', 'int64'])
numerical_data.head(5)

In [None]:
#Drop the "Compound's ID" column so it is not part of the numerical descriptors
numerical_data = numerical_data.drop(["Compound's ID"], axis =1)
numerical_data.head()

In [None]:
#Remaining numerical columns after dropping the identifier
numerical_data.columns

In [None]:
#Correlation matrix of the numerical columns, drawn as a heatmap
sns.heatmap(numerical_data.corr())

In [None]:
#Scatter matrix of the activity against a few selected descriptors
from pandas.plotting import scatter_matrix
features=["Activity", 'Molecular weight', 'ALogP', 'Number of H-bond donor']
#Use a descriptive name: `_` is IPython's last-output variable and assigning
#to it shadows that feature for the rest of the session.
scatter_data = numerical_data[features]
#Call the function imported above instead of going through pd.plotting again.
scatter_matrix(scatter_data, figsize=(20,20), alpha=0.2)

In [None]:
#Plot activity (x axis) against lipophilic efficiency (y axis)
plt.scatter(Data["Activity"], Data["Lipophilic efficiency"])

In [None]:
#Create a one-column DataFrame that stores the target values
activity_column = Data["Activity"]
df_target = activity_column.to_frame(name="Activity")
df_target.head(5)

In [None]:
#Drop the target ("Activity") from the numerical data so only descriptors remain
numerical_data = numerical_data.drop(["Activity"], axis =1)
numerical_data.head()

## Machine Learning Model

### Linear multivariate regression

#### scikit-learn API
##### https://scikit-learn.org/stable/

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
#Split descriptors and target into train and test sets:
#test_size = 0.3 holds out 30% of the rows, random_state=42 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(numerical_data,df_target, test_size = 0.3, random_state=42)

In [None]:
#Preview the training descriptors
X_train.head()

In [None]:
#Preview the training target values
y_train.head()

In [None]:
#Import model
from sklearn.linear_model import LinearRegression

In [None]:
#Instantiate the linear regression model with default settings
lm = LinearRegression()

In [None]:
#Train the model on the training descriptors and target
lm.fit(X_train, y_train)

In [None]:
#Intercept of the fitted linear model
print(lm.intercept_)

In [None]:
#Fitted regression coefficients of the model
lm.coef_

In [None]:
#Tabulate the fitted regression coefficients, one row per descriptor.
#np.atleast_2d(...)[0] gives the first target's coefficients whether coef_ is
#2-D (model fitted on a 2-D target, as here) or 1-D (fitted on a 1-D target).
#The column label names them regression coefficients: they are the weights of
#the linear model, not correlation coefficients.
cdf = pd.DataFrame(data = np.atleast_2d(lm.coef_)[0],
                   index = X_train.columns,
                   columns=["Coeficiente de regresión"])

In [None]:
#Display the coefficient table
cdf

### Predictions

In [None]:
#Write a function that selects the descriptors of a specific compound

In [None]:
def test_compound(ID):
    """Return the descriptor row(s) of one compound.

    Rows of the global ``Data`` whose "Compound's ID" equals ``ID`` are
    selected and restricted to the columns currently present in the global
    ``numerical_data``. The result is a DataFrame; it is empty when no row
    matches.
    """
    id_matches = Data["Compound's ID"] == ID
    descriptor_columns = numerical_data.columns
    return Data.loc[id_matches, descriptor_columns]

In [None]:
#Look up the descriptors of a specific compound (ID 1603) and display them
test = test_compound(1603)
test

#### single prediction

In [None]:
#Predict the activity of the selected compound from its descriptors
lm.predict(test)

In [None]:
#Look up the real (measured) activity
#Write a function that selects the activity of a specific compound
def test_compound_Activity(ID):
    """Return the recorded "Activity" value(s) of one compound.

    Rows of the global ``Data`` whose "Compound's ID" equals ``ID`` are
    selected and their "Activity" column is returned as a Series; the Series
    is empty when no row matches.
    """
    id_matches = Data["Compound's ID"] == ID
    return Data.loc[id_matches, "Activity"]

In [None]:
#Recorded activity of compound 1603, to compare with the prediction
activity = test_compound_Activity(1603)
activity

In [None]:
def report(ID, test, activity):
    """Print the predicted and the recorded activity of one compound.

    Parameters
    ----------
    ID
        Compound identifier; used only in the heading line.
    test
        Descriptor row(s) passed to the global fitted model ``lm``. Only the
        first prediction is reported.
    activity
        Recorded activity: a scalar or a one-element Series/array.
    """
    # predict() returns a 2-D array when the model was fitted on a 2-D target
    # and a 1-D array otherwise; flattening makes both cases work.
    predicted = np.ravel(lm.predict(test))[0]
    # A one-element Series would print with its index, name and dtype; unwrap
    # it so only the value is shown. Anything else is printed unchanged.
    observed = np.ravel(activity)
    real_value = observed[0] if observed.size == 1 else activity
    print("Evaluation of compund ID", str(ID))
    print("Predicted activity value: ", str(predicted))
    print("Real activity value", str(real_value))

In [None]:
#Print predicted vs. recorded activity for compound 1603
report(1603, test, activity)

In [None]:
#Predict the whole test set; these predictions are used to compute the metrics
predictions = lm.predict(X_test)

In [None]:

#Observed test-set activity (x axis) against predicted activity (y axis)
plt.scatter(y_test, predictions)

In [None]:
#Residual plot: distribution of (observed - predicted) on the test set.
#Both operands are flattened to 1-D so the subtraction is element-wise
#whatever the shape of the target and of the predictions.
residuals = np.ravel(y_test) - np.ravel(predictions)
#sns.distplot is deprecated; histplot with a KDE overlay on a density scale
#is its documented replacement.
sns.histplot(residuals, kde=True, stat="density")

### Evaluate model
#### Note: Regression Metrics
#### scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

In [None]:
from sklearn import metrics
from sklearn.metrics import r2_score

In [None]:
#Compute R^2 (coefficient of determination) on the test set
print("R^2:",r2_score(y_test, predictions))

In [None]:
#Compute the error metrics on the test set; RMSE is the square root of the MSE
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)