# SVM

In [None]:
Objective: Build a model to predict "Drug Like" properties of a 
            single compound.

Data: ADME descriptors for 3 libraries.
    Libraries:
        AFRODB
        Biofacquim
        FDA
        
    Endpoint: "Drug Like" (Binary)
        1 -> Drug Like
        0 -> Not Drug Like
        
    Descriptors
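        ADME descriptors (examples; the model uses every numeric column except the correlated ones removed during the analysis):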
            '#Aromatic heavy atoms'
            '#H-bond acceptors'
            '#H-bond donors'
            '#Heavy atoms'
            '#Rotatable bonds'
            'Ali Log S'
            'Ali Solubility (mg/ml)'   
Method: Support Vector Machine

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#
import os

In [5]:
#Show the current working directory. os.getcwd() does not depend on IPython's
#automagic, so the cell also works when the notebook is exported as a script.
os.getcwd()

'/home/babs/Documents/DIFACQUIM/CABANA_CHEMOINFORMATICS/Day_4/Supervised_Learning_Classifications/SVM'

In [6]:
"""Open Database"""
#Dataset location, relative to the notebook's working directory (the SVM
#folder). Edit DATA_PATH if the CSV is stored elsewhere.
DATA_PATH = os.path.join("Data", "Data_SVM.csv")
Data = pd.read_csv(DATA_PATH, sep = ",")
#Drop the "Unnamed: 0" column read from the CSV so it is not kept as a descriptor
Data = Data.drop("Unnamed: 0", axis = 1)
Data.head()

Unnamed: 0,#Aromatic heavy atoms,#H-bond acceptors,#H-bond donors,#Heavy atoms,#Rotatable bonds,Ali Class,Ali Log S,Ali Solubility (mg/ml),Ali Solubility (mol/l),BBB permeant,...,Silicos-IT Solubility (mg/ml),Silicos-IT Solubility (mol/l),Silicos-IT class,Synthetic Accessibility,TPSA,Veber #violations,WLOGP,XLOGP3,iLOGP,log Kp (cm/s)
0,0,7,3,85,52,Insoluble,-32.61,2.96e-30,2.4800000000000002e-33,No,...,2.92e-18,2.44e-21,Insoluble,10.0,105.45,1,22.41,29.72,15.27,7.52
1,0,27,14,83,19,Moderately soluble,-5.13,0.00884,7.4e-06,No,...,12600000.0,10500.0,Soluble,10.0,418.89,2,-3.54,-3.1,5.55,-15.79
2,0,26,14,83,17,Poorly soluble,-6.01,0.00117,9.8e-07,No,...,3630000.0,3040.0,Soluble,10.0,393.98,2,-2.66,-1.75,5.8,-14.83
3,6,14,3,79,63,Insoluble,-27.92,1.35e-25,1.21e-28,No,...,3.87e-16,3.46e-19,Insoluble,9.97,201.42,2,16.88,23.26,12.7,3.4
4,6,8,3,67,54,Insoluble,-26.06,8.14e-24,8.630000000000001e-27,No,...,3e-16,3.1799999999999996e-19,Insoluble,8.97,122.52,1,16.91,23.07,12.32,4.32


In [7]:
#List every column available in the dataset
Data.columns

Index(['#Aromatic heavy atoms', '#H-bond acceptors', '#H-bond donors',
       '#Heavy atoms', '#Rotatable bonds', 'Ali Class', 'Ali Log S',
       'Ali Solubility (mg/ml)', 'Ali Solubility (mol/l)', 'BBB permeant',
       'Bioavailability Score', 'Brenk #alerts', 'CYP1A2 inhibitor',
       'CYP2C19 inhibitor', 'CYP2C9 inhibitor', 'CYP2D6 inhibitor',
       'CYP3A4 inhibitor', 'Canonical SMILES', 'Consensus Log P', 'Drug Like',
       'ESOL Class', 'ESOL Log S', 'ESOL Solubility (mg/ml)',
       'ESOL Solubility (mol/l)', 'Egan #violations', 'Formula',
       'Fraction Csp3', 'GI absorption', 'Ghose #violations', 'ID_Database',
       'Input Smiles', 'Leadlikeness #violations', 'Library',
       'Lipinski #violations', 'MLOGP', 'MR', 'MW', 'Molecule',
       'Muegge #violations', 'Name', 'PAINS #alerts', 'Pgp substrate',
       'Silicos-IT Log P', 'Silicos-IT LogSw', 'Silicos-IT Solubility (mg/ml)',
       'Silicos-IT Solubility (mol/l)', 'Silicos-IT class',
       'Synthetic Accessibil

In [8]:
#List the distinct values found in the "Library" column
Data["Library"].unique()

array(['Afro', 'Biofacquim', 'FDA'], dtype=object)

In [10]:
#Identify the distinct values taken by the target column
Data["Drug Like"].unique()

array([0, 1])

Note: The "Drug Like" column is the target (endpoint) that the model predicts:
    1 -> Drug Like
    0 -> Not Drug Like

## Exploratory Data Analysis

In [None]:
"""Plot a Descriptor"""
#Box plot of the "MW" column, one box per library
sns.boxplot(data=Data, x="Library", y="MW")

In [None]:
#Names of the numeric columns (candidate descriptors)
Data.select_dtypes(include=np.number).columns

In [None]:
#Numeric columns of the dataset, one per line. The target ('Drug Like') is
#still part of the list at this point.
feature_names = [
    '#Aromatic heavy atoms',
    '#H-bond acceptors',
    '#H-bond donors',
    '#Heavy atoms',
    '#Rotatable bonds',
    'Ali Log S',
    'Ali Solubility (mg/ml)',
    'Ali Solubility (mol/l)',
    'Bioavailability Score',
    'Brenk #alerts',
    'Consensus Log P',
    'Drug Like',
    'ESOL Log S',
    'ESOL Solubility (mg/ml)',
    'ESOL Solubility (mol/l)',
    'Egan #violations',
    'Fraction Csp3',
    'Ghose #violations',
    'Leadlikeness #violations',
    'Lipinski #violations',
    'MLOGP',
    'MR',
    'MW',
    'Muegge #violations',
    'PAINS #alerts',
    'Silicos-IT Log P',
    'Silicos-IT LogSw',
    'Silicos-IT Solubility (mg/ml)',
    'Silicos-IT Solubility (mol/l)',
    'Synthetic Accessibility',
    'TPSA',
    'Veber #violations',
    'WLOGP',
    'XLOGP3',
    'iLOGP',
    'log Kp (cm/s)',
]

In [None]:
#Remove the target column so it is never used as a predictor.
#Rebuilding the list (instead of calling list.remove) keeps the cell safe to
#re-run: list.remove raises ValueError once 'Drug Like' is already gone.
feature_names = [name for name in feature_names if name != 'Drug Like']
feature_names

In [None]:
#Collect the numerical descriptors into a new DataFrame
df_feat = Data.loc[:, feature_names]
df_feat.head()

In [None]:
#Summary statistics for each descriptor column
df_feat.describe()

In [None]:
#Pairwise correlation between descriptors (Pearson, the pandas default)
Correlation = df_feat.corr(method="pearson")
Correlation.head()

In [None]:
#Annotated heatmap of the descriptor correlations.
#`annot` belongs to sns.heatmap; DataFrame.corr() has no `annotate` parameter,
#so the previous call raised a TypeError before anything was drawn.
plt.figure(figsize=(24, 20))  #large canvas so the per-cell values stay legible
sns.heatmap(df_feat.corr(), annot=True, fmt=".2f")
plt.savefig("correlacion_inicial.png")

In [None]:
#Delete correlated variables (Avoid Overfitting)
#The descriptors to discard are declared once and filtered out in a single
#pass. Unlike repeated list.remove calls, this does not raise ValueError when
#the cell is executed again, and it keeps the remaining names in order.
correlated_features = {
    'XLOGP3',
    'iLOGP',
    'log Kp (cm/s)',
    'Silicos-IT LogSw',
    'Ali Solubility (mol/l)',
    'Ali Solubility (mg/ml)',
    'Consensus Log P',
    'ESOL Solubility (mg/ml)',
}
feature_names = [name for name in feature_names if name not in correlated_features]
print(feature_names)

In [None]:
#Rebuild the descriptor DataFrame without the correlated columns
df_feat = Data.loc[:, feature_names]

In [None]:
#Correlation matrix of the reduced descriptor set, drawn as a heatmap and
#written to disk
Correlation = df_feat.corr()
sns.heatmap(Correlation)
plt.savefig("correlacion_final.png")

In [None]:
#Keep the target in its own single-column DataFrame
df_target = Data['Drug Like'].to_frame(name='Drug Like')

# Machine Learning Model

## SVM

In [None]:
#Train Test Split
from sklearn.model_selection import train_test_split

In [None]:
#Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
target_values = np.ravel(df_target)  #flatten the one-column frame to a 1-D array
X_train, X_test, y_train, y_test = train_test_split(
    df_feat,
    target_values,
    test_size=0.30,
    random_state=101,
)

In [None]:
#Train the Support Vector Classifier
from sklearn.svm import SVC

In [None]:
#Assign Model
#SVMs are not scale invariant and the descriptors span very different
#magnitudes, so each feature is standardised before reaching the classifier.
#The pipeline exposes the same fit/predict interface as a bare SVC and applies
#the scaling learned on the training data to anything passed to predict().
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(StandardScaler(), SVC())

In [None]:
#Train the model on the training split
model.fit(X_train,y_train)

## Predictions

In [None]:
Now let's predict whether a molecule has "Drug Like" properties
using the trained model.

In [None]:
#Write a function to select descriptors for a single compound
def test_compound(Library, Name):
    FDA = Data[Data["Library"]== Library]
    test = FDA[FDA["Name"]== Name]
    test = test[feature_names]
    #print(test.head())
    return test

In [None]:
#Pick the compound to evaluate; uncomment one of the alternatives to try it
#test = test_compound("FDA", "Acetaminophen")
#test = test_compound("FDA", "Ambroxol")
test = test_compound("Biofacquim", "Purgic_acid_A")

In [None]:
#Visualize the descriptors of the selected compound
test 

In [None]:
#Predict the class of the selected compound (the `test` variable)
model.predict(test)

## Evaluate the model

In [None]:
#Predict the class of every row in the held-out test split
predictions = model.predict(X_test) 

In [None]:
#import metrics
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
#Confusion matrix of true labels against predicted labels
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)

In [None]:
#Text summary of the classification metrics on the test split
report = classification_report(y_test, predictions)
print(report)