# SVM

Objective: Build a model to predict if a single compound is a "PPI modulator" or is not.

Data: ADME descriptors for 3 libraries.
    Libraries:
        AFRODB
        Biofacquim
        FDA
        PPI
        
        
    Endpoint: "PPI modulator" (Binary)
        1 -> PPI modulator
        0 -> Not PPI modulator
        
    Descriptors
        ADME descriptors:
            '#Aromatic heavy atoms'
            '#H-bond acceptors'
            '#H-bond donors'
            '#Heavy atoms'
             
Method: Support Vector Machine

## Import Libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#
import os
from sklearn.preprocessing import label_binarize

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score

from Functions_SVM import test_compound,test_compound_real_category

In [6]:
descriptors = ['HBA', 'HBD', 'RB', 'LogP', 'TPSA', 'MW', 'Heavy Atom', 'Ring Count', 'Fraction CSP3',]
#Falta generar los descriptores

In [7]:
#Difacquim computer root
root = "/home/barbara/Documents/DIFACQUIM/PPI_classifier/phase-1/Databases/"

In [17]:
class SVM:
    
    def __init__(self, input_file, target, descriptors):
        self.Data  = pd.read_csv(root+str(input_file))
        #Muestreo
        self.Data = pd.DataFrame.sample(self.Data, frac=0.05, replace=True,  random_state=1992, axis=None) 
        x = self.Data[self.Data["Library"] == "PPI"]
        print(x.Name)
        print(self.Data.PPI.unique())
        print("Libraries are: ", self.Data.Library.unique())
        #print("PPI modulator: ", self.Data[target].unique())
        print("Total compounds ", self.Data.shape[0])
        self.descriptors = descriptors
        self.target = target
        
    def train_model(self, fraction, kernel):
        """
        fraction: float, indicates test set composition
        kernel: str, customize kernel (see scikit learn documentation)
        """
        #Binarize the output
        y = np.array(self.Data[self.target])
        y = label_binarize(y, classes = ["No", "Yes"])
        y = np.reshape(y, len(self.Data[self.target]))
        print(y)
        X_train, X_test, y_train, y_test = train_test_split(self.Data[self.descriptors], y, test_size = fraction,random_state=1992)
        model = SVC(kernel = kernel)
        print(model)
        model.fit(X_train, y_train)
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.kernel = kernel
                
    def single_prediction(self, Library, Name, target):
        compound = test_compound(self.Data, Library, Name, self.descriptors)   
        result = test_compound_real_category(self.Data, Name, target)
        print("Evaluation of ", str(Name))
        print("Predicted activity value: ", str(self.model.predict(compound)))
        print("Real activity value", result)
    
    def metrics(self):
        y_test = self.y_test
        predictions = self.model.predict(self.X_test)
        print(accuracy_score(y_test,predictions))
        print(confusion_matrix(y_test,predictions))
        print(roc_auc_score(y_test, predictions))
        print(classification_report(y_test,predictions))
        print(roc_curve(y_test, predictions))
        #Hacer que imprima in data frame con los resultados

    def report(self, ref_output):
        Report = pd.DataFrame.from_dict(
            {"Method": "SVM",
            "Kernel": self.kernel,
            "descriptors": [self.descriptors]}
            )
        Report.to_csv("SVM_"+str(ref_output)+".csv", sep = ",")

In [18]:
a = SVM("Dataset.csv", "PPI", descriptors)

20365    1602
19511     679
19447     458
20482    1617
21055    1601
20777     370
19479     557
19828    1595
20519    1623
20273    1651
20644    1644
20700      39
19789    1491
20248    1289
19536     729
21078    1754
20327    1700
19319      40
19494     623
20846     633
19746    1391
19744    1383
20816     555
20850     646
21030    1437
20212     543
20656    1646
19420     383
20027    1107
20641    1644
         ... 
19911     439
19461     497
20626    1640
20942    1086
21072    1747
19778    1462
19472     530
19598     920
20857     684
19473     532
20178    1749
20440    1610
19944     654
19764    1433
20315    1689
19294    1732
19623     971
19385     285
20772     357
19812    1568
19738    1350
19621     964
20428    1608
20299    1673
20366    1602
19624     974
20836     603
19446     456
20167    1742
20920     965
Name: Name, Length: 88, dtype: object
['No' 'Yes']
Libraries are:  ['Epidatabase' 'FDA' 'PPI' 'AFRODB' 'BIOFACQUIM']
Total compounds  1054


In [28]:
a.train_model(0.3, 'rbf')

[0 0 0 ... 0 0 0]
10505     No
4093      No
2216      No
11533     No
16613     No
14290     No
17704     No
10376     No
20365    Yes
8712      No
8420      No
6198      No
12418     No
19511    Yes
1628      No
5983      No
12342     No
19447    Yes
16134     No
11716     No
11280     No
10761     No
4219      No
1918      No
10295     No
12150     No
11210     No
12796     No
20482    Yes
14598     No
        ... 
9470      No
8972      No
16775     No
15698     No
2122      No
17581     No
2108      No
9804      No
20836    Yes
10392     No
15587     No
19446    Yes
10105     No
13109     No
3183      No
3726      No
18566     No
5957      No
3165      No
13210     No
583       No
1135      No
11958     No
5967      No
20167    Yes
15808     No
20920    Yes
17130     No
3457      No
9029      No
Name: PPI, Length: 1054, dtype: object
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,



In [31]:
a.single_prediction("PPI", "1602", "PPI")

Evaluation of  1602
Predicted activity value:  [1 1]
Real activity value ['Yes', 'Yes']


In [32]:
a.metrics()

0.9274447949526814
[[292   1]
 [ 22   2]]
0.5399601820250284
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       293
           1       0.67      0.08      0.15        24

    accuracy                           0.93       317
   macro avg       0.80      0.54      0.56       317
weighted avg       0.91      0.93      0.90       317

(array([0.        , 0.00341297, 1.        ]), array([0.        , 0.08333333, 1.        ]), array([2, 1, 0]))


In [33]:
a.report("MyfirstSVM")

In [34]:
#Identify Numerical Data (Descriptors)
def numerical_descriptors(DataFrame):
    print(DataFrame.select_dtypes(np.number).columns)