# SVM

Objective: Build a model to predict if a single compound is a "PPI modulator" or is not.

Data: ADME descriptors for 3 libraries.
    Libraries:
        AFRODB
        Biofacquim
        FDA
        PPI
        
        
    Endpoint: "PPI modulator" (Binary)
        1 -> PPI modulator
        0 -> Not PPI modulator
        
    Descriptors
        ADME descriptors:
            '#Aromatic heavy atoms'
            '#H-bond acceptors'
            '#H-bond donors'
            '#Heavy atoms'
             
Method: Support Vector Machine

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#
import os
from sklearn.preprocessing import label_binarize

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score

from Functions_SVM import test_compound,test_compound_real_category

In [2]:
descriptors = ['HBA', 'HBD', 'RB', 'LogP', 'TPSA', 'MW', 'Heavy Atom', 'Ring Count', 'Fraction CSP3',]
#Falta generar los descriptores

In [3]:
#Difacquim computer root
root = "/home/barbara/Documents/DIFACQUIM/PPI_classifier/phase-1/Databases/"

In [27]:
class SVM:
    
    def __init__(self, input_file, target, descriptors):
        self.Data  = pd.read_csv(root+str(input_file))
        #Muestreo
        self.Data = pd.DataFrame.sample(self.Data, frac=0.03, replace=True,  random_state=1992, axis=None) 
        x = self.Data[self.Data["Library"]== "PPI"]
        print(x.Name)
        print(self.Data.PPI.unique())
        print("Libraries are: ", self.Data.Library.unique())
        #print("PPI modulator: ", self.Data[target].unique())
        print("Total compounds ", self.Data.shape[0])
        self.descriptors = descriptors
        self.target = target
        
    def train_model(self, fraction, kernel):
        """
        fraction: float, indicates test set composition
        kernel: str, customize kernel (see scikit learn documentation)
        """
        #Binarize the output
        y = np.revels(label_binarize(self.Data[self.target], classes=["No", "Yes"]))
        #y = pd.DataFrame(data = label_binarize(self.Data[self.target], classes=["No", "Yes"]), columns = ["Target"])
        print(y.Target.unique())
        X_train, X_test, y_train, y_test = train_test_split(self.Data[self.descriptors], y, test_size = fraction, random_state=1992)
        model = SVC(kernel = kernel)
        print(model)
        #print(len(X_train), len(y_train))
        model.fit(X_train, y_train)
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.kernel = kernel
        print(pd.DataFrame.from_dict({
            "Categoria Real": [self.Data.PPI],
            "Categoria Binaria": y]
            }
            ))
        
    def single_prediction(self, Library, Name, target):
        compound = test_compound(self.Data, Library, Name, self.descriptors)   
        result = test_compound_real_category(self.Data, Name, target)
        print("Evaluation of ", str(Name))
        print("Predicted activity value: ", str(self.model.predict(compound)))
        print("Real activity value", result)
    
    def metrics(self):
        y_test = self.y_test
        predictions = self.model.predict(self.X_test)
        print(accuracy_score(y_test,predictions))
        print(confusion_matrix(y_test,predictions))
        print(roc_auc_score(y_test, predictions))
        print(classification_report(y_test,predictions))
        print(roc_curve(y_test, predictions))
        #Hacer que imprima in data frame con los resultados

    def report(self, ref_output):
        Report = pd.DataFrame.from_dict(
            {"Method": "SVM",
            "Kernel": self.kernel,
            "descriptors": [self.descriptors]}
            )
        Report.to_csv("SVM_"+str(ref_output)+".csv", sep = ",")

SyntaxError: invalid syntax (<ipython-input-27-552b730f27ca>, line 36)

In [28]:
a = SVM("Dataset.csv", "PPI", descriptors)

20365    1602
19511     679
19447     458
20482    1617
21055    1601
20777     370
19479     557
19828    1595
20519    1623
20273    1651
20644    1644
20700      39
19789    1491
20248    1289
19536     729
21078    1754
20327    1700
19319      40
19494     623
20846     633
19746    1391
19744    1383
20816     555
20850     646
21030    1437
20212     543
20656    1646
19420     383
20027    1107
20641    1644
20063    1318
20910     906
19226     813
19408     356
20588    1634
19315      34
20549    1628
20446    1611
20263     408
20101    1503
20593    1635
20798     465
20058    1287
20214     549
19388     303
19773    1445
20213     548
19902     351
19615     957
21080    1756
20980    1259
20964    1173
Name: Name, dtype: object
['No' 'Yes']
Libraries are:  ['Epidatabase' 'FDA' 'PPI' 'AFRODB' 'BIOFACQUIM']
Total compounds  633


In [29]:
a.train_model(0.3, 'linear')

[0 1]
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


  y = column_or_1d(y, warn=True)


                                      Categoria Real  \
0  10505     No
4093      No
2216      No
11533  ...   

                                   Categoria Binaria  
0  0      0
1      0
2      0
3      0
4      0
5...  


In [30]:
a.single_prediction("PPI", "679", "PPI")

Evaluation of  679
Predicted activity value:  [0]
Real activity value ['Yes']


In [31]:
a.metrics()

0.8789473684210526
[[167   0]
 [ 23   0]]
0.5
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       167
           1       0.00      0.00      0.00        23

    accuracy                           0.88       190
   macro avg       0.44      0.50      0.47       190
weighted avg       0.77      0.88      0.82       190

(array([0., 1.]), array([0., 1.]), array([1, 0]))


  'precision', 'predicted', average, warn_for)


In [26]:
a.report("MyfirstSVM")

In [10]:
#Identify Numerical Data (Descriptors)
def numerical_descriptors(DataFrame):
    print(DataFrame.select_dtypes(np.number).columns)