In [1]:
from itertools import islice
import os
import numpy as np
from sklearn import cross_validation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# Parse the CSV into a list of float rows (one inner list per data line).
data = []
with open('../data/original.csv') as f:
    # islice(f, 1, None) skips the first line of the file (the header row).
    for line in islice(f, 1, None):
        line = line.strip()
        if not line:
            # A blank line (e.g. an extra trailing newline) would otherwise
            # append an empty row and make the rows ragged when they are
            # later converted to a numpy array.
            continue
        # Drop the first two columns and parse every remaining field as float.
        # NOTE(review): the first two columns are presumably non-numeric
        # identifiers -- confirm against the data file.
        data.append([float(field) for field in line.split(',')[2:]])

In [3]:
# Turn the parsed rows into a 2-D array, then split it column-wise:
# column 0 is the target, columns 1..3 are the three feature columns.
data = np.array(data)
y, x = data[:, 0], data[:, 1:4]

In [4]:
# Model 1 inputs: the first two feature columns, and a binary target that is
# 1 where y >= 0.3 and -1 otherwise (kept as a plain Python list of ints).
x1 = x[:, :2]
y1 = np.where(y >= 0.3, 1, -1).tolist()

In [5]:
# Model 1: fit an LDA classifier on the two-column feature matrix x1 and
# print its linear decision function, LOOCV accuracy and training metrics.
lda = LinearDiscriminantAnalysis(solver='svd', store_covariance=True, tol=0.01)
lda.fit(x1, y1)
print('Model 1:')
print('-Equation:({})*TPSA+({})*logP+({})'.format(
    lda.coef_[0][0], lda.coef_[0][1], lda.intercept_[0]))

# Predict on the same data the model was fitted on: everything labelled
# "Training" below is a resubstitution metric, not a held-out estimate.
y1_pred = lda.predict(x1)
c = classification_report(y1, y1_pred)
# Pin the label order so the matrix is always 2x2 with index 0 = class -1 and
# index 1 = class 1. Without `labels`, the shape depends on which classes
# happen to occur, and the conf[1, 1] lookups below could raise IndexError.
conf = confusion_matrix(y1, y1_pred, labels=[-1, 1])

# Leave-one-out cross-validation: one fold per sample, score averaged below.
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; on upgrade, switch to sklearn.model_selection
# (LeaveOneOut() takes no size argument there).
loo = cross_validation.LeaveOneOut(len(x1))
score = cross_validation.cross_val_score(lda, x1, y1, cv=loo, n_jobs=-1)
print('-LOOCV accuracy:{}'.format(np.mean(score)))

print('-Training classification report:')
print(c)
print('-Confusion matrix:')
print(conf)
# Rows are true classes, columns are predicted classes.
print('-Training accuracy:', end='')
print((conf[0, 0] + conf[1, 1]) / np.sum(conf))
# Sensitivity: fraction of true class-1 samples predicted as class 1.
print('-Training sensitivity:', end='')
print(conf[1, 1] / (conf[1, 0] + conf[1, 1]))
# Specificity: fraction of true class -1 samples predicted as class -1.
print('-Training specificity:', end='')
print(conf[0, 0] / (conf[0, 0] + conf[0, 1]))

Model 1:
-Equation:(-0.027690999766363304)*TPSA+(0.5159492707621883)*logP+(-0.6878067844391902)
-LOOCV accuracy:0.7947882736156352
-Training classification report:
             precision    recall  f1-score   support

         -1       0.83      0.86      0.85       197
          1       0.74      0.69      0.71       110

avg / total       0.80      0.80      0.80       307

-Confusion matrix:
[[170  27]
 [ 34  76]]
-Training accuracy:0.801302931596
-Training sensitivity:0.690909090909
-Training specificity0.862944162437


In [6]:
# Model 2 inputs: all three feature columns (x2 is the same array object as
# x, not a copy), and a binary target that is 1 where y >= -1 and -1 otherwise
# (kept as a plain Python list of ints).
x2 = x
y2 = np.where(y >= -1, 1, -1).tolist()

In [7]:
# Model 2: fit an LDA classifier on the three-column feature matrix x2 and
# print its linear decision function, LOOCV accuracy and training metrics.
lda = LinearDiscriminantAnalysis(solver='svd', store_covariance=True, tol=0.01)
lda.fit(x2, y2)
print('Model 2')
print('-Equation:({})*TPSA+({})*logP+({})*(a_acid+a_base)+({})'.format(
    lda.coef_[0][0], lda.coef_[0][1], lda.coef_[0][2], lda.intercept_[0]))

# Predict on the same data the model was fitted on: the metrics below are
# resubstitution (training) metrics, not held-out estimates.
y2_pred = lda.predict(x2)
c = classification_report(y2, y2_pred)
# Pin the label order so the matrix is always 2x2 with index 0 = class -1 and
# index 1 = class 1. Without `labels`, the shape depends on which classes
# happen to occur, and the conf[1, 1] lookups below could raise IndexError.
conf = confusion_matrix(y2, y2_pred, labels=[-1, 1])

# Leave-one-out cross-validation: one fold per sample, score averaged below.
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; on upgrade, switch to sklearn.model_selection
# (LeaveOneOut() takes no size argument there).
loo = cross_validation.LeaveOneOut(len(x2))
score = cross_validation.cross_val_score(lda, x2, y2, cv=loo, n_jobs=-1)
print('-LOOCV accuracy:{}'.format(np.mean(score)))

print('-Classification report:')
print(c)
print('-Confusion matrix:')
print(conf)
# Rows are true classes, columns are predicted classes.
print('-Training accuracy:', end='')
print((conf[0, 0] + conf[1, 1]) / np.sum(conf))
# Sensitivity: fraction of true class-1 samples predicted as class 1.
print('-Training sensitivity:', end='')
print(conf[1, 1] / (conf[1, 0] + conf[1, 1]))
# Specificity: fraction of true class -1 samples predicted as class -1.
print('-Training specificity:', end='')
print(conf[0, 0] / (conf[0, 0] + conf[0, 1]))

Model 2
-Equation:(-0.032634360530268657)*TPSA+(0.2288811941956283)*logP+(-0.5671020565813774)*(a_acid+a_base)+(4.409500092147513)
-LOOCV accuracy:0.8729641693811075
-Classification report:
             precision    recall  f1-score   support

         -1       0.43      0.16      0.24        37
          1       0.89      0.97      0.93       270

avg / total       0.84      0.87      0.85       307

-Confusion matrix:
[[  6  31]
 [  8 262]]
-Training accuracy:0.872964169381
-Training sensitivity:0.97037037037
-Training specificity0.162162162162
