In [1]:
from sklearn import cross_validation
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from itertools import islice
import os

In [2]:
def read_data(file_name):
    """Read a comma-separated file into a list of columns.

    The first line of the file is treated as a header and skipped. Every
    remaining non-empty line is split on ',' and its fields are appended to
    per-column lists, so the result is indexed as ``data[column][row]``.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    list of list
        One list per column. Column 0 is kept as strings; every other column
        is converted to ``float``. An empty list is returned when the file
        has no data rows.

    Raises
    ------
    ValueError
        If a field outside column 0 cannot be converted to ``float``.
    IndexError
        If a row has more fields than the first data row.
    """
    data = []
    with open(file_name) as f:
        # islice(f, 1, None) skips the header line.
        for line in islice(f, 1, None):
            line = line.rstrip(os.linesep)
            if not line:
                # A blank line would otherwise add '' to column 0 only and
                # leave the columns with different lengths.
                continue
            fields = line.split(',')
            if not data:
                # The first data row fixes the number of columns.
                data = [[] for _ in fields]
            for i, value in enumerate(fields):
                data[i].append(value)
    # Column 0 stays textual; all remaining columns are numeric.
    for i in range(1, len(data)):
        data[i] = [float(x) for x in data[i]]
    return data

In [3]:
# Load the merged feature tables for the training and test sets; each result
# is a list of columns as produced by read_data.
train_data, test_data = [
    read_data(csv_path)
    for csv_path in ('../data/train_mergeFeatures.csv',
                     '../data/test_mergeFeatures.csv')
]

In [4]:
# Read one floating-point logBB value per line of the training target file.
# The with-block closes the file on exit, so no explicit close() is needed.
with open('../data/train_logBB.txt') as f:
    logBB = [float(line.rstrip(os.linesep)) for line in f]

In [5]:
def _label_by_cutoff(values, cutoff):
    """Return 1 for every value that is >= cutoff and -1 for all others."""
    return [1 if value >= cutoff else -1 for value in values]


# Two binary labelings of the same logBB values, differing only in the cutoff.
y1 = _label_by_cutoff(logBB, 0.3)
y2 = _label_by_cutoff(logBB, -1)

In [6]:
# Load the test-set class labels, one integer per line. A label of 0 is
# remapped to -1 so that it matches the -1/1 coding used for the training
# labels; any other value is kept unchanged.
test_y = []
with open('../data/test_labels.txt') as f:
    for line in f:
        # Strip the line terminator instead of slicing off the last
        # character: slicing drops a real digit when the final line has no
        # trailing newline.
        label = int(line.rstrip(os.linesep))
        test_y.append(-1 if label == 0 else label)

In [7]:
# Keep only the test samples whose column-0 value does not occur in column 0
# of the training data (column 0 is the string column left unconverted by
# read_data -- presumably a sample identifier; confirm against the CSV
# header). The matching entries of test_y are kept in the same order.
train_ids = set(train_data[0])  # set lookup instead of a list scan per sample
filtered_test_data = [[] for _ in test_data]
filtered_test_y = []
for i, sample_id in enumerate(test_data[0]):
    if sample_id not in train_ids:
        # Copy row i of every column into the filtered table.
        for j, column in enumerate(test_data):
            filtered_test_data[j].append(column[i])
        filtered_test_y.append(test_y[i])

In [8]:
# Model 1: two-feature LDA (one TPSA column, one logP column) fitted against
# the y1 labels (logBB threshold 0.3). Every TPSA/logP column combination is
# evaluated with leave-one-out cross-validation and on the training set.
print('Model Performance:')
print()
print('Model 1 (logBB threshold:0.3):')
print()
# Display names for the train_data column indices tried as TPSA / logP.
tpsa_names = {1: 'TPSA(CDK)', 8: 'TPSA(RDkit)'}
logp_names = {2: 'ALOGP(CDK)', 3: 'XLOGP(CDK)', 4: 'MannholdLogP(CDK)', 9: 'MolLogP(RDkit)'}
# One fold per training sample; the Model 2 cell reuses this splitter.
loo = cross_validation.LeaveOneOut(len(train_data[0]))
train_y = y1
for i in [2,3,4,9]:
    for j in [1, 8]:
        # Rows are samples; columns are (TPSA, logP), matching the equation.
        train_x = np.array([train_data[j], train_data[i]]).transpose()
        lda = LinearDiscriminantAnalysis(solver='svd', store_covariance=True,tol=0.01)
        score = cross_validation.cross_val_score(lda, train_x, train_y, cv=loo, n_jobs=-1)
        lda.fit(train_x, train_y)
        print(('-Equation:({})*TPSA+({})*logP+({})').format(lda.coef_[0][0],lda.coef_[0][1],lda.intercept_[0]))
        train_y_pred = lda.predict(train_x)
        c = classification_report(train_y,train_y_pred)
        conf = confusion_matrix(train_y,train_y_pred)
        print('Features:{},{}'.format(tpsa_names[j], logp_names[i]))
        print('-LOOCV accuracy:{}'.format(np.mean(score)))
        print('-Training classification report:')
        print(c)
        print('-Confusion matrix:')
        print(conf)
        # conf rows are true labels and columns predicted labels, ordered
        # [-1, 1]; so conf[1,1] counts true positives and conf[0,0] true
        # negatives.
        accuracy = (conf[0,0]+conf[1,1])/np.sum(conf)
        sensitivity = conf[1,1]/(conf[1,0]+conf[1,1])
        specificity = conf[0,0]/(conf[0,0]+conf[0,1])
        # print() with sep='' renders the value via str(), exactly as a
        # separate print(value) call would.
        print('-Training accuracy:', accuracy, sep='')
        print('-Training sensitivity:', sensitivity, sep='')
        # The label previously lacked the trailing colon the others have.
        print('-Training specificity:', specificity, sep='')
        print()

Model Performance:

Model 1 (logBB threshold:0.3):

-Equation:(-0.02392937184407768)*TPSA+(0.14482544331588607)*logP+(0.8310859989476257)
Features:TPSA(CDK),ALOGP(CDK)
-LOOCV accuracy:0.7361563517915309
-Training classification report:
             precision    recall  f1-score   support

         -1       0.74      0.90      0.82       197
          1       0.72      0.45      0.55       110

avg / total       0.74      0.74      0.72       307

-Confusion matrix:
[[178  19]
 [ 61  49]]
-Training accuracy:0.739413680782
-Training sensitivity:0.445454545455
-Training specificity0.903553299492

-Equation:(-0.031532377708820156)*TPSA+(0.06665515714173544)*logP+(1.1095412208049686)
Features:TPSA(RDkit),ALOGP(CDK)
-LOOCV accuracy:0.752442996742671
-Training classification report:
             precision    recall  f1-score   support

         -1       0.77      0.92      0.84       197
          1       0.77      0.50      0.61       110

avg / total       0.77      0.77      0.75       307

In [10]:
# Model 2: three-feature LDA (one TPSA column, one logP column, plus
# train_data[7] labelled (n_acid+n_base)) fitted against the y2 labels
# (logBB threshold -1). `loo` is the leave-one-out splitter created in the
# Model 1 cell.
print('-Model 2:')
# Display names for the train_data column indices tried as TPSA / logP.
tpsa_names = {1: 'TPSA(CDK)', 8: 'TPSA(RDkit)'}
logp_names = {2: 'ALOGP(CDK)', 3: 'XLOGP(CDK)', 4: 'MannholdLogP(CDK)', 9: 'MolLogP(RDkit)'}
train_y = y2
for i in [2,3,4,9]:
    for j in [1,8]:
        # Rows are samples; columns are (TPSA, logP, n_acid+n_base), in the
        # same order as the coefficients printed in the equation.
        train_x = np.array([train_data[j], train_data[i], train_data[7]]).transpose()
        lda = LinearDiscriminantAnalysis(solver='svd', store_covariance=True,tol=0.01)
        score = cross_validation.cross_val_score(lda, train_x, train_y, cv=loo, n_jobs=-1)
        lda.fit(train_x, train_y)
        print(('-Equation:({})*TPSA+({})*logP+({})*(n_acid+n_base)+({})')
              .format(lda.coef_[0][0],lda.coef_[0][1],lda.coef_[0][2],lda.intercept_[0]))
        train_y_pred = lda.predict(train_x)
        c = classification_report(train_y,train_y_pred)
        conf = confusion_matrix(train_y,train_y_pred)
        print('Features:{},(n_acid+n_base)(CDK),{}'.format(tpsa_names[j], logp_names[i]))
        print('-LOOCV accuracy:{}'.format(np.mean(score)))
        print('-Training classification report:')
        print(c)
        print('-Confusion matrix:')
        print(conf)
        # conf rows are true labels and columns predicted labels, ordered
        # [-1, 1]; so conf[1,1] counts true positives and conf[0,0] true
        # negatives.
        accuracy = (conf[0,0]+conf[1,1])/np.sum(conf)
        sensitivity = conf[1,1]/(conf[1,0]+conf[1,1])
        specificity = conf[0,0]/(conf[0,0]+conf[0,1])
        # print() with sep='' renders the value via str(), exactly as a
        # separate print(value) call would.
        print('-Training accuracy:', accuracy, sep='')
        print('-Training sensitivity:', sensitivity, sep='')
        # The label previously lacked the trailing colon the others have.
        print('-Training specificity:', specificity, sep='')
        print()

-Model 2:
-Equation:(-0.03362126348720727)*TPSA+(-0.03780629513280001)*logP+(-0.10903190062473503)*(n_acid+n_base)+(4.809788268821979)
Features:TPSA(CDK),(n_acid+n_base)(CDK)ALOGP(CDK)
-LOOCV accuracy:0.8827361563517915
-Training classification report:
             precision    recall  f1-score   support

         -1       0.58      0.19      0.29        37
          1       0.90      0.98      0.94       270

avg / total       0.86      0.89      0.86       307

-Confusion matrix:
[[  7  30]
 [  5 265]]
-Training accuracy:0.885993485342
-Training sensitivity:0.981481481481
-Training specificity0.189189189189

-Equation:(-0.039115221672729426)*TPSA+(-0.13864636221897694)*logP+(-0.22797874418460437)*(n_acid+n_base)+(5.213534086751705)
Features:TPSA(RDkit),(n_acid+n_base)(CDK)ALOGP(CDK)
-LOOCV accuracy:0.8762214983713354
-Training classification report:
             precision    recall  f1-score   support

         -1       0.50      0.16      0.24        37
          1       0.89      0