In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import sklearn.metrics as metrics
from glob import glob

In [2]:
rootDirData = 'data/'
rootDirSaveReper = 'Res/SVM/drugReper/'
rootDirSaveLabel = 'Res/SVM/label/'
rootDirSaveMes = 'Res/SVM/mes/'
# rootDir = ''
files = {
    'intractionMatrix':'intraction_matrix.csv',
    'virusSim':'virus_sim.csv',
    'drugSim':'drug_sim.csv',
}

In [3]:
intractionMatrix = pd.read_csv(rootDirData + files['intractionMatrix'], delimiter = ',', header=None, encoding='cp1252').to_numpy()
drugNames = intractionMatrix[0, 1:]
virusNames = intractionMatrix[1:, 0]
drugNames.shape, virusNames.shape
intractionMatrix = intractionMatrix[1:, 1:]

In [4]:
virusSim = pd.read_csv(rootDirData + files['virusSim'], delimiter = ',', header=None, encoding='cp1252').to_numpy()
virusSim = virusSim[1:, 1:]
virusSim = virusSim.astype(float)
virusSim.shape

(100, 100)

In [5]:
drugSim = pd.read_csv(rootDirData + files['drugSim'], delimiter = ',', header=None, encoding='cp1252').to_numpy()
drugSim = drugSim[1:, 1:]
drugSim = drugSim.astype(float)
drugSim.shape

(198, 198)

In [6]:
samples = []
for i in range(len(drugSim)):
    samples.append(np.concatenate(
            (
                virusSim[-1], drugSim[i]
            )
        )
    )
samples = np.array(samples)
samples.shape

(198, 298)

In [7]:
drugNamesSet = {}
for dn in drugNames:
    drugNamesSet[dn] = []

Y = []
X = []
Z = []
for i in range(len(virusSim)-2):
    for j in range(len(drugSim)):
        Y.append(
            intractionMatrix[i, j]
        )
        
        Z.append(
            [
                virusNames[i], drugNames[j]
            ]
        )
        
        X.append(
            np.concatenate(
                (
                    virusSim[i], drugSim[j]
                )
            )
        )

X = np.array(X).astype(float)
Y = np.array(Y).astype(float)
Z = np.array(Z)
rndIndex = np.random.choice(len(X), len(X), replace = False)
X = X[rndIndex]
Y = Y[rndIndex]
Z = Z[rndIndex]
X.shape, Y.shape, Z.shape

((19404, 298), (19404,), (19404, 2))

In [8]:
def calcMes(y_true, y_pred, fixedThreshold):
    optimal_threshold = 0
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred)
    auc = metrics.auc(fpr, tpr)

    precision, recall, thresholds = metrics.precision_recall_curve(y_true, y_pred)
    aupr = metrics.auc(recall, precision)
    y_predTemp = np.zeros(
        (
            len(y_pred),
        )
    )
    
    


    if fixedThreshold:
        y_predTemp[np.where(y_pred >= 0.5)] = 1
    else:
        # gmeans = np.sqrt(tpr * (1-fpr))
        # ix = np.argmax(gmeans)
        # gmeansMax = gmeans[ix]
        # gmeansTR = thresholds[ix]
        # print("gmeansTR:", gmeansTR, gmeansMax, ix)
        f1_scores = (2 * precision * recall) / (precision + recall)
        optimal_threshold = thresholds[np.argmax(f1_scores)]
        y_predTemp[np.where(y_pred >= optimal_threshold)] = 1

    f1 = metrics.f1_score(y_true, y_predTemp)
    pre = metrics.precision_score(y_true, y_predTemp)
    rec = metrics.recall_score(y_true, y_predTemp)

    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_predTemp).ravel()
    specificity = tn / (tn+fp)
    mcc = metrics.matthews_corrcoef(y_true, y_predTemp)
    acc = metrics.accuracy_score(y_true, y_predTemp)

    mesList = [
            specificity, rec, pre, acc, f1, mcc, auc, aupr
    ]

    return mesList, optimal_threshold
    

In [9]:
mesListFixed = [
    ["Set", "Fold", "Specificity", "Recall", "Precision", "Accuracy", "F1", "MCC", "AUC", "AUPR"]
]
mesListFloating = [
    ["Set", "Fold", "Specificity", "Recall", "Precision", "Accuracy", "F1", "MCC", "AUC", "AUPR", "TR"]
]

for SeT in range(10):
    rndIndex = np.random.choice(len(X), len(X), replace = False)
    X = X[rndIndex]
    Y = Y[rndIndex]
    kf = KFold(n_splits=5)
    foldCounter = 1
    for train_index, test_index in kf.split(X):
        print("Set-Fold: ", SeT, foldCounter)
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        clf = SVC(kernel = 'rbf', probability=True)
        clf.fit(x_train,y_train)
        y_pred = clf.predict_proba(x_test)
        y_pred = y_pred[:,1]
        # print(y_pred)

        np.savetxt(rootDirSaveLabel + str(SeT) + '_' + str(foldCounter) + '_gt.csv', y_test, delimiter=',')
        np.savetxt(rootDirSaveLabel + str(SeT) + '_' + str(foldCounter) + '_pred.csv', y_pred, delimiter=',')

        res = clf.predict_proba(samples)
        res = res[:,1]
        mesFixed, _ = calcMes(y_test, y_pred, True)
        mesFixed.insert(0, SeT)
        mesFixed.insert(1, foldCounter)
        mesListFixed.append(
            mesFixed
        )
        np.savetxt(rootDirSaveMes + 'fixed.csv', mesListFixed, delimiter=',', fmt='%s')
        
        mesFloating, optimalTR = calcMes(y_test, y_pred, False)
        mesFloating.insert(0, SeT)
        mesFloating.insert(1, foldCounter)
        mesFloating.append(optimalTR)
        mesListFloating.append(
            mesFloating
        )
        np.savetxt(rootDirSaveMes + 'floating.csv', mesListFloating, delimiter=',', fmt='%s')

        tempDrugList = []
        for i in range(len(res)):
            if res[i] > sum(res) / len(res):
                tempDrugList.append(
                    [
                        drugNames[i], res[i]
                    ]
                )

        # res = res[:,0]
        if len(tempDrugList) > 0:
            tempDrugList = np.array(tempDrugList)
            tempDrugList = tempDrugList[tempDrugList[:, 1].argsort()[::-1]]
            for dns in range(len(tempDrugList)):
                tempDN = tempDrugList[dns][0]
                tempDNS = [SeT, foldCounter, dns, tempDrugList[dns][1]]
    
                drugNamesSet[tempDN].append(tempDNS)
    
    
            for key in drugNamesSet.keys():
                if len(drugNamesSet[key]) > 0:
                    np.savetxt(rootDirSaveReper + key + '.csv', drugNamesSet[key], delimiter=',', fmt='%s')

        # np.savetxt(rootDirSaveReper + str(foldCounter) + '.csv', tempDrugList, delimiter=',', fmt='%s')

        foldCounter += 1
        

Set-Fold:  0 1
Set-Fold:  0 2
Set-Fold:  0 3
Set-Fold:  0 4


  _warn_prf(average, modifier, msg_start, len(result))


Set-Fold:  0 5
Set-Fold:  1 1
Set-Fold:  1 2
Set-Fold:  1 3
Set-Fold:  1 4


  f1_scores = (2 * precision * recall) / (precision + recall)


Set-Fold:  1 5
Set-Fold:  2 1
Set-Fold:  2 2
Set-Fold:  2 3
Set-Fold:  2 4
Set-Fold:  2 5
Set-Fold:  3 1
Set-Fold:  3 2
Set-Fold:  3 3
Set-Fold:  3 4
Set-Fold:  3 5


  _warn_prf(average, modifier, msg_start, len(result))
  f1_scores = (2 * precision * recall) / (precision + recall)


Set-Fold:  4 1


  f1_scores = (2 * precision * recall) / (precision + recall)


Set-Fold:  4 2
Set-Fold:  4 3
Set-Fold:  4 4
Set-Fold:  4 5
Set-Fold:  5 1
Set-Fold:  5 2
Set-Fold:  5 3
Set-Fold:  5 4
Set-Fold:  5 5
Set-Fold:  6 1
Set-Fold:  6 2
Set-Fold:  6 3


  f1_scores = (2 * precision * recall) / (precision + recall)


Set-Fold:  6 4
Set-Fold:  6 5
Set-Fold:  7 1
Set-Fold:  7 2
Set-Fold:  7 3


  _warn_prf(average, modifier, msg_start, len(result))
  f1_scores = (2 * precision * recall) / (precision + recall)


Set-Fold:  7 4
Set-Fold:  7 5
Set-Fold:  8 1
Set-Fold:  8 2
Set-Fold:  8 3
Set-Fold:  8 4
Set-Fold:  8 5
Set-Fold:  9 1
Set-Fold:  9 2


  _warn_prf(average, modifier, msg_start, len(result))
  f1_scores = (2 * precision * recall) / (precision + recall)


Set-Fold:  9 3
Set-Fold:  9 4
Set-Fold:  9 5


In [10]:
filenames = []
for filename in glob(rootDirSaveReper + '*.csv', recursive=True):
    filenames.append(filename) 
# filenames = glob2.glob("/*.csv")
preDrugs = []
for i in range(len(filenames)):
    preDrugs.append(filenames[i].split('\\')[1].split('.')[0])
meanScore = []
for i in range(len(filenames)):
    sumL = 0
    lenL = 0
    index = pd.read_csv(filenames[i], delimiter = ',', header = None).to_numpy()[:, :3]
    for counter in range(len(index)):
        # if index[counter][0] != 29:
        sumL += index[counter][2]
        lenL += 1
    avg = sumL / lenL
    meanScore.append(
        [
            preDrugs[i],
            avg
        ]
    )

np.savetxt(rootDirSaveMes + 'meanScoreDrug.csv', meanScore, delimiter=',', fmt='%s')