In [3]:
import numpy as np
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

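# collect data: load the feature table into `data` and the target labels into `label`
# (pandas objects, since k_fold_cv indexes them with .iloc) before running the cells below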


In [5]:
# cross validation
def k_fold_cv(data, label, classifier, dataname, n_splits=10, random_state=None):
    """Run shuffled k-fold cross-validation for a binary classifier and report pooled metrics.

    The classifier is re-fitted on every training split; the per-fold confusion
    matrices are summed and the pooled counts are handed to
    ``calculate_important_value``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    label : pandas.Series or pandas.DataFrame
        Binary target values aligned row-for-row with ``data``.
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    dataname : str
        Name printed in the report header.
    n_splits : int, default 10
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffle. Pass an integer to make the split (and
        therefore the reported metrics) reproducible between runs.

    Returns
    -------
    tuple
        ``(ppv, npv, specificity, sensitivity, accuracy)`` as returned by
        ``calculate_important_value``.

    Raises
    ------
    ValueError
        If ``label`` does not contain exactly two distinct classes.
    """
    # Fix the class order once, from the full label set. Without an explicit
    # `labels=` a fold whose test split (and predictions) hold a single class
    # yields a 1x1 matrix and the 4-way unpacking below would fail.
    classes = np.unique(label)
    if len(classes) != 2:
        raise ValueError(
            "k_fold_cv expects exactly two classes, found {n}".format(n=len(classes))
        )

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    pooled = np.zeros((2, 2), dtype=np.int64)
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data.iloc[train_index], data.iloc[test_index]
        label_train, test_true_label = label.iloc[train_index], label.iloc[test_index]
        # fit on the training split, then predict the held-out split
        classifier.fit(data_train, label_train)
        label_pred = classifier.predict(data_test)
        # accumulate this fold's confusion matrix into the pooled counts
        pooled += metrics.confusion_matrix(test_true_label, label_pred, labels=classes)

    # Rows are true classes and columns are predicted classes, both in sorted
    # order, so the larger label value is the one counted as "positive".
    tn, fp, fn, tp = pooled.ravel()

    print("Dataset: {name}\nTrue positive: {tp}, False positive: {fp}, False negative: {fn}, True negative: {tn}"
          .format(name=dataname, tp=tp, fp=fp, fn=fn, tn=tn))
    return calculate_important_value(tp, tn, fp, fn, len(data))

In [6]:
# calculate ppv,npv,specificity,sensitivity, and accuracy
def calculate_important_value(tp, tn, fp, fn, sample_length):
    """Compute and print PPV, NPV, specificity, sensitivity and accuracy.

    Parameters
    ----------
    tp, tn, fp, fn : int
        True-positive, true-negative, false-positive and false-negative counts.
    sample_length : int
        Total number of samples, used as the accuracy denominator.

    Returns
    -------
    tuple
        ``(ppv, npv, specificity, sensitivity, accuracy)``. A metric whose
        denominator is zero is undefined and is returned as ``nan``.
    """
    def ratio(numerator, denominator):
        # An empty denominator means the metric is undefined; return NaN
        # rather than raising ZeroDivisionError on plain Python ints.
        return numerator / denominator if denominator else float("nan")

    # 1. Positive predictive value (precision) = TP / (TP + FP)
    ppv = ratio(tp, tp + fp)
    # 2. Negative predictive value = TN / (TN + FN)
    npv = ratio(tn, tn + fn)
    # 3. Specificity (true negative rate) = TN / (TN + FP)
    specificity = ratio(tn, tn + fp)
    # 4. Sensitivity (recall, true positive rate) = TP / (TP + FN)
    sensitivity = ratio(tp, tp + fn)
    # 5. Accuracy = (TP + TN) / total number of samples
    accuracy = ratio(tp + tn, sample_length)

    print('PPV: ', ppv)
    print('NPV: ', npv)
    print('Specificity: ', specificity)
    print('Sensitivity: ', sensitivity)
    print('Accuracy: ', accuracy)
    return ppv, npv, specificity, sensitivity, accuracy

In [None]:
# Build a support vector classifier with an RBF kernel and echo its settings.
rbf_svc = svm.SVC(kernel="rbf")
print(rbf_svc)
# Evaluate it with k_fold_cv (10-fold by default).
# NOTE(review): `data` and `label` are not defined in any visible cell — confirm
# the data-loading step runs before this cell on a fresh kernel.
k_fold_cv(data, label, rbf_svc, "Same name author")
