In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Load the per-author sample vectors used for classification.
# Alternative input directory: ../Data/DataForClassification/d2v/
fileDir = "../Data/DataForClassification/p2v/"
fileList = os.listdir(fileDir)
print(fileList)

# Automatic traversal of every file in the directory (currently disabled):
# for file in fileList:
#     if not file.startswith('.'):
#         if file.endswith(".txt"):
#             file = file[:-4]

# The two class files are named explicitly. Every line of a file becomes one
# sample: the stripped line split on single spaces into a list of string tokens.
author0, author1 = [], []
for samples, fileName in ((author0, "michael wagner0.txt"),
                          (author1, "michael wagner1.txt")):
    with open(fileDir + fileName, 'r', encoding='utf8') as f:
        samples.extend(row.strip().split(" ") for row in f)

# Show the first sample of each class as a sanity check.
print(author0[0])
print(author1[0])

['chung-may yang1.txt', 'wei lu0.txt', 'yong wang1.txt', 'david g lloyd0.txt', 'wei lu1.txt', 'feng liu1.txt', 'david g lloyd1.txt', 'jeong hwan kim1.txt', 'chung-may yang0.txt', 'michael wagner0.txt', 'feng liu0.txt', 'hao song1.txt', 'hao song0.txt', 'kevin m. ryan0.txt', 'michael wagner1.txt', 'lei wang0.txt', 'jeong hwan kim0.txt', 'yong wang0.txt', 'lei wang1.txt', 'kevin m. ryan1.txt']
['22889921', '-0.244403', '-0.0406115', '-0.241506', '-0.0276834', '0.035334', '0.172173', '-0.138247', '0.0907196', '-0.0780261', '0.0947579', '-0.0556994', '-0.092753', '-0.0394446', '0.164712', '0.0333666', '-0.155414', '0.0652004', '0.0845404', '-0.00309468', '0.103676', '0.138019', '0.0117297', '-0.0868801', '0.107426', '0.0734262', '0.0697482', '0.0374372', '0.135657', '0.14281', '0.0381899', '-0.125376', '0.00246952', '0.0992286', '-0.0779264', '0.0226886', '0.101398', '-0.0621663', '-0.18564', '-0.0982089', '0.0417952', '-0.123849', '-0.130739', '-0.00359905', '-0.0186436', '-0.098999', '-0

In [2]:
# Number of samples in each class, one count per line.
print(len(author0), len(author1), sep="\n")

98
141


In [3]:
# Length of the first sample of each class: the feature dimensions plus the
# leading paper id token.
for classSamples in (author0, author1):
    print(len(classSamples[0]))

101
101


In [4]:
# calculate ppv, npv, specificity, sensitivity, and accuracy
def calculate_important_value(tp, tn, fp, fn, sample_length,f1):
    """Print and return summary statistics derived from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : int
        True-positive, true-negative, false-positive and false-negative counts.
    sample_length : int
        Total number of samples; used as the accuracy denominator.
    f1 : float
        Precomputed F1 score. It is only printed, never recomputed here.

    Returns
    -------
    tuple
        ``(ppv, npv, specificity, sensitivity, accuracy)``. A ratio whose
        denominator is zero (e.g. PPV when nothing was predicted positive)
        is reported as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio is reported as nan so one empty class does not
        # abort the whole report.
        return numerator / denominator if denominator != 0 else float('nan')

    # 1. Positive predicted value (PPV, precision) = TP / (TP + FP)
    ppv = _ratio(tp, tp + fp)
    # 2. Negative predicted value (NPV) = TN / (TN + FN)
    npv = _ratio(tn, tn + fn)
    # 3. Specificity (true negative rate) = TN / (TN + FP)
    specificity = _ratio(tn, tn + fp)
    # 4. Sensitivity (recall, true positive rate) = TP / (TP + FN)
    sensitivity = _ratio(tp, tp + fn)
    # 5. Accuracy = (TP + TN) / total number of samples
    accuracy = _ratio(tp + tn, sample_length)
    print('PPV: ', ppv, 'NPV: ', npv, 'Specificity: ', specificity, 'Sensitivity: ', sensitivity)
    print('Accuracy: ', accuracy, 'F1: ', f1)
    return ppv, npv, specificity, sensitivity, accuracy

In [5]:
# Reconstruct the data so that it can be fed to the SVM.
import pandas as pd

# Fixed seed for the shuffle below. The cross-validation later in this file
# uses KFold(shuffle=False), so fold membership is decided entirely by this
# shuffle; without a seed every run reports different metrics.
SHUFFLE_SEED = 42

classOne = pd.DataFrame(author0)
classOne["label"] = 0
#print(classOne[:2:])
classTwo = pd.DataFrame(author1)
classTwo["label"] = 1
#print(classTwo[:2:])
# Combine the samples of both classes into one frame.
combinedData = pd.concat([classOne, classTwo])
print(combinedData[:5])
# Shuffle the rows reproducibly and renumber them 0..n-1.
combinedData = combinedData.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# Column 0 holds the paper id; keep it aside so it is not used as a feature.
paperID = combinedData[0]
# Split features and labels. The feature tokens were read from text files as
# strings, so convert them to floats explicitly rather than relying on the
# estimator to coerce them.
data = combinedData.drop([0,'label'], axis=1).astype(float)
label = combinedData['label']

          0          1            2          3            4           5  \
0  22889921  -0.244403   -0.0406115  -0.241506   -0.0276834    0.035334   
1  23585882  -0.283335   -0.0669752  -0.196079  -0.00280898    -0.15149   
2  23604333  -0.241933  -0.00540449  -0.255078    -0.140854  -0.0892556   
3  23137390  -0.213667   -0.0366257  -0.206121   -0.0531313   -0.139354   
4  22913370  -0.212666   -0.0438302  -0.215563   0.00510708   -0.133076   

          6           7          8           9  ...           92          93  \
0  0.172173   -0.138247  0.0907196  -0.0780261  ...   -0.0910969  -0.0672932   
1  0.130044  -0.0502186  0.0763087  -0.0267538  ...   -0.0553114  -0.0596153   
2  0.102551     -0.0739  0.0868043   -0.110511  ...    -0.135502  -0.0438898   
3  0.133103  -0.0861887  0.0295243  -0.0718318  ...    0.0203134   -0.126046   
4  0.134719    -0.15929    0.04278   -0.121276  ...    -0.069607  -0.0631248   

            94          95           96         97           98     

In [6]:
# cross validation
def k_fold_cv(data, label, classifier, clfname):
    """Run 10-fold cross-validation and print pooled confusion-matrix statistics.

    For every fold the classifier is refit on the training part, the
    predictions, true labels and paper ids of the mislabeled test samples are
    printed, and the fold's confusion-matrix counts are collected. After the
    last fold the summed counts and the mean per-fold F1 are passed to
    ``calculate_important_value``, which prints the final metrics.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature rows; split positionally with ``iloc``.
    label : pandas.Series
        Binary labels (0 / 1), positionally aligned with ``data``.
    classifier : estimator
        Object exposing ``fit`` and ``predict``. It is refit in place on every
        fold, so after this call it holds the fit of the last fold only.
    clfname : str
        Name printed in the summary line.

    Returns
    -------
    None
        All results are printed.

    Notes
    -----
    Reads the module-level ``paperID`` to name mislabeled samples; it must be
    positionally aligned with ``data``.
    """
    kf = KFold(n_splits=10, shuffle=False)
    # per-fold statistics
    tp = []
    fp = []
    tn = []
    fn = []
    roundf1 = []
    # Positional view of the paper ids: KFold yields positions, so the lookup
    # must not depend on the index labels of the Series.
    paper_ids = np.asarray(paperID)
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data.iloc[train_index], data.iloc[test_index]
        label_train, test_true_label = label.iloc[train_index], label.iloc[test_index]
        # fit the classifier on this fold's training part
        classifier.fit(data_train, label_train)
        # get predicted labels for the held-out part
        label_pred = classifier.predict(data_test)
        # report which samples were misclassified
        print("Pred: ",label_pred)
        print("True: ", test_true_label.values.tolist())
        # Compare prediction and truth position by position.
        mislabeled = np.asarray(label_pred) != test_true_label.values
        print("Mislabeled sample: "
              + "".join(str(pid) + "," for pid in paper_ids[test_index[mislabeled]]))
        # Fold confusion matrix. Passing labels explicitly keeps it 2x2 even
        # when a fold contains (or predicts) only one of the two classes.
        round_tn, round_fp, round_fn, round_tp = metrics.confusion_matrix(
            test_true_label, label_pred, labels=[0, 1]).ravel()
        # collect the fold counts
        tp.append(round_tp)
        fp.append(round_fp)
        fn.append(round_fn)
        tn.append(round_tn)
        # NOTE(review): for a single-label binary problem micro-averaged F1 is
        # the same number as accuracy; confirm whether average='binary' was
        # intended here.
        roundf1.append(f1_score(test_true_label, label_pred,average='micro'))

    print("Classifier: {name}\nTrue positive: {tp}, False positive: {fp}, False negative: {fn}, True negative: {tn}"
          .format(name=clfname, tp=np.sum(tp), fp=np.sum(fp), fn=np.sum(fn), tn=np.sum(tn)))
    # mean of the per-fold F1 scores
    f1 = np.average(roundf1)
    ppv, npv, specificity, sensitivity, accuracy = calculate_important_value(np.sum(tp), np.sum(tn),
                                                                             np.sum(fp), np.sum(fn), len(data),f1)
    # return ppv, npv, specificity, sensitivity, accuracy, f1

In [11]:
# Two SVM variants are evaluated with the same 10-fold procedure:
#   * a linear kernel with balanced class weights and probability estimates on
#   * an RBF kernel with C=10 (C scales the error term added to the objective
#     being minimised, so a larger C makes errors matter more)
linear_svc = svm.SVC(kernel='linear', class_weight='balanced', probability=True)
rbf_svc = svm.SVC(kernel='rbf', C=10)

for clf, clfLabel in ((linear_svc, "SVM linear"), (rbf_svc, "SVM rbf")):
    print(clf)
    # fit the model and do 10-fold cv
    k_fold_cv(data, label, clf, clfLabel)
    # k_fold_cv refits the same object on every fold, so this count describes
    # the fit of the last fold only.
    print("Number of support vectors: ",len(clf.support_vectors_))



SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Pred:  [1 1 0 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 1 0 1]
True:  [1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1]
Mislabeled sample: 
Pred:  [1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1]
True:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]
Mislabeled sample: 
Pred:  [0 0 1 0 0 0 1 1 1 1 0 1 1 0 1 1 0 0 1 0 0 0 1 0]
True:  [0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0]
Mislabeled sample: 
Pred:  [1 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 1]
True:  [1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1]
Mislabeled sample: 
Pred:  [1 0 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 1 0 1 0 0 1 1]
True:  [1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1]
Mi