In [10]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# collect data
# ../Data/DataForClassification/d2v/
fileDir = "../Data/DataForClassification/d2v/"
fileList = os.listdir(fileDir)
print(fileList)


def _read_author_vectors(path):
    """Read one tab-separated vector file and return its rows of feature tokens.

    Each non-blank line is split on tabs; the second field is split on single
    spaces and kept as a list of strings (no numeric conversion happens here).

    Parameters
    ----------
    path : str
        Path of the text file to read (opened as UTF-8).

    Returns
    -------
    list[list[str]]
        One list of string tokens per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a non-blank line has no tab-separated second field.
    """
    vectors = []
    with open(path, 'r', encoding='utf8') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no vector; skip it
                # instead of failing on the missing second field.
                continue
            read_data = stripped.split("\t")
            if len(read_data) < 2:
                raise ValueError(
                    "{path}:{n}: expected '<id>\\t<vector>' but found no tab-separated vector"
                    .format(path=path, n=line_number))
            vectors.append(read_data[1].split(" "))
    return vectors


# The files are still selected by hand, one per class; walking fileList
# automatically is not implemented yet.
author0 = _read_author_vectors(os.path.join(fileDir, "chung-may yang0.txt"))
author1 = _read_author_vectors(os.path.join(fileDir, "chung-may yang1.txt"))
print(author0[0])
print(author1[0])

['chung-may yang1.txt', 'wei lu0.txt', 'yong wang1.txt', 'david g lloyd0.txt', 'wei lu1.txt', 'feng liu1.txt', 'david g lloyd1.txt', 'jeong hwan kim1.txt', 'chung-may yang0.txt', 'michael wagner0.txt', 'feng liu0.txt', 'hao song1.txt', 'hao song0.txt', 'kevin m. ryan0.txt', 'michael wagner1.txt', 'lei wang0.txt', 'jeong hwan kim0.txt', 'yong wang0.txt', 'lei wang1.txt', 'kevin m. ryan1.txt']
['-0.29111460', '-0.57349956', '0.35524827', '0.12974840', '0.30130574', '-0.25134709', '0.21317476', '-0.74120957', '-0.05038653', '-0.44930771', '-0.02827097', '0.35546294', '0.11417462', '0.11215554', '0.00902790', '-0.20662700', '-0.40714705', '0.18991761', '0.18334278', '0.49447197', '0.21461654', '-0.34866148', '-0.12432522', '0.58297175', '0.19949827', '0.37721580', '-0.28323400', '-0.53867114', '0.16258357', '-0.05756472', '0.09064528', '0.18177433', '0.23114556', '-0.22056936', '-0.03383492', '0.22713418', '0.62852019', '0.14818428', '-0.09277134', '0.22546473', '-0.00073958', '0.28379443'

In [11]:
# number of samples collected for each class, one count per line
print(len(author0), len(author1), sep="\n")

42
71


In [12]:
# feature count (dimension) of one sample vector from each class
for sample_vector in (author0[0], author1[1]):
    print(len(sample_vector))

100
100


In [13]:
# reconstruct data so that we can feed it to a classifier
import pandas as pd

# Fixed seed so the shuffled row order is the same on every run.
SHUFFLE_SEED = 42

# The loaded vectors are lists of string tokens; convert them to floats
# explicitly rather than relying on the estimator to coerce object columns.
classOne = pd.DataFrame(author0).astype(float)
classOne["label"] = 0
classTwo = pd.DataFrame(author1).astype(float)
classTwo["label"] = 1
# combine data from different class get all data
combinedData = pd.concat([classOne, classTwo])
# Preview taken before shuffling, so these rows all come from classOne.
print(combinedData[:5])
combinedData = combinedData.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
# split data and label
data = combinedData.drop('label', axis=1)
label = combinedData['label']

             0            1            2            3            4  \
0  -0.29111460  -0.57349956   0.35524827   0.12974840   0.30130574   
1   0.92845869  -0.93609655   0.31517544  -0.40926400   0.12442853   
2   0.56281191  -0.17059737   0.21973124   0.56775701   0.17825009   
3   0.02262557   0.13886765   0.08385008   0.44914705   0.25022486   
4   0.22692077  -0.18510655  -0.13579145   0.54744142  -0.32509586   

             5            6            7            8            9  ...   \
0  -0.25134709   0.21317476  -0.74120957  -0.05038653  -0.44930771  ...    
1  -1.39242876  -1.08160102   0.23345214  -0.15230766   0.68835104  ...    
2  -0.34945986  -0.13540348  -0.08924282   0.51040816   0.10645398  ...    
3  -0.22274359  -0.12428337  -0.28856090   0.31007025   0.31714562  ...    
4  -0.57200897   0.13991199  -0.08472289   0.28843415   0.14538938  ...    

           91           92           93          94           95           96  \
0  0.54225159  -0.09773158   0.52069086  

In [14]:
# cross validation
def k_fold_cv(data, label, classifier, clfname, n_splits=10, random_state=None, f1_average='micro'):
    """Run shuffled k-fold cross validation and report pooled binary metrics.

    For every fold the classifier is fitted on the training rows and used to
    predict the test rows. The per-fold confusion-matrix counts are summed
    over all folds and passed to ``calculate_important_value``; the F1 score
    is computed per fold and averaged.

    Parameters
    ----------
    data : object supporting ``.iloc`` row selection (e.g. a DataFrame)
        Feature rows.
    label : object supporting ``.iloc`` selection (e.g. a Series)
        Class labels aligned with ``data``. The confusion matrix is built
        for the labels 0 (negative) and 1 (positive).
    classifier : estimator with ``fit`` and ``predict``
        Re-fitted on every fold.
    clfname : str
        Name printed in the summary line.
    n_splits : int, default 10
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffle; ``None`` keeps the original unseeded
        behaviour.
    f1_average : str, default 'micro'
        ``average`` argument forwarded to ``f1_score``. NOTE(review): for
        single-label problems the micro average equals plain accuracy; pass
        ``'binary'`` for the F1 of the positive class.

    Returns
    -------
    tuple
        ``(ppv, npv, specificity, sensitivity, accuracy, f1)``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # create lists to collect statistic
    tp = []
    fp = []
    tn = []
    fn = []
    roundf1 = []
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data.iloc[train_index], data.iloc[test_index]
        label_train, test_true_label = label.iloc[train_index], label.iloc[test_index]
        # fit data to the classifier
        classifier.fit(data_train, label_train)
        # get predicted label
        label_pred = classifier.predict(data_test)
        # Pin the label order so the matrix is always 2x2: a small test fold
        # may contain (and be predicted as) a single class, and without
        # ``labels`` the matrix would then be 1x1 and the unpacking would fail.
        round_tn, round_fp, round_fn, round_tp = metrics.confusion_matrix(
            test_true_label, label_pred, labels=[0, 1]).ravel()
        # add round counts to the collectors
        tp.append(round_tp)
        fp.append(round_fp)
        fn.append(round_fn)
        tn.append(round_tn)
        roundf1.append(f1_score(test_true_label, label_pred, average=f1_average))

    # Pool the counts over all folds once and reuse them below.
    total_tp, total_fp, total_fn, total_tn = np.sum(tp), np.sum(fp), np.sum(fn), np.sum(tn)
    print("Classifier: {name}\nTrue positive: {tp}, False positive: {fp}, False negative: {fn}, True negative: {tn}"
          .format(name=clfname, tp=total_tp, fp=total_fp, fn=total_fn, tn=total_tn))
    f1 = np.average(roundf1)
    ppv, npv, specificity, sensitivity, accuracy = calculate_important_value(total_tp, total_tn,
                                                                             total_fp, total_fn, len(data), f1)
    return ppv, npv, specificity, sensitivity, accuracy, f1

In [15]:
# calculate ppv,npv,specificity,sensitivity, and accuracy
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def calculate_important_value(tp, tn, fp, fn, sample_length, f1):
    """Derive and print summary metrics from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : int
        True-positive, true-negative, false-positive and false-negative counts.
    sample_length : int
        Total number of samples, used as the accuracy denominator.
    f1 : float
        Pre-computed F1 score; it is only printed, not returned.

    Returns
    -------
    tuple
        ``(ppv, npv, specificity, sensitivity, accuracy)``. A metric whose
        denominator is zero (e.g. PPV when nothing was predicted positive)
        is returned as NaN instead of raising ``ZeroDivisionError``.
    """
    # 1. Positive predicted value (PPV) or precision = True positive / (True positive + False positive)
    ppv = _safe_ratio(tp, tp + fp)
    # 2. Negative predicted value (NPV) = True negative / (True negative + False negative)
    npv = _safe_ratio(tn, tn + fn)
    # 3. Specificity (true negative rate) = True negative / (True negative + False positive)
    specificity = _safe_ratio(tn, tn + fp)
    # 4. Sensitivity (recall, true positive rate) = True positive / (True positive + False negative)
    sensitivity = _safe_ratio(tp, tp + fn)
    # 5. Accuracy = (True positive + True negative) / Total number of sample
    accuracy = _safe_ratio(tp + tn, sample_length)
    print('PPV: ', ppv)
    print('NPV: ', npv)
    print('Specificity: ', specificity)
    print('Sensitivity: ', sensitivity)
    print('Accuracy: ', accuracy)
    print('F1: ', f1)
    return ppv, npv, specificity, sensitivity, accuracy

In [16]:
# logistic regression with a large C value
logistic = linear_model.LogisticRegression(C=1e5)
print(logistic)
# evaluate the model with k_fold_cv; the returned metric tuple is the cell output
k_fold_cv(data=data, label=label, classifier=logistic, clfname="logistic")

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
Classifier: logistic
True positive: 44, False positive: 21, False negative: 27, True negative: 21
PPV:  0.676923076923077
NPV:  0.4375
Specificity:  0.5
Sensitivity:  0.6197183098591549
Accuracy:  0.5752212389380531
F1:  0.5742424242424242


(0.676923076923077,
 0.4375,
 0.5,
 0.6197183098591549,
 0.5752212389380531,
 0.5742424242424242)