In [None]:
from sklearn.externals import joblib
# Load the fitted TF-IDF vectorizer from disk.
# Fixed: the directory used to be spelled "..Data/" (missing path separator);
# every other path in this notebook is rooted at "../Data/<Dataset>/".
# NOTE: joblib.load unpickles the file, so only load model files you trust.
loadDir = "../Data/"+Dataset+"/models/tfidf/"
tfidf_vectorizer = joblib.load(loadDir+'TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, norm=None,min_df=3)'+'.pkl')
# Vectorize the text content of every paper; row i of tfidf_matrix is built
# from allpaperCollection[i].content.
allContent = [paper.content for paper in allpaperCollection]
print(allContent[:2])
tfidf_matrix = tfidf_vectorizer.transform(allContent)
# Tell each paper which matrix row holds its vector (later cells read this
# back through paper.vec_index -- presumably set by add_vec_location; verify
# against the paper class).
for row_number, paper in enumerate(allpaperCollection):
    paper.add_vec_location(row_number)
print(tfidf_matrix[:2].toarray())
print(tfidf_matrix.shape)

In [None]:
# Sanity check: show the paper id and the TF-IDF row index stored on the
# last paper of the collection.
last_paper = allpaperCollection[-1]
print(last_paper.pid)
print(last_paper.vec_index)

In [None]:
import random
def extractNegativeSample(positiveSample, allSample):
    """Return the items of ``allSample`` that do not occur in ``positiveSample``.

    The order of ``allSample`` is preserved and duplicates in it are kept.
    The size of the result is printed as a side effect.

    Args:
        positiveSample: iterable of ids belonging to the positive class.
        allSample: iterable of all candidate ids.

    Returns:
        list: the negative sample, in ``allSample`` order.
    """
    positives = list(positiveSample)
    candidates = list(allSample)
    try:
        # Set membership is O(1); the original list scan made this O(n*m).
        positive_lookup = set(positives)
        negativeSample = [x for x in candidates if x not in positive_lookup]
    except TypeError:
        # Unhashable items cannot live in a set: fall back to the linear scan.
        negativeSample = [x for x in candidates if x not in positives]
    print("Total Negative sample size:", len(negativeSample))
    return negativeSample


In [None]:
# collect class vectors from tf-idf matrix
import pandas as pd
import numpy as np

def _rowsForPids(pids, index_by_pid):
    """Build one ``[pid, tf-idf features...]`` row for every pid with a known vector index."""
    rows = []
    for pid in pids:
        vec_index = index_by_pid.get(pid, -1)
        if vec_index == -1:
            # Fixed: the original only printed the error and then indexed
            # tfidf_matrix[-1], silently using the LAST paper's vector as
            # this pid's features. Unknown pids are now skipped.
            print("Error, not get vector index for pid:", pid)
            continue
        # Prepend the paper id as column 0 so it travels with the feature row.
        rows.extend(np.insert(tfidf_matrix[vec_index].toarray(), 0, pid, axis=1))
    return rows


def extractVectors(author_pids, NegativeSample_pid, allpaperCollection):
    """Collect the TF-IDF rows of the positive and negative samples.

    Reads the notebook-level ``tfidf_matrix``; each paper's row is found
    through ``paper.vec_index``.

    Args:
        author_pids: paper ids of the target author (class one, label 0).
        NegativeSample_pid: paper ids of the other authors (class two, label 1).
        allpaperCollection: papers exposing ``pid`` and ``vec_index``.

    Returns:
        tuple: ``(classOne, classTwo)`` DataFrames. Column 0 holds the paper
        id, the following columns hold the TF-IDF features and the last
        column ``"label"`` holds the class label. Pids that are not present
        in ``allpaperCollection`` are reported and left out.
    """
    # One pass over the collection instead of a full scan per pid. If a pid
    # occurs more than once the last paper wins, as in the original loop.
    index_by_pid = {paper.pid: paper.vec_index for paper in allpaperCollection}

    # extract class one vectors
    author_features = _rowsForPids(author_pids, index_by_pid)
    print("Positive sample size: ", len(author_features))
    classOne = pd.DataFrame(author_features)
    classOne["label"] = 0

    # extract class two vectors
    other_features = _rowsForPids(NegativeSample_pid, index_by_pid)
    print("Negative sample size: ", len(other_features))
    classTwo = pd.DataFrame(other_features)
    classTwo["label"] = 1
    return classOne, classTwo


In [None]:
# combine data from different class get all data
def combineClassesData(classOne,classTwo, random_state=None):
    """Merge both class frames, shuffle the rows and split features from labels.

    Args:
        classOne: DataFrame with the paper id in column 0, feature columns
            and a ``"label"`` column.
        classTwo: DataFrame with the same layout for the other class.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` keeps the original
            unseeded behaviour.

    Returns:
        tuple: ``(data, label, paperID)`` where ``data`` holds only the
        feature columns, ``label`` is the ``"label"`` column and ``paperID``
        is a list of ints. All three share the same shuffled row order.
    """
    combinedData = pd.concat([classOne, classTwo])
    # Shuffle so the classes are interleaved, then renumber the rows 0..n-1
    # so positional and label-based indexing agree downstream.
    combinedData = combinedData.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # take the paper id out
    paperID = [int(i) for i in combinedData[0]]
    # split data and label
    data = combinedData.drop([0,'label'], axis=1)
    label = combinedData['label']
    print("Total sample size and shape: ",data.shape)
    return data, label, paperID


In [None]:
# Principal Component Analysis (PCA) applied to this data identifies the combination of attributes
# (principal components, or directions in the feature space) that account for the most variance in the data.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
def visualizeWithPCA(plotSavingPath,name, data, label):
    """Project ``data`` onto its first two principal components, plot and save it.

    Args:
        plotSavingPath: path prefix the PNG is written under (concatenated
            as-is, so it should end with a path separator).
        name: base name of the output file; ``"_PCA.png"`` is appended.
        data: feature table, one row per sample.
        label: pandas Series of class labels sharing ``data``'s 0..n-1 index;
            0 is drawn as 'Positive sample', 1 as 'Other'.
    """
    visualize_setting = "PCA"
    pca = PCA(n_components=2)
    pca_transformed = pd.DataFrame(pca.fit_transform(X=data, y=label))
    pca_transformed["label"] = label

    fig, ax = plt.subplots()
    ax.scatter(pca_transformed[label==0][0], pca_transformed[label==0][1], label='Positive sample', c='red')
    ax.scatter(pca_transformed[label==1][0], pca_transformed[label==1][1], label='Other', c='blue')
    ax.set_title(name+" ("+visualize_setting+")")
    ax.set_xlabel("component 1")
    ax.set_ylabel("component 2")
    ax.legend()
    # Fixed: save BEFORE show. plt.show() releases the current figure, so the
    # original savefig call afterwards wrote an empty image. The path is also
    # passed as str now; savefig documents str / path-like / file objects,
    # not bytes.
    fig.savefig(plotSavingPath+name+"_"+visualize_setting+".png")
    plt.show()
    # Release the figure: this function is called once per author in a loop.
    plt.close(fig)


In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import Normalizer
def visualizeWithTSNE(plotSavingPath,name, data, label):
    """Embed ``data`` in two dimensions with t-SNE, plot and save it.

    Args:
        plotSavingPath: path prefix the PNG is written under (concatenated
            as-is, so it should end with a path separator).
        name: base name of the output file; ``"_TSNE.png"`` is appended.
        data: feature table, one row per sample.
        label: pandas Series of class labels sharing ``data``'s 0..n-1 index;
            0 is drawn as 'Positive sample', 1 as 'Other'.
    """
    visualize_setting = "TSNE"
    # Fixed: the original also ran an L2 Normalizer over the embedding but
    # overwrote the result on the very next line, so the plot has always shown
    # the raw t-SNE coordinates. The dead computation is removed and the
    # variable no longer claims to be normalized.
    tsne_transformed = pd.DataFrame(TSNE(n_components=2, init = "pca").fit_transform(data))
    tsne_transformed["label"] = label

    fig, ax = plt.subplots()
    ax.scatter(tsne_transformed[label==0][0], tsne_transformed[label==0][1], label='Positive sample', c='red')
    ax.scatter(tsne_transformed[label==1][0], tsne_transformed[label==1][1], label='Other', c='blue')
    ax.set_title(name+" ("+visualize_setting+")")
    ax.set_xlabel("dimension 1")
    ax.set_ylabel("dimension 2")
    ax.legend()
    # Fixed: save BEFORE show (plt.show() releases the current figure, so the
    # original saved an empty image) and pass the path as str instead of bytes.
    fig.savefig(plotSavingPath+name+"_"+visualize_setting+".png")
    plt.show()
    # Release the figure: this function is called once per author in a loop.
    plt.close(fig)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import (precision_score, recall_score,f1_score,accuracy_score)

def k_fold_cv_mnb_model(author_name,data,label,k=10, paper_ids=None):
    """Evaluate a Multinomial Naive Bayes classifier with k-fold cross-validation.

    Predictions of all folds are pooled and the metrics are computed once on
    the pooled labels.

    NOTE(review): scikit-learn's binary precision/recall/F1 treat label 1 as
    the positive class by default, while this notebook assigns label 0 to the
    target author. The reported precision/recall/F1 and the tn/fp/fn/tp counts
    therefore describe class 1 -- confirm this is intended.

    Args:
        author_name: name printed in the report.
        data: DataFrame of features, one row per sample.
        label: Series of 0/1 labels aligned with ``data``.
        k: number of folds. (Fixed: the original ignored this argument and
            always used 10.)
        paper_ids: optional sequence of paper ids aligned with ``data`` rows,
            used to name mislabeled samples. When omitted, the notebook-level
            ``paperID`` is used if it exists (the original always read that
            global); otherwise the row position is printed.

    Returns:
        tuple: ``(accuracy, f1, precision, recall, tn, fp, fn, tp)``.
    """
    if paper_ids is None:
        # Backward compatibility with callers that rely on the global.
        paper_ids = globals().get("paperID")

    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data.iloc[train_index], data.iloc[test_index]
        label_train, test_true_label = label.iloc[train_index], label.iloc[test_index]
        # fit data to classifier
        classifier = MultinomialNB().fit(data_train, label_train)
        # get predicted label
        label_pred = classifier.predict(data_test)
        allTrueLabel.extend(test_true_label)
        allPredLabel.extend(label_pred)
        # find out which sample cause the issue
        print("Pred: ",label_pred)
        print("True: ", test_true_label.values.tolist())
        print("Mislabeled sample: ",end='')
        # Compare positionally. The original looked the true label up by index
        # label (test_true_label[test_index[i]]), which only works when
        # `label` carries a 0..n-1 RangeIndex.
        for predicted, actual, row in zip(label_pred, test_true_label.values, test_index):
            if predicted != actual:
                sample_id = paper_ids[row] if paper_ids is not None else row
                print(str(sample_id)+",",end='')
        print()

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='binary')
    precision = precision_score(allTrueLabel, allPredLabel)
    recall = recall_score(allTrueLabel, allPredLabel)
    # Force a 2x2 matrix so the four-way unpack also works when only one
    # class shows up in the pooled labels.
    confusion = metrics.confusion_matrix(allTrueLabel, allPredLabel, labels=[0, 1]).ravel()
    tn,fp,fn,tp = confusion

    print("Author: ", author_name)
    print("Classifier: ", classifier)
    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(confusion)
    print("Accuracy: ",accuracy)
    print("F1: ", f1)
    print("Precision: ", precision)
    print("Recall: ", recall)

    return accuracy, f1, precision, recall, tn, fp, fn, tp

In [None]:
# extract vector for classification
import os
import re
import numpy as np
import sys

# Directory with one "<name>.txt" file per ambiguous name and one
# "<name><number>.txt" file per individual author.
fileDir = "../Data/"+Dataset+"/filteredSameNameAuthor/filter=10/"
# os.listdir returns entries in arbitrary order; sort them so authors are
# processed (and written to the result file) in the same order on every run.
fileList = sorted(os.listdir(fileDir))
print(fileList)

In [None]:
# Train and evaluate one classifier per author:
#   * "<name>.txt" lists the paper ids of everybody sharing <name> (all samples)
#   * "<name><number>.txt" lists the paper ids of one author (positive samples)
# The per-author metrics are collected and written to a CSV file at the end.

# Fixed: the dot before "txt" is now escaped (the original ".txt" matched any
# character) and the name part is captured so that it can be compared exactly.
AUTHOR_FILE_PATTERN = re.compile(r'(\D*)\d+\.txt$')


def _normalizeFileName(raw_name):
    """Re-decode a directory entry as UTF-8.

    Entries that were decoded with a non-UTF-8 filesystem encoding contain
    surrogate escapes; the round trip turns those back into real characters.
    """
    return raw_name.encode("utf-8", "surrogateescape").decode('utf8','surrogateescape')


def _readPids(path):
    """Read whitespace-separated paper ids from every line of ``path``."""
    pids = []
    with open(path.encode('utf-8'), 'r', encoding = 'utf8') as f:
        for line in f:
            # split() without argument never yields empty ids; the original
            # split(" ") produced '' for blank lines and repeated spaces.
            pids.extend(line.split())
    return pids


# hidden files (".DS_Store", ...) are ignored
visibleFiles = [_normalizeFileName(file) for file in fileList if not file.startswith('.')]

# create name list for all authors have same name
name_list = []
for file in visibleFiles:
    if file.endswith(".txt") and not AUTHOR_FILE_PATTERN.match(file):
        name_list.append(file[:-4])
# print(name_list)

# loop through all the author and gather result
allauthor = []
authorSampleSize = []
allSampleSize = []
allaccuracy = []
allf1 = []
allprecision = []
allrecall = []
alltn = []
allfp = []
allfn = []
alltp = []

plotSavingPath = "../plot/tf_idf/"
resultDir = "../result/"+Dataset+"/"
# make sure the output locations exist before any expensive work is done
os.makedirs(plotSavingPath, exist_ok=True)
os.makedirs(resultDir, exist_ok=True)

for name in name_list:
    # read other sample (every paper published under this name)
    other_pids = _readPids(fileDir+name+".txt")
    for file in visibleFiles:
        matched = AUTHOR_FILE_PATTERN.match(file)
        # Fixed: the original tested `name in file`, a substring match that
        # also picked up files of longer names containing this one (for
        # example "wei wang" inside "xiaowei wang0.txt"). The name part must
        # now be equal.
        if matched is None or matched.group(1) != name:
            continue
        authorName = os.path.splitext(file)[0]
        print(authorName)
        # add author to list for final output
        allauthor.append(authorName)
        # read author sample
        author_pids = _readPids(fileDir+authorName+".txt")
        # print properties
        authorSampleSize.append(len(author_pids))
        allSampleSize.append(len(other_pids))
        print(len(author_pids))
        print(len(other_pids))
        # remove author(positive sample) from other(all sample) to create negative sample
        NegativeSample_pid = extractNegativeSample(author_pids, other_pids)
        print(len(NegativeSample_pid))
        # collect all vector
        classOne, classTwo = extractVectors(author_pids,NegativeSample_pid,allpaperCollection)
        print(classOne.shape)
        print(classTwo.shape)
        # combine data from different class get all data
        # (paperID stays a notebook-level name: k_fold_cv_mnb_model reads it)
        data, label, paperID= combineClassesData(classOne, classTwo)
        # PCA visualize data
        visualizeWithPCA(plotSavingPath,authorName,data,label)
        # TSNE visualize data
        visualizeWithTSNE(plotSavingPath,authorName,data,label)
        # train classifier
        accuracy, f1, precision, recall, tn, fp, fn, tp= k_fold_cv_mnb_model(authorName,data,label,k=10)
        allaccuracy.append(accuracy)
        allf1.append(f1)
        allprecision.append(precision)
        allrecall.append(recall)
        alltn.append(tn)
        alltp.append(tp)
        allfn.append(fn)
        allfp.append(fp)
# write evaluation result to a CSV file
output = pd.DataFrame({'author':allauthor,"AuthorSampleSize":authorSampleSize,
                       "accuracy":allaccuracy,"f1":allf1, "precision":allprecision,
                      "recall":allrecall, "AllSameNameSampleCount":allSampleSize,
                      "True positive": alltp, "True negative":alltn,
                      "False positive": allfp, "False negative": allfn})
filename = "author_clf_mnb_tf_idf_filter=10.csv"
output.to_csv(resultDir+filename, encoding='utf-8',index=False)
print("Done")

In [None]:
# Manual run for a single, hard-coded author:
# the "<name>0" file holds the positive sample, the "<name>" file holds every
# paper published under that name.
with open((fileDir+"luís alves0.txt").encode('utf-8'), 'r', encoding = 'utf8') as author_file:
    author_pids = [pid for row in author_file for pid in row.strip().split(" ")]

with open((fileDir+"luís alves.txt").encode('utf-8'), 'r', encoding = 'utf8') as same_name_file:
    other_pids = [pid for row in same_name_file for pid in row.strip().split(" ")]

# show the first id of each list as a quick check of the parsing
print(author_pids[0])
print(other_pids[0])

In [None]:
# Sizes, in order: positive sample, all same-name samples, whole paper collection.
for sample_group in (author_pids, other_pids, allpaperCollection):
    print(len(sample_group))

In [None]:
# Negative sample = papers under the shared name that are not the author's.
NegativeSample_pid = extractNegativeSample(positiveSample=author_pids, allSample=other_pids)
negative_count = len(NegativeSample_pid)
print(negative_count)

In [None]:
# Build the feature tables of both classes and report their dimensions.
classOne, classTwo = extractVectors(author_pids, NegativeSample_pid, allpaperCollection)
for class_frame in (classOne, classTwo):
    print(class_frame.shape)

In [None]:
# Merge and shuffle both classes, then split features, labels and paper ids.
# paperID must stay a notebook-level name: k_fold_cv_mnb_model reads it as a
# global when it lists the mislabeled samples.
data, label, paperID= combineClassesData(classOne, classTwo)

In [None]:
# PCA scatter plot for the hard-coded author; `name` is reused by the
# following cells.
name = "luís alves0"
plotSavingPath = "../plot/tf_idf/"
visualizeWithPCA(plotSavingPath=plotSavingPath, name=name, data=data, label=label)

In [None]:
# t-SNE scatter plot for the same author (`name`, `data` and `label` come
# from the earlier cells).
plotSavingPath = "../plot/tf_idf/"
visualizeWithTSNE(plotSavingPath=plotSavingPath, name=name, data=data, label=label)

In [None]:
# Cross-validate Multinomial Naive Bayes on the hard-coded author's data.
# The returned metrics tuple is not stored; the function prints its report.
k_fold_cv_mnb_model(name,data,label,k=10)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.externals import joblib
def train_save_mnb_model(model_name, saving_path, features=None, labels=None, random_state=None):
    """Train a Multinomial Naive Bayes model on a train/test split and save it.

    The model is fitted on the training part only; predictions, true labels,
    the flattened confusion matrix and the micro-averaged F1 of the held-out
    part are printed.

    Args:
        model_name: base name of the pickle; ``"_tf_idf_mnb.pkl"`` is appended.
        saving_path: path prefix the pickle is written under (concatenated
            as-is, so it should end with a path separator).
        features: feature table to train on. Defaults to the notebook-level
            ``data``, which is what the original function always used.
        labels: labels aligned with ``features``. Defaults to the
            notebook-level ``label``.
        random_state: optional seed for the train/test split; ``None`` keeps
            the original unseeded split.
    """
    import os  # local import keeps this cell independent of earlier import cells

    # Fall back to the notebook globals for backward compatibility.
    if features is None:
        features = data
    if labels is None:
        labels = label

    # create data for train and data for test
    X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=random_state)
    # train model
    mnb = MultinomialNB()
    y_pred = mnb.fit(X_train, y_train).predict(X_test)
    print(y_pred)
    print(y_test)
    print(metrics.confusion_matrix(y_test, y_pred).ravel())
    print(metrics.f1_score(y_test, y_pred,average='micro'))
    # save model
    target = saving_path+model_name+"_tf_idf_mnb.pkl"
    target_dir = os.path.dirname(target)
    if target_dir:
        # joblib.dump does not create missing directories
        os.makedirs(target_dir, exist_ok=True)
    joblib.dump(mnb, target)
    print("Done")

# Train on the data currently in scope and store the model under `name`.
# NOTE(review): train_save_mnb_model trains on the notebook-level `data` and
# `label`. When the notebook is run top to bottom those were last built from
# the "luís alves" files, yet the model is saved as "michael wagner0" --
# confirm that the intended author's data is loaded before running this cell.
ModelSavingPath = "../Data/"+Dataset+"/models/MultinomialNB/"
name = "michael wagner0"
train_save_mnb_model(name,ModelSavingPath)