In [None]:
import sklearn
# Record the installed scikit-learn release in the notebook output so that
# results can be tied to a library version.
print('The scikit-learn version is {}.'.format(sklearn.__version__))

# One classifier for each name (OCEN) with different training percentages
1. This method discards authors who have written fewer than 100 papers.  
2. We collect the results for different training-set sizes.

In [None]:
import os
import sys
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import com_func

# ------------------------- experiment parameters ------------------------- #
# Minimum number of papers an author needs to count towards selecting a name
# group; a name group is used only if at least two of its authors reach it.
filter_select_name_group = 100

# Per-author paper-count thresholds swept by the experiment in steps of 10:
# lower bound is included, upper bound is excluded.
filter_lower, filter_upper = 100, 110

# Data set name; it is spliced into the input and output paths.
Dataset = "pubmed"

In [None]:
# Configuration: textual features only (citation embedding switched "off").
# This cell assigns the same two names as the other embedding-configuration
# cells, so whichever of them runs last decides the experiment.
pp_citation_emb = ["off"]
pp_text_emb = [
    "tf",
    "tf_idf",
    "lsa",
    "pv_dm",
    "pv_dbow",
]

In [None]:
# Configuration: citation features only (text embedding switched "off").
# This cell assigns the same two names as the other embedding-configuration
# cells, so whichever of them runs last decides the experiment.
pp_citation_emb = [
    "n2v",
    "node2vec",
]
pp_text_emb = ["off"]

In [None]:
# Configuration: every listed text embedding is combined with every listed
# citation embedding.
# This cell assigns the same two names as the other embedding-configuration
# cells, so whichever of them runs last decides the experiment.
pp_citation_emb = ["n2v"]
pp_text_emb = [
    "lsa",
    "pv_dm",
    "pv_dbow",
]

In [None]:
# Echo the embedding configuration that is currently active, one list per line.
print(pp_text_emb, pp_citation_emb, sep="\n")

In [None]:
import copy
import random
from collections import Counter
from sklearn.model_selection import StratifiedKFold

# cross validation
def k_fold_cv_with_different_train_size(data, label, clf, train_size=1, k=10):
    '''
    Stratified k-fold cross validation with a reduced training set per fold.

    Every fold is first split into train/test by StratifiedKFold. The training
    part is then sub-sampled class by class, using the class ratio of the whole
    data set, before a fresh deep copy of clf is fitted on it and evaluated on
    the untouched test part.

    data:       feature matrix, one row per sample (DataFrame or anything
                pandas.DataFrame accepts)
    label:      class label per row of data (Series or sequence)
    clf:        classifier offering fit(X, y) and predict(X); it is deep-copied
                for each fold, so the object passed in is never fitted
    train_size: between 0-1 is the fraction of EACH fold's training samples,
                larger than one is an absolute training sample count
                (floored to an int); exactly 1 means "use all of them"
    k:          number of folds

    Returns (accuracy, macro_f1, train_size, test_size): accuracy and macro F1
    are computed over the pooled predictions of all folds; train_size is the
    training sample count derived for the last fold and test_size is the
    number of test samples in the last fold.

    Exits through sys.exit when train_size <= 0 or when an absolute
    train_size exceeds the training samples available in a fold.
    '''
    # Imported here so the function does not rely on another cell having been
    # executed first to put these names into the notebook namespace.
    import pandas as pd
    from sklearn.metrics import accuracy_score, f1_score

    # seed the sampler so repeated calls pick the same training subsets
    random.seed(1)
    if train_size <= 0:
        sys.exit("Training size must be larger than 0")
    # Decide ONCE how train_size is interpreted. Re-deciding inside the fold
    # loop on an already converted value would apply the fraction to the first
    # fold only, and would re-read a converted count of 1 as "100%".
    is_fraction = train_size <= 1
    use_all_train = train_size == 1

    # sync input datatype
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)
    if not isinstance(label, pd.Series):
        label = pd.Series(label, index = data.index.values)

    # class ratio of the complete data set, used to size every class subset
    class_count = Counter(label)
    data_ratio = [(cls, class_count[cls] / len(label)) for cls in class_count]
    print(train_size)

    kf = StratifiedKFold(n_splits=k)
    allTrueLabel = []
    allPredLabel = []
    fold_train_size = 0
    test_size = 0

    for train_index, test_index in kf.split(data, label):
        # ---------------1. split train and test -------------------- #
        data_test = data.iloc[test_index]
        label_test = label.iloc[test_index]
        fold_label_train = label.iloc[train_index]
        available = len(train_index)
        # ---------------2. size of the reduced training set -------- #
        # a float result is floored
        if is_fraction:
            fold_train_size = int(available * train_size)
        else:
            fold_train_size = int(train_size)
        if fold_train_size > available:
            sys.exit("Training size must be less or equal to total training samples")
        test_size = len(test_index)
        # ---------------3. pick training samples class by class ---- #
        # Row POSITIONS are collected (not index labels), so the selection
        # stays correct even if data and label carry different indexes.
        selected_positions = []
        for cls, ratio in data_ratio:
            class_positions = train_index[(fold_label_train == cls).values].tolist()
            if use_all_train:
                selected_positions += class_positions
            else:
                # rounding can ask for more samples than this class has in the
                # fold; clamp so random.sample cannot raise ValueError
                wanted = min(round(ratio * fold_train_size), len(class_positions))
                selected_positions += random.sample(class_positions, wanted)
        final_data_train = data.iloc[selected_positions]
        final_label_train = label.iloc[selected_positions]
        # ---------------4. train and predict ----------------------- #
        per_fold_clf = copy.deepcopy(clf)
        per_fold_clf.fit(final_data_train, final_label_train)
        per_fold_predict_test = per_fold_clf.predict(data_test)

        allTrueLabel.extend(label_test.values.tolist())
        allPredLabel.extend(per_fold_predict_test)

    # macro weighs each class equally
    # micro weights each sample equally.
    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    macro_f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    return accuracy, macro_f1, fold_train_size, test_size

In [None]:
# load the file
import io
import collections
import numpy as np
import pandas as pd

from statistics import mean 

from sklearn.preprocessing import normalize
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Experiment driver: for every (text embedding, citation embedding, threshold)
# combination, evaluate classifiers per name group at training fractions
# 0.1 ... 1.0 and write one CSV of statistics per combination.

# fix random seed for reproducibility
# (pandas' DataFrame.sample, used for shuffling below, draws from NumPy's
# global RNG when no random_state is passed)
np.random.seed(1)

# directory with the labeled files, one file per name group
fileDir = "../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# results of all runs, keyed by "text=<text_emb>_citation=<citation_emb>"
diff_embedding_result = collections.defaultdict(list)

# ----------------------- different text embedding ----------------------#
for text_emb in pp_text_emb:
    print("Load text embedding: ", text_emb)
    all_text_embedding = []
    all_text_emb_pid = []
    # read pretrained embeddings
    # NOTE(review): com_func.read_text_embedding is defined outside this
    # notebook. For "tf"/"tf_idf" its result is unpacked as
    # (paper ids, embeddings); for the other types each returned row is
    # treated as [paper id, vector values...] - confirm against com_func.
    if text_emb in ["tf", "tf_idf"]:
        all_text_emb_pid, all_text_embedding = com_func.read_text_embedding(emb_type=text_emb, training_size="140k")
    elif text_emb != "off":
        all_text_embedding = com_func.read_text_embedding(emb_type=text_emb, training_size="140k")
        # first element of every row is taken as the paper id, the rest as the vector
        all_text_emb_pid = [emb[0] for emb in all_text_embedding]
        all_text_embedding = [emb[1:] for emb in all_text_embedding]

    for citation_emb in pp_citation_emb:
        print("Load citation embedding: ", citation_emb)
        # NOTE(review): unlike the text embedding, the loader is called even
        # when citation_emb == "off"; presumably it returns an empty
        # collection in that case - confirm in com_func.
        all_citation_embedding = com_func.read_citation_embedding_sorted(emb_type = citation_emb)
        all_citation_emb_pid = []
        if citation_emb!= "off":
            # first element of every row is taken as the paper id, the rest as the vector
            all_citation_emb_pid = [emb[0] for emb in all_citation_embedding]
            all_citation_embedding = [emb[1:] for emb in all_citation_embedding]

        # statistics of this embedding combination, keyed by threshold
        diff_threshold_result = collections.defaultdict(list)

        # -------------- different filter (step by 10) -----------------------#
        for step_filter in range(filter_lower, filter_upper, 10):
            # collect statistic to output (column name -> list of values, one entry per evaluated run)
            statistic_detail = collections.defaultdict(list)

            # ------- select useful name group in all name group --------------------#
            for file in listfiles:
                # group name: second and last "_"-separated parts of the file name
                temp = file.split("_")
                name = temp[1]+"_"+temp[-1]
                print("For name: ",name)
                # read needed content in labeled file
                # (the result is used below as a DataFrame with "authorID" and "paperID" columns)
                labeled_data = com_func.read_pid_aid(fileDir+file)
                #----------- select name groups that contain productive authors ------------------------#
                #----------- (at least two authors with filter_select_name_group papers or more) -------#
                # count number of papers each author wrote, based on author ID
                authorCounter = collections.Counter(labeled_data["authorID"])
                # drop authors below the name-group selection filter
                for k in list(authorCounter):
                    if authorCounter[k] < filter_select_name_group:
                        del authorCounter[k]
                # if only one author or no author passes the filter, the name group is not applicable
                if(len(authorCounter)==0) or (len(authorCounter)==1):
                    print(name, " pass")
                else:
                    temp_orginal_sample_size = len(labeled_data)
                    #-------- keep only the authors with at least step_filter papers ---------#
                    print("Total sample size before apply filter: ",len(labeled_data))
                    # count number of papers each author wrote, based on author ID
                    paperCounter = collections.Counter(labeled_data["authorID"])
                    print(paperCounter)
                    print("Total author before apply threshoid: ", len(paperCounter))
                    # drop the authors below the current threshold; what remains are the classes
                    for k in list(paperCounter):
                        if paperCounter[k] < step_filter:
                            del paperCounter[k]
                    # note: `temp` is reused here for the list of remaining author IDs
                    temp =list(paperCounter.keys())
                    print(temp)
                    print("Total author after apply threshoid: ", len(temp))
                    # remove the samples of authors that did not pass the threshold
                    labeled_data = labeled_data[labeled_data.authorID.isin(temp)]
                    print("Total sample size after apply filter: ",len(labeled_data))
                    #------------ extract paper representation -------------------------------------------#
                    # shuffle the rows and renumber the index from 0
                    labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
                    # extract true label and pid
                    label = labeled_data["authorID"]
                    pid = labeled_data["paperID"]
                    # list of different data field
                    part_collection = []
                    # select the features to feed to the classification algorithm
                    # NOTE(review): com_func.extract_embedding is defined elsewhere; presumably it
                    # returns the embedding rows matching `pid` - confirm in com_func.
                    # data part, text information
                    data_part_text = com_func.extract_embedding(all_text_embedding, all_text_emb_pid, pid)
                    print("Text embedding shape: ", data_part_text.shape)
                    part_collection.append(data_part_text)
                    # data part, citation information
                    data_part_citation = com_func.extract_embedding(all_citation_embedding, all_citation_emb_pid, pid)
                    # missing citation values are replaced by 0
                    data_part_citation.fillna(0, inplace=True)
                    print("Citation embedding shape: ", data_part_citation.shape)
                    part_collection.append(data_part_citation)
                    # merge the different parts of the data by concatenating them column-wise
                    # remove empty emb (when emb set off)
                    part_collection = [part for part in part_collection if len(part)!=0]
                    if len(part_collection)>1:
                        combinedata = np.concatenate(part_collection,axis=1)
                    elif len(part_collection)==1:
                        if isinstance(part_collection[0], pd.DataFrame):
                            combinedata = part_collection[0].values
                        else:
                            combinedata = part_collection[0]
                    else:
                        print("No data available")
                        # leaves the loop over files; the statistics gathered so far
                        # for this threshold are still written out below
                        break
                    print("Final feature (combined embedding) shape: ", combinedata.shape)
                    # ------------- 10% to 100% training size changes -----------------------#
                    # NOTE(review): float-step arange; k_fold_cv_with_different_train_size treats a
                    # value exactly equal to 1 as "use all training samples", so the last
                    # value is expected to be exactly 1.0 - confirm.
                    for train_percent in np.arange(0.1, 1.1, 0.1):
                        statistic_detail["Name group"].append(name)
                        statistic_detail["Class number"].append(len(paperCounter))
                        statistic_detail["Per class size"].append(paperCounter)
                        statistic_detail["Orginal sample size"].append(temp_orginal_sample_size)
                        statistic_detail["Total selected sample size"].append(len(labeled_data))
                        statistic_detail["used_train_percent"].append(train_percent)
                        # -------------- using converted feature vector to train classifier-------------------#
                        # multinomial naive Bayes is evaluated for the "tf" text embedding only
                        if text_emb == "tf":
                            # using multinomial naive bayes
                            clf = MultinomialNB()
                            mnbaccuracy, mnbmarcof1, train_size, test_size = k_fold_cv_with_different_train_size(combinedata, label, clf, train_size=train_percent, k=10)
                            print("MNB F1: ", mnbmarcof1)
                            statistic_detail['MNB Accuracy'].append(mnbaccuracy)
                            statistic_detail['MNB macro F1'].append(mnbmarcof1)
                        # using logistic regression
                        clf = LogisticRegression(solver= "liblinear")
                        LRaccuracy, LRmarcof1, train_size, test_size = k_fold_cv_with_different_train_size(combinedata, label, clf, train_size=train_percent, k=10)
                        print("LR F1: ", LRmarcof1)
                        statistic_detail["LR accuracy"].append(LRaccuracy)
                        statistic_detail["LR macro f1"].append(LRmarcof1)
                        # using SVM with linear kernel
                        clf = SVC(gamma="auto", kernel='linear')
                        svcaccuracy, svcmarcof1, train_size, test_size = k_fold_cv_with_different_train_size(combinedata, label, clf, train_size=train_percent, k=10)
                        print("SVM F1: ", svcmarcof1)
                        statistic_detail["SVM(linear) accuracy"].append(svcaccuracy)
                        statistic_detail["SVM(linear) macro f1"].append(svcmarcof1)

            # write evaluation result to a CSV file (one file per embedding combination and threshold)
            output = pd.DataFrame(statistic_detail)
            print(output)

            savePath = "../../result/"+Dataset+"/2_OCEN_Different_train_percentage_sample=140k/"
            filename = "citation="+citation_emb+"_textual="+text_emb+"_threshold="+str(step_filter)+".csv"
            com_func.write_csv_df(savePath, filename, output)
            print("Done")

            # keep the raw statistics of this threshold in memory as well
            diff_threshold_result[step_filter].append(statistic_detail)

        diff_embedding_result["text="+text_emb+"_citation="+citation_emb].append(diff_threshold_result)
    

In [None]:
# Display the nested results collected by the experiment loop
# (embedding combination -> threshold -> statistics).
diff_embedding_result

In [None]:
# IPython magic: list the names currently defined in the interactive namespace.
%who