# Co-training

For visualization of co-training process, we apply PCA to feature before training. This will make co-training process clear, but the result will be not accuracy because apply PCA will loss lots of information.

1. We assume only part of label exist

2. We only select binary case (Only when one name indicate two and only two author)

3. When we apply 10 fold with co-training, each fold of first iteration will be baseline compare to co-training

# Improved part
1. adding stopping criterion where when confident score is 95% or number of iteration equal k


In [1]:
import os
import sys
import warnings

#warnings.filterwarnings('error')
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
np.set_printoptions(precision=4, suppress=True)

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import com_func

#----- threshold for selecting set of name group -----------#
threshold_select_name_group = 100
#----- threshold for selecting min sample in name group ----#
threshold_lower = 100
threshold_upper = 110

apply_threshold_to_name_group_samples = True

pp_text = ["pv_dbow"]
pp_citation = "n2v"

Dataset = "pubmed"

In [2]:
import numpy as np
import itertools
import warnings
import matplotlib.pyplot as plt
from adjustText import adjust_text

from collections import defaultdict

# create co training classifier
class Co_training_clf(object):
    
    import copy
    
    def __init__(self, clf1, clf2=None, p=1, n=1, k=30, u = 75):
        
        self.clf1 = clf1
        # assume co_training on one classifier
        if clf2 == None:
            self.clf2 = self.copy.deepcopy(clf1)
        else:
            self.clf2 = clf2
        # take p example from most confidently positive labels to example
        self.p = p
        # take n example from most confidently negative label to example
        self.n = n
        # number of iteration
        self.k = k
        # size of pool of unlabeled samples
        self.u = u

    def init_L_U_U_prime(self, labels):
        # index of the samples that are initially labeled
        L = labels.index[labels != -1].tolist()
        # index of unlabeled samples
        U = labels.index[labels == -1].tolist()
        print("Initial L size: ", len(L))
        print("Initial U size: ", len(U))
        # random drawing sample from U
        random.shuffle(U)
        U_prime = U[-min(len(U), self.u):]
        # remove the samples in U_prime from U
        U = U[:-len(U_prime)]
        print("Initial U prime size: ", len(U_prime))
        return L, U, U_prime
    
    def check_iter_label_mapping(self, iter_clf1, iter_clf2):
        '''
        In theory, it shouldn't occur that label not mapping since it trained on same dataset but different view
        But add a check to make sure it won't occur and save the class mapping for late label unlabeled sample
        '''
        dv1_class_label = iter_clf1.classes_
        dv2_class_label = iter_clf2.classes_
        if all(dv1_class_label == dv2_class_label):
            self.class_ = dv1_class_label
        else:
            sys.exit("Two view classifier label not mapping")

    def label_p_n_samples(self, proba, rank):
        U_prime_size = len(proba)
        self_trained_labels = []
        self_trained_confident = []
        for label, conf_measure in enumerate(rank):
            # 0 positive sample
            if label==0:
                p = []
                p_confident = []
                index = 0
                while(len(p) < self.p):
                    max_conf_sample_index = conf_measure[index]
                    # ---- if positive predict proba is more than 50% ------- #
                    if (proba[max_conf_sample_index][label] > 0.5):
                        print('P: ', max_conf_sample_index, " : ", proba[max_conf_sample_index])
                        p.append(max_conf_sample_index)
                        p_confident.append(proba[max_conf_sample_index][label])
                    index +=1
                    if (index>=U_prime_size):
                        break
                self_trained_labels.append(p)
                self_trained_confident.append(p_confident)
            # 1 negative sample
            elif label == 1:
                n = []
                n_confident = []
                index = 0
                while(len(n) < self.n):
                    max_conf_sample_index = conf_measure[index]
                    # ---- if negative predict proba is more than 50% ------- #
                    if (proba[max_conf_sample_index][label] > 0.5):
                        print('N: ', max_conf_sample_index, " : ", proba[max_conf_sample_index])
                        n.append(max_conf_sample_index)
                        n_confident.append(proba[max_conf_sample_index][label])
                    index +=1
                    if (index>=U_prime_size):
                        break
                self_trained_labels.append(n)
                self_trained_confident.append(n_confident)
            else:
                print("Class label error")
        return self_trained_labels, self_trained_confident

    def get_self_labeled_sample(self):
        '''
        return:
            self-labeled new positive, self-labeled new negative (Index)
        '''
        
        return self.new_labeled_idx

    def plot_co_training_process(self, iterCount, data, iter_train_label, unlabeled_idx, h1_new = [], h2_new = [],
                                 h1_new_prob = [], h2_new_prob = [], plotSavingPath=None, name=None):
        if not os.path.exists(plotSavingPath):
            os.makedirs(plotSavingPath)
        pca_one = data.values[:,0]
        pca_two = data.values[:,1]
        # Layer 1. plot unlabel samples in u_prime
        fig, ax = plt.subplots(figsize=(9,7))
        ax.scatter(pca_one[unlabeled_idx], pca_two[unlabeled_idx], color='grey', label = "unlabeled", s = 50, alpha = 0.5)
        # Layer 2. plot the labeled samples
        for author in np.unique(iter_train_label):
            ix = iter_train_label.index[iter_train_label == author].tolist()
            # print(ix)
            ax.scatter(pca_one[ix], pca_two[ix], cmap='viridis', label = author, s = 50, alpha = 0.5)
        if iterCount != 0:
            # layer 3. mark self labeled samples
            all_h1_new = list(itertools.chain(*h1_new))
            all_h2_new = list(itertools.chain(*h2_new))
            temp_h1 = ax.scatter(pca_one[all_h1_new], pca_two[all_h1_new], edgecolor='black', linewidth='1', s=50)
            temp_h1.set_facecolor("none")
            temp_h1.set_label("h1 self-labeled")
            temp_h2 = ax.scatter(pca_one[all_h2_new], pca_two[all_h2_new], edgecolor='red', linewidth='1', s=50)
            temp_h2.set_facecolor("none")
            temp_h2.set_label("h2 self-labeled")
            # layer 4. mark new samples confidence and which view produce it
            last_iter_h1_new = h1_new[-1]
            last_iter_h2_new = h2_new[-1]
            text = []
            for i, idx in enumerate(last_iter_h1_new):
                text.append(plt.text(pca_one[idx], pca_two[idx], "{:.2f}".format(h1_new_prob[i]), color='black'))
            for i, idx in enumerate(last_iter_h2_new):
                text.append(plt.text(pca_one[idx], pca_two[idx], "{:.2f}".format(h2_new_prob[i]), color='red'))
            adjust_text(text, x=pca_one, y=pca_two, force_points=0.3, force_text=0.3, expand_points=(2, 2), 
                        expand_text=(2, 2), arrowprops=dict(arrowstyle='Simple', color='red'))
        legend = ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25), ncol=3,prop={'size': 13})
        plt.title('Co-training iteration: '+ str(iterCount), fontsize=14)
        plt.xlabel("First principal component",fontsize=14)
        plt.ylabel("Second principal component",fontsize=14)
        plt.savefig((plotSavingPath+name+"_PCA_i-"+str(iterCount)+".png").encode('utf-8'), dpi=100, bbox_extra_artists=(legend,), bbox_inches='tight')
        plt.close("all")
        # plt.show()
        

    def fit(self, dataView1, dataView2, labels, dv1_test, dv2_test, label_test, plot_save_name=None, plot_save_path=None):
        # using all unlabeled sample instead of pool of unlabeled sample
        self.u = len(labels)
        # index of self labeled samples
        self.new_labeled_idx = defaultdict(list)
        self.h1_new_idx = defaultdict(list)
        self.h2_new_idx = defaultdict(list)
        
        # sync input datatype
        if not all(isinstance(i, pd.DataFrame) for i in [dataView1, dataView2, labels]):
            if not isinstance(dataView1, pd.DataFrame):
                dataView1 = pd.DataFrame(dataView1)
            if not isinstance(dataView2, pd.DataFrame):
                dataView2 = pd.DataFrame(dataView2)
            if not isinstance(labels, pd.DataFrame):
                labels = pd.DataFrame(labels, index = dataView1.index.values)
        labels = pd.Series(labels[0].values, index=dataView1.index.values) 
        # when fit co-train, we collect f1 on test samples wrt each iteration
        self.f1_on_test_dv1 = []
        self.f1_on_test_dv2 = []
        
        print("P value: ", self.p, " N value: ", self.n)
        print(dataView1.index.values)
        
        L, U, U_prime = self.init_L_U_U_prime(labels)
        print("L: ", L)
        print("U: ", U)
        print("U_prime: ", U_prime)
        
        iterCount = 0
        # --------- plot initial stage -------------- #
        init_train_label = labels[L]
        plot_save_dv1_name = plot_save_name+"_dv1"
        if plot_save_name != None:
            # ----- save pca reduced plot for dv1 ------ #
            self.plot_co_training_process(iterCount, dataView1, init_train_label, U_prime,
                                          plotSavingPath = plot_save_path, name = plot_save_dv1_name)
            # ----- dv2 -------- #
            plot_save_dv2_name = plot_save_name+"_dv2"
            self.plot_co_training_process(iterCount, dataView2, init_train_label, U_prime,
                                          plotSavingPath = plot_save_path, name = plot_save_dv2_name)
        
        #loop until we have assigned labels to every sample in U and U_prime or we hit our iteration break condition
        while iterCount < self.k and U_prime:
            # print("step",iterCount, " L: ",L)
            # print("step",iterCount, " U_prime: ",U_prime)
            # ------------- get labeled samples for train ----------- # 
            iter_train_d1 = dataView1.iloc[L]
            iter_train_d2 = dataView2.iloc[L]
            iter_train_label = labels[L]
            # print(iter_train_label)
            # ----------- get U_prime unlabeled samples  ------------ #
            iter_unlabeled_d1 = dataView1.iloc[U_prime]
            iter_unlabeled_d2 = dataView2.iloc[U_prime]
            # ------------ train different view classifier ----------- #
            iter_clf1 = self.copy.deepcopy(self.clf1) 
            iter_clf2 = self.copy.deepcopy(self.clf2)
            iter_clf1.fit(iter_train_d1, iter_train_label.ravel())
            iter_clf2.fit(iter_train_d2, iter_train_label.ravel())
            self.check_iter_label_mapping(iter_clf1, iter_clf2)
            # --------- test error on test data --------------------- #
            # make prediction on test data
            y1 = iter_clf1.predict(dv1_test)
            y2 = iter_clf2.predict(dv2_test)
            # f1 score on each iteration
            f1_dv1 = f1_score(label_test, y1, average='macro')
            f1_dv2 = f1_score(label_test, y2, average='macro')
            # collect f1 for current iteration
            self.f1_on_test_dv1.append(f1_dv1)
            self.f1_on_test_dv2.append(f1_dv2)
            ''' 
            Notice here dv1_proba and dv2_proba's index is index for u' (Unlabeled data only)
            We use index of u' to find index (position) of data in U where U and L is all data index
            '''
            # rank class probabilities for unlabeled sample for it's confidence measure
            dv1_proba = iter_clf1.predict_proba(iter_unlabeled_d1)
            dv2_proba = iter_clf2.predict_proba(iter_unlabeled_d2)
            dv1_proba_rank = []
            dv2_proba_rank = []
            # proba1_rank[i] is label i's confidence measure
            for class_proba in dv1_proba.T:
                dv1_proba_rank.append((-class_proba).argsort())
            for class_proba in dv2_proba.T:
                dv2_proba_rank.append((-class_proba).argsort())
            # print(dv1_proba)
            # print(dv1_proba_rank)
            # print(dv2_proba)
            # print(dv2_proba_rank)
            # h1 classifier
            h1_new_sample, h1_new_sample_probs = self.label_p_n_samples(dv1_proba, dv1_proba_rank)
            # h2 classifier
            h2_new_sample, h2_new_sample_probs = self.label_p_n_samples(dv2_proba, dv2_proba_rank)
            # collect statistic for plot only
            h1_new_flatten = list(itertools.chain(*h1_new_sample))
            h2_new_flatten = list(itertools.chain(*h2_new_sample))
            iter_h1_prob = list(itertools.chain(*h1_new_sample_probs))
            iter_h2_prob = list(itertools.chain(*h2_new_sample_probs))
            iter_h1_for_plot = [U_prime[x] for x in h1_new_flatten]
            iter_h2_for_plot = [U_prime[x] for x in h2_new_flatten]
            self.h1_new_idx["index"].append(iter_h1_for_plot)
            self.h1_new_idx["confident"].append(iter_h1_prob)
            self.h2_new_idx["index"].append(iter_h2_for_plot)
            self.h2_new_idx["confident"].append(iter_h2_prob)
            # add most confidence samples as new training samples
            roundNew = list(zip(h1_new_sample, h2_new_sample))
            print(roundNew)
            # auto label the samples and remove it from U_prime
            round_auto_labeled = []
            for label, round_new in enumerate(roundNew):
                round_new = set([item for sublist in round_new for item in sublist])
                auto_labeled = [U_prime[x] for x in round_new]
                round_auto_labeled.extend(auto_labeled)
                self.new_labeled_idx[self.class_[label]].append(auto_labeled)
                # add label to those new samples
                labels[auto_labeled] = self.class_[label]
                print(self.class_[label]," (u' idx): ",round_new)
                print(self.class_[label]," (U idx): ",auto_labeled)
            print(roundNew)
            print(round_auto_labeled)
            # extend the labeled sample
            L.extend(round_auto_labeled)
            # remove the labeled sample from U_prime
            U_prime = [x for x in U_prime if x not in round_auto_labeled]
            #print(U_prime)
            # randomly choice 2p+2n examples from u to replenish u_prime
            replenishItem = U[-(2*self.p+2*self.n):]
            U_prime.extend(replenishItem)
            U = U[:-len(replenishItem)]
            iterCount +=1
            # ----------- plot the co-training process -------------- #
            if plot_save_name != None:
                new_train_label = labels[L]
                h1_new = self.h1_new_idx["index"]
                h2_new = self.h2_new_idx["index"]
                # self_labeled_idx = [val for sublist in self_labeled_idx_temp for subsublist in sublist for val in subsublist]
                print("Current iter h1 new: ", iter_h1_for_plot, " probs: ", iter_h1_prob)
                print("Current iter h2 new: ", iter_h2_for_plot, " probs: ", iter_h2_prob)
                # ----- save pca reduced plot for dv1 ------ #
                plot_save_dv1_name = plot_save_name+"_dv1"
                self.plot_co_training_process(iterCount, dataView1, new_train_label, U_prime,
                                              h1_new, h2_new, iter_h1_prob, iter_h2_prob,
                                              plot_save_path, plot_save_dv1_name)
                # ----- dv2 -------- #
                plot_save_dv2_name = plot_save_name+"_dv2"
                self.plot_co_training_process(iterCount, dataView2, new_train_label, U_prime,
                                              h1_new, h2_new, iter_h1_prob, iter_h2_prob,
                                              plot_save_path, plot_save_dv2_name)
        print("Total Labeled number: ", len(L), " Still unlabeled number: ", len(U_prime))
        print(self.f1_on_test_dv1)
        print(self.f1_on_test_dv2)
        # final train
        newtrain_d1 = dataView1.iloc[L]
        newtrain_d2 = dataView2.iloc[L]
        self.clf1.fit(newtrain_d1, labels.iloc[L])
        self.clf2.fit(newtrain_d2, labels.iloc[L])
        # ------ save f1 vs number of iteration plot ------- #
        if plot_save_name != None:
            default_text_based = [self.f1_on_test_dv1[0]] * iterCount
            default_citation_based = [self.f1_on_test_dv2[0]] * iterCount
            default_step = np.arange(0,iterCount)
            co_train_text_based = self.f1_on_test_dv1[1:]
            co_train_citation_based = self.f1_on_test_dv2[1:]
            co_training_step = np.arange(1,iterCount)

            fig = plt.figure()
            ax = plt.axes()
            plt.plot(default_step, default_text_based, linestyle='dashed', label="Text based default")
            plt.plot(default_step, default_citation_based, linestyle='dashdot', label="Citation based default")
            plt.plot(co_training_step, co_train_text_based, linestyle='solid', marker = "*", label="Text based")
            plt.plot(co_training_step, co_train_citation_based, linestyle='dotted', marker = "+", label="Citation based")
            ax.autoscale_view()
            legend = ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=2)
            plt.xlabel('Co-Training Iterations')
            plt.ylabel('F1 score')
            plt.savefig((plot_save_path+plot_save_name+"_diff_iter_f1.png"), dpi=300, bbox_extra_artists=(legend,), bbox_inches='tight')
            # plt.show()
            plt.close("all")
            
    def co_train_process_f1(self):
        return self.f1_on_test_dv1, self.f1_on_test_dv2

    def get_iter_count(self):
        return self.k

    def supports_proba(self, clf, x):
        try:
            clf.predict_proba([x])
            return True
        except:
            return False

    def predict(self, dataView1, dataView2):
        y1 = self.clf1.predict(dataView1)
        y2 = self.clf2.predict(dataView2)
        proba_supported = self.supports_proba(self.clf1, dataView1.iloc[0]) and self.supports_proba(self.clf2, dataView2.iloc[0])
        #fill pred with -1 so we can identify the samples in which sample classifiers failed to agree
        y_pred = ["-1"] * dataView1.shape[0]
        for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
            # if both agree on label
            if y1_i == y2_i:
                y_pred[i] = y1_i
            # if disagree on label, times probability together, choice the class have higher probabilities
            elif proba_supported:
                y1_probas = self.clf1.predict_proba([dataView1.iloc[i]])[0]
                y2_probas = self.clf2.predict_proba([dataView2.iloc[i]])[0]
                print("y1 disagree on",i, " Proba: ",y1_probas)
                print("y2 not aggreed on ",i, "Proba: ", y2_probas)
                prod_y_probas = [proba_y1 * proba_y2 for (proba_y1, proba_y2) in zip(y1_probas, y2_probas)]
                print("product probas:",prod_y_probas)
                max_prob_idx = prod_y_probas.index(max(prod_y_probas))
                y_pred[i] = self.class_[max_prob_idx]
                print("result idx: ", max_prob_idx, " result: ",y_pred[i])
            else:
                #the classifiers disagree and don't support probability, exit
                sys.exit("classifiers disagree with label, result may not accurate")
        # convert final result to np array
        y_pred_np_array = np.asarray(y_pred)
        return y_pred_np_array

    def predict_proba(self, dataView1, dataView2):
        # the predicted probabilities is simply a product (*) of probabilities given from each classifier trained
        y1_probas = self.clf1.predict_proba(dataView1)
        y2_probas = self.clf2.predict_proba(dataView2)
        
        proba = (y1_probas*y2_probas)
        return proba


In [None]:
import copy
import random

import seaborn as sns

from collections import Counter
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score

# cross validation
def k_fold_cv_co_train_binary(dataview1, dataview2, label, init_labeled_size, clf, k=10, plot_save_name=None, plot_save_path=None):
    kf = StratifiedKFold(n_splits=k)
    allTrueLabel = []
    allPredLabel_co_train,allPredLabel_LR,allPredLabel_SVM = ([] for i in range(3))
    all_fold_coTrain_diff_iter_on_test_dv1 = []
    all_fold_coTrain_diff_iter_on_test_dv2 = []
    
    all_fold_statistic = []
    fold = 0
    co_train_iteration = 0
    # convert different input type to dataframe for consistency
    dataview1 = pd.DataFrame(dataview1)
    dataview2 = pd.DataFrame(dataview2)
    
    for train_index, test_index in kf.split(dataview1, label):
        detailed_plot_path = plot_save_path+plot_save_name+"/fold"+str(fold)+"/"
        fold +=1
        # print("TRAIN:", train_index, " \n TEST:", test_index)
        # ---------------split train and test -------------------- #
        dv1_train, dv1_test = dataview1.iloc[train_index], dataview1.iloc[test_index]
        dv2_train, dv2_test = dataview2.iloc[train_index], dataview2.iloc[test_index]
        all_label_train, label_test = label.iloc[train_index], label.iloc[test_index]
        
        # plot true labeled result for different view
        if not os.path.exists(detailed_plot_path):
            os.makedirs(detailed_plot_path)
        # view one
        dv1_pca_one = dv1_train.iloc[:,0]
        dv1_pca_two = dv1_train.iloc[:,1]
        fig, ax = plt.subplots(figsize=(9,7))
        for author in np.unique(all_label_train):
            ix = all_label_train.index[all_label_train == author].tolist()
            ax.scatter(dv1_pca_one[ix], dv1_pca_two[ix], cmap='viridis', label = author, s = 50, alpha = 0.5)
        legend = ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=2,prop={'size': 13})
        plt.title('True label', fontsize=14)
        plt.xlabel("First principal component",fontsize=14)
        plt.ylabel("Second principal component",fontsize=14)
        plt.savefig((detailed_plot_path+name+"_PCA_true_label_dv1.png").encode('utf-8'), dpi=100, bbox_extra_artists=(legend,), bbox_inches='tight')
        # view two
        dv2_pca_one = dv2_train.iloc[:,0]
        dv2_pca_two = dv2_train.iloc[:,1]
        fig, ax = plt.subplots(figsize=(9,7))
        for author in np.unique(all_label_train):
            ix = all_label_train.index[all_label_train == author].tolist()
            ax.scatter(dv2_pca_one[ix], dv2_pca_two[ix], cmap='viridis', label = author, s = 50, alpha = 0.5)
        legend = ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=2,prop={'size': 13})
        plt.title('True label', fontsize=14)
        plt.xlabel("First principal component",fontsize=14)
        plt.ylabel("Second principal component",fontsize=14)
        plt.savefig((detailed_plot_path+name+"_PCA_true_label_dv2.png").encode('utf-8'), dpi=100, bbox_extra_artists=(legend,), bbox_inches='tight')
        
        # ----------- set some labeled data as unlabeled ------------ #
        # 1. obtain data ratio
        c = Counter(all_label_train)
        data_ratio = [(i, c[i] / len(all_label_train)) for i in c]
        print(data_ratio)
        # 2. use co_train_per_class_size to draw "init_labeled_size" of samples as labeled, other as unlabeled
        co_train_per_class_size = [(label, round(ratio*init_labeled_size)) for label, ratio in data_ratio]
        final_train_label = all_label_train.tolist()
        train_sample_idx = []
        # 3. mark other as unlabeled
        for unique_label, training_size in co_train_per_class_size:
            curr_label_idx = [i for i, x in enumerate(final_train_label) if x == unique_label]
            curr_label_size = len(curr_label_idx)
            unlabeled_size = curr_label_size - training_size
            unlabel_item_idx = random.sample(curr_label_idx, unlabeled_size)
            train_sample_idx += [x for x in curr_label_idx if x not in unlabel_item_idx]
            for unlabel_idx in unlabel_item_idx:
                final_train_label[unlabel_idx]=-1
        print(final_train_label)
        unlabeled_sample_size = len(final_train_label)-len(train_sample_idx)
        final_dv1_train = dv1_train.reset_index(drop=True)
        final_dv2_train = dv2_train.reset_index(drop=True)
        ''' -------------- train binary co-training ------------------- '''
        per_fold_clf = copy.deepcopy(clf)
        per_fold_clf.fit(final_dv1_train, final_dv2_train, final_train_label, dv1_test, dv2_test, label_test,
                         plot_save_name, detailed_plot_path)
        # get self-labeled sample index #
        self_labeled_index = per_fold_clf.get_self_labeled_sample()
        print("Self labeled sample index: ", self_labeled_index)
        self_labeled_idx_temp = [idx for idx in self_labeled_index.values()]
        all_self_labeled_index = [val for sublist in self_labeled_idx_temp for subsublist in sublist for val in subsublist]
        # -------- use concatenated features for comparsion -------- #
        concatenated_train = pd.concat([final_dv1_train.iloc[train_sample_idx],final_dv2_train.iloc[train_sample_idx]], axis=1, ignore_index=True)
        train_label = [final_train_label[i] for i in train_sample_idx]
        ''' --- train LR on concatenated features with "init_labeled_size" labeled samples  --- '''
        LR_clf = LogisticRegression(solver= "liblinear")
        LR_clf.fit(concatenated_train, train_label)
        ''' --- train SVM on concatenated features with "init_labeled_size" labeled samples  --- '''
        SVM_clf = SVC(gamma="auto", kernel='linear')
        SVM_clf.fit(concatenated_train, train_label)
        # ------------ generate concatenated test dataset ------------ #
        concatenated_test = pd.concat([dv1_test,dv2_test], axis=1, ignore_index=True)
        # ------------- get predicted label for test set ------------- #
        co_lr_label_predict = per_fold_clf.predict(dv1_test, dv2_test)
        LR_predict = LR_clf.predict(concatenated_test)
        SVM_predict = SVM_clf.predict(concatenated_test)
        print("co-train f1: ", metrics.classification_report(label_test, co_lr_label_predict))
        print("LR f1: ", metrics.classification_report(label_test, LR_predict) )
        print("SVM f1: ", metrics.classification_report(label_test, SVM_predict))
        # ------------- get co-training iterations f1 score ---------- #
        co_train_iteration = per_fold_clf.get_iter_count()
        coTrain_diff_iter_on_test_dv1, coTrain_diff_iter_on_test_dv2 = per_fold_clf.co_train_process_f1()
        all_fold_coTrain_diff_iter_on_test_dv1.append(coTrain_diff_iter_on_test_dv1)
        all_fold_coTrain_diff_iter_on_test_dv2.append(coTrain_diff_iter_on_test_dv2)
        
        allTrueLabel.extend(label_test.values.tolist())
        allPredLabel_co_train.extend(co_lr_label_predict)
        allPredLabel_LR.extend(LR_predict)
        allPredLabel_SVM.extend(SVM_predict)
        # collect per fold statistic
        curr_fold_statistic = {'author': plot_save_name, 'fold':fold, 'train_size': co_train_per_class_size, 'test_size': dv1_test.shape[0],
                               'total_self_labeled_train': len(all_self_labeled_index), "unlabeled size": unlabeled_sample_size,
                               'co-train f1': f1_score(label_test.values.tolist(), co_lr_label_predict,average='macro'),
                               'LR f1': f1_score(label_test.values.tolist(), LR_predict,average='macro'),
                               'SVM f1': f1_score(label_test.values.tolist(), SVM_predict,average='macro')}
        all_fold_statistic.append(curr_fold_statistic)
    if plot_save_path !=None:
        # --------------- plot per fold result f1 variance --------------- #
        all_per_fold_f1_score_variance_plot = pd.DataFrame(all_fold_statistic)
        plot_temp_data = all_per_fold_f1_score_variance_plot[['co-train f1', 'LR f1', 'SVM f1']].copy()
        plot_temp_data = pd.melt(plot_temp_data, var_name='methods', value_name='f1')
        ax = sns.boxplot(x="methods", y="f1", data=plot_temp_data)
        ax = sns.swarmplot(x="methods", y="f1", data=plot_temp_data, color=".25")
        ax.set_title(plot_save_name+" result variance with 10 fold")
        plt.savefig(plot_save_path+plot_save_name+"/"+plot_save_name+"_result_variance.png", dpi=300)
        plt.show()
    # plot averaged f1 score wrt different iterations in co-training process
    averaged_coTrain_diff_iter_on_test_dv1 = np.mean(all_fold_coTrain_diff_iter_on_test_dv1, axis=0)
    averaged_coTrain_diff_iter_on_test_dv2 = np.mean(all_fold_coTrain_diff_iter_on_test_dv2, axis=0)
    
    default_text_based = [averaged_coTrain_diff_iter_on_test_dv1[0]] * co_train_iteration
    default_citation_based = [averaged_coTrain_diff_iter_on_test_dv2[0]] * co_train_iteration
    default_step = np.arange(0, co_train_iteration)
    co_train_text_based = averaged_coTrain_diff_iter_on_test_dv1[1:]
    co_train_citation_based = averaged_coTrain_diff_iter_on_test_dv2[1:]
    co_training_step = np.arange(1, co_train_iteration)

    fig = plt.figure()
    ax = plt.axes()
    plt.plot(default_step, default_text_based, linestyle='dashed', label="Text based default")
    plt.plot(default_step, default_citation_based, linestyle='dashdot', label="Citation based default")
    plt.plot(co_training_step, co_train_text_based, linestyle='solid', marker = "*", label="Text based")
    plt.plot(co_training_step, co_train_citation_based, linestyle='dotted', marker = "+", label="Citation based")
    ax.autoscale_view()
    legend = ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=2)
    plt.xlabel('Co-Training Iterations')
    plt.ylabel('F1 score')
    plt.savefig((plot_save_path+plot_save_name+"_diff_iter_f1_avg.png"), dpi=300, bbox_extra_artists=(legend,), bbox_inches='tight')
    plt.show()
    plt.close("all")
    
    co_train_accuracy = accuracy_score(allTrueLabel, allPredLabel_co_train)
    co_train_f1 = f1_score(allTrueLabel, allPredLabel_co_train,average='macro')
    print(metrics.classification_report(allTrueLabel, allPredLabel_co_train))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel_co_train).ravel())
    
    LR_accuracy = accuracy_score(allTrueLabel, allPredLabel_LR)
    LR_f1 = f1_score(allTrueLabel, allPredLabel_LR,average='macro')
    print(metrics.classification_report(allTrueLabel, allPredLabel_LR))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel_LR).ravel())
    
    SVM_accuracy = accuracy_score(allTrueLabel, allPredLabel_SVM)
    SVM_f1 = f1_score(allTrueLabel, allPredLabel_SVM,average='macro')
    print(metrics.classification_report(allTrueLabel, allPredLabel_SVM))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel_SVM).ravel())

    
    return LR_f1, SVM_f1, co_train_f1, all_fold_statistic

In [None]:
import collections
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# fix random seed for reproducibility
np.random.seed(1)

# loop through all files in directory add name to name list
fileDir = "../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

init_labeled_size = 10

co_lr_diff_embedding_result = []

#---------------- load different embeddings for view one ---------------#
for select_emb in pp_text:
    print("Load text embedding: ", select_emb)
    # read viewone embeddings
    viewone_text_emb, viewone_emb_pid = com_func.read_text_embedding(emb_type=select_emb, training_size = "140k")
    viewone_text_emb = np.column_stack((viewone_emb_pid,viewone_text_emb))
    # read viewtwo embedding, notice here we only use labeled data
    print("Load citation embedding: ", pp_citation)
    viewtwo_citation_embedding = com_func.read_citation_embedding_sorted(emb_type = pp_citation, labeled_only = True)
    # print(viewone_text_emb[0])
    # print(viewtwo_citation_embedding[0])
    
    threshold_change_all_co_lr_f1s = []
    threshold_change = []
    
    # -------------- different threshold (step by 10) -----------------------#
    for step_threshold in range(threshold_lower, threshold_upper, 10):
        plot_save_path = "../../plot/co_train_detail_plots/threshold="+str(step_threshold)+"/V1="+select_emb+"_V2="+pp_citation+"/"
        threshold_change.append(step_threshold)
        # collect statistic to output
        name_group, total_sample_size, train_sample_size, test_sample_size= ([] for i in range(4))
        unlabeled_count, co_train_self_labeled = ([] for i in range(2))

        all_LR_f1,all_SVM_f1, all_co_LR_f1 = ([] for i in range(3))
        all_per_fold_f1_score_variance = []

        total_selected_group = 0

        # ------- different name group in all name group --------------------#
        for file in listfiles:
            # group name
            temp = file.split("_")
            name = temp[1]+"_"+temp[-1]
            print("For name: ",name)
            # read labeled pid and aid from file
            data = com_func.read_pid_aid(fileDir+file)
            labeled_mask = data["authorID"] != "-1"
            labeled_data = data[labeled_mask]
            print(labeled_data.shape)
            # ---------------- collect all labeled sample -------------------- #
            # ---------------- if use all samples as negative --------------- #
            all_labeled_samples = labeled_data["paperID"].tolist()
            authorCounter = com_func.select_productive_groups(labeled_data, threshold_select_name_group)
            # if only have one class or no class pass the threshold, not applicable
            if(len(authorCounter)==0) or (len(authorCounter)==1):
                print(name," pass")
            else:
                total_selected_group+= 1
                # --------------for each name group---------------- #
                if apply_threshold_to_name_group_samples == True:
                    # ---------- only use sample pass threshold ------- #
                    #-------- only select authors in name group are very productive (more than threshold)---------#
                    labeled_data, author_list, _= com_func.only_select_productive_authors(labeled_data, step_threshold)
                    # ----------------- if use filtered samples as negative  --------- #
                    filtered_all_labeled_samples = labeled_data["paperID"].tolist()
                else:
                    # ----------- use all sample in name group --------- #
                    author_list = com_func.productive_authors_list(labeled_data, step_threshold)
                    print(name, " name group sample size: ",labeled_data.shape)
                # -------------- extract all samples for name group -------------- #
                # for each name group
                # read in labeled data
                labeled_viewone_text = com_func.extract_sorted_embedding(viewone_text_emb, labeled_data["paperID"])
                print(labeled_viewone_text.shape)
                labeled_viewtwo_citation = com_func.extract_sorted_embedding(viewtwo_citation_embedding, labeled_data["paperID"])
                print(labeled_viewtwo_citation.shape)
                print("Labeled: ",len(labeled_viewone_text), " : ", len(labeled_viewtwo_citation))
                # ---------------- shuffle the data ----------------- #
                labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
                # ------------------ alignment ---------------------- #
                labeled_viewone_text = pd.merge(labeled_data, labeled_viewone_text, left_on="paperID", right_on = [0], how = "left")
                labeled_viewtwo_citation = pd.merge(labeled_data, labeled_viewtwo_citation, left_on="paperID", right_on = [0], how = "left")
                labeled_viewtwo_citation.fillna(0, inplace=True)
                unique_labels = labeled_viewone_text.authorID.unique()
                map_dict = {}
                for idx, unique_label in enumerate(unique_labels):
                    map_dict[unique_label] = name+"_"+str(idx)
                true_label = labeled_viewone_text["authorID"].replace(map_dict)
                
                print(labeled_viewone_text.shape)
                print(labeled_viewtwo_citation.shape)
                '''
                only work on binary case, ignored multi-class case
                We need to check whether the name group only contain binary case or not
                '''
                if len(author_list) == 2:
                    name_group.append(name)
                    print(name + " is binary case")
                    viewone_text_final = labeled_viewone_text.drop(["paperID", "authorID", 0], axis=1)
                    viewtwo_citation_final = labeled_viewtwo_citation.drop(["paperID", "authorID", 0], axis=1)
                    '''Only for visualization of co-training process, use PCA to reduce views to 2d'''
                    # 1. apply PCA to different views
                    pca = PCA(n_components=2)
                    pca_dv1 = pca.fit_transform(X=viewone_text_final)
                    pca_dv2 = pca.fit_transform(X=viewtwo_citation_final)
                    
                    # 2. apply co-training
                    co_logistic_clf = Co_training_clf(clf1=LogisticRegression(solver= "liblinear"),p=1,n=1, k=30)
                    LR_f1, SVM_f1, co_lr_f1, name_per_fold_status= k_fold_cv_co_train_binary(pca_dv1, pca_dv2, true_label, 
                                                                                init_labeled_size, co_logistic_clf, 
                                                                                10, name, plot_save_path)
                    total_sample_size.append(len(true_label))
                    train_sample_size.append(name_per_fold_status[0]["train_size"])
                    test_sample_size.append(name_per_fold_status[0]["test_size"])
                    unlabeled_count.append(name_per_fold_status[0]["unlabeled size"])
                    co_train_self_labeled.append(name_per_fold_status[0]["total_self_labeled_train"])
                    all_LR_f1.append(LR_f1)
                    all_SVM_f1.append(SVM_f1)
                    all_co_LR_f1.append(co_lr_f1)
                else:
                    print(name+ " is multi-class case, ignored")
                    
        # write evaluation result to excel
        output = pd.DataFrame({'Name':name_group, "Total sample size":total_sample_size, "train size":train_sample_size,
                               "test size":test_sample_size, "unlabeled sample size": unlabeled_count, 
                               "total self labeled sample":co_train_self_labeled,
                               "LR F1": all_LR_f1, "SVM F1": all_SVM_f1, "co_logisticRegression F1": all_co_LR_f1})
        savePath = "../../result/"+Dataset+"/co_train_sample=140k/"
        filename = "(init_labeled_size="+str(init_labeled_size)+") V1="+select_emb+"_V2="+pp_citation+"_threshold="+str(step_threshold)+".csv"
        com_func.write_csv_df(savePath, filename, output)
        print("Done")
        
        threshold_change_all_co_lr_f1s.append(all_co_LR_f1)
        
    co_lr_diff_embedding_result.append(threshold_change_all_co_lr_f1s)

Load text embedding:  pv_dbow
Total text vector records: 135796
Vector dimension:  100
Load citation embedding:  n2v
Total citation vector records: 124922
Vector dimension:  101
For name:  j_read
(136, 2)
j_read  pass
For name:  f_esteves
(34, 2)
f_esteves  pass
For name:  c_miller
(252, 2)
c_miller  pass
For name:  r_jha
(11, 2)
r_jha  pass
For name:  a_lowe
(102, 2)
a_lowe  pass
For name:  a_vega
(20, 2)
a_vega  pass
For name:  k_smith
(338, 2)
k_smith  pass
For name:  j_gordon
(19, 2)
j_gordon  pass
For name:  s_liao
(104, 2)
s_liao  pass
For name:  j_qian
(17, 2)
j_qian  pass
For name:  s_bernardi
(91, 2)
s_bernardi  pass
For name:  t_hill
(15, 2)
t_hill  pass
For name:  s_schindler
(51, 2)
s_schindler  pass
For name:  j_williams
(625, 2)
j_williams  pass
For name:  s_jacobson
(28, 2)
s_jacobson  pass
For name:  e_andrade
(17, 2)
e_andrade  pass
For name:  t_santos
(45, 2)
t_santos  pass
For name:  k_kim
(1111, 2)
Total sample size before apply threshold:  1111
Counter({'0000-0002-

Total missing sample:  0
(252, 101)
Total missing sample:  6
(252, 101)
Labeled:  252  :  252
(252, 103)
(252, 103)
p_robinson is binary case
[('p_robinson_0', 0.47345132743362833), ('p_robinson_1', 0.5265486725663717)]
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_0', -1, -1, -1, -1, -1, -1, 'p_robinson_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_0', -1, -1, 'p_robinson_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_0', -1, -1, -1, -1, -1, -1, -1, 'p_robinson_0', -1, -1, -1, -1, -1,

P:  79  :  [0.96 0.04]
N:  91  :  [0.0191 0.9809]
P:  153  :  [0.9739 0.0261]
N:  56  :  [0.0123 0.9877]
[([79], [153]), ([91], [56])]
p_robinson_0  (u' idx):  {153, 79}
p_robinson_0  (U idx):  [217, 147]
p_robinson_1  (u' idx):  {56, 91}
p_robinson_1  (U idx):  [93, 159]
[([79], [153]), ([91], [56])]
[217, 147, 93, 159]
Current iter h1 new:  [147, 159]  probs:  [0.9600069224404275, 0.980883421179994]
Current iter h2 new:  [217, 93]  probs:  [0.9738676994628555, 0.9876578492382213]
P:  134  :  [0.9703 0.0297]
N:  109  :  [0.0186 0.9814]
P:  156  :  [0.976 0.024]
N:  56  :  [0.0124 0.9876]
[([134], [156]), ([109], [56])]
p_robinson_0  (u' idx):  {156, 134}
p_robinson_0  (U idx):  [168, 202]
p_robinson_1  (u' idx):  {56, 109}
p_robinson_1  (U idx):  [154, 77]
[([134], [156]), ([109], [56])]
[168, 202, 154, 77]
Current iter h1 new:  [202, 77]  probs:  [0.9703441114966747, 0.9814159956618238]
Current iter h2 new:  [168, 154]  probs:  [0.976017885735231, 0.9875963669845647]
P:  34  :  [0.96

P:  33  :  [0.9754 0.0246]
N:  69  :  [0.0212 0.9788]
P:  74  :  [0.9646 0.0354]
N:  30  :  [0.0212 0.9788]
[([33], [74]), ([69], [30])]
p_robinson_0  (u' idx):  {33, 74}
p_robinson_0  (U idx):  [194, 100]
p_robinson_1  (u' idx):  {69, 30}
p_robinson_1  (U idx):  [86, 33]
[([33], [74]), ([69], [30])]
[194, 100, 86, 33]
Current iter h1 new:  [194, 86]  probs:  [0.9753614680190892, 0.9787531605767716]
Current iter h2 new:  [100, 33]  probs:  [0.9646368155181921, 0.9788256770468118]
P:  34  :  [0.9782 0.0218]
N:  50  :  [0.0235 0.9765]
P:  88  :  [0.9638 0.0362]
N:  85  :  [0.0206 0.9794]
[([34], [88]), ([50], [85])]
p_robinson_0  (u' idx):  {88, 34}
p_robinson_0  (U idx):  [182, 47]
p_robinson_1  (u' idx):  {50, 85}
p_robinson_1  (U idx):  [152, 162]
[([34], [88]), ([50], [85])]
[182, 47, 152, 162]
Current iter h1 new:  [47, 152]  probs:  [0.9781995261992055, 0.9765388361001414]
Current iter h2 new:  [182, 162]  probs:  [0.9637959145395408, 0.9793588899657775]
Total Labeled number:  127 

P:  125  :  [0.9774 0.0226]
N:  161  :  [0.0378 0.9622]
P:  72  :  [0.9805 0.0195]
N:  53  :  [0.024 0.976]
[([125], [72]), ([161], [53])]
p_robinson_0  (u' idx):  {72, 125}
p_robinson_0  (U idx):  [99, 212]
p_robinson_1  (u' idx):  {161, 53}
p_robinson_1  (U idx):  [159, 103]
[([125], [72]), ([161], [53])]
[99, 212, 159, 103]
Current iter h1 new:  [212, 159]  probs:  [0.9773547817449242, 0.9621544322894119]
Current iter h2 new:  [99, 103]  probs:  [0.9805233865840419, 0.9759785930712684]
P:  102  :  [0.9828 0.0172]
N:  63  :  [0.0295 0.9705]
P:  23  :  [0.9823 0.0177]
N:  113  :  [0.023 0.977]
[([102], [23]), ([63], [113])]
p_robinson_0  (u' idx):  {102, 23}
p_robinson_0  (U idx):  [105, 46]
p_robinson_1  (u' idx):  {113, 63}
p_robinson_1  (U idx):  [14, 101]
[([102], [23]), ([63], [113])]
[105, 46, 14, 101]
Current iter h1 new:  [105, 101]  probs:  [0.9827807352815912, 0.9704949569195778]
Current iter h2 new:  [46, 14]  probs:  [0.9823368021107229, 0.9769602812684696]
P:  75  :  [0.9

P:  30  :  [0.9875 0.0125]
N:  68  :  [0.0209 0.9791]
P:  78  :  [0.9873 0.0127]
N:  18  :  [0.0134 0.9866]
[([30], [78]), ([68], [18])]
p_robinson_0  (u' idx):  {78, 30}
p_robinson_0  (U idx):  [96, 147]
p_robinson_1  (u' idx):  {18, 68}
p_robinson_1  (U idx):  [134, 11]
[([30], [78]), ([68], [18])]
[96, 147, 134, 11]
Current iter h1 new:  [147, 11]  probs:  [0.9875104896899415, 0.97910245356872]
Current iter h2 new:  [96, 134]  probs:  [0.9873305772409648, 0.9865957532311087]
P:  51  :  [0.985 0.015]
N:  114  :  [0.0204 0.9796]
P:  104  :  [0.9874 0.0126]
N:  50  :  [0.0139 0.9861]
[([51], [104]), ([114], [50])]
p_robinson_0  (u' idx):  {104, 51}
p_robinson_0  (U idx):  [1, 155]
p_robinson_1  (u' idx):  {114, 50}
p_robinson_1  (U idx):  [108, 80]
[([51], [104]), ([114], [50])]
[1, 155, 108, 80]
Current iter h1 new:  [155, 108]  probs:  [0.9850022629789462, 0.9796376004739115]
Current iter h2 new:  [1, 80]  probs:  [0.9873903219279206, 0.9860682499258941]
P:  80  :  [0.9863 0.0137]
N:

In [None]:
from statistics import mean 

print(threshold_change_all_co_lr_f1s)
print(co_lr_diff_embedding_result)

In [None]:
#         # --------------- plot overall result f1 variance --------------- #
#         all_per_fold_f1_score_variance_plot = pd.DataFrame(all_per_fold_f1_score_variance)
#         ax = sns.boxplot(x="author", y="f1", data=all_per_fold_f1_score_variance_plot)
#         ax = sns.swarmplot(x="author", y="f1", data=all_per_fold_f1_score_variance_plot, color=".25")
#         plt.savefig(plot_save_path+"all_result_variance.png", dpi=300)
#         # plt.show()

In [None]:
# %whos
del viewtwo_citation_embedding
del viewone_text_emb