In [9]:
import copy
import random
import warnings

import numpy as np
# create co training classifier
class Co_training_clf(object):
    """Co-training wrapper that trains two classifiers on two views of the same samples.

    Labels use 1 for positive, 0 for negative and -1 for unlabeled samples.
    During ``fit`` both classifiers are trained on the labeled samples, then
    unlabeled samples from a small pool on which both predictions agree are
    given that label and added to the training set. This repeats for at most
    ``k`` rounds.

    Parameters
    ----------
    clf1 : object
        Classifier for the first view. Must expose ``fit`` and ``predict``;
        ``predict_proba`` is used when available.
    clf2 : object, optional
        Classifier for the second view. When omitted, an independent copy of
        ``clf1`` is used.
    p : int
        Maximum number of agreed positive samples to self-label per round.
        ``-1`` means "derive from the class ratio of the labeled data".
    n : int
        Maximum number of agreed negative samples to self-label per round.
        ``-1`` means "derive from the class ratio of the labeled data".
    k : int
        Maximum number of co-training rounds.
    u : int
        Size of the pool of unlabeled samples examined in each round.
    """

    def __init__(self, clf1, clf2=None, p=-1, n=-1, k=30, u = 75):
        self.clf1 = clf1
        # With a single classifier given, co-train two independent copies of it.
        # A deep copy keeps nested state from being shared between both views.
        if clf2 is None:
            self.clf2 = copy.deepcopy(clf1)
        else:
            self.clf2 = clf2
        self.p = p
        self.n = n
        self.k = k
        self.u = u

    @staticmethod
    def _as_row_indexable(view):
        """Return ``view`` in a form where ``view[list_of_ints]`` selects rows.

        pandas objects select columns with ``[...]`` and plain lists do not
        accept index lists, so both are converted to numpy arrays. Anything
        else is passed through unchanged.
        """
        if hasattr(view, "iloc") or isinstance(view, (list, tuple)):
            return np.asarray(view)
        return view

    def fit(self, dataView1, dataView2, labels):
        """Run co-training and leave both classifiers fitted.

        Parameters
        ----------
        dataView1, dataView2 : array-like, one row per sample
            The two feature views; row ``i`` of both must describe the same sample.
        labels : array-like of int
            1 (positive), 0 (negative) or -1 (unlabeled) per sample. The
            caller's object is not modified; self-assigned labels are recorded
            in ``self_labeled_pos`` / ``self_labeled_neg`` (lists of row indices).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If there is no positive or no negative labeled sample, or if any of
            ``p``, ``n``, ``k``, ``u`` is not positive after resolving defaults.

        Notes
        -----
        When ``p`` and/or ``n`` were left at ``-1`` they are resolved here from
        the class ratio and stored back on the instance.
        """
        self.self_labeled_pos = []
        self.self_labeled_neg = []
        dataView1 = self._as_row_indexable(dataView1)
        dataView2 = self._as_row_indexable(dataView2)
        # Work on a private copy: labels are overwritten during self-labeling.
        labels = np.array(labels, copy=True)

        # Ratio of positive to negative samples among the labeled data.
        pos_count = int(np.sum(labels == 1))
        neg_count = int(np.sum(labels == 0))
        if pos_count == 0 or neg_count == 0:
            raise ValueError(
                "fit needs at least one positive (1) and one negative (0) labeled sample"
            )
        p_n_ratio = pos_count / float(neg_count)

        if self.p == -1 and self.n == -1:
            # Neither set: the minority class gets 1, the other follows the ratio.
            if p_n_ratio > 1:
                self.n = 1
                self.p = round(self.n * p_n_ratio)
            else:
                self.p = 1
                self.n = round(self.p / p_n_ratio)
        elif self.n == -1:
            # Only p set; never let rounding push the derived value down to 0.
            self.n = max(1, round(self.p / p_n_ratio))
        elif self.p == -1:
            # Only n set.
            self.p = max(1, round(self.n * p_n_ratio))
        if not (self.p > 0 and self.n > 0 and self.k > 0 and self.u > 0):
            raise ValueError("p, n, k and u must all be positive")

        # Indices of the initially labeled samples.
        L = [i for i, label_i in enumerate(labels) if label_i != -1]
        # Indices of the unlabeled samples, in random order.
        U = [i for i, label_i in enumerate(labels) if label_i == -1]
        random.shuffle(U)
        # Move up to u samples from the end of U into the working pool.
        pool_size = min(len(U), self.u)
        U_prime = U[len(U) - pool_size:]
        U = U[:len(U) - pool_size]

        iterCount = 0
        # Loop until the pool is exhausted or the iteration limit is reached.
        # The pool (not U) is the stop condition, so co-training still runs
        # when every unlabeled sample already fits into the pool.
        while iterCount != self.k and U_prime:
            iterCount += 1
            self.clf1.fit(dataView1[L], labels[L])
            self.clf2.fit(dataView2[L], labels[L])

            y1 = self.clf1.predict(dataView1[U_prime])
            y2 = self.clf2.predict(dataView2[U_prime])
            # Collect pool positions where both classifiers agree, in pool
            # order, up to p positives and n negatives.
            p, n = [], []
            for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
                if len(p) == self.p and len(n) == self.n:
                    break
                if y1_i == y2_i == 1 and len(p) < self.p:
                    p.append(i)
                elif y1_i == y2_i == 0 and len(n) < self.n:
                    n.append(i)

            new_pos = [U_prime[x] for x in p]
            new_neg = [U_prime[x] for x in n]
            labels[new_pos] = 1
            labels[new_neg] = 0
            L.extend(new_pos)
            L.extend(new_neg)
            # Keep a record of what was self-labeled for later inspection.
            self.self_labeled_pos.extend(new_pos)
            self.self_labeled_neg.extend(new_neg)

            # Drop the newly labeled samples from the pool so they cannot be
            # picked (and appended to L) again in a later round.
            taken = set(p) | set(n)
            U_prime = [idx for pos, idx in enumerate(U_prime) if pos not in taken]

            # Replenish the pool with up to 2p+2n samples from U.
            refill = min(len(U), 2 * self.p + 2 * self.n)
            replenishItem = U[len(U) - refill:]
            U_prime.extend(replenishItem)
            U = U[:len(U) - refill]

            # Nothing labeled and nothing new to look at: further rounds would
            # refit on identical data and reach the same predictions.
            if not taken and not replenishItem:
                break

        # Final fit on the initially labeled plus all self-labeled samples.
        self.clf1.fit(dataView1[L], labels[L])
        self.clf2.fit(dataView2[L], labels[L])
        return self

    def supports_proba(self, clf, x):
        """Return True when ``clf.predict_proba([x])`` runs without raising."""
        try:
            clf.predict_proba([x])
            return True
        except Exception:
            return False

    def predict(self, dataView1, dataView2):
        """Predict one label per sample from both views.

        Where both classifiers agree, that label is returned. Where they
        disagree and both support ``predict_proba``, the class index with the
        highest summed probability is returned (this equals the label when the
        classes are 0 and 1). Otherwise a warning is issued and the label is
        chosen at random from {0, 1}.
        """
        dataView1 = self._as_row_indexable(dataView1)
        dataView2 = self._as_row_indexable(dataView2)
        n_samples = dataView1.shape[0]
        if n_samples == 0:
            return np.empty(0, dtype=int)

        y1 = self.clf1.predict(dataView1)
        y2 = self.clf2.predict(dataView2)
        # Probabilities can only be combined when both classifiers provide them.
        proba_supported = (self.supports_proba(self.clf1, dataView1[0])
                           and self.supports_proba(self.clf2, dataView2[0]))
        # Start from -1 so any sample left undecided is detectable below.
        y_pred = np.full(n_samples, -1)
        for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
            if y1_i == y2_i:
                y_pred[i] = y1_i
            elif proba_supported:
                y1_probas = self.clf1.predict_proba([dataView1[i]])[0]
                y2_probas = self.clf2.predict_proba([dataView2[i]])[0]
                sum_y_probas = np.asarray(y1_probas, dtype=float) + np.asarray(y2_probas, dtype=float)
                # argmax returns the first maximum, like list.index(max(...)).
                y_pred[i] = int(np.argmax(sum_y_probas))
            else:
                # The classifiers disagree and cannot be weighed: guess.
                warnings.warn("classifiers disagree with label and it don't support probability, result is not accurate")
                y_pred[i] = random.randint(0, 1)
        # Every sample must have received a label.
        assert not (-1 in y_pred)
        return y_pred

    def predict_proba(self, dataView1, dataView2):
        """Return the element-wise mean of both classifiers' ``predict_proba`` outputs.

        The result is a float array with one row per sample; both classifiers
        must return probability arrays of the same shape.
        """
        y1_probas = np.asarray(self.clf1.predict_proba(dataView1), dtype=float)
        y2_probas = np.asarray(self.clf2.predict_proba(dataView2), dtype=float)
        return (y1_probas + y2_probas) / 2

In [1]:
# extract different view of data
# view one, doc2vec

# load the vector files
import sys
import io
setting = "d2v"

viewOneFilesDir = "../Data/vectors/"+setting+"/"+setting+".txt"
viewOneVectors = []

# Each line of the file is one space-separated record; later cells compare
# the first token of a record against paper ids.
# The `with` block closes the file, so no explicit close() is needed.
with open(viewOneFilesDir, 'r', encoding = 'utf8') as f:
    for line in f:
        # Drop the line terminator so it does not stay glued to the last component.
        viewOneVectors.append(line.rstrip("\n").split(" "))

print("Total vector records:",len(viewOneVectors))
print(viewOneVectors[0])


Total vector records: 3149075
['3', '-0.07245799', '-0.15048164', '-0.04320673', '0.01244448', '0.05051953', '-0.05573996', '0.03158288', '-0.04663554', '-0.00442508', '-0.02417533', '-0.03292065', '0.03798062', '0.08195730', '-0.09100581', '-0.04666801', '-0.06315092', '-0.05957321', '0.09766518', '0.01981102', '0.09956500', '-0.02059892', '-0.02321497', '0.10300557', '0.09654117', '0.02085607', '0.15179265', '0.03320639', '0.04716884', '0.04259005', '-0.01022485', '0.07371941', '0.02970656', '0.18967280', '0.07049462', '-0.07849123', '0.10272161', '0.05396378', '0.04138396', '0.08093689', '-0.04713648', '-0.08277001', '0.06004119', '0.15147503', '-0.10719796', '-0.06268646', '0.15823838', '0.10273122', '0.04453533', '-0.00394740', '-0.01239040', '-0.06826647', '-0.02995823', '0.14925463', '0.12254845', '-0.05894163', '0.11628735', '0.03898517', '0.01221054', '-0.00804257', '-0.06178775', '-0.04752085', '-0.04040224', '0.09192738', '0.01171173', '0.02951661', '-0.02156392', '-0.024588

In [2]:
# extract different view of data
# view two, node2vec
setting = "n2v"

viewTwoFilesDir = "../Data/vectors/"+setting+"/data=Meta-alg=N2V-l2=1.0-n2v_p=0.85-iteration=100-no_self_predict=1-idx=0.emb"
viewTwoVectors = []

# Each line of the file is one space-separated record.
# The `with` block closes the file, so no explicit close() is needed.
with open(viewTwoFilesDir, 'r', encoding = 'utf8') as f:
    for line in f:
        # Drop the line terminator so it does not stay glued to the last component.
        viewTwoVectors.append(line.rstrip("\n").split(" "))

# The first line is a two-field header, not a vector: split it off.
recordcount, dim = viewTwoVectors[0]
viewTwoVectors = viewTwoVectors[1:]
print("Total vector records:",len(viewTwoVectors))
print(viewTwoVectors[0])

Total vector records: 12140452
['22516865', '0.0109272', '0.126011', '0.186979', '0.0496719', '0.0373553', '0.0458918', '-0.119893', '0.217118', '0.0524591', '0.237477', '0.191269', '-0.0277055', '0.0290957', '-0.0366833', '0.118964', '0.0654807', '-0.0335345', '-0.0900123', '0.128621', '0.0561669', '-0.087823', '-0.0882296', '0.0740289', '0.082104', '0.0269581', '-0.0346502', '0.0153376', '0.104666', '0.0908716', '-0.085694', '-0.111344', '0.0787209', '-0.17003', '-0.103366', '-0.0832094', '-0.210496', '0.153037', '-0.0342884', '0.0698413', '-0.0719641', '-0.0535707', '0.172399', '0.106226', '-0.0593672', '-0.0348048', '-0.0863189', '-0.0801566', '-0.0665761', '0.0673258', '0.0306541', '-0.0896316', '-0.00800971', '-0.174798', '-0.0252528', '0.0098563', '0.0230368', '0.0282268', '-0.0366493', '-0.131323', '0.0318188', '-0.00778704', '-0.0608064', '-0.0860078', '0.215632', '0.0209927', '-0.0953191', '-0.191736', '-0.0741615', '0.151972', '-0.0522046', '-0.11081', '0.134878', '0.090797'

In [4]:
import re
import os

# collect data
# Directory holding the per-author paper id files; list its entries in sorted order.
fileDir = "../Data/filteredSameNameAuthor/filter=30/"
fileList = sorted(os.listdir(fileDir))
print(fileList)

['chung-may yang.txt', 'chung-may yang0.txt', 'chung-may yang1.txt', 'david g lloyd.txt', 'david g lloyd0.txt', 'david g lloyd1.txt', 'jeong hwan kim.txt', 'jeong hwan kim0.txt', 'jeong hwan kim1.txt', 'kevin m. ryan.txt', 'kevin m. ryan0.txt', 'kevin m. ryan1.txt', 'lei wang.txt', 'lei wang0.txt', 'lei wang1.txt', 'michael wagner.txt', 'michael wagner0.txt', 'michael wagner1.txt']


In [5]:
# remove author(positive sample) from other(negative sample)
import random
def extractNegativeSample(positiveSample, allSample):
    """Return the items of ``allSample`` that do not occur in ``positiveSample``.

    Order and duplicates of ``allSample`` are preserved. The size of the
    result is printed.

    Parameters
    ----------
    positiveSample : iterable
        Items to exclude.
    allSample : iterable
        Items to filter.

    Returns
    -------
    list
    """
    # A set gives constant-time membership tests; fall back to a list when
    # the items are not hashable so the comparison still works.
    try:
        positive_lookup = set(positiveSample)
    except TypeError:
        positive_lookup = list(positiveSample)
    negativeSample = [x for x in allSample if x not in positive_lookup]
    print("Total negative sample size:", len(negativeSample))
    return negativeSample

In [8]:
# collect class vectors
import pandas as pd
import numpy as np

def extractVectors(author_pids, NegativeSample_pid, allPaperVectors):
    """Collect the vector records of two groups of paper ids and label them.

    A record belongs to a paper id when its first element equals that id.
    Records are returned in the order of the given ids; an id that occurs in
    several records contributes all of them, in their original order.

    Parameters
    ----------
    author_pids : iterable of hashable
        Paper ids of the first group; the resulting frame gets ``label`` 0.
    NegativeSample_pid : iterable of hashable
        Paper ids of the second group; the resulting frame gets ``label`` 1.
    allPaperVectors : iterable of sequences
        Records whose first element is a paper id.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``classOne`` (label 0) and ``classTwo`` (label 1).
    """
    # NOTE(review): the progress messages call the first group "positive" but it
    # is labeled 0, while Co_training_clf treats 1 as positive -- confirm intent.
    author_pids = list(author_pids)
    NegativeSample_pid = list(NegativeSample_pid)

    # Index the wanted records in a single pass instead of rescanning the
    # whole collection once per paper id.
    wanted = set(author_pids) | set(NegativeSample_pid)
    rows_by_pid = {}
    for paper_Vectors in allPaperVectors:
        pid = paper_Vectors[0]
        if pid in wanted:
            rows_by_pid.setdefault(pid, []).append(paper_Vectors)

    # extract class one vectors
    author_features = [row for pid in author_pids for row in rows_by_pid.get(pid, [])]
    print("Positive sample size: ", len(author_features))
    classOne = pd.DataFrame(author_features)
    classOne["label"] = 0
    # extract class two vectors
    other_features = [row for pid in NegativeSample_pid for row in rows_by_pid.get(pid, [])]
    print("Negative sample size: ", len(other_features))
    classTwo = pd.DataFrame(other_features)
    classTwo["label"] = 1
    return classOne, classTwo


In [None]:
# combine data from different class get all data
def combineClassesData(classOne,classTwo, random_state=None):
    """Stack two labeled class frames, shuffle the rows and split off ids and labels.

    Parameters
    ----------
    classOne, classTwo : pandas.DataFrame
        Frames with the paper id in column ``0`` and a ``'label'`` column.
    random_state : int or numpy random state, optional
        Passed to ``DataFrame.sample`` so the shuffle can be reproduced. The
        default ``None`` keeps the shuffle unseeded.

    Returns
    -------
    data : pandas.DataFrame
        The shuffled rows without the id and label columns.
    label : pandas.Series
        The ``'label'`` column in the same row order.
    paperID : pandas.Series
        Column ``0`` (paper id) in the same row order.
    """
    combinedData = pd.concat([classOne, classTwo])
    # sample(frac=1) returns all rows in random order; the index is rebuilt
    # so the rows are numbered 0..n-1 in their shuffled order.
    combinedData = combinedData.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # take the paper id out
    paperID = combinedData[0]
    # split data and label
    data = combinedData.drop([0,'label'], axis=1)
    label = combinedData['label']
    print("Total sample size and shape: ",data.shape)
    return data, label, paperID

In [6]:
# hard code to read the file one by one
# store the features for classification
author_pids, other_pids = [], []

# The "...0.txt" file supplies the target author's paper ids (positive samples).
with open(fileDir+"michael wagner0.txt", 'r', encoding = 'utf8') as f:
    author_pids.extend(pid for row in f for pid in row.strip().split(" "))

# The base file supplies the ids of all samples for that name.
with open(fileDir+"michael wagner.txt", 'r', encoding = 'utf8') as f:
    other_pids.extend(pid for row in f for pid in row.strip().split(" "))

print(author_pids)
print(other_pids[0])

['26921674', '27054571', '24612629', '27094404', '27104802', '26976043', '27092400', '26612384', '26601984', '26372590', '25435336', '25808791', '27239504', '26261317', '25471190', '25158072', '26402080', '26332608', '25822830', '26658776', '26619916', '26170016', '26233432', '26402078', '26402085', '25716354', '26484902', '26184561', '26308457', '26200043', '23928294', '24798886', '23521526', '23375567', '23883793', '25685139', '24325976', '24529980', '25062598', '24600411', '23867795', '22913370', '25042114', '25019225', '24571191', '24338591', '24990933', '23759289', '24729411', '24743864', '24089318', '24441882', '22475622', '23137390', '23992793', '22704223', '23990902', '24129642', '23474092', '26151896', '23652383', '22889924', '22889921', '23288874', '21914645', '22339903', '23399382', '24138519', '24204291', '23538093', '23736996', '23219936', '23788523', '23604333', '23680103', '23297009', '23585882', '23564357', '21901269', '22704853', '22437321', '22375927', '22698761', '22

In [7]:
# Negative samples: every id in other_pids that is not one of the author's ids.
NegativeSample_pid = extractNegativeSample(author_pids, other_pids)

Total negative sample size: 141
Choicen negative sample  141
['26222031', '25148481', '25576608', '25180967', '25303712', '25425419', '25093819', '25170152', '24401855', '24975266', '23263968', '23487774', '23335755', '22582069', '21833037', '23105050', '22701656', '22952791', '22079351', '22572638', '23057602', '23135396', '22141924', '21890669', '22066885', '21514465', '21169452', '21525411', '21709249', '21500343', '21858215', '21930919', '21546306', '21441524', '20535221', '20624973', '20033067', '21966903', '20598889', '19966029', '20675479', '21136591', '20545842', '20600954', '20023027', '20040079', '19120466', '19571892', '18826437', '19514853', '18250313', '18312573', '18177367', '18641160', '18459973', '18606736', '18461076', '18552182', '18647333', '17504498', '17227418', '17367515', '17408790', '17333172', '17635536', '17554047', '17345135', '18043620', '17099228', '16517650', '16572761', '16452171', '16598256', '16898133', '16377170', '16872410', '16517657', '16478447', '1

In [None]:
# Build the two labeled class frames once per view, using the same id lists for both views.
View1class1, View1class2 = extractVectors(author_pids, NegativeSample_pid, viewOneVectors)
View2class1, View2class2 = extractVectors(author_pids, NegativeSample_pid, viewTwoVectors)

In [None]:
# combineClassesData returns a (data, label, paperID) tuple, so each name below holds that whole tuple.
# NOTE(review): each call shuffles its rows independently, so row i of view one and
# row i of view two need not be the same paper -- align by paperID before co-training.
dataViewone = combineClassesData(View1class1, View1class2)
dataViewtwo = combineClassesData(View2class1, View2class2)

In [None]:
# select 50% of sample as test data
