In [1]:
import numpy as np
import warnings
# create co training classifier
class Co_training_clf(object):
    import copy
    def __init__(self, clf1, clf2=None, p=-1, n=-1, k=30, u = 75):
        
        self.clf1 = clf1
        # assume co_training on one classifier
        if clf2 == None:
            self.clf2 = self.copy.copy(clf1)
        else:
            self.clf2 = clf2
        # take p example from most confidently positive labels to example
        self.p = p
        # take n example from most confidently negative label to example
        self.n = n
        # number of iteration
        self.k = k
        # size of pool of unlabeled samples
        self.u = u
        
    def fit(self, dataView1, dataView2, labels):
        
        labels = np.asarray(labels)
        # get data ratio
        pos_count = sum(1 for y_i in labels if y_i == 1)
        neg_count = sum(1 for y_i in labels if y_i == 0)
        p_n_ratio = pos_count / float(neg_count)
        # if not set number of positive and negative label take from prediction
        if self.p == -1 and self.n == -1:
            if p_n_ratio > 1:
                self.n = 1
                self.p = round(self.n*p_n_ratio)
            else:
                self.p = 1
                self.n = round(self.p/p_n_ratio)
        # if only set number of positive label take from prediction
        if self.p != -1 and self.n == -1:
            self.n = round(self.p/p_n_ratio)
        # if only set number of negative label take from prediction
        if self.p == -1 and self.n != -1:
            self.p = round(self.n*p_n_ratio)
        assert(self.p > 0 and self.n > 0 and self.k > 0 and self.u > 0)
        
        # the samples that are initially labeled
        L = [i for i, label_i in enumerate(labels) if label_i != -1]
        # index of unlabeled samples
        U = [i for i, label_i in enumerate(labels) if label_i == -1]
        print("Initial L: ",L)
        print("Initial U: ",U)
        # random drawing sample from U
        random.shuffle(U)
        U_prime = U[-min(len(U), self.u):]
        # remove the samples in U_prime from U
        U = U[:-len(U_prime)]
        iterCount = 0
        #loop until we have assigned labels to everything in U or we hit our iteration break condition
        while iterCount < self.k and U_prime:
            iterCount +=1
            print("step",iterCount, " L: ",L)
            print("step",iterCount, " U_prime: ",U_prime)
            iter_train_d1 = dataView1.iloc[L]
            iter_train_d2= dataView2.iloc[L]
            iter_train_label = labels[L]
            self.clf1.fit(iter_train_d1, iter_train_label)
            self.clf2.fit(iter_train_d2, iter_train_label)
            
            iter_labeling_d1 = dataView1.iloc[U_prime]
            iter_labeling_d2 = dataView2.iloc[U_prime]
            # show predict probabilities
            proba1 = self.clf1.decision_function(iter_labeling_d1)
            proba2 = self.clf1.decision_function(iter_labeling_d2)
            print(proba1)
            print(proba2)
            y1 = self.clf1.predict(iter_labeling_d1)
            y2 = self.clf2.predict(iter_labeling_d2)
            # add to train if prediction match
            p,n = [], []
            for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
                if len(p) == self.p and len(n) == self.n:
                    break
                if y1_i == y2_i == 1 and len(p) < self.p:
                    p.append(i)
                if y2_i == y1_i == 0 and len(n) < self.n:
                    n.append(i)
            # auto label the samples and remove it from U_prime
            auto_labeled_pos = [U_prime[x] for x in p]
            auto_labeled_neg = [U_prime[x] for x in n]
            auto_labeled_samples = auto_labeled_pos+auto_labeled_neg
            print(auto_labeled_samples)
            print(U_prime)
            labels[auto_labeled_pos] = 1
            labels[auto_labeled_neg] = 0
            # extend the labeled sample
            L.extend(auto_labeled_pos)
            L.extend(auto_labeled_neg)
            # remove the labeled sample from U_prime
            U_prime = [x for x in U_prime if x not in auto_labeled_samples]
            # randomly choice 2p+2n examples from u to replenish u_prime
            replenishItem = U[-(2*self.p+2*self.n):]
            U_prime.extend(replenishItem)
            U = U[:-len(replenishItem)]
        print("Labeled: ", len(L))
        print("Still unlabeled: ", len(U_prime))
        # final train
        newDataSet_d1 = dataView1.iloc[L]
        newDataSet_d2 = dataView2.iloc[L]
        self.clf1.fit(newDataSet_d1, labels[L])
        self.clf2.fit(newDataSet_d2, labels[L])
        print(labels)
    
    def supports_proba(self, clf, x):
        try:
            clf.predict_proba([x])
            return True
        except:
            return False
        
    def predict(self, dataView1, dataView2):
        y1 = self.clf1.predict(dataView1)
        y2 = self.clf2.predict(dataView2)
        # Checks if a given classifier supports the 'predict_proba' method
        # this allow me to build combined classifiers by multiplying the probabilities output of classifier together
        proba_supported = self.supports_proba(self.clf1, dataView1[0]) and self.supports_proba(self.clf2, dataView2[0])
        #fill pred with -1 so we can identify the samples in which sample classifiers failed to agree
        pred = np.asarray([-1] * dataView1.shape[0])
        for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
            # if both agree on label
            if y1_i == y2_i:
                y_pred[i] = y1_i
            # if disagree on label, choice the class have higher probabilities
            elif proba_supported:
                y1_probas = self.clf1.predict_proba([dataView1[i]])[0]
                y2_probas = self.clf2.predict_proba([dataView2[i]])[0]
                sum_y_probas = [proba_y1 + proba_y2 for (proba_y1, proba_y2) in zip(y1_probas, y2_probas)]
                y_pred[i] = sum_y_probas.index(max(sum_y_probas))
            else:
                #the classifiers disagree and don't support probability, so we guess
                warnings.warn("classifiers disagree with label and it don't support probability, result is not accurate")
                y_pred[i] = random.randint(0, 1)
        #check if predict works
        assert not (-1 in y_pred)
        return y_pred
    
    def predict_proba(self, dataView1, dataView2):
        # the predicted probabilities is simply a average of probabilities given from each classifier trained
        proba = np.full((dataView1.shape[0], 2), -1)
        y1_probas = self.clf1.predict_proba(dataView1)
        y2_probas = self.clf2.predict_proba(dataView2)
        
        for i, (y1_i, y2_i) in enumerate(zip(y1_probas, y2_probas)):
            proba[i][0] = (y1_i[0] + y2_i[0]) / 2
            proba[i][1] = (y1_i[1] + y2_i[1]) / 2
        
        return y_proba

In [2]:
# extract different view of data
# view one, doc2vec

# load the vector files
import sys
import io
setting = "d2v"

viewOneFilesDir = "../Data/vectors/"+setting+"/"+setting+".txt"
viewOneVectors = []

with open(viewOneFilesDir, 'r', encoding = 'utf8') as f:
    for line in f:
        read_data = line.split(" ")
        paper_Vectors = read_data
        viewOneVectors.append(paper_Vectors)
f.close()
        
print("Total vector records:",len(viewOneVectors))
print(viewOneVectors[0])


Total vector records: 3149075
['3', '-0.07245799', '-0.15048164', '-0.04320673', '0.01244448', '0.05051953', '-0.05573996', '0.03158288', '-0.04663554', '-0.00442508', '-0.02417533', '-0.03292065', '0.03798062', '0.08195730', '-0.09100581', '-0.04666801', '-0.06315092', '-0.05957321', '0.09766518', '0.01981102', '0.09956500', '-0.02059892', '-0.02321497', '0.10300557', '0.09654117', '0.02085607', '0.15179265', '0.03320639', '0.04716884', '0.04259005', '-0.01022485', '0.07371941', '0.02970656', '0.18967280', '0.07049462', '-0.07849123', '0.10272161', '0.05396378', '0.04138396', '0.08093689', '-0.04713648', '-0.08277001', '0.06004119', '0.15147503', '-0.10719796', '-0.06268646', '0.15823838', '0.10273122', '0.04453533', '-0.00394740', '-0.01239040', '-0.06826647', '-0.02995823', '0.14925463', '0.12254845', '-0.05894163', '0.11628735', '0.03898517', '0.01221054', '-0.00804257', '-0.06178775', '-0.04752085', '-0.04040224', '0.09192738', '0.01171173', '0.02951661', '-0.02156392', '-0.024588

In [3]:
# extract different view of data
# view two, node2vec
setting = "n2v"

viewTwoFilesDir = "../Data/vectors/"+setting+"/data=Meta-alg=N2V-l2=1.0-n2v_p=0.85-iteration=100-no_self_predict=1-idx=0.emb"
viewTwoVectors = []

with open(viewTwoFilesDir, 'r', encoding = 'utf8') as f:
    for line in f:
        read_data = line.split(" ")
        if(len(read_data[0])<=8):
            paper_Vectors = read_data
            viewTwoVectors.append(paper_Vectors)
f.close()
viewTwoVectors = viewTwoVectors[1:]
print("Total vector records:",len(viewTwoVectors))
print(viewTwoVectors[0])

Total vector records: 8602916
['22516865', '0.0109272', '0.126011', '0.186979', '0.0496719', '0.0373553', '0.0458918', '-0.119893', '0.217118', '0.0524591', '0.237477', '0.191269', '-0.0277055', '0.0290957', '-0.0366833', '0.118964', '0.0654807', '-0.0335345', '-0.0900123', '0.128621', '0.0561669', '-0.087823', '-0.0882296', '0.0740289', '0.082104', '0.0269581', '-0.0346502', '0.0153376', '0.104666', '0.0908716', '-0.085694', '-0.111344', '0.0787209', '-0.17003', '-0.103366', '-0.0832094', '-0.210496', '0.153037', '-0.0342884', '0.0698413', '-0.0719641', '-0.0535707', '0.172399', '0.106226', '-0.0593672', '-0.0348048', '-0.0863189', '-0.0801566', '-0.0665761', '0.0673258', '0.0306541', '-0.0896316', '-0.00800971', '-0.174798', '-0.0252528', '0.0098563', '0.0230368', '0.0282268', '-0.0366493', '-0.131323', '0.0318188', '-0.00778704', '-0.0608064', '-0.0860078', '0.215632', '0.0209927', '-0.0953191', '-0.191736', '-0.0741615', '0.151972', '-0.0522046', '-0.11081', '0.134878', '0.090797',

In [4]:
import re
import os

# collect data
fileDir = "../Data/filteredSameNameAuthor/filter=30/"
fileList = os.listdir(fileDir)
fileList.sort()
print(fileList)

['chung-may yang.txt', 'chung-may yang0.txt', 'chung-may yang1.txt', 'david g lloyd.txt', 'david g lloyd0.txt', 'david g lloyd1.txt', 'jeong hwan kim.txt', 'jeong hwan kim0.txt', 'jeong hwan kim1.txt', 'kevin m. ryan.txt', 'kevin m. ryan0.txt', 'kevin m. ryan1.txt', 'lei wang.txt', 'lei wang0.txt', 'lei wang1.txt', 'michael wagner.txt', 'michael wagner0.txt', 'michael wagner1.txt']


In [5]:
# remove author(positive sample) from other(negative sample)
import random
def extractNegativeSample(positiveSample, allSample):
    negativeSample = [x for x in allSample if x not in positiveSample]
    print("Total negative sample size:", len(negativeSample))
    return negativeSample

In [6]:
# collect class vectors
import pandas as pd
import numpy as np

def extractVectors(author_pids, NegativeSample_pid, allPaperVectors):
    # extract class one vectors
    author_features = []
    for pid in author_pids:
         for paper_Vectors in allPaperVectors:
            if(paper_Vectors[0] == pid):
                author_features.append(paper_Vectors)
    print("Positive sample size: ", len(author_features))
    classOne = pd.DataFrame(author_features)
    classOne["label"] = 0
    # extract class two vectors
    other_features = []
    for pid in NegativeSample_pid:
        for paper_Vectors in allPaperVectors:
            if(paper_Vectors[0] == pid):
                other_features.append(paper_Vectors)
    print("Negative sample size: ", len(other_features))
    classTwo = pd.DataFrame(other_features)
    classTwo["label"] = 1
    return classOne, classTwo


In [7]:
# combine data from different class get all data
def combineClassesData(classOne,classTwo):
    combinedData = pd.concat([classOne, classTwo])
    combinedData = combinedData.sample(frac=1).reset_index(drop=True)
    combinedData[0] = pd.to_numeric(combinedData[0], errors='coerce')
    return combinedData

In [8]:
# hard code to read the file one by one
# store the features for classification
author_pids = []
other_pids = []
# author as positive sample, other as all samples
with open(fileDir+"lei wang0.txt", 'r', encoding = 'utf8') as f:
    for line in f:
        author_pids.extend(line.strip().split(" "))

with open(fileDir+"lei wang.txt", 'r', encoding = 'utf8') as f:
    for line in f:
        other_pids.extend(line.strip().split(" "))
        
print(author_pids)
print(other_pids[0])

['27321135', '27078635', '26996321', '26301538', '25464845', '24615769', '25010185', '24449339', '25044429', '23913257', '24019075', '23322720', '24290358', '24139041', '22554080', '21956564', '22662873', '21805546', '21732687', '21681861', '21751313', '21926996', '21545173', '20872394', '20385807', '19318213', '19764747', '19668857', '19165727', '18426210', '18556466', '17603477', '17406421', '16689635', '16956756', '15840835', '15599909', '15556995', '15378068', '12580587', '12779328', '12518054', '14576413', '12537491', '12515477', '11866580', '12154230', '12148987', '12244330', '12120285', '12203503', '11564556', '11313494']
27321135


In [None]:
NegativeSample_pid = extractNegativeSample(author_pids, other_pids)

Total negative sample size: 64


In [None]:
View1class1, View1class2 = extractVectors(author_pids, NegativeSample_pid, viewOneVectors)
View2class1, View2class2 = extractVectors(author_pids, NegativeSample_pid, viewTwoVectors)

Positive sample size:  53
Negative sample size:  64


In [35]:
dataViewone = combineClassesData(View1class1, View1class2)
dataViewtwo = combineClassesData(View2class1, View2class2)

In [36]:
# process 1, synchronize different view based on pid
# some of the record doesn't have citation links, therefore we will have to remove those papers from train and test set
noCitationPids = set(dataViewone[0])-set(dataViewtwo[0])
print(noCitationPids)
for pid in noCitationPids:
    dataViewone = dataViewone[dataViewone[0] != pid]
print(dataViewone.shape)
print(dataViewtwo.shape)

# method: sort every view by pid
dataViewone = dataViewone.sort_values(dataViewone.columns[0],ascending = False).reset_index(drop=True)
dataViewtwo = dataViewtwo.sort_values(dataViewtwo.columns[0],ascending = False).reset_index(drop=True)
print(dataViewone[:3])
print(dataViewtwo[:3])

{27078635, 22751827, 21956564, 18841017, 14576413}
(112, 102)
(112, 102)
          0            1            2           3           4           5  \
0  27321135  -0.29048449  -0.59911001  0.04786110  0.27027032  0.16051295   
1  26996321   0.19381550  -0.48931718  0.06862706  0.14925475  0.17359938   
2  26301538   0.01054261  -0.88216472  0.57695472  0.57391030  0.23281731   

             6           7            8           9  ...           92  \
0  -0.07270139  0.07110120  -0.50903106  0.63912416  ...   0.41171157   
1  -0.00809215  0.06913777  -0.35344198  0.46796930  ...   0.03948900   
2   0.43674836  0.36799490  -0.56424075  0.50638115  ...   0.32038480   

            93          94          95           96           97           98  \
0  -0.11941113  0.30015391  0.33427653  -0.14802496   0.12012596  -0.00595635   
1   0.07864559  0.05917305  0.30804577  -0.03120008   0.01678459  -0.01389165   
2   0.38237169  0.24075703  0.11038157  -0.34420219  -0.00368872  -0.19147828   



In [37]:
# select 50% of sample as test data
print(dataViewone)
testPID = dataViewone[0].sample(frac=0.5)
testIndex = testPID.index
testTrueLable = dataViewone["label"].copy().tolist()
# take the paper id out
paperID = dataViewone[0]
# take data view and label out
label = dataViewone["label"]
dataViewone = dataViewone.drop([0,'label'], axis=1)
dataViewtwo = dataViewtwo.drop([0,'label'], axis=1)
for index in testIndex:
    label[index]=-1
print(label)
print(testIndex)
print(testTrueLable)

            0            1            2            3            4  \
0    27321135  -0.29048449  -0.59911001   0.04786110   0.27027032   
1    26996321   0.19381550  -0.48931718   0.06862706   0.14925475   
2    26301538   0.01054261  -0.88216472   0.57695472   0.57391030   
3    25464845   0.33646849  -0.79967630  -0.40360859   0.52798015   
4    25044429  -0.02384915  -0.08108799   0.27015415  -0.13188151   
5    25010185   0.45885149  -0.77878106   0.49409395   0.63217914   
6    24615769   0.53572321  -0.78155988   0.43898183   0.68515188   
7    24449339   0.10119539  -0.92035896   0.26243329   0.46221569   
8    24290358   0.16397019  -0.02875408   0.05833526   0.44064471   
9    24139041   0.25511259  -0.60395527   0.23804758   0.81938040   
10   24019075   0.00490047  -0.38244918   0.31173226   0.38422415   
11   23913257   0.08501743  -0.53054255   0.06510703   0.21164079   
12   23322720  -0.00948382  -0.04981139  -0.00158726   0.06656136   
13   23212343  -0.09914347  -0.823

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


0     -1
1     -1
2      0
3     -1
4     -1
5      0
6     -1
7      0
8     -1
9      0
10     0
11    -1
12    -1
13     1
14    -1
15    -1
16    -1
17     1
18     1
19     1
20     0
21    -1
22    -1
23    -1
24     1
25    -1
26     1
27     1
28     0
29    -1
      ..
82     1
83    -1
84     0
85     1
86    -1
87    -1
88     1
89     1
90    -1
91     1
92     1
93     0
94     0
95    -1
96     0
97     0
98    -1
99    -1
100    0
101   -1
102   -1
103   -1
104    0
105   -1
106    0
107   -1
108   -1
109    0
110    1
111    1
Name: label, Length: 112, dtype: int64
Int64Index([ 11,  50,  51, 101,  98,  14,  69, 108,  35,  30,  52,  47,  99,
             83,  75,  86,  57,  48,  16,   1,  36,   6,  54,  81,  37,  79,
             45, 103,  80,  95, 105,   4,  46,   3,  71,  15,  29,  87,  44,
             38,  61, 107,  22,  90,  56,  49,  55,  25,  12,  40,  32,   0,
              8,  21,  23, 102],
           dtype='int64')
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,

In [None]:
# add unlabeled samples to dataset


In [40]:
from sklearn import linear_model

clf = Co_training_clf(clf1=linear_model.LogisticRegression())
clf.fit(dataViewone,dataViewtwo,label)

Initial L:  [2, 5, 7, 9, 10, 13, 17, 18, 19, 20, 24, 26, 27, 28, 31, 33, 34, 39, 41, 42, 43, 53, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 70, 72, 73, 74, 76, 77, 78, 82, 84, 85, 88, 89, 91, 92, 93, 94, 96, 97, 100, 104, 106, 109, 110, 111]
Initial U:  [0, 1, 3, 4, 6, 8, 11, 12, 14, 15, 16, 21, 22, 23, 25, 29, 30, 32, 35, 36, 37, 38, 40, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 61, 69, 71, 75, 79, 80, 81, 83, 86, 87, 90, 95, 98, 99, 101, 102, 103, 105, 107, 108]
step 1  L:  [2, 5, 7, 9, 10, 13, 17, 18, 19, 20, 24, 26, 27, 28, 31, 33, 34, 39, 41, 42, 43, 53, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 70, 72, 73, 74, 76, 77, 78, 82, 84, 85, 88, 89, 91, 92, 93, 94, 96, 97, 100, 104, 106, 109, 110, 111]
step 1  U_prime:  [4, 11, 69, 12, 87, 37, 75, 38, 103, 108, 15, 22, 49, 47, 71, 8, 80, 90, 21, 50, 48, 101, 52, 44, 105, 51, 1, 55, 99, 81, 79, 36, 102, 98, 56, 30, 61, 14, 23, 16, 95, 29, 107, 45, 32, 46, 83, 0, 3, 35, 57, 40, 86, 25, 6, 54]
[-4.06120222 -1.7793096   1.64561382 -1.04

[-2.10619863 -4.04393615 -1.52624299 -0.91243457 -2.1137716   4.44633667
  3.70701959  2.37425514 -2.69473981 -1.15807167  1.82715525  0.55670386
  3.74832792  3.84346415 -2.26504118  4.20841785  4.83181059  5.83010732
  7.8628894   3.96443264 -2.55328832  6.84120271 -2.33268389  1.25900146
 -2.2012513   0.80389281  4.51768025  1.39109559 -2.57300007  4.99346808
 -3.58742676 -1.9007284  -0.74316781  1.72733766 -3.27276757 -1.95191327
 -2.58325379  3.13082539  4.20066897 -1.16687975  4.29357447  1.0284858
 -3.94671861 -2.39676295]
[ 0.74053357  0.69803539  0.36791099 -0.00568895  0.72375018  0.11478986
  0.12809225  0.05688906  0.15425631  0.54641133 -0.13013048 -0.09713516
 -0.05563815  0.13968263  0.19088452  0.32556935  0.06607857  0.18988935
  0.03254066 -0.13546229  0.95269983 -0.10262747  0.23123855  0.16964893
  0.08353244  0.06937725  0.27768416 -0.18716354  0.25031886 -0.51641857
  0.55513486  0.72852179  0.05393191  0.16541565  1.01425496 -0.02196211
  0.92439177  0.08624752  

[ 5.23688127  6.13771156  8.57814239  4.75874539 -2.86068344  7.42373761
 -2.68893564  1.56001361 -2.27244587  0.92481407  4.82582     1.54704495
 -2.72243496  5.58731511 -4.14164912 -2.1917324  -1.10195712  1.96817883
 -3.50461926 -2.18931675 -3.01492039  3.37401367  4.18344548 -1.4175734
  4.50472932  1.26977733 -4.29815039 -2.58933937]
[-0.07600794  0.00902487 -0.11459166 -0.31871625  1.32935445 -0.34092577
  0.49755892 -0.00518234  0.16301798 -0.07999339  0.09048607 -0.42616291
  0.37218127 -0.6663108   0.78608006  0.93886385  0.16495481 -0.00878884
  1.52743921 -0.05913173  1.12188425 -0.17354242 -0.1701934  -0.09337447
 -0.1049956  -0.05542838  0.48770801  1.25151466]
[99, 102]
[99, 81, 79, 36, 102, 98, 56, 30, 61, 14, 23, 16, 95, 29, 107, 45, 32, 46, 83, 0, 3, 35, 57, 40, 86, 25, 6, 54]
step 16  L:  [2, 5, 7, 9, 10, 13, 17, 18, 19, 20, 24, 26, 27, 28, 31, 33, 34, 39, 41, 42, 43, 53, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 70, 72, 73, 74, 76, 77, 78, 82, 84, 85, 88, 89, 91, 92, 9

[ 1.78271161  5.99712133  1.98769868 -2.28952001 -3.03141012  3.55173583
  4.40616262 -1.55843257  4.78337984  1.14157277 -4.53539995 -2.68601591]
[-0.44928392 -0.66897217  0.10728782  0.10239157  1.63943421 -0.06026603
 -0.05197437  0.03033122  0.08530965 -0.02253503  0.7178519   1.6487972 ]
[16, 0]
[16, 29, 46, 0, 3, 35, 57, 40, 86, 25, 6, 54]
step 24  L:  [2, 5, 7, 9, 10, 13, 17, 18, 19, 20, 24, 26, 27, 28, 31, 33, 34, 39, 41, 42, 43, 53, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 70, 72, 73, 74, 76, 77, 78, 82, 84, 85, 88, 89, 91, 92, 93, 94, 96, 97, 100, 104, 106, 109, 110, 111, 69, 4, 38, 11, 108, 12, 15, 87, 47, 37, 71, 75, 90, 103, 21, 22, 50, 49, 52, 8, 44, 80, 105, 48, 51, 101, 55, 1, 99, 102, 81, 56, 79, 61, 36, 95, 98, 107, 30, 45, 14, 32, 23, 83, 16, 0]
step 24  U_prime:  [29, 46, 3, 35, 57, 40, 86, 25, 6, 54]
[ 6.00928227  2.04770368 -3.07249045  3.50012102  4.35219004 -1.59156442
  4.93176552  1.20315156 -4.54442859 -2.72407878]
[-0.64009165  0.24949737  1.79833972  0.05808

In [21]:
%whos

Variable                Type               Data/Info
----------------------------------------------------
Co_training_clf         type               <class '__main__.Co_training_clf'>
NegativeSample_pid      list               n=64
View1class1             DataFrame                     0            1<...>\n[53 rows x 102 columns]
View1class2             DataFrame                     0            1<...>\n[64 rows x 102 columns]
View2class1             DataFrame                     0           1 <...>\n[50 rows x 102 columns]
View2class2             DataFrame                     0          1  <...>\n[62 rows x 102 columns]
author_pids             list               n=53
clf                     Co_training_clf    <__main__.Co_training_clf<...>object at 0x7f80586dbf28>
combineClassesData      function           <function combineClassesData at 0x7f85c8802950>
dataViewone             DataFrame                       1           <...>n[112 rows x 100 columns]
dataViewtwo             DataFrame  

In [16]:
del viewOneVectors
del viewTwoVectors