In [1]:
#!/usr/bin/python3.7

In [2]:
import time
import sklearn
import numpy as np
import pandas as pd

import sys
sys.path.append("..")
from baggingPU import BaggingClassifierPU

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random

In [3]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    columnwidth = max([len(x) for x in labels]) + 4
    empty_cell = " " * columnwidth
    print("    " + empty_cell, end=' ')
    for label in labels:
        print("%{0}s".format(columnwidth) % 'pred_' + label, end=" ")
    print()

    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % 'true_' + label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            if cell:
                print(cell, end=" ")
        print()

# import data

In [4]:
df_raw = pd.read_csv('../data/w-independence.csv')

#df_raw = pd.read_csv('../data/1place-independence.csv')

#df_raw = pd.read_csv('../data/w-related.csv')


df_raw['label'] = df_raw['label'].astype("int")
print(df_raw.label.value_counts())
print('Has null values', df_raw.isnull().values.any())

0    10000
1    10000
Name: label, dtype: int64
Has null values False


In [5]:
df_raw.head(10)

Unnamed: 0,p1,p2,p3,label
0,0,0,1,1
1,1,0,1,1
2,0,1,1,1
3,2,0,1,1
4,1,1,1,1
5,0,1,0,1
6,0,2,1,1
7,3,0,1,1
8,2,1,1,1
9,1,1,0,1


In [6]:
print(df_raw.iloc[:10000,:])
df_max = np.amax(df_raw.iloc[:10000,:])
print(df_max)

      p1  p2  p3  label
0      0   0   1      1
1      1   0   1      1
2      0   1   1      1
3      2   0   1      1
4      1   1   1      1
...   ..  ..  ..    ...
9995  52  47   0      1
9996  52  48   1      1
9997  51  48   0      1
9998  51  49   1      1
9999  50  49   0      1

[10000 rows x 4 columns]
p1       100
p2        99
p3         1
label      1
dtype: int64


In [7]:
print(df_raw.iloc[10000:,:])
print(np.amax(df_raw.iloc[10000:,[2]]))

       p1  p2   p3  label
10000   8  41   52      0
10001  65  62  249      0
10002   4  21  509      0
10003  80  81  251      0
10004  75  12  919      0
...    ..  ..  ...    ...
19995  63  94  184      0
19996  43  65  209      0
19997  17  84  717      0
19998  69  83  269      0
19999  68  54  878      0

[10000 rows x 4 columns]
p3    999
dtype: int64


In [8]:
neg_process = df_raw.iloc[10000:,:].copy()
print(neg_process)

for i in range(10000):
    if neg_process.iloc[i,2] > df_max[0]:
        neg_process.iloc[i,2]  = int(neg_process.iloc[i,2] / df_max[0])

       p1  p2   p3  label
10000   8  41   52      0
10001  65  62  249      0
10002   4  21  509      0
10003  80  81  251      0
10004  75  12  919      0
...    ..  ..  ...    ...
19995  63  94  184      0
19996  43  65  209      0
19997  17  84  717      0
19998  69  83  269      0
19999  68  54  878      0

[10000 rows x 4 columns]


In [9]:
neg_process.drop_duplicates(inplace=True)
print(neg_process)
print(len(neg_process))
print(np.amax(neg_process))

       p1  p2  p3  label
10000   8  41  52      0
10001  65  62   2      0
10002   4  21   5      0
10003  80  81   2      0
10004  75  12   9      0
...    ..  ..  ..    ...
19995  63  94   1      0
19996  43  65   2      0
19997  17  84   7      0
19998  69  83   2      0
19999  68  54   8      0

[9580 rows x 4 columns]
9580
p1       100
p2        99
p3       100
label      0
dtype: int64


In [10]:
df_process = pd.concat([df_raw.iloc[:10000,:], neg_process])
print(df_process)

       p1  p2  p3  label
0       0   0   1      1
1       1   0   1      1
2       0   1   1      1
3       2   0   1      1
4       1   1   1      1
...    ..  ..  ..    ...
19995  63  94   1      0
19996  43  65   2      0
19997  17  84   7      0
19998  69  83   2      0
19999  68  54   8      0

[19580 rows x 4 columns]


In [11]:
x_data = df_process.iloc[:,:-1]
y_data = df_process.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [12]:
x_train_input = pd.concat([x_train.iloc[:,:], y_train],axis=1)
x_test_input = pd.concat([x_test.iloc[:,:],y_test],axis=1)

print(len(x_train_input))
print(len(x_test_input))

print(x_train_input.label.value_counts())
print('Has null values', x_train_input.isnull().values.any())

print(x_test_input.label.value_counts())
print('Has null values', x_test_input.isnull().values.any())

15664
3916
1    8019
0    7645
Name: label, dtype: int64
Has null values False
1    1981
0    1935
Name: label, dtype: int64
Has null values False


In [13]:
x_train_input.head(10)

Unnamed: 0,p1,p2,p3,label
12061,26,71,5,0
14534,45,29,5,0
6448,16,64,1,1
5951,27,49,0,1
6117,22,55,0,1
2811,25,27,0,1
17618,77,52,5,0
10402,80,99,1,0
5653,23,51,0,1
4536,10,57,1,1


In [14]:
x_test_input.head(10)

Unnamed: 0,p1,p2,p3,label
8333,19,71,0,1
9745,77,21,0,1
19962,81,82,60,0
10424,96,4,7,0
16390,31,3,3,0
19292,6,94,3,0
2735,10,41,0,1
14939,59,9,5,0
1448,17,21,1,1
9837,31,67,0,1


In [15]:
df = x_train_input.copy()

NON_LBL = [c for c in df.columns if c != 'label']
X = df[NON_LBL]
y = df['label']

y_orig = y.copy()

hidden_size = 5000
y.loc[
    np.random.choice(
        y[y == 1].index, 
        replace = False, 
        size = hidden_size
    )
] = 0

print(X)
print(y)

       p1  p2  p3
12061  26  71   5
14534  45  29   5
6448   16  64   1
5951   27  49   0
6117   22  55   0
...    ..  ..  ..
14000  16  17   1
919     5  24   0
5699    0  74   0
10745  24  75   5
17116  49  57   1

[15664 rows x 3 columns]
12061    0
14534    0
6448     0
5951     0
6117     0
        ..
14000    0
919      0
5699     1
10745    0
17116    0
Name: label, Length: 15664, dtype: int64


In [16]:
pd.Series(y).value_counts()

0    12645
1     3019
Name: label, dtype: int64

In [17]:
print('- %d samples and %d features' % (X.shape))
print('- %d positive out of %d total before hiding labels' % (sum(df_process.label), len(df_process.label)))
print('- %d positive out of %d total after hiding labels' % (sum(y), len(y)))

- 15664 samples and 3 features
- 10000 positive out of 19580 total before hiding labels
- 3019 positive out of 15664 total after hiding labels


# Trainning directly

In [18]:
print('Training XGboost model ...')

import xgboost as xgb

model = xgb.XGBClassifier()

#from sklearn.linear_model import LogisticRegression
#model = LogisticRegression()


#from sklearn.neural_network import MLPClassifier
#model = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(1), random_state=1,max_iter=100000)

model.fit(X, y)

print('Done')

Training XGboost model ...
Done


In [19]:
print('---- {} ----'.format('XGboost model'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, model.predict(X)))
print('Recall: ', recall_score(y_orig, model.predict(X)))
print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
print('f1_score: ', f1_score(y_orig, model.predict(X)))

---- XGboost model ----
                        pred_negative        pred_positive 
           true_negative       7632.0         13.0 
           true_positive       6976.0       1043.0 
None

Precision:  0.9876893939393939
Recall:  0.130066093029056
Accuracy:  0.553817671092952
f1_score:  0.22986225895316809


# Training by bagging

In [20]:
#初始化多个分类器

from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()

from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier()

from sklearn.neural_network import MLPClassifier
model3 = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(30,30), random_state=1,max_iter=300)

from sklearn import svm
model4 = svm.LinearSVC()

from sklearn.naive_bayes import GaussianNB
model5 = GaussianNB()

from sklearn.ensemble import RandomForestClassifier
model6 = RandomForestClassifier(n_estimators = 50,n_jobs = -1)

import xgboost as xgb
model7 = xgb.XGBClassifier()


In [21]:
import logging
from FCM import FCM

model8 = FCM(n_clusters=2)
model8.set_logger(tostdout=True, level=logging.DEBUG)

In [22]:
model_list = [model1, model2, model3, model4, model5, model6, model7]

In [22]:
print('Training Logistics Regression model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model1,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Logistics Regression model...
Training bagging classifier...
Done!
Time: 4.496913472190499
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7261.0        384.0 
           true_positive        111.0       7908.0 
None

Precision:  0.9536903039073806
Recall:  0.9861578750467639
Accuracy:  0.9683988764044944
f1_score:  0.9696523818282141
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1839.0         96.0 
           true_positive         33.0       1948.0 
None

Precision:  0.9530332681017613
Recall:  0.98334174659263
Accuracy:  0.9670582226762002
f1_score:  0.9679503105590062
Training bagging classifier...
Done!
Time: 3.1173521149903536
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7266.0        379.0 
           true_positive        112.0       7907.0 
Non

In [23]:
print('Training Decision Tree model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model2,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Decision Tree model...
Training bagging classifier...
Done!
Time: 4.031087952665985
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7401.0        244.0 
           true_positive       1753.0       6266.0 
None

Precision:  0.9625192012288787
Recall:  0.7813941888015962
Accuracy:  0.8725102145045965
f1_score:  0.8625507605478697
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1853.0         82.0 
           true_positive        327.0       1654.0 
None

Precision:  0.9527649769585254
Recall:  0.8349318525996972
Accuracy:  0.8955566905005107
f1_score:  0.889965025558246
Training bagging classifier...
Done!
Time: 4.244832125492394
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7409.0        236.0 
           true_positive       1756.0       6263.0 
None

Prec

In [24]:
print('Training MLP model directly ...')

Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model3,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training MLP model directly ...
Training bagging classifier...
Done!
Time: 36.68415461666882
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7167.0        478.0 
           true_positive          0.0       8019.0 
None

Precision:  0.9437448511239261
Recall:  1.0
Accuracy:  0.9694841675178754
f1_score:  0.9710583676434973
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1820.0        115.0 
           true_positive          0.0       1981.0 
None

Precision:  0.9451335877862596
Recall:  1.0
Accuracy:  0.9706332992849847
f1_score:  0.9717929850380183
Training bagging classifier...
Done!
Time: 24.34667085390538
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7189.0        456.0 
           true_positive          0.0       8019.0 
None

Precision:  0.9461946902654867
Re

In [25]:
print('Training SVM model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model4,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training SVM model...
Training bagging classifier...
Done!
Time: 3.2875560550019145
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       5959.0       1686.0 
           true_positive          0.0       8019.0 
None

Precision:  0.8262751159196291
Recall:  1.0
Accuracy:  0.892364657814096
f1_score:  0.9048747461069736
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1486.0        449.0 
           true_positive          0.0       1981.0 
None

Precision:  0.8152263374485597
Recall:  1.0
Accuracy:  0.8853421859039836
f1_score:  0.8982090228973021
Training bagging classifier...
Done!
Time: 3.9846108136698604
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       6019.0       1626.0 
           true_positive          0.0       8019.0 
None

Precision:  0.8314152410575427
Recall:  1.

In [26]:
print('Training Naive Bayesianayes model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model5,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Naive Bayesianayes model...
Training bagging classifier...
Done!
Time: 1.3035555351525545
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       5923.0       1722.0 
           true_positive        560.0       7459.0 
None

Precision:  0.8124387321642522
Recall:  0.930165856091782
Accuracy:  0.8543156281920327
f1_score:  0.8673255813953489
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1479.0        456.0 
           true_positive        121.0       1860.0 
None

Precision:  0.8031088082901554
Recall:  0.93891973750631
Accuracy:  0.8526557711950971
f1_score:  0.8657202699557831
Training bagging classifier...
Done!
Time: 0.9856763398274779
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       5924.0       1721.0 
           true_positive        560.0       7459.0 
None


In [27]:
print('Training Random Forest model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model6,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Random Forest model...
Training bagging classifier...
Done!
Time: 19.082615179941058
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7333.0        312.0 
           true_positive        805.0       7214.0 
None

Precision:  0.9585437151209142
Recall:  0.8996134181319366
Accuracy:  0.9286899897854954
f1_score:  0.9281440977806369
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1841.0         94.0 
           true_positive        139.0       1842.0 
None

Precision:  0.9514462809917356
Recall:  0.9298334174659263
Accuracy:  0.9405005107252298
f1_score:  0.940515700791422
Training bagging classifier...
Done!
Time: 15.21702437568456
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7343.0        302.0 
           true_positive        857.0       7162.0 
None

Pre

In [28]:
print('Training XGboost model directly ...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model7,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training XGboost model directly ...
Training bagging classifier...
Done!
Time: 101.52125191222876
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7277.0        368.0 
           true_positive        288.0       7731.0 
None

Precision:  0.9545622916409433
Recall:  0.9640852974186308
Accuracy:  0.958120531154239
f1_score:  0.9593001613103364
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1839.0         96.0 
           true_positive         67.0       1914.0 
None

Precision:  0.9522388059701492
Recall:  0.9661786976274609
Accuracy:  0.9583758937691522
f1_score:  0.9591581057379104
Training bagging classifier...
Done!
Time: 103.22357183974236
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7276.0        369.0 
           true_positive        267.0       7752.0 
None

In [23]:
X = np.array(X.values.tolist())
y = np.array(y.values.tolist())
y_orig = np.array(y_orig.values.tolist())
x_test = np.array(x_test.values.tolist())
y_test = np.array(y_test.values.tolist())


print('Training FCM model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model8, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)

    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    
print("===========================")
print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)

Training FCM model...
Training bagging classifier...
Done!
Time: 61.7311185747385
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       5583.0       2062.0 
           true_positive       3083.0       4936.0 
None

Precision:  0.7053443841097456
Recall:  0.6155380970195785
Accuracy:  0.6715398365679265
f1_score:  0.65738829326763
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1416.0        519.0 
           true_positive        794.0       1187.0 
None

Precision:  0.6957796014067995
Recall:  0.5991923271075215
Accuracy:  0.664708886618999
f1_score:  0.6438839164632493
Training bagging classifier...
Done!
Time: 62.000362939201295
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       5570.0       2075.0 
           true_positive       3056.0       4963.0 
None

Precision:  0.70