In [1]:
#!/usr/bin/python3.7

In [2]:
import time
import sklearn
import numpy as np
import pandas as pd

import sys
sys.path.append("..")
from baggingPU import BaggingClassifierPU

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random

In [3]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    columnwidth = max([len(x) for x in labels]) + 4
    empty_cell = " " * columnwidth
    print("    " + empty_cell, end=' ')
    for label in labels:
        print("%{0}s".format(columnwidth) % 'pred_' + label, end=" ")
    print()

    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % 'true_' + label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            if cell:
                print(cell, end=" ")
        print()

# import data

In [4]:
#df_raw = pd.read_csv('../data/w-dependence.csv')

df_raw = pd.read_csv('../data/1-place2.csv')

#df_raw = pd.read_csv('../data/w-related.csv')

df_raw['label'] = df_raw['label'].astype("int")
print(df_raw.label.value_counts())
print('Has null values', df_raw.isnull().values.any())

0    10000
1    10000
Name: label, dtype: int64
Has null values False


In [5]:
df_raw.head(10)

Unnamed: 0,p1,p2,label
0,1,0,1
1,1,2,1
2,1,4,1
3,1,6,1
4,1,8,1
5,1,10,1
6,1,12,1
7,1,14,1
8,1,16,1
9,1,18,1


In [6]:
print(df_raw.iloc[:10000,:])
df_max = np.amax(df_raw.iloc[:10000,:])
print(df_max)

      p1     p2  label
0      1      0      1
1      1      2      1
2      1      4      1
3      1      6      1
4      1      8      1
...   ..    ...    ...
9995   1  19990      1
9996   1  19992      1
9997   1  19994      1
9998   1  19996      1
9999   1  19998      1

[10000 rows x 3 columns]
p1           1
p2       19998
label        1
dtype: int64


In [7]:
print(df_raw.iloc[10000:,:])
print(np.amax(df_raw.iloc[10000:,[1]]))

       p1     p2  label
10000   1      1      0
10001   1      3      0
10002   1      5      0
10003   1      7      0
10004   1      9      0
...    ..    ...    ...
19995   1  19991      0
19996   1  19993      0
19997   1  19995      0
19998   1  19997      0
19999   1  19999      0

[10000 rows x 3 columns]
p2    19999
dtype: int64


In [8]:
neg_process = df_raw.iloc[10000:,:].copy()
print(neg_process)

for i in range(10000):
    if neg_process.iloc[i,1] > df_max[0]:
        neg_process.iloc[i,1]  = int(neg_process.iloc[i,1] / df_max[0])

       p1     p2  label
10000   1      1      0
10001   1      3      0
10002   1      5      0
10003   1      7      0
10004   1      9      0
...    ..    ...    ...
19995   1  19991      0
19996   1  19993      0
19997   1  19995      0
19998   1  19997      0
19999   1  19999      0

[10000 rows x 3 columns]


In [9]:
neg_process.drop_duplicates(inplace=True)
print(neg_process)
print(len(neg_process))
print(np.amax(neg_process))

       p1     p2  label
10000   1      1      0
10001   1      3      0
10002   1      5      0
10003   1      7      0
10004   1      9      0
...    ..    ...    ...
19995   1  19991      0
19996   1  19993      0
19997   1  19995      0
19998   1  19997      0
19999   1  19999      0

[10000 rows x 3 columns]
10000
p1           1
p2       19999
label        0
dtype: int64


In [10]:
df_process = pd.concat([df_raw.iloc[:10000,:], neg_process])
print(df_process)

       p1     p2  label
0       1      0      1
1       1      2      1
2       1      4      1
3       1      6      1
4       1      8      1
...    ..    ...    ...
19995   1  19991      0
19996   1  19993      0
19997   1  19995      0
19998   1  19997      0
19999   1  19999      0

[20000 rows x 3 columns]


In [11]:
x_data = df_process.iloc[:,:-1]
y_data = df_process.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [12]:
x_train_input = pd.concat([x_train.iloc[:,:], y_train],axis=1)
x_test_input = pd.concat([x_test.iloc[:,:],y_test],axis=1)

print(len(x_train_input))
print(len(x_test_input))

print(x_train_input.label.value_counts())
print('Has null values', x_train_input.isnull().values.any())

print(x_test_input.label.value_counts())
print('Has null values', x_test_input.isnull().values.any())

16000
4000
1    8077
0    7923
Name: label, dtype: int64
Has null values False
0    2077
1    1923
Name: label, dtype: int64
Has null values False


In [15]:
x_train_input.head(10)

Unnamed: 0,p1,p2,label
8118,1,16236,1
10171,1,343,0
9399,1,18798,1
12669,1,5339,0
18809,1,17619,0
13869,1,7739,0
3320,1,6640,1
14689,1,9379,0
13087,1,6175,0
15992,1,11985,0


In [16]:
x_test_input.head(10)

Unnamed: 0,p1,p2,label
14356,1,8713,0
3439,1,6878,1
12153,1,4307,0
15029,1,10059,0
18549,1,17099,0
15762,1,11525,0
12313,1,4627,0
16034,1,12069,0
11496,1,2993,0
5653,1,11306,1


In [17]:
df = x_train_input.copy()

NON_LBL = [c for c in df.columns if c != 'label']
X = df[NON_LBL]
y = df['label']

y_orig = y.copy()

hidden_size = 5000
y.loc[
    np.random.choice(
        y[y == 1].index, 
        replace = False, 
        size = hidden_size
    )
] = 0

print(X)
print(y)

       p1     p2
8118    1  16236
10171   1    343
9399    1  18798
12669   1   5339
18809   1  17619
...    ..    ...
13927   1   7855
919     1   1838
5699    1  11398
10742   1   1485
16921   1  13843

[16000 rows x 2 columns]
8118     1
10171    0
9399     1
12669    0
18809    0
        ..
13927    0
919      1
5699     0
10742    0
16921    0
Name: label, Length: 16000, dtype: int64


In [18]:
pd.Series(y).value_counts()

0    12923
1     3077
Name: label, dtype: int64

In [19]:
print('- %d samples and %d features' % (X.shape))
print('- %d positive out of %d total before hiding labels' % (sum(df_process.label), len(df_process.label)))
print('- %d positive out of %d total after hiding labels' % (sum(y), len(y)))

- 16000 samples and 2 features
- 10000 positive out of 20000 total before hiding labels
- 3077 positive out of 16000 total after hiding labels


# Trainning directly

In [20]:
print('Training XGboost model ...')

import xgboost as xgb

model = xgb.XGBClassifier()
model.fit(X, y)

print('Done')

Training XGboost model ...
Done


In [21]:
print('---- {} ----'.format('XGboost model'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, model.predict(X)))
print('Recall: ', recall_score(y_orig, model.predict(X)))
print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
print('f1_score: ', f1_score(y_orig, model.predict(X)))

---- XGboost model ----
                        pred_negative        pred_positive 
           true_negative       7922.0          1.0 
           true_positive       8065.0         12.0 
None

Precision:  0.9230769230769231
Recall:  0.0014857001361891792
Accuracy:  0.495875
f1_score:  0.002966625463535229


# Training by bagging

In [22]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()

from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier()

from sklearn.neural_network import MLPClassifier
model3 = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(1), random_state=1,max_iter=100000)

from sklearn import svm
model4 = svm.LinearSVC()

from sklearn.naive_bayes import MultinomialNB
model5 = MultinomialNB()

from sklearn.ensemble import RandomForestClassifier
model6 = RandomForestClassifier(n_estimators = 50,n_jobs = -1)

import xgboost as xgb
model7 = xgb.XGBClassifier()

In [23]:
model_list = [model1, model2, model3, model4, model5, model6, model7]

In [24]:
print('Training Logistics Regression model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model1,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)

    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    
print("===========================")
print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Logistics Regression model...
Training bagging classifier...
Done!
Time: 8.886250629999267
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative         27.0       7896.0 
           true_positive         27.0       8050.0 
None

Precision:  0.504828797190518
Recall:  0.9966571746935744
Accuracy:  0.5048125
f1_score:  0.6701910668942264
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative          5.0       2072.0 
           true_positive          5.0       1918.0 
None

Precision:  0.4807017543859649
Recall:  0.9973998959958398
Accuracy:  0.48075
f1_score:  0.6487400642651784
Training bagging classifier...
Done!
Time: 4.127701403995161
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative          0.0       7923.0 
           true_positive          0.0       8077.0 
None

Precision:  0.504

In [25]:
print('Training Decision Tree model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model2,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Decision Tree model...
Training bagging classifier...
Done!
Time: 4.268731844997092
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       3576.0       4347.0 
           true_positive       3657.0       4420.0 
None

Precision:  0.5041633397969659
Recall:  0.5472328834963477
Accuracy:  0.49975
f1_score:  0.5248159582047021
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative        819.0       1258.0 
           true_positive       1010.0        913.0 
None

Precision:  0.4205435283279595
Recall:  0.4747789911596464
Accuracy:  0.433
f1_score:  0.44601856375183196
Training bagging classifier...
Done!
Time: 6.12311253399821
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       3579.0       4344.0 
           true_positive       3708.0       4369.0 
None

Precision:  0.5014346378973

In [26]:
print('Training MLP model directly ...')

Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model3,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training MLP model directly ...
Training bagging classifier...
Done!
Time: 11.541991398997197
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       6013.0       1910.0 
           true_positive       6093.0       1984.0 
None

Precision:  0.5095017976373909
Recall:  0.2456357558499443
Accuracy:  0.4998125
f1_score:  0.3314677136412998
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1551.0        526.0 
           true_positive       1473.0        450.0 
None

Precision:  0.4610655737704918
Recall:  0.23400936037441497
Accuracy:  0.50025
f1_score:  0.3104518799586064
Training bagging classifier...
Done!
Time: 11.90349163999781
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       5182.0       2741.0 
           true_positive       5257.0       2820.0 
None

Precision:  0.5071030

In [27]:
print('Training SVM model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model4,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training SVM model...
Training bagging classifier...
Done!
Time: 4.4835277419988415
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative        144.0       7779.0 
           true_positive        154.0       7923.0 
None

Precision:  0.5045854031333588
Recall:  0.9809335149189056
Accuracy:  0.5041875
f1_score:  0.6663863072458892
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative         37.0       2040.0 
           true_positive         27.0       1896.0 
None

Precision:  0.4817073170731707
Recall:  0.9859594383775351
Accuracy:  0.48325
f1_score:  0.6472094214029697
Training bagging classifier...
Done!
Time: 6.110986684005184
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7303.0        620.0 
           true_positive       7438.0        639.0 
None

Precision:  0.5075456711675933
R

In [28]:
print('Training Naive Bayesianayes model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model5,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Naive Bayesianayes model...
Training bagging classifier...
Done!
Time: 1.0366294069972355
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       3877.0       4046.0 
           true_positive       4018.0       4059.0 
None

Precision:  0.5008019740900679
Recall:  0.5025380710659898
Accuracy:  0.496
f1_score:  0.5016685205784204
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       1062.0       1015.0 
           true_positive        922.0       1001.0 
None

Precision:  0.4965277777777778
Recall:  0.5205408216328653
Accuracy:  0.51575
f1_score:  0.5082508250825083
Training bagging classifier...
Done!
Time: 4.2291827790031675
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       3868.0       4055.0 
           true_positive       4009.0       4068.0 
None

Precision:  0.500800

In [29]:
print('Training Random Forest model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model6,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Random Forest model...
Training bagging classifier...
Done!
Time: 66.97128426400013
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       3318.0       4605.0 
           true_positive       3198.0       4879.0 
None

Precision:  0.5144453816954871
Recall:  0.6040609137055838
Accuracy:  0.5123125
f1_score:  0.5556631171345595
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative        711.0       1366.0 
           true_positive        909.0       1014.0 
None

Precision:  0.4260504201680672
Recall:  0.5273010920436817
Accuracy:  0.43125
f1_score:  0.47129909365558914
Training bagging classifier...


KeyboardInterrupt: 

In [None]:
print('Training XGboost model directly ...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model7,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pre = model.predict(X)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    
    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, pre), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, pre))
    print('Recall: ', recall_score(y_orig, pre))
    print('Accuracy: ', accuracy_score(y_orig, pre))
    print('f1_score: ', f1_score(y_orig, pre))
   
    #test data
    pre_test = model.predict(x_test)
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test,pre_test ), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, pre_test))
    print('Recall: ', recall_score(y_test, pre_test))
    print('Accuracy: ', accuracy_score(y_test, pre_test))
    print('f1_score: ', f1_score(y_test, pre_test))
    
    
    precision_train += precision_score(y_orig, pre)
    recall_train += recall_score(y_orig, pre)
    accuracy_train += accuracy_score(y_orig, pre)
    f1_train += f1_score(y_orig, pre)
    
    precision_test += precision_score(y_test,pre_test)
    recall_test += recall_score(y_test, pre_test)
    accuracy_test += accuracy_score(y_test, pre_test)
    f1_test += f1_score(y_test, pre_test)
    
print("===========================")
print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)   

Training XGboost model directly ...
Training bagging classifier...
