In [1]:
#!/usr/bin/python3.7

In [2]:
import time
import sklearn
import numpy as np
import pandas as pd

import sys
sys.path.append("..")
from baggingPU import BaggingClassifierPU

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random

In [3]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    columnwidth = max([len(x) for x in labels]) + 4
    empty_cell = " " * columnwidth
    print("    " + empty_cell, end=' ')
    for label in labels:
        print("%{0}s".format(columnwidth) % 'pred_' + label, end=" ")
    print()

    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % 'true_' + label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            if cell:
                print(cell, end=" ")
        print()

# import data

In [4]:
df_raw = pd.read_csv('../data/cannery.csv')

df_raw['label'] = df_raw['label'].astype("int")
print(df_raw.label.value_counts())
print('Has null values', df_raw.isnull().values.any())

1    10000
0    10000
Name: label, dtype: int64
Has null values False


In [5]:
df_raw.head(10)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p29,p30,p31,p32,p33,p34,p35,p36,p37,label
0,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,1,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,1,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,1,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,2,1,0,1,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,2,1,0,1,0,1
5,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,2,1,0,1,0,1
6,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,2,1,0,1,0,1
7,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,2,1,0,1,0,1
8,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,2,1,0,1,0,1
9,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,2,1,0,1,0,1


In [6]:
df_process = df_raw
print(df_process)

       p1  p2  p3  p4  p5  p6  p7  p8  p9  p10  ...  p29  p30  p31  p32  p33  \
0       0   0   1   0   0   0   0   0   0    0  ...    0    1    0    0    1   
1       0   0   1   0   0   0   0   0   0    0  ...    0    1    0    0    1   
2       0   0   1   0   0   0   0   0   0    0  ...    0    1    0    0    1   
3       0   0   1   0   0   0   0   0   0    0  ...    0    1    0    0    2   
4       0   0   1   0   0   0   0   0   0    0  ...    0    1    0    0    2   
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...  ...  ...  ...   
19995  23   4  16   7  14   2   0  11   9    9  ...   18   20   18   23   18   
19996   8   3   6  25  19   6  14  18  14   17  ...    8   10   10   18   10   
19997  20  15   8  20  11   0   7  18  16   15  ...   16   12    6   17   21   
19998   3  12  10   4  15  25   6  19  17   12  ...   20   12    7   17   19   
19999   7  15   3  18   4  14  17   8  10   25  ...   10   20   18    9   19   

       p34  p35  p36  p37  label  
0   

In [7]:
x_data = df_process.iloc[:,:-1]
y_data = df_process.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [8]:
x_train_input = pd.concat([x_train.iloc[:,:], y_train],axis=1)
x_test_input = pd.concat([x_test.iloc[:,:],y_test],axis=1)

print(len(x_train_input))
print(len(x_test_input))

print(x_train_input.label.value_counts())
print('Has null values', x_train_input.isnull().values.any())

print(x_test_input.label.value_counts())
print('Has null values', x_test_input.isnull().values.any())

16000
4000
1    8077
0    7923
Name: label, dtype: int64
Has null values False
0    2077
1    1923
Name: label, dtype: int64
Has null values False


In [9]:
x_train_input.head(10)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p29,p30,p31,p32,p33,p34,p35,p36,p37,label
8118,1,0,0,0,0,0,0,0,1,0,...,0,1,0,6,3,1,0,1,0,1
10171,14,1,14,16,4,25,22,13,1,2,...,16,6,11,8,21,12,21,17,0,0
9399,0,0,0,0,1,0,0,0,0,0,...,0,1,0,7,4,1,0,1,0,1
12669,23,0,0,9,19,19,18,18,10,18,...,25,7,24,5,8,4,19,16,2,0
18809,2,10,8,13,17,21,6,7,8,2,...,8,13,1,16,13,18,11,8,22,0
13869,4,3,7,24,11,23,5,0,21,17,...,11,11,0,2,23,13,11,13,5,0
3320,0,0,1,0,0,0,0,0,0,0,...,0,1,0,3,2,1,0,1,0,1
14689,10,20,18,1,24,19,6,15,7,25,...,13,2,22,2,22,14,22,21,10,0
13087,25,20,4,24,1,4,23,16,2,16,...,17,25,17,15,22,9,11,21,2,0
15992,25,12,10,19,2,6,9,18,17,13,...,16,7,11,10,18,22,12,16,6,0


In [10]:
x_test_input.head(10)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p29,p30,p31,p32,p33,p34,p35,p36,p37,label
14356,15,23,5,7,5,22,19,1,4,1,...,25,13,2,8,19,13,8,11,3,0
3439,0,0,0,1,0,0,0,0,0,0,...,0,0,1,3,2,1,0,0,1,1
12153,0,16,17,7,20,7,2,11,20,7,...,9,17,12,12,17,22,10,18,12,0
15029,23,10,20,16,0,3,1,14,22,23,...,12,9,22,8,18,0,11,11,18,0
18549,21,13,5,4,19,24,10,12,0,18,...,8,10,21,5,4,12,5,24,9,0
15762,14,17,20,6,25,20,3,14,3,0,...,14,13,7,3,20,19,9,7,19,0
12313,7,15,14,14,20,9,10,14,12,20,...,21,17,25,18,8,6,5,23,16,0
16034,3,0,6,3,25,10,20,8,22,5,...,24,2,7,10,18,17,7,7,14,0
11496,25,17,8,9,5,16,15,20,9,14,...,9,20,20,2,17,22,6,13,13,0
5653,0,0,0,0,1,0,0,0,0,1,...,0,1,0,4,4,1,0,1,0,1


In [11]:
df = x_train_input.copy()

NON_LBL = [c for c in df.columns if c != 'label']
X = df[NON_LBL]
y = df['label']

y_orig = y.copy()

hidden_size = 5000
y.loc[
    np.random.choice(
        y[y == 1].index, 
        replace = False, 
        size = hidden_size
    )
] = 0

print(X)
print(y)

       p1  p2  p3  p4  p5  p6  p7  p8  p9  p10  ...  p28  p29  p30  p31  p32  \
8118    1   0   0   0   0   0   0   0   1    0  ...    1    0    1    0    6   
10171  14   1  14  16   4  25  22  13   1    2  ...   19   16    6   11    8   
9399    0   0   0   0   1   0   0   0   0    0  ...    1    0    1    0    7   
12669  23   0   0   9  19  19  18  18  10   18  ...   21   25    7   24    5   
18809   2  10   8  13  17  21   6   7   8    2  ...   14    8   13    1   16   
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...  ...  ...  ...   
13927  14  13   8  14  16   0   4  16  20   15  ...    8    6   25   23    8   
919     0   0   0   1   0   0   0   0   1    0  ...    1    0    1    0    1   
5699    0   0   0   0   0   0   1   0   0    0  ...    0    1    1    0    4   
10742  13   5  18  14   8  17   1  19  12    3  ...   18    9   15   22   21   
16921  22  11  18  21   0  16   1  13  18   25  ...   10   24    3   23    6   

       p33  p34  p35  p36  p37  
8118  

In [12]:
pd.Series(y).value_counts()

0    12923
1     3077
Name: label, dtype: int64

In [13]:
print('- %d samples and %d features' % (X.shape))
print('- %d positive out of %d total before hiding labels' % (sum(df_process.label), len(df_process.label)))
print('- %d positive out of %d total after hiding labels' % (sum(y), len(y)))

- 16000 samples and 37 features
- 10000 positive out of 20000 total before hiding labels
- 3077 positive out of 16000 total after hiding labels


# Trainning directly

In [14]:
print('Training XGboost model ...')

import xgboost as xgb

model = xgb.XGBClassifier()
model.fit(X, y)

print('Done')

Training XGboost model ...
Done


In [15]:
print('---- {} ----'.format('XGboost model'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, model.predict(X)))
print('Recall: ', recall_score(y_orig, model.predict(X)))
print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
print('f1_score: ', f1_score(y_orig, model.predict(X)))

---- XGboost model ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive       6305.0       1772.0 
None

Precision:  1.0
Recall:  0.2193883867772688
Accuracy:  0.6059375
f1_score:  0.3598334856330592


# Training by bagging

In [16]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()

from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier()

from sklearn.neural_network import MLPClassifier
model3 = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(1), random_state=1,max_iter=100000)

from sklearn import svm
model4 = svm.LinearSVC()

from sklearn.naive_bayes import MultinomialNB
model5 = MultinomialNB()

from sklearn.ensemble import RandomForestClassifier
model6 = RandomForestClassifier(n_estimators = 50,n_jobs = -1)

import xgboost as xgb
model7 = xgb.XGBClassifier()


In [17]:
model_list = [model1, model2, model3, model4, model5, model6, model7]

In [18]:
print('Training Logistics Regression model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model1,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    

    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, model.predict(X)))
    print('Recall: ', recall_score(y_orig, model.predict(X)))
    print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
    print('f1_score: ', f1_score(y_orig, model.predict(X)))
    precision_train += precision_score(y_orig, model.predict(X))
    recall_train += recall_score(y_orig, model.predict(X))
    accuracy_train += accuracy_score(y_orig, model.predict(X))
    f1_train += f1_score(y_orig, model.predict(X))
    
    
    #test data
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, model.predict(x_test)))
    print('Recall: ', recall_score(y_test, model.predict(x_test)))
    print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
    print('f1_score: ', f1_score(y_test, model.predict(x_test)))
    
    precision_test += precision_score(y_test, model.predict(x_test))
    recall_test += recall_score(y_test, model.predict(x_test))
    accuracy_test += accuracy_score(y_test, model.predict(x_test))
    f1_test += f1_score(y_test, model.predict(x_test))
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Logistics Regression model...
Training bagging classifier...
Done!
Time: 3.041385415941477
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive          0.0       8077.0 
None

Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       2077.0          0.0 
           true_positive          0.0       1923.0 
None

Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0
Training bagging classifier...
Done!
Time: 1.6475921357050538
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive          0.0       8077.0 
None

Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0
---- PU Bagging Test data ----
                        p

In [19]:
print('Training Decision Tree model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model2,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    

    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, model.predict(X)))
    print('Recall: ', recall_score(y_orig, model.predict(X)))
    print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
    print('f1_score: ', f1_score(y_orig, model.predict(X)))
    precision_train += precision_score(y_orig, model.predict(X))
    recall_train += recall_score(y_orig, model.predict(X))
    accuracy_train += accuracy_score(y_orig, model.predict(X))
    f1_train += f1_score(y_orig, model.predict(X))
    
    
    #test data
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, model.predict(x_test)))
    print('Recall: ', recall_score(y_test, model.predict(x_test)))
    print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
    print('f1_score: ', f1_score(y_test, model.predict(x_test)))
    
    precision_test += precision_score(y_test, model.predict(x_test))
    recall_test += recall_score(y_test, model.predict(x_test))
    accuracy_test += accuracy_score(y_test, model.predict(x_test))
    f1_test += f1_score(y_test, model.predict(x_test))
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Decision Tree model...
Training bagging classifier...
Done!
Time: 0.9509770134463906
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive       1367.0       6710.0 
None

Precision:  1.0
Recall:  0.830753992819116
Accuracy:  0.9145625
f1_score:  0.9075539325082843
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       2077.0          0.0 
           true_positive        171.0       1752.0 
None

Precision:  1.0
Recall:  0.9110764430577223
Accuracy:  0.95725
f1_score:  0.953469387755102
Training bagging classifier...
Done!
Time: 0.8818538440391421
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive       1375.0       6702.0 
None

Precision:  1.0
Recall:  0.8297635260616566
Accuracy:

In [20]:
print('Training MLP model directly ...')

Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model3,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    

    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, model.predict(X)))
    print('Recall: ', recall_score(y_orig, model.predict(X)))
    print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
    print('f1_score: ', f1_score(y_orig, model.predict(X)))
    precision_train += precision_score(y_orig, model.predict(X))
    recall_train += recall_score(y_orig, model.predict(X))
    accuracy_train += accuracy_score(y_orig, model.predict(X))
    f1_train += f1_score(y_orig, model.predict(X))
    
    
    #test data
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, model.predict(x_test)))
    print('Recall: ', recall_score(y_test, model.predict(x_test)))
    print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
    print('f1_score: ', f1_score(y_test, model.predict(x_test)))
    
    precision_test += precision_score(y_test, model.predict(x_test))
    recall_test += recall_score(y_test, model.predict(x_test))
    accuracy_test += accuracy_score(y_test, model.predict(x_test))
    f1_test += f1_score(y_test, model.predict(x_test))
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training MLP model directly ...
Training bagging classifier...
Done!
Time: 12.559453504160047
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive          0.0       8077.0 
None

Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       2077.0          0.0 
           true_positive          0.0       1923.0 
None

Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0
Training bagging classifier...
Done!
Time: 12.49970243871212
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive          0.0       8077.0 
None

Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0
---- PU Bagging Test data ----
                        pred_neg

In [21]:
print('Training SVM model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model4,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    

    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, model.predict(X)))
    print('Recall: ', recall_score(y_orig, model.predict(X)))
    print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
    print('f1_score: ', f1_score(y_orig, model.predict(X)))
    precision_train += precision_score(y_orig, model.predict(X))
    recall_train += recall_score(y_orig, model.predict(X))
    accuracy_train += accuracy_score(y_orig, model.predict(X))
    f1_train += f1_score(y_orig, model.predict(X))
    
    
    #test data
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, model.predict(x_test)))
    print('Recall: ', recall_score(y_test, model.predict(x_test)))
    print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
    print('f1_score: ', f1_score(y_test, model.predict(x_test)))
    
    precision_test += precision_score(y_test, model.predict(x_test))
    recall_test += recall_score(y_test, model.predict(x_test))
    accuracy_test += accuracy_score(y_test, model.predict(x_test))
    f1_test += f1_score(y_test, model.predict(x_test))
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training SVM model...
Training bagging classifier...
Done!
Time: 2.1177416974678636
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7922.0          1.0 
           true_positive          0.0       8077.0 
None

Precision:  0.9998762069819263
Recall:  1.0
Accuracy:  0.9999375
f1_score:  0.9999380996595482
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       2077.0          0.0 
           true_positive          0.0       1923.0 
None

Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0
Training bagging classifier...
Done!
Time: 1.9035667832940817
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7922.0          1.0 
           true_positive          0.0       8077.0 
None

Precision:  0.9998762069819263
Recall:  1.0
Accuracy:  0.9999375
f1_score:  0.9999380996595482


In [22]:
print('Training Naive Bayesianayes model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model5,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    

    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, model.predict(X)))
    print('Recall: ', recall_score(y_orig, model.predict(X)))
    print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
    print('f1_score: ', f1_score(y_orig, model.predict(X)))
    precision_train += precision_score(y_orig, model.predict(X))
    recall_train += recall_score(y_orig, model.predict(X))
    accuracy_train += accuracy_score(y_orig, model.predict(X))
    f1_train += f1_score(y_orig, model.predict(X))
    
    
    #test data
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, model.predict(x_test)))
    print('Recall: ', recall_score(y_test, model.predict(x_test)))
    print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
    print('f1_score: ', f1_score(y_test, model.predict(x_test)))
    
    precision_test += precision_score(y_test, model.predict(x_test))
    recall_test += recall_score(y_test, model.predict(x_test))
    accuracy_test += accuracy_score(y_test, model.predict(x_test))
    f1_test += f1_score(y_test, model.predict(x_test))
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Naive Bayesianayes model...
Training bagging classifier...
Done!
Time: 0.9016990819945931
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive         37.0       8040.0 
None

Precision:  1.0
Recall:  0.99541909124675
Accuracy:  0.9976875
f1_score:  0.9977042873983992
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       2077.0          0.0 
           true_positive         12.0       1911.0 
None

Precision:  1.0
Recall:  0.9937597503900156
Accuracy:  0.997
f1_score:  0.9968701095461658
Training bagging classifier...
Done!
Time: 0.8460099184885621
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive         37.0       8040.0 
None

Precision:  1.0
Recall:  0.99541909124675
Accuracy

In [23]:
print('Training Random Forest model...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model6,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    

    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, model.predict(X)))
    print('Recall: ', recall_score(y_orig, model.predict(X)))
    print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
    print('f1_score: ', f1_score(y_orig, model.predict(X)))
    precision_train += precision_score(y_orig, model.predict(X))
    recall_train += recall_score(y_orig, model.predict(X))
    accuracy_train += accuracy_score(y_orig, model.predict(X))
    f1_train += f1_score(y_orig, model.predict(X))
    
    
    #test data
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, model.predict(x_test)))
    print('Recall: ', recall_score(y_test, model.predict(x_test)))
    print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
    print('f1_score: ', f1_score(y_test, model.predict(x_test)))
    
    precision_test += precision_score(y_test, model.predict(x_test))
    recall_test += recall_score(y_test, model.predict(x_test))
    accuracy_test += accuracy_score(y_test, model.predict(x_test))
    f1_test += f1_score(y_test, model.predict(x_test))
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training Random Forest model...
Training bagging classifier...
Done!
Time: 9.570783131755888
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive        451.0       7626.0 
None

Precision:  1.0
Recall:  0.9441624365482234
Accuracy:  0.9718125
f1_score:  0.9712793733681463
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       2077.0          0.0 
           true_positive         51.0       1872.0 
None

Precision:  1.0
Recall:  0.9734789391575663
Accuracy:  0.98725
f1_score:  0.9865612648221344
Training bagging classifier...
Done!
Time: 9.604657818563282
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive        473.0       7604.0 
None

Precision:  1.0
Recall:  0.9414386529652099
Accuracy:

In [24]:
print('Training XGboost model directly ...')
Time = 0.0
precision_train = 0.0
recall_train = 0.0
accuracy_train = 0.0
f1_train = 0.0

precision_test = 0.0
recall_test = 0.0
accuracy_test = 0.0
f1_test = 0.0

for i in range(10):

    print('Training bagging classifier...')
    pu_start = time.perf_counter()
    model = BaggingClassifierPU(model7,
                             n_estimators = 50, 
                             n_jobs = -1, 
                             max_samples = sum(y) 
                        )
    model.fit(X, y)
    pu_end = time.perf_counter()
    print('Done!')
    print('Time:', pu_end - pu_start)
    Time += (pu_end - pu_start)
    

    #train data
    print('---- {} ----'.format('PU Bagging Train data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_orig, model.predict(X)))
    print('Recall: ', recall_score(y_orig, model.predict(X)))
    print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
    print('f1_score: ', f1_score(y_orig, model.predict(X)))
    precision_train += precision_score(y_orig, model.predict(X))
    recall_train += recall_score(y_orig, model.predict(X))
    accuracy_train += accuracy_score(y_orig, model.predict(X))
    f1_train += f1_score(y_orig, model.predict(X))
    
    
    #test data
    print('---- {} ----'.format('PU Bagging Test data'))
    print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
    print('')
    print('Precision: ', precision_score(y_test, model.predict(x_test)))
    print('Recall: ', recall_score(y_test, model.predict(x_test)))
    print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
    print('f1_score: ', f1_score(y_test, model.predict(x_test)))
    
    precision_test += precision_score(y_test, model.predict(x_test))
    recall_test += recall_score(y_test, model.predict(x_test))
    accuracy_test += accuracy_score(y_test, model.predict(x_test))
    f1_test += f1_score(y_test, model.predict(x_test))
    

print("The final results:")
print("Time:" , Time / 10)
print("precision_train:" , precision_train / 10)
print("recall_train:" , recall_train / 10)
print("accuracy_train:" , accuracy_train / 10)
print("f1_train:" , f1_train / 10)
print("precision_test:" , precision_test / 10)
print("recall_test:" , recall_test / 10)
print("accuracy_test:" , accuracy_test / 10)
print("f1_test:" , f1_test / 10)
    

Training XGboost model directly ...
Training bagging classifier...
Done!
Time: 8.21422905754298
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive        215.0       7862.0 
None

Precision:  1.0
Recall:  0.9733812058932773
Accuracy:  0.9865625
f1_score:  0.9865110734675951
---- PU Bagging Test data ----
                        pred_negative        pred_positive 
           true_negative       2077.0          0.0 
           true_positive         52.0       1871.0 
None

Precision:  1.0
Recall:  0.9729589183567343
Accuracy:  0.987
f1_score:  0.9862941486557723
Training bagging classifier...
Done!
Time: 8.360351916402578
---- PU Bagging Train data ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive        200.0       7877.0 
None

Precision:  1.0
Recall:  0.9752383310635137
Accuracy