In [1]:
import time
import sklearn
import numpy as np
import pandas as pd

import sys
sys.path.append("..")
from baggingPU import BaggingClassifierPU

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random

In [2]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    columnwidth = max([len(x) for x in labels]) + 4
    empty_cell = " " * columnwidth
    print("    " + empty_cell, end=' ')
    for label in labels:
        print("%{0}s".format(columnwidth) % 'pred_' + label, end=" ")
    print()

    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % 'true_' + label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            if cell:
                print(cell, end=" ")
        print()

# import data

In [3]:
df_raw = pd.read_csv('../data/w-dependence.csv')

#df_raw = pd.read_csv('../data/1place-dependence.csv')

#df_raw = pd.read_csv('../data/w-related.csv')


df_raw['label'] = df_raw['label'].astype("int")
print(df_raw.label.value_counts())
print('Has null values', df_raw.isnull().values.any())

1    10000
0    10000
Name: label, dtype: int64
Has null values False


In [4]:
df_raw.head(10)

Unnamed: 0,p1,p2,p3,label
0,0,0,1,1
1,1,0,1,1
2,0,1,1,1
3,2,0,1,1
4,1,1,1,1
5,0,1,0,1
6,0,2,1,1
7,3,0,1,1
8,2,1,1,1
9,1,1,0,1


In [5]:
print(df_raw.iloc[:10000,:])
df_max = np.amax(df_raw.iloc[:10000,[0,1]])
print(df_max)

      p1  p2  p3  label
0      0   0   1      1
1      1   0   1      1
2      0   1   1      1
3      2   0   1      1
4      1   1   1      1
...   ..  ..  ..    ...
9995  52  47   0      1
9996  52  48   1      1
9997  51  48   0      1
9998  51  49   1      1
9999  50  49   0      1

[10000 rows x 4 columns]
p1    100
p2     99
dtype: int64


In [14]:
print(df_raw.iloc[10000:,:])
print(np.amax(df_raw.iloc[10000:,[0,1]]))

        p1   p2   p3  label
10000   86  724  444      0
10001  556  145   65      0
10002  395  673   66      0
10003  151  410  204      0
10004  961  853  760      0
...    ...  ...  ...    ...
19995  300  764  197      0
19996  123  206  636      0
19997  756  279  632      0
19998  675  960  101      0
19999  480  640  168      0

[10000 rows x 4 columns]
p1    999
p2    999
dtype: int64


In [11]:
neg_process = df_raw.iloc[10000:,:].copy()
print(neg_process)

for i in range(10000):
    if neg_process.iloc[i,0] > df_max[0]:
        neg_process.iloc[i,0]  = int(neg_process.iloc[i,0] / df_max[0])
    if neg_process.iloc[i,1] > df_max[1]:
        neg_process.iloc[i,1]  = int(neg_process.iloc[i,1] / df_max[1])

        p1   p2   p3  label
10000   86  724  444      0
10001  556  145   65      0
10002  395  673   66      0
10003  151  410  204      0
10004  961  853  760      0
...    ...  ...  ...    ...
19995  300  764  197      0
19996  123  206  636      0
19997  756  279  632      0
19998  675  960  101      0
19999  480  640  168      0

[10000 rows x 4 columns]


In [12]:
neg_process.drop_duplicates(inplace=True)
print(neg_process)
print(np.amax(neg_process))

       p1  p2   p3  label
10000  86   7  444      0
10001   5   1   65      0
10002   3   6   66      0
10003   1   4  204      0
10004   9   8  760      0
...    ..  ..  ...    ...
19995   3   7  197      0
19996   1   2  636      0
19997   7   2  632      0
19998   6   9  101      0
19999   4   6  168      0

[9605 rows x 4 columns]
p1       100
p2        99
p3       999
label      0
dtype: int64


In [13]:
df_process = pd.concat([df_raw.iloc[:10000,:], neg_process])
print(df_process)

       p1  p2   p3  label
0       0   0    1      1
1       1   0    1      1
2       0   1    1      1
3       2   0    1      1
4       1   1    1      1
...    ..  ..  ...    ...
19995   3   7  197      0
19996   1   2  636      0
19997   7   2  632      0
19998   6   9  101      0
19999   4   6  168      0

[19605 rows x 4 columns]


In [10]:
x_data = df_process.iloc[:,:-1]
y_data = df_process.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [11]:
x_train_input = pd.concat([x_train.iloc[:,:], y_train],axis=1)
x_test_input = pd.concat([x_test.iloc[:,:],y_test],axis=1)

print(len(x_train_input))
print(len(x_test_input))

print(x_train_input.label.value_counts())
print('Has null values', x_train_input.isnull().values.any())

print(x_test_input.label.value_counts())
print('Has null values', x_test_input.isnull().values.any())

16000
4000
1    8077
0    7923
Name: label, dtype: int64
Has null values False
0    2077
1    1923
Name: label, dtype: int64
Has null values False


In [12]:
x_train_input.head(10)

Unnamed: 0,p1,p2,p3,label
8118,36,54,1,1
10171,955,144,14,0
9399,53,43,0,1
12669,671,887,315,0
18809,118,877,702,0
13869,403,912,555,0
3320,51,7,1,1
14689,884,235,9,0
13087,17,244,215,0
15992,709,14,565,0


In [13]:
x_test_input.head(10)

Unnamed: 0,p1,p2,p3,label
14356,56,136,763,0
3439,50,8,0,1
12153,23,213,534,0
15029,461,505,575,0
18549,657,746,995,0
15762,282,709,653,0
12313,456,178,499,0
16034,409,843,59,0
11496,911,433,846,0
5653,23,51,0,1


In [14]:
iter = 13000
alllist = np.zeros([13000,4],dtype=np.int)

a = np.random.choice(range(9,999),size=iter,replace=True)
b = np.random.choice(range(0,999),size=iter,replace=True)
c = np.random.choice(range(9,999),size=iter,replace=True)

for idx in range(iter):  
    alllist[idx][0] = a[idx]
    alllist[idx][1] = b[idx]
    alllist[idx][2] = c[idx]
    alllist[idx][3] = 0
unlabel = np.unique(np.array(alllist), axis = 0)

print(len(unlabel))

13000


In [15]:
pos_list = np.random.randint(low=100, high=9999, size=3000).tolist()

data_pos = df_process.iloc[pos_list,:]
data_unl = pd.DataFrame(unlabel, columns=['p1','p2','p3','label'],)

data =  pd.concat([data_pos, data_unl], axis = 0, ignore_index=True)
print(data)

        p1   p2   p3  label
0       44   24    1      1
1       27   16    1      1
2       34    5    0      1
3       13   36    0      1
4       47   25    0      1
...    ...  ...  ...    ...
15995  998  462  897      0
15996  998  898  936      0
15997  998  915  594      0
15998  998  961  682      0
15999  998  991  230      0

[16000 rows x 4 columns]


In [16]:
df = data.copy()

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

y_orig = y.copy()

print(X)
print(y)

        p1   p2   p3
0       44   24    1
1       27   16    1
2       34    5    0
3       13   36    0
4       47   25    0
...    ...  ...  ...
15995  998  462  897
15996  998  898  936
15997  998  915  594
15998  998  961  682
15999  998  991  230

[16000 rows x 3 columns]
0        1
1        1
2        1
3        1
4        1
        ..
15995    0
15996    0
15997    0
15998    0
15999    0
Name: label, Length: 16000, dtype: int64


In [17]:
pd.Series(y).value_counts()

0    13000
1     3000
Name: label, dtype: int64

# Trainning directly

In [18]:
print('Training XGboost model ...')

#import xgboost as xgb

#model = xgb.XGBClassifier()

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()


#from sklearn.neural_network import MLPClassifier

#model = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(1), random_state=1,max_iter=100000)

model.fit(X, y)

print('Done')

Training XGboost model ...
Done


In [19]:
print('---- {} ----'.format('XGboost model'))
print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test_input.iloc[:,:-1])), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_test, model.predict(x_test_input.iloc[:,:-1])))
print('Recall: ', recall_score(y_test, model.predict(x_test_input.iloc[:,:-1])))
print('Accuracy: ', accuracy_score(y_test, model.predict(x_test_input.iloc[:,:-1])))
print('f1_score: ', f1_score(y_test, model.predict(x_test_input.iloc[:,:-1])))

---- XGboost model ----
                        pred_negative        pred_positive 
           true_negative       2076.0          1.0 
           true_positive          0.0       1923.0 
None

Precision:  0.9994802494802495
Recall:  1.0
Accuracy:  0.99975
f1_score:  0.9997400571874188


# Training by bagging

In [20]:
#初始化多个分类器

from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()

from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier()

from sklearn.neural_network import MLPClassifier
model3 = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(1), random_state=1,max_iter=100000)

from sklearn import svm
model4 = svm.LinearSVC()

from sklearn.naive_bayes import MultinomialNB
model5 = MultinomialNB()

from sklearn.ensemble import RandomForestClassifier
model6 = RandomForestClassifier(n_estimators = 50,n_jobs = -1)

import xgboost as xgb
model7 = xgb.XGBClassifier()


In [31]:
predict_sum = np.zeros([len(y)],dtype=np.float32)
print(predict_sum)

[0. 0. 0. ... 0. 0. 0.]


In [32]:
flag = 0
f1 = np.zeros([7],dtype=np.float32)

model_list = [model1, model2, model3, model4, model5, model6, model7]

for i,j in zip(model_list,range(7)):
    model = BaggingClassifierPU(i,
                         n_estimators = 50, 
                         n_jobs = -1, 
                         max_samples = sum(y)  
                        )
    model.fit(X,y)
    print(f1_score(y_data, model.predict(x_data)))
    predict_sum += model.predict(x_data)
    f1[j] = f1_score(y_data, model.predict(x_data))

print(f1)
print(predict_sum)

0.9994503023337165


ValueError: operands could not be broadcast together with shapes (16000,) (20000,) (16000,) 

In [26]:
print(model.predict(x_data))
print(type(model.predict(x_data)))


[1 1 1 ... 0 0 0]
<class 'numpy.ndarray'>


In [29]:
threshold = len(model_list) / 2
print(threshold)

3.5


In [24]:
print(np.argmax(f1))

best_model = BaggingClassifierPU(model_list[np.argmax(f1)],
                         n_estimators = 50, 
                         n_jobs = -1, 
                         max_samples = sum(y)  
                        )
print(best_model)

0
BaggingClassifierPU(base_estimator=LogisticRegression(), max_samples=3000,
                    n_estimators=50, n_jobs=-1)


In [158]:
print('Training bagging classifier...')

pu_start = time.perf_counter()
model = BaggingClassifierPU(LogisticRegression(),
                         n_estimators = 50, 
                         n_jobs = -1, 
                         max_samples = sum(y)  
                        )
model.fit(X, y)
pu_end = time.perf_counter()
print('Done!')
print('Time:', pu_end - pu_start)


Training bagging classifier...
Done!
Time: 0.9347040150023531


In [161]:
#train data
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, model.predict(X)))
print('Recall: ', recall_score(y_orig, model.predict(X)))
print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
print('f1_score: ', f1_score(y_orig, model.predict(X)))

---- PU Bagging ----
                        pred_negative        pred_positive 
           true_negative      12998.0          2.0 
           true_positive          0.0       3000.0 
None

Precision:  0.9993337774816788
Recall:  1.0
Accuracy:  0.999875


TypeError: 'numpy.ndarray' object is not callable

In [136]:
#print wrong predictions
y_pre = model.predict(X)
y_orig_index = y_orig.index.tolist()

FN_index = []
FT_index = []

for i in range(len(y_orig)):
    if y_orig.iloc[i] == 1 and y_pre[i] == 0 :
        FN_index.append(y_orig_index[i])
    if y_orig.iloc[i] == 0 and y_pre[i] == 1 :
        FT_index.append(y_orig_index[i])
        
print("False Negtive:")
print(df_process.loc[FN_index])
print("False Positive:")
print(df_process.loc[FT_index])

False Negtive:
Empty DataFrame
Columns: [p1, p2, p3, label]
Index: []
False Positive:
      p1  p2  p3  label
3385  18  39   0      1


In [160]:
#test data
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test_input.iloc[:,:-1])), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_test, model.predict(x_test_input.iloc[:,:-1])))
print('Recall: ', recall_score(y_test, model.predict(x_test_input.iloc[:,:-1])))
print('Accuracy: ', accuracy_score(y_test, model.predict(x_test_input.iloc[:,:-1])))
print('f1_score: ', f1_score(y_test, model.predict(x_test_input.iloc[:,:-1])))

---- PU Bagging ----
                        pred_negative        pred_positive 
           true_negative       2075.0          2.0 
           true_positive          0.0       1923.0 
None

Precision:  0.9989610389610389
Recall:  1.0
Accuracy:  0.9995


TypeError: 'numpy.ndarray' object is not callable

In [138]:
#print wrong predictions
y_test_pre = model.predict(x_test_input.iloc[:,:-1])
y_test_index = y_test.index.tolist()

FN_test_index = []
FT_test_index = []

for i in range(len(y_test)):
    if y_test.iloc[i] == 1 and y_test_pre[i] == 0 :
        FN_test_index.append(y_test_index[i])
    if y_test.iloc[i] == 0 and y_test_pre[i] == 1 :
        FT_test_index.append(y_test_index[i])

print("False Negtive:")
print(df_process.loc[FN_test_index])
print("False Positive:")
print(df_process.loc[FT_test_index])

False Negtive:
Empty DataFrame
Columns: [p1, p2, p3, label]
Index: []
False Positive:
       p1  p2  p3  label
14102  17  33  28      0
17305  46   5  11      0


In [139]:
orig_data = df_process.iloc[:,:-1].copy()
orig_label = model.predict(orig_data)

print(orig_data)
print(orig_label)

        p1   p2   p3
0        0    0    1
1        1    0    1
2        0    1    1
3        2    0    1
4        1    1    1
...    ...  ...  ...
19995  300  764  197
19996  123  206  636
19997  756  279  632
19998  675  960  101
19999  480  640  168

[20000 rows x 3 columns]
[1 1 1 ... 0 0 0]


In [140]:
import xgboost as xgb
model = BaggingClassifierPU(xgb.XGBClassifier(),
                         n_estimators = 50, 
                         n_jobs = -1, 
                         max_samples = sum(y)  
                        )
print(model)
model.fit(orig_data, orig_label)

BaggingClassifierPU(base_estimator=XGBClassifier(base_score=None, booster=None,
                                                 colsample_bylevel=None,
                                                 colsample_bynode=None,
                                                 colsample_bytree=None,
                                                 gamma=None, gpu_id=None,
                                                 importance_type='gain',
                                                 interaction_constraints=None,
                                                 learning_rate=None,
                                                 max_delta_step=None,
                                                 max_depth=None,
                                                 min_child_weight=None,
                                                 missing=nan,
                                                 monotone_constraints=None,
                                                 n_estimators=1

BaggingClassifierPU(base_estimator=XGBClassifier(base_score=None, booster=None,
                                                 colsample_bylevel=None,
                                                 colsample_bynode=None,
                                                 colsample_bytree=None,
                                                 gamma=None, gpu_id=None,
                                                 importance_type='gain',
                                                 interaction_constraints=None,
                                                 learning_rate=None,
                                                 max_delta_step=None,
                                                 max_depth=None,
                                                 min_child_weight=None,
                                                 missing=nan,
                                                 monotone_constraints=None,
                                                 n_estimators=1

In [132]:
#verification
import random

place_max = max(np.amax(df_process.iloc[:10000, [0, 1]]))  #w_dependence
name1 = ['min', 'max', 'Precision', 'Recall', 'Accuracy', 'f1_score']
test = pd.DataFrame(columns=name1)

for i in range(1, 100):
    
    '''
    flag = int((0.1 * i) * place_max)
    print(flag)
    temp_data = df_process.iloc[(10000 - int(0.1 * place_max)) : (10000 + int(0.1 * place_max)), : -1].copy()
    temp_label = df_process.iloc[(10000 - int(0.1 * place_max)) : (10000 + int(0.1 * place_max)), -1].copy()
    temp_data.iloc[:, 0] = temp_data.iloc[:, 0] + flag + 1
    temp_data.iloc[:, 1] = temp_data.iloc[:, 1] + flag + 1
    '''
    
    pos_list = np.random.randint(low=100, high=9999, size=2000).tolist()
    neg_list = np.random.randint(low=10000, high=19000, size=2000).tolist()
    all_list = pos_list + neg_list
    
    flag1 = int((0.1 * (i - 1)) * place_max)
    flag2 = int((0.1 * i) * place_max)
    print(flag2)
    
    temp_data = df_process.iloc[all_list, : -1].copy()
    temp_label = df_process.iloc[all_list, -1].copy()
    
    temp_data.iloc[:, 0] = temp_data.iloc[:, 0] + random.randint((df_max[0] + flag1 + 1), ((df_max[0] + flag2 + 1)))
    temp_data.iloc[:, 1] = temp_data.iloc[:, 1] + random.randint((df_max[1] + flag1 + 1), ((df_max[1] + flag2 + 1)))

    
    print('======')
    print(temp_data.iloc[:, :])

    temp_max = max(np.amax(temp_data.iloc[:, [0, 1]]))
    temp_min = min(np.amin(temp_data.iloc[:, [0, 1]]))

    print('---- {} ----'.format(i))
    print('Precision: ', precision_score(temp_label, model.predict(temp_data)))
    print('Recall: ', recall_score(temp_label, model.predict(temp_data)))
    print('Accuracy: ', accuracy_score(temp_label, model.predict(temp_data)))
    print('f1_score: ', f1_score(temp_label, model.predict(temp_data)))

    test.loc[i] = [  temp_min, temp_max,
                        precision_score(temp_label, model.predict(temp_data)),
                        recall_score(temp_label, model.predict(temp_data)),
                        accuracy_score(temp_label, model.predict(temp_data)),
                        f1_score(temp_label, model.predict(temp_data))
                        ]
    
    orig_data = pd.concat([orig_data, temp_data], ignore_index=True)
    orig_label = pd.Series(orig_label.tolist() + model.predict(temp_data).tolist())

    model.fit(orig_data,orig_label)

test.to_csv('../result/w_dependence_unknown_processdata_result.csv')

10
         p1    p2   p3
4712   1064  1014    1
9561   1075  1031    0
7382   1055  1040    1
4880   1050  1029    1
4935   1022  1056    0
...     ...   ...  ...
10830  1686  1258  309
11607  1884  1839  157
15843  1269  1893  529
16876  1417  1372  960
16579  1858  1831  485

[4000 rows x 3 columns]
---- 1 ----
Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0
20
         p1    p2   p3
8422   1078  1037    1
3845   1041  1043    0
2129   1027  1041    0
8683   1040  1075    0
1755   1036  1028    0
...     ...   ...  ...
17381  2005  1945  725
18423  1140  1571  218
18651  1021  1966  646
16191  1035  1088  863
13421  1020  1767  854

[4000 rows x 3 columns]
---- 2 ----
Precision:  1.0
Recall:  1.0
Accuracy:  1.0
f1_score:  1.0


KeyboardInterrupt: 