In [1]:
import time
import sklearn
import numpy as np
import pandas as pd

import sys
sys.path.append("..")
from baggingPU import BaggingClassifierPU

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    columnwidth = max([len(x) for x in labels]) + 4
    empty_cell = " " * columnwidth
    print("    " + empty_cell, end=' ')
    for label in labels:
        print("%{0}s".format(columnwidth) % 'pred_' + label, end=" ")
    print()

    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % 'true_' + label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            if cell:
                print(cell, end=" ")
        print()

# import data

In [3]:
#df_raw = pd.read_csv('../data/w-dependence.csv')

#df_raw = pd.read_csv('../data/1place-dependence.csv')

df_raw = pd.read_csv('../data/w-related.csv')


df_raw['label'] = df_raw['label'].astype("int")
print(df_raw.label.value_counts())
print('Has null values', df_raw.isnull().values.any())

1    10000
0    10000
Name: label, dtype: int64
Has null values False


In [4]:
df_raw.head(10)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,label
0,0,1,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,1
2,0,1,1,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,1
4,1,0,1,0,0,0,0,0,1
5,0,0,0,0,1,1,0,0,1
6,0,0,0,0,0,0,1,0,1
7,0,1,2,0,0,0,0,0,1
8,0,0,1,1,0,0,0,0,1
9,0,0,0,0,0,0,0,1,1


In [5]:
def random_undersampling(tmp_df, TARGET_LABEL):
    df_majority = tmp_df[tmp_df[TARGET_LABEL] == 0]
    df_minority = tmp_df[tmp_df[TARGET_LABEL] == 1]

    df_majority_downsampled = resample(df_majority, 
                                       replace=False,
                                       n_samples=len(df_minority),
                                       random_state=None)

    df_downsampled = pd.concat([df_majority_downsampled, df_minority])

    print("Undersampling complete!")
    print(df_downsampled[TARGET_LABEL].value_counts())
    return df_downsampled

In [6]:
df_downsampled = random_undersampling(df_raw, 'label')
df_downsampled = df_downsampled.sample(frac=1)
df_downsampled = df_downsampled.reset_index()
df_downsampled = df_downsampled.drop(columns=['index'])

Undersampling complete!
1    10000
0    10000
Name: label, dtype: int64


In [7]:
df_downsampled.head(10)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,label
0,1,0,47,0,0,16,0,0,1
1,0,0,33,1,1,14,0,1,0
2,1,0,11,0,1,42,0,1,0
3,0,1,12,1,0,83,1,0,0
4,2,1,40,4,7,81,7,3,0
5,1,0,53,0,0,41,1,1,0
6,1,0,21,0,0,5,0,0,1
7,6,0,62,0,2,35,9,8,0
8,0,0,15,0,0,24,0,1,1
9,1,0,75,0,0,62,1,1,0


In [24]:
place_max = np.amax(df_raw.iloc[:10000, [2, 5]])
place_min = np.amin(df_raw.iloc[:10000, [2, 5]])

print(max(place_max))
print(place_min)

81
p3    0
p6    0
dtype: int64


In [8]:
x_data = df_raw.iloc[:,:-1]
y_data = df_raw.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

x_train_input = pd.concat([x_train,y_train],axis=1)
x_test_input = pd.concat([x_test,y_test],axis=1)

print(len(x_train_input))
print(len(x_test_input))

print(x_train_input.label.value_counts())
print('Has null values', x_train_input.isnull().values.any())

print(x_test_input.label.value_counts())
print('Has null values', x_test_input.isnull().values.any())

16000
4000
1    8077
0    7923
Name: label, dtype: int64
Has null values False
0    2077
1    1923
Name: label, dtype: int64
Has null values False


In [9]:
x_train_input.head(10)

Unnamed: 0,p1,p2,p3,p4,label
8118,0,0,2705,1,1
10171,815,965,429,777,0
9399,0,0,3132,1,1
12669,241,761,380,957,0
18809,0,0,10244,0,0
13869,205,120,661,714,0
3320,1,0,1107,0,1
14689,841,618,913,610,0
13087,615,969,311,268,0
15992,0,1,1274,1,0


In [10]:
df = x_train_input.copy()

NON_LBL = [c for c in df.columns if c != 'label']
X = df[NON_LBL]
y = df['label']

y_orig = y.copy()

hidden_size = 5000
y.loc[
    np.random.choice(
        y[y == 1].index, 
        replace = False, 
        size = hidden_size
    )
] = 0

In [11]:
pd.Series(y).value_counts()

0    12923
1     3077
Name: label, dtype: int64

In [12]:
print('- %d samples and %d features' % (X.shape))
print('- %d positive out of %d total before hiding labels' % (sum(df_downsampled.label), len(df_downsampled.label)))
print('- %d positive out of %d total after hiding labels' % (sum(y), len(y)))

- 16000 samples and 4 features
- 10000 positive out of 20000 total before hiding labels
- 3077 positive out of 16000 total after hiding labels


# Trainning directly

In [13]:
print('Training XGboost model ...')

import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(X, y)

print('Done')

Training XGboost model ...
Done


In [15]:
model.predict(X)

array([1, 0, 0, ..., 0, 0, 0])

In [14]:
print('---- {} ----'.format('XGboost model'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, model.predict(X)))
print('Recall: ', recall_score(y_orig, model.predict(X)))
print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
print('f1_score: ', f1_score(y_orig, model.predict(X)))

---- XGboost model ----
                        pred_negative        pred_positive 
           true_negative       7923.0          0.0 
           true_positive       7245.0        832.0 
None

Precision:  1.0
Recall:  0.10300854277578309
Accuracy:  0.5471875
f1_score:  0.18677741609608262


# Training by bagging

In [15]:
print('Training bagging classifier...')

pu_start = time.perf_counter()
model = BaggingClassifierPU(xgb.XGBClassifier(),
                         n_estimators = 50, 
                         n_jobs = -1, 
                         max_samples = sum(y)  
                        )
model.fit(X, y)
pu_end = time.perf_counter()
print('Done!')
print('Time:', pu_end - pu_start)

Training bagging classifier...
Done!
Time: 10.682515931024682


In [16]:
#train data
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, model.predict(X)))
print('Recall: ', recall_score(y_orig, model.predict(X)))
print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
print('f1_score: ', f1_score(y_orig, model.predict(X)))

---- PU Bagging ----
                        pred_negative        pred_positive 
           true_negative       7914.0          9.0 
           true_positive         31.0       8046.0 
None

Precision:  0.9988826815642458
Recall:  0.9961619413148446
Accuracy:  0.9975
f1_score:  0.9975204562360526


In [17]:
#print wrong predictions
y_pre = model.predict(X)
y_orig_index = y_orig.index.tolist()

FN_index = []
FT_index = []

for i in range(len(y_orig)):
    if y_orig.iloc[i] == 1 and y_pre[i] == 0 :
        FN_index.append(y_orig_index[i])
    if y_orig.iloc[i] == 0 and y_pre[i] == 1 :
        FT_index.append(y_orig_index[i])
        
print("False Negtive:")
print(X.loc[FN_index])
print("False Positive:")
print(X.loc[FT_index])

False Negtive:
      p1  p2    p3  p4
5662   0   1  1888   0
5660   1   0  1887   0
9888   0   0  3295   1
5638   0   1  1880   0
101    1   0    34   0
8472   0   0  2823   1
5666   1   0  1889   0
110    1   0    37   0
92     1   0    31   0
38     1   0    13   0
45     0   0    14   1
42     0   0    13   1
8469   0   0  2822   1
5665   0   1  1889   0
5650   0   1  1884   0
3      0   0     0   1
40     0   1    14   0
98     1   0    33   0
5647   0   1  1883   0
5644   0   1  1882   0
5657   1   0  1886   0
48     0   0    15   1
9894   0   0  3297   1
104    1   0    35   0
5656   0   1  1886   0
44     1   0    15   0
41     1   0    14   0
6      0   0     1   1
33     0   0    10   1
8475   0   0  2824   1
268    0   1    90   0
False Positive:
       p1  p2    p3  p4
18982   0   0   441   0
17816   0   0  3129   0
16748   0   0   172   0
19558   0   0    84   0
16493   0   0   801   0
10001   0   1     0   0
19570   0   0  3085   0
18415   0   0  1311   0
16340   0   0   3

In [18]:
#test data
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_test, model.predict(x_test)))
print('Recall: ', recall_score(y_test, model.predict(x_test)))
print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
print('f1_score: ', f1_score(y_test, model.predict(x_test)))

---- PU Bagging ----
                        pred_negative        pred_positive 
           true_negative       2074.0          3.0 
           true_positive         10.0       1913.0 
None

Precision:  0.9984342379958246
Recall:  0.9947997919916797
Accuracy:  0.99675
f1_score:  0.9966137014847617


In [19]:
#print wrong predictions
y_test_pre = model.predict(x_test)
y_test_index = y_test.index.tolist()

FN_test_index = []
FT_test_index = []

for i in range(len(y_test)):
    if y_test.iloc[i] == 1 and y_test_pre[i] == 0 :
        FN_test_index.append(y_test_index[i])
    if y_test.iloc[i] == 0 and y_test_pre[i] == 1 :
        FT_test_index.append(y_test_index[i])

print("False Negtive:")
print(x_test.loc[FN_test_index])
print("False Positive:")
print(x_test.loc[FT_test_index])

False Negtive:
      p1  p2    p3  p4
5653   0   1  1885   0
9897   0   0  3298   1
5663   1   0  1888   0
5654   1   0  1885   0
36     0   0    11   1
5659   0   1  1887   0
107    1   0    36   0
43     0   1    15   0
37     0   1    13   0
95     1   0    32   0
False Positive:
       p1  p2    p3  p4
18980   0   0   220   0
18002   0   0    46   0
15587   0   0  2179   0


In [20]:
a = pd.read_csv('../data/onplacepos5000.csv')


a['label'] = a['label'].astype("int")
print(a.label.value_counts())
print('Has null values', a.isnull().values.any())

1    5000
Name: label, dtype: int64
Has null values False


In [23]:
x_data = a.iloc[:,:-1]
y_data = a.iloc[:,-1]
print(x_data)
print(y_data)

      p1  p2    p3  p4
0      0   0  8150   1
1      0   0  3388   1
2      0   1  3814   0
3      0   0  5003   1
4      1   0  6786   0
...   ..  ..   ...  ..
4995   0   0  4527   1
4996   0   1  4026   0
4997   1   0  9411   0
4998   0   0  6737   1
4999   0   1  9611   0

[5000 rows x 4 columns]
0       1
1       1
2       1
3       1
4       1
       ..
4995    1
4996    1
4997    1
4998    1
4999    1
Name: label, Length: 5000, dtype: int64


In [24]:
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_data, model.predict(x_data)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_data, model.predict(x_data)))
print('Recall: ', recall_score(y_data, model.predict(x_data)))
print('Accuracy: ', accuracy_score(y_data, model.predict(x_data)))
print('f1_score: ', f1_score(y_data, model.predict(x_data)))

---- PU Bagging ----
                        pred_negative        pred_positive 
           true_negative          0.0          0.0 
           true_positive       4230.0        770.0 
None

Precision:  1.0
Recall:  0.154
Accuracy:  0.154
f1_score:  0.2668977469670711


In [25]:
#print wrong predictions
y_data_pre = model.predict(x_data)
y_data_index = y_data.index.tolist()

FN_data_index = []
FT_data_index = []

for i in range(len(y_data)):
    if y_data.iloc[i] == 1 and y_data_pre[i] == 0 :
        FN_data_index.append(y_data_index[i])
    if y_data.iloc[i] == 0 and y_data_pre[i] == 1 :
        FT_data_index.append(y_data_index[i])

print("False Negtive:")
print(x_data.loc[FN_data_index])
print("False Positive:")
print(x_data.loc[FT_data_index])

False Negtive:
      p1  p2    p3  p4
0      0   0  8150   1
1      0   0  3388   1
2      0   1  3814   0
3      0   0  5003   1
4      1   0  6786   0
...   ..  ..   ...  ..
4995   0   0  4527   1
4996   0   1  4026   0
4997   1   0  9411   0
4998   0   0  6737   1
4999   0   1  9611   0

[4230 rows x 4 columns]
False Positive:
Empty DataFrame
Columns: [p1, p2, p3, p4]
Index: []
