In [1]:
import time
import sklearn
import numpy as np
import pandas as pd

import sys
sys.path.append("..")
from baggingPU import BaggingClassifierPU

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):   
    columnwidth = max([len(x) for x in labels]) + 4
    empty_cell = " " * columnwidth
    print("    " + empty_cell, end=' ')
    for label in labels:
        print("%{0}s".format(columnwidth) % 'pred_' + label, end=" ")
    print()

    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % 'true_' + label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            if cell:
                print(cell, end=" ")
        print()

# import data

In [3]:
IO="../data/w-dependence.xlsx"

data_pos = pd.read_excel(io = IO)
data_pos.to_csv('../data/w-dependence.csv',index=False)

df_raw = pd.read_csv('../data/w-dependence.csv')
df_raw['label'] = df_raw['label'].astype("int")
print(df_raw.label.value_counts())
print('Has null values', df_raw.isnull().values.any())

1    10000
0    10000
Name: label, dtype: int64
Has null values False


In [4]:
df_raw.head(10)

Unnamed: 0,p1,p2,p3,label
0,0,0,1,1
1,1,0,1,1
2,0,1,1,1
3,2,0,1,1
4,1,1,1,1
5,0,1,0,1
6,0,2,1,1
7,3,0,1,1
8,2,1,1,1
9,1,1,0,1


In [5]:
def random_undersampling(tmp_df, TARGET_LABEL):
    df_majority = tmp_df[tmp_df[TARGET_LABEL] == 0]
    df_minority = tmp_df[tmp_df[TARGET_LABEL] == 1]

    df_majority_downsampled = resample(df_majority, 
                                       replace=False,              
                                       n_samples=len(df_minority), 
                                       random_state=None)        

    df_downsampled = pd.concat([df_majority_downsampled, df_minority])

    print("Undersampling complete!")
    print(df_downsampled[TARGET_LABEL].value_counts())
    return df_downsampled

In [6]:
df_downsampled = random_undersampling(df_raw, 'label')
df_downsampled = df_downsampled.sample(frac=1) 
df_downsampled = df_downsampled.reset_index() 
df_downsampled = df_downsampled.drop(columns=['index']) 

Undersampling complete!
1    10000
0    10000
Name: label, dtype: int64


In [7]:
df_downsampled.head(10)

Unnamed: 0,p1,p2,p3,label
0,922,344,231,0
1,182,206,754,0
2,295,69,150,0
3,58,16,1,1
4,34,21,0,1
5,0,43,0,1
6,438,516,59,0
7,238,267,699,0
8,754,629,28,0
9,938,593,3,0


In [8]:
x_data = df_raw.iloc[:,:-1]
y_data = df_raw.iloc[:,-1]


x_train_input = df_raw.iloc[:12000,:]
x_test_input = df_raw.iloc[12001:,:]

x_train = df_raw.iloc[:12000,:-1]
y_train = df_raw.iloc[:12000,-1]

x_test = df_raw.iloc[12001:,:-1]
y_test = df_raw.iloc[12001:,-1]
print(y_test)


print(len(x_train_input))
print(len(x_test_input))

print(x_train_input.label.value_counts())
print('Has null values', x_train_input.isnull().values.any())

print(x_test_input.label.value_counts())
print('Has null values', x_test_input.isnull().values.any())

12001    0
12002    0
12003    0
12004    0
12005    0
        ..
19995    0
19996    0
19997    0
19998    0
19999    0
Name: label, Length: 7999, dtype: int64
12000
7999
1    10000
0     2000
Name: label, dtype: int64
Has null values False
0    7999
Name: label, dtype: int64
Has null values False


In [9]:
x_train_input.head(10)

Unnamed: 0,p1,p2,p3,label
0,0,0,1,1
1,1,0,1,1
2,0,1,1,1
3,2,0,1,1
4,1,1,1,1
5,0,1,0,1
6,0,2,1,1
7,3,0,1,1
8,2,1,1,1
9,1,1,0,1


In [10]:
df = x_train_input.copy()

NON_LBL = [c for c in df.columns if c != 'label']
X = df[NON_LBL]
y = df['label']

y_orig = y.copy()

hidden_size = 5000
y.loc[
    np.random.choice(
        y[y == 1].index, 
        replace = False, 
        size = hidden_size
    )
] = 0

In [11]:
pd.Series(y).value_counts()

0    7000
1    5000
Name: label, dtype: int64

In [12]:
print('- %d samples and %d features' % (X.shape))
print('- %d positive out of %d total before hiding labels' % (sum(df_downsampled.label), len(df_downsampled.label)))
print('- %d positive out of %d total after hiding labels' % (sum(y), len(y)))

- 12000 samples and 3 features
- 10000 positive out of 20000 total before hiding labels
- 5000 positive out of 12000 total after hiding labels


# Trainning directly

In [13]:
print('Training MLP model ...')

from sklearn.neural_network import MLPClassifier

model = MLPClassifier(solver='adam',alpha=1e-5,hidden_layer_sizes=(10,5))

model.fit(X, y)

print('Done')

Training MLP model ...
Done


In [14]:
print('---- {} ----'.format('MLP model'))


print(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)))

print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, model.predict(X)))
print('Recall: ', recall_score(y_orig, model.predict(X)))
print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
print('f1_score: ', f1_score(y_orig, model.predict(X)))

---- MLP model ----
[[1996    4]
 [3292 6708]]
                        pred_negative        pred_positive 
           true_negative 

NameError: name 'new_cm' is not defined

# Training by bagging

In [None]:
print('Training bagging classifier...')

pu_start = time.perf_counter()
model = BaggingClassifierPU(MLPClassifier(solver='adam',alpha=1e-5,hidden_layer_sizes=(10,5)),
                         n_estimators = 50, 
                         n_jobs = -1, 
                         max_samples = sum(y)  
                        )
model.fit(X, y)
pu_end = time.perf_counter()
print('Done!')
print('Time:', pu_end - pu_start)

In [None]:
#train data
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig, model.predict(X)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig, model.predict(X)))
print('Recall: ', recall_score(y_orig, model.predict(X)))
print('Accuracy: ', accuracy_score(y_orig, model.predict(X)))
print('f1_score: ', f1_score(y_orig, model.predict(X)))

In [None]:
#test data
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_test, model.predict(x_test)))
print('Recall: ', recall_score(y_test, model.predict(x_test)))
print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
print('f1_score: ', f1_score(y_test, model.predict(x_test)))