In [1]:
import time
import sklearn
import numpy as np
import pandas as pd

from sklearn.utils import resample

import sys
sys.path.append("..")
from baggingPU import BaggingClassifierPU

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    columnwidth = max([len(x) for x in labels]) + 4
    empty_cell = " " * columnwidth
    print("    " + empty_cell, end=' ')
    for label in labels:
        print("%{0}s".format(columnwidth) % 'pred_' + label, end=" ")
    print()

    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % 'true_' + label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            if cell:
                print(cell, end=" ")
        print()

# import data

In [3]:
#df_raw = pd.read_csv('../data/w-dependence.csv')

df_raw = pd.read_csv('../data/1place-independence.csv')

#df_raw = pd.read_csv('../data/w-related.csv')


df_raw['label'] = df_raw['label'].astype("int")
print(df_raw.label.value_counts())
print('Has null values', df_raw.isnull().values.any())

0    10000
1    10000
Name: label, dtype: int64
Has null values False


In [4]:
df_raw.head(10)

Unnamed: 0,p1,p2,p3,p4,label
0,1,0,0,0,1
1,0,1,1,0,1
2,1,0,1,0,1
3,0,0,0,1,1
4,0,1,2,0,1
5,1,0,2,0,1
6,0,0,1,1,1
7,0,1,3,0,1
8,1,0,3,0,1
9,0,0,2,1,1


In [5]:
def random_undersampling(tmp_df, TARGET_LABEL):
    df_majority = tmp_df[tmp_df[TARGET_LABEL] == 0]
    df_minority = tmp_df[tmp_df[TARGET_LABEL] == 1]

    df_majority_downsampled = resample(df_majority, 
                                       replace=False,              
                                       n_samples=len(df_minority), 
                                       random_state=None)        

    df_downsampled = pd.concat([df_majority_downsampled, df_minority])

    print("Undersampling complete!")
    print(df_downsampled[TARGET_LABEL].value_counts())
    return df_downsampled

In [6]:
df_downsampled = random_undersampling(df_raw, 'label')
df_downsampled = df_downsampled.sample(frac=1) 
df_downsampled = df_downsampled.reset_index() 
df_downsampled = df_downsampled.drop(columns=['index']) 

Undersampling complete!
0    10000
1    10000
Name: label, dtype: int64


In [7]:
df_downsampled.head(10)

Unnamed: 0,p1,p2,p3,p4,label
0,0,0,957,1,1
1,2156,836,950,1865,0
2,1,0,1154,0,1
3,0,1,1130,0,1
4,1,0,1162,1,0
5,1,1,37,1,0
6,0,0,1946,1,1
7,1,1,2323,1,0
8,0,0,2958,1,1
9,2442,10,964,450,0


In [8]:
x_data = df_raw.iloc[:,:-1]
y_data = df_raw.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

x_train_input = pd.concat([x_train,y_train],axis=1)
x_test_input = pd.concat([x_test,y_test],axis=1)

print(len(x_train_input))
print(len(x_test_input))

print(x_train_input.label.value_counts())
print('Has null values', x_train_input.isnull().values.any())

print(x_test_input.label.value_counts())
print('Has null values', x_test_input.isnull().values.any())

16000
4000
1    8077
0    7923
Name: label, dtype: int64
Has null values False
0    2077
1    1923
Name: label, dtype: int64
Has null values False


In [9]:
x_train_input.head(10)

Unnamed: 0,p1,p2,p3,p4,label
8118,0,0,2705,1,1
10171,0,0,2141,0,0
9399,0,0,3132,1,1
12669,1,1,2336,0,0
18809,1457,3115,942,1111,0
13869,0,0,1609,0,0
3320,1,0,1107,0,1
14689,1,1,1319,0,0
13087,1,1,2692,1,0
15992,515,833,2507,1884,0


In [10]:
df = x_train_input.copy()

NON_LBL = [c for c in df.columns if c != 'label']
X = df[NON_LBL]
y = df['label']

y_orig = y.copy()

hidden_size = 5000
y.loc[
    np.random.choice(
        y[y == 1].index, 
        replace = False, 
        size = hidden_size
    )
] = 0

In [11]:
pd.Series(y).value_counts()

0    12923
1     3077
Name: label, dtype: int64

In [12]:
print('- %d samples and %d features' % (X.shape))
print('- %d positive out of %d total before hiding labels' % (sum(df_downsampled.label), len(df_downsampled.label)))
print('- %d positive out of %d total after hiding labels' % (sum(y), len(y)))

- 16000 samples and 4 features
- 10000 positive out of 20000 total before hiding labels
- 3077 positive out of 16000 total after hiding labels


# Trainning directly

In [13]:
X_FCM = np.array(X.values.tolist())
y_FCM = np.array(y.values.tolist())
y_orig_FCM = np.array(y_orig.values.tolist())
print(X_FCM)
print(X_FCM.shape)
print(y_FCM)
print(y_orig_FCM)

[[   0    0 2705    1]
 [   0    0 2141    0]
 [   0    0 3132    1]
 ...
 [   1    0 1900    0]
 [   0    1 2930    1]
 [2113  604 1171 1381]]
(16000, 4)
[1 0 1 ... 1 0 0]
[1 0 1 ... 1 0 0]


In [14]:
import logging
from FCM import FCM

model = FCM(n_clusters=2)
model.set_logger(tostdout=True, level=logging.DEBUG)
model.fit(X_FCM, y_FCM)

print('Done')
print('Training FCM model ...')

set_membership_from_hard_cluster > membership: 
[[0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
num of points: 16000
num of features: 4
set_membership_from_hard_cluster > cluster centers: 
[array([ 516.48409812,  521.9837499 , 1661.99620831,  516.34125203]), array([3.29541761e-01, 3.40916477e-01, 1.64703672e+03, 3.29541761e-01])]
updated membership is: 
[[0.37161621 0.62838379 0.         ... 0.         0.         0.        ]
 [0.19073295 0.80926705 0.         ... 0.         0.         0.        ]
 [0.42645335 0.57354665 0.         ... 0.         0.         0.        ]
 ...
 [0.06914696 0.93085304 0.         ... 0.         0.         0.        ]
 [0.40565888 0.59434112 0.         ... 0.         0.         0.        ]
 [0.66260633 0.33739367 0.         ... 0.         0.         0.        ]]
updated membership is: 
[[0.37161621 0.62838379 0.         ... 0.         0.         0.     

Done
Training FCM model ...


In [15]:
model.predict_proba(X_FCM)
print(model.predict_proba(X_FCM))

[[0.37161621 0.62838379]
 [0.19073295 0.80926705]
 [0.42645335 0.57354665]
 ...
 [0.06914696 0.93085304]
 [0.40565888 0.59434112]
 [0.66260633 0.33739367]]


In [16]:
model.predict(X_FCM)
print(model.predict(X_FCM))

[1 1 1 ... 1 1 0]


In [17]:
print('---- {} ----'.format('FCM'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig_FCM, model.predict(X_FCM)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig_FCM, model.predict(X_FCM)))
print('Recall: ', recall_score(y_orig_FCM, model.predict(X_FCM)))
print('Accuracy: ', accuracy_score(y_orig_FCM, model.predict(X_FCM)))
print('f1_score: ', f1_score(y_orig_FCM, model.predict(X_FCM)))

---- FCM ----
                        pred_negative        pred_positive 
           true_negative       3985.0       3938.0 
           true_positive          0.0       8077.0 
None

Precision:  0.6722430295464004
Recall:  1.0
Accuracy:  0.753875
f1_score:  0.804001592673701


# Training by bagging

In [26]:
print('Training bagging classifier...')

pu_start = time.perf_counter()

fcm = FCM(n_clusters=2)
fcm.set_logger(tostdout=True, level=logging.DEBUG)

model = BaggingClassifierPU(fcm,
                         n_estimators = 50, 
                         n_jobs = -1, 
                         max_samples = sum(y_FCM)  
                        )

model.fit(X_FCM, y_FCM)
pu_end = time.perf_counter()
print('Done!')
print('Time:', pu_end - pu_start)

Training bagging classifier...
Done!
Time: 281.1064476361498


In [28]:
model.predict_proba(X_FCM)
print(model.predict_proba(X_FCM))

[[0.57980809 0.97727814]
 [0.29809688 1.25898935]
 [0.66496352 0.89212271]
 ...
 [0.10818309 1.44890314]
 [0.63269164 0.92439459]
 [1.03122586 0.52586037]]


In [29]:
model.predict(X_FCM)
print(model.predict(X_FCM))

[1 1 1 ... 1 1 0]


In [27]:
#train data
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_orig_FCM, model.predict(X_FCM)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_orig_FCM, model.predict(X_FCM)))
print('Recall: ', recall_score(y_orig_FCM, model.predict(X_FCM)))
print('Accuracy: ', accuracy_score(y_orig_FCM, model.predict(X_FCM)))
print('f1_score: ', f1_score(y_orig_FCM, model.predict(X_FCM)))

---- PU Bagging ----
                        pred_negative        pred_positive 
           true_negative       3985.0       3938.0 
           true_positive          0.0       8077.0 
None

Precision:  0.6722430295464004
Recall:  1.0
Accuracy:  0.753875
f1_score:  0.804001592673701


In [None]:
#print wrong predictions
y_pre = model.predict(X_FCM)
y_orig_index = y_orig.index.tolist()

FN_index = []
FT_index = []

for i in range(len(y_orig)):
    if y_orig.iloc[i] == 1 and y_pre[i] == 0 :
        FN_index.append(y_orig_index[i])
    if y_orig.iloc[i] == 0 and y_pre[i] == 1 :
        FT_index.append(y_orig_index[i])
        
print("False Negtive:")
print(X.loc[FN_index])
print("False Positive:")
print(X.loc[FT_index])

In [32]:
#test data
print('---- {} ----'.format('PU Bagging'))
print(print_cm(sklearn.metrics.confusion_matrix(y_test, model.predict(x_test)), labels=['negative', 'positive']))
print('')
print('Precision: ', precision_score(y_test, model.predict(x_test)))
print('Recall: ', recall_score(y_test, model.predict(x_test)))
print('Accuracy: ', accuracy_score(y_test, model.predict(x_test)))
print('f1_score: ', f1_score(y_test, model.predict(x_test)))

---- PU Bagging ----
                        pred_negative        pred_positive 
           true_negative       2070.0          7.0 
           true_positive        352.0       1571.0 
None

Precision:  0.9955640050697085
Recall:  0.8169526781071242
Accuracy:  0.91025
f1_score:  0.8974578691802342


In [None]:
#print wrong predictions
y_test_pre = model.predict(x_test)
y_test_index = y_test.index.tolist()

FN_test_index = []
FT_test_index = []

for i in range(len(y_test)):
    if y_test.iloc[i] == 1 and y_test_pre[i] == 0 :
        FN_test_index.append(y_test_index[i])
    if y_test.iloc[i] == 0 and y_test_pre[i] == 1 :
        FT_test_index.append(y_test_index[i])

print("False Negtive:")
print(x_test.loc[FN_test_index])
print("False Positive:")
print(x_test.loc[FT_test_index])