## MetaCost

We wrap a LogisticRegression estimator with MetaCost to make it cost-sensitive.

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from metacost import MetaCost

In [2]:
# Load the data.
# Work on a 10,000-row subsample to speed up the computation. The sample is
# seeded so that the same rows are drawn every time the notebook is re-run
# on a fresh kernel.

data = pd.read_csv('../kdd2004.csv').sample(10000, random_state=0)

# remap the target class from {-1, 1} to {0, 1}
data['target'] = data['target'].map({-1: 0, 1: 1})

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
81830,32.46,43.24,-0.97,-37.0,25.5,1191.3,-0.61,-1.15,-13.5,-43.0,...,1071.7,-0.36,-0.1,-3.0,-36.0,164.9,0.49,0.37,0.48,0
62827,79.84,24.24,-1.05,-0.5,9.0,1146.8,-1.4,0.92,-11.5,-60.5,...,597.3,0.45,-0.65,4.0,-31.0,268.3,0.6,0.3,0.48,0
97905,53.64,21.59,0.64,39.0,12.0,1721.6,0.09,0.35,13.0,-62.5,...,1861.2,-0.11,-0.2,-3.0,-33.0,282.0,0.34,0.21,0.94,0
121824,96.8,21.49,0.35,22.5,31.5,1590.4,-0.25,-1.21,-16.5,-53.0,...,1387.6,0.04,-0.7,2.0,-60.0,147.6,1.21,0.23,0.5,0
34496,82.88,31.4,-0.47,-18.0,1.0,1056.3,-0.35,-0.59,-18.5,-54.0,...,1000.7,-0.01,0.79,-2.0,-58.0,389.9,1.03,0.17,-0.53,0


In [3]:
# fraction of observations in each class of the (imbalanced) target

data['target'].value_counts().div(len(data))

0    0.9902
1    0.0098
Name: target, dtype: float64

In [4]:
# hold out 30% of the observations as a test set

X = data.drop(columns='target')  # every column except the target
y = data['target']               # just the target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((7000, 74), (3000, 74))

In [5]:
# peek at the first rows of the training features (the target column was dropped above)
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
39753,39.63,27.91,2.3,36.5,65.0,3654.4,0.69,-0.28,-30.0,-83.0,...,29.0,3976.2,1.17,0.69,-14.0,-113.0,725.0,1.04,0.33,0.16
96146,82.14,22.83,-1.76,10.0,-18.0,555.9,0.0,-1.4,-8.5,-60.5,...,-16.0,272.4,0.5,1.26,9.0,-36.0,222.9,-0.67,0.27,0.18
97508,28.7,26.09,0.52,-2.5,8.5,1818.6,0.33,-1.22,2.0,-56.0,...,-26.0,2097.2,-0.86,0.89,-8.0,-92.0,312.9,1.52,0.19,-0.08
67899,40.83,24.72,0.82,0.5,52.5,2747.4,-0.13,1.44,11.0,-76.5,...,8.0,2904.3,1.07,-0.16,-1.0,-77.0,88.1,1.86,0.35,0.38
122761,47.15,27.56,0.29,28.5,7.5,1248.9,1.95,0.04,0.0,-110.0,...,39.0,4594.4,-2.13,-0.16,-15.0,-114.0,238.3,1.68,0.44,0.24


## Set up Logistic regression

In [6]:
# base estimator that will be handed to MetaCost as `estimator`

logit_params = dict(
    solver='newton-cg',
    penalty='l2',
    max_iter=10,
    n_jobs=4,
    random_state=0,
)

logit = LogisticRegression(**logit_params)

## MetaCost

With uniform misclassification costs (every error costs 1), i.e. without cost-sensitivity.

In [8]:
# uniform costs: 0 on the diagonal (correct predictions), 1 everywhere else
n_classes = 2
cost_matrix = np.ones((n_classes, n_classes), dtype=int) - np.eye(n_classes, dtype=int)
cost_matrix

array([[0, 1],
       [1, 0]])

In [9]:
# wrap the logistic regression with MetaCost, using the current cost_matrix
# NOTE(review): n_samples, p and q are options of the project-local MetaCost
# class; their exact meaning is not visible here - confirm in metacost.py
metacost_ = MetaCost(
    estimator=logit,
    cost_matrix=cost_matrix,
    n_estimators=50,
    n_samples=None,
    p=True,
    q=True,
)

In [13]:
# Fit the wrapper on the training data. The progress messages it prints
# outline the steps: train an ensemble on resampled data, re-assign the
# labels, then train the model on the data with the new labels.
metacost_.fit(X_train, y_train)

resampling data and training ensemble
Finished training ensemble
evaluating optimal class per observation
Finished re-assigning labels
Training model on new data
Finished training model on data with new labels


In [14]:
# predicted probabilities on the training set; the second column is the one
# used as the score for roc-auc further down
metacost_.predict_proba(X_train)

array([[1.00000000e+00, 2.84513073e-16],
       [9.74414360e-01, 2.55856402e-02],
       [9.99999956e-01, 4.37210081e-08],
       ...,
       [9.43762122e-01, 5.62378777e-02],
       [9.98731778e-01, 1.26822218e-03],
       [9.99989460e-01, 1.05403823e-05]])

In [15]:
# roc-auc of the MetaCost model on each split, scored with the
# probabilities in the second column of predict_proba
for split_name, X_split, y_split in [
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
]:
    print(split_name)
    pred = metacost_.predict_proba(X_split)
    score = roc_auc_score(y_split, pred[:, 1])
    print('MetaCost roc-auc: {}'.format(score))

Train set
MetaCost roc-auc: 0.9275945438799077
Test set
MetaCost roc-auc: 0.9089545289948787


## MetaCost

With costs

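The cost matrix follows this layout (rows: predicted class, columns: actual class):

| | actual 0 | actual 1 |
|---|---|---|
| **predicted 0** | TN | FN |
| **predicted 1** | FP | TP |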

In [16]:
# a false negative (top-right cell) costs 100 times more than a
# false positive (bottom-left cell); correct predictions cost nothing
fn_cost, fp_cost = 100, 1
cost_matrix = np.array([
    [0, fn_cost],
    [fp_cost, 0],
])
cost_matrix

array([[  0, 100],
       [  1,   0]])

In [17]:
# second wrapper around the same base estimator, this time picking up the
# asymmetric cost_matrix currently in scope
# NOTE(review): n_samples, p and q are options of the project-local MetaCost
# class; their exact meaning is not visible here - confirm in metacost.py
metacost2 = MetaCost(
    estimator=logit,
    cost_matrix=cost_matrix,
    n_estimators=50,
    n_samples=None,
    p=True,
    q=True,
)

In [18]:
# fit the cost-sensitive wrapper on the training data
metacost2.fit(X_train, y_train)

# roc-auc on each split, scored with the second column of predict_proba
for split_name, X_split, y_split in [
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
]:
    print(split_name)
    pred = metacost2.predict_proba(X_split)
    score = roc_auc_score(y_split, pred[:, 1])
    print('MetaCost roc-auc: {}'.format(score))

resampling data and training ensemble
Finished training ensemble
evaluating optimal class per observation
Finished re-assigning labels
Training model on new data
Finished training model on data with new labels
Train set
MetaCost roc-auc: 0.9567315402874005
Test set
MetaCost roc-auc: 0.9210594382080595


In [19]:
# y_train with its index reset to 0..n-1 (drop=True discards the original row labels)
y_train.reset_index(drop=True)

0       0
1       0
2       0
3       0
4       0
       ..
6995    0
6996    0
6997    0
6998    0
6999    0
Name: target, Length: 7000, dtype: int64

In [20]:
# put metacost2.y_ (presumably the labels re-assigned by MetaCost - confirm in
# metacost.py) side by side with the original training labels; the index of
# y_train is reset so both are joined on 0..n-1
original_labels = y_train.reset_index(drop=True)
tmp = pd.concat([metacost2.y_, original_labels], axis=1)

tmp.head(5)

Unnamed: 0,0,target
0,0,0
1,1,0
2,0,0
3,0,0
4,0,0


In [21]:
# rows where the MetaCost label (column 0) differs from the original target
label_changed = tmp[0] != tmp['target']
tmp.loc[label_changed, ['target', 0]]

Unnamed: 0,target,0
1,0,1
6,0,1
18,0,1
21,0,1
39,0,1
...,...,...
6979,0,1
6990,0,1
6991,0,1
6997,0,1


In theory, because a false negative is 100 times more costly than a false positive, MetaCost should only re-label observations from class 0 to class 1. In practice, that does not always happen.

In [22]:
# number of training observations whose label differs from the original one
(metacost2.y_ != y_train.reset_index(drop=True)).sum()

1201

In [23]:
# number of training observations whose label matches the original one
(metacost2.y_ == y_train.reset_index(drop=True)).sum()

5799

We can make a model cost-sensitive by wrapping it with MetaCost.