## Neural Networks for Class Imbalance with Keras

The Kaggle Credit Card Fraud Detection dataset can be downloaded here:
https://github.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/blob/master/creditcard.csv?raw=true

In [None]:
%%time
import pandas as pd
# Read the credit-card transactions CSV (expected in the working directory) into a DataFrame
data = pd.read_csv('creditcard.csv')

In [None]:
# Preview the first rows of the loaded table
data.head()

In [None]:
# (number of rows, number of columns) of the loaded table
data.shape

In [None]:
# Number of rows per distinct value of the 'Class' column (shows how imbalanced the labels are)
data['Class'].value_counts()

In [None]:
%%time
# Split data into train and test splits

from sklearn.model_selection import train_test_split

# Convert the table to a NumPy array.
# np.asarray is used instead of `data = data.values` so this cell can be
# re-run safely: on a second run `data` is already an ndarray (which has no
# `.values` attribute) and np.asarray simply returns it unchanged.
import numpy as np
data = np.asarray(data)

# Inputs: every column except the first and the last; target: the last column.
# NOTE(review): the 1:-1 slice drops column 0 — presumably the 'Time' column
# of the Kaggle file; confirm this is intentional.
X, y = data[:, 1:-1], data[:, -1]

# Hold out 33% of the rows for testing. `stratify=y` keeps the class ratio of
# `y` in both splits; `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
# Print a (class value, sample count) table, first for the full label vector
# and then for the test split.

import numpy as np

for labels in (y, y_test):
    # np.unique returns the distinct labels and how often each one occurs
    unique, counts = np.unique(labels, return_counts=True)
    # Stack the two rows and transpose so each printed row is [label, count]
    print(np.asarray((unique, counts)).T)

### The class weighting can be defined in multiple ways; for example:

* Domain expertise, determined by talking to subject matter experts
* Tuning, determined by a hyperparameter search such as a grid search
* Heuristic, specified using a general best practice
* A best practice for class weighting is to use the inverse of the class distribution present in the training dataset

In [None]:
# Heuristic ("balanced") class weighting, computed from the training labels only
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# `classes` and `y` are passed by keyword: current scikit-learn releases make
# them keyword-only, so the positional form raises a TypeError. `classes` is
# given as an ndarray, which is the type the function documents and validates.
# The returned array is ordered like `classes`: index 0 -> class 0, index 1 -> class 1.
weighting = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)
print(weighting)

### For Neural Networks:

A large error weighting can be applied to those examples in the minority class as they are often more important in an imbalanced classification problem than examples from the majority class.

* *Large Weight:* Assigned to examples from the minority class.
* *Small Weight:* Assigned to examples from the majority class.

### This cell can take a long time to run

In [None]:
%%time
from sklearn.metrics import roc_auc_score
from keras.layers import Dense
from keras.models import Sequential
from sklearn.metrics import roc_auc_score

# Per-class loss weights keyed by class label, taken from the `weighting`
# array computed in the class-weighting cell
weights = {label: weighting[label] for label in (0, 1)}
print(weights)

# Input width = number of feature columns in the training matrix
n_input = X_train.shape[1]

# Network: one ReLU hidden layer with 10 units (He-uniform initialised)
# feeding a single sigmoid output unit
model = Sequential([
    Dense(10, input_dim=n_input, activation='relu', kernel_initializer='he_uniform'),
    Dense(1, activation='sigmoid'),
])
# Binary cross-entropy loss with the 'sgd' optimizer at its default settings
model.compile(optimizer='sgd', loss='binary_crossentropy')

# Train with the class weights applied to the loss.
# To compare against an unweighted baseline, pass class_weight=None instead.
model.fit(X_train, y_train, epochs=100, verbose=0, class_weight=weights)

# Score the held-out split: predicted probabilities -> ROC AUC
y_probs = model.predict(X_test)
auc = roc_auc_score(y_test, y_probs)

# Report the result
print(' ROC AUC = %.3f' % auc)

### Use Synthetic Data

In [None]:
%%time
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from keras.layers import Dense
from keras.models import Sequential

# Build a synthetic imbalanced dataset and cut it into train/test halves
def prepare_data(n_samples=1000):
    """Generate a 2-feature imbalanced classification set and split it in half.

    `weights=[0.99]` asks make_classification to put roughly 99% of the
    samples in the first class; `flip_y=0` disables label noise and the
    fixed `random_state` makes the data reproducible.

    Returns the tuple (train inputs, train labels, test inputs, test labels),
    where the training part is the first `n_samples // 2` generated rows and
    the test part is the remainder.
    """
    features, labels = make_classification(
        n_samples=n_samples,
        n_features=2,
        n_redundant=0,
        n_clusters_per_class=2,
        weights=[0.99],
        flip_y=0,
        random_state=4,
    )
    # Positional split: rows before `cut` train, rows from `cut` onward test
    cut = n_samples // 2
    return features[:cut, :], labels[:cut], features[cut:, :], labels[cut:]

# Build and compile the small binary classifier used in this notebook
def define_model(n_input):
    """Return a compiled Keras model for binary classification.

    The network takes `n_input` features, has one hidden layer of 10 ReLU
    units (He-uniform initialised) and a single sigmoid output unit. It is
    compiled with binary cross-entropy loss and the 'sgd' optimizer.
    """
    hidden = Dense(10, input_dim=n_input, activation='relu', kernel_initializer='he_uniform')
    output = Dense(1, activation='sigmoid')
    network = Sequential([hidden, output])
    network.compile(optimizer='sgd', loss='binary_crossentropy')
    return network

# Generate the synthetic train/test halves
X_train, y_train, X_test, y_test = prepare_data()

# Build the model sized to the number of feature columns
n_input = X_train.shape[1]
model = define_model(n_input)

# Balanced class weighting computed from the training labels.
# `classes` and `y` are passed by keyword: current scikit-learn releases make
# them keyword-only, so the positional form raises a TypeError. `classes` is
# given as an ndarray, which is the type the function documents and validates.
import numpy as np
weighting = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)

# Map each class label to its weight (the result is ordered like `classes`)
weights = {0: weighting[0], 1: weighting[1]}
print(weights)

# Train with the class weights applied to the loss
history = model.fit(X_train, y_train, class_weight=weights, epochs=100, verbose=0)

# Evaluate on the held-out half: predicted probabilities -> ROC AUC
yhat = model.predict(X_test)
score = roc_auc_score(y_test, yhat)
print('ROC AUC: %.3f' % score)

In [None]:
# Display the balanced class weights for the current training labels.
# Keyword arguments are required by current scikit-learn releases, and
# `classes` is passed as an ndarray as the function documents.
import numpy as np
compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)