In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Sampling imports
from numpy import genfromtxt
from sklearn.decomposition import PCA
from imblearn.over_sampling import ADASYN
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the entries of the ../input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Tick labels used by the class bar charts.
LABELS = [
    "Normal",
    "Fraud",
]

In [None]:
# Read the dataset and print its column dtypes / non-null counts.
DATA_PATH = "../input/creditcard.csv"
card_data = pd.read_csv(DATA_PATH)
card_data.info()


In [None]:
# Missing-value check: the result is False when no cell in the frame is null.
card_data.isna().values.any()

In [None]:
# Preview the first five rows of the raw data.
card_data.head(n=5)

In [None]:
# Show the column labels of the raw dataset.
card_data.columns

In [None]:
# Number of rows per value of the Class column.
class_counts = card_data["Class"].value_counts()
class_counts

In [None]:
# Bar chart of the (unbalanced) class distribution.
# value_counts() orders by frequency, not by class value; sort_index() pins the
# bars to class 0 then class 1 so the LABELS tick names always match the bars.
card_data['Class'].value_counts().sort_index().plot.bar()
plt.xticks(range(2), LABELS)
plt.title("Frequency by observation number")
plt.xlabel("Class")
plt.ylabel("Number of Observations")

In [None]:
# Project every column except Amount, Class and Time onto two principal
# components so the data can be drawn on a 2-D scatter plot.
# random_state makes the projection repeatable when scikit-learn selects its
# randomized SVD solver.
pca = PCA(n_components=2, random_state=7)
pca_features = card_data.drop(['Amount', 'Class', 'Time'], axis=1)
card_data_2d = pd.DataFrame(pca.fit_transform(pca_features))
card_data_2d.head()

In [None]:
# Attach the class label to the two principal components.
# Only the first two columns of card_data_2d are kept, so re-running this cell
# replaces the label column instead of appending another one (which would make
# the three-name column assignment in the next cell fail).
card_data_2d = pd.concat([card_data_2d.iloc[:, :2], card_data['Class']], axis=1)
card_data_2d.columns

In [None]:
# Name the two components and the label column for plotting.
PROJECTION_COLUMNS = ['x', 'y', 'class']
card_data_2d.columns = PROJECTION_COLUMNS

In [None]:
# Preview the first five rows of the 2-D projection.
card_data_2d.head(n=5)

In [None]:
# Scatter plot of the two components coloured by class; the regression line is disabled.
sns.lmplot(
    data=card_data_2d,
    x="x",
    y="y",
    hue="class",
    fit_reg=False,
)

In [None]:
# Preview the columns that remain after dropping Amount, Class and Time.
card_data.drop(columns=['Amount', 'Class', 'Time']).head()

In [None]:
# Oversample with ADASYN so both classes are roughly equally represented.
# - `fit_sample` was removed from imbalanced-learn; `fit_resample` is the supported name.
# - random_state makes the synthetic samples repeatable between runs.
# NOTE(review): resampling happens before the cross-validation split further
# down, so synthetic points derived from a fold's test rows can appear in that
# fold's training data — confirm this is intended.
ada = ADASYN(random_state=7)
features_to_resample = card_data.drop(['Amount', 'Class', 'Time'], axis=1)
X_resampled, y_resampled = ada.fit_resample(features_to_resample, card_data['Class'])

In [None]:
# Wrap the resampled labels in a one-column frame named 'class'.
labeled_y_resampled = pd.DataFrame({'class': y_resampled})

In [None]:
# Put the resampled features and their labels side by side, then show the shape.
features_resampled = pd.DataFrame(X_resampled)
card_data_oversampled = pd.concat(
    [features_resampled, labeled_y_resampled],
    axis=1,
)
card_data_oversampled.shape

In [None]:
# Preview the first five rows of the oversampled data.
card_data_oversampled.head(n=5)

In [None]:
## visual representation of the balanced dataset
# value_counts() orders by frequency. After oversampling the two counts are
# close but not guaranteed equal, so either class can come first; sort_index()
# fixes the bar order to class 0, class 1 so the LABELS tick names stay correct.
card_data_oversampled['class'].value_counts().sort_index().plot.bar()
plt.xticks(range(2), LABELS)
plt.title("Frequency by observation number")
plt.xlabel("Class")
plt.ylabel("Number of Observations")

In [None]:
## Feature matrix for training: the first 28 columns as a NumPy array
N_FEATURES = 28
X = card_data_oversampled.iloc[:, :N_FEATURES].values

In [None]:
# One-hot encode the class labels (the number of classes is inferred from the data).
y = card_data_oversampled['class']
Y = keras.utils.to_categorical(y)
Y

Experiment
Hypothesis: A decrease in the number of neurons in the hidden layer will adversely affect prediction accuracy.

In [None]:
# Stratified 10-fold cross-validation, shuffled with a fixed seed.
seed = 7
np.random.seed(seed)
kfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=seed,
)


In [None]:
# Control arm: the hidden layer has as many neurons as there are input parameters

In [None]:
# Accumulators for the control arm: the test accuracies gathered per fold, and
# the most recent Keras fit history.
results_control_evaluation_accuracy = []
baseline_history = None

In [None]:
# Control arm: 3 repeats x 10 stratified folds = 30 test-accuracy samples.
# The repeat index is also the shuffle seed, so any other arm that uses the
# same seeds is evaluated on identical folds (needed for a paired test).
N_REPEATS = 3
N_FOLDS = 10
N_FEATURES = 28
N_EPOCHS = 40
LEARNING_RATE = 0.04
CONTROL_HIDDEN_UNITS = 28  # same width as the input layer

# Start from an empty list so that re-running this cell yields exactly
# N_REPEATS * N_FOLDS samples instead of appending to earlier results.
results_control_evaluation_accuracy = []

# The one-hot targets do not depend on the fold, so encode them once.
control_targets = keras.utils.to_categorical(y, num_classes=2)

for repeat in range(N_REPEATS):
    seed = repeat
    np.random.seed(seed)
    kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    for fold, (train, test) in enumerate(kfold.split(X, y)):
        # Release the previous fold's model state; otherwise 30 models accumulate.
        keras.backend.clear_session()
        model = keras.Sequential()
        model.add(keras.layers.Dense(CONTROL_HIDDEN_UNITS, input_shape=(N_FEATURES,), activation='relu'))
        model.add(keras.layers.Dense(2, activation='softmax'))
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
            loss='categorical_crossentropy',
            metrics=['accuracy'],
        )
        if repeat == 0 and fold == 0:
            # The architecture is the same in every fold; print it once.
            model.summary()
        baseline_history = model.fit(X[train], control_targets[train], epochs=N_EPOCHS, verbose=0)
        test_loss, test_acc = model.evaluate(X[test], control_targets[test], verbose=0)
        results_control_evaluation_accuracy.append(test_acc)

In [None]:
# Display the test accuracies collected for the control arm.
results_control_evaluation_accuracy

Experimental Arm with reduced number of neurons in the hidden layer

In [None]:
# Accumulators for the experimental arm: the test accuracies gathered per fold,
# and the most recent Keras fit history.
results_experiment_evaluation_accuracy = []
experiment_history = None

In [None]:
# Experimental arm: 3 repeats x 10 stratified folds = 30 test-accuracy samples,
# with a narrower hidden layer. The repeat index is also the shuffle seed, so
# the folds are reproducible for a given repeat.
N_REPEATS = 3
N_FOLDS = 10
N_FEATURES = 28
N_EPOCHS = 40
LEARNING_RATE = 0.04
EXPERIMENT_HIDDEN_UNITS = 19  # fewer neurons than input features

# Start from an empty list so that re-running this cell yields exactly
# N_REPEATS * N_FOLDS samples instead of appending to earlier results.
results_experiment_evaluation_accuracy = []

# The one-hot targets do not depend on the fold, so encode them once.
experiment_targets = keras.utils.to_categorical(y, num_classes=2)

for repeat in range(N_REPEATS):
    seed = repeat
    np.random.seed(seed)
    kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    for fold, (train, test) in enumerate(kfold.split(X, y)):
        # Release the previous fold's model state; otherwise 30 models accumulate.
        keras.backend.clear_session()
        model = keras.Sequential()
        model.add(keras.layers.Dense(EXPERIMENT_HIDDEN_UNITS, input_shape=(N_FEATURES,), activation='relu'))
        model.add(keras.layers.Dense(2, activation='softmax'))
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
            loss='categorical_crossentropy',
            metrics=['accuracy'],
        )
        if repeat == 0 and fold == 0:
            # The architecture is the same in every fold; print it once.
            model.summary()
        experiment_history = model.fit(X[train], experiment_targets[train], epochs=N_EPOCHS, verbose=0)
        test_loss, test_acc = model.evaluate(X[test], experiment_targets[test], verbose=0)
        results_experiment_evaluation_accuracy.append(test_acc)

In [None]:
# Display the test accuracies collected for the experimental arm.
results_experiment_evaluation_accuracy

In [None]:
# Write each arm's accuracies to its own single-column CSV (no index column).
for csv_name, accuracies in (
    ('results_control_evaluation_accuracy.csv', results_control_evaluation_accuracy),
    ('results_experiment_evaluation_accuracy.csv', results_experiment_evaluation_accuracy),
):
    pd.DataFrame(accuracies).to_csv(csv_name, index=False)

RESULT ANALYSIS

In [None]:
# One single-column DataFrame per arm, used by the statistics below.
control, experiment = (
    pd.DataFrame(accuracies)
    for accuracies in (
        results_control_evaluation_accuracy,
        results_experiment_evaluation_accuracy,
    )
)

In [None]:
## Basic stats: mean test accuracy of each arm
mean_control_accuracy = control.mean()
mean_experimental_accuracy = experiment.mean()

print(f"Mean Control Accuracy: {mean_control_accuracy}")
print(f"Mean Experimental Accuracy: {mean_experimental_accuracy}")

In [None]:
## Spread of the results: standard deviation of test accuracy for each arm
std_control_accuracy = control.std()
std_experimental_accuracy = experiment.std()

print(f"Standard Deviation of Control Accuracy Results: {std_control_accuracy}")
print(f"Standard Deviation of Experimental Accuracy Results: {std_experimental_accuracy}")

In [None]:
# Both arms side by side, one column each, for plotting.
arm_frames = [control, experiment]
results_accuracy = pd.concat(arm_frames, axis=1)
results_accuracy.columns = ['Control', 'Experimental']
results_accuracy.head(n=5)

In [None]:
# Box plot of both arms' accuracies (one box per column of results_accuracy).
results_accuracy.boxplot()

In [None]:
# Same box plot with outlier markers hidden (showfliers=False).
results_accuracy.boxplot(showfliers=False)

In [None]:
from scipy import stats

# Wilcoxon signed-rank test on the two arms' accuracies.
# The test pairs the i-th control value with the i-th experimental value, so
# both columns must have the same length and ordering.
ALPHA = 0.05  # significance level
s, p = stats.wilcoxon(control[0], experiment[0])
print('Wilcoxon statistic = {}, p-value = {}'.format(s, p))

if p < ALPHA:
    print('null hypothesis rejected, significant difference between the data-sets')
else:
    # A non-significant result does not prove the null hypothesis; it only
    # means the data gave no evidence against it.
    print('failed to reject the null hypothesis, no significant difference detected between the data-sets')