In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

#Visualisation libraries
import seaborn as sns

#Oversampling libraries
from numpy import genfromtxt
from sklearn.decomposition import PCA #dimensionality reduction
from imblearn.over_sampling import ADASYN #Oversampling 


import matplotlib.pyplot as plt #plotting data

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
card_data = pd.read_csv("../input/creditcardfraud/creditcard.csv")
card_data.head()

**Data Statistics**

In [None]:
card_data.describe()

Fraud statistics

In [None]:
card_data[card_data.Amount==0]

In [None]:
card_data.Amount[card_data.Class == 1].describe()

Non fraud statistics

In [None]:
card_data.Amount[card_data.Class == 0].describe()

Very similar, but the max is different and the mean is smaller. Yet the data is highly imbalanced: 492 fraud transactions and 284315 non-fraud transactions.

In [None]:
#Fraud
card_data[card_data.Class == 1].describe()

In [None]:
#Genuine
card_data[card_data.Class == 0].describe()

In [None]:
# Parallel-coordinates view of 1000 random transactions, coloured by class.
# The sample is seeded so the same rows (and therefore the same plot) come back on every run.
pd.plotting.parallel_coordinates(card_data.sample(1000, random_state=0), "Class")

In [None]:
# Copy of the raw frame without the "Time" and "Amount" columns; card_data itself is left untouched.
card_data_no_time_and_amount = card_data.drop(["Time", "Amount"], axis=1)

In [None]:
# Same plot without Time and Amount; green/magenta are the two class colours.
# Seeded sample so the figure is reproducible.
pd.plotting.parallel_coordinates(card_data_no_time_and_amount.sample(1000, random_state=0), "Class", color=['g','m'])

In [None]:
#Check for null values, which can cause problems later
card_data.isnull().sum()

**2. Plotting the data**

In [None]:
# Two stacked histograms of transaction time (one per class) sharing the x axis,
# so the time distribution of fraud and non-fraud can be compared directly.
graph, (fraud_gr, non_fraud_gr) = plt.subplots(2, 1, sharex=True, figsize=(20,6))

cats = 60 # number of categories/bars

fraud_gr.hist(card_data.Time[card_data.Class == 1], cats)
fraud_gr.set_title('Fraud')
# Label each panel explicitly: plt.ylabel would only reach the last-created axes.
fraud_gr.set_ylabel('Transaction count')

non_fraud_gr.hist(card_data.Time[card_data.Class == 0], cats)
non_fraud_gr.set_title('Non-Fraud')
non_fraud_gr.set_ylabel('Transaction count')

# The x axis is shared, so labelling the bottom panel is enough.
non_fraud_gr.set_xlabel('Time (sec)')
plt.show()

In [None]:
import seaborn as sns

In [None]:
matrix_correlation = card_data.corr()
fig = plt.figure(figsize = (12, 9))

sns.heatmap(matrix_correlation, vmax = .8, square = True)
plt.show()

In [None]:
# Pairwise scatter matrix on a seeded 1000-row sample so the figure is reproducible.
sns.pairplot(card_data.sample(1000, random_state=0), hue="Class")

In [None]:
# Andrews curves on a seeded 10000-row sample (green/red are the two class colours).
pd.plotting.andrews_curves(card_data_no_time_and_amount.sample(10000, random_state=0), "Class", color=['g','r'])

In [None]:
card_data.keys()

**Preparing for Oversampling**

In [None]:
from  sklearn.model_selection import train_test_split
import keras
def data_preparation(x):
    """Split a frame into train and test partitions with one-hot labels.

    Every column except "Class" is used as a feature. The "Class" column is
    one-hot encoded with keras.utils.to_categorical. 20% of the rows are held
    out for testing, with random_state fixed to 0.

    Returns the tuple (X_train, X_test, y_train, y_test).
    """
    features = x.drop("Class", axis=1)
    one_hot_labels = keras.utils.to_categorical(x[["Class"]], num_classes=None)
    # train_test_split hands back a list; callers receive a tuple.
    return tuple(train_test_split(features, one_hot_labels, test_size=0.2, random_state=0))


In [None]:
data = card_data

I use the variable *data* to operate on from now on, because it boosts coding efficiency

In [None]:
# Build `data` as a new frame without "Time" instead of dropping in place.
# An in-place drop would also remove the column from card_data (both names point at
# the same frame) and would raise a KeyError if this cell were run a second time.
data = card_data.drop(["Time"], axis=1)
data.head()

In [None]:
data.columns[:-1]

*Time* is dropped because, if it were considered, the whole structure of the classifier would have to be adjusted for time series. The data is not time-centered, so I decided to leave time out.

In [None]:
data['Class'].value_counts()

In [None]:
X_train, X_test, y_train, y_test=data_preparation(data)

The test sample size is 20%, as suggested in "*Recurrent Neural Networks and Robust Time Series Prediction*" by 
J. T. Connor, R. Martin, and L. E. Atlas. A 30% split was also considered, but it gave worse results, so the 80/20 split was chosen.

In [None]:
# Project the frame onto its first two principal components so it can be drawn as a 2-D scatter.
# NOTE(review): `data` still contains the "Class" column here, so the label itself takes part
# in the projection — confirm this is intended.
pca = PCA(n_components=2)
data_2d = pd.DataFrame(pca.fit_transform(data))

In [None]:
data_2d

In [None]:
# Attach the class label to the two PCA components so the scatter can be coloured by class.
# Only the first two columns of data_2d are used, which keeps the cell re-runnable:
# without the slice a second run would append another "Class" column and the rename would fail.
data_2d = pd.concat([data_2d.iloc[:, :2], data['Class']], axis=1)
data_2d.columns = ['x', 'y', 'Class']

In [None]:
sns.lmplot(x="x",y="y",data=data_2d, fit_reg=False, hue='Class')

**Oversampling**

In [None]:
# Seeded so the synthetic minority samples are identical on every run.
ada = ADASYN(random_state=0)
# imbalanced-learn renamed `fit_sample` to `fit_resample` and newer releases no longer
# provide the old name; call whichever one the installed version offers.
if hasattr(ada, "fit_resample"):
    X_resampled, y_resampled = ada.fit_resample(X_train, y_train)
else:
    X_resampled, y_resampled = ada.fit_sample(X_train, y_train)

In [None]:
# y_train is one-hot encoded, so counting its raw values would tally every 0 and every 1
# in the matrix (one of each per row) and always report equal counts.
# Decode each row to its class id first to see the real class distribution.
unique, counts = np.unique(np.argmax(y_train, axis=1), return_counts=True)
dict(zip(unique, counts))

In [None]:
# Put the resampled features and labels side by side in one frame and reuse the column names of `data`.
# NOTE(review): the rename only works if y_resampled contributes exactly one column
# (features + one label column == len(data.columns)) — confirm for the installed imbalanced-learn version.
data_oversampled = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)
data_oversampled.columns = data.columns

In [None]:
data_oversampled["Class"].value_counts()

In [None]:
# Project the oversampled frame with the already-fitted PCA (transform only, no refit)
# and attach the class label for colouring.
data_2d_oversampled = pd.concat(
    [pd.DataFrame(pca.transform(data_oversampled)), data_oversampled['Class']],
    axis=1,
)
data_2d_oversampled.columns = ['x', 'y', 'Class']

**Display the oversampled data frame**

In [None]:
data_2d_oversampled

In [None]:
sns.lmplot(x="x",y="y",data=data_2d_oversampled, fit_reg=False, hue='Class')

Looks similar enough, so the oversampling was successful!

**Design of MLP**

In [None]:
y= data_oversampled['Class'].astype('category').cat.codes

In [None]:
Y = keras.utils.to_categorical(y,num_classes=None)

In [None]:
X= data_oversampled.iloc[:,0:29].values

In [None]:
# 70/30 train/validation split of the oversampled features X and one-hot labels Y (fixed random_state).
# NOTE(review): this rebinds X_train and y_train, which until now held the pre-oversampling
# training split returned by data_preparation.
X_train, X_val, y_train, y_val = train_test_split(X,Y,test_size=0.3,random_state=0)

**Undersampling to verify results**

In [None]:
from numpy import argmax

# Decode the one-hot test labels back to class ids and report how many rows each class has.
test_set_class = argmax(y_test, axis=1)
unique, counts = np.unique(test_set_class, return_counts=True)
print(dict(zip(unique, counts)))

# Rebuild the test features on a fresh 0..n-1 index so they line up row by row with the decoded labels.
test_set_data = pd.DataFrame(X_test.values, columns=X_test.columns)
test_set_class_df = pd.DataFrame({'Class': test_set_class})
frames = [test_set_data, test_set_class_df]
test_set = pd.concat(frames, axis=1)

In [None]:
test_set.head()

In [None]:
# Partition the rebuilt test frame by class, keeping both the rows and their index labels.
data_fraud = test_set[test_set["Class"] == 1]
data_non_fraud = test_set[test_set["Class"] == 0]
fraud_indices = np.array(data_fraud.index)
non_fraud_indices = np.array(data_non_fraud.index)

In [None]:
from sklearn.utils import resample
# Draw (without replacement) as many non-fraud rows as there are fraud rows to get a balanced set.
# Seeded so the same rows are selected on every run.
data_non_fraud_downsampled = resample(data_non_fraud, replace=False, n_samples=len(fraud_indices), random_state=0)
data_downsampled = pd.concat([data_non_fraud_downsampled, data_fraud])

In [None]:
data_downsampled["Class"].value_counts()

In [None]:
# Split the balanced (undersampled) frame into feature columns and one-hot encoded labels.
test_set_downsampled_data = data_downsampled.drop("Class", axis=1)
test_set_downsampled_labels = data_downsampled[["Class"]]
test_set_downsampled_labels_norm = keras.utils.to_categorical(test_set_downsampled_labels, num_classes=None)


In [None]:
X_test_downsampled= test_set_downsampled_data
y_test_downsampled= test_set_downsampled_labels_norm

**Custom metrics**

In [None]:
from keras import backend as K

def f1(y_true, y_pred):
    """F1 score built from Keras backend ops, usable as a Keras metric.

    Values are clipped to [0, 1] and rounded before counting, and the counts are
    summed over every element of the inputs. K.epsilon() is added to each
    denominator to avoid division by zero.
    """
    eps = K.epsilon()

    # Elements that are positive in both the target and the rounded prediction.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = true_positives / (predicted_positives + eps)
    recall_value = true_positives / (actual_positives + eps)

    # Harmonic mean of precision and recall.
    return 2 * ((precision_value * recall_value) / (precision_value + recall_value + eps))

**Test model**

In [None]:
# Trial MLP: 29 inputs -> 29 ReLU units -> 3 tanh units -> 2-way softmax.
model = keras.Sequential()
model.add(keras.layers.Dense(29, input_shape=(29,), activation='relu'))
# Only the first layer declares an input shape; later layers take theirs from the previous
# layer's output. The old input_shape=(3,) here did not match the 29 units feeding this layer.
model.add(keras.layers.Dense(3, activation='tanh'))
model.add(keras.layers.Dense(2, activation='softmax'))
model.compile(keras.optimizers.Adam(lr=0.001),'binary_crossentropy',metrics=['accuracy',f1])

In [None]:
history= model.fit(X_train, y_train, validation_data=(X_val,y_val), epochs=200, batch_size=5000)

In [None]:
model.evaluate(X_test,y_test)[2]

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
y_pred = model.predict(X_test)
# roc_curve needs a continuous score to sweep thresholds over. Feeding it the hard argmax
# label collapses the curve to a single operating point, so use the predicted probability
# of class 1 (second softmax column) instead. The true labels are decoded from one-hot.
fpr, tpr, thresholds = roc_curve(argmax(y_test, axis=1), y_pred[:, 1])
auc_score = auc(fpr, tpr)

In [None]:
plt.figure(1)
plt.plot([0, 1], [0, 1], '--')
plt.plot(fpr, tpr, label='Control (AUC = {:.3f})'.format(auc_score), color='g')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

I tested it with 3-32 neurons in the first hidden layer (as a side experimental arm) and it makes no significant difference, so I left it with fewer parameters to conserve computing power.
Next, I check when the model converges.

In [None]:
history.history.keys()

In [None]:
# Plot only the first epochs of the training history: F1 and loss, train and validation.
zoom = 60 - 1
for curve in ('f1', 'val_f1', 'loss', 'val_loss'):
    plt.plot(history.history[curve][0:zoom])
plt.title('Model Performance')
plt.ylabel('performance')
plt.xlabel('epoch')
# Legend entries follow the plotting order above.
plt.legend(['train_f1', 'val_f1', 'train_loss','val_loss'], loc='center right')
plt.show()

We can clearly see that the model reaches early convergence within 25 epochs, but for safety and a bit more gain 50 epochs should be sufficient.

In [None]:
# Make sure the validation dataset is balanced, so accuracy is a relevant metric.
# y_val is one-hot encoded: counting its raw values would always give equal numbers of
# 0s and 1s (one of each per row), so decode each row to its class id before counting.
unique, counts = np.unique(np.argmax(y_val, axis=1), return_counts=True)
dict(zip(unique, counts))

**Control Model**

In [None]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True)
results_control_auc = []
results_control_fpr = []
results_control_tpr = []

# 7 repetitions of shuffled 5-fold cross-validation: one control model is trained per fold
# and scored on the held-out test split (X_test / y_test).
for i in range(0,7):
    for index, (train_indices, val_indices) in enumerate(skf.split(X, y)):
        X_train_local, X_val_local = X[train_indices], X[val_indices]
        y_train_local, y_val_local = Y[train_indices], Y[val_indices]

        # Control architecture: 29 inputs -> 29 ReLU -> 3 tanh -> 2-way softmax.
        # Only the first layer declares an input shape; later layers infer theirs.
        model = keras.Sequential()
        model.add(keras.layers.Dense(29, input_shape=(29,), activation='relu'))
        model.add(keras.layers.Dense(3, activation='tanh'))
        model.add(keras.layers.Dense(2, activation='softmax'))
        # The author found f1 almost equal to accuracy here (micro averaging) but chose to keep tracking it.
        model.compile(keras.optimizers.Adam(lr=0.04),'binary_crossentropy',metrics=['accuracy',f1])
        model.fit(X_train_local, y_train_local, validation_data=(X_val_local,y_val_local), epochs=50, batch_size=5000)

        y_pred = model.predict(X_test)
        # Score the ROC with the predicted probability of class 1 (second softmax column).
        # Hard argmax labels would reduce the curve to a single operating point.
        # Keep the scoring identical in any arm that is compared against this one.
        fpr, tpr, thresholds = roc_curve(argmax(y_test, axis=1), y_pred[:, 1])
        auc_score = auc(fpr, tpr)

        results_control_auc.append(auc_score)
        results_control_fpr.append(fpr)
        results_control_tpr.append(tpr)
        print ("AUC of last iteration: " + str(auc_score))

**Experimental model**

In [None]:
results_experimental_auc = []
results_experimental_fpr = []
results_experimental_tpr = []

# 7 repetitions of shuffled 5-fold cross-validation: one experimental model is trained per fold
# and scored on the held-out test split (X_test / y_test).
for i in range(0,7):
    for index, (train_indices, val_indices) in enumerate(skf.split(X, y)):
        print("Iteration: " +str((index+1)+(5*(i))))
        X_train_local, X_val_local = X[train_indices], X[val_indices]
        y_train_local, y_val_local = Y[train_indices], Y[val_indices]

        # Experimental architecture: 29 inputs -> 29 ReLU -> 16 ReLU -> 3 tanh -> 2-way softmax.
        # Only the first layer declares an input shape; later layers infer theirs.
        model = keras.Sequential()
        model.add(keras.layers.Dense(29, input_shape=(29,), activation='relu'))
        model.add(keras.layers.Dense(16, activation='relu'))
        model.add(keras.layers.Dense(3, activation='tanh'))
        model.add(keras.layers.Dense(2, activation='softmax'))
        # Only f1 is tracked during training for this arm.
        model.compile(keras.optimizers.Adam(lr=0.04),'binary_crossentropy',metrics=[f1])
        model.fit(X_train_local, y_train_local, validation_data=(X_val_local,y_val_local), epochs=50, batch_size=5000)

        y_pred = model.predict(X_test)
        # Score the ROC with the predicted probability of class 1 (second softmax column).
        # Hard argmax labels would reduce the curve to a single operating point.
        # Keep the scoring identical in any arm that is compared against this one.
        fpr, tpr, thresholds = roc_curve(argmax(y_test, axis=1), y_pred[:, 1])
        auc_score = auc(fpr, tpr)

        results_experimental_auc.append(auc_score)
        results_experimental_fpr.append(fpr)
        results_experimental_tpr.append(tpr)

In [None]:
# Persist every result list as its own CSV file in the working directory (no index column).
results_to_save = [
    ('results_control_auc.csv', results_control_auc),
    ('results_control_fpr.csv', results_control_fpr),
    ('results_control_tpr.csv', results_control_tpr),
    ('results_experimental_auc.csv', results_experimental_auc),
    ('results_experimental_fpr.csv', results_experimental_fpr),
    ('results_experimental_tpr.csv', results_experimental_tpr),
]
for csv_name, values in results_to_save:
    pd.DataFrame(values).to_csv(csv_name, index=False)

K-fold is a cross-validation technique in which the classifier is exposed to the whole training set, while a different part of it is held out for validation on each iteration. On every iteration, the validation part from the previous iteration is used for training and another part is chosen for validation. An n_splits of 5 was chosen because, as in [TODO], it proves to be a promising split.

**Results analysis**

In [None]:
# Load the previously saved results of both arms from the attached input dataset.
results_dir = "../input/dmat2018-i7461730-daniel-dimanov/"

results_control_auc = pd.read_csv(results_dir + "results_control_auc.csv")
results_control_tpr = pd.read_csv(results_dir + "results_control_tpr.csv")
results_control_fpr = pd.read_csv(results_dir + "results_control_fpr.csv")

results_experimental_auc = pd.read_csv(results_dir + "results_experimental_auc.csv")
results_experimental_tpr = pd.read_csv(results_dir + "results_experimental_tpr.csv")
results_experimental_fpr = pd.read_csv(results_dir + "results_experimental_fpr.csv")

In [None]:
# Raw arrays: one row per stored run.
control_fprs=results_control_fpr.values
control_tprs=results_control_tpr.values
control_aucs=results_control_auc.values
exp_tprs=results_experimental_tpr.values
exp_fprs=results_experimental_fpr.values
exp_aucs=results_experimental_auc.values

# Legend text carries the mean AUC of each arm; .iloc[0] picks the single column's mean by position.
control_label = 'Control (AUC ='+str(results_control_auc.mean().iloc[0])+")"
exp_label = 'Experimental (AUC = '+ str(results_experimental_auc.mean().iloc[0]) + ")"

plt.figure(1)
plt.plot([0, 1], [0, 1], '--')
# Iterate over every run; the previous upper bound of len(...)-1 silently skipped the last one.
for i in range(len(control_fprs)):
    # Only the first curve of each arm gets a label, so the legend shows one entry per arm.
    plt.plot(control_fprs[i], control_tprs[i], color='g', label=control_label if i == 0 else None)
    plt.plot(exp_fprs[i], exp_tprs[i], color='r', label=exp_label if i == 0 else None)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

In [None]:
mean_control_auc = results_control_auc.mean()
print("Mean Control AUC: {}".format(mean_control_auc))

mean_experimental_auc = results_experimental_auc.mean()
print("Mean Experimental AUC: {}".format(mean_experimental_auc))

In [None]:
# Spread of the AUC results per arm. The printed labels now say AUC, which is what is measured here
# (they previously said "F1-score").
std_control_auc = results_control_auc.std()
print("Standard Deviation of Control AUC Results: {}".format(std_control_auc))

std_experimental_auc = results_experimental_auc.std()
print("Standard Deviation of Experimental AUC Results: {}".format(std_experimental_auc))

In [None]:
results_auc= pd.concat([results_control_auc, results_experimental_auc], axis=1)
results_auc.columns = ['Control', 'Experimental']

In [None]:
results_auc.boxplot(showfliers=False)

In [None]:
results_auc.hist(density=True)

In [None]:
from scipy import stats

alpha = 0.05

# One entry per arm: (sample, message when normality is rejected, message otherwise).
normality_checks = (
    (results_control_auc, 'Control data is not normal', 'Control data is normal'),
    (results_experimental_auc, 'Experimental data is not normal', 'Experimental data is normal'),
)
for sample, rejected_message, normal_message in normality_checks:
    # normaltest's null hypothesis is that the sample comes from a normal distribution.
    s, p = stats.normaltest(sample)
    print(p)
    print(rejected_message if p < alpha else normal_message)

In [None]:
# Flatten the single-column result frames to 1-D score vectors.
control_scores = np.asarray(results_control_auc, dtype=float).ravel()
experimental_scores = np.asarray(results_experimental_auc, dtype=float).ravel()

# Pearson's r only measures linear association between the two series; neither its p-value nor
# its sign says whether one arm scores higher than the other. It is kept as a descriptive value only.
pearson_coef, _ = stats.pearsonr(control_scores, experimental_scores)

# Two-sided Mann-Whitney U test: compares two independent samples without assuming normality.
u_statistic, p_value = stats.mannwhitneyu(control_scores, experimental_scores, alternative='two-sided')

if p_value < 0.05:
    # The direction of the difference is read from the sample means.
    if control_scores.mean() > experimental_scores.mean():
        print('null hypothesis rejected, significant difference between the data-sets with tendency for the control arm to be better')
    else:
        print('null hypothesis rejected, significant difference between the data-sets with tendency for the experimental arm to be better')
else:
    print('null hypothesis accepted, no significant difference between the data-sets')

In [None]:
results_control_auc.describe()

In [None]:
results_experimental_auc.describe()