In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/creditcardfraud/creditcard.csv


In [11]:
#Reading in the data
dataframe = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
dataframe.head

<bound method NDFrame.head of             Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  0.

# **Data Preprocessing**

In [12]:
#splitting the dataset

from sklearn.model_selection import train_test_split

#seperate the dependent from the independent columnes
print('total dataset: ', len(dataframe))
x = dataframe.iloc[:, :-1]
y = dataframe.iloc[:, -1]

random_state = np.random.RandomState(0)

#Shuffling the dataset of features
n_rows, n_features = x.shape
x = np.c_[x, random_state.randn(n_rows, n_features)]

#split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=random_state)


total dataset:  284807


In [13]:
#Feature scaling. Given the disparity of the columns time and ammount from the rest

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# **Defining the model and Fitting the data**

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import pickle

#function for using the k-fold cross validation to find the optimal dept by
#...by fiting trees of various depths on the training data.
# We also use prunning optimize the classification tree

def run_decision_tree(x, y, tree_depths, cv=7, scoring='average_precision'):
    cross_val_scores_list = []
    cross_val_scores_mean = []
    cross_val_scores_std = []
    accuracy_list = []
    
    temp = 0
    
    for depth in tree_depths:
        random_state = np.random.RandomState(0)
        model = DecisionTreeClassifier(criterion="entropy", random_state=random_state, max_depth=depth)
        cross_val_scores = cross_val_score(model,x, y, cv=cv, scoring=scoring)
        cross_val_scores_list.append(cross_val_scores)
        score_mean = cross_val_scores.mean()
        cross_val_scores_mean.append(score_mean)
        cross_val_scores_std.append(cross_val_scores.std())
        accuracy_list.append(model.fit(x, y).score(x, y))
        
        print("Tree depth {:d}:\n\tCross Validation Scores mean: {:f}\n\tCross Validation STD: {:f}\n\tAccuracy: {:f}\n".format(depth, score_mean, cross_val_scores.std(), accuracy_list[-1]))
        
        if(score_mean > temp):
            temp = score_mean
            print("Saving model {}... \n".format(depth))
            model_file = 'best_model.sav'
            pickle.dump(model, open(model_file, 'wb'))
            best_tree_depth = depth
            best_tree_cv_score = cross_val_scores_mean[-1]
            best_tree_cv_std = cross_val_scores_std[-1]
            print("Model saved\n")
            
        else:
            print("Model not optimal. Skiping save...\n")
        
    cross_val_scores_mean = np.array(cross_val_scores_mean)
    cross_val_scores_std = np.array(cross_val_scores_std)
    accuracy_list = np.array(accuracy_list)
    
    return cross_val_scores_mean, cross_val_scores_std, accuracy_list, model_file

In [15]:
#Fitting Trees for depth 1 to 14
depths = range(1, 15)
scores_mean, scores_std, accuracy_list, model_file = run_decision_tree(x_train, y_train, tree_depths=depths)

print("Score Mean List: ",scores_mean)
print("Score STD: ", scores_std)
print("Accuracy List: ", accuracy_list)

Tree depth 1:
	Cross Validation Scores mean: 0.529772
	Cross Validation STD: 0.089715
	Accuracy: 0.999064

Saving model 1... 

Model saved

Tree depth 2:
	Cross Validation Scores mean: 0.644117
	Cross Validation STD: 0.068814
	Accuracy: 0.999260

Saving model 2... 

Model saved

Tree depth 3:
	Cross Validation Scores mean: 0.723840
	Cross Validation STD: 0.055506
	Accuracy: 0.999415

Saving model 3... 

Model saved

Tree depth 4:
	Cross Validation Scores mean: 0.740907
	Cross Validation STD: 0.059364
	Accuracy: 0.999551

Saving model 4... 

Model saved

Tree depth 5:
	Cross Validation Scores mean: 0.749390
	Cross Validation STD: 0.061486
	Accuracy: 0.999672

Saving model 5... 

Model saved

Tree depth 6:
	Cross Validation Scores mean: 0.743027
	Cross Validation STD: 0.060302
	Accuracy: 0.999738

Model not optimal. Skiping save...

Tree depth 7:
	Cross Validation Scores mean: 0.728781
	Cross Validation STD: 0.048102
	Accuracy: 0.999789

Model not optimal. Skiping save...

Tree depth 8:


In [16]:
loaded_model = pickle.load(open(model_file, 'rb'))
result = loaded_model.score(x_test, y_test)
print(result)

0.9993679952810315


In [17]:
#Making predictions and checking the accuracy
from sklearn import metrics

y_pred = loaded_model.predict(x_test)

accuracy = metrics.accuracy_score(y_pred, y_test)

print('The accuracy of the model is: ', accuracy)


The accuracy of the model is:  0.9993679952810315


Computing the Average Precision Score

In [18]:
from sklearn.metrics import average_precision_score
avg_precision = average_precision_score(y_test, y_pred)

print("Average precision-recall: {0:0.2f}%".format(avg_precision * 100))

Average precision-recall: 66.30%
