## A. Data Exploration

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Open file and inspect first five rows.
df_A = pd.read_csv('creditcard.csv')
df_A.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Check shape.
df_A.shape

(284807, 31)

In [4]:
# Check for nulls.
df_A.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
# Show class distribution.
print(df_A['Class'].value_counts(),'\n')

# Show proportion of fraudulent transactions.
print((df_A['Class'] == 1).mean())

0    284315
1       492
Name: Class, dtype: int64 

0.00172748563062


This is a highly imbalanced dataset, and if the model were to predict nothing but 0, it would still be almost 99.83% accurate. Two methods that are best suited for imbalanced datasets will be compared - logistic regression and random forest classification. Furthermore, up and down samplng, which are effective techniques for imbalanced datasets, will be conducted within each method. 

## B. Logistic Regression

### 1. Original Dataset

In [6]:
# Set the variables.
X_B1 = df_A.iloc[:,1:30]
Y_B1 = df_A['Class']

# Split into train and test data.
from sklearn.cross_validation import train_test_split
X_train_B1, X_test_B1, y_train_B1, y_test_B1 = train_test_split(X_B1, Y_B1, test_size = .3, random_state=25)

# Standardize training features.
from sklearn.preprocessing import StandardScaler
scaler_B1 = StandardScaler().fit(X_train_B1)
X_train_B1_trans = scaler_B1.transform(X_train_B1)

# Fit the model.
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression()
logr.fit(X_train_B1_trans, y_train_B1)

# Standardize test features based on training set.
X_test_B1_trans = scaler_B1.transform(X_test_B1)

# Make predictions.
logr_pred_B1 = logr.predict(X_test_B1_trans)

# Accuracy score.
from sklearn.metrics import accuracy_score
print('Score:', accuracy_score(y_test_B1, logr_pred_B1))



Score: 0.999157332959


In [7]:
# Confusion matrix.
from sklearn.metrics import confusion_matrix
confusion_matrix_B1 = confusion_matrix(y_test_B1, logr_pred_B1)
print(confusion_matrix_B1)

[[85275    10]
 [   62    96]]


In [8]:
# Classification matrix.
from sklearn.metrics import classification_report
print(classification_report(y_test_B1, logr_pred_B1))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     85285
          1       0.91      0.61      0.73       158

avg / total       1.00      1.00      1.00     85443



The model's sensitivity is relatively low at 0.61, which means it performs rather poorly in identifying fraudulent transactions, i.e., true positives; hence, the need for up or down sampling techniques.

### 2. Up-Sample Minority Class

In [9]:
# Separate majority and minority classes.
df_B2_maj = df_A[df_A['Class']==0]
df_B2_min = df_A[df_A['Class']==1]

# Upsample minority class.
from sklearn.utils import resample
df_B2_min_up = resample(df_B2_min, replace=True, n_samples=284315, random_state=123)

# Combine majority class with upsampled minority class.
df_B2_up = pd.concat([df_B2_maj, df_B2_min_up])

# Display new class counts.
df_B2_up['Class'].value_counts()

1    284315
0    284315
Name: Class, dtype: int64

In [10]:
# Set the variables.
X_B2 = df_B2_up.iloc[:,1:30]
Y_B2 = df_B2_up['Class']

# Split into train and test data.
X_train_B2, X_test_B2, y_train_B2, y_test_B2 = train_test_split(X_B2, Y_B2, test_size = .3, random_state=25)

# Standardize training features.
scaler_B2 = StandardScaler().fit(X_train_B2)
X_train_B2_trans = scaler_B2.transform(X_train_B2)

# Fit the model.
logr.fit(X_train_B2_trans, y_train_B2)

# Standardize test features based on training set.
X_test_B2_trans = scaler_B2.transform(X_test_B2)

# Make predictions.
logr_pred_B2 = logr.predict(X_test_B2_trans)

# Accuracy score.
print('Score:', accuracy_score(y_test_B2, logr_pred_B2))

Score: 0.948349541881


In [11]:
# Confusion matrix.
confusion_matrix_B2 = confusion_matrix(y_test_B2, logr_pred_B2)
print(confusion_matrix_B2)

[[83707  1934]
 [ 6877 78071]]


In [12]:
# Classification matrix.
print(classification_report(y_test_B2, logr_pred_B2))

             precision    recall  f1-score   support

          0       0.92      0.98      0.95     85641
          1       0.98      0.92      0.95     84948

avg / total       0.95      0.95      0.95    170589



### 3. Down-Sample Majority Class

In [13]:
# Downsample majority class.
df_B3_maj_down = resample(df_B2_maj, replace=False, n_samples=492, random_state=123)

# Combine minority class with downsampled majority class.
df_B3_down = pd.concat([df_B2_min, df_B3_maj_down])

# Display new class counts.
df_B3_down['Class'].value_counts()

1    492
0    492
Name: Class, dtype: int64

In [14]:
# Set the variables.
X_B3 = df_B3_down.iloc[:,1:30]
Y_B3 = df_B3_down['Class']

# Split into train and test data.
X_train_B3, X_test_B3, y_train_B3, y_test_B3 = train_test_split(X_B3, Y_B3, test_size = .3, random_state=25)

# Standardize training features.
scaler_B3 = StandardScaler().fit(X_train_B3)
X_train_B3_trans = scaler_B3.transform(X_train_B3)

# Fit the model.
logr.fit(X_train_B3_trans, y_train_B3)

# Standardize test features based on training set.
X_test_B3_trans = scaler_B3.transform(X_test_B3)

# Make predictions.
logr_pred_B3 = logr.predict(X_test_B3_trans)

# Accuracy score.
print('Score:', accuracy_score(y_test_B3, logr_pred_B3))

Score: 0.942567567568


In [15]:
# Confusion matrix.
confusion_matrix_B3 = confusion_matrix(y_test_B3, logr_pred_B3)
print(confusion_matrix_B3)

[[147   1]
 [ 16 132]]


In [16]:
# Classification matrix.
print(classification_report(y_test_B3, logr_pred_B3))

             precision    recall  f1-score   support

          0       0.90      0.99      0.95       148
          1       0.99      0.89      0.94       148

avg / total       0.95      0.94      0.94       296



## C. Random Forest

### 1. Original Dataset

In [17]:
# Fit the model on the original training set. 
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_B1_trans, y_train_B1)

# Predict on the original test set.
pred_rfc_C1 = rfc.predict(X_test_B1_trans)

# Accuracy score.
print('Score:', accuracy_score(y_test_B1, pred_rfc_C1))

Score: 0.999543555353


In [18]:
# Confusion matrix.
confusion_matrix_C1 = confusion_matrix(y_test_B1, pred_rfc_C1)
print(confusion_matrix_C1)

[[85273    12]
 [   27   131]]


In [19]:
# Classification matrix.
print(classification_report(y_test_B1, pred_rfc_C1))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     85285
          1       0.92      0.83      0.87       158

avg / total       1.00      1.00      1.00     85443



### 2. Up-Sample Minority Class

In [20]:
# Fit the model on the up-sample training set.
rfc.fit(X_train_B2_trans, y_train_B2)

# Predict on the up-sample test set.
pred_rfc_C2 = rfc.predict(X_test_B2_trans)

# Accuracy score.
print('Score:', accuracy_score(y_test_B2, pred_rfc_C2))

Score: 0.999970689787


In [21]:
# Confusion matrix.
confusion_matrix_C2 = confusion_matrix(y_test_B2, pred_rfc_C2)
print(confusion_matrix_C2)

[[85636     5]
 [    0 84948]]


In [22]:
# Classification matrix.
print(classification_report(y_test_B2, pred_rfc_C2))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     85641
          1       1.00      1.00      1.00     84948

avg / total       1.00      1.00      1.00    170589



### 3. Down-Sample Majority Class¶

In [23]:
# Fit the model on the down-sample training set.
rfc.fit(X_train_B3_trans, y_train_B3)

# Predict on the down-sample test set.
pred_rfc_C3 = rfc.predict(X_test_B3_trans)

# Accuracy score.
print('Score:', accuracy_score(y_test_B3, pred_rfc_C3))

Score: 0.935810810811


In [24]:
# Confusion matrix.
confusion_matrix_C3 = confusion_matrix(y_test_B3, pred_rfc_C3)
print(confusion_matrix_C3)

[[147   1]
 [ 18 130]]


In [25]:
# Classification matrix.
print(classification_report(y_test_B3, pred_rfc_C3))

             precision    recall  f1-score   support

          0       0.89      0.99      0.94       148
          1       0.99      0.88      0.93       148

avg / total       0.94      0.94      0.94       296



Up-sampling the dataset prior to random forest classification appears to provide the best results. However, the model could potentially be overfit, therefore, it should be evaluated on an unseen test set prior to making a final conclusion.