In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

%matplotlib inline

#### Reading Data

In [None]:
# Load the sampled credit-card transactions and discard exact duplicate rows
# in the same expression, so `credit` is never observed in a half-cleaned state.
credit = pd.read_csv("data/creditcard_samp.csv").drop_duplicates()
credit.head()

In [None]:
# Print the de-duplicated frame's summary (columns, dtypes, non-null counts).
credit.info()

In [None]:
# Relative frequency of each value in the `Class` column, rounded to 3 decimals.
class_share = credit["Class"].value_counts(normalize=True)
print("Target Proportion:::")
class_share.round(3)

In [None]:
# Every column except the last is a predictor; the last column is the target.
features = credit.iloc[:, :-1].to_numpy()
labels = credit.iloc[:, -1].to_numpy()

In [None]:
# Report the dimensions of the predictor matrix and the target vector.
for caption, array in (("Features Shape:", features), ("Targets Shape:", labels)):
    print(caption, array.shape)

In [None]:
# Hold out a test set. The split is stratified on the labels so the rare
# positive class keeps the same proportion in train and test; a purely random
# split of an imbalanced target can leave one side with too few (or no)
# positives, which makes the later per-class metrics unstable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    stratify=labels,
    random_state=50,
)

## 1) Model Building using the actual Data Set

In [None]:
# Baseline: a random forest fitted on the training split exactly as sampled
# (no resampling), then used to predict the held-out test rows.
clf = RandomForestClassifier(random_state=0).fit(features_train, labels_train)
y_test_pred = clf.predict(features_test)

In [None]:
# Plain accuracy of the baseline predictions on the test split, to 4 decimals.
baseline_accuracy = accuracy_score(labels_test, y_test_pred)
print("Accuracy Score:::", np.round(baseline_accuracy, 4))

#### Accuracy Trap

![title](img/trap.png)

In [None]:
# Confusion matrix of the true test labels against the most recent predictions
# stored in `y_test_pred` (the baseline model when cells run top to bottom).
confusion_matrix(labels_test,y_test_pred)

In [None]:
# Per-class precision/recall/F1 are computed from the hard predictions.
print (classification_report(labels_test,y_test_pred))

# ROC AUC is a ranking metric, so it must be scored with continuous scores.
# Feeding it hard 0/1 predictions collapses the ROC curve to a single operating
# point and under-reports the model's ability to rank positives.
# Column 1 of predict_proba corresponds to Class == 1 (classes_ is sorted;
# assumes both classes were present in the training split).
y_test_score = clf.predict_proba(features_test)[:, 1]
print("AUC on the actual Dataset:::", roc_auc_score(labels_test, y_test_score))

## 2) Model Building using Oversampling or Undersampling


![title](img/resample.png)

In [None]:
# Re-assemble the training split as a DataFrame (predictors followed by the
# target as the last column) so it can be filtered and resampled by class.
train_matrix = np.column_stack((features_train, labels_train))
df_train = pd.DataFrame(train_matrix, columns=credit.columns)
df_train.Class.value_counts()

In [None]:
# Divide by class
df_class_0 = df_train[df_train['Class'] == 0]
df_class_1 = df_train[df_train['Class'] == 1]

# Class count, taken from the per-class frames rather than by unpacking
# value_counts(): value_counts() is ordered by frequency, so unpacking it
# silently swaps the two counts if class 1 ever outnumbers class 0 and raises
# if only one class is present.
count_class_0 = len(df_class_0)
count_class_1 = len(df_class_1)


In [None]:
### Random Over-sampling
# Draw class-1 rows with replacement until there are as many as class-0 rows.
# The draw is seeded so Restart & Run All reproduces the same resampled set
# (and therefore the same downstream model and metrics).
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=0)
df_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_over.Class.value_counts())

# Trailing semicolon suppresses the Axes repr in the cell output.
df_over.Class.value_counts().plot(kind='bar', title='Count (Class)');

In [None]:
# Split the over-sampled frame back into a predictor matrix (all but the last
# column) and a target vector (the last column).
features_train_over = df_over.iloc[:, :-1].to_numpy()
labels_train_over = df_over.iloc[:, -1].to_numpy()

In [None]:
# Same random-forest configuration as the baseline, but fitted on the randomly
# over-sampled training data; evaluation still uses the untouched test split.
clf_over = RandomForestClassifier(random_state=0).fit(features_train_over, labels_train_over)
y_test_pred = clf_over.predict(features_test)

In [None]:
# Per-class precision/recall/F1 are computed from the hard predictions.
print (classification_report(labels_test,y_test_pred))

# ROC AUC needs continuous scores; hard 0/1 predictions reduce the ROC curve
# to a single operating point. Column 1 of predict_proba corresponds to
# Class == 1 (classes_ is sorted; assumes both classes were in the training data).
y_test_score = clf_over.predict_proba(features_test)[:, 1]
print("AUC using Oversampling method:::", roc_auc_score(labels_test, y_test_score))

In [None]:
# Confusion matrix of the true test labels against the most recent predictions
# stored in `y_test_pred` (the over-sampled model when cells run top to bottom).
confusion_matrix(labels_test,y_test_pred)

In [None]:
# Compare sizes: the original training frame, the over-sampled frame, and the
# over-sampled frame once repeated rows are collapsed.
df_over_unique = df_over.drop_duplicates()
print("Training Data shape::", df_train.shape)
print("Oversampling Data shape::", df_over.shape)
print("Oversampling Data shape after removing duplicates::", df_over_unique.shape)


## 3) Model Building using SMOTE

In [None]:
# Resample the training split only (never the test split) with SMOTE.
# `fit_sample` was deprecated and later removed from imbalanced-learn, so the
# old call raises AttributeError on current releases; `fit_resample` is the
# supported name and returns the same (features, labels) pair.
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

In [None]:
# Put the SMOTE output into a DataFrame (predictors followed by the target as
# the last column) using the original column names.
smote_matrix = np.column_stack((os_features, os_labels))
df_smote = pd.DataFrame(smote_matrix, columns=credit.columns)
df_smote.shape

In [None]:
# Count how many times each label value occurs in the SMOTE-resampled labels.
np.bincount(os_labels)

In [None]:
# Same random-forest configuration again, fitted on the SMOTE-resampled
# training data; predictions are made on the untouched test split.
smote = RandomForestClassifier(random_state=0).fit(os_features, os_labels)
y_test_pred = smote.predict(features_test)

In [None]:
# Per-class precision/recall/F1 are computed from the hard predictions.
print (classification_report(labels_test,y_test_pred))

# ROC AUC needs continuous scores; hard 0/1 predictions reduce the ROC curve
# to a single operating point. Column 1 of predict_proba corresponds to
# Class == 1 (classes_ is sorted; assumes both classes were in the training data).
y_test_score = smote.predict_proba(features_test)[:, 1]
print("AUC using SMOTE:::", roc_auc_score(labels_test, y_test_score))

In [None]:
# Confusion matrix of the true test labels against the most recent predictions
# stored in `y_test_pred` (the SMOTE-trained model when cells run top to bottom).
confusion_matrix(labels_test,y_test_pred)

In [None]:
# Size of the SMOTE-resampled frame before and after collapsing repeated rows.
df_smote_unique = df_smote.drop_duplicates()
print("SMOTE Data shape::", df_smote.shape)
print("SMOTE Data shape after removing duplicates::", df_smote_unique.shape)

### Results Summary 
![title](img/summary.png)