In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier

# from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


from imblearn.pipeline import Pipeline as imbpipeline
# from sklearn.pipeline import Pipeline


from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

import warnings
warnings.simplefilter(action='ignore')

# Default figure size for plots that do not set their own size.
from pylab import rcParams
rcParams['figure.figsize'] = 8,6
# Tick labels for the two target classes, in class order 0 then 1.
LABELS = ['Normal', 'Fraud']



# **Loading Data**

In [None]:
# Load the credit-card transactions CSV (comma-separated, the pandas default).
data = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")
data.head()

In [None]:
# Column dtypes and non-null counts for the loaded frame.
data.info()

In [None]:
# Step 3: Outlier treatment
# This is a classification problem, and most of the variable values fall between 0 and 1.
# It is also an anomaly-detection problem (credit card fraud), so outliers are deliberately left untreated.

In [None]:
# Check how imbalanced the target is: share of each class as a percentage of all rows.
# The counts are computed once and reused; the original computed a normalised
# copy that was never displayed and then recounted the column twice.
class_counts = data['Class'].value_counts()
total_rows = len(data)
print("Transactions those are not Frauds :", round(class_counts[0]/total_rows * 100,2), '% of the dataset')
print("Transactions those are Frauds :" , round(class_counts[1]/total_rows * 100,2), '% of the dataset')

**Notable point:** The dataset has no null values and is highly imbalanced.

**Note:** In this dataset, most of the transactions are normal transactions. If we build our model directly on this dataset, it might fail to detect fraudulent transactions. To address the class imbalance we can use two techniques, each combined with StratifiedKFold cross-validation: undersampling of the majority class, and oversampling of the minority class with SMOTE.

# **EDA**

In [None]:
# EDA - Exploratory data analysis
# Count rows per class with the Series method (the top-level pd.value_counts is
# deprecated) and order by class label so bar 0 / bar 1 always line up with LABELS.
count_classes = data['Class'].value_counts().sort_index()
count_classes.plot(kind = 'bar', rot = 0)
plt.title("Transaction class distribution")
plt.xticks(range(2), LABELS)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Split the frame into fraudulent (Class == 1) and normal (Class == 0) rows.
class_labels = data['Class']
fraud = data.loc[class_labels.eq(1)]
normal = data.loc[class_labels.eq(0)]

print(fraud.shape, normal.shape)

In [None]:
# Summary statistics of the 'Amount' column for normal transactions.
normal['Amount'].describe()

In [None]:
# Summary statistics of the 'Amount' column for fraud transactions.
fraud['Amount'].describe()

**Note:** Transaction amounts are almost equal in both cases.

In [None]:
# Histogram of transaction amounts, one panel per class, sharing the x-axis.
fig, (ax_fraud, ax_normal) = plt.subplots(nrows=2, ncols=1, sharex=True)
fig.suptitle("Amount per transaction by class")

n_bins = 50
for axis, amounts, panel_title in (
    (ax_fraud, fraud['Amount'], 'Fraud'),
    (ax_normal, normal['Amount'], 'normal'),
):
    axis.hist(amounts, bins=n_bins)
    axis.set_title(panel_title)

# Labels, x-limits and the log y-scale are set on the bottom panel only
# (the x-limits also reach the top panel through the shared x-axis).
ax_normal.set_xlabel("Amount ($)")
ax_normal.set_ylabel("No. of Transaction")
ax_normal.set_xlim(0, 20000)
ax_normal.set_yscale('log')
plt.show()

In [None]:
# Scatter of amount against time, one panel per class, sharing the x-axis.
fig, (ax_fraud, ax_normal) = plt.subplots(nrows=2, ncols=1, sharex=True)
fig.suptitle("Time of transaction vs Amount by Class")

for axis, subset, panel_title in (
    (ax_fraud, fraud, 'Fraud'),
    (ax_normal, normal, 'normal'),
):
    axis.scatter(subset['Time'], subset['Amount'])
    axis.set_title(panel_title)

# Axis labels are attached to the bottom panel only.
ax_normal.set_xlabel("Time (in second)")
ax_normal.set_ylabel("Amount")

plt.show()

**Note:** Fraudulent transactions are not concentrated in any specific time period.

In [None]:
# Distribution of transaction times. `sns.distplot` is deprecated, so use its
# documented replacement: a density-normalised histogram with a KDE overlay.
sns.histplot(data['Time'], kde=True, stat='density');

In [None]:
# Correlation check
# Compute the pairwise correlation matrix once and plot it directly; the
# original recomputed the same matrix a second time just for the heatmap.
corrmat = data.corr()
top_corr_feature = corrmat.index
plt.figure(figsize=(20,20))
g = sns.heatmap(corrmat, annot=True, cmap='coolwarm')

In [None]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
data1 = data.copy(deep=True)
data1.shape

In [None]:
# feature scaling 'Amount' and 'Time'
# Build `data1` from the untouched `data` frame instead of dropping columns
# in place, so this cell can be re-run without a KeyError on 'Amount'/'Time'.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling - consider fitting on the
# training rows only.
standard_Scaler = StandardScaler()
scaled_amount = standard_Scaler.fit_transform(data['Amount'].values.reshape(-1,1)).ravel()
scaled_time = standard_Scaler.fit_transform(data['Time'].values.reshape(-1,1)).ravel()

data1 = (
    data
    .drop(columns=['Time', 'Amount'])
    .assign(s_amount=scaled_amount, s_time=scaled_time)
)


In [None]:
# Preview the frame after 'Time'/'Amount' were replaced by 's_time'/'s_amount'.
data1.head()

# **Defining target and independent features**

In [None]:
# Predictors are every column except the target; the target is 'Class'.
x = data1.drop(columns=["Class"])
y = data1["Class"]

# **Splitting data into train and test sets**

In [None]:
# Hold out 40% of the rows for testing, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.4, stratify=y, random_state=101)
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

**Note:** First, we build a RandomForest classifier on the imbalanced dataset as-is. Next, we build RandomForest models using RandomUnderSampler (which under-samples the majority class(es) by randomly picking samples, with or without replacement) and the SMOTE technique, each with StratifiedKFold cross-validation.

# **Model Building**

In [None]:
#  Model training and evaluation
# Test-set recall of every evaluated model, in the order modelEval was called.
recall_list =[]
def modelEval(xtr,ytr,xte,yte,model):
    """Fit ``model`` on the training split and report metrics for both splits.

    Prints accuracy, precision, recall and a classification report for the
    training data, then F1, accuracy, precision, recall and a classification
    report for the test data, and finally draws the test confusion matrix.
    The test recall is appended to the module-level ``recall_list``.

    Parameters
    ----------
    xtr, ytr
        Training features and labels.
    xte, yte
        Test features and labels.
    model
        Estimator exposing ``fit`` and ``predict`` (a plain classifier or a
        search object such as ``GridSearchCV``).

    Returns
    -------
    float
        Accuracy on the test split, so callers that assign the result get a
        value instead of ``None``.
    """
    model.fit(xtr,ytr)

    # Prediction for Test and Train Dataset
    test_pred = model.predict(xte)
    train_pred = model.predict(xtr)

    tpr_score = metrics.precision_score(ytr, train_pred)
    trc_score = metrics.recall_score(ytr, train_pred)
    tac_score = metrics.accuracy_score(ytr, train_pred)

    print("For Training Dataset.")
    print(f'Accuracy: {tac_score:.4f}, Precision: {tpr_score:.2f}, Recall: {trc_score:.2f}')
    print(metrics.classification_report(ytr, train_pred))
    print("===============================")

    pr_score = metrics.precision_score(yte, test_pred)
    rc_score = metrics.recall_score(yte, test_pred)
    ac_score = metrics.accuracy_score(yte, test_pred)
    recall_list.append(rc_score)
    print("===============================")
    print("For Testing Dataset")
    print("===============================")
    print("F1:",metrics.f1_score(yte, test_pred))
    # Four decimals, matching the training line: with a heavily imbalanced
    # target, two decimals round almost every accuracy to 1.00.
    print(f'Accuracy: {ac_score:.4f}, Precision: {pr_score:.2f}, Recall: {rc_score:.2f}')
    print("===============================")

    print(metrics.classification_report(yte, test_pred))

    # `metrics.plot_confusion_matrix` was removed in scikit-learn 1.2. Prefer
    # ConfusionMatrixDisplay (which also reuses the predictions already made)
    # and fall back to the old helper only on releases that lack it.
    display_cls = getattr(metrics, "ConfusionMatrixDisplay", None)
    if display_cls is not None and hasattr(display_cls, "from_predictions"):
        display_cls.from_predictions(yte, test_pred, cmap='YlGnBu')
    else:
        metrics.plot_confusion_matrix(model, xte, yte, cmap='YlGnBu')

    return ac_score


# **Random Forest**

In [None]:
print("Model Name : RandomForest")

# Baseline: random forest fitted directly on the training split, no resampling.
rf_settings = dict(
    n_estimators=200,
    criterion='gini',
    max_depth=10,
    min_samples_leaf=10,
    min_samples_split=10,
    random_state=42,
)
model_rf = RandomForestClassifier(**rf_settings)
rf_model_Acc = modelEval(X_train, y_train, X_test, y_test, model_rf)

# **RandomUnderSampler technique with StratifiedKFold**

# **Random Forest**

In [None]:
# Cross-validation splitter used by the grid search. It was referenced but
# never defined, which raises NameError on a fresh kernel.
# NOTE(review): 5 shuffled folds is an assumption - confirm the intended setup.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=110)

# Under-sample the majority class inside the pipeline so resampling is applied
# only to the training part of each CV fold, then fit a random forest.
pipeline = imbpipeline(steps=[
    ('underSample', RandomUnderSampler(random_state=110, sampling_strategy='majority')),
    ('classifier', RandomForestClassifier(random_state=110)),
])

# Hyper-parameter grid for the 'classifier' step.
param_grid = { "classifier__n_estimators":[200],
               "classifier__max_depth": [8,10],
               "classifier__min_samples_split":[10,12],
               "classifier__min_samples_leaf": [10,12],
               "classifier__criterion": ["gini", "entropy"]}

# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score; consider a recall- or F1-based scorer.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           cv=stratified_kfold,
                           n_jobs=3)


rf_model_Acc = modelEval(X_train,y_train,X_test,y_test,grid_search)


# **SMOTE technique with StratifiedKFold**

# **RandomForest**

In [None]:
# Cross-validation splitter used by the grid search. It was referenced but
# never defined, which raises NameError on a fresh kernel.
# NOTE(review): 5 shuffled folds is an assumption - confirm the intended setup.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=110)

# Over-sample the minority class with SMOTE inside the pipeline so synthetic
# rows are generated only from the training part of each CV fold.
pipeline = imbpipeline(steps=[
    ('smote', SMOTE(random_state=110)),
    ('classifier', RandomForestClassifier(random_state=110)),
])

# Single-candidate grid for the 'classifier' step.
param_grid = { "classifier__n_estimators":[150],
               "classifier__max_depth": [10],
               "classifier__min_samples_split":[12],
               "classifier__min_samples_leaf":[15],
               "classifier__criterion": ["gini"]}

# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score; consider a recall- or F1-based scorer.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           cv=stratified_kfold,
                           n_jobs=3)


rf_model_Acc = modelEval(X_train,y_train,X_test,y_test,grid_search)


# XGBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with library defaults, evaluated on the same train/test split.
xgb_classifier = XGBClassifier()
xgb_model_Acc = modelEval(
    X_train,
    y_train,
    X_test,
    y_test,
    xgb_classifier,
)


In [None]:
# Display names for the evaluated models; they are paired positionally with
# `recall_list` in the bar chart, so the order must match the order in which
# modelEval was called.
model_list = ["Random Forest","RandomForestUnderSampler","RandomForestSMOTE","XGBoost"]

In [None]:
# Bar chart comparing test-set recall across the evaluated models.
# Names and recalls are paired by position; fail with a clear message if a
# model cell was re-run (or skipped) and the two lists drifted apart.
assert len(model_list) == len(recall_list), (
    f"{len(model_list)} model names but {len(recall_list)} recall values; "
    "re-run the notebook from the modelEval definition"
)

col_pal = sns.color_palette("cool",n_colors=7)
plt.rcParams['figure.figsize']=15,6 
ax = sns.barplot(x=model_list, y=recall_list, palette = col_pal, saturation =1.5)
plt.xlabel("Classifier Models", fontsize = 20 )
plt.ylabel("% Recall", fontsize = 20)
plt.title("Recall of RandomForest and XGBoost Models", fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)
# Write each bar's recall as a percentage just above the bar. The bar
# coordinates use their own names so the notebook-level `x` (features) and
# `y` (target) are not overwritten by this loop.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    bar_x, bar_y = p.get_xy() 
    ax.annotate(f'{height:.2%}', (bar_x + width/2, bar_y + height*1.02), ha='center', fontsize = 'x-large')
plt.show()