# Importing the necessary libraries

In [None]:
import pandas as pd
from dateutil.parser import parse
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Reading the CreditCard dataset

In [None]:
# Load the CreditCard dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("creditcard.csv")

In [None]:
# Preview the first rows to sanity-check that the file was parsed as expected.
df.head()

# Checking for null values (the dataset has none)

In [None]:
# Count the missing values in each column.
df.isna().sum()

# Exploratory Data Analysis

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) as a rich table.
display(df.describe())

The count plot below shows that the dataset is highly imbalanced: the vast majority of rows have Class = 0.

In [None]:
# Bar chart of the number of rows in each target class.
sns.countplot(data=df, x='Class')
plt.show()

Using a pairplot to see the relationships between a few selected variables, coloured by Class

In [None]:
# Pairwise plots for a handful of features, coloured by the target class.
sns.set(style="ticks")
pairplot_columns = ["V1", "V3", "V8", "Class"]
sns.pairplot(df[pairplot_columns], hue="Class")

Correlation Matrix

In [None]:
# Pairwise correlation between all columns, rendered as a heatmap whose
# tick labels on both axes are the column names.
correlation_matrix = df.corr()
axis_labels = correlation_matrix.columns.values
sns.heatmap(correlation_matrix, xticklabels=axis_labels, yticklabels=axis_labels)

# Feature Engineering

Dropping the Time column from the dataset, as it seems irrelevant to the target variable

In [None]:
# Drop the Time column. errors='ignore' keeps this cell idempotent: re-running it
# after Time has already been removed is a no-op instead of raising KeyError.
df = df.drop(columns=['Time'], errors='ignore')

Identifying the numerical, categorical and binary columns ahead of scaling

In [None]:

def data_type(dataset):
    """Split the columns of ``dataset`` into numerical and categorical names.

    A column counts as numerical when its dtype is ``float64`` or ``int64``;
    every other column counts as categorical.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(numerical, categorical)`` column names, each in column order.
    """
    numerical = []
    categorical = []
    # Inspect the frame that was passed in rather than the notebook-global
    # ``df``, so the function works on any DataFrame.
    for column, column_dtype in dataset.dtypes.items():
        if column_dtype == "float64" or column_dtype == "int64":
            numerical.append(column)
        else:
            categorical.append(column)
    return numerical, categorical

            
# Classify every column of df as numerical or categorical based on its dtype.
numerical,categorical=data_type(df)
#removing the binary columns from numerical list for scaling
def binary_columns(dataset):
    """Return the names of the int/float columns that hold only 0 and 1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect; only columns selected by
        ``select_dtypes(include=['int', 'float'])`` are considered.

    Returns
    -------
    list
        Names of the columns whose unique values are all in ``{0, 1}``,
        in column order.
    """
    binary_cols = []
    for col in dataset.select_dtypes(include=['int', 'float']).columns:
        # Read from the frame that was passed in, not the notebook-global ``df``.
        unique_values = dataset[col].unique()
        # np.isin is the supported replacement for the deprecated np.in1d.
        if np.isin(unique_values, [0, 1]).all():
            binary_cols.append(col)
    return binary_cols

# Find the 0/1 columns and take each of them out of the list of
# numerical columns that will be scaled.
binary_cols = binary_columns(df)

for binary_col in binary_cols:
    numerical.remove(binary_col)

# Scaling the numerical (non-binary) columns of the entire dataset

In [None]:
from sklearn.preprocessing import StandardScaler

def feature_scaling(dataset,numerical):
    """Standardise the ``numerical`` columns of ``dataset`` with StandardScaler.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns to scale. It is not modified.
    numerical : list
        Names of the columns to standardise.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` in which the ``numerical`` columns have been
        replaced by their standardised values; other columns are unchanged.

    Notes
    -----
    The scaler is fitted on whatever frame is passed in. Fitting it on data
    that also contains the test rows lets test-set statistics leak into
    training, so prefer fitting on the training split only.
    """
    # Work on a copy so the caller's frame (possibly already displayed in an
    # earlier cell) is not silently changed.
    scaled = dataset.copy()
    numerical = list(numerical)
    if not numerical:
        # StandardScaler cannot be fitted on zero columns; nothing to scale.
        return scaled
    sc_x = StandardScaler()
    scaled[numerical] = sc_x.fit_transform(scaled[numerical])
    return scaled

# Rebind df to the frame returned by feature_scaling for the columns listed in `numerical`.
df=feature_scaling(df,numerical)
    

In [None]:
# Preview the first rows after scaling.
df.head()

Splitting the data into input (X) and target (y) variables

In [None]:
# Inputs: every column except the target. Target: Class, kept as a one-column DataFrame.
X = df.drop(columns=['Class'])
y = df.loc[:, ['Class']]

# Splitting the data into train and test datasets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. stratify=y keeps the share of each
# Class value the same in the train and test partitions, which a purely random
# split of a highly imbalanced target does not guarantee.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score


# Baseline random forest trained on the original (unbalanced) training split.
# random_state is fixed so the fitted forest, and every metric derived from it,
# is reproducible after Restart Kernel -> Run All.
rf_model = RandomForestClassifier(n_estimators=50, random_state=0)
rf_model.fit(X_train, y_train.values.ravel())

# Mean accuracy on the training and test splits.
acc_score_train = rf_model.score(X_train, y_train)
acc_score_test = rf_model.score(X_test, y_test)
print(f'Accuracy of model on training dataset :- {acc_score_train}')
print(f'Accuracy of model on test dataset :- {acc_score_test}')


        



Prediction of results using test dataset

In [None]:

y_pred = rf_model.predict(X_test)

# Classification report for f1-score

print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")
# ROC AUC of the hard 0/1 predictions (a single operating point).
print(f"AROC score :- \n {roc_auc_score(y_test, y_pred)}")

# ROC AUC is a ranking metric, so also report it on the predicted probability
# of class 1 (second column of predict_proba), which uses every threshold.
y_proba = rf_model.predict_proba(X_test)[:, 1]
print(f"AROC score (predicted probabilities) :- \n {roc_auc_score(y_test, y_proba)}")


# Confusion Matrix

In [None]:
# Confusion matrix of the test-set predictions, annotated with the raw counts.
cm_baseline = confusion_matrix(y_test, y_pred)
sns.heatmap(cm_baseline, annot=True, fmt='.5g')

Visualizing the Precision-Recall Curve

In [None]:
# A precision-recall curve needs continuous scores: hard 0/1 predictions give
# only a single operating point. Use the predicted probability of class 1.
y_scores = rf_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
fig, ax = plt.subplots()
ax.plot(recall, precision, color='blue')
#add axis labels to plot
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
#display plot
plt.show()

# Need to apply balancing to this highly imbalanced dataset
1. The training score on the original dataset is 99.9%.

    This means that the model has overfitted and has memorized the training data.
    This has happened purely because more than 99% of the values of the Class attribute in the dataset are 0.

2. To tackle this problem, we will use the SMOTE oversampling method.

    Please keep in mind that we are not going with random undersampling or random oversampling.

    With random oversampling, we add randomly chosen copies of minority-class examples to the data.
    This may increase the likelihood of overfitting.

    With random undersampling, we delete data from the majority class.
    This can be highly problematic, as the loss of such data can make the decision boundary
    between minority and majority instances harder to learn, resulting in a loss in classification performance.

3. Hence we are going with SMOTE (Synthetic Minority Over-sampling Technique).

    It is an oversampling technique in which synthetic samples are generated for the minority class.
    This algorithm helps to overcome the overfitting problem posed by random oversampling.
    



Applying SMOTE

In [None]:
from imblearn.over_sampling import SMOTE 

# Oversample the training split only; X_test / y_test are left untouched.
sm = SMOTE(sampling_strategy=0.9, k_neighbors=3, random_state=100)
y_train_flat = y_train.values.ravel()
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train_flat)

# Print the oversampling results
print(f"\n\t After applying SMOTE ,the shape of  X_train: {X_train_SMOTE.shape}")
print(f"\n\t After applying SMOTE ,the shape of y_train: {y_train_SMOTE.shape}")

# Number of resampled training rows in each class.
positive_count = (y_train_SMOTE == 1).sum()
negative_count = (y_train_SMOTE == 0).sum()
print("After applying SMOTE, count '1': {}".format(positive_count))
print("After applying SMOTE, count '0': {}".format(negative_count))

Training balanced data

We have done hyperparameter tuning to control overfitting on the training dataset

In [None]:
# Random forest with tightly constrained trees, trained on the SMOTE-resampled data.
rf_model_SMOTE = RandomForestClassifier(
    n_estimators=30,
    max_depth=2,
    max_leaf_nodes=5,
    min_samples_split=3,
    random_state=22,
)
rf_model_SMOTE.fit(X_train_SMOTE, y_train_SMOTE.ravel())

# Mean accuracy on the resampled training data and on the untouched test split.
acc_score_train_SMOTE = rf_model_SMOTE.score(X_train_SMOTE, y_train_SMOTE)
acc_score_test_SMOTE = rf_model_SMOTE.score(X_test, y_test)

print(f'Accuracy of model on training dataset after SMOTE :- {acc_score_train_SMOTE}')
print(f'Accuracy of model on test dataset after SMOTE:- {acc_score_test_SMOTE}')



In [None]:
y_pred_SMOTE = rf_model_SMOTE.predict(X_test)

# classification report for f1-score

print(f"Classification Report :- \n {classification_report(y_test, y_pred_SMOTE)}")
# ROC AUC of the hard 0/1 predictions (a single operating point).
print(f"AROC score :- \n {roc_auc_score(y_test, y_pred_SMOTE)}")

# ROC AUC is a ranking metric, so also report it on the predicted probability
# of class 1 (second column of predict_proba), which uses every threshold.
y_proba_SMOTE = rf_model_SMOTE.predict_proba(X_test)[:, 1]
print(f"AROC score (predicted probabilities) :- \n {roc_auc_score(y_test, y_proba_SMOTE)}")

The ROC AUC score has improved with this model, which shows that the model is predicting better now.
We would like this score to be as close to 1 as possible.

# Confusion Matrix on Balanced data

In [None]:
# Confusion matrix of the SMOTE model's test-set predictions, annotated with the raw counts.
cm_smote = confusion_matrix(y_test, y_pred_SMOTE)
sns.heatmap(cm_smote, annot=True, fmt='.5g')

In [None]:
from sklearn.metrics import precision_recall_curve, auc
# A precision-recall curve needs continuous scores: hard 0/1 predictions give
# only a single operating point. Use the predicted probability of class 1.
y_scores_SMOTE = rf_model_SMOTE.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores_SMOTE)

fig, ax = plt.subplots()
ax.plot(recall, precision, color='blue')
#add axis labels to plot
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
#display plot
plt.show()



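In the above curve, the point at recall = 1 corresponds to the lowest threshold (0.0), where every sample is predicted as Class 1.
The closer the curve gets to the top-right corner (1, 1), the higher precision and recall are at the same time, and the better the model distinguishes the two classes.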