In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1.Importing the libraries

In [None]:
import matplotlib.pyplot as plt                                     # Importing the pyplot interface of matplotlib
import seaborn as sns                                               # Importing the seaborn library for statistical visualization
%matplotlib inline
from sklearn.preprocessing import StandardScaler                    # To scale data to mean 0 and variance 1
from sklearn.linear_model import LogisticRegression                 # To implement logistic regression classifier
from sklearn.model_selection import train_test_split                # To split the data into training and testing parts
from sklearn.tree import DecisionTreeClassifier                     # To implement decision tree classifier
from sklearn.metrics import classification_report                   # To generate classification report
from sklearn.metrics import confusion_matrix                        # To compute the confusion matrix
from sklearn.metrics import accuracy_score                          # To compute the accuracy score
from sklearn.preprocessing import LabelEncoder                      # Label encoder (not used in the cells shown here)
#-------------------------------------------------------------------------------------------------------------------------------
import warnings                                                     # Importing warnings to silence warning output
warnings.filterwarnings("ignore")                                   # NOTE: hides ALL warnings, including deprecation and convergence warnings

## 2. Data Acquisition & Description

In [None]:
#Lets look at our dataset
data = pd.read_csv('/kaggle/input/credit-card-fraud-detection-dataset-2023/creditcard_2023.csv')

In [None]:
print('Shape of our Dataset -',data.shape)
data.head()

In [None]:
#Lets see Data Information
data.info()

In [None]:
#Lets see data description
data.describe()

**Observations**-
- We have 568630 Rows of observations having 31 columns.
- **'Class' is our Output feature indicating whether the transaction is fraudulent (1) or not (0).**
- No missing values observed in our Dataset.
- dtype of all the features looks perfect.

In [None]:
#Data Preprocessing

In [None]:
#Lets check for missing values
data.isna().sum()

In [None]:
# Lets check for duplicates if any
data.duplicated().any()

In [None]:
data.info()

**Observations**
- No missing values.
- No duplicates.
- dtype also looks fine.

## Exploratory Data Analysis

In [None]:
#Lets look at the Heatmap First

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `data`.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr_matrix, cmap='crest', annot=True, ax=ax)
plt.show()

**Observations**
- A few features are highly correlated with other features.
- V17 and V18 are highly correlated.
- V16 and V17 are highly correlated.
- V14 has a negative correlation with V4.
- V12 is also negatively correlated with V10 and V11.
- V11 is negatively correlated with V10 and positively with V4.
- V3 is positively correlated with V10 and V12.
- V9 and V10 are also positively correlated.

In [None]:
#Lets look the distribution using a pairplot

In [None]:
#sns.pairplot(data=data,hue='Class')

**Observations**
- Amount is almost normally distributed.

In [None]:
#Let's check the skewness of our features

In [None]:
data.skew() 

**Observations**
- Features like V1,V10,V23 are highly negatively skewed. 
- We will see the distribution of some of these features.

In [None]:
#Lets See the distribution of 'amount feature'

In [None]:
data['Amount'].plot.box()

In [None]:
# Kernel density estimate of the transaction amount, drawn as a filled curve.
# `fill=True` is the supported spelling: seaborn deprecated `shade=` in favour of
# `fill=`, and the global warnings filter in this notebook hides that deprecation.
sns.kdeplot(data=data['Amount'], fill=True)
plt.show()

**Observations**:
- Amount is fairly Normally distributed.

In [None]:
#Let's look at features V1, V10, V23 and V12

In [None]:
# Lets plot a histogram
paper, axes = plt.subplots(2, 2, figsize=(10, 6))
data['V1'].plot(kind='hist', ax=axes[0,0], title='Distribution of V1')
data['V10'].plot(kind='hist', ax=axes[0,1], title='Distribution of V10')
data['V12'].plot(kind='hist', ax=axes[1,0], title='Distribution of V12')
data['V23'].plot(kind='hist', ax=axes[1,1], title='Distribution of V23')
plt.suptitle('Distribution of V1,V10,V12 and V23',size=14)
plt.tight_layout()
plt.show()

In [None]:
#Lets look at our Output feature

In [None]:
# Pie chart of how many rows fall into each value of the `Class` column.
class_counts = data['Class'].value_counts()
class_counts.plot.pie(
    explode=[0.1, 0],      # pull the first slice slightly out of the pie
    autopct='%3.1f%%',     # label each slice with its percentage
    shadow=True,
    legend=True,
    startangle=45,
)
plt.title('Distribution of Class', size=14)
plt.show()

**Observations**
- Our output feature is equally balanced.

## Data Preparation

In [None]:
# Lets prepare our data for our Model

In [None]:
data.head()

In [None]:
#Let's divide our data into dependent and independent features

In [None]:
# Independent features: every column except `id` and `Class`.
x = data.drop(columns=['id', 'Class'])
# Dependent feature: the `Class` column.
y = data['Class']

In [None]:
x.head()

In [None]:
print('Shape of x',x.shape)
print('Shape of y',y.shape)

In [None]:
#Lets standardize all our features to bring them on a same scale.
#I have used standard scaler

In [None]:
sc = StandardScaler()

In [None]:
x_scaled = sc.fit_transform(x) 

In [None]:
x_scaled_df = pd.DataFrame(x_scaled,columns=x.columns)

In [None]:
x_scaled_df.head()

## Modelling

In [None]:
#Lets Split our dataset into train and test

In [None]:
# Split the *unscaled* features first, then fit the scaler on the training rows
# only. Fitting StandardScaler on the full dataset before splitting lets the
# test rows influence the mean/variance used for preprocessing (data leakage).
# The split arguments are unchanged, so the row partition is the same as before.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=15, stratify=y
)

train_scaler = StandardScaler().fit(x_train_raw)

# Re-wrap the scaled arrays as DataFrames, keeping column names and row index.
x_train = pd.DataFrame(train_scaler.transform(x_train_raw),
                       columns=x.columns, index=x_train_raw.index)
x_test = pd.DataFrame(train_scaler.transform(x_test_raw),
                      columns=x.columns, index=x_test_raw.index)

In [None]:
#Lets see the shapes
# Shapes are printed in the order: x_train, x_test, y_train, y_test.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

### Logistic Regression

In [None]:
#Lets build a Logistic Regression Model

In [None]:
lr=LogisticRegression()
lr.fit(x_train,y_train)

In [None]:
#Lets define a function for Checking Model Accuracy,Classification Report and Confusion Matrix

In [None]:
def model_eval(actual, predicted, digits=4):
    """Print accuracy, confusion matrix and classification report for a model.

    Parameters
    ----------
    actual : array-like
        Ground-truth class labels.
    predicted : array-like
        Predicted class labels, aligned element-wise with ``actual``.
    digits : int, default 4
        Number of decimal places used for the accuracy and for the values in
        the classification report. Two decimals is too coarse here: any
        accuracy >= 0.995 would be printed as 1.0 and read as "100%".

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    acc_score = accuracy_score(actual, predicted)
    conf_matrix = confusion_matrix(actual, predicted)
    clas_rep = classification_report(actual, predicted, digits=digits)
    print('Model Accuracy is: ', round(acc_score, digits))
    print(conf_matrix)
    print(clas_rep)

In [None]:
preds_lr_train = lr.predict(x_train)
preds_lr_test = lr.predict(x_test)

In [None]:
#Lets see the Evaluation matrix of train and test dataset

In [None]:
print('-------Training Accuracy---------')
model_eval(y_train,preds_lr_train)


In [None]:
print('-------Test Accuracy---------')
model_eval(y_test, preds_lr_test)

**Observations**
- Our Logistic Regression model gives 96% accuracy.
- Lets see tree-based models.

###  Decision Tree

In [None]:
#Lets build Decision tree Model and fit 

In [None]:
# Seed the tree so that repeated runs build the same tree and report the same
# metrics (the same seed value is used for the train/test split).
dtree = DecisionTreeClassifier(random_state=15)
dtree.fit(x_train,y_train)

In [None]:
preds_dtree_train = dtree.predict(x_train)
preds_dtree_test = dtree.predict(x_test)

In [None]:
print('-------Training Accuracy---------')
model_eval(y_train,preds_dtree_train)

In [None]:
print('-------Test Accuracy---------')
model_eval(y_test,preds_dtree_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so results are reproducible across runs; n_jobs=-1 trains the
# trees on all available cores and does not change the fitted model.
rf = RandomForestClassifier(random_state=15, n_jobs=-1)
rf.fit(x_train, y_train)

In [None]:
preds_rf_train = rf.predict(x_train)
preds_rf_test = rf.predict(x_test)

In [None]:
print('-------Training Accuracy---------')
model_eval(y_train, preds_rf_train)

In [None]:
print('-------Test Accuracy---------')
model_eval(y_test, preds_rf_test)

**Observations**:
- Random Forest with default parameters is giving 100% accuracy on both the test and train datasets.

In [None]:
import xgboost as xgb

In [None]:
# Use the gradient-boosting estimator. XGBRFClassifier is XGBoost's random-forest
# variant, whereas this notebook describes the model as boosting and later tunes
# boosting hyperparameters (learning_rate, n_estimators) on this estimator.
# The seed makes repeated runs reproducible.
xgclf = xgb.XGBClassifier(random_state=15)
xgclf.fit(x_train,y_train)

In [None]:
preds_xgb_train = xgclf.predict(x_train)
preds_xgb_test = xgclf.predict(x_test)

In [None]:
print('-------Training Accuracy---------')
model_eval(y_train,preds_xgb_train)

In [None]:
print('-------Test Accuracy---------')
model_eval(y_test,preds_xgb_test)

## Hyperparameter Tuning

In [None]:
#We can also try and do some hyperparameter tuning to select the best parameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
#Hyperparameter tuning for XGBoost

In [None]:
param_dist_xgb = {
    'n_estimators': [50,100,150,200,300,400],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6]
}

In [None]:
# Randomized search over `param_dist_xgb`, using `xgclf` as the base estimator.
# random_state fixes which parameter combinations are sampled, so the reported
# best parameters are reproducible between runs.
xgb_clf = RandomizedSearchCV(xgclf, param_dist_xgb, verbose=2, random_state=15)

In [None]:
xgb_clf.fit(x_train,y_train)

In [None]:
# If you want to save some time skip the above step. It took very long to run.
#You can check the best parameters and use directly with the above XG Boost model.

In [None]:
#Best Hyper paramters for XG Boost
print('Best Parameters for XG Boost :',xgb_clf.best_params_)

In [None]:
preds_xgb_clf_train = xgb_clf.predict(x_train)
preds_xgb_clf_test = xgb_clf.predict(x_test)

In [None]:
print('-------Training Accuracy---------')
model_eval(y_train,preds_xgb_clf_train)

In [None]:
print('-------Test Accuracy---------')
model_eval(y_test,preds_xgb_clf_test)

## Conclusion

- We have  done Exploratory Data analysis for different features.
- We prepared our Data and build different ML Models.
- We have seen how different models are performing w.r.t Accuracy,Precision,Recall and F1 Scores.
- Random Forest with default parameters is giving 100% accuracy on training and test dataset.
- We have tried using the boosting technique XGBoost, and we have a model with 97% accuracy with improvement in **False Positives and False Negatives.**
- We have further tried hyperparameter tuning for XGBoost. We can experiment with different parameters to see whether the FP and FN counts can be reduced further.


> If you have any query you can reach out to me on anmol.1591@outlook.com.
> - **If you like this Notebook pls give me an upvote.**

                        Thank you