<a href="https://colab.research.google.com/github/DSShail/Machine-Learning/blob/main/credit_card_fraud_detection_2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from statsmodels.graphics.gofplots import qqplot

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,accuracy_score,classification_report
!pip install statsmodels

# 1. Data Acquisition

*Reading the CSV file into a dataframe*

In [None]:
# Load the transactions from the CSV file located in the working directory.
credit_df = pd.read_csv(filepath_or_buffer='creditcard.csv')

*Dropping rows that contain missing (NA) values*

# 2. Data Preprocessing

**we will check if the data needs transformation**
*  label encoding
*  ordinal encoding
*  column transformation
*  function transformer
*  power transformer




In [None]:
# Drop every row that has at least one missing value. The result is rebound to
# the same name instead of mutating in place (inplace=True gives no speed-up
# and makes cell state harder to reason about on re-runs).
credit_df = credit_df.dropna()

# 3. Data Visualization

**Plotting heatmap to check the relationships in the dataset**

In [None]:
# Pairwise correlation between all columns of the dataset.
corelation_matrix = credit_df.corr()

# Draw the matrix as an un-annotated heatmap on a wide figure; sns.heatmap
# returns the Axes it drew on, which is used to set the title.
plt.figure(figsize=(14, 8))
sns.heatmap(
    data=corelation_matrix,
    cmap='coolwarm',
    annot=False,
    linewidths=1.5,
).set_title('Corelation heatmap')
plt.show()

*Splitting the data into training and test sets*

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the 'Class' column.
X = credit_df.drop(columns=['Class'])
y = credit_df['Class']

# Hold out 20% of the rows for testing with a fixed seed. stratify=y keeps the
# class proportions identical in both splits, which matters because the target
# is imbalanced: an unstratified split can leave the test set with too few
# minority-class rows for the per-class metrics to be meaningful.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)


*Looking at the data structure(column wise)*

*we will use QQ plot for looking at the data distribution*


In [None]:
# For every training feature, show its distribution (left) next to a
# Q-Q plot (right) so departures from normality are easy to spot.
for col in X_train.columns:
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is its documented replacement and draws the same kind of figure.
    sns.histplot(X_train[col], kde=True, stat='density', ax=ax[0])
    ax[0].set_title(f'KDE Plot for {col}')

    # line='s' adds the standardized reference line to the Q-Q plot.
    qqplot(X_train[col], line='s', ax=ax[1])
    ax[1].set_title(f'QQ Plot for {col}')

    plt.show()
    # Release the figure; one figure per column otherwise accumulates in memory.
    plt.close(fig)

# Softmax Regression-Logistic Regression

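*We will apply logistic regression because fraud detection is a classification problem. Softmax (multinomial) regression generalizes logistic regression to more than two classes; with a two-class target (fraud vs. non-fraud) it reduces to ordinary binary logistic regression.*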

we need to scale the input features before fitting into the model

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits.
scalar = StandardScaler()
scalar.fit(X_train)

X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

*The hyper-parameter used in the softmax regression is*
1.   max_iter




In [None]:
from sklearn.linear_model import LogisticRegression

# The `multi_class` argument is intentionally not passed: it is deprecated in
# recent scikit-learn releases, and the default already uses the multinomial
# (softmax) formulation whenever the target has more than two classes.
LoR = LogisticRegression(max_iter=1000)
LoR.fit(X_train_scaled, y_train)
y_pred_lor = LoR.predict(X_test_scaled)

# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round swaps precision and recall in the classification report.
print(accuracy_score(y_test, y_pred_lor))
print(classification_report(y_test, y_pred_lor))

*   The recall is 77 percent
*   The precision for fraud is 69 percent



# Decision Tree

In [None]:
# Search space for the decision-tree grid search
# (2 criteria x 5 depths x 2 leaf sizes x 3 split sizes = 60 combinations).
parmas = dict(
    criterion=['gini', 'entropy'],
    max_depth=[5, 10, 15, 20, 25],
    min_samples_leaf=[1, 5],
    min_samples_split=[2, 5, 10],
)

**Applying GridSearchCV on DecisionTreeClassifier**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs of the search give the same result.
dt_clf = DecisionTreeClassifier(random_state=0)

# Exhaustive 5-fold search over `parmas`.
# scoring='f1': the default score for a classifier is accuracy, which on an
# imbalanced target is dominated by the majority class and barely separates
# the candidates. Assumes 'Class' is binary with 1 marking fraud - TODO confirm.
# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=dt_clf,
    param_grid=parmas,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

*We will use the output of GridSearchCV to tune the decision tree hyperparameters*

*   Hyperparameters used in the decision tree are
  1.   criterion
  2.   max_depth
  3.   min_samples_leaf
  4.   min_samples_split

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree built from the hyperparameters selected by the grid search.
# Previously only criterion='gini' was set, so the search result was ignored.
# NOTE: `grid_search` must still refer to the decision-tree search when this
# cell runs (true for a top-to-bottom run; the name is reused further down
# for the random-forest search).
decision_tree_clf = DecisionTreeClassifier(**grid_search.best_params_, random_state=0)
decision_tree_clf = decision_tree_clf.fit(X_train, y_train)
y_pred_dt = decision_tree_clf.predict(X_test)

*Calculating the accuracy and classification report*

In [None]:
from sklearn.metrics import classification_report,accuracy_score

# Per-class precision / recall / F1 for the decision tree on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred_dt))

# Overall fraction of test rows whose prediction matches the label.
print(accuracy_score(y_true=y_test, y_pred=y_pred_dt))



*   The F1 score is 1, which would mean the model is ideal
*   The accuracy is also 100 percent



*The biggest concern is the data - the dataset is imbalanced so we need to work on imbalanced dataset*

# Random Forest - Bagging Ensemble

*Applying GridSearchCV on RandomForest*

In [None]:
# Search space for the random-forest grid search
# (2 criteria x 4 forest sizes x 5 depths x 2 leaf sizes x 3 split sizes = 240 combinations).
rf_parmas = dict(
    criterion=['gini', 'entropy'],
    n_estimators=[5, 10, 15, 20],
    max_depth=[5, 10, 15, 20, 25],
    min_samples_leaf=[1, 5],
    min_samples_split=[2, 5, 10],
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs of the search give the same result.
rf1 = RandomForestClassifier(random_state=0)

# Exhaustive 5-fold search over `rf_parmas`.
# scoring='f1': the default score for a classifier is accuracy, which on an
# imbalanced target is dominated by the majority class and barely separates
# the candidates. Assumes 'Class' is binary with 1 marking fraud - TODO confirm.
# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=rf1,
    param_grid=rf_parmas,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

*   Hyperparameters used for the random forest are
  1.  n_estimators

*   Hyperparameters used in the decision trees of the random forest are
  1.   criterion
  2.   max_depth
  3.   min_samples_leaf
  4.   min_samples_split





In [None]:

# Random forest with explicitly chosen hyperparameters. random_state makes the
# bootstrap sampling and feature sub-sampling reproducible between runs.
rf2 = RandomForestClassifier(
    n_estimators=5,
    criterion='gini',
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=0,
)
rf2.fit(X_train, y_train)
y_pred = rf2.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round swaps precision and recall in the classification report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Feature Importance using random Forest

In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# Drop incomplete rows once, on the whole frame, so that features and target
# always stay row-aligned. (Dropping NA from X and y separately can leave them
# with different lengths, and mutating a column of credit_df in place is unsafe.)
complete_rows = credit_df.dropna()

# Separate features and target variable
X = complete_rows.drop('Class', axis=1)
y = complete_rows['Class']

# Apply SMOTE for oversampling the minority class.
# NOTE(review): SMOTE is fitted on the FULL dataset here, before any train/test
# split. Any split made afterwards from X_resampled / y_resampled will contain
# synthetic rows interpolated from rows on the other side of the split, so
# scores computed on it are optimistic. Splitting first and resampling only the
# training part would avoid this, but requires changing the cells that follow.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Print compact summaries rather than dumping the full resampled data.
print("X_resampled: ", X_resampled.shape)
print("y_resampled: ", y_resampled.value_counts().to_dict())

# Fit a random forest on the resampled data to obtain feature importances.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)

# Importance of each feature, in the same order as X_resampled.columns.
feature_importances = rf.feature_importances_

# Pair each importance with its column name and sort from most to least important.
feature_importances_sorted = sorted(zip(feature_importances, X_resampled.columns), reverse=True)

for importance, feature in feature_importances_sorted:
    print(f"Feature: {feature}, Importance: {importance}")

*We will select the three most relevant features after applying feature importance. Those features are given below*
1.   V14
2.   V10
3.   V12

We will create X_selected (a dataframe containing only the columns above) and y_selected (the matching target), and then rerun the ML algorithm on them.



In [None]:
# Keep only the three selected feature columns; the target is used unchanged.
selected_columns = ['V14', 'V12', 'V10']
X_selected = X_resampled.loc[:, selected_columns]
y_selected = y_resampled

*splitting the data into training set and test set*

In [None]:
# Hold out 20% of the selected-feature data for evaluation, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y_selected,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression after feature importance

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Fit the scaler on the training split only and reuse it for the test split.
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

# The `multi_class` argument is intentionally not passed: it is deprecated in
# recent scikit-learn releases, and the default already uses the multinomial
# (softmax) formulation whenever the target has more than two classes.
LoR = LogisticRegression(max_iter=1000)
LoR.fit(X_train_scaled, y_train)
y_pred_lor = LoR.predict(X_test_scaled)

# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round swaps precision and recall in the classification report.
print(accuracy_score(y_test, y_pred_lor))
print(classification_report(y_test, y_pred_lor))