# CREDIT CARD FRAUD DETECTION

In [1]:
# Silence all Python warnings for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by later cells
# (including any emitted while fitting models) are hidden too — consider
# narrowing it to the specific warning categories that are known to be noise.
from warnings import filterwarnings
filterwarnings('ignore')

## Load the dataset 

In [2]:
import pandas as pd

# Location of the raw transactions file, resolved against the current
# working directory.
DATA_PATH = 'creditcard.csv'

# Read the whole CSV into a single DataFrame.
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Perform basic data quality checks

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(284807, 31)

In [5]:
# List the column labels.
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [6]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

## Data Cleaning & Preprocessing

In [7]:
# Count missing values in each column.
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [8]:
# Data type of each column.
df.dtypes

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

In [9]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

1081

In [10]:
# Drop the 'Time' column, then remove rows that are exact duplicates across
# the remaining columns.
#
# Order matters: once 'Time' is gone, rows that differed only in 'Time' are
# treated as duplicates as well, so this removes more rows than the
# full-row duplicate count reported by df.duplicated().sum() above.
#
# The chain returns a new frame instead of mutating in place, and
# errors='ignore' skips the column drop when 'Time' is already absent, so the
# cell can be re-run safely without raising KeyError.
df = df.drop(columns=['Time'], errors='ignore').drop_duplicates()

print("Shape of DataFrame after removing duplicates: ", df.shape)


Shape of DataFrame after removing duplicates:  (275663, 30)


In [11]:
df.nunique()

V1        275663
V2        275663
V3        275663
V4        275663
V5        275663
V6        275663
V7        275663
V8        275663
V9        275663
V10       275663
V11       275663
V12       275663
V13       275663
V14       275663
V15       275663
V16       275663
V17       275663
V18       275663
V19       275663
V20       275663
V21       275663
V22       275663
V23       275663
V24       275663
V25       275663
V26       275663
V27       275663
V28       275663
Amount     32767
Class          2
dtype: int64

## Import necessary libraries

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [13]:
# Separate the label column from the predictor columns.
target_col = 'Class'
y = df[target_col]
X = df.drop(columns=[target_col])

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y. With so few positive rows,
# consider passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [15]:
# Apply SMOTE to the training data only; X_test / y_test are not resampled.
# SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic
# samples for the minority class. The fixed seed makes the resampling
# reproducible.
smote = SMOTE(random_state=101)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [16]:
# Candidate models keyed by the display name used in the report output.
# Every estimator is given the same seed so results are repeatable.
classifiers = dict([
    ("Logistic", LogisticRegression(random_state=101)),
    ("Random Forest", RandomForestClassifier(random_state=101)),
    ("Decision Tree", DecisionTreeClassifier(random_state=101)),
])

In [17]:
# Fit every candidate on the SMOTE-resampled training data, then evaluate it
# on the test split, which was not resampled.
for name, classifier in classifiers.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = classifier.fit(X_train_smote, y_train_smote).predict(X_test)

    # Per-class precision/recall/F1 followed by the confusion matrix.
    print(f"Classifier: {name}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

Classifier: Logistic
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     55049
           1       0.07      0.88      0.14        84

    accuracy                           0.98     55133
   macro avg       0.54      0.93      0.56     55133
weighted avg       1.00      0.98      0.99     55133

[[54112   937]
 [   10    74]]
Classifier: Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55049
           1       0.84      0.81      0.82        84

    accuracy                           1.00     55133
   macro avg       0.92      0.90      0.91     55133
weighted avg       1.00      1.00      1.00     55133

[[55036    13]
 [   16    68]]
Classifier: Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     55049
           1       0.34      0.76      0.47        84

    accuracy                           1.00    