# CREDIT CARD FRAUD DETECTION

1. Build a machine learning model to identify fraudulent credit card
transactions.
2. Preprocess and normalize the transaction data, handle class
imbalance issues, and split the dataset into training and testing sets.
3. Train a classification algorithm, such as logistic regression or random
forests, to classify transactions as fraudulent or genuine.
4. Evaluate the model's performance using metrics like precision, recall, and F1-score, and consider techniques like oversampling or
undersampling for improving results.

In [3]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#loading and reading 10 rows of data
dt = pd.read_csv('C:/Studies/Codsoft/Task5-CreditCard_Fraud_Detection/creditcard.csv')
dt.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [5]:
# looking into data information
dt.info()
# all records are complete, no NAN values seen

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
# looking into statistical information of data by setting float limit to 3 decimal points and displaying all columns
pd.set_option('display.float_format',lambda x: '%.3f' % x)
pd.set_option('display.max_columns', 100)
dt.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.86,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,88.35,0.002
std,47488.146,1.959,1.651,1.516,1.416,1.38,1.332,1.237,1.194,1.099,1.089,1.021,0.999,0.995,0.959,0.915,0.876,0.849,0.838,0.814,0.771,0.735,0.726,0.624,0.606,0.521,0.482,0.404,0.33,250.12,0.042
min,0.0,-56.408,-72.716,-48.326,-5.683,-113.743,-26.161,-43.557,-73.217,-13.434,-24.588,-4.797,-18.684,-5.792,-19.214,-4.499,-14.13,-25.163,-9.499,-7.214,-54.498,-34.83,-10.933,-44.808,-2.837,-10.295,-2.605,-22.566,-15.43,0.0,0.0
25%,54201.5,-0.92,-0.599,-0.89,-0.849,-0.692,-0.768,-0.554,-0.209,-0.643,-0.535,-0.762,-0.406,-0.649,-0.426,-0.583,-0.468,-0.484,-0.499,-0.456,-0.212,-0.228,-0.542,-0.162,-0.355,-0.317,-0.327,-0.071,-0.053,5.6,0.0
50%,84692.0,0.018,0.065,0.18,-0.02,-0.054,-0.274,0.04,0.022,-0.051,-0.093,-0.033,0.14,-0.014,0.051,0.048,0.066,-0.066,-0.004,0.004,-0.062,-0.029,0.007,-0.011,0.041,0.017,-0.052,0.001,0.011,22.0,0.0
75%,139320.5,1.316,0.804,1.027,0.743,0.612,0.399,0.57,0.327,0.597,0.454,0.74,0.618,0.663,0.493,0.649,0.523,0.4,0.501,0.459,0.133,0.186,0.529,0.148,0.44,0.351,0.241,0.091,0.078,77.165,0.0
max,172792.0,2.455,22.058,9.383,16.875,34.802,73.302,120.589,20.007,15.595,23.745,12.019,7.848,7.127,10.527,8.878,17.315,9.254,5.041,5.592,39.421,27.203,10.503,22.528,4.585,7.52,3.517,31.612,33.848,25691.16,1.0


In [7]:
# copying data such that original is not affected and selecting columns that are relevent
data = dt.copy(deep= True)

In [8]:
# dropping duplicate records
data.drop_duplicates(keep='first', inplace = True)

In [9]:
# since amount column has high standard deviation we need to preprocess it using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data['Amount'].values.reshape(-1,1))

In [10]:
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0
mean,94811.078,0.006,-0.004,0.002,-0.003,0.002,-0.001,0.002,-0.001,-0.002,-0.001,0.0,-0.001,0.001,0.0,0.001,0.001,0.0,0.002,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.002,0.001,-0.0,0.002
std,47481.048,1.948,1.647,1.509,1.414,1.377,1.332,1.228,1.179,1.095,1.076,1.019,0.995,0.995,0.952,0.915,0.874,0.843,0.837,0.813,0.77,0.724,0.725,0.624,0.606,0.521,0.482,0.396,0.328,1.0,0.041
min,0.0,-56.408,-72.716,-48.326,-5.683,-113.743,-26.161,-43.557,-73.217,-13.434,-24.588,-4.797,-18.684,-5.792,-19.214,-4.499,-14.13,-25.163,-9.499,-7.214,-54.498,-34.83,-10.933,-44.808,-2.837,-10.295,-2.605,-22.566,-15.43,-0.353,0.0
25%,54204.75,-0.916,-0.6,-0.89,-0.85,-0.69,-0.769,-0.553,-0.209,-0.644,-0.536,-0.762,-0.406,-0.648,-0.426,-0.581,-0.467,-0.484,-0.498,-0.456,-0.211,-0.228,-0.543,-0.162,-0.354,-0.317,-0.327,-0.071,-0.053,-0.331,0.0
50%,84692.5,0.02,0.064,0.18,-0.022,-0.053,-0.275,0.041,0.022,-0.053,-0.093,-0.032,0.139,-0.013,0.05,0.049,0.067,-0.066,-0.002,0.003,-0.062,-0.029,0.007,-0.011,0.041,0.016,-0.052,0.001,0.011,-0.265,0.0
75%,139298.0,1.316,0.8,1.027,0.74,0.612,0.397,0.57,0.326,0.596,0.454,0.74,0.617,0.663,0.492,0.65,0.524,0.399,0.502,0.459,0.133,0.186,0.528,0.148,0.44,0.351,0.24,0.091,0.078,-0.044,0.0
max,172792.0,2.455,22.058,9.383,16.875,34.802,73.302,120.589,20.007,15.595,23.745,12.019,7.848,7.127,10.527,8.878,17.315,9.254,5.041,5.592,39.421,27.203,10.503,22.528,4.585,7.52,3.517,31.612,33.848,102.248,1.0


In [11]:
# dropping time column as there is no relevance of transaction time with fraud detection
data.drop(columns= 'Time', inplace= True)

In [12]:
# data information after dropping duplicate records
data.count()
# 1081 records dropped

V1        283726
V2        283726
V3        283726
V4        283726
V5        283726
V6        283726
V7        283726
V8        283726
V9        283726
V10       283726
V11       283726
V12       283726
V13       283726
V14       283726
V15       283726
V16       283726
V17       283726
V18       283726
V19       283726
V20       283726
V21       283726
V22       283726
V23       283726
V24       283726
V25       283726
V26       283726
V27       283726
V28       283726
Amount    283726
Class     283726
dtype: int64

In [13]:
# gives frequency of unique values in a column/s to find if data is balanced or imbalanced
from collections import Counter
Counter(data['Class'])
# highly imbalanced in nature, thus we will approach using 2 ways: Undersampling and Oversamping

Counter({0: 283253, 1: 473})

In [14]:
# splitting data into dependent and independent
x = data.drop(columns='Class',axis = 1)
y = data['Class']

In [19]:
# importing train test split model, algorithms and classification report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Undersampling

In [16]:
from imblearn.under_sampling import (ClusterCentroids, TomekLinks, RandomUnderSampler)

In [17]:
# 1. Random Under Sampler:
sampler = RandomUnderSampler(random_state=42)
x_rs , y_rs = sampler.fit_resample(x,y)
print(Counter(y_rs))

Counter({0: 473, 1: 473})


In [18]:
# assigning variables in train test split where test size is 30%
x_train,x_test,y_train,y_test = train_test_split(x_rs,y_rs,test_size=0.3,random_state=0)

In [20]:
# Logistic Regression Algorithm
lr = LogisticRegression()
lr.fit(x_train,y_train)
lpred = lr.predict(x_test)
report = classification_report(y_test, lpred)
print(report)


              precision    recall  f1-score   support

           0       0.93      0.95      0.94       146
           1       0.95      0.93      0.94       138

    accuracy                           0.94       284
   macro avg       0.94      0.94      0.94       284
weighted avg       0.94      0.94      0.94       284



In [22]:
# Random Forest Algorithm
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
rpred = rfc.predict(x_test)
report = classification_report(y_test, rpred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       146
           1       0.94      0.91      0.93       138

    accuracy                           0.93       284
   macro avg       0.93      0.93      0.93       284
weighted avg       0.93      0.93      0.93       284



In [23]:
# 2. Cluster Centroid:
Sampler = ClusterCentroids(random_state=4)
x_rs , y_rs = Sampler.fit_resample(x,y)
print(Counter(y_rs))



Counter({0: 473, 1: 473})


In [24]:
x_train,x_test,y_train,y_test = train_test_split(x_rs,y_rs,test_size=0.3,random_state=0)

In [25]:
lr = LogisticRegression()
lr.fit(x_train,y_train)
lpred = lr.predict(x_test)
report = classification_report(y_test, lpred)
print(report)


              precision    recall  f1-score   support

           0       0.93      0.94      0.93       146
           1       0.93      0.92      0.93       138

    accuracy                           0.93       284
   macro avg       0.93      0.93      0.93       284
weighted avg       0.93      0.93      0.93       284



In [26]:
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
rpred = rfc.predict(x_test)
report = classification_report(y_test, rpred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       146
           1       0.96      0.92      0.94       138

    accuracy                           0.94       284
   macro avg       0.94      0.94      0.94       284
weighted avg       0.94      0.94      0.94       284



In [27]:
# Tomek Links:
sampler = TomekLinks()
x_rs , y_rs = sampler.fit_resample(x,y)
print(Counter(y_rs))

Counter({0: 283226, 1: 473})


In [28]:
x_train,x_test,y_train,y_test = train_test_split(x_rs,y_rs,test_size=0.3,random_state=0)

In [29]:
lr = LogisticRegression()
lr.fit(x_train,y_train)
lpred = lr.predict(x_test)
report = classification_report(y_test, lpred)
print(report)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84968
           1       0.89      0.62      0.73       142

    accuracy                           1.00     85110
   macro avg       0.94      0.81      0.86     85110
weighted avg       1.00      1.00      1.00     85110



In [30]:
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
rpred = rfc.predict(x_test)
report = classification_report(y_test, rpred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84968
           1       0.92      0.76      0.83       142

    accuracy                           1.00     85110
   macro avg       0.96      0.88      0.92     85110
weighted avg       1.00      1.00      1.00     85110



# OverSampling

In [31]:
from imblearn.over_sampling import SMOTE
# not taking randomoversampler because it will cause overfitting due very huge gap between both categories. 
# Replicating 473 records so many times till it reaches 283253 observations is not worth.

In [33]:
# SMOTE:
sampler = SMOTE(random_state=42)
x_os , y_os = sampler.fit_resample(x,y)
print(Counter(y_os))

Counter({0: 283253, 1: 283253})


In [34]:
x_train,x_test,y_train,y_test = train_test_split(x_os,y_os,test_size=0.3,random_state=0)

In [35]:
lr = LogisticRegression()
lr.fit(x_train,y_train)
lpred = lr.predict(x_test)
report = classification_report(y_test, lpred)
print(report)


              precision    recall  f1-score   support

           0       0.92      0.97      0.95     84730
           1       0.97      0.91      0.94     85222

    accuracy                           0.94    169952
   macro avg       0.95      0.94      0.94    169952
weighted avg       0.95      0.94      0.94    169952



In [36]:
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
rpred = rfc.predict(x_test)
report = classification_report(y_test, rpred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84730
           1       1.00      1.00      1.00     85222

    accuracy                           1.00    169952
   macro avg       1.00      1.00      1.00    169952
weighted avg       1.00      1.00      1.00    169952



# Final Conclusion:
1. In case of Undersampling - RandomUnderSampler gave best report but we lost many precious data, thus ClusterCentroids and TomekLinks are more approachable.
2. In case of Oversampling - we only took SMOTE method where randomforest algorithm gave top notch results.