# **Importing the Libraries and Functions**

In [347]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score

# **Loading the dataset**

In [348]:
credit_card_dataset = pd.read_csv('/content/creditcard_2023.csv')

# **Checking the first 5 rows of the dataset**

In [349]:
credit_card_dataset.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


# **Checking the last 5 rows of the dataset**

In [350]:
credit_card_dataset.tail()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
568625,568625,-0.833437,0.061886,-0.899794,0.904227,-1.002401,0.481454,-0.370393,0.189694,-0.938153,...,0.167503,0.419731,1.288249,-0.900861,0.560661,-0.006018,3.308968,0.081564,4394.16,1
568626,568626,-0.670459,-0.202896,-0.068129,-0.267328,-0.13366,0.237148,-0.016935,-0.147733,0.483894,...,0.031874,0.388161,-0.154257,-0.846452,-0.153443,1.961398,-1.528642,1.704306,4653.4,1
568627,568627,-0.311997,-0.004095,0.137526,-0.035893,-0.042291,0.121098,-0.070958,-0.019997,-0.122048,...,0.140788,0.536523,-0.2111,-0.448909,0.540073,-0.755836,-0.48754,-0.268741,23572.85,1
568628,568628,0.636871,-0.51697,-0.300889,-0.14448,0.131042,-0.294148,0.580568,-0.207723,0.893527,...,-0.060381,-0.195609,-0.175488,-0.554643,-0.099669,-1.434931,-0.159269,-0.076251,10160.83,1
568629,568629,-0.795144,0.433236,-0.64914,0.374732,-0.244976,-0.603493,-0.347613,-0.340814,0.253971,...,0.534853,-0.291514,0.157303,0.93103,-0.349423,-1.090974,-1.575113,0.722936,21493.92,1


# **Displaying dataset informations**

In [351]:
credit_card_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568630 entries, 0 to 568629
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      568630 non-null  int64  
 1   V1      568630 non-null  float64
 2   V2      568630 non-null  float64
 3   V3      568630 non-null  float64
 4   V4      568630 non-null  float64
 5   V5      568630 non-null  float64
 6   V6      568630 non-null  float64
 7   V7      568630 non-null  float64
 8   V8      568630 non-null  float64
 9   V9      568630 non-null  float64
 10  V10     568630 non-null  float64
 11  V11     568630 non-null  float64
 12  V12     568630 non-null  float64
 13  V13     568630 non-null  float64
 14  V14     568630 non-null  float64
 15  V15     568630 non-null  float64
 16  V16     568630 non-null  float64
 17  V17     568630 non-null  float64
 18  V18     568630 non-null  float64
 19  V19     568630 non-null  float64
 20  V20     568630 non-null  float64
 21  V21     56

# **Displaying the numbers of missing values in each column**

In [352]:
credit_card_dataset.isnull().sum()

Unnamed: 0,0
id,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


# **Allocation of legit and fraud transactions**

In [353]:
credit_card_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,284315


# **Data separation for analysis**

In [354]:
legit = credit_card_dataset[credit_card_dataset.Class == 0]
fraud = credit_card_dataset[credit_card_dataset.Class == 1]

# **Displaying the size and rows of legit and fraud transactions**

In [355]:
print(legit.shape)
print(fraud.shape)

(284315, 31)
(284315, 31)


# **Statistical Analysis of the legit data**

In [356]:
legit.Amount.describe()

Unnamed: 0,Amount
count,284315.0
mean,12026.313506
std,6929.500715
min,50.12
25%,6034.54
50%,11996.9
75%,18040.265
max,24039.93


# **Statistical Analysis of the fraud data**

In [357]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,284315.0
mean,12057.601763
std,6909.750891
min,50.01
25%,6074.64
50%,12062.45
75%,18033.78
max,24039.93


# **Differentiating both transaction values**

In [358]:
credit_card_dataset.groupby('Class').mean()

Unnamed: 0_level_0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,142442.987714,0.505761,-0.491878,0.682095,-0.735981,0.338639,0.435088,0.491234,-0.144294,0.585522,...,-0.179851,-0.10964,-0.014098,-0.010255,0.130107,-0.061847,-0.071052,-0.214002,-0.102024,12026.313506
1,426186.012286,-0.505761,0.491878,-0.682095,0.735981,-0.338639,-0.435088,-0.491234,0.144294,-0.585522,...,0.179851,0.10964,0.014098,0.010255,-0.130107,0.061847,0.071052,0.214002,0.102024,12057.601763


# **Under-Sampling method**

# Creating a new dataset having equal no. distribution of legit and fraud transactions due huge no. of distributions

**Number of Legit transactions sample = 2000**

In [359]:
legit_sample = legit.sample(n=2000)

**Number of Fraud transactions sample = 2000**

In [360]:
fraud_sample = fraud.sample(n=2000)

# **Concatenating the two dataframes**

In [361]:
updated_dataset = pd.concat([legit_sample, fraud_sample], axis=0)

# **Checking the first 5 rows of the new dataset**

In [362]:
updated_dataset.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
199206,199206,1.547203,-0.909482,0.149839,-0.743194,0.042311,0.593066,0.263469,-0.179186,0.3221,...,-0.324321,-1.181172,0.266385,0.162349,-1.016866,-0.169255,-0.26089,-0.097886,17755.85,0
147359,147359,0.003927,-0.06751,3.222345,0.787108,0.088884,1.258864,0.393595,-0.081016,-0.317563,...,0.008044,0.817754,-0.243294,-0.102142,-0.060806,1.152958,-0.061234,0.182069,18668.11,0
33176,33176,-0.130633,-0.456719,1.620581,-2.244793,-0.209056,0.135067,0.170749,-0.025163,-0.489648,...,-0.131044,-0.007018,-0.315611,-0.29329,0.673943,-0.293583,-0.027182,0.029138,18119.42,0
85562,85562,0.094038,-0.183905,1.293693,-0.841397,0.223581,-0.387154,0.817883,-0.207159,0.155415,...,-0.243838,-1.009293,0.123785,1.55433,-0.686996,1.186852,-0.329127,-0.06777,6289.42,0
192452,192452,1.60022,-0.525069,-0.09002,-0.482214,0.331956,0.081081,0.458086,-0.173704,1.056439,...,-0.174965,-0.432058,0.052718,-0.98783,-0.22964,-0.182436,-0.263211,-0.160063,23626.13,0


# **Checking the last 5 rows of the new dataset**

In [363]:
updated_dataset.tail()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
361154,361154,-0.539778,-0.144625,-0.333687,0.403143,-1.041692,1.46714,0.336845,-0.803823,0.090817,...,-0.260082,-0.160505,-3.485512,1.099806,-1.414328,-0.71777,1.63618,-0.771793,19943.12,1
321931,321931,-2.459141,3.32697,-2.476251,1.785864,-2.828982,-0.276048,-3.424998,-2.20702,-2.787201,...,-4.328204,3.653956,0.930769,0.542946,-1.236161,-2.571824,-4.947034,-3.337478,863.98,1
567302,567302,-0.747763,-1.092791,-0.505155,0.185895,0.400078,-1.366924,-0.076675,-0.121167,0.292624,...,0.193352,-0.336046,0.527548,0.085293,-1.924721,-2.517718,-0.375225,-0.706869,6650.27,1
478021,478021,-0.589578,1.383939,-1.48326,1.989736,-0.526307,-1.501621,-1.232442,0.681209,-1.769342,...,0.590291,0.15597,0.082075,-1.909857,-0.493919,1.307268,2.648543,1.843971,22254.51,1
502871,502871,-0.174921,0.968477,-1.052283,1.455709,0.278264,-0.860285,-0.344615,0.294028,-1.378954,...,0.111065,-0.471494,0.012663,-1.846876,0.0146,1.136722,0.459639,0.237559,16616.78,1


# **Allocation of legit and fraud transactions of new dataset**

In [364]:
updated_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,2000
1,2000


# **Differentiating both transaction values of new dataset**

In [365]:
updated_dataset.groupby('Class').mean()

Unnamed: 0_level_0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,143115.4555,0.536302,-0.473504,0.670888,-0.734714,0.351292,0.443653,0.503776,-0.149576,0.57936,...,-0.184467,-0.116929,-0.020287,-0.021976,0.109538,-0.063766,-0.102747,-0.201786,-0.076781,12379.411715
1,423969.6275,-0.510421,0.487927,-0.690536,0.73229,-0.323577,-0.442863,-0.493311,0.122149,-0.589782,...,0.150587,0.145984,-0.021558,0.0304,-0.126981,0.081991,0.064698,0.198628,0.101624,12198.73996


# **Splitting the data into features and target**

In [366]:
P = updated_dataset.drop(columns = 'Class', axis = 1)
Q = updated_dataset['Class']

In [367]:
print(P)

            id        V1        V2        V3        V4        V5        V6  \
199206  199206  1.547203 -0.909482  0.149839 -0.743194  0.042311  0.593066   
147359  147359  0.003927 -0.067510  3.222345  0.787108  0.088884  1.258864   
33176    33176 -0.130633 -0.456719  1.620581 -2.244793 -0.209056  0.135067   
85562    85562  0.094038 -0.183905  1.293693 -0.841397  0.223581 -0.387154   
192452  192452  1.600220 -0.525069 -0.090020 -0.482214  0.331956  0.081081   
...        ...       ...       ...       ...       ...       ...       ...   
361154  361154 -0.539778 -0.144625 -0.333687  0.403143 -1.041692  1.467140   
321931  321931 -2.459141  3.326970 -2.476251  1.785864 -2.828982 -0.276048   
567302  567302 -0.747763 -1.092791 -0.505155  0.185895  0.400078 -1.366924   
478021  478021 -0.589578  1.383939 -1.483260  1.989736 -0.526307 -1.501621   
502871  502871 -0.174921  0.968477 -1.052283  1.455709  0.278264 -0.860285   

              V7        V8        V9  ...       V20       V21  

In [368]:
print(Q)

199206    0
147359    0
33176     0
85562     0
192452    0
         ..
361154    1
321931    1
567302    1
478021    1
502871    1
Name: Class, Length: 4000, dtype: int64


# **Splitting the data into training data and test data**

In [369]:
P_train, P_test, Q_train, Q_test = train_test_split(P, Q, test_size=0.3, stratify=Q, random_state=3)

# **Checking the distribution of training and test data**

In [370]:
print(P.shape, P_train.shape, P_test.shape)

(4000, 30) (2800, 30) (1200, 30)


# **Training Model**

## **Logistic Regression**

In [381]:
model = LogisticRegression(solver = 'saga', max_iter = 5000)

## **Training the Logistic Regressing Model using training data**

In [382]:
model.fit(P_train, Q_train)

# **Evaluation Model**

## **Accuracy evaluation on training data**

In [383]:
P_train_prediction = model.predict(P_train)
training_data_accuracy = accuracy_score(P_train_prediction, Q_train)

In [384]:
print(f"Accuracy Of Training Data: {training_data_accuracy: .2f}")

Accuracy Of Training Data:  0.81


## **Accuracy evaluation on test data**

In [385]:
P_test_prediction = model.predict(P_test)
training_data_accuracy = accuracy_score(P_test_prediction, Q_test)

In [386]:
print(f"Accuracy Of Training Data: {training_data_accuracy: .2f}")

Accuracy Of Training Data:  0.80


## **Linear Regression**

In [387]:
linear_model = LinearRegression()

## **Training the Linear Regressing Model using training data**

In [388]:
linear_model.fit(P_train, Q_train)
Q_train_pred = linear_model.predict(P_train)
Q_test_pred = linear_model.predict(P_test)

# **Evaluation Model**

## **Accuracy evaluation on training data**

In [389]:
train_mse = mean_squared_error(Q_train, Q_train_pred)
train_r2 = r2_score(Q_train, Q_train_pred)

In [390]:
print(f"Training Mean Squared Error (MSE): {train_mse:.2f}")
print(f"Training R-squared (R²): {train_r2:.2f}")

Training Mean Squared Error (MSE): 0.03
Training R-squared (R²): 0.87


## **Accuracy evaluation on test data**

In [391]:
test_mse = mean_squared_error(Q_test, Q_test_pred)
test_r2 = r2_score(Q_test, Q_test_pred)

In [392]:
print(f"Test Mean Squared Error (MSE): {test_mse:.2f}")
print(f"Test R-squared (R²): {test_r2:.2f}")

Test Mean Squared Error (MSE): 0.04
Test R-squared (R²): 0.85
