Importing Dependencies

In [None]:
import numpy as np
import pandas as pd # For handling datasets
from sklearn.model_selection import train_test_split # To split dataset into train & test sets
from sklearn.linear_model import LogisticRegression  # The ML model you're going to train
from sklearn.metrics import accuracy_score  # To evaluate model performance

# What’s inside sklearn (Scikit-learn)?
# It’s a complete toolkit for ML:

# Supervised Learning: LogisticRegression, RandomForestClassifier, SVC etc.

# Unsupervised Learning: KMeans, PCA

# Model selection tools: train_test_split, cross_val_score

# Preprocessing: StandardScaler, OneHotEncoder

# Metrics: accuracy_score, confusion_matrix, etc.

In [None]:
# Load the dataset into a pandas DataFrame.
# NOTE(review): '/content/...' is an absolute path (looks like a Colab upload
# location) — adjust it when running elsewhere.
# NOTE(review): the tail()/info() outputs below show the final row (index 31779)
# has NaN from V7 onward, so the CSV may be truncated — TODO confirm.

credit_card_data = pd.read_csv('/content/creditcard.csv')

In [None]:
# Preview the first five rows.
credit_card_data.head()

# V1 to V28: features transformed by PCA to protect the confidentiality of the
# original features (like location, time, merchant ID, etc.).

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [None]:
# Last five rows. The final row (index 31779) is incomplete: NaN from V7 onward.
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
31775,36510,1.497714,-0.874208,0.004261,-1.537368,-1.044513,-0.733905,-0.644635,-0.18651,-2.380495,...,-0.527761,-1.248016,0.145825,-0.061193,0.250252,-0.526898,0.003761,0.00795,24.9,0.0
31776,36512,-0.407854,0.940336,1.533733,-0.018123,-0.2194,-0.934602,0.597172,0.02461,-0.15185,...,-0.24757,-0.72583,-0.006221,0.322366,-0.249413,0.076587,0.25538,0.11734,1.79,0.0
31777,36513,-0.434226,-3.705556,0.194223,0.139216,-2.066818,1.220137,-0.278115,0.281452,-0.161302,...,0.620431,0.040315,-0.690176,-0.226844,-0.123369,-0.291412,-0.080514,0.155457,861.12,0.0
31778,36513,-0.274278,1.004565,1.33643,-0.164777,0.404935,-0.418731,0.853252,-0.148772,-0.546969,...,-0.277437,-0.579135,-0.097793,-0.037926,-0.157882,0.044814,0.078376,-0.137035,9.72,0.0
31779,36513,1.295137,0.096775,0.348134,0.629896,-0.1727,-0.15435,,,,...,,,,,,,,,,


Dataset Information

Dataset information


In [None]:
# Column dtypes and non-null counts; a count below the row total flags missing values.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31780 entries, 0 to 31779
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    31780 non-null  int64  
 1   V1      31780 non-null  float64
 2   V2      31780 non-null  float64
 3   V3      31780 non-null  float64
 4   V4      31780 non-null  float64
 5   V5      31780 non-null  float64
 6   V6      31780 non-null  float64
 7   V7      31779 non-null  float64
 8   V8      31779 non-null  float64
 9   V9      31779 non-null  float64
 10  V10     31779 non-null  float64
 11  V11     31779 non-null  float64
 12  V12     31779 non-null  float64
 13  V13     31779 non-null  float64
 14  V14     31779 non-null  float64
 15  V15     31779 non-null  float64
 16  V16     31779 non-null  float64
 17  V17     31779 non-null  float64
 18  V18     31779 non-null  float64
 19  V19     31779 non-null  float64
 20  V20     31779 non-null  float64
 21  V21     31779 non-null  float64
 22

In [None]:
# Check the number of missing values in each column.
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,1
V8,1
V9,1


In [None]:
# Distribution of legit and fraudulent transactions.
credit_card_data['Class'].value_counts()


# credit_card_data['Class'] selects the label column of 0s and 1s, where:
#   0 = legit transaction
#   1 = fraudulent transaction
#
# Result for this file:
#   31677 legit transactions
#   only 102 fraudulent transactions

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,31677
1.0,102


This dataset is highly imbalanced.

0 -> Normal Transaction
1 -> Fraudulent Transaction

In [None]:
# Separate the data for analysis: one DataFrame per class, so legit and fraud
# transactions can be examined and compared independently.
# Rows whose Class is NaN (the incomplete final row) match neither filter.
legit = credit_card_data.loc[credit_card_data['Class'] == 0]  # all transactions where Class == 0

fraud = credit_card_data.loc[credit_card_data['Class'] == 1]  # all transactions where Class == 1

In [None]:
# (rows, columns) of the legit subset.
legit.shape


(31677, 31)

In [None]:
fraud.shape  # (rows, columns): 102 fraudulent transactions, 31 columns

(102, 31)

Statistical Measures of Data

In [None]:
# Summary statistics of the transaction Amount for legit transactions.
legit.Amount.describe()

Unnamed: 0,Amount
count,31677.0
mean,81.082407
std,223.072655
min,0.0
25%,6.87
50%,20.0
75%,73.61
max,7879.42


In [None]:
# Summary statistics of the transaction Amount for fraudulent transactions.
fraud.Amount.describe()

Unnamed: 0,Amount
count,102.0
mean,91.237451
std,248.270971
min,0.0
25%,1.0
50%,3.44
75%,99.99
max,1809.68


In [None]:
# Compare the per-feature means of both transaction classes.
credit_card_data.groupby('Class').mean()

# groupby('Class') splits the data into two groups:
#   Class = 0: legit transactions
#   Class = 1: fraudulent transactions
#
# .mean() calculates the mean of every other column, separately for each group.
#
# It answers: "On average, how do legit and fraud transactions differ across all features?"

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,22335.407141,-0.188381,0.085855,0.759238,0.192821,-0.192581,0.09593,-0.095538,0.019413,0.324683,...,0.041991,-0.03532,-0.118406,-0.039544,0.009212,0.133785,0.02214,0.01034,0.003013,81.082407
1.0,20334.872549,-7.761095,5.946853,-11.03523,5.926171,-5.529543,-2.295415,-7.767215,3.904551,-2.99729,...,0.679987,0.6419,-0.352131,-0.317568,-0.241053,0.314412,0.177056,0.811372,0.118708,91.237451


Under Sampling

Build a sample dataset containing a similar distribution of normal and fraudulent transactions.


Number of fraudulent transactions -> 102

In [None]:
# Under-sample the majority class: draw as many legit rows as there are fraud
# rows, so both classes are equally represented in the training data.
#   - n=len(fraud) keeps the sample size in sync with the data instead of
#     hard-coding 102.
#   - random_state pins the draw so that Restart & Run All reproduces the same
#     sample (and therefore the same downstream split and accuracy scores).
legit_sample = legit.sample(n=len(fraud), random_state=0)

# Why: the full data is heavily imbalanced:
#   legit (Class = 0): 31,677 rows
#   fraud (Class = 1): 102 rows
#
# Next step: combine this legit sample with the fraud rows.

Concatenating two DataFrames

In [None]:
# Stack the legit sample on top of the fraud rows (row-wise concatenation)
# to obtain one balanced dataset containing both classes.
new_dataset = pd.concat([legit_sample, fraud], axis='index')

In [None]:
# First rows of the combined frame: these come from the legit sample (Class 0).
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
24313,33171,-2.336582,-0.70449,1.13871,-2.151931,-2.461569,-0.869399,-0.991356,0.638399,0.840925,...,-0.366464,-0.543044,-0.408212,0.751881,0.160377,-0.731866,-0.500511,-0.063035,84.59,0.0
16570,27931,-0.748671,1.180628,0.799786,0.973906,1.79445,0.354868,1.255642,0.03821,-1.540134,...,0.022749,-0.09709,-0.410969,-1.347304,0.580556,0.026262,0.014038,0.071887,23.05,0.0
13820,24505,0.853096,-1.124276,0.301781,0.061568,-0.346277,1.483586,-0.767457,0.380247,2.495119,...,-0.307537,-0.891192,-0.271761,-1.718368,0.126477,1.005531,-0.096189,0.009994,213.49,0.0
16657,28021,1.053168,0.091184,1.61149,2.847261,-0.856462,0.592868,-0.69283,0.327013,0.162049,...,0.088708,0.585613,-0.119529,0.583131,0.564431,0.18275,0.046033,0.019332,1.52,0.0
10030,15114,-2.544857,2.337904,1.666779,-0.166272,-0.319802,1.336787,-1.244748,-3.836336,2.179435,...,3.471329,-1.58113,0.179905,-0.70857,0.586912,0.416031,0.541973,0.306263,19.99,0.0


In [None]:
# Last rows of the combined frame: these come from the fraud subset (Class 1).
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
30398,35906,-3.51903,4.140867,-3.628202,5.505672,-4.057463,-0.905945,-6.652031,2.634524,-4.679402,...,1.582556,0.77871,-0.135707,-0.004278,0.032706,0.362014,0.900925,0.554897,9.13,1.0
30442,35926,-3.896583,4.518355,-4.454027,5.547453,-4.121459,-1.163407,-6.805053,2.928356,-4.91713,...,1.691042,0.920021,-0.151104,0.011007,0.080303,0.412191,0.635789,0.50105,4.56,1.0
30473,35942,-4.194074,4.382897,-5.118363,4.45523,-4.812621,-1.224645,-7.281328,3.33225,-3.679659,...,1.550473,0.614573,0.028521,0.013704,-0.149512,-0.131687,0.473934,0.473757,14.46,1.0
30496,35953,-4.844372,5.649439,-6.730396,5.252842,-4.409566,-1.740767,-6.311699,3.449167,-5.416284,...,1.194888,-0.845753,0.190674,-0.216443,-0.325033,-0.270328,0.210214,0.391855,111.7,1.0
31002,36170,-5.685013,5.776516,-7.064977,5.902715,-4.715564,-1.755633,-6.958679,3.877795,-5.541529,...,1.128641,-0.96296,-0.110045,-0.177733,-0.089175,-0.049447,0.303445,0.21938,111.7,1.0


In [None]:
# Verify that the combined dataset holds the same number of rows per class.
new_dataset['Class'].value_counts()


# We created a balanced dataset from an imbalanced one, which matters for training.
#
# Trained on the raw data, the model would be biased toward the majority class (legit).
#
# It might reach 99% accuracy just by predicting "legit" every time, which is
# useless for fraud detection.


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,102
1.0,102


Splitting the Data into Features and Target

In [33]:
# Features: every column except the label.
X = new_dataset.drop(columns='Class')
# Target: the label column on its own.
Y = new_dataset['Class']

In [34]:
# Plain-text dump of the feature matrix (label column removed).
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
24313  33171 -2.336582 -0.704490  1.138710 -2.151931 -2.461569 -0.869399   
16570  27931 -0.748671  1.180628  0.799786  0.973906  1.794450  0.354868   
13820  24505  0.853096 -1.124276  0.301781  0.061568 -0.346277  1.483586   
16657  28021  1.053168  0.091184  1.611490  2.847261 -0.856462  0.592868   
10030  15114 -2.544857  2.337904  1.666779 -0.166272 -0.319802  1.336787   
...      ...       ...       ...       ...       ...       ...       ...   
30398  35906 -3.519030  4.140867 -3.628202  5.505672 -4.057463 -0.905945   
30442  35926 -3.896583  4.518355 -4.454027  5.547453 -4.121459 -1.163407   
30473  35942 -4.194074  4.382897 -5.118363  4.455230 -4.812621 -1.224645   
30496  35953 -4.844372  5.649439 -6.730396  5.252842 -4.409566 -1.740767   
31002  36170 -5.685013  5.776516 -7.064977  5.902715 -4.715564 -1.755633   

             V7        V8        V9  ...       V20       V21       V22  \
24313 -0.9913

In [35]:
# Plain-text dump of the target labels.
print(Y)

24313    0.0
16570    0.0
13820    0.0
16657    0.0
10030    0.0
        ... 
30398    1.0
30442    1.0
30473    1.0
30496    1.0
31002    1.0
Name: Class, Length: 204, dtype: float64


Split The Data Into Test and Training Data

In [39]:
# Hold out part of the balanced data for evaluation.
#   X            -> features (input data)
#   Y            -> labels (target class)
#   test_size    -> 20% of the rows go to the test set, 80% to training
#   stratify=Y   -> keeps the class proportions identical in train and test
#   random_state -> fixes the shuffle so the split is the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)

In [40]:
# Shapes, in order: full feature matrix, test split, train split.
print(X.shape , X_test.shape , X_train.shape)

(204, 30) (41, 30) (163, 30)


Model Training

In [42]:
# Untrained logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): the features are passed unscaled (Time and Amount span far larger
# ranges than V1..V28). If fit() emits a ConvergenceWarning, consider scaling the
# features or raising max_iter — TODO confirm.
model = LogisticRegression()

Logistic Regression: $P(y = 1 \mid x) = \sigma(z) = \dfrac{1}{1 + e^{-z}}$

A machine learning algorithm used for binary classification.

Is this transaction fraud or not?

Even though the name says "regression", it’s actually a classification model.

It works like this:

Takes your features (like Amount, V1, V2… V28)

Applies a linear function (like in linear regression)
$z = w_1 x_1 + w_2 x_2 + w_3 x_3 + \dots + w_n x_n + b$

Passes it through a sigmoid function (s-shaped curve) to squish the output between 0 and 1

Predicts:

If output > 0.5 → Class = 1 (Fraud)

If output <= 0.5 → Class = 0 (Legit)


Training the LR model with Training Data

In [44]:
# Train the model on the training split.
model.fit(X_train , Y_train)
# X_train = the training features (Time, V1, V2, ..., V28, Amount)
#
# Y_train = the training labels (Class: 0 for legit, 1 for fraud)
#
# model = the logistic regression model created above, not yet trained
#
# fit() means: train this model on this data.

Evaluate The Model

Accuracy Score


In [49]:
# Accuracy on the training data.

# 1. model.predict(X_train): predict labels for the same rows the model was
#    fitted on.
X_train_prediction = model.predict(X_train)

# 2. accuracy_score(y_true, y_pred): the documented order is actual labels first,
#    predictions second.
#      Y_train            -> actual correct labels
#      X_train_prediction -> the model's predictions
#    Accuracy is symmetric, so the number does not change, but keeping the
#    documented order avoids silently wrong results if this call is later
#    swapped for an asymmetric metric such as precision or recall.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [50]:
# Report the fraction of training rows that were classified correctly.
print("Accuracy on Training Data is :",training_data_accuracy)

Accuracy on Training Data is : 0.9631901840490797


In [51]:
# Accuracy on the test data (rows the model did not see during fit).

X_test_prediction = model.predict(X_test)
# accuracy_score takes (y_true, y_pred): actual labels first, predictions second.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [52]:
# Report the fraction of held-out test rows that were classified correctly.
print("Accuracy on Test Data is :",test_data_accuracy)

Accuracy on Test Data is : 0.975609756097561
