##**Importing the Dependencies**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the dataset to a Pandas DataFrame

credit_card_data = pd.read_csv('/content/creditcard.csv')

In [3]:
# first 5 rows of DataFrame
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [37]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
77332,57002,0.958079,-0.370366,1.258194,0.730481,-0.436944,1.544908,-0.932519,0.63898,0.86537,...,0.055591,0.496503,0.190713,-0.595555,-0.101279,0.424053,0.088563,0.017342,15.97,0.0
77333,57003,-1.565835,0.537575,3.284121,3.229021,-0.917761,2.016339,-1.157749,1.086392,0.234172,...,0.125678,1.07283,-0.229837,0.094444,0.215087,0.544487,0.271867,-0.089124,25.69,0.0
77334,57005,-0.710264,-0.09532,2.899716,0.718612,-0.501955,0.968641,-0.007123,0.308006,1.383339,...,0.069589,0.711129,-0.03857,0.08899,-0.282553,-0.448201,0.05445,-0.051693,65.0,0.0
77335,57005,0.875729,-0.658494,-0.798643,-0.889801,-0.205406,-1.093946,0.743501,-0.381269,0.405087,...,0.192299,0.161282,-0.451218,0.060376,0.885705,-0.477421,-0.036297,0.035704,235.53,0.0
77336,57006,-0.679923,1.074176,1.045563,1.10062,-0.764069,-1.048969,0.601586,0.283135,-0.67482,...,0.256539,0.475028,0.124473,0.886947,-0.327076,-0.362904,0.017048,0.10904,73.52,0.0


In [7]:
credit_card_data.shape

(77338, 31)

In [8]:
# dataset information
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77338 entries, 0 to 77337
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    77338 non-null  int64  
 1   V1      77338 non-null  float64
 2   V2      77338 non-null  float64
 3   V3      77338 non-null  float64
 4   V4      77338 non-null  float64
 5   V5      77338 non-null  float64
 6   V6      77338 non-null  float64
 7   V7      77338 non-null  float64
 8   V8      77338 non-null  float64
 9   V9      77338 non-null  float64
 10  V10     77338 non-null  float64
 11  V11     77338 non-null  float64
 12  V12     77338 non-null  float64
 13  V13     77338 non-null  float64
 14  V14     77337 non-null  float64
 15  V15     77337 non-null  float64
 16  V16     77337 non-null  float64
 17  V17     77337 non-null  float64
 18  V18     77337 non-null  float64
 19  V19     77337 non-null  float64
 20  V20     77337 non-null  float64
 21  V21     77337 non-null  float64
 22

In [9]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [14]:
# droping the rows which have null values
credit_card_data.dropna(subset=['V14','V15','V16','V17','V18','V19','V20','V21','V22','V23','V24','V26','V27','V28'], inplace = True)

In [15]:
credit_card_data.shape

(77337, 31)

In [16]:
# distribution of legit transaction & fraudulent transaction
credit_card_data['Class'].value_counts()

0.0    77149
1.0      188
Name: Class, dtype: int64

This Dataset in highly unbalanced.

0 -> Normal transaction

1 -> Fraudulent transaction

In [17]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [18]:
print(legit.shape)
print(fraud.shape)

(77149, 31)
(188, 31)


In [19]:
# statistical measures for data
legit.Amount.describe()

count    77149.000000
mean        97.625867
std        270.623024
min          0.000000
25%          7.690000
50%         26.800000
75%         89.000000
max      19656.530000
Name: Amount, dtype: float64

In [20]:
fraud.Amount.describe()

count     188.000000
mean       94.292500
std       214.093799
min         0.000000
25%         1.000000
50%         7.550000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [21]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,36643.938353,-0.239252,-0.043518,0.701351,0.152168,-0.264608,0.10159,-0.097359,0.046639,0.005268,...,0.042235,-0.030677,-0.105246,-0.038271,0.007363,0.134344,0.025494,0.000766,0.002836,97.625867
1.0,32055.739362,-6.692335,4.711159,-8.903156,5.250821,-4.90527,-2.04085,-7.040342,3.194879,-3.137542,...,0.374793,0.797237,-0.171993,-0.229811,-0.084322,0.242355,0.097236,0.586565,0.050794,94.2925


##**Under Sampling**

Build a sample dataset containing similar distribution of Normal transaction and Fraudulent transaction

Number of Fraudulent transactions -> 188

In [22]:
legit_sample = legit.sample(n=188)

Concatenating two DataFrames

In [23]:
new_dataset = pd.concat([legit_sample,fraud],axis=0) # 0 -> rows, 1 -> columns

In [24]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
23426,32749,1.357079,-0.512537,0.338693,-0.756243,-0.834574,-0.728217,-0.430466,-0.232475,-0.896857,...,-0.089907,-0.429668,0.02256,-0.11188,0.313492,-0.458133,0.007705,0.027392,52.99,0.0
38781,39549,1.360547,-0.711437,0.977551,-0.618905,-1.558396,-0.737977,-1.046277,-0.040499,-0.330785,...,0.380575,1.01661,-0.102823,0.42953,0.409458,-0.048982,0.033251,0.026216,19.0,0.0
19840,30592,0.995098,-0.741301,1.007431,0.344877,-1.345762,-0.35587,-0.532001,0.048723,1.254986,...,-0.17088,-0.505512,-0.013026,0.466779,0.08126,0.931585,-0.057561,0.033005,120.34,0.0
43507,41533,-0.322404,1.102696,0.574003,-0.119663,0.260698,-0.479736,0.621775,0.005742,-0.43264,...,-0.141803,-0.372684,-0.016309,-0.43657,-0.349132,0.075882,-0.393322,-0.263203,14.79,0.0
5151,4866,-0.079548,0.776078,-1.108034,0.300347,3.015266,3.220955,0.391869,0.656805,0.451822,...,-0.010242,0.160853,0.11459,0.944734,-0.653714,-0.379587,0.453791,0.297918,50.0,0.0


In [25]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
75511,56098,-1.229669,1.956099,-0.851198,2.796987,-1.913977,-0.044934,-1.340739,-0.555548,-1.184468,...,1.208054,0.277612,0.019266,0.508529,-0.201183,-0.2496,0.562239,0.075309,170.92,1.0
76555,56624,-7.901421,2.720472,-7.885936,6.348334,-5.480119,-0.333059,-8.682376,1.164431,-4.542447,...,0.077739,1.092437,0.320133,-0.434643,-0.380687,0.21363,0.42362,-0.105169,153.46,1.0
76609,56650,-8.762083,2.79103,-7.682767,6.991214,-5.230695,-0.357388,-9.685621,1.749335,-4.495679,...,-0.090527,0.34859,0.051132,-0.41543,0.219665,0.33002,-0.028252,-0.15627,7.52,1.0
76929,56806,0.016828,2.400826,-4.22036,3.462217,-0.624142,-1.294303,-2.986028,0.751883,-1.606672,...,0.285832,-0.771508,-0.2652,-0.873077,0.939776,-0.219085,0.874494,0.470434,1.0,1.0
77099,56887,-0.075483,1.812355,-2.566981,4.127549,-1.628532,-0.805895,-3.390135,1.019353,-2.451251,...,0.794372,0.270471,-0.143624,0.013566,0.634203,0.213693,0.773625,0.387434,5.0,1.0


In [26]:
new_dataset['Class'].value_counts()

0.0    188
1.0    188
Name: Class, dtype: int64

In [27]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,36238.75,-0.217853,0.015401,0.647508,0.110839,-0.380721,0.119404,-0.084824,0.00651,0.010268,...,0.045457,-0.102258,-0.106419,-0.109731,-0.044181,0.087435,0.03072,0.027949,0.020661,107.595745
1.0,32055.739362,-6.692335,4.711159,-8.903156,5.250821,-4.90527,-2.04085,-7.040342,3.194879,-3.137542,...,0.374793,0.797237,-0.171993,-0.229811,-0.084322,0.242355,0.097236,0.586565,0.050794,94.2925


### Spliting the data into Features & Targets

In [28]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [29]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
23426  32749  1.357079 -0.512537  0.338693 -0.756243 -0.834574 -0.728217   
38781  39549  1.360547 -0.711437  0.977551 -0.618905 -1.558396 -0.737977   
19840  30592  0.995098 -0.741301  1.007431  0.344877 -1.345762 -0.355870   
43507  41533 -0.322404  1.102696  0.574003 -0.119663  0.260698 -0.479736   
5151    4866 -0.079548  0.776078 -1.108034  0.300347  3.015266  3.220955   
...      ...       ...       ...       ...       ...       ...       ...   
75511  56098 -1.229669  1.956099 -0.851198  2.796987 -1.913977 -0.044934   
76555  56624 -7.901421  2.720472 -7.885936  6.348334 -5.480119 -0.333059   
76609  56650 -8.762083  2.791030 -7.682767  6.991214 -5.230695 -0.357388   
76929  56806  0.016828  2.400826 -4.220360  3.462217 -0.624142 -1.294303   
77099  56887 -0.075483  1.812355 -2.566981  4.127549 -1.628532 -0.805895   

             V7        V8        V9  ...       V20       V21       V22  \
23426 -0.4304

In [30]:
print(Y)

23426    0.0
38781    0.0
19840    0.0
43507    0.0
5151     0.0
        ... 
75511    1.0
76555    1.0
76609    1.0
76929    1.0
77099    1.0
Name: Class, Length: 376, dtype: float64


## Split the data into Training data & Testing Data

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [32]:
print(X.shape,X_train.shape,X_test.shape)

(376, 30) (300, 30) (76, 30)


##**Model Training**

###Logistic Regression

In [33]:
model = LogisticRegression()

In [34]:
# training the Logistic Regression Model with Training Data
model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


##**Model Evaluation**

Accuracy Score

In [35]:
# accuracy on Training Data
training_data_accuracy = accuracy_score(model.predict(X_train), Y_train)
print("Accuracy on Training Data =",training_data_accuracy)

Accuracy on Training Data = 0.9666666666666667


In [36]:
# accuracy on Testing Data
testing_data_accuracy = accuracy_score(model.predict(X_test), Y_test)
print("Accuracy on Training Data =",testing_data_accuracy)

Accuracy on Training Data = 0.9736842105263158


##**Prediction**

In [38]:
input_data = (57003,	-1.56583533418111,	0.537575485261604,	3.28412078420187,	3.22902118240923,	-0.917760888069893,	2.01633887916683,	-1.15774948257937,	1.08639157980196,	0.234172198326152,	0.75101861486599,	0.313272386309906,	1.060841556241,	0.173516903803545,	-1.0929384704987,	-1.54995351839618,	-0.720056345480092,	0.814829173578427,	0.302596197026285,	1.18237502836407,1.8529336016497,0.055591,	0.496503,	0.190713,	-0.595555,	-0.101279,	0.424053,	0.088563,	0.017342,	15.97	)

#Changing input_data to numpy array for prediction
inp_np_arr = np.asarray(input_data)
print(inp_np_arr)

#reshaping the array shape as we are predicting for one instance

in_re = inp_np_arr.reshape(1,-1)
print(in_re)

[ 5.70030000e+04 -1.56583533e+00  5.37575485e-01  3.28412078e+00
  3.22902118e+00 -9.17760888e-01  2.01633888e+00 -1.15774948e+00
  1.08639158e+00  2.34172198e-01  7.51018615e-01  3.13272386e-01
  1.06084156e+00  1.73516904e-01 -1.09293847e+00 -1.54995352e+00
 -7.20056345e-01  8.14829174e-01  3.02596197e-01  1.18237503e+00
  1.85293360e+00  5.55910000e-02  4.96503000e-01  1.90713000e-01
 -5.95555000e-01 -1.01279000e-01  4.24053000e-01  8.85630000e-02
  1.73420000e-02  1.59700000e+01]
[[ 5.70030000e+04 -1.56583533e+00  5.37575485e-01  3.28412078e+00
   3.22902118e+00 -9.17760888e-01  2.01633888e+00 -1.15774948e+00
   1.08639158e+00  2.34172198e-01  7.51018615e-01  3.13272386e-01
   1.06084156e+00  1.73516904e-01 -1.09293847e+00 -1.54995352e+00
  -7.20056345e-01  8.14829174e-01  3.02596197e-01  1.18237503e+00
   1.85293360e+00  5.55910000e-02  4.96503000e-01  1.90713000e-01
  -5.95555000e-01 -1.01279000e-01  4.24053000e-01  8.85630000e-02
   1.73420000e-02  1.59700000e+01]]


In [39]:
#Prediction
stages = {0:"Legit Transaction",1:"Fraudulent Transaction"}
predict_op = model.predict(in_re)
print("Transaction Is",stages[predict_op[0]])

Transaction Is Legit Transaction


