Import the Dependencies

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Loading Dataset to a Pandas DataFrame

In [5]:
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [6]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [7]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
15931,27369,-1.160116,-0.244177,0.74425,-0.19235,1.156356,-1.931383,0.40967,-0.364716,-0.516156,...,-0.021714,0.012447,-0.360625,0.514926,0.064194,1.026317,-0.030845,-0.083609,52.9,0.0
15932,27369,-3.058318,3.099206,-4.932555,1.924138,-1.576032,-2.135383,-0.830098,2.228617,-0.312343,...,-0.111526,-0.485223,-0.003945,0.049422,-0.173962,-0.37926,0.171872,-0.236166,99.99,0.0
15933,27369,-0.661806,0.315385,2.011194,-0.438757,-0.55499,-0.668072,0.424651,0.079141,0.126057,...,0.148063,0.177511,0.134794,0.359931,-0.544428,0.181545,0.068546,0.153438,74.58,0.0
15934,27370,1.525348,-1.231442,0.420095,-1.551218,-1.376006,0.100758,-1.455755,0.134876,-1.319056,...,-0.108619,0.084883,-0.051758,-0.815038,0.331989,-0.015837,0.058942,0.011087,6.0,0.0
15935,27371,1.38568,-0.590076,-0.569197,-0.939441,-0.196015,-0.486685,-0.102496,-0.23793,-0.928028,...,-0.255944,-1.055682,,,,,,,,


In [8]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15936 entries, 0 to 15935
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    15936 non-null  int64  
 1   V1      15936 non-null  float64
 2   V2      15936 non-null  float64
 3   V3      15936 non-null  float64
 4   V4      15936 non-null  float64
 5   V5      15936 non-null  float64
 6   V6      15936 non-null  float64
 7   V7      15936 non-null  float64
 8   V8      15936 non-null  float64
 9   V9      15936 non-null  float64
 10  V10     15936 non-null  float64
 11  V11     15936 non-null  float64
 12  V12     15936 non-null  float64
 13  V13     15936 non-null  float64
 14  V14     15936 non-null  float64
 15  V15     15936 non-null  float64
 16  V16     15936 non-null  float64
 17  V17     15936 non-null  float64
 18  V18     15936 non-null  float64
 19  V19     15936 non-null  float64
 20  V20     15936 non-null  float64
 21  V21     15936 non-null  float64
 22

Check number of missing values in each column

In [9]:
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


Great! We don't have any missing values :)

Now, let's check distribution of legit transaction & fraudlent transaction



In [10]:
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,15862
1.0,73


We can see the dataset is unbalanced :(
  0---> Normal Transaction
  1---> Fraudlent Transaction

In [11]:
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [12]:
print(legit.shape)
print(fraud.shape)

(15862, 31)
(73, 31)


Statistical measure of the data

In [13]:
legit.Amount.describe()

Unnamed: 0,Amount
count,15862.0
mean,66.280151
std,188.898885
min,0.0
25%,5.5225
50%,15.95
75%,53.89
max,7712.43


In [14]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,73.0
mean,90.307123
std,271.63436
min,0.0
25%,1.0
50%,1.0
75%,99.99
max,1809.68


Comapre the values for both transactions

In [15]:
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,12104.432165,-0.219072,0.25,0.862854,0.272641,-0.105868,0.124522,-0.112681,-0.016178,0.87912,...,0.028173,-0.064056,-0.161585,-0.03504,0.013076,0.117023,0.037111,0.011206,0.005907,66.280151
1.0,15559.643836,-7.929807,6.19312,-11.997831,6.55505,-5.474984,-2.480356,-8.354317,3.668478,-3.086988,...,0.671905,0.4411,-0.295856,-0.358994,-0.328925,0.216076,0.209323,0.927187,0.06138,90.307123


Build a sample dataset containing similar distribution of normal transaction & fraudlent transactions

No. of fraudlent transaction = 492

In [16]:
legit_sample = legit.sample(n=492)

Let's concatenate two dataframes

In [17]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [18]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
7914,10963,1.25011,-0.039766,0.836381,0.317287,-0.851758,-0.814436,-0.474901,-0.167594,1.822692,...,-0.076497,0.013709,-0.015406,0.343646,0.19618,1.035992,-0.093109,0.00419,15.95,0.0
699,529,-2.000567,-2.495484,2.467149,1.140053,2.46201,0.594262,-2.110183,0.788347,0.958809,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.17106,1.5,0.0
2607,2134,-0.667351,-0.167788,1.241277,-1.524555,-1.048869,0.995735,0.274662,0.159642,-1.092677,...,-0.374554,-0.58895,0.117503,-0.853818,-0.88804,0.81837,-0.039758,0.067833,170.55,0.0
1155,900,-1.037084,0.628594,0.800019,-1.284624,0.385346,-0.822886,1.358449,-0.172655,-0.725526,...,-0.308919,-0.879053,0.187984,0.032678,-0.101778,0.59716,0.128289,0.116486,89.9,0.0
9518,14201,1.295975,-0.570168,0.316172,-1.390539,-1.081812,-1.099526,-0.443622,-0.299744,1.025382,...,-0.33382,-0.193002,-0.087797,0.469408,0.608169,-0.66573,0.038539,0.017268,39.4,0.0


In [None]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
149708,91773.0,1.679246,-0.686567,-1.677479,0.883446,0.259368,0.152458,-0.082617,-0.220789,2.463822,...,0.06808,0.488328,-0.303519,0.087046,0.26772,0.910695,-0.094397,-0.010578,207.66,0
11447,19896.0,-0.334535,0.241851,1.727856,-0.862383,-0.580904,-0.691943,-0.040325,-0.181736,0.383812,...,-0.087448,0.008343,-0.108217,0.332635,0.058632,-0.332627,-0.089933,0.038952,5.0,0
1519,1188.0,0.974692,-0.618454,1.189187,0.260653,-1.241627,-0.140155,-0.599202,0.10132,0.964559,...,-0.006735,0.02953,0.042315,0.497241,-0.017796,0.988186,-0.030459,0.033423,100.0,0
149270,90919.0,-0.529437,1.277977,-0.119945,-0.330126,0.771294,-0.61196,0.717138,0.038679,1.597979,...,-0.149173,0.13287,-0.310309,-0.520696,-0.209589,0.553603,0.444485,0.298455,3.99,0
109446,71392.0,-1.350784,-0.553596,1.846922,-1.653582,-0.948094,-0.190595,-0.804077,0.705539,-1.058238,...,0.540701,1.069134,-0.171326,0.026606,0.21905,-0.208809,-0.039343,-0.053735,74.75,0


In [19]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,73


In [20]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,11766.477642,-0.101671,0.120906,0.920528,0.249813,-0.130391,0.151653,-0.088419,0.008068,0.885989,...,0.037309,-0.083006,-0.124831,-0.050077,0.008487,0.119555,0.08291,-0.000886,-0.00602,73.788638
1.0,15559.643836,-7.929807,6.19312,-11.997831,6.55505,-5.474984,-2.480356,-8.354317,3.668478,-3.086988,...,0.671905,0.4411,-0.295856,-0.358994,-0.328925,0.216076,0.209323,0.927187,0.06138,90.307123


Splitting data into Features & Targets

In [21]:
X = new_dataset.drop(columns='Class', axis=1)
Y= new_dataset['Class']

In [22]:
print(X)

        Time         V1         V2         V3        V4         V5        V6  \
7914   10963   1.250110  -0.039766   0.836381  0.317287  -0.851758 -0.814436   
699      529  -2.000567  -2.495484   2.467149  1.140053   2.462010  0.594262   
2607    2134  -0.667351  -0.167788   1.241277 -1.524555  -1.048869  0.995735   
1155     900  -1.037084   0.628594   0.800019 -1.284624   0.385346 -0.822886   
9518   14201   1.295975  -0.570168   0.316172 -1.390539  -1.081812 -1.099526   
...      ...        ...        ...        ...       ...        ...       ...   
15566  26961 -23.237920  13.487386 -25.188773  6.261733 -17.345188 -4.534989   
15736  27163 -23.914101  13.765942 -25.733734  6.290918 -17.784824 -4.572498   
15751  27187 -24.590245  14.044567 -26.278701  6.320089 -18.224513 -4.609968   
15781  27219 -25.266355  14.323254 -26.823673  6.349248 -18.664251 -4.647403   
15810  27252 -25.942434  14.601998 -27.368650  6.378395 -19.104033 -4.684806   

              V7         V8        V9  

In [23]:
print(Y)

7914     0.0
699      0.0
2607     0.0
1155     0.0
9518     0.0
        ... 
15566    1.0
15736    1.0
15751    1.0
15781    1.0
15810    1.0
Name: Class, Length: 565, dtype: float64


Split the data into Training & Testing data

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [25]:
print(X.shape, X_train.shape, X_test.shape)

(565, 30) (452, 30) (113, 30)


Training the Model

Here, I will be using Logistic Regresiion Model

In [26]:
model = LogisticRegression()

In [27]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation based on Accuracy Score

In [28]:
#Accuracy on Training Data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print("Accuracy on Training Data : ", training_data_accuracy)

Accuracy on Training Data :  0.9237611181702668


In [29]:
#Accuracy on Test Data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [30]:
print('Accuracy Score on Test Data : ', test_data_accuracy)

Accuracy Score on Test Data :  0.9646017699115044


Making a Predictive System

In [44]:
input_data = (0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62)

#Changing the input data to a numpy array
input_data_as_nunpy_array = np.asarray(input_data)

#Reshape the Numpy Array as we are predicting for one instance

input_data_reshaped = input_data_as_nunpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0] == '1'):
  print("Fraudlent Transaction")
else:
  print("Legit Transaction")

[0.]
Legit Transaction




Saving the trained model

In [45]:
import pickle

In [46]:
filename = 'trained_model.sav'
pickle.dump(model, open(filename, 'wb'))


In [47]:
#Loading the saved model
loaded_model = pickle.load(open('/content/trained_model.sav', 'rb'))

In [48]:
input_data = (0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62)

#Changing the input data to a numpy array
input_data_as_nunpy_array = np.asarray(input_data)

#Reshape the Numpy Array as we are predicting for one instance

input_data_reshaped = input_data_as_nunpy_array.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if(prediction[0] == '1'):
  print("Fraudlent Transaction")
else:
  print("Legit Transaction")

[0.]
Legit Transaction


