In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the dataset into pandas
credit_card_data = pd.read_csv('fraudTest.csv')

In [3]:
# obtaining the first 5 rows
credit_card_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [4]:
credit_card_data.tail()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
555714,555714,2020-12-31 23:59:07,30560609640617,fraud_Reilly and Sons,health_fitness,43.77,Michael,Olson,M,558 Michael Estates,...,40.4931,-91.8912,519,Town planner,1966-02-13,9b1f753c79894c9f4b71f04581835ada,1388534347,39.946837,-91.333331,0
555715,555715,2020-12-31 23:59:09,3556613125071656,fraud_Hoppe-Parisian,kids_pets,111.84,Jose,Vasquez,M,572 Davis Mountains,...,29.0393,-95.4401,28739,Futures trader,1999-12-27,2090647dac2c89a1d86c514c427f5b91,1388534349,29.661049,-96.186633,0
555716,555716,2020-12-31 23:59:15,6011724471098086,fraud_Rau-Robel,kids_pets,86.88,Ann,Lawson,F,144 Evans Islands Apt. 683,...,46.1966,-118.9017,3684,Musician,1981-11-29,6c5b7c8add471975aa0fec023b2e8408,1388534355,46.65834,-119.715054,0
555717,555717,2020-12-31 23:59:24,4079773899158,fraud_Breitenberg LLC,travel,7.99,Eric,Preston,M,7020 Doyle Stream Apt. 951,...,44.6255,-116.4493,129,Cartographer,1965-12-15,14392d723bb7737606b2700ac791b7aa,1388534364,44.470525,-117.080888,0
555718,555718,2020-12-31 23:59:34,4170689372027579,fraud_Dare-Marvin,entertainment,38.13,Samuel,Frey,M,830 Myers Plaza Apt. 384,...,35.6665,-97.4798,116001,Media buyer,1993-05-10,1765bb45b3aa3224b4cdcb6e7a96cee3,1388534374,36.210097,-97.036372,0


In [5]:
# gathering dataset info
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [6]:
#checking the null values in each column 
credit_card_data.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [7]:
#legitimate and fraudulent transactions
credit_card_data['is_fraud'].value_counts()

is_fraud
0    553574
1      2145
Name: count, dtype: int64

In [8]:
#separating the data for further analysis
legit = credit_card_data[credit_card_data.is_fraud == 0]
fraud = credit_card_data[credit_card_data.is_fraud  == 1]

In [9]:
print(legit.shape)
print(fraud.shape) 

(553574, 23)
(2145, 23)


In [10]:
#statistical measurement for the given data
legit.amt.describe()

count    553574.000000
mean         67.614408
std         152.471931
min           1.000000
25%           9.600000
50%          47.150000
75%          82.620000
max       22768.110000
Name: amt, dtype: float64

In [11]:
fraud.amt.describe()

count    2145.000000
mean      528.356494
std       392.747594
min         1.780000
25%       214.510000
50%       371.940000
75%       907.770000
max      1320.920000
Name: amt, dtype: float64

Undersampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 2145

In [12]:
legit_sample = legit.sample(n=2145)

Concatenating the two dataframes

In [13]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [14]:
new_dataset.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
331441,331441,2020-10-26 11:05:26,4783226709001,fraud_Sawayn PLC,shopping_pos,6.99,Jessica,Garcia,F,13108 Jennifer Passage,...,38.5957,-99.554,320,Film/video editor,1961-04-22,3369ebfb4d17360d904f5e68651008a7,1382785526,38.083972,-100.065273,0
152866,152866,2020-08-13 18:18:05,5559857416065248,"fraud_Medhurst, Labadie and Gottlieb",travel,3.81,Jack,Hill,M,5916 Susan Bridge Apt. 939,...,41.6125,-122.5258,589,Systems analyst,1945-12-21,cad30c58c0d6808f707e83dfaa82e0ce,1376417885,41.11785,-122.942735,0
458298,458298,2020-12-10 09:53:57,213125815021702,"fraud_Schultz, Simonis and Little",grocery_pos,141.65,Adam,Kirk,M,40847 Stark Junctions,...,42.074,-74.453,397,Psychiatrist,1931-09-12,bcd14688d8f3e38b87640573c9944434,1386669237,42.814669,-75.446385,0
50660,50660,2020-07-08 09:55:07,2475085306462014,fraud_Kemmer-Buckridge,misc_pos,48.62,John,Miller,M,153 Mccullough Springs Apt. 857,...,44.2378,-95.2739,1507,Land/geomatics surveyor,1993-10-12,769db376a97f2a4c427671d8f0af729f,1373277307,44.014302,-94.589837,0
152553,152553,2020-08-13 15:31:45,4003989662068504,fraud_Ankunding LLC,shopping_net,4.39,Chris,White,M,98897 Bennett Lodge,...,33.3224,-86.9657,71463,Radio broadcast assistant,1989-02-08,616798f9d8215aeb28f10f3234391519,1376407905,32.871048,-86.014766,0


In [15]:
new_dataset.tail()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
517197,517197,2020-12-22 22:05:48,2242176657877538,"fraud_Willms, Kris and Bergnaum",shopping_pos,1041.51,Travis,Daniel,M,1327 Rose Causeway Apt. 610,...,34.6323,-89.8855,14462,Database administrator,1959-03-03,35b0297dd026d2e9a75d024a5dec7955,1387749948,34.573471,-89.911011,1
517274,517274,2020-12-22 22:18:07,2242176657877538,fraud_Kuhn LLC,shopping_pos,868.09,Travis,Daniel,M,1327 Rose Causeway Apt. 610,...,34.6323,-89.8855,14462,Database administrator,1959-03-03,da7f67d7375f10a054a3d919448c45dd,1387750687,34.091227,-90.390612,1
517341,517341,2020-12-22 22:31:48,2242176657877538,"fraud_Mosciski, Ziemann and Farrell",shopping_net,1039.42,Travis,Daniel,M,1327 Rose Causeway Apt. 610,...,34.6323,-89.8855,14462,Database administrator,1959-03-03,25b076c7bcd70f272c1c5326bb234f4b,1387751508,34.628434,-90.28478,1
517529,517529,2020-12-22 23:06:03,2242176657877538,fraud_Bauch-Raynor,grocery_pos,289.27,Travis,Daniel,M,1327 Rose Causeway Apt. 610,...,34.6323,-89.8855,14462,Database administrator,1959-03-03,2df7d894868fbc99ec1d8b055585fc9d,1387753563,34.746063,-90.401093,1
517571,517571,2020-12-22 23:13:39,2242176657877538,fraud_Jaskolski-Vandervort,misc_net,766.38,Travis,Daniel,M,1327 Rose Causeway Apt. 610,...,34.6323,-89.8855,14462,Database administrator,1959-03-03,44292cbc51e37dc018ee6a988a4bc426,1387754019,33.771462,-90.651342,1


In [16]:
new_dataset['is_fraud'].value_counts()

is_fraud
0    2145
1    2145
Name: count, dtype: int64

In [17]:
new_dataset.groupby('is_fraud').mean

<bound method GroupBy.mean of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x177bb7c90>>

In [18]:
df_new = new_dataset.reset_index()


In [19]:
print(df_new)


       index  Unnamed: 0 trans_date_trans_time            cc_num  \
0     331441      331441   2020-10-26 11:05:26     4783226709001   
1     152866      152866   2020-08-13 18:18:05  5559857416065248   
2     458298      458298   2020-12-10 09:53:57   213125815021702   
3      50660       50660   2020-07-08 09:55:07  2475085306462014   
4     152553      152553   2020-08-13 15:31:45  4003989662068504   
...      ...         ...                   ...               ...   
4285  517197      517197   2020-12-22 22:05:48  2242176657877538   
4286  517274      517274   2020-12-22 22:18:07  2242176657877538   
4287  517341      517341   2020-12-22 22:31:48  2242176657877538   
4288  517529      517529   2020-12-22 23:06:03  2242176657877538   
4289  517571      517571   2020-12-22 23:13:39  2242176657877538   

                                  merchant      category      amt    first  \
0                         fraud_Sawayn PLC  shopping_pos     6.99  Jessica   
1     fraud_Medhurst, Labad

Splitting the data into features and targets 

In [20]:
X = new_dataset.drop(columns='is_fraud',axis=1)
Y = new_dataset['is_fraud']

In [21]:
print(X)

        Unnamed: 0 trans_date_trans_time            cc_num  \
331441      331441   2020-10-26 11:05:26     4783226709001   
152866      152866   2020-08-13 18:18:05  5559857416065248   
458298      458298   2020-12-10 09:53:57   213125815021702   
50660        50660   2020-07-08 09:55:07  2475085306462014   
152553      152553   2020-08-13 15:31:45  4003989662068504   
...            ...                   ...               ...   
517197      517197   2020-12-22 22:05:48  2242176657877538   
517274      517274   2020-12-22 22:18:07  2242176657877538   
517341      517341   2020-12-22 22:31:48  2242176657877538   
517529      517529   2020-12-22 23:06:03  2242176657877538   
517571      517571   2020-12-22 23:13:39  2242176657877538   

                                    merchant      category      amt    first  \
331441                      fraud_Sawayn PLC  shopping_pos     6.99  Jessica   
152866  fraud_Medhurst, Labadie and Gottlieb        travel     3.81     Jack   
458298     frau

In [22]:
print(Y)

331441    0
152866    0
458298    0
50660     0
152553    0
         ..
517197    1
517274    1
517341    1
517529    1
517571    1
Name: is_fraud, Length: 4290, dtype: int64


Split the data into training data and testing data 

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [24]:
print(X.shape, X_train.shape, X_test.shape)

(4290, 22) (3432, 22) (858, 22)


Model Training 

Logistic Regression model 

In [25]:
model = LogisticRegression()

Model Evaluation    

Accuracy Score 