In [27]:
import os
from collections import Counter
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from category_encoders import TargetEncoder
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

In [2]:
# Render floats with five fixed decimal places in DataFrame/Series output.
# To restore the pandas default, run: pd.reset_option("display.float_format")
pd.set_option("display.float_format", "{:.5f}".format)

In [3]:
# Directory holding the dataset. The default is the original author's local
# folder; set the FRAUD_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get("FRAUD_DATA_DIR", "c:/Users/91959/Desktop/aftech"))

# Load the training transactions and preview the first two rows.
df = pd.read_csv(DATA_DIR / "fraudTrain.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.01129,-82.04832,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.15905,-118.18646,0


In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [5]:
# Number of columns in the frame (second entry of the shape tuple).
df.shape[1]

23

In [6]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
# Dropping by name (rather than by position 0) keeps this cell idempotent:
# re-running it is a no-op instead of silently deleting the next real column.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [8]:
# Store these columns with the pandas categorical dtype.
for column in ("gender", "is_fraud", "category", "state"):
    df[column] = df[column].astype("category")

In [9]:
# Count the rows per target class to see how balanced `is_fraud` is.
df['is_fraud'].value_counts()

is_fraud
0    1289169
1       7506
Name: count, dtype: int64

In [10]:
# Discard fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [11]:
# (rows, columns) after de-duplication.
df.shape

(1296675, 22)

In [12]:
# Parse the transaction timestamp and date-of-birth columns into datetimes.
for column in ("trans_date_trans_time", "dob"):
    df[column] = pd.to_datetime(df[column])

In [14]:
# Remove the columns that are not carried forward as features.
df = df.drop(
    columns=[
        'first',
        'last',
        'trans_num',
        'cc_num',
        'dob',
        'unix_time',
    ]
)

In [16]:
# The raw transaction timestamp is not used further; remove it as well.
df = df.drop(columns='trans_date_trans_time')

In [18]:
# Distinct-value count per remaining column (used to judge cardinality).
df.nunique()

merchant          693
category           14
amt             52928
gender              2
street            983
city              894
state              51
zip               970
lat               968
long              969
city_pop          879
job               494
merch_lat     1247805
merch_long    1275745
is_fraud            2
dtype: int64

In [19]:
# Give the transaction amount column a label that states its currency.
df.rename({"amt": "amount(USD)"}, axis="columns", inplace=True)

In [20]:
# preprocessing.LabelEncoder() - Encode target labels with value between 0 and n_classes-1.
# preprocessing.OneHotEncoder(*[, categories, ...]) - Encode categorical features as a one-hot numeric array.
# preprocessing.OrdinalEncoder(*[, ...]) - Encode categorical features as an integer array.
# preprocessing.TargetEncoder([categories, ...]) - Target Encoder for regression and classification targets.

In [21]:
# Integer-encode the low-cardinality columns, keeping ONE fitted encoder per
# column. Re-fitting a single shared LabelEncoder overwrites its learned
# classes on every call, so only the last column's mapping would survive and
# the others could not be inverse-transformed or re-applied to other data.
label_encoded_cols = ["gender", "is_fraud", "category", "state"]
label_encoders = {}
for col in label_encoded_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Backward compatibility: `encoder` used to end up fitted on "state"
# (the last column encoded), so keep that name bound to the same mapping.
encoder = label_encoders["state"]

In [22]:
# Preview the first two rows after label encoding.
df.head(2)

Unnamed: 0,merchant,category,amount(USD),gender,street,city,state,zip,lat,long,city_pop,job,merch_lat,merch_long,is_fraud
0,"fraud_Rippin, Kub and Mann",8,4.97,0,561 Perry Cove,Moravian Falls,27,28654,36.0788,-81.1781,3495,"Psychologist, counselling",36.01129,-82.04832,0
1,"fraud_Heller, Gutmann and Zieme",4,107.23,0,43039 Riley Greens Suite 393,Orient,47,99160,48.8878,-118.2105,149,Special educational needs teacher,49.15905,-118.18646,0


In [24]:
# Print dtypes, non-null counts and memory usage after label encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 15 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   merchant     1296675 non-null  object 
 1   category     1296675 non-null  int32  
 2   amount(USD)  1296675 non-null  float64
 3   gender       1296675 non-null  int32  
 4   street       1296675 non-null  object 
 5   city         1296675 non-null  object 
 6   state        1296675 non-null  int32  
 7   zip          1296675 non-null  int64  
 8   lat          1296675 non-null  float64
 9   long         1296675 non-null  float64
 10  city_pop     1296675 non-null  int64  
 11  job          1296675 non-null  object 
 12  merch_lat    1296675 non-null  float64
 13  merch_long   1296675 non-null  float64
 14  is_fraud     1296675 non-null  int64  
dtypes: float64(5), int32(3), int64(3), object(4)
memory usage: 133.6+ MB


In [25]:
# Distinct-value count per column after label encoding.
df.nunique()

merchant           693
category            14
amount(USD)      52928
gender               2
street             983
city               894
state               51
zip                970
lat                968
long               969
city_pop           879
job                494
merch_lat      1247805
merch_long     1275745
is_fraud             2
dtype: int64

In [28]:
# Target-encode the remaining text columns: merchant, street, city, job.
# One fitted TargetEncoder is kept per column; re-fitting a single shared
# encoder would discard every mapping except the last one, so the encodings
# could not be re-applied to other data.
#
# NOTE(review): the encoders are fitted on the whole frame using `is_fraud`.
# If a validation split or cross-validation is later carved out of `df`, these
# features leak the target into the held-out rows — fit on the training fold
# only in that case. TODO confirm against the modelling cells.
target_encoded_cols = ["merchant", "street", "city", "job"]
target_encoders = {}
for col in target_encoded_cols:
    target_encoders[col] = TargetEncoder()
    df[col] = target_encoders[col].fit_transform(df[col], df["is_fraud"])

# Backward compatibility: `target_encoder` used to end up fitted on "job"
# (the last column encoded), so keep that name bound to the same mapping.
target_encoder = target_encoders["job"]

In [29]:
# Print dtypes and non-null counts after target encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 15 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   merchant     1296675 non-null  float64
 1   category     1296675 non-null  int32  
 2   amount(USD)  1296675 non-null  float64
 3   gender       1296675 non-null  int32  
 4   street       1296675 non-null  float64
 5   city         1296675 non-null  float64
 6   state        1296675 non-null  int32  
 7   zip          1296675 non-null  int64  
 8   lat          1296675 non-null  float64
 9   long         1296675 non-null  float64
 10  city_pop     1296675 non-null  int64  
 11  job          1296675 non-null  float64
 12  merch_lat    1296675 non-null  float64
 13  merch_long   1296675 non-null  float64
 14  is_fraud     1296675 non-null  int64  
dtypes: float64(9), int32(3), int64(3)
memory usage: 133.6 MB
