In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
train_data = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [4]:
data = train_data.drop(['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'trans_num'], axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 17 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   merchant               1296675 non-null  object 
 2   category               1296675 non-null  object 
 3   amt                    1296675 non-null  float64
 4   gender                 1296675 non-null  object 
 5   city                   1296675 non-null  object 
 6   state                  1296675 non-null  object 
 7   zip                    1296675 non-null  int64  
 8   lat                    1296675 non-null  float64
 9   long                   1296675 non-null  float64
 10  city_pop               1296675 non-null  int64  
 11  job                    1296675 non-null  object 
 12  dob                    1296675 non-null  object 
 13  unix_time              1296675 non-null  int64  
 14  merch_lat         

In [5]:
# Feature Engineering: Extract features from trans_date_trans_time
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
data['hour'] = data['trans_date_trans_time'].dt.hour
data['day'] = data['trans_date_trans_time'].dt.day
data['month'] = data['trans_date_trans_time'].dt.month
data['day_of_week'] = data['trans_date_trans_time'].dt.dayofweek

data['dob'] = pd.to_datetime(data['dob'])
data['day_of_birth'] = data['trans_date_trans_time'].dt.day
data['month_of_birth'] = data['trans_date_trans_time'].dt.month
data['year_of_birth'] = data['trans_date_trans_time'].dt.year


In [6]:
# Drop the original trans_date_trans_time column
data = data.drop(['trans_date_trans_time'], axis=1)
data = data.drop(['dob'],axis=1)

In [7]:
# Encode categorical variables
label_encoders = {}
categorical_columns = ['merchant', 'category', 'gender', 'city', 'state', 'job']
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
data

Unnamed: 0,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,...,merch_lat,merch_long,is_fraud,hour,day,month,day_of_week,day_of_birth,month_of_birth,year_of_birth
0,514,8,4.97,0,526,27,28654,36.0788,-81.1781,3495,...,36.011293,-82.048315,0,0,1,1,1,1,1,2019
1,241,4,107.23,0,612,47,99160,48.8878,-118.2105,149,...,49.159047,-118.186462,0,0,1,1,1,1,1,2019
2,390,0,220.11,1,468,13,83252,42.1808,-112.2620,4154,...,43.150704,-112.154481,0,0,1,1,1,1,1,2019
3,360,2,45.00,1,84,26,59632,46.2306,-112.1138,1939,...,47.034331,-112.561071,0,0,1,1,1,1,1,2019
4,297,9,41.96,1,216,45,24433,38.4207,-79.4629,99,...,38.674999,-78.632459,0,0,1,1,1,1,1,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,499,0,15.56,1,330,44,84735,37.7175,-112.4777,258,...,36.841266,-111.690765,0,12,21,6,6,21,6,2020
1296671,2,1,51.70,1,813,20,21790,39.2667,-77.5101,100,...,38.906881,-78.246528,0,12,21,6,6,21,6,2020
1296672,599,1,105.93,1,346,32,88325,32.9396,-105.8189,899,...,33.619513,-105.130529,0,12,21,6,6,21,6,2020
1296673,509,1,74.90,1,471,41,57756,43.3526,-102.5411,1126,...,42.788940,-103.241160,0,12,21,6,6,21,6,2020


In [8]:
# Standardize the numerical features
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])


NameError: name 'X' is not defined

In [None]:
X = data.drop('is_fraud', axis=1)
y = data['is_fraud']

In [None]:
import matplotlib.pyplot as plt
data.is_fraud.plot.hist()

In [None]:
X.corrwith(data['is_fraud']).plot.bar(figsize=(16,9), title = 'Correlation with the variables', rot =45, grid=True)

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X,y)

In [None]:
test_data = pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv')
test_data.info()

In [None]:
test = test_data.drop(['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'trans_num'], axis=1)
test['trans_date_trans_time'] = pd.to_datetime(test['trans_date_trans_time'])
test['hour'] = test['trans_date_trans_time'].dt.hour
test['day'] = test['trans_date_trans_time'].dt.day
test['month'] = test['trans_date_trans_time'].dt.month
test['day_of_week'] = test['trans_date_trans_time'].dt.dayofweek

test['dob'] = pd.to_datetime(test['dob'])
test['day_of_birth'] = test['trans_date_trans_time'].dt.day
test['month_of_birth'] = test['trans_date_trans_time'].dt.month
test['year_of_birth'] = test['trans_date_trans_time'].dt.year

In [None]:
# Drop the original trans_date_trans_time column
test = test.drop(['trans_date_trans_time'], axis=1)
test = test.drop(['dob'],axis=1)

In [None]:
# Encode categorical variables
label_encoders = {}
categorical_columns = ['merchant', 'category', 'gender', 'city', 'state', 'job']
for col in categorical_columns:
    le = LabelEncoder()
    test[col] = le.fit_transform(test[col])
    label_encoders[col] = le
test

In [None]:
# Standardize the numerical features
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

X_test = test.drop('is_fraud', axis=1)
y_test = test['is_fraud']


In [None]:
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test,y_pred)
acc