## ```Imports```
---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
fraud_train = pd.read_csv('/Users/crivera/Desktop/capstone-dsir82/capstone_data/fraudTrain.csv')
fraud_test = pd.read_csv('/Users/crivera/Desktop/capstone-dsir82/capstone_data/fraudTest.csv')

In [3]:
print(fraud_train.shape)
print(fraud_test.shape)

(1296675, 23)
(555719, 23)


In [4]:
fraud_train['is_fraud'].value_counts(normalize=True)

0    0.994211
1    0.005789
Name: is_fraud, dtype: float64

In [5]:
fraud_test['is_fraud'].value_counts(normalize=True)

0    0.99614
1    0.00386
Name: is_fraud, dtype: float64

## ```Creating Smote Balanced Classes Data Set for final Modeling```
---

In [6]:
# concat both datasets
fraud = pd.concat([fraud_train,fraud_test])

# drop uneccesary columns
fraud.drop(columns=['Unnamed: 0','first','last','street','unix_time','trans_num'], inplace=True)

# change trans_data_trans_time and dob to datetime 
fraud['trans_date_trans_time'] = pd.to_datetime(fraud['trans_date_trans_time'])
fraud['dob'] = pd.to_datetime(fraud['dob'])

# creating hour column to for eda
fraud['hour'] = fraud['trans_date_trans_time'].dt.hour
fraud.shape

(1852394, 18)

In [7]:
fraud.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1852394 entries, 0 to 555718
Data columns (total 18 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   merchant               object        
 3   category               object        
 4   amt                    float64       
 5   gender                 object        
 6   city                   object        
 7   state                  object        
 8   zip                    int64         
 9   lat                    float64       
 10  long                   float64       
 11  city_pop               int64         
 12  job                    object        
 13  dob                    datetime64[ns]
 14  merch_lat              float64       
 15  merch_long             float64       
 16  is_fraud               int64         
 17  hour                   int64         
dtypes: datetime64[ns](2), f

In [8]:
# dummy for object variables
df_fraud = fraud.copy()
df_fraud = pd.get_dummies(fraud, columns=['gender','city','state','category','merchant'])
df_fraud.shape

(1852394, 1679)

In [9]:
df_fraud.head()

Unnamed: 0,trans_date_trans_time,cc_num,amt,zip,lat,long,city_pop,job,dob,merch_lat,...,"merchant_fraud_Yost, Schamberger and Windler",merchant_fraud_Yost-Rogahn,merchant_fraud_Zboncak LLC,merchant_fraud_Zboncak Ltd,"merchant_fraud_Zboncak, Rowe and Murazik",merchant_fraud_Zemlak Group,"merchant_fraud_Zemlak, Tillman and Cremin",merchant_fraud_Ziemann-Waters,"merchant_fraud_Zieme, Bode and Dooley",merchant_fraud_Zulauf LLC
0,2019-01-01 00:00:18,2703186189652095,4.97,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,36.011293,...,0,0,0,0,0,0,0,0,0,0
1,2019-01-01 00:00:44,630423337322,107.23,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,49.159047,...,0,0,0,0,0,0,0,0,0,0
2,2019-01-01 00:00:51,38859492057661,220.11,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,43.150704,...,0,0,0,0,0,0,0,0,0,0
3,2019-01-01 00:01:16,3534093764340240,45.0,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,47.034331,...,0,0,0,0,0,0,0,0,0,0
4,2019-01-01 00:03:06,375534208663984,41.96,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,38.674999,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#set up X/y
X = df_fraud.drop(columns=['trans_date_trans_time','lat','long','job','merch_lat','merch_long','dob','is_fraud'])
y = df_fraud['is_fraud']
print(f"Target is_fraud value_counts: {y.value_counts()}")

# Instantiate Smote Class
smo = SMOTE(random_state=42)

print('====')
# Synthetically enhance minority class
X_smo, y_smo = smo.fit_resample(X,y)
print(f"Smote Target value_counts: {y_smo.value_counts()}")

Target is_fraud value_counts: 0    1842743
1       9651
Name: is_fraud, dtype: int64
====


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_smo,y_smo)

In [None]:
# train model
dt = DecisionTreeClassifier()
dt.fit(X_train_ss,y_train)

# score model
print(f"train Acc: {dt.score(X_train_ss,y_train)}")
print(f"test Acc: {dt.score(X_test_ss,y_test)}")

# Confusion Matrix
plot_confusion_matrix(dt, X_test_ss, y_test, display_labels=['noFraud','Fraud'])
plt.show()