## 1. Import the necessary Libraries 

In [106]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split

## 2. Import Dataset

In [2]:
fraud_data = pd.read_csv('Fraud_check.csv')

In [3]:
fraud_data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


## 3. Understanding the Data

In [6]:
fraud_data.shape  # 6 features and 600 Observations

(600, 6)

In [7]:
fraud_data.isna().sum()  # There are no null values to deal with

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [8]:
fraud_data.dtypes

Undergrad          object
Marital.Status     object
Taxable.Income      int64
City.Population     int64
Work.Experience     int64
Urban              object
dtype: object

In [10]:
fraud_data['Undergrad'].value_counts()

YES    312
NO     288
Name: Undergrad, dtype: int64

In [11]:
fraud_data['Marital.Status'].value_counts()

Single      217
Married     194
Divorced    189
Name: Marital.Status, dtype: int64

In [14]:
fraud_data['Urban'].value_counts()

YES    302
NO     298
Name: Urban, dtype: int64

## 4. Data preparation


##### We are having OBJECT Data types for 3 features here 


In [31]:
le = LabelEncoder()

In [32]:
le 

LabelEncoder()

In [34]:
fraud_data['E_Undergrad'] = le.fit_transform(fraud_data['Undergrad'])
fraud_data['E_Marital.Status'] = le.fit_transform(fraud_data['Marital.Status'])
fraud_data['E_Urbun'] = le.fit_transform(fraud_data['Urban'])

In [35]:
fraud_data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,E_Undergrad,E_Marital.Status,E_Urbun
0,NO,Single,68833,50047,10,YES,0,2,1
1,YES,Divorced,33700,134075,18,YES,1,0,1
2,NO,Married,36925,160205,30,YES,0,1,1
3,YES,Single,50190,193264,15,YES,1,2,1
4,NO,Married,81002,27533,28,NO,0,1,0


In [67]:
fraud_data2 = fraud_data.drop(columns=['Undergrad','Marital.Status','Urban'])
fraud_data2.head()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,E_Undergrad,E_Marital.Status,E_Urbun
0,68833,50047,10,0,2,1
1,33700,134075,18,1,0,1
2,36925,160205,30,0,1,1
3,50190,193264,15,1,2,1
4,81002,27533,28,0,1,0


In [68]:
E_Taxable_Income = []

In [69]:
def Encoded_income():
    for i in fraud_data['Taxable.Income']:
        if i <= 30000:
            E_Taxable_Income.append(1)
        else:
            E_Taxable_Income.append(0)

In [70]:
E_Taxable_Income

[]

In [71]:
Encoded_income()

In [72]:
E_Taxable_Income = pd.DataFrame(data = E_Taxable_Income, columns=['E_Taxable.Income'])

In [73]:
E_Taxable_Income.value_counts()

E_Taxable.Income
0                   476
1                   124
dtype: int64

In [74]:
fraud_data2.head()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,E_Undergrad,E_Marital.Status,E_Urbun
0,68833,50047,10,0,2,1
1,33700,134075,18,1,0,1
2,36925,160205,30,0,1,1
3,50190,193264,15,1,2,1
4,81002,27533,28,0,1,0


In [75]:
fraud_data2.dtypes

Taxable.Income      int64
City.Population     int64
Work.Experience     int64
E_Undergrad         int32
E_Marital.Status    int32
E_Urbun             int32
dtype: object

In [77]:
fraud_data2['E_Taxable.Income'] = E_Taxable_Income

In [79]:
fraud_data2.head(20)

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,E_Undergrad,E_Marital.Status,E_Urbun,E_Taxable.Income
0,68833,50047,10,0,2,1,0
1,33700,134075,18,1,0,1,0
2,36925,160205,30,0,1,1,0
3,50190,193264,15,1,2,1,0
4,81002,27533,28,0,1,0,0
5,33329,116382,0,0,0,0,0
6,83357,80890,8,0,0,1,0
7,62774,131253,3,1,2,1,0
8,83519,102481,12,0,2,1,0
9,98152,155482,4,1,0,1,0


In [80]:
fraud_data_new = fraud_data2.drop(columns='Taxable.Income')

In [81]:
fraud_data_new

Unnamed: 0,City.Population,Work.Experience,E_Undergrad,E_Marital.Status,E_Urbun,E_Taxable.Income
0,50047,10,0,2,1,0
1,134075,18,1,0,1,0
2,160205,30,0,1,1,0
3,193264,15,1,2,1,0
4,27533,28,0,1,0,0
...,...,...,...,...,...,...
595,39492,7,1,0,1,0
596,55369,2,1,0,1,0
597,154058,0,0,0,1,0
598,180083,17,1,1,0,0


In [82]:
X = fraud_data_new.drop(labels = E_Taxable_Income, axis=1)
X

Unnamed: 0,City.Population,Work.Experience,E_Undergrad,E_Marital.Status,E_Urbun
0,50047,10,0,2,1
1,134075,18,1,0,1
2,160205,30,0,1,1
3,193264,15,1,2,1
4,27533,28,0,1,0
...,...,...,...,...,...
595,39492,7,1,0,1
596,55369,2,1,0,1
597,154058,0,0,0,1
598,180083,17,1,1,0


In [86]:
y = fraud_data_new[['E_Taxable.Income']]
y

Unnamed: 0,E_Taxable.Income
0,0
1,0
2,0
3,0
4,0
...,...
595,0
596,0
597,0
598,0


In [90]:
std_scalar = StandardScaler()
X_scaled = std_scalar.fit_transform(X)

In [93]:
X_scaled = pd.DataFrame(data = X_scaled, columns= X.columns)
X_scaled.head()

Unnamed: 0,City.Population,Work.Experience,E_Undergrad,E_Marital.Status,E_Urbun
0,-1.178521,-0.629143,-1.040833,1.1608,0.993355
1,0.5085,0.27637,0.960769,-1.274445,0.993355
2,1.033109,1.634639,-1.040833,-0.056822,0.993355
3,1.696831,-0.063197,0.960769,1.1608,0.993355
4,-1.630532,1.408261,-1.040833,-0.056822,-1.006689


In [107]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=2, shuffle=True)

In [95]:
dt_model = DecisionTreeClassifier()

In [108]:
dt_model.fit(X_train,y_train)

DecisionTreeClassifier()

In [109]:
y_pred_train = dt_model.predict(X_train)

In [110]:
print('Accuracy Score for Training   : '  , accuracy_score(y_train, y_pred_train))
print('Confusion Matrix for Training : \n', confusion_matrix(y_train, y_pred_train))

Accuracy Score for Training   :  1.0
Confusion Matrix for Training : 
 [[359   0]
 [  0  91]]


In [111]:
y_pred_test = dt_model.predict(X_test)

In [112]:
print('Accuracy Score for Testing   : '  , accuracy_score(y_test, y_pred_test))
print('Confusion Matrix for Testing : \n', confusion_matrix(y_test, y_pred_test))

Accuracy Score for Testing   :  0.64
Confusion Matrix for Testing : 
 [[90 27]
 [27  6]]


In [103]:
confusion_matrix(y_pred, y)

array([[476,   0],
       [  0, 124]], dtype=int64)

In [104]:
accuracy_score(y, y_pred)

1.0

In [105]:
recall_score(y, y_pred)

1.0