# Decision Tree- Fraud Check

Use decision trees to build a model on the fraud data,
treating those who have taxable_income <= 30000 as "Risky" and all others as "Good".

Data Description :

Undergrad : whether the person is an undergraduate or not

Marital.Status : marital status of a person

Taxable.Income : the amount of an individual's income that is subject to tax by the government

Work.Experience : work experience of the individual

Urban : Whether that person belongs to urban area or not

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report

In [None]:
# Load the raw fraud-check records from the CSV in the working directory.
fraud = pd.read_csv("Fraud_check.csv")

In [None]:
# Summary statistics of the raw frame (shown as the cell output).
fraud.describe()

In [None]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
df = pd.get_dummies(fraud)

# Derive the target from Taxable.Income as the problem statement requires:
# income <= 30000 -> 'Risky', anything above -> 'Good'.
# include_lowest=True closes the first bin on the left so an income of 0 is kept.
df['Category'] = pd.cut(
    x=df['Taxable.Income'],
    bins=[0, 30000, np.inf],
    labels=['Risky', 'Good'],
    include_lowest=True,
)

In [None]:
# Show the encoded frame together with the new 'Category' target column.
df

In [None]:
# Select features and target by name rather than by position, so a change in
# column order cannot silently shift the slice.
# Taxable.Income is excluded from the features because 'Category' is derived
# directly from it; keeping it would leak the target into the model.
x = df.drop(columns=['Taxable.Income', 'Category'])
y = df['Category']

In [None]:
# Show the feature matrix.
x

In [None]:
# Show the target column.
y

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=40
)

In [None]:
# Class balance of the training target.
y_train.value_counts()

In [None]:
# Class balance of the test target.
y_test.value_counts()

### Building Decision Tree Classifier using Entropy Criteria
### Iteration-1: Max Depth = 2

In [None]:
# Entropy-based tree limited to depth 2.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; an unseeded tree can differ between runs, which would make
# the depth-by-depth accuracy comparison unrepeatable.
model1 = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=40)
model1.fit(x_train, y_train)
preds1 = model1.predict(x_test)  # predicting on test data set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model1.get_n_leaves(),'\n','\n',
      pd.Series(preds1).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds1==y_test))

### Iteration-2: Max Depth = 3

In [None]:
# Entropy-based tree limited to depth 3.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; an unseeded tree can differ between runs, which would make
# the depth-by-depth accuracy comparison unrepeatable.
model2 = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=40)
model2.fit(x_train, y_train)
preds2 = model2.predict(x_test)  # predicting on test data set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model2.get_n_leaves(),'\n','\n',
      pd.Series(preds2).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds2==y_test))

### Iteration-3: Max Depth = 4

In [None]:
# Entropy-based tree limited to depth 4.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; an unseeded tree can differ between runs, which would make
# the depth-by-depth accuracy comparison unrepeatable.
model3 = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=40)
model3.fit(x_train, y_train)
preds3 = model3.predict(x_test)  # predicting on test data set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model3.get_n_leaves(),'\n','\n',
      pd.Series(preds3).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds3==y_test))

### Iteration-4: Max Depth = 5

In [None]:
# Entropy-based tree limited to depth 5.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; an unseeded tree can differ between runs, which would make
# the depth-by-depth accuracy comparison unrepeatable.
model4 = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=40)
model4.fit(x_train, y_train)
preds4 = model4.predict(x_test)  # predicting on test data set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model4.get_n_leaves(),'\n','\n',
      pd.Series(preds4).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds4==y_test))

### Iteration-5: Max Depth = 6

In [None]:
# Entropy-based tree limited to depth 6.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; an unseeded tree can differ between runs, which would make
# the depth-by-depth accuracy comparison unrepeatable.
model5 = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=40)
model5.fit(x_train, y_train)
preds5 = model5.predict(x_test)  # predicting on test data set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model5.get_n_leaves(),'\n','\n',
      pd.Series(preds5).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds5==y_test))

### Iteration-6: Max Depth = 7

In [None]:
# Entropy-based tree limited to depth 7.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; an unseeded tree can differ between runs, which would make
# the depth-by-depth accuracy comparison unrepeatable.
model6 = DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=40)
model6.fit(x_train, y_train)
preds6 = model6.predict(x_test)  # predicting on test data set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model6.get_n_leaves(),'\n','\n',
      pd.Series(preds6).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds6==y_test))

### Iteration-7: Max Depth = 8

In [None]:
# Entropy-based tree limited to depth 8.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; an unseeded tree can differ between runs, which would make
# the depth-by-depth accuracy comparison unrepeatable.
model7 = DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=40)
model7.fit(x_train, y_train)
preds7 = model7.predict(x_test)  # predicting on test data set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model7.get_n_leaves(),'\n','\n',
      pd.Series(preds7).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds7==y_test))

### Hence, the classifier model at the end of iteration 1 has the max accuracy i.e. 77.77%

In [None]:
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision and recall for every class, so the ground truth
# must come first.
print(classification_report(y_test, preds1))

### We'll run outlier detection (Isolation Forest) to find outliers and see whether removing them increases the model accuracy

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Fresh one-hot encoded copy of the raw data, used for outlier detection.
newdf = pd.get_dummies(fraud)

In [None]:
# Show the encoded frame that is passed to the outlier detector.
newdf

In [None]:
# Fit an Isolation Forest on every encoded column.
# contamination=.01 sets the expected share of outliers to 1% of the rows;
# the fixed seed keeps the fitted forest repeatable.
clf = IsolationForest(contamination=.01, random_state=40)
clf.fit(newdf)

In [None]:
# Label each row with the fitted forest.
y_pred_outliers = clf.predict(newdf)

In [None]:
# Show the IsolationForest labels: -1 for outliers and 1 for inliers.
y_pred_outliers

In [None]:
# Attach the anomaly score and the outlier label to every row.
# Only the first ten columns are passed in, so the helper columns added here
# are never fed back into the forest.
newdf['scores'] = clf.decision_function(newdf.iloc[:, :10])
newdf['anomaly'] = clf.predict(newdf.iloc[:, :10])
newdf

In [None]:
# Show only the rows flagged as outliers (anomaly == -1).
newdf.loc[newdf['anomaly'] == -1]

In [None]:
# Drop the rows flagged as outliers by filtering on the 'anomaly' label,
# not by hard-coded row numbers. A literal index list only matches one
# particular fit of the forest; a different data file or library version
# would remove the wrong rows.
df1 = newdf[newdf['anomaly'] != -1].reset_index(drop=True)
df1

In [None]:
# Rebuild the target on the outlier-free frame, following the problem statement:
# income <= 30000 -> 'Risky', anything above -> 'Good'.
df1['Category'] = pd.cut(
    x=df1['Taxable.Income'],
    bins=[0, 30000, np.inf],
    labels=['Risky', 'Good'],
    include_lowest=True,
)

df1

In [None]:
# Select features and target by name rather than by position, so a change in
# column order cannot silently shift the slice.
# Excluded from the features: Taxable.Income (the target is derived from it)
# and the two Isolation Forest helper columns.
x1 = df1.drop(columns=['Taxable.Income', 'scores', 'anomaly', 'Category'])
y1 = df1['Category']

In [None]:
# Show the feature matrix of the outlier-free data.
x1

In [None]:
# Show the target column of the outlier-free data.
y1

In [None]:
# Split the outlier-free data, holding out 25% of the rows for testing.
# NOTE(review): the first split in this notebook holds out 30% - confirm the
# different test size is intended.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x1, y1, test_size=0.25, random_state=40
)

### Building Decision Tree Classifier using Entropy Criteria
### Since iterations 1 and 2 had the highest accuracy last time, we'll build new models on this new dataframe with max depth = 2 & 3

In [None]:
# Depth-2 entropy tree trained and scored on the OUTLIER-FREE split
# (x_train1 / x_test1). Fitting on x_train / x_test would simply repeat the
# first experiment and say nothing about the effect of removing outliers.
# random_state is pinned so the result is repeatable.
model11 = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=40)
model11.fit(x_train1, y_train1)
preds11 = model11.predict(x_test1)  # predicting on the outlier-free test set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model11.get_n_leaves(),'\n','\n',
      pd.Series(preds11).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds11==y_test1))

In [None]:
# Depth-3 entropy tree trained and scored on the OUTLIER-FREE split
# (x_train1 / x_test1). Fitting on x_train / x_test would simply repeat the
# first experiment and say nothing about the effect of removing outliers.
# random_state is pinned so the result is repeatable.
model12 = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=40)
model12.fit(x_train1, y_train1)
preds12 = model12.predict(x_test1)  # predicting on the outlier-free test set

# Report the leaf count, the distribution of predicted classes and test accuracy.
print('Model leaves:',model12.get_n_leaves(),'\n','\n',
      pd.Series(preds12).value_counts(),'\n','\n',
      'Model Accuracy is:',np.mean(preds12==y_test1))

### Since the accuracy hasn't improved, we can finalise the model 1 as our final model
### Building Decision Tree Classifier (CART) using Gini Criteria

In [None]:
from sklearn.tree import DecisionTreeClassifier

# CART-style tree: Gini impurity as the split criterion, limited to depth 2.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so an unseeded tree is not guaranteed to be repeatable.
model_gini = DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=40)

In [None]:
# Train the Gini tree on the training split.
model_gini.fit(x_train, y_train)

In [None]:
# Predict on the held-out rows and report the share of exact matches.
predG = model_gini.predict(x_test)
print('Model Accuracy is:', np.mean(predG == y_test))

### Same Accuracy is achieved using CART as well
### Let's Visualize both the Decision Trees

In [None]:
# Draw the depth-2 entropy tree and save it as a PNG.
# feature_names must list exactly the columns the model was fitted on, in the
# same order. A list that also contains 'Taxable.Income' and 'Category' shifts
# every label, so each split would be shown against the wrong feature.
fig = plt.figure(figsize=(25, 20))
tree.plot_tree(model1,
               feature_names=list(x_train.columns),
               class_names=[str(label) for label in model1.classes_],
               filled=True)
plt.title('Decision Tree using Entropy', fontsize=22)
plt.savefig('Fraud Check DT_Entropy.png')

In [None]:
# Draw the depth-2 Gini (CART) tree and save it as a PNG.
# feature_names must list exactly the columns the model was fitted on, in the
# same order. A list that also contains 'Taxable.Income' and 'Category' shifts
# every label, so each split would be shown against the wrong feature.
fig = plt.figure(figsize=(25, 20))
tree.plot_tree(model_gini,
               feature_names=list(x_train.columns),
               class_names=[str(label) for label in model_gini.classes_],
               filled=True)
plt.title('Decision Tree using CART', fontsize=22)
plt.savefig('Fraud Check DT_CART.png')