# Use decision trees to build a model on the fraud data, treating those with taxable_income <= 30000 as "Risky" and all others as "Good".

#Data Description :

#Undergrad : whether the person is an undergraduate or not

#Marital.Status : marital status of a person

#Taxable.Income : the amount of an individual's income on which tax is owed to the government

#Work Experience : Work experience of an individual person

# Urban : Whether that person belongs to urban area or not

In [None]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets  
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn import preprocessing



In [None]:
# Colab-specific upload step: `uploaded` holds whatever files.upload() returns.
from google.colab import files
uploaded=files.upload()

In [None]:
# Read the dataset from the relative path "Fraud_check.csv" into a DataFrame.
df = pd.read_csv("Fraud_check.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Create dummy variables for ['Undergrad','Marital.Status','Urban'], dropping the first dummy
# level of each (drop_first=True). The result replaces df.
df=pd.get_dummies(df,columns=['Undergrad','Marital.Status','Urban'], drop_first=True)


In [None]:
# Display the encoded frame.
df

In [None]:
# Create a new column TaxInc that buckets 'Taxable.Income' into "Risky" (<= 30000) and
# "Good" (> 30000).
# The outer bin edges are open-ended rather than tied to one dataset's min/max, so every
# income gets a label; with finite outer edges any value outside them would become NaN.
df["TaxInc"] = pd.cut(
    df["Taxable.Income"],
    bins=[float("-inf"), 30000, float("inf")],
    labels=["Risky", "Good"],
)


In [None]:
# Print the frame to inspect the newly added TaxInc column.
print(df)

In [None]:
# Dummy-encode the TaxInc column (drop_first=True drops the first level); the cells below
# refer to the resulting indicator column as 'TaxInc_Good'.
df = pd.get_dummies(df,columns = ["TaxInc"],drop_first=True)

In [None]:
# Preview the first 10 rows after encoding.
df.head(10)

In [None]:
# Pair plot to visualise the attributes all at once, coloured by the 'TaxInc_Good' indicator.
import seaborn as sns
sns.pairplot(data=df, hue = 'TaxInc_Good')

In [None]:
# Normalization function 
def norm_func(i):
    """Min-max scale *i* to the range [0, 1].

    Args:
        i: A pandas DataFrame or Series holding numeric and/or boolean
            values. For a DataFrame each column is scaled independently.

    Returns:
        A float object of the same shape with ``(i - min) / (max - min)``
        applied. A constant column (zero range) is mapped to 0.0.
    """
    # Cast first: boolean dummy columns do not support `-`, and the result
    # of the scaling is fractional anyway.
    values = i.astype(float)
    low = values.min()
    span = values.max() - low
    # Guard against 0/0 for constant input, which would otherwise yield NaN.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1.0)
    elif span == 0:
        span = 1.0
    return (values - low) / span

In [None]:
# Normalized data frame: every column of df except the first is passed through norm_func.
# NOTE(review): presumably the skipped first column is 'Taxable.Income' — confirm the column order.
# The 'TaxInc_Good' indicator is part of the normalized frame too (it is read back as y below).
df_norm = norm_func(df.iloc[:,1:])
df_norm.tail(10)

In [None]:
# Declaring features & target: X is df_norm without 'TaxInc_Good', y is that column.
X = df_norm.drop(['TaxInc_Good'], axis=1)
y = df_norm['TaxInc_Good']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Splitting data into train & test: 20% held out, fixed random_state for a repeatable split.
# NOTE(review): the models fitted further down use x_train/y_train, not these variables.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
## Bucket the taxable income into a categorical "income" column on df_norm:
## "Risky" for Taxable.Income <= 30000, "Good" otherwise.
## A single comparison is used so the boundary value 30000 is classified exactly once and no
## row is left with a placeholder label.
is_risky = df["Taxable.Income"] <= 30000
df_norm["income"] = is_risky.map({True: "Risky", False: "Good"})

In [None]:
## Remove the raw 'Taxable.Income' column from df in place.
df.drop(columns=["Taxable.Income"], inplace=True)

In [None]:
## Shorten the column names of df in place. Keys that are not present among df's columns
## are left alone by rename (no error is raised with the default settings).
df.rename(
    columns={
        "Undergrad": "undergrad",
        "Marital.Status": "marital",
        "City.Population": "population",
        "Work.Experience": "experience",
        "Urban": "urban",
    },
    inplace=True,
)
## Fitting on raw strings fails with "ValueError: could not convert string to float: 'YES'"
## because model.fit does not accept strings, so string columns are label-encoded next.

In [None]:
## Label-encode every object-dtype column of df in place, reusing a single encoder instance.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in df.columns:
    if df[column_name].dtype != object:
        continue  # non-object columns are left untouched
    df[column_name] = le.fit_transform(df[column_name])

In [None]:
## Splitting the data into features and labels.
## Select by column name instead of by position: positional slices (0:5 / 5) depend on the
## column order, keep only five predictors and do not guarantee that the 'TaxInc_Good'
## indicator ends up as the target.
labels = df["TaxInc_Good"]
features = df.drop(columns=["TaxInc_Good"])

In [None]:
## Collecting the column names
colnames = list(df.columns)
## Derive target and predictors by name so they stay correct whatever the column order is.
target = "TaxInc_Good"
predictors = [name for name in colnames if name != target]
##Splitting the data into train and test

In [None]:
from sklearn.model_selection import train_test_split
## Stratified 80/20 split. The seed is fixed (as for the earlier split of X/y) so that the
## accuracies reported below are reproducible from run to run.
x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size = 0.2,stratify = labels,random_state = 0)


In [None]:
## Model building: a random forest of 15 entropy-based trees with out-of-bag scoring,
## fitted on the training split.
from sklearn.ensemble import RandomForestClassifier as RF
model = RF(
    criterion="entropy",
    n_estimators=15,
    oob_score=True,
    n_jobs=3,
)
model.fit(x_train, y_train)

In [None]:
# Constructs a second, unfitted estimator with the same settings; the result is not assigned
# to any name, so `model` is unaffected.
RF(criterion='entropy', n_estimators=15, n_jobs=3,
                       oob_score=True)

In [None]:
# Inspect attributes of the fitted forest.
model.estimators_
model.classes_
model.n_features_in_
model.n_classes_

In [None]:
model.n_outputs_

In [None]:
## Predictions on the training data
prediction = model.predict(x_train)

In [None]:
## Accuracy on the training data
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_train,prediction)

In [None]:
# Same training accuracy, computed as the fraction of matching predictions.
np.mean(prediction == y_train)

In [None]:
## Predictions on the test data
pred_test = model.predict(x_test)

In [None]:
## Accuracy on the test data
acc_test =accuracy_score(y_test,pred_test)


In [None]:
acc_test

# **Building Decision Tree Classifier using Entropy Criteria**

In [None]:
# Rebinds `model`: from here on it is a depth-3 entropy decision tree, no longer the random forest.
model = DecisionTreeClassifier(criterion = 'entropy',max_depth=3)
model.fit(x_train,y_train)

In [None]:
from sklearn import tree

In [None]:
# Plot the fitted decision tree with default labels.
tree.plot_tree(model);

In [None]:
# List the current column names of df.
colnames = list(df.columns)
colnames

In [None]:
# Take the feature names from the training frame and the class names from the fitted model,
# so the plot labels always match what the tree was actually trained on. plot_tree expects
# class_names in the ascending order of the classes, which is the order of model.classes_.
fn = list(x_train.columns)
cn = [str(label) for label in model.classes_]
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(model,
               feature_names = fn,
               class_names=cn,
               filled = True,
               ax = axes);

In [None]:
# Predicting on test data with the current `model`
preds = model.predict(x_test) # predictions for the test split
pd.Series(preds).value_counts() # count of each predicted category


In [None]:
preds

In [None]:
pd.crosstab(y_test,preds) # 2-way table of actual vs predicted labels (correct and wrong predictions)

In [None]:
# Accuracy: fraction of test predictions equal to the true labels
np.mean(preds==y_test)

#**Building Decision Tree Classifier (CART) using Gini Criteria**


In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth-3 decision tree using the Gini criterion, kept under its own name `model_gini`.
model_gini = DecisionTreeClassifier(criterion='gini', max_depth=3)

In [None]:
# Fit the Gini tree on the same training split as the other models.
model_gini.fit(x_train, y_train)

In [None]:
# Prediction and accuracy of the Gini tree on the test split.
# Predict with model_gini (not `model`) and score these predictions, so that the number
# shown really belongs to the Gini tree.
pred = model_gini.predict(x_test)
np.mean(pred == y_test)

Using both the entropy criterion and the Gini index, we get the same model accuracy.