# Importing the required dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Lets load the dataset of fraud, I have named Rfraud as we have same dataset for decision tree

In [2]:
Rfraud = pd.read_csv(r"C:\Users\Binita Mandal\Desktop\finity\Random Forests\Fraud_check.csv")

In [3]:
# Head of the dataset
Rfraud.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [4]:
# Tail of the dataset
Rfraud.tail()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO
599,NO,Divorced,96519,158137,16,NO


In [5]:
# Size of the dataset
Rfraud.size

3600

In [6]:
# Shape of the dataset
Rfraud.shape

(600, 6)

In [7]:
Rfraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [8]:
Rfraud.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [9]:
#Converting the Taxable income variable to bucketing. 
Rfraud["income"]="<=30000"
Rfraud.loc[Rfraud["Taxable.Income"]>=30000,"income"]="Good"
Rfraud.loc[Rfraud["Taxable.Income"]<=30000,"income"]="Risky"

In [10]:
#Droping the Taxable income variable
Rfraud.drop(["Taxable.Income"],axis=1,inplace=True)

### As we are getting error as "ValueError: could not convert string to float: 'YES'".
### Model.fit doesnt not consider String. So, we encode

In [11]:
Rfraud.rename(columns={"Undergrad":"undergrad","Marital.Status":"marital","City.Population":"population","Work.Experience":"experience","Urban":"urban"},inplace=True)

In [12]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
for column_name in Rfraud.columns:
    if Rfraud[column_name].dtype == object:
        Rfraud[column_name] = le.fit_transform(Rfraud[column_name])
    else:
        pass

In [13]:
#Splitting the data into featuers and labels
features = Rfraud.iloc[:,0:5]
labels = Rfraud.iloc[:,5]

In [14]:
# Collecting the column names
colnames = list(Rfraud.columns)
predictors = colnames[0:5]
target = colnames[5]

In [15]:
#Splitting the data into train and test

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size = 0.2,stratify = labels)

In [16]:
#Model building
from sklearn.ensemble import RandomForestClassifier as RF
model = RF(n_jobs = 3,n_estimators = 15, oob_score = True, criterion = "entropy")
model.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=15, n_jobs=3,
                       oob_score=True)

In [17]:
model.estimators_

[DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1145462256),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1320174460),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1391939111),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=278805779),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1943438093),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1927249215),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=44380852),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1979729394),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
         

In [18]:
model.classes_

array([0, 1])

In [19]:
model.n_features_

5

In [20]:
model.n_classes_

2

In [21]:
model.n_outputs_

1

In [22]:
model.oob_score_

0.71875

In [23]:
#Predictions on train data
prediction = model.predict(x_train)

In [24]:
# For accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_train,prediction)
print(accuracy)

0.9854166666666667


In [25]:
np.mean(prediction == y_train)

0.9854166666666667

In [26]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_train,prediction)

In [27]:
#Prediction on test data
pred_test = model.predict(x_test)

In [28]:
#Accuracy
acc_test =accuracy_score(y_test,pred_test)
print(acc_test)

0.7416666666666667


In [29]:
# In random forest we can plot a Decision tree present in Random forest
from sklearn.tree import export_graphviz
import pydotplus
from six import StringIO

In [30]:
tree = model.estimators_[5]

In [31]:
dot_data = StringIO()
export_graphviz(tree,out_file = dot_data, filled = True,rounded = True, feature_names = predictors ,class_names = target,impurity =False)

In [32]:
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

In [33]:
import os

os.environ['PATH'] = os.environ['PATH']+';'+os.environ['CONDA_PREFIX']+r"\Library\bin\graphviz"

In [34]:
# Creating pdf and png file the selected decision tree
graph.write_pdf('Rfraudrf.pdf')
graph.write_png('Rfraudrf.png')

True