The Bank Note Authentication Dataset

The dataset contains the following four input features, computed from 400x400 images of bank notes:
1. variance of Wavelet Transformed image (continuous)
2. skewness of Wavelet Transformed image (continuous)
3. curtosis (kurtosis) of Wavelet Transformed image (continuous)
4. entropy of image (continuous)
Each instance is labelled as fake (label 0) or authentic (label 1).

In [1]:
import pandas as pd
import numpy as np
from numpy import log2 as log
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn import metrics

For the given dataset, implement the decision-tree-based CART algorithm:

# (i) Inbuilt Function

In [2]:
# Read the bank-note table (four image features followed by the Target label)
# from a CSV file located via a relative path, then print the frame.
df = pd.read_csv(filepath_or_buffer='bank_notes.csv')
print(df)

      variance  skewness  curtosis  entropy  Target
0      3.62160   8.66610   -2.8073 -0.44699       0
1      4.54590   8.16740   -2.4586 -1.46210       0
2      3.86600  -2.63830    1.9242  0.10645       0
3      3.45660   9.52280   -4.0112 -3.59440       0
4      0.32924  -4.45520    4.5718 -0.98880       0
...        ...       ...       ...      ...     ...
1367   0.40614   1.34920   -1.4501 -0.55949       1
1368  -1.38870  -4.87730    6.4774  0.34179       1
1369  -3.75030 -13.45860   17.5932 -2.77710       1
1370  -3.56370  -8.38270   12.3930 -1.28230       1
1371  -2.54190  -0.65804    2.6842  1.19520       1

[1372 rows x 5 columns]


In [3]:
# Every column except the last is a feature; the last column is the class label.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
  x, y, test_size=0.25, random_state=42
)

In [4]:
# Fit scikit-learn's decision tree with entropy as the split criterion.
# fit() returns the estimator itself, so construction and fitting are chained.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(x_train, y_train)

# Predict the held-out rows; the bare name on the last line displays the array.
y_pred = classifier.predict(x_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,

Performance Evaluation

In [5]:
# Confusion matrix of the held-out rows: true classes against predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of held-out rows classified correctly (shown as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[190   1]
 [  4 148]]


0.9854227405247813

# (ii) Step-By-Step Method

In [6]:
# Machine epsilon for float: a tiny offset available to guard against
# log2(0) and division by zero in the entropy helpers.
eps = np.finfo(float).eps

# function to calculate entropy of a complete set
def find_entropy(df) :
  """Return the Shannon entropy (in bits) of the class column of ``df``.

  The class column is taken to be the last column of the frame.

  Parameters
  ----------
  df : pandas.DataFrame
    Table whose last column holds the class labels.

  Returns
  -------
  float
    ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the share
    of rows carrying that label. An empty frame yields ``0``.
  """
  Class = df.keys()[-1]
  labels = df[Class]
  total = len(labels)
  # Count every label once up front instead of recounting the whole column
  # for each distinct label inside the loop.
  counts = labels.value_counts()
  entropy = 0
  for value in labels.unique() :
    fraction = counts[value]/total # share of rows carrying this label
    entropy = entropy + -fraction*np.log2(fraction)
  return entropy

# function to calculate the entropy of a feature given a set
def find_entropy_attribute(df,attribute) :
  """Return the weighted class entropy (in bits) left after splitting on ``attribute``.

  Each distinct value of ``attribute`` is treated as its own branch, so a
  continuous feature gets one branch per observed value. For every branch the
  entropy of the class column (the last column of ``df``) is computed, and the
  branch entropies are averaged with weights equal to the share of rows that
  fall into each branch.

  Parameters
  ----------
  df : pandas.DataFrame
    Table whose last column holds the class labels.
  attribute : column label
    Feature column to split on.

  Returns
  -------
  float
    Non-negative weighted entropy; ``0.0`` for an empty frame.
  """
  Class = df.keys()[-1]
  total = len(df)
  tiny = np.finfo(float).eps # keeps the ratio and the logarithm finite
  weighted = 0.0
  # One pass over the groups replaces the repeated full-frame boolean masks;
  # sort=False visits the values in order of first appearance.
  for _, labels in df.groupby(attribute, sort=False)[Class] :
    den = len(labels)
    fractions = labels.value_counts().to_numpy()/(den+tiny)
    branch_entropy = -(fractions*np.log2(fractions+tiny)).sum()
    weighted += (den/total)*branch_entropy
  # The epsilon offsets can push the sum marginally below zero.
  return abs(weighted)

In [7]:
# function to find the feature with the highest information gain
def find_winner(df) :
  """Return the feature column of ``df`` with the highest information gain.

  Information gain is the entropy of the class column (last column) minus the
  weighted entropy that remains after splitting on the feature. When several
  features tie, the first one in column order is returned.

  Parameters
  ----------
  df : pandas.DataFrame
    Table with one or more feature columns followed by the class column.

  Returns
  -------
  column label
    Name of the winning feature.

  Raises
  ------
  ValueError
    If ``df`` has no feature column in front of the class column.
  """
  features = df.keys()[:-1]
  if len(features) == 0 :
    raise ValueError("find_winner needs at least one feature column before the class column")
  # The entropy of the whole set does not depend on the feature, so compute it once.
  base_entropy = find_entropy(df)
  IG = [base_entropy-find_entropy_attribute(df,key) for key in features]
  return features[np.argmax(IG)]
  
# function that returns the subtable that meets the given condition
def get_subtable(df,node,value) :
  """Return the rows of ``df`` whose ``node`` column equals ``value``, re-indexed from 0."""
  matches = df[node] == value
  selected = df.loc[matches]
  return selected.reset_index(drop=True)

In [8]:
# constructing decision tree
def buildTree(df,tree=None) :
  """Recursively build a decision tree as nested dictionaries.

  The tree has the shape ``{feature: {value: subtree_or_label}}``: the feature
  with the highest information gain becomes the node, every distinct value of
  that feature becomes a branch, and a branch ends in a class label once the
  rows under it all share one class.

  Parameters
  ----------
  df : pandas.DataFrame
    Table with feature columns followed by the class column (last column).
  tree : dict, optional
    Existing dictionary to add the node to; a new one is created when omitted.

  Returns
  -------
  dict
    The (possibly newly created) dictionary holding the node for ``df``.
  """
  Class = df.keys()[-1]
  node = find_winner(df) # node is the attribute with the maximum information gain
  # A constant feature cannot separate mixed classes; when the winner is
  # constant, retry among the features that still vary in this subset.
  if df[Class].nunique() > 1 and df[node].nunique() <= 1 :
    varying = [key for key in df.keys()[:-1] if df[key].nunique() > 1]
    if varying :
      node = find_winner(df[varying+[Class]])
  attValue = np.unique(df[node]) # returns distinct values for that attribute
  # empty dictionary for the tree
  if tree is None :
    tree = {}
  # Also covers a caller-supplied dictionary that has no entry for this node yet.
  tree.setdefault(node, {})
  # the tree is made recursively for subsets of the data; anytime a subset is pure the recursion stops
  for value in attValue :
    subTable = get_subtable(df,node,value)
    # Use the detected class column rather than a hard-coded column name.
    c1Value,counts = np.unique(subTable[Class],return_counts=True)
    if len(counts)==1 : # checking purity of the subset
      tree[node][value] = c1Value[0]
    elif len(subTable)==len(df) :
      # The split removed no rows, so recursing would never terminate (e.g.
      # identical feature rows with conflicting labels). Fall back to the most
      # frequent class; ties go to the first label in sorted order.
      tree[node][value] = c1Value[np.argmax(counts)]
    else :
      tree[node][value] = buildTree(subTable)
  return tree
# The tree is built from every row of df.
tree = buildTree(df)

Performance Evaluation

In [9]:
def predict(inst,tree) :
  """Return the class label that ``tree`` assigns to ``inst``.

  ``tree`` is a nested dictionary of the form
  ``{feature: {value: subtree_or_label}}``; each internal node is expected to
  hold a single feature key. Starting at the root, the value ``inst`` has for
  the node's feature selects the branch to follow until a non-dictionary leaf
  is reached.

  Parameters
  ----------
  inst : mapping or pandas.Series
    Instance to classify, indexable by feature name.
  tree : dict or label
    Decision tree; a bare (non-dict) label is returned unchanged.

  Returns
  -------
  label
    The leaf value reached for ``inst``.

  Raises
  ------
  KeyError
    If ``inst`` lacks a feature used by the tree, or its value for a feature
    has no branch (branches match values by exact equality).
  ValueError
    If an empty dictionary is encountered where a node was expected.
  """
  node = tree
  while isinstance(node, dict) :
    if not node :
      raise ValueError("cannot predict from an empty tree")
    feature = next(iter(node))
    value = inst[feature]
    branches = node[feature]
    if value not in branches :
      raise KeyError(f"no branch for {feature}={value!r}; the tree only covers values it was built with")
    node = branches[value]
  return node

# Score the hand-built tree on rows 1000 onward of df.
# NOTE(review): `tree` was built from every row of df, so these rows were also
# used to build it, and the recorded report lists only class 1 for this slice;
# the figures below therefore do not measure performance on unseen data.
df1 = df.iloc[1000:, ]
Y_test = df1.iloc[:, -1]
Y_label = []
for i in range(df1.shape[0]) :
  inst = df1.iloc[i]
  prediction = predict(inst, tree)
  Y_label.append(prediction)

print("Confusion Matrix", metrics.confusion_matrix(y_true=Y_test, y_pred=Y_label))
print(metrics.classification_report(y_true=Y_test, y_pred=Y_label))

Confusion Matrix [[372]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       372

    accuracy                           1.00       372
   macro avg       1.00      1.00      1.00       372
weighted avg       1.00      1.00      1.00       372

