In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix


About the data:
Data were extracted from images that were taken from genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400 x 400 pixels. Due to the object lens and the distance to the investigated object, gray-scale pictures with a resolution of about 660 dpi were obtained. A Wavelet Transform tool was used to extract features from the images.

Import the banknote_authentication dataset into a pandas dataframe. Since this dataset does not contain headers, we must specify the headers in read_csv using the 'names' option.

In [None]:
# Load the raw data file. It ships without a header row, so the column
# labels are supplied explicitly through `names`.
banknotes = pd.read_csv(
    filepath_or_buffer="/kaggle/input/banknote-detection-authentication/data_banknote_authentication.txt",
    names=[
        "Variance",
        "Skewness",
        "Curtosis",
        "Entropy",
        "TargetClass",
    ],
)

# Bare expression: rendered as the cell's output when run in a notebook.
banknotes

Attribute Information:
1. variance of Wavelet Transformed image (continuous) 
2. skewness of Wavelet Transformed image (continuous) 
3. curtosis of Wavelet Transformed image (continuous) 
4. entropy of image (continuous) 
5. class (integer) 

The last attribute is what we're trying to predict; the authenticity (0 - inauthentic, 1 - authentic) of the banknote based on the four independent attributes. 

Split the data into two datasets: one containing the independent attributes (variance, skewness, curtosis, entropy), X, and another containing the dependent attribute (TargetClass), Y.

X will be split into a training set and a testing set. The same goes for Y.

We'll be using an 80:20 (Training:Test) ratio, which is a common configuration in data science.

Note: the pop() function will permanently remove the 'TargetClass' feature from the dataframe. The dataset must be re-imported if need be.

In [None]:
# `x` is the same DataFrame object as `banknotes`; pop() removes the
# 'TargetClass' column from it in place and hands that column back as `y`,
# leaving only the four feature columns in `x`.
x, y = banknotes, banknotes.pop('TargetClass')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

Verify the dimensions of the newly created partitions: the training sets should contain about 80% of the rows and the testing sets about 20%.

In [None]:
# Report each partition's row count and its share of the full dataset.
# The share is rounded before formatting because %d truncates toward zero:
# a share such as 79.96% would otherwise be printed as 79%.
total_rows = len(banknotes)
for label, part in (("xTrain", xTrain), ("xTest", xTest),
                    ("yTrain", yTrain), ("yTest", yTest)):
    share = round(100 * len(part) / total_rows)
    print("Size of %s is: %d (%d%%)" % (label, len(part), share))

Initialize decision tree and run the model

In [None]:
# Seed the tree so repeated runs build the same model; the train/test split
# above is already seeded with random_state=0, and an unseeded tree may
# still break ties between equally good splits differently on each run.
dt = DecisionTreeClassifier(random_state=0)

# Learn from the training partition only.
dt.fit(xTrain, yTrain)

# Predicted class for every row of the held-out test partition.
y_pred = dt.predict(xTest)


Define precision function

In [None]:
def my_precision(y, pred):
    """Return the precision of `pred` for the positive class (label 1).

    Precision is TP / (TP + FP): of all samples predicted as 1, the
    fraction whose actual label matches.

    Args:
        y: Iterable of actual labels.
        pred: Iterable of predicted labels, aligned element-wise with `y`.

    Returns:
        The precision as a float. When nothing was predicted as 1 the
        ratio is undefined, and 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    TP = 0
    FP = 0

    # Single pass: every positive prediction is either a true positive
    # (matches the actual label) or a false positive (does not).
    for actual, predicted in zip(y, pred):
        if predicted != 1:
            continue
        if actual == predicted:
            TP += 1
        else:
            FP += 1

    predicted_positives = TP + FP
    if predicted_positives == 0:
        return 0.0
    return TP / predicted_positives

Define recall function

In [None]:
def my_recall(y, pred):
    """Return the recall of `pred` for the positive class (label 1).

    Recall is TP / (TP + FN), where a true positive is a correct
    prediction of 1 and a false negative is an incorrect prediction of 0.

    Args:
        y: Iterable of actual labels.
        pred: Iterable of predicted labels, aligned element-wise with `y`.

    Returns:
        The recall as a float. When there are neither true positives nor
        false negatives the ratio is undefined, and 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    TP = 0
    FN = 0  # misses: predicted 0 although the actual label differs

    # Single pass over the aligned (actual, predicted) pairs.
    for actual, predicted in zip(y, pred):
        if actual == predicted:
            if predicted == 1:
                TP += 1
        elif predicted == 0:
            FN += 1

    actual_positives = TP + FN
    if actual_positives == 0:
        return 0.0
    return TP / actual_positives

Define accuracy function

In [None]:
def my_accuracy(y, pred):
    """Return the fraction of predictions in `pred` that equal the label in `y`.

    The two inputs are paired by position, not by index label, so a pandas
    Series whose index has been shuffled (for example by a train/test
    split) is compared correctly against a plain array of predictions.

    Args:
        y: Iterable of actual labels.
        pred: Iterable of predicted labels, aligned element-wise with `y`.

    Returns:
        correct / total as a float, or 0.0 when there are no pairs to
        compare.
    """
    correct = 0
    total = 0

    # zip() walks both inputs positionally and relies only on the
    # arguments, never on a module-level variable.
    for actual, predicted in zip(y, pred):
        total += 1
        if actual == predicted:
            correct += 1

    if total == 0:
        return 0.0
    return correct / total