In [230]:
import pandas as pd
import random
import numpy as np
import pprint

In [231]:
# Load the feature table used for training and evaluation below.
# The tree-building functions read the class label from the last column (data[:, -1]).
df = pd.read_csv('800ImagesFeatures.csv')
# Alternative feature file (disabled):
#df = pd.read_csv('Features2.csv')

# Quick look at the first rows (disabled):
#print(df.head())

In [232]:
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; rows are picked by index label.
    test_size : int or float
        Number of rows for the test set. A float is treated as a fraction of
        ``len(df)`` and rounded to a row count (e.g. 0.2 -> 20% of the rows).

    Returns
    -------
    tuple
        ``(train_df, test_df)``: the rows that were not sampled, and the
        sampled rows.

    Notes
    -----
    Sampling goes through the global ``random`` module state, so call
    ``random.seed(...)`` first when a reproducible split is needed.
    """
    n_test = round(test_size * len(df)) if isinstance(test_size, float) else test_size

    # draw the index labels of the test rows without replacement
    sampled_labels = random.sample(population=df.index.tolist(), k=n_test)

    held_out = df.loc[sampled_labels]
    remaining = df.drop(sampled_labels)
    return remaining, held_out

In [233]:
def check_purity(data):
    """Tell whether all rows of ``data`` share a single class label.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the class label.

    Returns
    -------
    bool
        True when the label column contains exactly one distinct value.
    """
    distinct_labels = np.unique(data[:, -1])

    # e.g. with the two classes 0 and 1 (kamm, burst) there are two distinct
    # values; exactly one distinct value means the data is pure
    return len(distinct_labels) == 1

In [234]:
def classify_data(data):
    """Return the most frequent class label in ``data`` (majority vote).

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the class label.

    Returns
    -------
    The label with the highest count. On a tie, the label that sorts first
    wins, because ``np.unique`` returns sorted labels and ``argmax`` picks
    the first maximum.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[frequencies.argmax()]

In [235]:
def get_potential_splits(data):
    """Collect candidate split thresholds for every feature column.

    For each column except the last one (the label), the candidates are the
    midpoints between consecutive distinct values of that column.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; the last column is the label and is skipped.

    Returns
    -------
    dict
        Maps column index -> list of midpoints. A column with a single
        distinct value maps to an empty list.
    """
    _, n_columns = data.shape
    splits_by_column = {}

    for column in range(n_columns - 1):  # the last column is the label
        distinct = np.unique(data[:, column])
        # pair every distinct value with its predecessor and take the midpoint
        splits_by_column[column] = [
            (upper + lower) / 2
            for lower, upper in zip(distinct[:-1], distinct[1:])
        ]

    return splits_by_column

In [236]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` by a threshold on one column.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of rows to partition.
    split_column : int
        Index of the column to compare.
    split_value : number
        Threshold.

    Returns
    -------
    tuple
        ``(data_below, data_above)``: rows with column value ``<= split_value``
        and rows with column value ``> split_value``.
    """
    feature = data[:, split_column]

    goes_below = feature <= split_value
    goes_above = feature > split_value

    return data[goes_below], data[goes_above]

In [237]:
def calculate_entropy(data):
    """Compute the base-2 entropy of the label column of ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the class label.

    Returns
    -------
    The sum over classes of ``p * -log2(p)``, where ``p`` is the relative
    frequency of each class.
    """
    _, class_counts = np.unique(data[:, -1], return_counts=True)

    class_probs = class_counts / class_counts.sum()
    per_class_terms = class_probs * -np.log2(class_probs)

    return sum(per_class_terms)

In [238]:
def calculate_overall_entropy(data_below, data_above):
    """Compute the size-weighted entropy of a two-way split.

    Parameters
    ----------
    data_below, data_above : numpy.ndarray
        The two partitions produced by a split; each has the label in its
        last column.

    Returns
    -------
    The entropy of each partition weighted by its share of the total number
    of rows, summed.
    """
    n_below, n_above = len(data_below), len(data_above)
    total = n_below + n_above

    weight_below = n_below / total
    weight_above = n_above / total

    return (weight_below * calculate_entropy(data_below)
            + weight_above * calculate_entropy(data_above))

In [239]:
def determine_best_split(data, potential_splits):
    """Pick the (column, threshold) pair with the lowest weighted entropy.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with the label in the last column.
    potential_splits : dict
        Maps column index -> iterable of candidate thresholds, as produced by
        ``get_potential_splits``.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value)``.

    Raises
    ------
    ValueError
        If no candidate split could be selected (for example every column's
        candidate list is empty). Previously this surfaced as an
        ``UnboundLocalError`` on the return statement.
    """
    lowest_entropy = float("inf")
    best_split_column = None
    best_split_value = None
    found = False

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            # "<=" (not "<") keeps the established tie-breaking: among equally
            # good splits the one visited last wins
            if current_overall_entropy <= lowest_entropy:
                lowest_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
                found = True

    if not found:
        raise ValueError(
            "determine_best_split: no usable candidate split "
            "(are all feature columns constant?)"
        )

    return best_split_column, best_split_value


In [240]:
# Illustration of one tree node: the question string maps to a two-element
# list holding the "yes" branch first and the "no" branch second.
sub_tree = {"question": ["yes_answer", "no_answer"]}

In [241]:
def decision_tree_algorithm(df, counter=0, max_depth = 10):
    """Recursively build a decision tree from labelled data.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        On the top-level call (``counter == 0``) a DataFrame whose last
        column is the label; on recursive calls the 2-D value array of a
        partition.
    counter : int
        Current recursion depth. Callers leave this at 0.
    max_depth : int
        Depth at which recursion stops and a majority-vote leaf is returned.

    Returns
    -------
    Either a leaf (a class label) or a dict of the form
    ``{"<feature> <= <value>": [yes_subtree, no_subtree]}``.

    Side effects
    ------------
    The top-level call stores ``df.columns`` in the module-level
    ``COLUMN_HEADERS`` so recursive calls can name the split feature.
    """
    global COLUMN_HEADERS

    # data preparation: only the first call receives a DataFrame; it is
    # converted to a 2-D numpy array (without header)
    if counter == 0:
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df

    # base cases (stop conditions so the recursion terminates)
    if check_purity(data) or counter == max_depth:
        return classify_data(data)

    potential_splits = get_potential_splits(data)

    # Impure data can still be unsplittable: if every feature column is
    # constant (e.g. identical rows with different labels) there is no
    # candidate threshold, so fall back to a majority-vote leaf instead of
    # crashing inside determine_best_split.
    if not any(len(candidates) for candidates in potential_splits.values()):
        return classify_data(data)

    # recursive part
    counter += 1

    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    # build the question for this node
    feature_name = COLUMN_HEADERS[split_column]
    question = "{} <= {}".format(feature_name, split_value)

    # find answers (recursion)
    yes_answer = decision_tree_algorithm(data_below, counter, max_depth)
    no_answer = decision_tree_algorithm(data_above, counter, max_depth)

    # both branches give the same answer -> the question is redundant, so
    # collapse the node into that answer
    if yes_answer == no_answer:
        return yes_answer

    return {question: [yes_answer, no_answer]}


In [242]:
def classify_example(example, tree):
    """Classify one example by walking down a decision tree.

    Parameters
    ----------
    example : mapping (e.g. a pandas Series or a dict)
        Supports ``example[feature_name]`` for every feature the tree asks
        about.
    tree : dict or leaf
        A tree as built by ``decision_tree_algorithm``:
        ``{"<feature> <= <value>": [yes_subtree, no_subtree]}``.
        A non-dict value is treated as a leaf and returned unchanged, so a
        tree that consists of a single leaf also works.

    Returns
    -------
    The leaf (class label) reached for ``example``.
    """
    node = tree

    # descend until a leaf (anything that is not a dict) is reached
    while isinstance(node, dict):
        question = next(iter(node))

        # Questions are built as "{} <= {}", so split on the LAST " <= ".
        # Unlike a plain whitespace split, this keeps feature names that
        # contain spaces intact.
        feature_name, _, value = question.rpartition(" <= ")

        branches = node[question]
        if example[feature_name] <= float(value):
            node = branches[0]
        else:
            node = branches[1]

    return node

In [244]:
# Sanity check: classify a single training row with the fitted tree.
# NOTE(review): `train_df` and `tree` are assigned in a later cell (the one that
# calls train_test_split / decision_tree_algorithm), so on a fresh kernel that
# cell has to be run before this one.
# Original author's note: this check is kept in its own cell because
# calculate_accuracy adds two more columns to the table it is given, and
# coming back to this check afterwards reportedly caused an error.
example= train_df.iloc[0]
print(example)
classify_example(example,tree)


contour_points                      676.000000
amount_contours                       3.000000
rect_area                         20729.545245
hull_area                         14977.000000
approximation_area                10152.500000
contour_perimeters                  789.494509
corners                              53.000000
harris_corners                      901.000000
ratio_wide_length                     3.399329
contour_length_area_ratio             0.077310
contour_length_rect_area_ratio        0.038085
contour_length_hull_area_ratio        0.052714
contour_rect_length_ratio             1.149038
contour_hull_length_ratio             1.318301
extent                                0.492630
solidity                              0.681845
hull_rectangle_ratio                  0.722495
labels                                0.000000
Name: 0, dtype: float64


0.0

In [245]:
def calculate_accuracy(df, tree):
    """Classify every row of ``df`` with ``tree`` and return the hit rate.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to classify; must contain a ``labels`` column with the true class.
    tree : dict
        Decision tree passed through to ``classify_example``.

    Returns
    -------
    The mean of the per-row correctness flags, i.e. the fraction of rows
    whose predicted class equals ``labels``.

    Side effects
    ------------
    Adds two columns to ``df`` in place: ``classification`` (the prediction)
    and ``classification_correct`` (prediction == label).
    """
    predictions = df.apply(classify_example, axis=1, args=(tree,))
    df["classification"] = predictions

    is_correct = df["classification"] == df["labels"]
    df["classification_correct"] = is_correct

    return df["classification_correct"].mean()

In [246]:
# Fixed seed so the random train/test split is reproducible.
random.seed(0)
train_df, test_df = train_test_split(df, 0.2)  # hold out 20% of the rows

# max_depth defaults to 10; pass max_depth=... to decision_tree_algorithm to change it.
tree = decision_tree_algorithm(train_df)

# calculate_accuracy also adds the `classification` and
# `classification_correct` columns to test_df.
accuracy = calculate_accuracy(test_df, tree)

pprint.pprint(tree, width=50)
# A bare `accuracy` expression in the middle of a cell is never displayed
# (only the last expression of a cell is), so print it explicitly.
print("accuracy:", accuracy)

test_df

{'hull_rectangle_ratio <= 0.8077957412719301': [{'contour_length_area_ratio <= 0.037084545491365695': [{'contour_hull_length_ratio <= 1.1707387253861703': [1.0,
                                                                                                                                                            {'contour_hull_length_ratio <= 1.2103389576690942': [0.0,
                                                                                                                                                                                                                 1.0]}]},
                                                                                                       {'contour_length_hull_area_ratio <= 0.07590293849917454': [{'hull_area <= 27251.75': [{'contour_hull_length_ratio <= 1.0966656424931571': [{'harris_corners <= 234.0': [{'contour_rect_length_ratio <= 0.9333884602156977': [0.0,
                                                                              

Unnamed: 0,contour_points,amount_contours,rect_area,hull_area,approximation_area,contour_perimeters,corners,harris_corners,ratio_wide_length,contour_length_area_ratio,contour_length_rect_area_ratio,contour_length_hull_area_ratio,contour_rect_length_ratio,contour_hull_length_ratio,extent,solidity,hull_rectangle_ratio,labels,classification,classification_correct
394,676,2,28754.648537,22359.5,17094.0,805.234624,56,159,3.182118,0.047035,0.028004,0.036013,1.012745,1.160856,0.595382,0.765670,0.777596,0,0.0,True
776,568,10,16061.428794,12950.0,9427.0,603.622364,89,452,2.295432,0.064045,0.037582,0.046612,1.094871,1.234824,0.586810,0.727799,0.806279,1,0.0,False
430,1011,2,65024.984436,64863.0,64804.5,1014.727922,44,281,1.000000,0.015657,0.015605,0.015644,0.994831,1.002589,0.996686,0.999175,0.997509,1,0.0,False
41,1752,7,65025.000000,65025.0,51351.5,1819.931020,67,698,1.000000,0.035487,0.027988,0.027988,1.784246,1.784246,0.788681,0.788681,1.000000,0,0.0,True
265,640,2,21178.651126,15945.5,13175.0,778.347322,46,417,4.743048,0.059019,0.036752,0.048813,1.014100,1.142181,0.622703,0.827067,0.752904,0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,592,2,12171.135912,11520.5,11039.0,733.661030,52,424,0.132169,0.066245,0.060279,0.063683,1.067712,1.103287,0.909940,0.961330,0.946543,1,1.0,True
620,740,2,30090.000000,29373.0,28696.5,745.798990,29,173,0.462745,0.025991,0.024786,0.025391,0.999731,1.007734,0.953606,0.976884,0.976171,1,1.0,True
707,984,5,60434.984985,60435.0,60435.0,984.000000,50,455,0.929412,0.016282,0.016282,0.016282,1.000000,1.000000,1.000000,1.000000,1.000000,1,1.0,True
122,895,21,30340.260358,27002.0,18386.0,959.203098,100,637,2.393403,0.052211,0.031615,0.035523,1.255288,1.347413,0.605516,0.680376,0.889973,0,1.0,False


In [247]:
# Per-class precision / recall / F1 on the held-out test rows.
# Reads the `classification` column that calculate_accuracy added to test_df,
# so the train/evaluate cell must have been run first.
from sklearn.metrics import classification_report

print(classification_report(test_df.labels, test_df.classification))

              precision    recall  f1-score   support

           0       0.72      0.75      0.74        77
           1       0.76      0.73      0.75        83

    accuracy                           0.74       160
   macro avg       0.74      0.74      0.74       160
weighted avg       0.74      0.74      0.74       160



In [248]:
# Confusion matrix on the held-out test rows, called as
# confusion_matrix(true labels, predicted labels).
# Reads the `classification` column that calculate_accuracy added to test_df.
from sklearn.metrics import confusion_matrix

print(confusion_matrix(test_df.labels, test_df.classification))


[[58 19]
 [22 61]]
