In [43]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

In [44]:
# Load the training set from a CSV file located next to the notebook.
train_df = pd.read_csv("01_train.csv") 

In [45]:
# Rename the "y" column to "label".
train_df = train_df .rename(columns={"y": "label"})

In [46]:
# Load the test set and rename its "y" column to "label".
test_df = pd.read_csv("01_test.csv")
test_df = test_df .rename(columns={"y": "label"})

In [47]:
# Convert the training frame to a NumPy array and preview its first five rows.
data = train_df.values
data[:5]

array([[  65938, -109121,  101371,   68046,  499646,   51569, -170592,
         -84944,  -74420,       3],
       [  83426,  -28524, -100525,  244933, -219593,   54303, -206699,
         146491,  -31888,      10],
       [ -96851,   15578,   -5402,  289095,  -57329,   58332, -214913,
        -138813,   34742,      12],
       [  10072,  -26539,  186370,   72241, -112577,   64956, -228388,
        -154687,  221456,      12],
       [ -86930,  -66303,  527176,  224934,  -79165,   57980, -195603,
        -186762,   11842,       1]])

In [48]:
# Return the majority class, i.e. the label with the most rows in the data.
def classify_data(data):
    """Return the label that occurs most often in ``data``.

    Labels are read from the last column of the 2-D array ``data``.
    When several labels share the highest count, the first one in
    ``np.unique`` order (the smallest) is returned.
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)
    # argmax gives the position of the first maximum count
    return labels[np.argmax(label_counts)]

In [49]:
# Build a dictionary that maps every feature column index to the unique values it contains.
def get_potential_splits(data):
    """Collect the candidate split values for each feature column.

    The last column of the 2-D array ``data`` is skipped (it is the one
    the other functions in this notebook read labels from).

    Returns a dict ``{column_index: array of unique values in that column}``.
    """
    n_rows, n_columns = data.shape
    feature_indices = range(n_columns - 1)
    return {
        feature_index: np.unique(data[:, feature_index])
        for feature_index in feature_indices
    }


In [50]:
# Split the rows in two by one column: values <= split_value and values > split_value.
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on a single column.

    Returns ``(data_below, data_above)``: the rows whose value in
    ``split_column`` is ``<= split_value`` and the rows whose value is
    ``> split_value``. Row order within each part is preserved.
    """
    column = data[:, split_column]
    below_mask = column <= split_value
    above_mask = column > split_value
    return data[below_mask], data[above_mask]

In [51]:
# Base-2 Shannon entropy of the labels stored in the last column.
def calculate_entropy(data):
    """Return the base-2 entropy of the label distribution in ``data``.

    Labels are read from the last column of the 2-D array ``data``.
    A single-class input yields 0.
    """
    _, class_counts = np.unique(data[:, -1], return_counts=True)

    # share of rows that belongs to each class
    class_shares = class_counts / class_counts.sum()
    surprisal = -np.log2(class_shares)

    return sum(class_shares * surprisal)

In [52]:
# Entropy of a split: the two partitions' entropies weighted by their share of the rows.
def calculate_overall_entropy(data_below, data_above):
    """Return the size-weighted average entropy of the two partitions.

    Each partition's entropy (from ``calculate_entropy``) is weighted by
    the fraction of all rows that fall into it.

    Raises ``ZeroDivisionError`` when both partitions are empty.
    """
    n_below = len(data_below)
    n_above = len(data_above)
    n_total = n_below + n_above

    weight_below = n_below / n_total
    weight_above = n_above / n_total

    return (weight_below * calculate_entropy(data_below)
            + weight_above * calculate_entropy(data_above))


In [53]:
def determine_best_split(data, potential_splits):
    """Pick the (column, value) split with the lowest overall entropy.

    Every candidate in ``potential_splits`` (a dict mapping a column index
    to its candidate values) is tried with ``split_data`` and scored with
    ``calculate_overall_entropy``.

    Returns ``(best_split_column, best_split_value)``. On ties the
    candidate examined last wins. If no candidate is examined at all, the
    result names are never assigned and the return raises
    ``UnboundLocalError``.
    """
    lowest_entropy = 9999
    for column_index, candidate_values in potential_splits.items():
        for candidate in candidate_values:
            below, above = split_data(data, split_column=column_index, split_value=candidate)
            candidate_entropy = calculate_overall_entropy(below, above)

            # "<=" (not "<"): an equally good later candidate replaces the earlier one
            if candidate_entropy <= lowest_entropy:
                lowest_entropy = candidate_entropy
                best_split_column, best_split_value = column_index, candidate

    return best_split_column, best_split_value


In [75]:
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5):
    """Recursively grow a decision tree and return it as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame on the initial call (``counter == 0``); on the
        recursive calls it is the 2-D NumPy array of the rows that reached
        the node. The last column holds the class label.
    counter : current depth of the recursion; external callers leave it at 0.
    min_samples : a node with fewer rows than this becomes a leaf.
    max_depth : maximum number of nested questions.

    Returns
    -------
    Either a class label (leaf) or a dict of the form
    ``{"<feature> <= <value>": [yes_answer, no_answer]}`` whose answers are
    again labels or dicts.

    Side effect: the initial call stores ``df.columns`` in the module-level
    ``COLUMN_HEADERS`` so that recursive calls can name the split features.
    """
    # data preparations
    if counter == 0:
        global COLUMN_HEADERS
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df

    # A node that holds a single class cannot be improved by splitting.
    # Without this stop every candidate split ties, the last one (maximum
    # of the last column) is chosen, and one side of the split ends up empty.
    is_pure = len(np.unique(data[:, -1])) == 1

    if is_pure or (len(data) < min_samples) or (counter == max_depth):
        return classify_data(data)

    counter += 1

    # split the data in two parts
    potential_splits = get_potential_splits(data)
    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    # No split separates the rows (e.g. identical feature values with
    # different labels): recursing into an empty partition would fail in
    # classify_data, so turn this node into a leaf instead.
    if len(data_below) == 0 or len(data_above) == 0:
        return classify_data(data)

    # build the question for this node
    feature_name = COLUMN_HEADERS[split_column]
    question = "{} <= {}".format(feature_name, split_value)

    # find the answers recursively: index 0 is "yes" (<=), index 1 is "no" (>)
    yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)

    return {question: [yes_answer, no_answer]}

In [76]:
# Grow a tree with at most 3 levels of questions on the training frame and pretty-print it.
tree = decision_tree_algorithm(train_df, max_depth=3)
pprint(tree)

{'x7 <= -214911': [{'x2 <= -155079': [{'x1 <= 40137': [16, 16]},
                                      {'x4 <= 163941': [12, 12]}]},
                   {'x3 <= 190305': [{'x1 <= 58572': [11, 10]},
                                     {'x5 <= -217110': [17, 1]}]}]}


In [81]:
def classify_example(example, tree):
    """Return the class that ``tree`` predicts for ``example``.

    Parameters
    ----------
    example : mapping indexable by feature name (e.g. a DataFrame row).
    tree : nested dict of the form ``{"<feature> <= <value>": [yes, no]}``
        where ``yes``/``no`` are class labels or further dicts. A bare
        label (a tree that is a single leaf) is returned unchanged.

    Returns
    -------
    The label stored in the leaf that the example reaches.
    """
    node = tree
    while isinstance(node, dict):
        question = next(iter(node))
        # Split from the right so that feature names containing spaces stay
        # intact; the operator in the middle is always "<=" and is not needed.
        feature_name, _, value = question.rsplit(" ", 2)

        branches = node[question]
        if example[feature_name] <= float(value):
            node = branches[0]
        else:
            node = branches[1]

    return node

In [82]:
# Classify a single test row (positional index 10) with the tree.
example = test_df.iloc[10]
classify_example(example, tree)

10

In [83]:
# Predict a class for every row of the test set.
# Built in one pass; the previous `prediction = prediction + [...]` copied
# the whole list on every iteration.
prediction = [classify_example(test_df.iloc[i], tree) for i in range(len(test_df))]
print(prediction)
    
    

[11, 12, 12, 12, 12, 12, 1, 11, 11, 12, 10, 1, 11, 1, 1, 11, 1, 17, 10, 17, 16, 11, 12, 11, 11, 1, 12, 1, 1, 12, 1, 12, 12, 12, 1, 12, 11, 12, 12, 10, 17, 1, 1, 1, 11, 12, 1, 17, 1, 11, 17, 1, 11, 12, 12, 16, 10, 1, 11, 12, 11, 10, 11, 1, 11, 11, 1, 1, 11, 11, 16, 1, 1, 1, 1, 11, 12, 12, 11, 12, 11, 12, 12, 1, 12, 11, 11, 11, 11, 1, 10, 16, 11, 12, 10, 11, 1, 11, 1, 11, 11, 11, 12, 10, 17, 10, 11, 11, 17, 11, 1, 11, 11, 12, 12, 12, 16, 12, 1, 12, 11, 12, 11, 1, 16, 11, 11, 12, 11, 12, 11, 1, 1, 11, 1, 11, 1, 1, 11, 12, 11, 1, 11, 11, 11, 1, 12, 12, 12, 1, 1, 11, 11, 11, 16, 12, 12, 1, 17, 11, 11, 10, 10, 12, 11, 11, 11, 11, 10, 11, 10, 12, 12, 1, 11, 12, 11, 16, 12, 16, 11, 17, 12, 12, 12, 12, 12, 12, 17, 11, 11, 12, 11, 12, 10, 1, 17, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 12, 12, 11, 11, 12, 12, 11, 11, 11, 16, 12, 1, 10, 12, 11, 16, 16, 12, 12, 11, 12, 11, 12, 12, 1, 11, 11, 10, 1, 10, 11, 11, 1, 1, 11, 11, 11, 11, 10, 12, 1, 12, 10, 11, 1, 11, 11, 12, 12, 11, 11, 11, 11, 11, 11, 1

In [84]:
def calculate_accuracy(df, tree):
    """Return the fraction of rows of ``df`` that ``tree`` classifies correctly.

    Every row is classified with ``classify_example`` and compared with
    the row's ``"label"`` value.

    Side effect: ``df`` is modified in place. It gains (or has overwritten)
    the columns ``"classification"`` (the predicted class) and
    ``"classification_correct"`` (whether the prediction equals the label).
    """
    predicted = df.apply(classify_example, axis=1, args=(tree,))
    df["classification"] = predicted
    df["classification_correct"] = df["classification"] == df["label"]

    # the mean of a boolean column is the share of True values
    return df["classification_correct"].mean()

In [85]:
# Share of test rows whose predicted class equals the "label" column.
# Note: calculate_accuracy also writes the "classification" and
# "classification_correct" columns into test_df.
accuracy = calculate_accuracy(test_df, tree)
accuracy

0.9994859933179131

In [86]:
# Count the rows per class, using the module-level `data` array created
# from train_df in an earlier cell.
label_column = data[:, -1]  # label column
unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True) 