In [4]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from scipy.spatial import distance
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,recall_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Задания 1, 2 и 3 реализованы внутри класса Decision tree

In [5]:
class DecisionTree():
    class Node():
        def __init__(self):
            self.rule = -1
            self.threshold = None
            self.left_node = None
            self.right_node = None
            self.node_class = None
            
    def __init__(self, debug=''):
        self.tree = None
        self.debug = debug
        self.impurity_f = {
            'entropy': self.get_entropy,
            'gini': self.get_gini
        }
    
    def get_entropy(self, cnt, n): #Задание 1
        entropy = 0
        for _, class_n in cnt.most_common():
            if class_n > 0:
                entropy -= (class_n/n * np.log2(class_n/n))
        return entropy
    
    def get_gini(self, cnt, n):
        gini = 0
        for _, class_n in cnt.most_common():
            gini += class_n/n * (1 - class_n/n)
        return gini
        
    def get_IG_threshold(self, v, y, n, impurity_f):
        sorted_vy = sorted(list(zip(v, y)))
        best_threshold = None
        best_IG = None
        y_left_cnt = Counter()
        y_right_cnt = Counter([t[1] for t in sorted_vy[0:]])
        n_left = 0
        n_right = len(sorted_vy)
        total_IG = n_right/n * impurity_f(y_right_cnt, n_right) # all values    
        for pos, (v, y) in enumerate(sorted_vy):
            if pos > 0 and (sorted_vy[pos][0] != sorted_vy[pos-1][0]):
                left_IG = n_left/n * impurity_f(y_left_cnt, n_left)
                right_IG = n_right/n * impurity_f(y_right_cnt, n_right)
                IG = total_IG - left_IG - right_IG
                if best_IG is None or IG > best_IG:
                    best_IG = IG
                    best_threshold = v
            y_left_cnt[y] = y_left_cnt[y] + 1
            n_left += 1
            y_right_cnt[y] -= 1
            n_right -= 1
        return best_IG, best_threshold
    #Задание 2
    def build_tree(self, node, X, y, n, level, max_level, impurity):
        cnt_y = Counter(y)
        if len(cnt_y) == 1:
            node.node_class = y[0]
            return
        if level == max_level:
            node.node_class = cnt_y.most_common()[0][0]
            return
        features_num = X.shape[1]
        features2IG = []
        for feature in range(0, features_num):
            IG, threshold = self.get_IG_threshold(X[:, feature], y, n, self.impurity_f[impurity])
            if IG is not None:
                features2IG.append((IG, threshold, feature))
        if len(features2IG) == 0:
            node.node_class = y[0]
            return
        IG, threshold, best_feature = sorted(features2IG, reverse=True)[0] # get always max
        node.IG = IG
        node.rule = best_feature
        node.threshold = threshold
        if 'v' in self.debug:
            print(f"[L{level}] n: {X.shape[0]}, feature: {node.rule}, threshold: {node.threshold}, IG: {IG}")
        
        # Left subtree
        node.left_node = self.Node()
        if 'vvv' in self.debug:
            print(f"[L{level}] Samples to left: {X[X[:, best_feature] < threshold].shape[0]}")
        self.build_tree(node.left_node,
                        X[X[:, best_feature] < threshold],
                        y[X[:, best_feature] < threshold],
                        n, level+1, max_level, impurity)
        # Right subtree
        node.right_node = self.Node()
        if 'vvv' in self.debug:
            print(f"[L{level}] Samples to right: {X[X[:, best_feature] >= threshold].shape[0]}")
        self.build_tree(node.right_node,
                        X[X[:, best_feature] >= threshold],
                        y[X[:, best_feature] >= threshold],
                        n, level+1, max_level, impurity) 
        return
        
    
    def fit(self, X, y, max_level=10, impurity='entropy'):
        self.tree = self.Node()
        self.build_tree(self.tree, X, y, X.shape[0], 0, max_level, impurity)
        
    def predict(self, X):
        predictions = []
        for sample in X:
            current_node = self.tree
            while current_node.node_class is None:
                if sample[current_node.rule] < current_node.threshold:
                    current_node = current_node.left_node
                else:
                    current_node = current_node.right_node
            predictions.append(current_node.node_class)
        return predictions

# Задание 4

### Spam dataset

In [6]:
spam_df = pd.read_csv('spam.csv')
X = spam_df[spam_df.columns.difference(['label'])]
y = spam_df.label.values
spam_X_train, spam_X_val, spam_y_train, spam_y_val = train_test_split(X, y, test_size=0.2, random_state=1)


#### Entropy

In [7]:
impurity = 'entropy'
print(f"Spam dataset, impurity: {impurity}")
print('-' * 10)
best_depth = None
best_accuracy = None
dtree = DecisionTree(debug='')
for max_level in range(1, 11):
    dtree.fit(spam_X_train.values, spam_y_train, max_level=max_level, impurity=impurity)
    predictions = dtree.predict(spam_X_val.values)
    accuracy = accuracy_score(spam_y_val, predictions)
    recall = recall_score(spam_y_val, predictions)
    precision= precision_score(spam_y_val, predictions)
    if best_accuracy is None or accuracy > best_accuracy:
        best_depth = max_level
        best_accuracy = accuracy
    print(f"Depth: {max_level}, accuracy: {accuracy}, recall: {recall}, precision: {precision}")
print('-' * 10)
print(f"Best tree depth: {best_depth}, accuracy: {best_accuracy}")

Spam dataset, impurity: entropy
----------
Depth: 1, accuracy: 0.7839305103148752, recall: 0.5266106442577031, precision: 0.8623853211009175
Depth: 2, accuracy: 0.8230184581976113, recall: 0.8627450980392157, precision: 0.7298578199052133
Depth: 3, accuracy: 0.8914223669923995, recall: 0.7871148459383753, precision: 0.921311475409836
Depth: 4, accuracy: 0.9153094462540716, recall: 0.8375350140056023, precision: 0.9373040752351097
Depth: 5, accuracy: 0.9229098805646037, recall: 0.8711484593837535, precision: 0.9255952380952381
Depth: 6, accuracy: 0.9337676438653637, recall: 0.8711484593837535, precision: 0.9539877300613497
Depth: 7, accuracy: 0.9294245385450597, recall: 0.8907563025210085, precision: 0.9244186046511628
Depth: 8, accuracy: 0.9196525515743756, recall: 0.8543417366946778, precision: 0.9327217125382263
Depth: 9, accuracy: 0.9294245385450597, recall: 0.8627450980392157, precision: 0.9506172839506173
Depth: 10, accuracy: 0.9109663409337676, recall: 0.865546218487395, precisio

#### Gini

In [8]:
impurity = 'gini'
print(f"Spam dataset, impurity: {impurity}")
print('-' * 10)
best_depth = None
best_accuracy = None
dtree = DecisionTree(debug='')
for max_level in range(1, 11):
    dtree.fit(spam_X_train.values, spam_y_train, max_level=max_level, impurity=impurity)
    predictions = dtree.predict(spam_X_val.values)
    accuracy = accuracy_score(spam_y_val, predictions)
    recall = recall_score(spam_y_val, predictions)
    precision= precision_score(spam_y_val, predictions)
    if best_accuracy is None or accuracy > best_accuracy:
        best_depth = max_level
        best_accuracy = accuracy
    print(f"Depth: {max_level}, accuracy: {accuracy}, recall: {recall}, precision: {precision}")
print('-' * 10)
print(f"Best tree depth: {best_depth}, accuracy: {best_accuracy}")

Spam dataset, impurity: gini
----------
Depth: 1, accuracy: 0.7839305103148752, recall: 0.5266106442577031, precision: 0.8623853211009175
Depth: 2, accuracy: 0.8783930510314875, recall: 0.7282913165266106, precision: 0.9454545454545454
Depth: 3, accuracy: 0.8870792616720955, recall: 0.834733893557423, precision: 0.8688046647230321
Depth: 4, accuracy: 0.9077090119435396, recall: 0.8123249299719888, precision: 0.9415584415584416
Depth: 5, accuracy: 0.9044516829533116, recall: 0.8543417366946778, precision: 0.8944281524926686
Depth: 6, accuracy: 0.9196525515743756, recall: 0.8515406162464986, precision: 0.9353846153846154
Depth: 7, accuracy: 0.9087947882736156, recall: 0.8543417366946778, precision: 0.9050445103857567
Depth: 8, accuracy: 0.9229098805646037, recall: 0.8571428571428571, precision: 0.9386503067484663
Depth: 9, accuracy: 0.9174809989142236, recall: 0.865546218487395, precision: 0.9169139465875371
Depth: 10, accuracy: 0.9218241042345277, recall: 0.8739495798319328, precision: 

#### kNN

In [12]:
scaler = StandardScaler()
spam_X_train = scaler.fit_transform(spam_X_train)
spam_X_val = scaler.fit_transform(spam_X_val)
best_number_of_neigbours = None
best_accuracy = None
for k in range(1, 11):
    classifier= KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')
    classifier.fit(spam_X_train, spam_y_train)
    predictions = classifier.predict(spam_X_val)
    accuracy = accuracy_score(spam_y_val, predictions)
    recall = recall_score(spam_y_val, predictions)
    precision= precision_score(spam_y_val, predictions)
    print(f"Number of neighbours: {k}, accuracy: {accuracy}, recall: {recall}, precision: {precision}")
    if best_accuracy is None or accuracy > best_accuracy:
        best_number_of_neighbours = k
        best_accuracy = accuracy
print('-' * 10)
print(f"Best number of neighbours: {best_number_of_neighbours}, accuracy: {best_accuracy}")

Number of neighbours: 1, accuracy: 0.9077090119435396, recall: 0.8851540616246498, precision: 0.8777777777777778
Number of neighbours: 2, accuracy: 0.9022801302931596, recall: 0.7983193277310925, precision: 0.9405940594059405
Number of neighbours: 3, accuracy: 0.9109663409337676, recall: 0.8683473389355743, precision: 0.8985507246376812
Number of neighbours: 4, accuracy: 0.9066232356134636, recall: 0.8151260504201681, precision: 0.9356913183279743
Number of neighbours: 5, accuracy: 0.9131378935939196, recall: 0.8571428571428571, precision: 0.9134328358208955
Number of neighbours: 6, accuracy: 0.9066232356134636, recall: 0.8263305322128851, precision: 0.9247648902821317
Number of neighbours: 7, accuracy: 0.9163952225841476, recall: 0.8683473389355743, precision: 0.9117647058823529
Number of neighbours: 8, accuracy: 0.9055374592833876, recall: 0.834733893557423, precision: 0.9141104294478528
Number of neighbours: 9, accuracy: 0.9109663409337676, recall: 0.8683473389355743, precision: 0.8

### Cancer dataset

In [15]:
cancer_df = pd.read_csv('cancer.csv')
cancer_df['int_label'] = cancer_df.label.map({'M':1, 'B':0})
cancer_df['label'] = cancer_df.int_label
cancer_df.drop('int_label', axis=1, inplace=True)
X = cancer_df[cancer_df.columns.difference(['label'])]
y = cancer_df.label.values
cancer_X_train, cancer_X_val, cancer_y_train, cancer_y_val = train_test_split(
    X, y, test_size=0.2, random_state=1)

#### Entropy

In [16]:
impurity = 'entropy'
print(f"Cancer dataset, impurity: {impurity}")
print('-' * 10)
best_depth = None
best_accuracy = None
dtree = DecisionTree(debug='')
for max_level in range(1, 11):
    dtree.fit(cancer_X_train.values, cancer_y_train, max_level=max_level, impurity=impurity)
    predictions = dtree.predict(cancer_X_val.values)
    accuracy = accuracy_score(cancer_y_val, predictions)
    recall = recall_score(cancer_y_val, predictions)
    precision= precision_score(cancer_y_val, predictions)
    if best_accuracy is None or accuracy > best_accuracy:
        best_depth = max_level
        best_accuracy = accuracy
    print(f"Depth: {max_level}, accuracy: {accuracy}, recall: {recall}, precision: {precision}")
print('-' * 10)
print(f"Best tree depth: {best_depth}, accuracy: {best_accuracy}")

Cancer dataset, impurity: entropy
----------
Depth: 1, accuracy: 0.868421052631579, recall: 0.8095238095238095, precision: 0.8292682926829268
Depth: 2, accuracy: 0.868421052631579, recall: 0.8095238095238095, precision: 0.8292682926829268
Depth: 3, accuracy: 0.9122807017543859, recall: 0.8809523809523809, precision: 0.8809523809523809
Depth: 4, accuracy: 0.956140350877193, recall: 0.8809523809523809, precision: 1.0
Depth: 5, accuracy: 0.9122807017543859, recall: 0.7619047619047619, precision: 1.0
Depth: 6, accuracy: 0.9473684210526315, recall: 0.8571428571428571, precision: 1.0
Depth: 7, accuracy: 0.9473684210526315, recall: 0.8571428571428571, precision: 1.0
Depth: 8, accuracy: 0.9473684210526315, recall: 0.8571428571428571, precision: 1.0
Depth: 9, accuracy: 0.9473684210526315, recall: 0.8571428571428571, precision: 1.0
Depth: 10, accuracy: 0.9473684210526315, recall: 0.8571428571428571, precision: 1.0
----------
Best tree depth: 4, accuracy: 0.956140350877193


#### Gini

In [17]:
impurity = 'gini'
print(f"Cancer dataset, impurity: {impurity}")
print('-' * 10)
best_depth = None
best_accuracy = None
dtree = DecisionTree(debug='')
for max_level in range(1, 11):
    dtree.fit(cancer_X_train.values, cancer_y_train, max_level=max_level, impurity=impurity)
    predictions = dtree.predict(cancer_X_val.values)
    accuracy = accuracy_score(cancer_y_val, predictions)
    recall = recall_score(cancer_y_val, predictions)
    precision= precision_score(cancer_y_val, predictions)
    if best_accuracy is None or accuracy > best_accuracy:
        best_depth = max_level
        best_accuracy = accuracy
    print(f"Depth: {max_level}, accuracy: {accuracy}, recall: {recall}, precision: {precision}")
print('-' * 10)
print(f"Best tree depth: {best_depth}, accuracy: {best_accuracy}")

Cancer dataset, impurity: gini
----------
Depth: 1, accuracy: 0.8771929824561403, recall: 0.7857142857142857, precision: 0.868421052631579
Depth: 2, accuracy: 0.8859649122807017, recall: 0.7857142857142857, precision: 0.8918918918918919
Depth: 3, accuracy: 0.9210526315789473, recall: 0.8571428571428571, precision: 0.9230769230769231
Depth: 4, accuracy: 0.9298245614035088, recall: 0.8333333333333334, precision: 0.9722222222222222
Depth: 5, accuracy: 0.9298245614035088, recall: 0.8333333333333334, precision: 0.9722222222222222
Depth: 6, accuracy: 0.9298245614035088, recall: 0.8333333333333334, precision: 0.9722222222222222
Depth: 7, accuracy: 0.9298245614035088, recall: 0.8333333333333334, precision: 0.9722222222222222
Depth: 8, accuracy: 0.9298245614035088, recall: 0.8333333333333334, precision: 0.9722222222222222
Depth: 9, accuracy: 0.9298245614035088, recall: 0.8333333333333334, precision: 0.9722222222222222
Depth: 10, accuracy: 0.9298245614035088, recall: 0.8333333333333334, precisio

#### kNN

In [19]:
scaler = StandardScaler()
cancer_X_train = scaler.fit_transform(cancer_X_train)
cancer_X_val = scaler.fit_transform(cancer_X_val)
best_number_of_neigbours = None
best_accuracy = None
for k in range(1, 11):
    classifier= KNeighborsClassifier(n_neighbors=k, algorithm='kd_tree')
    classifier.fit(cancer_X_train, cancer_y_train)
    predictions = classifier.predict(cancer_X_val)
    accuracy = accuracy_score(cancer_y_val, predictions)
    recall = recall_score(cancer_y_val, predictions)
    precision= precision_score(cancer_y_val, predictions)
    print(f"Number of neighbours: {k}, accuracy: {accuracy}, recall: {recall}, precision: {precision}")
    if best_accuracy is None or accuracy > best_accuracy:
        best_number_of_neighbours = k
        best_accuracy = accuracy
print('-' * 10)
print(f"Best number of neighbours: {best_number_of_neighbours}, accuracy: {best_accuracy}")

Number of neighbours: 1, accuracy: 0.9210526315789473, recall: 0.9047619047619048, precision: 0.8837209302325582
Number of neighbours: 2, accuracy: 0.9385964912280702, recall: 0.8571428571428571, precision: 0.972972972972973
Number of neighbours: 3, accuracy: 0.9473684210526315, recall: 0.9047619047619048, precision: 0.95
Number of neighbours: 4, accuracy: 0.956140350877193, recall: 0.8809523809523809, precision: 1.0
Number of neighbours: 5, accuracy: 0.9473684210526315, recall: 0.8809523809523809, precision: 0.9736842105263158
Number of neighbours: 6, accuracy: 0.956140350877193, recall: 0.8809523809523809, precision: 1.0
Number of neighbours: 7, accuracy: 0.9298245614035088, recall: 0.8809523809523809, precision: 0.925
Number of neighbours: 8, accuracy: 0.9473684210526315, recall: 0.8809523809523809, precision: 0.9736842105263158
Number of neighbours: 9, accuracy: 0.9385964912280702, recall: 0.9047619047619048, precision: 0.926829268292683
Number of neighbours: 10, accuracy: 0.956140

Таким образом, лучше всего для обоих случаев работает decision tree с impurity=entropy. (В случае с датасетом Cancer данная модель работает так же хорошо, как и kNN). Думаю, это происходит потому, что Decision tree поддерживает взаимодействие между объектами в ходе классификации. Энтропия считается сложнее и более "глубоко" работает с данными, думаю поэтому думаю она дает несколько лучшие результаты. Однако, хочется отметить, что результаты работы классификаторов очень близки и на практике скорее всего будет выбран decision tree с gini потому, что у этого классификатора самая маленькая вычислительная сложность. 