In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Node class; instances are later stored in a dict so that the matching child can be picked conveniently.
class Node:
    """A single vertex of the decision tree.

    Attributes:
        feature: Index of the feature this node branches on.
        threshold: Split threshold; stored as given.
        childs: Mapping of child nodes for an internal node.
        value: Predicted label; a node is a leaf exactly when this is set.
        proba_value: Class-probability vector stored alongside ``value``.
    """

    def __init__(
        self,
        feature=None,
        threshold=None,
        childs=None,
        value=None,
        proba_value=None,
    ):
        # Prediction payload.
        self.value, self.proba_value = value, proba_value
        # Branching description.
        self.feature, self.threshold = feature, threshold
        self.childs = childs

    def is_leaf_node(self):
        """Return True when the node carries a prediction (``value`` is set)."""
        if self.value is None:
            return False
        return True
    


class DecisionTree:
    """Multiway decision-tree classifier that branches on exact feature values.

    At every internal node the feature with the highest gain ratio is chosen
    and one child is created per distinct value of that feature in the node's
    training subset. Leaves store the majority label and the class-frequency
    vector of the samples that reached them.

    Args:
        classes: Array of all class labels, defining the column order of the
            probability vectors. When omitted, the labels are inferred from
            ``y`` on every call to ``fit``.
        max_depth: Maximum depth at which a node may still be split.
        min_samples: Nodes with this many samples or fewer become leaves.
    """

    def __init__(self, classes=None, max_depth=10, min_samples=10):
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.tree = None
        # Remember whether the label set has to be derived from the data.
        self._infer_classes = classes is None
        self.classes = None if classes is None else np.asarray(classes)

    def fit(self, X, y):
        """Build the tree from feature matrix ``X`` and label vector ``y``."""
        X = np.asarray(X)
        y = np.asarray(y)
        if self._infer_classes:
            self.classes = np.unique(y)
        self.tree = self.grow_tree(X, y)

    def calck_unic(self, a):
        """Return ``(counts, keys)``: occurrence counts and sorted unique values of ``a``.

        ``counts`` is an array (not a one-shot generator), so it can be
        iterated several times and passed directly to numpy reductions.
        """
        keys, counts = np.unique(np.asarray(a), return_counts=True)
        return counts, keys

    def predict(self, X):
        """Return the predicted label for every row of ``X``."""
        return np.array([self.travers_tree(x, self.tree) for x in X])

    def predict_proba(self, X):
        """Return the leaf class-frequency vector for every row of ``X``."""
        return np.array([self.travers_proba_tree(x, self.tree) for x in X])

    def entropy(self, y: np.ndarray):
        """Shannon entropy (base 2) of the label distribution in ``y``."""
        n = y.shape[0]
        if n == 0:
            return 0.0
        counts, _ = self.calck_unic(y)
        p = counts / n
        return -np.sum(p * np.log2(p))

    def gini(self, y: np.ndarray):
        """Gini impurity of the label distribution in ``y``."""
        n = y.shape[0]
        if n == 0:
            return 0.0
        counts, _ = self.calck_unic(y)
        p = counts / n
        return 1 - np.sum(p ** 2)

    def information_gain(self, X_column, y):
        """Gain ratio obtained by splitting ``y`` on every distinct value of ``X_column``.

        Returns:
            ``(gain_ratio, keys)`` where ``keys`` are the sorted distinct
            values of ``X_column``. The gain ratio is 0 when the labels are
            already pure or when the column has a single distinct value.
        """
        counts, keys = self.calck_unic(X_column)
        # Always return a pair so callers can unpack the result unconditionally.
        if np.unique(y).shape[0] == 1:
            return 0.0, keys

        n = y.shape[0]
        parent = self.entropy(y)
        weights = counts / n
        # Weighted entropy of the label subsets induced by each feature value.
        info_x = np.sum(
            [w * self.entropy(y[X_column == key]) for w, key in zip(weights, keys)]
        )
        split_info = -np.sum(weights * np.log2(weights))
        if split_info != 0:
            return (parent - info_x) / split_info, keys
        return 0.0, keys

    def most_common(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        counts, labels = self.calck_unic(y)
        return labels[np.argmax(counts)]

    def proba_val(self, y):
        """Relative frequency of every class of ``self.classes`` within ``y``.

        The result has one entry per class, in the order of ``self.classes``;
        classes absent from ``y`` get 0. Labels in ``y`` that are not listed
        in ``self.classes`` are ignored.
        """
        n = y.shape[0]
        counts, keys = self.calck_unic(y)
        if n == 0:
            return np.zeros(self.classes.shape[0])
        # Match by value, so the result is correct even if classes is unsorted.
        membership = self.classes[:, None] == keys[None, :]
        return membership @ (counts / n)

    def best_split(self, X, y):
        """Return ``(feature_index, keys)`` of the feature with the highest gain ratio.

        ``(None, None)`` is returned when no feature can be selected (no
        columns, or every gain is NaN).
        """
        best_feature = None
        best_gain = -1
        best_keys = None
        for i in range(X.shape[1]):
            gain, keys = self.information_gain(X[:, i], y)
            if gain > best_gain:
                best_gain = gain
                best_feature = i
                best_keys = keys

        return best_feature, best_keys

    def grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``(X, y)``."""
        n_samples = X.shape[0]
        n_labels = np.unique(y).shape[0]

        if n_samples <= self.min_samples or depth >= self.max_depth or n_labels <= 1:
            return Node(value=self.most_common(y), proba_value=self.proba_val(y))

        best_feature, ukeys = self.best_split(X, y)
        if best_feature is None:
            return Node(value=self.most_common(y), proba_value=self.proba_val(y))

        column = X[:, best_feature]
        masks = [(key, column == key) for key in ukeys]
        masks = [(key, mask) for key, mask in masks if mask.any()]
        # A "split" into fewer than two parts does not partition the data;
        # recursing would only build a chain that ends in this same leaf.
        if len(masks) < 2:
            return Node(value=self.most_common(y), proba_value=self.proba_val(y))

        # The dict holds Node objects (references), not nested dicts; it is
        # used to navigate from a feature value to the matching child.
        childs = {
            key: self.grow_tree(X[mask], y[mask], depth=depth + 1)
            for key, mask in masks
        }

        return Node(best_feature, childs=childs)

    def _descend(self, x, tree):
        """Walk from ``tree`` down to the leaf selected by sample ``x``.

        A feature value that was not seen during training falls back to the
        first child of the node.
        """
        node = tree
        while not node.is_leaf_node():
            children = node.childs
            child = children.get(x[node.feature])
            if child is None:
                child = next(iter(children.values()))
            node = child
        return node

    def travers_tree(self, x, tree):
        """Return the label stored in the leaf that ``x`` reaches from ``tree``."""
        return self._descend(x, tree).value

    def travers_proba_tree(self, x, tree):
        """Return the class-frequency vector of the leaf that ``x`` reaches from ``tree``."""
        return self._descend(x, tree).proba_value


In [43]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the Iris data set and encode the species names as integer class ids.
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
transform = {'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

dataset = pd.read_csv(url, names=names)
# An unknown species name raises KeyError instead of being silently mapped.
dataset["class"] = dataset["class"].apply(transform.__getitem__)

In [2]:
# Collapse the per-measurement rows into one feature row per series:
# mean / std / median of every remaining column, grouped by series_id.
raw_measurements = pd.read_csv("./Data/X_train.csv")
per_series = (
    raw_measurements
    .drop(columns="measurement_number")
    .groupby("series_id")
    .agg(['mean', 'std', 'median'])
)
# Flatten the (column, statistic) MultiIndex into "column_statistic" names.
per_series.columns = [f'{col}_{stat}' for col, stat in per_series.columns]
per_series = per_series.reset_index().drop(
    columns=["series_id", "row_id_mean", "row_id_std", "row_id_median"]
)
# Values are rounded to two decimals before conversion to a float matrix.
data = per_series.round(2).to_numpy(dtype=np.float64)

# Target labels, one per series.
y = pd.read_csv("./Data/y_train.csv")["group_id"].to_numpy(dtype=np.int64)
# X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)
# print(np.unique(y_test))
# data.head(40)

In [5]:
# X = dataset[dataset.columns[:-1]].to_numpy()[:]
# y = dataset["class"].to_numpy()

# x_train, x_test, y_train1, y_test1 = train_test_split(X, y, test_size=0.2)
# Hold out 20% of the series, fit a single decision tree and report micro-F1.
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)
# DecisionTree needs the full label set up front; take it from all of y so
# labels that land only in the test split still get a probability column.
clf = DecisionTree(classes=np.unique(y), max_depth=10)
clf.fit(X_train, y_train.flatten())
otv = clf.predict(X_test)
print(otv)
print(f1_score(y_test.flatten(), otv, average="micro"))

[46 13 15  0 69  0  3 25 66 18 18  7 13 60 38  7  4 23 23 35 20 22 59 46
 43  6 40 45  1 53 64 22 55 38 33 44 15 59 23 53 48 51 33  6  7  9 68 15
 41 21 23 68 60 19 34 49 44 38 32  3 13 51 51 53 33 65  6 33 10 59 22 12
 33 43 60  5 22 20  1 13  1 60 34 69 51 70  8 20 29 23 51  0 11 60 20 43
 37  8 20 28 16  4 68 53 18 53 53 38  8 15  0 14 15  7 18 10  6 16 22  9
  6 12 43  4 59  9 16 31 70 13 20 16 34 68 35  4 68 32 16  4 68 28 27 28
 15  8 59  4 40  4 60 53 66 10  0 34  7 38 31 43 16 31 33 13 51 28 53 65
 12 43 16 35 35 23 68 46  0 29 53 55 14 68 46 13 14 49 55 18  8 10 15  3
 31 53  3 51  2  0  1  7 70 53 45 68 48 18 18  0  1 69 53 52  7  4 42 16
 71  8 45 48  3 22  9 53 21 22 53 38  3 68  6  8 14 42 48  6 26 41 47 68
 61 15 32 19  6 48 37 12 31 51  3 68 20 20 10 56 15  6 40 33  1 53  6  3
 44  0  0 70 53  6 38 16  0  7 11  0  7 22 19  0 31  3 68 41 39 29 33 69
  3 14  3 18 32 62 32 33 13  6 59 22  3 15 69 18  8  7  6 66  0 39  8 51
 48 15 53 51 53 62 10  8  1 72 22 39 38 45  0 39 10

In [32]:
class RandomForest:
    """Bagging ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Every tree is trained on a bootstrap sample (rows drawn with replacement)
    of the training data.

    Args:
        n_trees: Number of trees in the ensemble.
        max_deep: Maximum depth passed to every tree.
        min_samples: Minimum node size passed to every tree.
        random_state: Optional seed for the bootstrap sampling; ``None``
            keeps the sampling non-deterministic.
    """

    def __init__(self, n_trees = 10, max_deep = 10, min_samples = 10, random_state = None):
        self.trees = [None for _ in range(n_trees)]
        self.tree_inds = [None for _ in range(n_trees)]
        self.n_trees = n_trees
        self.max_deep = max_deep
        self.min_samples = min_samples
        self.random_state = random_state
        self.classes_ = None
        self.num_classes = None

    def fit(self, X, y):
        """Train all trees on bootstrap samples of ``(X, y)``."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        # The label set comes from the data instead of a hard-coded range, so
        # the vote/probability matrices always match the trees' outputs.
        self.classes_ = np.unique(y)
        self.num_classes = self.classes_.shape[0]
        rng = np.random.default_rng(self.random_state)

        for i in range(self.n_trees):
            clf = DecisionTree(
                classes=self.classes_,
                max_depth=self.max_deep,
                min_samples=self.min_samples,
            )
            # Bootstrap over ALL rows; drawing only from the first third of
            # the data would hide the remaining rows from every tree.
            inds = rng.integers(0, n_samples, size=n_samples)
            clf.fit(X[inds], y[inds])
            self.trees[i] = clf
            self.tree_inds[i] = inds

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties are resolved in favour of the smallest label.
        """
        n_rows = X.shape[0]
        rows = np.arange(n_rows)
        votes = np.zeros((n_rows, self.num_classes))

        for clf in self.trees:
            # Translate predicted labels into column positions of the vote
            # matrix; classes_ is sorted, so searchsorted is an exact lookup.
            cols = np.searchsorted(self.classes_, clf.predict(X))
            votes[rows, cols] += 1
        return self.classes_[np.argmax(votes, axis=1)]

    def predict_proba(self, X):
        """Return class probabilities averaged over all trees.

        The result has shape ``(n_rows, num_classes)`` with columns ordered
        like ``classes_``.
        """
        total = np.zeros((X.shape[0], self.num_classes))
        for clf in self.trees:
            total += clf.predict_proba(X)
        # Average rather than sum, so each row stays a probability vector.
        return total / max(len(self.trees), 1)


In [34]:
# Fit an 800-tree forest on the training split and score it on the held-out
# split with micro-averaged F1.
clf = RandomForest(max_deep=50, n_trees=800)
clf.fit(X_train, y_train.flatten())

otv = clf.predict(X_test)
print(otv)

micro_f1 = f1_score(y_test.flatten(), otv, average="micro")
print(micro_f1)

[61 39 34 61 56  0 46 25 66 18 18 31 34 66 40  7  4 24 23 19 20 22 59 72
 20 46 53 46 11 48 64 22 71 43 33 44 15 59 54 53 53 71 33  6  2  9 68 72
 47 21 54 22 60 19 53 49 44 38 68 26 12 10 60 53 35 45 46 33 39 53 22 62
 35 43 60 53  2 20 57 13 11 60 15 56 71 70 33 20 29 54 60 61  9 60 65 20
 69  8 20  8 45 41 49 53 18 63 53 46 44 23  0 55 55 10 18 65 46 46 22  9
  6 12 60  4 53 21 45 31 70 13 46 62 34 68 35 53 68 20 20  4 68  8 27  8
 72  8 59  4 44 11 60 53 66 72  0 34 55 40 31 43 62 10 33 34 60  8 48 45
 12 43 16 69 35 23 68 46 32 29 53 71 14 22 18 13 52 49 10 18 57 15 65 46
 31 48 46 66  2  0 57 65 70 53 62  0 64 40 43  0 57 69 64 52 16  4 38 20
 71  8 16 53 43 22  9 53 21 22 53 38  3 49 53 33 53 42 53  6 46 47 57  0
 22 15 32 53 46 53 37 12 10 51 18 68 20 20 60 21 15 18 65 33 57 48 46 46
 42  0 20 70 53  6 72 62  0 31  9  0 10 68 19  0 31 40  0 41 70 29 33 69
 29 19 46 18 32 65 32 35 13  6 59 22 46 53 69 18  8 51 29 71 22 39 43 72
 53 25  8  7 48 13 31 16 18 72 22 48  8 49 32 39 39

In [36]:
# Worked example: multiply an input vector by the transpose of a 3x4 matrix.
x = np.array([1, 1, 1, 1])

W = np.array([[1, 7, 8, 9],
              [3, 4, 2, 1],
              [7, 3, 3, 1]])

# The dangling "+" that used to follow this expression was a SyntaxError;
# the recorded output corresponds to the product alone.
x.dot(W.T)

array([25, 10, 14])