In [154]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

Сделаем обработку данных аналогично второму дз2 

In [157]:
def upd_data(data):
  data.loc[:, data.dtypes == "float64"] = data.loc[:, data.dtypes == "float64"].fillna(
      data.loc[:, data.dtypes == "float64"].mean(axis=0)
  )
  data.loc[:, data.dtypes == "object"] = data.loc[:, data.dtypes == "object"].fillna("")
  data_encoded = pd.get_dummies(data)

  data_encoded.columns = data_encoded.columns.astype(str)
  X = data_encoded.drop(columns=["25"])
  columns = X.columns
  y = data_encoded["25"]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3)

  scaler = RobustScaler()
  scaler.fit(X_train)

  X_train = scaler.transform(X_train)
  X_test = scaler.transform(X_test)

  return X_train, X_test, y_train, y_test, columns

In [159]:
data = pd.read_csv("automobile/imports-85.data", header=None, na_values="?")
X_train, X_test, y_train, y_test, columns = upd_data(data)

In [175]:
class MyDecisionTree:
    LEAF = 1
    NON_LEAF = 0

    def __init__(self, max_depth=5, criterion="mse", min_samples_split=2) -> None:
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = dict()
        self.feature_importance = None

    def fit(self, X_train, y_train):
        X_train = np.array(X_train)
        y_train = np.array(y_train)
        self.feature_importance = np.zeros(X_train.shape[1])
        self.fit_node(X_train, y_train, 0, 0)

    def _entropy(self, y):
        p = np.bincount(y) / len(y)
        p = p[np.where(p > 0)[0]]
        return -np.sum(p * np.log(p))

    def _gini(self, y):
        p = np.bincount(y) / len(y)
        return np.sum(p * (1 - p))

    def get_entropy_score(self, y_left, y_right):
        Hs = []
        for split in (y_left, y_right):
            H = self._entropy(split)
            R = len(split) / (len(y_left) + len(y_right))
            Hs.append(R * H)
        return np.sum(Hs)

    def get_gini_score(self, y_left, y_right):
        Hs = []
        for split in (y_left, y_right):
            H = self._gini(split)
            R = len(split) / (len(y_left) + len(y_right))
            Hs.append(R * H)
        return np.sum(Hs)

    def get_mse_score(self, y_left, y_right):
        n = len(y_left) + len(y_right)
        if len(y_left) > 0:
            left_mean = np.mean(y_left)  # Если не пустая, вычисляем среднее
        else:
            left_mean = 0  # Если пустая, устанавливаем среднее равным 0
    
        # Проверяем, пустая ли правая группа
        if len(y_right) > 0:
            right_mean = np.mean(y_right)  # Если не пустая, вычисляем среднее
        else:
            right_mean = 0  # Если пустая, устанавливаем среднее равным 0

        mse_left = np.sum((y_left - left_mean) ** 2)
        mse_right = np.sum((y_right - right_mean) ** 2)
        return len(y_left)/n * mse_left + len(y_right)/n * mse_right

    def find_threshold(self, X, y: np.ndarray):
        min_criterion = np.inf
        feature_split = -1
        threshold_split = -1

        if self.criterion == "gini":
            H = self._gini(y)
            criterion = self.get_gini_score
        elif self.criterion == "entropy":
            H = self._entropy(y)
            criterion = self.get_entropy_score
        elif self.criterion == "mse":
            mean_y = np.mean(y)
            H = np.sum((y - mean_y)**2)
            criterion = self.get_mse_score
        else:
            raise RuntimeError("invalid criterion type")

        for feat in range(X.shape[1]):
            column = X[:, feat]
            unique_values = np.unique(column)
            for value in unique_values:
                left_mask = column <= value
                right_mask = column > value
                y_left = y[left_mask]
                y_right = y[right_mask]
                current_criterion = criterion(y_left, y_right)
                if current_criterion < min_criterion:
                    feature_split = feat
                    threshold_split = value
                    min_criterion = current_criterion

        information_gain = H - min_criterion
        if feature_split != -1:
            self.feature_importance[feature_split] += information_gain
        return feature_split, threshold_split

    def make_final_leaf(self, y, node_id):
        predict = np.mean(y)
        self.tree[node_id] = [self.LEAF, predict]

    def fit_node(self, X_train, y_train, node_id, depth):
        if depth == self.max_depth - 1 or X_train.shape[0] <= self.min_samples_split:
            self.make_final_leaf(y_train, node_id)
            return

        feature_split, threshold_split = self.find_threshold(X_train, y_train)

        if feature_split == -1:
            self.make_final_leaf(y_train, node_id)
            return

        X_left = X_train[X_train[:, feature_split] <= threshold_split]
        y_left = y_train[X_train[:, feature_split] <= threshold_split]
        X_right = X_train[X_train[:, feature_split] > threshold_split]
        y_right = y_train[X_train[:, feature_split] > threshold_split]

        if X_left.shape[0] == 0 or X_right.shape[0] == 0:
            self.make_final_leaf(y_train, node_id)
            return

        self.fit_node(X_left, y_left, 2 * node_id + 1, depth + 1)
        self.fit_node(X_right, y_right, 2 * node_id + 2, depth + 1)

        self.tree[node_id] = [self.NON_LEAF, feature_split, threshold_split]

    def predict_one_elem(self, x, node_id):
        if self.tree[node_id][0] == self.LEAF:
            return self.tree[node_id][1]
        else:
            _, feature, split = self.tree[node_id]
            if x[feature] <= split:
                return self.predict_one_elem(x, 2 * node_id + 1)
            else:
                return self.predict_one_elem(x, 2 * node_id + 2)

    def predict(self, X):
        return np.array([self.predict_one_elem(x, 0) for x in X])


Построим модель и сделаем фит

In [178]:
my_tree = MyDecisionTree(max_depth=5, criterion="mse")

my_tree.fit(X_train, y_train)

Как из семинара построим фрейм с оценками

In [181]:
pd.DataFrame(
    {
        "columns": data_encoded.drop(columns=["25"]).columns,
        "my_tree": my_tree.feature_importance/10**10,
        "sklearn": clf.feature_importances_,
    }
).sort_values(by="my_tree", ascending=False).reset_index(drop=True)

Unnamed: 0,columns,my_tree,sklearn
0,13,0.951483,0.241216
1,16,0.399224,0.627182
2,20,0.070612,0.000000
3,10,0.044203,0.020504
4,7_rwd,0.027573,0.000000
...,...,...,...
71,2_volkswagen,0.000000,0.000000
72,2_volvo,0.000000,0.000000
73,3_diesel,0.000000,0.000000
74,4_std,0.000000,0.000000
