<a href="https://colab.research.google.com/github/Aleksey55555/LMT/blob/master/LMT_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#!pip install optuna

In [5]:
from math import ceil
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_wine
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score
from sklearn.metrics import precision_score, recall_score, fbeta_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import numpy as np
import optuna
import optuna.visualization as viz
import pandas as pd


Идея создать такой классификатор LMT (logistic model tree), который будет сочитать дерево решений и логистическую регрессию. В каждом листе будет логистическая регрессия на признаках, которые не использовались в ветвлении дерева.
Подход

Строим дерево по подмножеству признаков:

на каждом узле выбираем признак и порог для разбиения (как в DecisionTreeClassifier).

глубина/мин-сэмплы ограничивают переобучение.

В листьях:

берём только те признаки, которые не использовались для делений выше по пути.

обучаем LogisticRegression на этом подмножестве данных.

Предсказание:

объект проходит по дереву до листа.

в листе к нему применяется локальная логистическая регрессия.

Реализация с помощью scikit-learn

In [6]:
class LogisticModelTree(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=3, min_samples_leaf=20, random_state=None):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
    def fit(self, X, y):
        # шаг 1: строим дерево только для разбиений
        self.tree_ = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
        )
        self.tree_.fit(X, y)
        # шаг 2: находим индексы объектов в листьях
        leaf_ids = self.tree_.apply(X)
        self.models_ = {}
        self.classes_ = self.tree_.classes_
        for leaf in np.unique(leaf_ids):
            mask = (leaf_ids == leaf)
            # получаем признаки, использованные на пути до этого листа
            path_features = self._get_features_on_path(leaf)
            remaining_features = [i for i in range(X.shape[1]) if i not in path_features]
            if not remaining_features:
                remaining_features = list(range(X.shape[1]))
            X_leaf = X[mask][:, remaining_features]
            y_leaf = y[mask]
            if len(np.unique(y_leaf)) == 1:
                # "чистый" лист: всегда один класс
                class_idx = np.where(self.classes_ == y_leaf[0])[0][0]
                def dummy_model(X_input, c=class_idx):
                    proba = np.zeros((X_input.shape[0], len(self.classes_)))
                    proba[:, c] = 1.0
                    return proba
                self.models_[leaf] = (dummy_model, remaining_features, True)
            else:
                model = LogisticRegression(max_iter=500)
                model.fit(X_leaf, y_leaf)
                self.models_[leaf] = (model, remaining_features, False)
        return self
    def predict_proba(self, X):
        leaf_ids = self.tree_.apply(X)
        proba = np.zeros((X.shape[0], len(self.classes_)))
        for leaf, (model, feats, is_dummy) in self.models_.items():
            mask = (leaf_ids == leaf)
            if np.any(mask):
                X_leaf = X[mask][:, feats]
                if is_dummy:
                    proba[mask] = model(X_leaf)
                else:
                    proba[mask] = model.predict_proba(X_leaf)
        return proba
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
    def _get_features_on_path(self, leaf_id):
        """Собрать все признаки, использованные на пути до данного листа"""
        tree = self.tree_.tree_
        path_features = set()
        def recurse(node, path):
            if node == leaf_id:
                return path
            if tree.feature[node] >= 0:
                left = tree.children_left[node]
                right = tree.children_right[node]
                if left != -1:
                    res = recurse(left, path | {tree.feature[node]})
                    if res is not None:
                        return res
                if right != -1:
                    res = recurse(right, path | {tree.feature[node]})
                    if res is not None:
                        return res
            return None
        return recurse(0, set()) or set()


Посмотрим метрики на датасете breast_cancer

In [7]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticModelTree(max_depth=3, min_samples_leaf=30, random_state=42)
clf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9707602339181286


In [8]:
print(classification_report(y_test, clf.predict(X_test)))


              precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



Точность (precision)

Класс 0: 0.98

Класс 1: 0.96
→ почти без ложноположительных ошибок.

Полнота (recall)

Класс 0: 0.94

Класс 1: 0.99
→ модель чуть чаще путает класс 0

F1-score

Оба класса ≈ 0.96–0.98 → очень сбалансировано.


 Сравним  с другими моделями: RandomForestClassifier, LogisticRegression, XGBClassifier.

In [9]:
# обучаем все модели
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42
    ),
    "Logistic Model Tree": LogisticModelTree(max_depth=3, min_samples_leaf=30, random_state=42)
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name}")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    results[name] = acc


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression
Accuracy: 0.9707602339181286
              precision    recall  f1-score   support

           0       0.97      0.95      0.96        63
           1       0.97      0.98      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171


Random Forest
Accuracy: 0.9707602339181286
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost
Accuracy: 0.9590643274853801
              precision    recall  f1-score   support

           0       0.95      0.94      0.94        63
           1       0.96      0.97      0.97       108

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171


Logistic Model Tree
Accuracy: 0.9707602339181286
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LMT и Random Forest показали одинаковый результат, LogReg с таким же accuracy 0,971, но recall чуть хуже (на классе 1, что важно для данного набора). XGBoost дал хуже результат  accuract - 0.959

Попробуем улучшить модель LMT, добавив возможность использования небольшого количества признаков, использованнных для ветвления в логистической регрессии в листе. Гипрепараметр reuse_ratio=0.1

In [10]:
class LogisticModelTree(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=3, min_samples_leaf=20, random_state=None, reuse_ratio=0.1):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.reuse_ratio = reuse_ratio  # доля признаков из пути, которые можно "вернуть"
    def fit(self, X, y):
        # шаг 1: строим дерево для разбиений
        self.tree_ = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
        )
        self.tree_.fit(X, y)
        # шаг 2: распределяем объекты по листьям
        leaf_ids = self.tree_.apply(X)
        self.models_ = {}
        self.classes_ = self.tree_.classes_
        self.leaf_samples_ = {}
        rng = np.random.RandomState(self.random_state)
        for leaf in np.unique(leaf_ids):
            mask = (leaf_ids == leaf)
            self.leaf_samples_[leaf] = np.sum(mask)
            # признаки, использованные на пути
            path_features = list(self._get_features_on_path(leaf))
            unused_features = [i for i in range(X.shape[1]) if i not in path_features]
            # пропорция признаков из пути
            k = max(1, int(len(path_features) * self.reuse_ratio)) if path_features else 0
            reuse_features = rng.choice(path_features, size=k, replace=False).tolist() if k > 0 else []
            final_features = unused_features + reuse_features
            if not final_features:  # fallback
                final_features = list(range(X.shape[1]))
            X_leaf = X[mask][:, final_features]
            y_leaf = y[mask]
            if len(np.unique(y_leaf)) == 1:
                # чистый лист
                class_idx = np.where(self.classes_ == y_leaf[0])[0][0]
                def dummy_model(X_input, c=class_idx):
                    proba = np.zeros((X_input.shape[0], len(self.classes_)))
                    proba[:, c] = 1.0
                    return proba
                self.models_[leaf] = (dummy_model, final_features, True, class_idx, None)
            else:
                model = LogisticRegression(max_iter=500)
                model.fit(X_leaf, y_leaf)
                self.models_[leaf] = (model, final_features, False, None, model.coef_)
        return self
    def predict_proba(self, X):
        leaf_ids = self.tree_.apply(X)
        proba = np.zeros((X.shape[0], len(self.classes_)))
        for leaf, (model, feats, is_dummy, _, _) in self.models_.items():
            mask = (leaf_ids == leaf)
            if np.any(mask):
                X_leaf = X[mask][:, feats]
                if is_dummy:
                    proba[mask] = model(X_leaf)
                else:
                    proba[mask] = model.predict_proba(X_leaf)
        return proba
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
    def _get_features_on_path(self, leaf_id):
        """Собрать все признаки, использованные на пути до данного листа"""
        tree = self.tree_.tree_
        path_features = set()
        def recurse(node, path):
            if tree.children_left[node] == -1 and tree.children_right[node] == -1:
                if node == leaf_id:
                    return path
                return None
            if tree.feature[node] >= 0:
                left = tree.children_left[node]
                right = tree.children_right[node]
                if left != -1:
                    res = recurse(left, path | {tree.feature[node]})
                    if res is not None:
                        return res
                if right != -1:
                    res = recurse(right, path | {tree.feature[node]})
                    if res is not None:
                        return res
            return None
        return recurse(0, set()) or set()
    def print_leaf_stats(self, feature_names=None):
        """Вывести статистику по каждому листу"""
        for leaf, (model, feats, is_dummy, class_idx, coefs) in self.models_.items():
            print("="*60)
            print(f"Лист {leaf} | объектов: {self.leaf_samples_[leaf]}")
            used_feats = self._get_features_on_path(leaf)
            if feature_names is not None:
                used_feats = [feature_names[i] for i in used_feats]
                feats_names = [feature_names[i] for i in feats]
            else:
                feats_names = feats
            print(f"  Использованные признаки на пути: {used_feats}")
            print(f"  Признаки в логрег: {feats_names}")
            if is_dummy:
                print(f"  Модель: ЧИСТЫЙ ЛИСТ → всегда класс {self.classes_[class_idx]}")
            else:
                print("  Модель: Логистическая регрессия")
                print("   Коэффициенты:")
                for i, c in enumerate(coefs[0]):
                    fname = feats_names[i]
                    print(f"     {fname}: {c:.4f}")


In [11]:
clf = LogisticModelTree(max_depth=3, min_samples_leaf=30, random_state=42, reuse_ratio=0.2)
clf.fit(X_train, y_train)
clf.print_leaf_stats(feature_names=load_breast_cancer().feature_names)


Лист 3 | объектов: 148
  Использованные признаки на пути: [np.str_('texture error'), np.str_('worst area'), np.str_('mean concave points')]
  Признаки в логрег: [np.str_('mean radius'), np.str_('mean texture'), np.str_('mean perimeter'), np.str_('mean area'), np.str_('mean smoothness'), np.str_('mean compactness'), np.str_('mean concavity'), np.str_('mean symmetry'), np.str_('mean fractal dimension'), np.str_('radius error'), np.str_('perimeter error'), np.str_('area error'), np.str_('smoothness error'), np.str_('compactness error'), np.str_('concavity error'), np.str_('concave points error'), np.str_('symmetry error'), np.str_('fractal dimension error'), np.str_('worst radius'), np.str_('worst texture'), np.str_('worst perimeter'), np.str_('worst smoothness'), np.str_('worst compactness'), np.str_('worst concavity'), np.str_('worst concave points'), np.str_('worst symmetry'), np.str_('worst fractal dimension'), np.str_('texture error')]
  Модель: ЧИСТЫЙ ЛИСТ → всегда класс 1
Лист 4 | 

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Протетстируем переиспользование признаков при разном reuse_ratio

Перепишем модель

In [12]:
class LogisticModelTree(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=3, min_samples_leaf=20, random_state=None,
                 reuse_ratio=0.1, max_iter=5000, solver="lbfgs"):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.reuse_ratio = reuse_ratio  # доля признаков из пути, которые можно "вернуть"
        self.max_iter = max_iter
        self.solver = solver
    def fit(self, X, y):
        # шаг 1: строим дерево для разбиений
        self.tree_ = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
        )
        self.tree_.fit(X, y)
        # шаг 2: распределяем объекты по листьям
        leaf_ids = self.tree_.apply(X)
        self.models_ = {}
        self.classes_ = self.tree_.classes_
        self.leaf_samples_ = {}
        rng = np.random.RandomState(self.random_state)
        for leaf in np.unique(leaf_ids):
            mask = (leaf_ids == leaf)
            self.leaf_samples_[leaf] = np.sum(mask)
            # признаки, использованные на пути
            path_features = list(self._get_features_on_path(leaf))
            unused_features = [i for i in range(X.shape[1]) if i not in path_features]
            # пропорция признаков из пути
            k = max(1, int(len(path_features) * self.reuse_ratio)) if path_features else 0
            reuse_features = rng.choice(path_features, size=k, replace=False).tolist() if k > 0 else []
            final_features = unused_features + reuse_features
            if not final_features:  # fallback
                final_features = list(range(X.shape[1]))
            X_leaf = X[mask][:, final_features]
            y_leaf = y[mask]
            if len(np.unique(y_leaf)) == 1:
                # чистый лист
                class_idx = np.where(self.classes_ == y_leaf[0])[0][0]
                def dummy_model(X_input, c=class_idx):
                    proba = np.zeros((X_input.shape[0], len(self.classes_)))
                    proba[:, c] = 1.0
                    return proba
                self.models_[leaf] = (dummy_model, final_features, True, class_idx, None)
            else:
                model = make_pipeline(
                    StandardScaler(),
                    LogisticRegression(max_iter=self.max_iter, solver=self.solver)
                )
                model.fit(X_leaf, y_leaf)
                coefs = model.named_steps["logisticregression"].coef_
                self.models_[leaf] = (model, final_features, False, None, coefs)
        return self
    def predict_proba(self, X):
        leaf_ids = self.tree_.apply(X)
        proba = np.zeros((X.shape[0], len(self.classes_)))
        for leaf, (model, feats, is_dummy, _, _) in self.models_.items():
            mask = (leaf_ids == leaf)
            if np.any(mask):
                X_leaf = X[mask][:, feats]
                if is_dummy:
                    proba[mask] = model(X_leaf)
                else:
                    proba[mask] = model.predict_proba(X_leaf)
        return proba
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
    def _get_features_on_path(self, leaf_id):
        """Собрать все признаки, использованные на пути до данного листа"""
        tree = self.tree_.tree_
        path_features = set()
        def recurse(node, path):
            if tree.children_left[node] == -1 and tree.children_right[node] == -1:
                if node == leaf_id:
                    return path
                return None
            if tree.feature[node] >= 0:
                left = tree.children_left[node]
                right = tree.children_right[node]
                if left != -1:
                    res = recurse(left, path | {tree.feature[node]})
                    if res is not None:
                        return res
                if right != -1:
                    res = recurse(right, path | {tree.feature[node]})
                    if res is not None:
                        return res
            return None
        return recurse(0, set()) or set()
    def print_leaf_stats(self, feature_names=None):
        """Вывести статистику по каждому листу"""
        for leaf, (model, feats, is_dummy, class_idx, coefs) in self.models_.items():
            print("="*60)
            print(f"Лист {leaf} | объектов: {self.leaf_samples_[leaf]}")
            used_feats = self._get_features_on_path(leaf)
            if feature_names is not None:
                used_feats = [feature_names[i] for i in used_feats]
                feats_names = [feature_names[i] for i in feats]
            else:
                feats_names = feats
            print(f"  Использованные признаки на пути: {used_feats}")
            print(f"  Признаки в логрег: {feats_names}")
            if is_dummy:
                print(f"  Модель: ЧИСТЫЙ ЛИСТ → всегда класс {self.classes_[class_idx]}")
            else:
                print("  Модель: Логистическая регрессия (с масштабированием)")
                print("   Коэффициенты:")
                for i, c in enumerate(coefs[0]):
                    fname = feats_names[i]
                    print(f"     {fname}: {c:.4f}")


In [13]:
clf = LogisticModelTree(max_depth=3, min_samples_leaf=30, random_state=42,
                        reuse_ratio=0.2, max_iter=5000, solver="lbfgs")
clf.fit(X_train, y_train)
#clf.print_leaf_stats(feature_names=load_breast_cancer().feature_names)


In [14]:
ratios = [0.0, 0.1, 0.2, 0.5]
rows = []
for r in ratios:
    clf = LogisticModelTree(
        max_depth=3,
        min_samples_leaf=30,
        random_state=42,
        reuse_ratio=r,
        max_iter=5000,
        solver="lbfgs"
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")
    f2 = fbeta_score(y_test, y_pred, beta=2, average="weighted")
    rows.append({
        "reuse_ratio": r,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2
    })
results_df = pd.DataFrame(rows)
print(results_df)


   reuse_ratio  precision    recall        f1        f2
0          0.0   0.976608  0.976608  0.976608  0.976608
1          0.1   0.976608  0.976608  0.976608  0.976608
2          0.2   0.976608  0.976608  0.976608  0.976608
3          0.5   0.976608  0.976608  0.976608  0.976608


Метрики одинаковые при любых reuse_ratio, что говорит о том, что дерево делит пространство так, что оставшихся признаков уже хватает для локальной логистической регрессии. Добавление/убавление 10–50% «старых» признаков не меняет картину — модель в листьях даёт одинаковые предсказания.

Датасет Breast Cancer достаточно «лёгкий»: он линейно разделим и малошумный, поэтому гибрид быстро выходит на потолок ≈97–98% accuracy.

Попробуем на синтетических данных

In [15]:
# 1. создаём более сложный датасет
X, y = make_classification(
    n_samples=5000,
    n_features=30,
    n_informative=15,
    n_redundant=10,
    n_classes=2,
    random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 2. модели для сравнения
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42
    )
}
rows = []
# 3. прогон LogisticModelTree с разными reuse_ratio
for r in [0.0, 0.1, 0.2, 0.5]:
    clf = LogisticModelTree(
        max_depth=4,
        min_samples_leaf=50,
        random_state=42,
        reuse_ratio=r,
        max_iter=5000,
        solver="lbfgs"
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    rows.append({
        "Model": f"LMT (reuse={r})",
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="weighted"),
        "Recall": recall_score(y_test, y_pred, average="weighted"),
        "F1": f1_score(y_test, y_pred, average="weighted"),
        "F2": fbeta_score(y_test, y_pred, beta=2, average="weighted")
    })
# 4. прогон классических моделей
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rows.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="weighted"),
        "Recall": recall_score(y_test, y_pred, average="weighted"),
        "F1": f1_score(y_test, y_pred, average="weighted"),
        "F2": fbeta_score(y_test, y_pred, beta=2, average="weighted")
    })
# 5. выводим результаты
results_df = pd.DataFrame(rows)
print(results_df)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                 Model  Accuracy  Precision    Recall        F1        F2
0      LMT (reuse=0.0)  0.878000   0.878116  0.878000  0.877992  0.877981
1      LMT (reuse=0.1)  0.878000   0.878116  0.878000  0.877992  0.877981
2      LMT (reuse=0.2)  0.878000   0.878116  0.878000  0.877992  0.877981
3      LMT (reuse=0.5)  0.882000   0.882249  0.882000  0.881983  0.881959
4  Logistic Regression  0.818667   0.818807  0.818667  0.818643  0.818635
5        Random Forest  0.934667   0.934743  0.934667  0.934663  0.934655
6              XGBoost  0.944667   0.944687  0.944667  0.944666  0.944664


Гибрид (LMT) заметно сильнее обычной Logistic Regression (+6 процентных пунктов), но сильно проигрывает ансамблям деревьев (RF и XGB).

При reuse_ratio=0.5 результат немного лучше, чем при меньших значениях,то есть подмешивание части признаков ветвления действительно помогает.

Random Forest и XGBoost на этом датасете показывают высокие результаты (93–94%).

XGBoost чуть лучше, что типично для задач с нелинейной структурой и шумом.

Выводы

LMT уже даёт более гибкую модель, чем чистая логрег, но чтобы конкурировать с ансамблями, нужно либо глубже дерево, либо более «умный» выбор признаков в листьях (Можно добавить фича-селекшн по критериямв листе).

При reuse_ratio=0.5 есть небольшой, но заметный прирост — так что идея рабочая.

Добавим масштабирование перед построением регрессии в листе и фича-селекшн в модель. Подберем лучшие гиперпараметры с помощью Optuna.

In [17]:
# ==== 1) Модель: LogisticModelTree с локальным feature selection ====
class LogisticModelTree(BaseEstimator, ClassifierMixin):
    """
    Дерево разбиений + в листьях логистическая регрессия.
    Улучшения:
      - масштабирование признаков в каждом листе (StandardScaler),
      - reuse_ratio: можно "вернуть" часть признаков, использованных на пути,
      - per-leaf feature selection: выбор top-k признаков (по mutual information) из final_features.
    """
    def __init__(self,
                 max_depth=3,
                 min_samples_leaf=20,
                 random_state=None,
                 reuse_ratio=0.1,               # 0..1, доля признаков из пути, возвращаемых в лист
                 topk_frac=1.0,                 # 0..1, доля final_features, оставляемая в листе (>=1 признак)
                 C=1.0,                         # регуляризация логрег
                 solver="lbfgs",
                 max_iter=5000):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.reuse_ratio = reuse_ratio
        self.topk_frac = topk_frac
        self.C = C
        self.solver = solver
        self.max_iter = max_iter
    def fit(self, X, y):
        self.tree_ = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
        )
        self.tree_.fit(X, y)
        leaf_ids = self.tree_.apply(X)
        self.models_ = {}
        self.classes_ = self.tree_.classes_
        self.leaf_samples_ = {}
        rng = np.random.RandomState(self.random_state)
        n_features = X.shape[1]
        for leaf in np.unique(leaf_ids):
            mask = (leaf_ids == leaf)
            self.leaf_samples_[leaf] = int(np.sum(mask))
            # признаки на пути к листу
            path_features = list(self._get_features_on_path(leaf))
            unused_features = [i for i in range(n_features) if i not in path_features]
            # вернуть часть "деревянных" признаков
            k_reuse = max(0, int(len(path_features) * float(self.reuse_ratio))) if path_features else 0
            reuse_features = rng.choice(path_features, size=k_reuse, replace=False).tolist() if k_reuse > 0 else []
            final_features = unused_features + reuse_features
            if not final_features:   # fallback
                final_features = list(range(n_features))
            X_leaf_full = X[mask]
            y_leaf = y[mask]
            # "чистый" лист -> детерминистическая модель
            if len(np.unique(y_leaf)) == 1:
                class_idx = int(np.where(self.classes_ == y_leaf[0])[0][0])
                def dummy_model(X_input, c=class_idx, n_classes=len(self.classes_)):
                    proba = np.zeros((X_input.shape[0], n_classes))
                    proba[:, c] = 1.0
                    return proba
                self.models_[leaf] = (dummy_model, final_features, True, class_idx, None, None)
                continue
            # ---- локальный feature selection по mutual information ----
            # считаем важности только по final_features
            X_sub = X_leaf_full[:, final_features]
            # mutual_info_classif устойчив к масштабам; дискретизации не нужно
            mi = mutual_info_classif(X_sub, y_leaf, random_state=self.random_state)
            order = np.argsort(mi)[::-1]  # убыв. важность
            k_top = max(1, int(ceil(len(final_features) * float(self.topk_frac))))
            keep_idx = order[:k_top]
            selected_features = [final_features[i] for i in keep_idx]
            # обучаем пайплайн: скейлер + логрег
            X_leaf = X_leaf_full[:, selected_features]
            model = make_pipeline(
                StandardScaler(),
                LogisticRegression(
                    max_iter=self.max_iter,
                    solver=self.solver,
                    C=self.C
                )
            )
            model.fit(X_leaf, y_leaf)
            coefs = model.named_steps["logisticregression"].coef_
            self.models_[leaf] = (model, selected_features, False, None, coefs, mi)
        return self
    def predict_proba(self, X):
        leaf_ids = self.tree_.apply(X)
        proba = np.zeros((X.shape[0], len(self.classes_)))
        for leaf, (model, feats, is_dummy, _, _, _) in self.models_.items():
            mask = (leaf_ids == leaf)
            if not np.any(mask):
                continue
            X_leaf = X[mask][:, feats]
            if is_dummy:
                proba[mask] = model(X_leaf)
            else:
                proba[mask] = model.predict_proba(X_leaf)
        return proba
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
    def _get_features_on_path(self, leaf_id):
        tree = self.tree_.tree_
        def recurse(node, used):
            # лист
            if tree.children_left[node] == -1 and tree.children_right[node] == -1:
                return used if node == leaf_id else None
            if tree.feature[node] >= 0:
                left = tree.children_left[node]
                right = tree.children_right[node]
                if left != -1:
                    r = recurse(left, used | {int(tree.feature[node])})
                    if r is not None:
                        return r
                if right != -1:
                    r = recurse(right, used | {int(tree.feature[node])})
                    if r is not None:
                        return r
            return None
        res = recurse(0, set())
        return res or set()
    def print_leaf_stats(self, feature_names=None, show_top=10):
        for leaf, (model, feats, is_dummy, class_idx, coefs, mi) in self.models_.items():
            print("="*70)
            print(f"Лист {leaf} | объектов: {self.leaf_samples_[leaf]}")
            used_feats = self._get_features_on_path(leaf)
            if feature_names is not None:
                used_feats_names = [feature_names[i] for i in used_feats]
                feats_names = [feature_names[i] for i in feats]
            else:
                used_feats_names = list(used_feats)
                feats_names = feats
            print(f"  Признаки на пути: {used_feats_names}")
            print(f"  Признаки в логрег (после selection): {feats_names[:show_top]}{' ...' if len(feats_names)>show_top else ''}")
            if is_dummy:
                print(f"  Модель: ЧИСТЫЙ ЛИСТ → класс {self.classes_[class_idx]}")
            else:
                print("  Модель: Логистическая регрессия (скейлер + L2)")
                print(f"   Кол-во признаков в листе: {len(feats_names)}")
                if coefs is not None:
                    for i, c in enumerate(coefs[0][:min(len(feats_names), show_top)]):
                        print(f"     {feats_names[i]}: {c:.4f}")
                if mi is not None:
                    print("   (MI использовалось для отбора признаков)")
# ==== 2) Optuna: подбор гиперпараметров LMT ====
# Если optuna не установлена — установи: pip install optuna
def tune_lmt_with_optuna(X, y, n_trials=50, random_state=42):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    scorer = make_scorer(f1_score, average="weighted")
    def objective(trial: optuna.Trial):
        params = {
            "max_depth": trial.suggest_int("max_depth", 2, 7),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 10, 200),
            "reuse_ratio": trial.suggest_float("reuse_ratio", 0.0, 0.8),
            "topk_frac": trial.suggest_float("topk_frac", 0.2, 1.0),
            "C": trial.suggest_float("C", 1e-3, 10.0, log=True),
            "solver": trial.suggest_categorical("solver", ["lbfgs", "saga"]),
            "max_iter": 5000,
            "random_state": random_state,
        }
        # Saga поддерживает l2, всё ок; на маленьких листах может быть быстрее.
        model = LogisticModelTree(**params)
        scores = cross_val_score(model, X, y, scoring=scorer, cv=skf, n_jobs=-1)
        return float(np.mean(scores))
    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=random_state),
                                pruner=optuna.pruners.MedianPruner(n_warmup_steps=10))
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    return study
# ==== 3) Запуск тюнинга и финальная оценка ====
# Если у тебя уже есть X_train/X_test/y_train/y_test — используй их.
# Иначе раскомментируй блок генерации синтетики в самом низу.
study = tune_lmt_with_optuna(X_train, y_train, n_trials=60, random_state=42)
best_params = study.best_params
print("Best params (Optuna):", best_params)
# дообучаем на train, проверяем на test
lmt_best = LogisticModelTree(**{**best_params, "max_iter": 5000, "random_state": 42})
lmt_best.fit(X_train, y_train)
y_pred_lmt = lmt_best.predict(X_test)
def metrics_row(name, y_true, y_pred):
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average="weighted"),
        "Recall": recall_score(y_true, y_pred, average="weighted"),
        "F1": f1_score(y_true, y_pred, average="weighted"),
        "F2": fbeta_score(y_true, y_pred, beta=2, average="weighted"),
    }
rows = [metrics_row("LMT (Optuna)", y_test, y_pred_lmt)]
# ==== 4) Бейзлайны: LogisticRegression / RandomForest / XGBoost ====
try:
    has_xgb = True
except Exception:
    has_xgb = False
base_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=5000, C=1.0, solver="lbfgs")
).fit(X_train, y_train)
rows.append(metrics_row("Logistic Regression", y_test, base_lr.predict(X_test)))
rf = RandomForestClassifier(n_estimators=400, max_depth=None, min_samples_leaf=1,
                            random_state=42, n_jobs=-1).fit(X_train, y_train)
rows.append(metrics_row("Random Forest", y_test, rf.predict(X_test)))
if has_xgb:
    xgb = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1
    ).fit(X_train, y_train)
    rows.append(metrics_row("XGBoost", y_test, xgb.predict(X_test)))
results = pd.DataFrame(rows).sort_values("Accuracy", ascending=False)
print(results)
# (опционально) быстрый просмотр важности гиперов в Optuna:
try:
    fig = viz.plot_param_importances(study)
    # fig.show()  # в Jupyter можно показать интерактивно
except Exception:
    pass


[I 2025-09-19 13:27:23,474] A new study created in memory with name: no-name-666c979f-9a1a-40ca-8b71-d2d769458cd7
[I 2025-09-19 13:27:29,589] Trial 0 finished with value: 0.7922722535939213 and parameters: {'max_depth': 4, 'min_samples_leaf': 191, 'reuse_ratio': 0.585595153449124, 'topk_frac': 0.6789267873576292, 'C': 0.004207988669606638, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.7922722535939213.
[I 2025-09-19 13:27:35,194] Trial 1 finished with value: 0.847630010515865 and parameters: {'max_depth': 7, 'min_samples_leaf': 124, 'reuse_ratio': 0.5664580622368364, 'topk_frac': 0.21646759543664196, 'C': 7.579479953348009, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.847630010515865.
[I 2025-09-19 13:27:38,600] Trial 2 finished with value: 0.8356066190706523 and parameters: {'max_depth': 3, 'min_samples_leaf': 45, 'reuse_ratio': 0.2433937943676302, 'topk_frac': 0.6198051453057902, 'C': 0.05342937261279776, 'solver': 'saga'}. Best is trial 1 with value: 0.847630010515865.
[I 2

Best params (Optuna): {'max_depth': 6, 'min_samples_leaf': 175, 'reuse_ratio': 0.35748122127911736, 'topk_frac': 0.7820979668396375, 'C': 1.1349835828662918, 'solver': 'lbfgs'}
                 Model  Accuracy  Precision    Recall        F1        F2
3              XGBoost  0.955333   0.955341  0.955333  0.955333  0.955332
2        Random Forest  0.934667   0.934715  0.934667  0.934664  0.934659
0         LMT (Optuna)  0.872667   0.872722  0.872667  0.872663  0.872658
1  Logistic Regression  0.818667   0.818807  0.818667  0.818643  0.818635


XGBoost ожидаемо лидер, он идеально справляется с нелинейными разделяющими поверхностями на синтетике.

RandomForest чуть слабее, но тоже близко.

LMT (Optuna): лучше, чем глобальная логрег (+5%), но заметно отстаёт от ансамблей.

Logistic Regression в чистом виде — самая простая и наименее подходящая модель для этого датасета.

LMT реально улучшает линейную модель, сохраняя интерпретируемость и гибкость, но ансамбли деревьев остаются лучшими на сложных данных.
Оптимизация гиперпараметров дала неплохой результат, но сам класс моделей (LMT) пока ограничен по мощности.

Возможные апгрейды LMT
добавить регуляризацию на уровне признаков в листьях (L1 для отбора, ElasticNet);
попробовать бустинг из LMT (как GradientBoosting, но листья = логреги);
попробовать беггинг;
попробовать более глубокие деревья + уменьшить min_samples_leaf, чтобы сделать более локальные логреги.

Но сначала попробуем улучшения и на реальном датасете Wine


In [18]:
class LogisticModelTree(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 max_depth=3,
                 min_samples_leaf=20,
                 random_state=None,
                 reuse_ratio=0.1,
                 topk_frac=1.0,
                 C=1.0,
                 solver="lbfgs",
                 max_iter=5000):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.reuse_ratio = reuse_ratio
        self.topk_frac = topk_frac
        self.C = C
        self.solver = solver
        self.max_iter = max_iter
    def fit(self, X, y):
        self.tree_ = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            random_state=self.random_state
        )
        self.tree_.fit(X, y)
        leaf_ids = self.tree_.apply(X)
        self.models_ = {}
        self.classes_ = np.array(self.tree_.classes_)
        self.class_to_index_ = {c: i for i, c in enumerate(self.classes_)}
        self.leaf_samples_ = {}
        rng = np.random.RandomState(self.random_state)
        n_features = X.shape[1]
        for leaf in np.unique(leaf_ids):
            mask = (leaf_ids == leaf)
            self.leaf_samples_[leaf] = int(np.sum(mask))
            path_features = list(self._get_features_on_path(leaf))
            unused_features = [i for i in range(n_features) if i not in path_features]
            k_reuse = max(0, int(len(path_features) * float(self.reuse_ratio))) if path_features else 0
            reuse_features = rng.choice(path_features, size=k_reuse, replace=False).tolist() if k_reuse > 0 else []
            final_features = unused_features + reuse_features
            if not final_features:
                final_features = list(range(n_features))
            X_leaf_full = X[mask]
            y_leaf = y[mask]
            # чистый лист
            unique_leaf_classes = np.unique(y_leaf)
            if len(unique_leaf_classes) == 1:
                class_idx = int(self.class_to_index_[unique_leaf_classes[0]])
                def dummy_model(X_input, c=class_idx, n_classes=len(self.classes_)):
                    proba = np.zeros((X_input.shape[0], n_classes))
                    proba[:, c] = 1.0
                    return proba
                self.models_[leaf] = {
                    "model": dummy_model,
                    "feats": final_features,
                    "is_dummy": True,
                    "leaf_classes": np.array([unique_leaf_classes[0]]),
                    "coefs": None,
                    "mi": None,
                }
                continue
            # локальный feature selection по MI (если объектов совсем мало — пропускаем селекцию)
            X_sub = X_leaf_full[:, final_features]
            if X_sub.shape[0] >= 5:
                mi = mutual_info_classif(X_sub, y_leaf, random_state=self.random_state)
                order = np.argsort(mi)[::-1]
                k_top = max(1, int(ceil(len(final_features) * float(self.topk_frac))))
                keep_idx = order[:k_top]
                selected_features = [final_features[i] for i in keep_idx]
            else:
                mi = None
                selected_features = final_features
            X_leaf = X_leaf_full[:, selected_features]
            pipe = make_pipeline(
                StandardScaler(),
                LogisticRegression(
                    max_iter=self.max_iter,
                    solver=self.solver,
                    C=self.C,
                    multi_class="auto"
                )
            )
            pipe.fit(X_leaf, y_leaf)
            lr = pipe.named_steps["logisticregression"]
            self.models_[leaf] = {
                "model": pipe,
                "feats": selected_features,
                "is_dummy": False,
                "leaf_classes": np.array(lr.classes_),  # важный момент!
                "coefs": lr.coef_,
                "mi": mi,
            }
        return self
    def predict_proba(self, X):
        leaf_ids = self.tree_.apply(X)
        proba = np.zeros((X.shape[0], len(self.classes_)))
        for leaf, blob in self.models_.items():
            mask = (leaf_ids == leaf)
            if not np.any(mask):
                continue
            feats = blob["feats"]
            X_leaf = X[mask][:, feats]
            if blob["is_dummy"]:
                proba[mask] = blob["model"](X_leaf)
            else:
                local_proba = blob["model"].predict_proba(X_leaf)  # shape: [n, n_leaf_classes]
                leaf_classes = blob["leaf_classes"]
                # распределяем по глобальным классам
                tmp = np.zeros((local_proba.shape[0], len(self.classes_)))
                for j, cls in enumerate(leaf_classes):
                    gidx = self.class_to_index_[cls]
                    tmp[:, gidx] = local_proba[:, j]
                proba[mask] = tmp
        return proba
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
    def _get_features_on_path(self, leaf_id):
        tree = self.tree_.tree_
        def recurse(node, used):
            if tree.children_left[node] == -1 and tree.children_right[node] == -1:
                return used if node == leaf_id else None
            if tree.feature[node] >= 0:
                left = tree.children_left[node]
                right = tree.children_right[node]
                if left != -1:
                    r = recurse(left, used | {int(tree.feature[node])})
                    if r is not None:
                        return r
                if right != -1:
                    r = recurse(right, used | {int(tree.feature[node])})
                    if r is not None:
                        return r
            return None
        res = recurse(0, set())
        return res or set()


In [19]:
wine = load_wine()
X, y = wine.data, wine.target
feature_names = wine.feature_names
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [20]:
study = tune_lmt_with_optuna(X_train, y_train, n_trials=25)


[I 2025-09-19 13:32:28,021] A new study created in memory with name: no-name-6efa911f-9580-4e5c-b904-cca6302d92b9
[I 2025-09-19 13:32:28,400] Trial 0 finished with value: 0.9185826078439889 and parameters: {'max_depth': 4, 'min_samples_leaf': 191, 'reuse_ratio': 0.585595153449124, 'topk_frac': 0.6789267873576292, 'C': 0.004207988669606638, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.9185826078439889.
[I 2025-09-19 13:32:28,869] Trial 1 finished with value: 0.9017217818988715 and parameters: {'max_depth': 7, 'min_samples_leaf': 124, 'reuse_ratio': 0.5664580622368364, 'topk_frac': 0.21646759543664196, 'C': 7.579479953348009, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.9185826078439889.
[I 2025-09-19 13:32:29,355] Trial 2 finished with value: 0.9343255676555987 and parameters: {'max_depth': 3, 'min_samples_leaf': 45, 'reuse_ratio': 0.2433937943676302, 'topk_frac': 0.6198051453057902, 'C': 0.05342937261279776, 'solver': 'saga'}. Best is trial 2 with value: 0.9343255676555987.
[

In [21]:
# дообучаем LMT на лучших параметрах
lmt_best = LogisticModelTree(**{**best_params, "max_iter": 5000, "random_state": 42})
lmt_best.fit(X_train, y_train)
y_pred_lmt = lmt_best.predict(X_test)
def metrics_row(name, y_true, y_pred):
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average="weighted"),
        "Recall": recall_score(y_true, y_pred, average="weighted"),
        "F1": f1_score(y_true, y_pred, average="weighted"),
        "F2": fbeta_score(y_true, y_pred, beta=2, average="weighted")
    }
rows = [metrics_row("LMT (Optuna)", y_test, y_pred_lmt)]
# Logistic Regression (со скейлингом)
base_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=5000, solver="lbfgs", multi_class="auto")
).fit(X_train, y_train)
rows.append(metrics_row("Logistic Regression", y_test, base_lr.predict(X_test)))
# Random Forest
rf = RandomForestClassifier(
    n_estimators=400, random_state=42, n_jobs=-1
).fit(X_train, y_train)
rows.append(metrics_row("Random Forest", y_test, rf.predict(X_test)))
# XGBoost
try:
    xgb = XGBClassifier(
        objective="multi:softprob",
        num_class=len(np.unique(y_train)),
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        eval_metric="mlogloss",
        random_state=42,
        n_jobs=-1
    ).fit(X_train, y_train)
    rows.append(metrics_row("XGBoost", y_test, xgb.predict(X_test)))
except ImportError:
    print("XGBoost недоступен")
results = pd.DataFrame(rows).sort_values("Accuracy", ascending=False)
print(results)








                 Model  Accuracy  Precision    Recall        F1        F2
3              XGBoost  1.000000   1.000000  1.000000  1.000000  1.000000
2        Random Forest  1.000000   1.000000  1.000000  1.000000  1.000000
1  Logistic Regression  0.981481   0.982456  0.981481  0.981506  0.981380
0         LMT (Optuna)  0.962963   0.963938  0.962963  0.962894  0.962803


XGBoost и RandomForest идеально решают Wine,
Logistic Regression на скейлинге показывает очень достойно: почти 98%.
LMT (Optuna) отстаёт (96%), но всё равно выше, чем ожидалось для интерпретируемой гибридной модели.
Выводы
Wine — относительно простой датасет. Ансамбли деревьев справляются идеально.
Логрег и LMT немного ошибаются, но дают хорошую интерпретируемость.

Попробуем LMT бустинг и беггинг на датасете breast_cancer

In [23]:
# данные
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
def metrics_row(name, y_true, y_pred):
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average="weighted"),
        "Recall": recall_score(y_true, y_pred, average="weighted"),
        "F1": f1_score(y_true, y_pred, average="weighted"),
        "F2": fbeta_score(y_true, y_pred, beta=2, average="weighted")
    }
rows = []
# --- базовый LMT  ---
base_lmt = LogisticModelTree(
    max_depth=3, min_samples_leaf=30, random_state=42,
    reuse_ratio=0.2, topk_frac=1.0, C=1.0, solver="lbfgs", max_iter=5000
)
base_lmt.fit(X_train, y_train)
rows.append(metrics_row("LMT (base)", y_test, base_lmt.predict(X_test)))
# --- LMT-Bagging ---
lmt_for_bag = LogisticModelTree(
    max_depth=3, min_samples_leaf=25, random_state=42,
    reuse_ratio=0.2, topk_frac=0.8, C=1.0, solver="lbfgs", max_iter=5000
)
bag = BaggingClassifier(
    estimator=lmt_for_bag,
    n_estimators=25,
    max_samples=0.8,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1,
    random_state=42
)
bag.fit(X_train, y_train)
rows.append(metrics_row("LMT-Bagging (25x, 80%)", y_test, bag.predict(X_test)))
# --- LMT-Boosting (AdaBoost; "SAMME" теперь универсальный) ---

class LogisticModelTree(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=3, min_samples_leaf=20, random_state=None,
                 reuse_ratio=0.1, topk_frac=1.0, C=1.0, solver="lbfgs", max_iter=5000):
        self.max_depth=max_depth; self.min_samples_leaf=min_samples_leaf; self.random_state=random_state
        self.reuse_ratio=reuse_ratio; self.topk_frac=topk_frac; self.C=C; self.solver=solver; self.max_iter=max_iter

    def fit(self, X, y, sample_weight=None):
        self.tree_ = DecisionTreeClassifier(max_depth=self.max_depth,
                                            min_samples_leaf=self.min_samples_leaf,
                                            random_state=self.random_state)
        self.tree_.fit(X, y, sample_weight=sample_weight)

        leaf_ids = self.tree_.apply(X)
        self.classes_ = np.array(self.tree_.classes_)
        self.class_to_index_ = {c:i for i,c in enumerate(self.classes_)}
        self.models_ = {}
        rng = np.random.RandomState(self.random_state)
        n_features = X.shape[1]
        sw = sample_weight if sample_weight is not None else np.ones(len(y), float)

        for leaf in np.unique(leaf_ids):
            mask = (leaf_ids == leaf)
            y_leaf = y[mask]; X_leaf_full = X[mask]; sw_leaf = sw[mask]

            # признаки: неиспользованные + часть "путевых"
            path = self._get_features_on_path(leaf)
            unused = [i for i in range(n_features) if i not in path]
            k_reuse = int(len(path)*self.reuse_ratio) if path else 0
            reuse = rng.choice(list(path), size=k_reuse, replace=False).tolist() if k_reuse>0 else []
            final_feats = unused + reuse or list(range(n_features))

            uniq = np.unique(y_leaf)
            if len(uniq)==1:
                cls_idx = self.class_to_index_[uniq[0]]
                def dummy(X_in, c=cls_idx, n=len(self.classes_)):
                    P = np.zeros((X_in.shape[0], n)); P[:,c]=1.0; return P
                self.models_[leaf] = {"is_dummy":True, "feats":final_feats, "model":dummy, "leaf_classes":np.array([uniq[0]])}
                continue

            X_sub = X_leaf_full[:, final_feats]
            if X_sub.shape[0] >= 5:
                mi = mutual_info_classif(X_sub, y_leaf, random_state=self.random_state)
                order = np.argsort(mi)[::-1]
                k_top = max(1, int(ceil(len(final_feats)*self.topk_frac)))
                keep = order[:k_top]
                feats = [final_feats[i] for i in keep]
            else:
                feats = final_feats

            scaler = StandardScaler().fit(X_leaf_full[:, feats])
            Xs = scaler.transform(X_leaf_full[:, feats])
            lr = LogisticRegression(max_iter=self.max_iter, solver=self.solver, C=self.C)
            lr.fit(Xs, y_leaf, sample_weight=sw_leaf)

            self.models_[leaf] = {"is_dummy":False, "feats":feats,
                                  "model":(scaler, lr), "leaf_classes":np.array(lr.classes_)}
        return self

    def predict_proba(self, X):
        leaf_ids = self.tree_.apply(X)
        proba = np.zeros((X.shape[0], len(self.classes_)))
        for leaf, blob in self.models_.items():
            mask = (leaf_ids == leaf)
            if not np.any(mask): continue
            feats = blob["feats"]; X_leaf = X[mask][:, feats]
            if blob["is_dummy"]:
                proba[mask] = blob["model"](X_leaf)
            else:
                scaler, lr = blob["model"]
                local = lr.predict_proba(scaler.transform(X_leaf))
                tmp = np.zeros((local.shape[0], len(self.classes_)))
                for j, cls in enumerate(blob["leaf_classes"]):
                    tmp[:, self.class_to_index_[cls]] = local[:, j]
                proba[mask] = tmp
        return proba

    def predict(self, X): return np.argmax(self.predict_proba(X), axis=1)

    def _get_features_on_path(self, leaf_id):
        t = self.tree_.tree_
        def rec(node, used):
            if t.children_left[node]==-1 and t.children_right[node]==-1:
                return used if node==leaf_id else None
            if t.feature[node] >= 0:
                left, right = t.children_left[node], t.children_right[node]
                r = rec(left, used|{int(t.feature[node])});  r = r if r is not None else rec(right, used|{int(t.feature[node])})
                return r
            return None
        return rec(0, set()) or set()

boost = AdaBoostClassifier(
    estimator=LogisticModelTree(max_depth=2, min_samples_leaf=20, random_state=42,
                                reuse_ratio=0.3, topk_frac=0.9, C=1.0, solver="lbfgs", max_iter=5000),
    n_estimators=30, learning_rate=0.5, random_state=42
)

boost.fit(X_train, y_train)
rows.append(metrics_row("LMT-Boosting (AdaBoost, 30, 0.5)", y_test, boost.predict(X_test)))
# --- Бейзлайны: LogisticRegression / RandomForest / XGBoost ---
lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=5000, solver="lbfgs")
).fit(X_train, y_train)
rows.append(metrics_row("Logistic Regression", y_test, lr.predict(X_test)))
rf = RandomForestClassifier(
    n_estimators=400, random_state=42, n_jobs=-1
).fit(X_train, y_train)
rows.append(metrics_row("Random Forest", y_test, rf.predict(X_test)))
try:
    xgb = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1
    ).fit(X_train, y_train)
    rows.append(metrics_row("XGBoost", y_test, xgb.predict(X_test)))
except Exception as e:
    pass
results = pd.DataFrame(rows).sort_values("Accuracy", ascending=False)
print(results)












                              Model  Accuracy  Precision    Recall        F1  \
3               Logistic Regression  0.988304   0.988304  0.988304  0.988304   
2  LMT-Boosting (AdaBoost, 30, 0.5)  0.964912   0.964964  0.964912  0.964796   
5                           XGBoost  0.964912   0.965576  0.964912  0.964668   
0                        LMT (base)  0.953216   0.953216  0.953216  0.953216   
1            LMT-Bagging (25x, 80%)  0.947368   0.947463  0.947368  0.947101   
4                     Random Forest  0.947368   0.947463  0.947368  0.947101   

         F2  
3  0.988304  
2  0.964832  
5  0.964679  
0  0.953216  
1  0.947187  
4  0.947187  


Логистическая регрессия — победитель. Датасет почти линейно разделим, поэтому ансамбли и гибриды даже проигрывают по метрикам.

LMT-Boosting ≈ XGBoost —  бустинг на LMT вышел на уровень XGBoost, хотя тот куда более оптимизирован.
Bagging улучшает базовый LMT, но не так сильно, как бустинг.
Random Forest — хуже всех, что тоже ожидаемо: дерево «дробит» пространство, а Breast Cancer этому не очень подходит.

Вывод
На «чистых» и почти линейных данных глобальная логрег остаётся топом.
LMT+Boosting показал, что может соревноваться с XGBoost. На более сложных данных он может раскрыться ещё лучше.
Bagging стабилизирует, но не даёт драматического прироста.