In [60]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor as SklearnGBR, AdaBoostRegressor as SklearnAdaBoost
from sklearn.neighbors import KernelDensity
from sklearn.metrics import mean_squared_error

In [None]:
class DecisionStump:

    def __init__(self):
        self.feature_index: int | None = None
        self.threshold: float | None = None
        self.left_value: float | None = None
        self.right_value: float | None = None

    def fit(self, X: np.ndarray, y: np.ndarray, sample_weights: np.ndarray | None = None):
        if sample_weights is None:
            sample_weights = np.ones_like(y, dtype=float)
        m, n = X.shape
        best_loss = np.inf

        for feature in range(n):
            thresholds = np.unique(X[:, feature])
            for thr in thresholds:
                left = X[:, feature] <= thr
                right = ~left
                if not left.any() or not right.any():
                    continue
                lw = sample_weights[left]
                rw = sample_weights[right]

                lv = np.average(y[left], weights=lw)
                rv = np.average(y[right], weights=rw)

                pred = np.where(left, lv, rv)
                loss = np.average((y - pred) ** 2, weights=sample_weights)
                if loss < best_loss:
                    best_loss = loss
                    self.feature_index = feature
                    self.threshold = thr
                    self.left_value = lv
                    self.right_value = rv

    def predict(self, X: np.ndarray) -> np.ndarray:
        feat = X[:, self.feature_index]
        return np.where(feat <= self.threshold, self.left_value, self.right_value)


In [63]:
class GradientBoostingRegressor:
    """Gradient boosting for squared error, using ``DecisionStump`` weak learners.

    The model starts from the mean of the targets; each round fits a stump to
    the current residuals and adds its prediction scaled by ``learning_rate``.
    """

    def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.trees: list[DecisionStump] = []
        self.init_value: float | None = None

    def fit(self, X, y):
        """Fit the ensemble on ``X`` (n_samples, n_features) and targets ``y``.

        Any stumps left over from an earlier ``fit`` are discarded first, so
        fitting twice gives the same model as fitting once.

        Returns:
            The fitted regressor (``self``).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Start from an empty ensemble; otherwise a refit would stack new
        # stumps on the old ones and predict() would sum both ensembles.
        self.trees = []
        self.init_value = np.mean(y)
        y_pred = np.full_like(y, self.init_value, dtype=float)

        for _ in range(self.n_estimators):
            # For squared error the negative gradient is the residual itself.
            residual = y - y_pred
            stump = DecisionStump()
            stump.fit(X, residual)
            y_pred += self.learning_rate * stump.predict(X)
            self.trees.append(stump)
        return self

    def predict(self, X):
        """Return the initial value plus the shrunken sum of all stump outputs.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        if self.init_value is None:
            raise RuntimeError("GradientBoostingRegressor must be fitted before calling predict")
        X = np.asarray(X)
        y_pred = np.full(X.shape[0], self.init_value, dtype=float)
        for stump in self.trees:
            y_pred += self.learning_rate * stump.predict(X)
        return y_pred

In [64]:
class KDE:
    """One-dimensional kernel density estimator with a Gaussian kernel."""

    def __init__(self, bandwidth: float = 1.0):
        """Store the kernel bandwidth.

        Raises:
            ValueError: if ``bandwidth`` is not strictly positive.
        """
        if not bandwidth > 0:
            raise ValueError(f"bandwidth must be positive, got {bandwidth!r}")
        self.bandwidth = bandwidth
        self.X_train: np.ndarray | None = None

    def fit(self, X):
        """Remember the training sample.

        ``X`` may be a flat sequence or an (n, 1) column; it is flattened to
        1-D so that ``evaluate`` can broadcast every query point against every
        training point.

        Returns:
            The fitted estimator (``self``).
        """
        # Without the flatten, an (n, 1) training array would pair up
        # element-wise with the (m, 1) query column in evaluate() instead of
        # forming the full (m, n) difference matrix.
        self.X_train = np.asarray(X, dtype=float).ravel()
        return self

    def evaluate(self, X):
        """Return the estimated density at each point of ``X`` as a 1-D array.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if self.X_train is None:
            raise RuntimeError("KDE must be fitted before calling evaluate")
        X = np.asarray(X).reshape(-1, 1)
        n = self.X_train.shape[0]
        const = 1 / (self.bandwidth * np.sqrt(2 * np.pi) * n)
        # (m, 1) - (n,) broadcasts to (m, n): one row per query point.
        diffs = (X - self.X_train) / self.bandwidth
        densities = const * np.sum(np.exp(-0.5 * diffs ** 2), axis=1)
        return densities


In [66]:
class MyAdaBoostRegressor:
    """Simple AdaBoost.R2-style regressor with decision stumps as base models.

    Each round fits a stump on the current sample weights, measures its
    weighted linear loss (absolute error scaled by the round's largest error)
    and down-weights the samples it predicted well.

    Args:
        n_estimators: maximum number of boosting rounds.
        aggregation: how ``predict`` combines the stumps. ``"mean"`` (default,
            the historical behaviour of this class) returns the alpha-weighted
            average; ``"median"`` returns the alpha-weighted median, which is
            the combiner used by the original AdaBoost.R2 algorithm.

    Raises:
        ValueError: if ``aggregation`` is neither ``"mean"`` nor ``"median"``.
    """

    def __init__(self, n_estimators: int = 50, aggregation: str = "mean"):
        if aggregation not in ("mean", "median"):
            raise ValueError(f"aggregation must be 'mean' or 'median', got {aggregation!r}")
        self.n_estimators = n_estimators
        self.aggregation = aggregation
        self.models: list[DecisionStump] = []
        self.alphas: list[float] = []  # alpha_t = ln(1/beta_t)

    def fit(self, X, y):
        """Run up to ``n_estimators`` boosting rounds on ``X`` and ``y``.

        Returns:
            The fitted regressor (``self``).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n = len(y)
        w = np.full(n, 1 / n)
        # Drop whatever a previous fit left behind.
        self.models = []
        self.alphas = []

        for _ in range(self.n_estimators):
            stump = DecisionStump()
            stump.fit(X, y, sample_weights=w)
            pred = stump.predict(X)

            error = np.abs(y - pred)
            max_err = error.max()
            if max_err == 0:
                error_norm = 0.0
            else:
                error_norm = np.average(error / max_err, weights=w)

            # Stop once the weak learner's normalised loss reaches 0.5.
            if error_norm >= 0.5:
                if not self.models:
                    # Keep the very first stump so the ensemble is never empty;
                    # with no models predict() would divide zero by zero.
                    self.models.append(stump)
                    self.alphas.append(1.0)
                break

            beta = error_norm / (1 - error_norm)

            # Model weight; the small epsilon keeps log finite when beta == 0.
            alpha = np.log(1 / (beta + 1e-10))
            self.models.append(stump)
            self.alphas.append(alpha)

            if beta == 0:
                # A learner with zero loss would multiply the weights by
                # 0 ** (positive exponent) = 0, and renormalising would then
                # give 0/0 = NaN for every later round, so stop here.
                break

            # Weight update: accurately predicted samples shrink the most.
            w *= beta ** (1 - error / (max_err + 1e-12))
            w /= w.sum()
        return self

    def predict(self, X):
        """Combine the stump predictions for each row of ``X``.

        Raises:
            RuntimeError: if the model holds no fitted stumps.
        """
        if not self.models:
            raise RuntimeError("MyAdaBoostRegressor must be fitted before calling predict")
        X = np.asarray(X)

        if self.aggregation == "median":
            alphas = np.asarray(self.alphas, dtype=float)
            # One column per stump: shape (n_samples, n_models).
            stacked = np.column_stack([model.predict(X) for model in self.models])
            order = np.argsort(stacked, axis=1)
            cum_alpha = np.cumsum(alphas[order], axis=1)
            # First sorted prediction whose cumulative weight reaches half the total.
            pick = np.argmax(cum_alpha >= 0.5 * cum_alpha[:, -1:], axis=1)
            rows = np.arange(stacked.shape[0])
            return stacked[rows, order[rows, pick]]

        preds = np.zeros(X.shape[0])
        total_alpha = np.sum(self.alphas)
        for stump, alpha in zip(self.models, self.alphas):
            preds += alpha * stump.predict(X)
        return preds / total_alpha


In [67]:
# Name of the regression target column in the training CSV.
TARGET_COL = "Y house price of unit area"

# Load both splits and drop the "Unnamed: 0" column
# (presumably the row index written out when the CSVs were saved — confirm).
train_df = pd.read_csv("prices_train.csv").drop(columns=["Unnamed: 0"])
test_df = pd.read_csv("prices_test.csv").drop(columns=["Unnamed: 0"])

y_train = train_df[TARGET_COL]
X_train = train_df.drop(columns=[TARGET_COL])

# Stack train and test features so both end up with the same set of columns.
combined = pd.concat([X_train, test_df], axis=0)

# Impute missing numeric values with means computed on the TRAINING rows only;
# using train+test means would leak test-set statistics into the training
# features. Anything still missing (e.g. non-numeric columns) becomes 0.
train_means = X_train.mean(numeric_only=True)
combined_filled = combined.fillna(train_means).fillna(0)

# Split back by position: the first len(X_train) rows are the training rows.
n_train = len(X_train)
X_train_filled = combined_filled.iloc[:n_train]
X_test_filled = combined_filled.iloc[n_train:]


In [68]:
# Hand-written gradient boosting: fit on the imputed training features and
# report the mean squared error on that same training set.
my_gb = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1)
my_gb.fit(X_train_filled, y_train)
my_gb_train_pred = my_gb.predict(X_train_filled)
print("MSE custom:", mean_squared_error(y_train, my_gb_train_pred))

# scikit-learn reference with the same number of rounds and learning rate,
# limited to depth-1 trees so its base learners are stumps as well.
sk_gb = SklearnGBR(n_estimators=50, learning_rate=0.1, max_depth=1)
sk_gb.fit(X_train_filled, y_train)
sk_gb_train_pred = sk_gb.predict(X_train_filled)
print("MSE sklearn:", mean_squared_error(y_train, sk_gb_train_pred))

MSE custom: 54.71336785891531
MSE sklearn: 54.71336785891531


In [69]:
print("\n===== KDE =====")
# Both estimators are fitted on the full target column and evaluated on its
# first five values.
first_five_targets = y_train[:5]

my_kde = KDE(bandwidth=3)
my_kde.fit(y_train)
print("Custom densities:", my_kde.evaluate(first_five_targets))

# scikit-learn expects 2-D input and score_samples returns log-densities,
# so reshape to a column and exponentiate before printing.
sk_kde = KernelDensity(bandwidth=3)
sk_kde.fit(y_train.values.reshape(-1, 1))
sk_log_density = sk_kde.score_samples(first_five_targets.values.reshape(-1, 1))
print("Sklearn densities:", np.exp(sk_log_density))



===== KDE =====
Custom densities: [0.03348561 0.01329155 0.02025072 0.01244522 0.02434462]
Sklearn densities: [0.03348561 0.01329155 0.02025072 0.01244522 0.02434462]


In [None]:
print("\n===== AdaBoost.R2 =====")
# Hand-written AdaBoost on decision stumps; reports the training-set MSE.
my_ab = MyAdaBoostRegressor(n_estimators=40)
my_ab.fit(X_train_filled, y_train)
print("MSE custom:", mean_squared_error(y_train, my_ab.predict(X_train_filled)))

# scikit-learn's AdaBoostRegressor is randomised, so pin random_state to make
# the printed MSE reproducible across runs.
# NOTE: this is not a like-for-like comparison. With no estimator given,
# sklearn boosts depth-3 regression trees, while the custom model boosts
# depth-1 stumps, so a lower sklearn MSE is expected.
sk_ab = SklearnAdaBoost(n_estimators=40, random_state=42)
sk_ab.fit(X_train_filled, y_train)
print("MSE sklearn:", mean_squared_error(y_train, sk_ab.predict(X_train_filled)))



===== AdaBoost.R2 =====
MSE custom: 139.36755465078411
MSE sklearn: 40.17101510100741


: 