In [49]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-clean-pric/cleaned_data_reg.csv


In [50]:
# Load the pre-cleaned dataset
df = pd.read_csv(
    "/kaggle/input/data-clean-pric/cleaned_data_reg.csv",
)

# Show the frame's dimensions together with its first five rows
(df.shape, df.head(n=5))

((28298, 9),
       Price Property Type  Bedrooms  Bathrooms        Size Postcode  \
 0  330000.0     Apartment       1.0        1.0  518.000000      E14   
 1  340000.0          Flat       1.0        1.0  887.498269      E14   
 2  340000.0     Apartment       1.0        1.0  934.569040      E14   
 3  340000.0          Flat       1.0        1.0  887.498269      E14   
 4  340000.0          Flat       1.0        1.0  388.000000     SW20   
 
             Area Price_Category  Area_Avg_Price  
 0        Eastern            Low    1.001684e+06  
 1        Eastern            Low    1.001684e+06  
 2        Eastern            Low    1.001684e+06  
 3        Eastern            Low    1.001684e+06  
 4  South Western            Low    1.516724e+06  )

In [51]:
# Check column dtypes and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it must not be
# wrapped in print() — doing so appended a stray "None" line to the cell output.
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28298 entries, 0 to 28297
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Price           28298 non-null  float64
 1   Property Type   28298 non-null  object 
 2   Bedrooms        28298 non-null  float64
 3   Bathrooms       28298 non-null  float64
 4   Size            28298 non-null  float64
 5   Postcode        28298 non-null  object 
 6   Area            28298 non-null  object 
 7   Price_Category  28298 non-null  object 
 8   Area_Avg_Price  28298 non-null  float64
dtypes: float64(5), object(4)
memory usage: 1.9+ MB
None


Price             0
Property Type     0
Bedrooms          0
Bathrooms         0
Size              0
Postcode          0
Area              0
Price_Category    0
Area_Avg_Price    0
dtype: int64

In [52]:
# Separate the regression target from the feature matrix
# NOTE(review): Price_Category and Area_Avg_Price look like they were derived from Price —
# confirm they do not leak the target into the features.
y = df["Price"]
X = df.drop(columns=["Price"])

X.head()


Unnamed: 0,Property Type,Bedrooms,Bathrooms,Size,Postcode,Area,Price_Category,Area_Avg_Price
0,Apartment,1.0,1.0,518.0,E14,Eastern,Low,1001684.0
1,Flat,1.0,1.0,887.498269,E14,Eastern,Low,1001684.0
2,Apartment,1.0,1.0,934.56904,E14,Eastern,Low,1001684.0
3,Flat,1.0,1.0,887.498269,E14,Eastern,Low,1001684.0
4,Flat,1.0,1.0,388.0,SW20,South Western,Low,1516724.0


In [53]:
# Declare which feature columns are categorical and which are numeric
categorical_features = [
    "Property Type",
    "Postcode",
    "Area",
    "Price_Category",
]
numeric_features = [
    "Bedrooms",
    "Bathrooms",
    "Size",
    "Area_Avg_Price",
]

(numeric_features, categorical_features)


(['Bedrooms', 'Bathrooms', 'Size', 'Area_Avg_Price'],
 ['Property Type', 'Postcode', 'Area', 'Price_Category'])

In [54]:
# Build the preprocessor

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns are standardized
numeric_transformer = StandardScaler()
# Categorical columns are one-hot encoded; handle_unknown="ignore" keeps transform()
# from raising on categories that were not present during fit
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [55]:
# Baseline pipeline: preprocessing followed by a decision tree with default hyperparameters

from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("preprocess", preprocessor),
    ("reg", DecisionTreeRegressor(random_state=42)),
])

In [56]:
# Train/test split: hold out 20% of the rows, with a fixed seed so the split is repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

(X_train.shape, X_test.shape)


((22638, 8), (5660, 8))

In [57]:
# Fit the baseline tree on the training split and predict on the held-out split.
# Pipeline.fit returns the fitted pipeline itself, so both calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [58]:
# Baseline metrics

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("=== Baseline DecisionTreeRegressor ===")
print("MAE: ", mae)
print("RMSE:", rmse)
print("R²:  ", r2)


=== Baseline DecisionTreeRegressor ===
MAE:  127194.27321985947
RMSE: 184217.70256486014
R²:   0.8918366321730543


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Feature columns
cat = ["Property Type", "Postcode", "Area", "Price_Category"]
num = ["Bedrooms", "Bathrooms", "Size", "Area_Avg_Price"]

# Preprocessing: standardize numeric columns, one-hot encode categorical ones
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ]
)

# Model: preprocessing + decision tree
pipe = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", DecisionTreeRegressor(random_state=42)),
    ]
)

# Search space (3 * 5 * 3 * 2 * 2 = 180 candidates); the "reg__" prefix routes each
# parameter to the "reg" step of the pipeline
params = dict(
    reg__max_depth=[5, 7, 9],
    reg__min_samples_split=[2, 5, 10, 20, 50],
    reg__min_samples_leaf=[1, 5, 10],
    reg__criterion=["squared_error", "friedman_mse"],
    reg__splitter=["best", "random"],
)

# Exhaustive 3-fold search scored by (negated) MAE, using all available cores
grid_reg = GridSearchCV(
    pipe,
    params,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_reg.fit(X_train, y_train)

grid_reg.best_params_


Fitting 3 folds for each of 180 candidates, totalling 540 fits


{'reg__criterion': 'squared_error',
 'reg__max_depth': 9,
 'reg__min_samples_leaf': 1,
 'reg__min_samples_split': 50,
 'reg__splitter': 'best'}

In [60]:
# Take the pipeline with the best hyperparameters found by the grid search
best_tree_reg = grid_reg.best_estimator_

# Predictions on the held-out split
y_pred_imp = best_tree_reg.predict(X_test)

# Metrics
mae_imp = mean_absolute_error(y_test, y_pred_imp)
# sqrt(MSE) instead of `squared=False`, which was deprecated in scikit-learn 1.4 and
# removed in 1.6
rmse_imp = np.sqrt(mean_squared_error(y_test, y_pred_imp))
r2_imp = r2_score(y_test, y_pred_imp)

print("=== Improved DecisionTreeRegressor ===")
print("MAE: ", mae_imp)
print("RMSE:", rmse_imp)
print("R²:  ", r2_imp)
print("\nBest params:", grid_reg.best_params_)


=== Improved DecisionTreeRegressor ===
MAE:  127323.73804250378
RMSE: 164364.62294749534
R²:   0.9138938496291722

Best params: {'reg__criterion': 'squared_error', 'reg__max_depth': 9, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 50, 'reg__splitter': 'best'}


In [None]:
# Hand-written DecisionTreeRegressor

import numpy as np

class MyDecisionTreeRegressor:
    """Minimal regression tree that greedily minimizes the weighted MSE of the children.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` means no depth limit.
    min_samples_leaf : int, default=1
        Minimum number of samples each child of a split must receive.

    Attributes
    ----------
    tree : dict or None
        Nested-dict representation of the fitted tree, ``None`` before ``fit``.
        A leaf is ``{"leaf": True, "value": <mean target>}``; an internal node is
        ``{"leaf": False, "feature": int, "threshold": value, "left": ..., "right": ...}``
        where samples with ``x[feature] <= threshold`` go to ``"left"``.
    """

    def __init__(self, max_depth=None, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def _mse(self, y):
        """Return the mean squared deviation of ``y`` from its mean (0 for empty ``y``)."""
        if len(y) == 0:
            return 0
        return np.mean((y - np.mean(y)) ** 2)

    def _best_split(self, X, y):
        """Find the (feature index, threshold) pair with the lowest weighted child MSE.

        Every unique value of every column is tried as a ``<=`` threshold.
        Returns ``(None, None)`` when no candidate satisfies ``min_samples_leaf``.
        """
        best_feat, best_thresh = None, None
        # Infinity rather than a finite sentinel: with a cap such as 1e18 no split was ever
        # accepted for targets whose weighted child MSE exceeded the cap.
        best_mse = np.inf

        n_samples, n_features = X.shape
        # A child must hold at least one sample, otherwise its leaf value would be the
        # mean of an empty array (NaN).
        min_leaf = max(1, self.min_samples_leaf)

        for feat in range(n_features):
            column = X[:, feat]

            for t in np.unique(column):
                left_mask = column <= t
                # Count each side once instead of re-summing the masks repeatedly
                n_left = int(left_mask.sum())
                n_right = n_samples - n_left

                if n_left < min_leaf or n_right < min_leaf:
                    continue

                right_mask = ~left_mask
                mse_left = self._mse(y[left_mask])
                mse_right = self._mse(y[right_mask])

                mse = (mse_left * n_left + mse_right * n_right) / n_samples

                # Strict comparison keeps the first of several equally good candidates
                if mse < best_mse:
                    best_mse = mse
                    best_feat = feat
                    best_thresh = t

        return best_feat, best_thresh

    def _build_tree(self, X, y, depth):
        """Recursively grow the subtree for the samples ``(X, y)`` located at ``depth``."""
        # Too few samples to split further
        if len(y) <= self.min_samples_leaf:
            return {"leaf": True, "value": np.mean(y)}

        # Depth limit reached
        if self.max_depth is not None and depth >= self.max_depth:
            return {"leaf": True, "value": np.mean(y)}

        feat, thresh = self._best_split(X, y)

        # No admissible split exists for this node
        if feat is None:
            return {"leaf": True, "value": np.mean(y)}

        left_mask = X[:, feat] <= thresh
        right_mask = ~left_mask

        return {
            "leaf": False,
            "feature": feat,
            "threshold": thresh,
            "left": self._build_tree(X[left_mask], y[left_mask], depth + 1),
            "right": self._build_tree(X[right_mask], y[right_mask], depth + 1)
        }

    def fit(self, X, y):
        """Build the tree from training data and return ``self``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` differ in length, or if they are empty.
        """
        # Accept lists / DataFrames / Series as well as ndarrays
        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")

        self.tree = self._build_tree(X, y, 0)
        return self

    def _predict_one(self, node, x):
        """Descend from ``node`` following the sample ``x`` and return the leaf value."""
        # Iterative descent gives the same result as recursion without growing the call stack
        while not node["leaf"]:
            if x[node["feature"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node["value"]

    def predict(self, X):
        """Return an array with one predicted value per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("MyDecisionTreeRegressor is not fitted yet; call fit() first")

        X = np.asarray(X)
        return np.array([self._predict_one(self.tree, x) for x in X])


In [None]:
# Convert X to a dense numpy array
def to_dense(X):
    """Return ``X`` as a dense numpy array.

    Objects exposing a callable ``toarray`` method (e.g. scipy sparse matrices) are
    densified through it; anything else goes through ``np.asarray``.

    The method is looked up explicitly instead of wrapping the call in a bare ``except``:
    an error raised *inside* ``toarray()`` (for example ``MemoryError``) now propagates
    rather than being swallowed and replaced by ``np.asarray`` on the sparse object.
    """
    toarray = getattr(X, "toarray", None)
    if callable(toarray):
        return toarray()
    return np.asarray(X)

# NOTE(review): X_train / X_test are the raw DataFrame splits — the `preprocessor` is not
# applied here — so the string-valued columns reach the tree as Python strings and are
# split by lexicographic `<=` comparison. Confirm this is intended.
X_train_np = to_dense(X_train)
X_test_np  = to_dense(X_test)
y_train_np = np.asarray(y_train)
y_test_np  = np.asarray(y_test)

# Baseline hand-written tree
my_tree_reg = MyDecisionTreeRegressor(
    max_depth=5,
    min_samples_leaf=5
)

my_tree_reg.fit(X_train_np, y_train_np)

y_pred_my = my_tree_reg.predict(X_test_np)

print("=== MyDecisionTreeRegressor Baseline ===")
print("MAE: ", mean_absolute_error(y_test_np, y_pred_my))
# sqrt(MSE) instead of `squared=False`, which was deprecated in scikit-learn 1.4 and
# removed in 1.6
print("RMSE:", np.sqrt(mean_squared_error(y_test_np, y_pred_my)))
print("R²:  ", r2_score(y_test_np, y_pred_my))


=== MyDecisionTreeRegressor Baseline ===
MAE:  133602.6610628405
RMSE: 170246.61042179164
R²:   0.9076207513766857


In [64]:
# Improved hand-written version: reuse the hyperparameters found by the sklearn grid search.
# Only max_depth and min_samples_leaf are passed on; MyDecisionTreeRegressor takes no other
# parameters, so the remaining grid-search results are not used here.

best_params = grid_reg.best_params_

my_tree_imp = MyDecisionTreeRegressor(
    max_depth=best_params["reg__max_depth"],
    min_samples_leaf=best_params["reg__min_samples_leaf"]
)

my_tree_imp.fit(X_train_np, y_train_np)
y_pred_my_imp = my_tree_imp.predict(X_test_np)

print("=== MyDecisionTreeRegressor Improved ===")
print("MAE: ", mean_absolute_error(y_test_np, y_pred_my_imp))
# sqrt(MSE) instead of `squared=False`, which was deprecated in scikit-learn 1.4 and
# removed in 1.6
print("RMSE:", np.sqrt(mean_squared_error(y_test_np, y_pred_my_imp)))
print("R²:  ", r2_score(y_test_np, y_pred_my_imp))


=== MyDecisionTreeRegressor Improved ===
MAE:  127670.55888795946
RMSE: 166859.54267157102
R²:   0.911259969118088


In [65]:
# Final comparison table: one row per model, columns MAE / RMSE / R²

import pandas as pd

# RMSE values are computed as sqrt(MSE); the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
comparison = pd.DataFrame([
    ["sklearn baseline", mae, rmse, r2],
    ["sklearn improved", mae_imp, rmse_imp, r2_imp],
    ["my baseline",
        mean_absolute_error(y_test_np, y_pred_my),
        np.sqrt(mean_squared_error(y_test_np, y_pred_my)),
        r2_score(y_test_np, y_pred_my)],
    ["my improved",
        mean_absolute_error(y_test_np, y_pred_my_imp),
        np.sqrt(mean_squared_error(y_test_np, y_pred_my_imp)),
        r2_score(y_test_np, y_pred_my_imp)]
],
columns=["model", "MAE", "RMSE", "R²"])

comparison


Unnamed: 0,model,MAE,RMSE,R²
0,sklearn baseline,127194.27322,184217.702565,0.891837
1,sklearn improved,127323.738043,164364.622947,0.913894
2,my baseline,133602.661063,170246.610422,0.907621
3,my improved,127670.558888,166859.542672,0.91126
