In [21]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-longon/cleaned_data_reg.csv


In [22]:
# Загружаем предварительно очищенный датасет
df = pd.read_csv("/kaggle/input/data-longon/cleaned_data_reg.csv")

df.shape, df.head()

((28298, 9),
       Price Property Type  Bedrooms  Bathrooms        Size Postcode  \
 0  330000.0     Apartment       1.0        1.0  518.000000      E14   
 1  340000.0          Flat       1.0        1.0  887.498269      E14   
 2  340000.0     Apartment       1.0        1.0  934.569040      E14   
 3  340000.0          Flat       1.0        1.0  887.498269      E14   
 4  340000.0          Flat       1.0        1.0  388.000000     SW20   
 
             Area Price_Category  Area_Avg_Price  
 0        Eastern            Low    1.001684e+06  
 1        Eastern            Low    1.001684e+06  
 2        Eastern            Low    1.001684e+06  
 3        Eastern            Low    1.001684e+06  
 4  South Western            Low    1.516724e+06  )

In [23]:
#средняя цена по категории
df_te = df.copy()

categorical_features = ["Property Type", "Postcode", "Area", "Price_Category"]

for col in categorical_features:
    mapping = df_te.groupby(col)["Price"].mean()
    df_te[col] = df_te[col].map(mapping)
df_te.head()

Unnamed: 0,Price,Property Type,Bedrooms,Bathrooms,Size,Postcode,Area,Price_Category,Area_Avg_Price
0,330000.0,849484.941579,1.0,1.0,518.0,783413.164846,749084.506424,347077.129766,1001684.0
1,340000.0,716960.071821,1.0,1.0,887.498269,783413.164846,749084.506424,347077.129766,1001684.0
2,340000.0,849484.941579,1.0,1.0,934.56904,783413.164846,749084.506424,347077.129766,1001684.0
3,340000.0,716960.071821,1.0,1.0,887.498269,783413.164846,749084.506424,347077.129766,1001684.0
4,340000.0,716960.071821,1.0,1.0,388.0,826569.523438,996273.742012,347077.129766,1516724.0


In [24]:
#Разделяем признаки и целевую переменную
X = df_te.drop("Price", axis=1)
y = df_te["Price"]


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((22638, 8), (5660, 8))

In [26]:
#Baseline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

baseline_rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

baseline_rf.fit(X_train, y_train)
y_pred_base = baseline_rf.predict(X_test)

baseline_results = {
    "MAE": mean_absolute_error(y_test, y_pred_base),
    "RMSE": mean_squared_error(y_test, y_pred_base, squared=False),
    "R2": r2_score(y_test, y_pred_base)
}

baseline_results


{'MAE': 113219.25383653244,
 'RMSE': 154179.6311062534,
 'R2': 0.9242345020078285}

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X_small = X_train.sample(frac=0.3, random_state=42)
y_small = y_train.loc[X_small.index]
# Признаки
cat = ["Property Type", "Postcode", "Area", "Price_Category"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat)
], remainder="passthrough")   # числовые признаки оставляем как есть

pipe = Pipeline([
    ("prep", preprocessor),
    ("reg", RandomForestRegressor(random_state=42, n_jobs=-1))
])

params = {
    "reg__n_estimators": [1000, 1500],
    "reg__max_depth": [20, 30],
    "reg__min_samples_leaf": [1, 2],
    "reg__max_features": ["sqrt", "log2"],
}

grid_rf = GridSearchCV(
    pipe,
    param_grid=params,
    scoring="neg_mean_squared_error",
    cv=2,
    n_jobs=-1,
    verbose=1
)

grid_rf.fit(X_small, y_small)
grid_rf.best_params_

Fitting 2 folds for each of 16 candidates, totalling 32 fits


{'reg__max_depth': 20,
 'reg__max_features': 'sqrt',
 'reg__min_samples_leaf': 1,
 'reg__n_estimators': 1000}

In [28]:
best = grid_rf.best_params_

improved_rf = RandomForestRegressor(
    n_estimators=best["reg__n_estimators"],
    max_depth=best["reg__max_depth"],
    min_samples_leaf=best["reg__min_samples_leaf"],
    max_features=best["reg__max_features"],
    max_samples=0.8,
    random_state=42,
    n_jobs=-1
)

improved_rf.fit(X_train, y_train)
y_pred_imp = improved_rf.predict(X_test)

improved_results_new = {
    "MAE": mean_absolute_error(y_test, y_pred_imp),
    "RMSE": mean_squared_error(y_test, y_pred_imp, squared=False),
    "R2": r2_score(y_test, y_pred_imp)
}

improved_results_new


{'MAE': 112415.8760942382,
 'RMSE': 152559.18517596484,
 'R2': 0.9258187412231473}

In [29]:
import pandas as pd

comparison = pd.DataFrame({
    "Metric": ["MAE", "RMSE", "R2"],
    "Baseline": [baseline_results["MAE"], baseline_results["RMSE"], baseline_results["R2"]],
    "Improved_new": [improved_results_new["MAE"], improved_results_new["RMSE"], improved_results_new["R2"]],
})

comparison


Unnamed: 0,Metric,Baseline,Improved_new
0,MAE,113219.253837,112415.876094
1,RMSE,154179.631106,152559.185176
2,R2,0.924235,0.925819


In [30]:
# Самописное DecisionTreeRegressor
import numpy as np

class MyDecisionTreeRegressor:
    def __init__(self, max_depth=None, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def _mse(self, y):
        if len(y) == 0:
            return 0
        return np.mean((y - np.mean(y)) ** 2)

    def _best_split(self, X, y):
        best_feat, best_thresh = None, None
        best_mse = 1e18

        n_samples, n_features = X.shape

        for feat in range(n_features):
            thresholds = np.unique(X[:, feat])

            for t in thresholds:
                left_mask = X[:, feat] <= t
                right_mask = ~left_mask

                if left_mask.sum() < self.min_samples_leaf: 
                    continue
                if right_mask.sum() < self.min_samples_leaf: 
                    continue

                mse_left = self._mse(y[left_mask])
                mse_right = self._mse(y[right_mask])
                mse = (mse_left * left_mask.sum() + mse_right * right_mask.sum()) / n_samples

                if mse < best_mse:
                    best_mse = mse
                    best_feat = feat
                    best_thresh = t

        return best_feat, best_thresh

    def _build_tree(self, X, y, depth):
        if len(y) <= self.min_samples_leaf:
            return {"leaf": True, "value": np.mean(y)}

        if self.max_depth is not None and depth >= self.max_depth:
            return {"leaf": True, "value": np.mean(y)}

        feat, thresh = self._best_split(X, y)

        if feat is None:
            return {"leaf": True, "value": np.mean(y)}

        left_mask = X[:, feat] <= thresh
        right_mask = ~left_mask

        return {
            "leaf": False,
            "feature": feat,
            "threshold": thresh,
            "left": self._build_tree(X[left_mask], y[left_mask], depth + 1),
            "right": self._build_tree(X[right_mask], y[right_mask], depth + 1)
        }

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, 0)
        return self

    def _predict_one(self, node, x):
        if node["leaf"]:
            return node["value"]

        if x[node["feature"]] <= node["threshold"]:
            return self._predict_one(node["left"], x)
        else:
            return self._predict_one(node["right"], x)

    def predict(self, X):
        return np.array([self._predict_one(self.tree, x) for x in X])


In [31]:
#самописный RandomForestRegressor
import numpy as np

class MyRandomForestRegressor:
    def __init__(self, 
                 n_estimators=10, 
                 max_depth=None, 
                 min_samples_leaf=1,
                 max_features="sqrt",
                 sample_fraction=0.8,
                 random_state=42):
        
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.sample_fraction = sample_fraction
        self.random_state = random_state
        
        self.trees = []
        self.feature_indices = []

    def _get_feature_indices(self, n_features):
        if self.max_features == "sqrt":
            k = int(np.sqrt(n_features))
        elif self.max_features == "log2":
            k = int(np.log2(n_features))
        else:
            k = n_features
        return np.random.choice(n_features, k, replace=False)

    def fit(self, X, y):
        np.random.seed(self.random_state)
        n_samples, n_features = X.shape

        self.trees = []
        self.feature_indices = []

        for i in range(self.n_estimators):
            # bootstrap выборка
            indices = np.random.choice(n_samples, 
                                       int(self.sample_fraction * n_samples),
                                       replace=True)
            X_boot = X[indices]
            y_boot = y[indices]

            # подвыбор признаков
            feat_idx = self._get_feature_indices(n_features)
            self.feature_indices.append(feat_idx)

            tree = MyDecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf
            )

            tree.fit(X_boot[:, feat_idx], y_boot)
            self.trees.append(tree)

        return self

    def predict(self, X):
        preds = []
        for tree, feat_idx in zip(self.trees, self.feature_indices):
            preds.append(tree.predict(X[:, feat_idx]))
        return np.mean(preds, axis=0)


In [32]:
# Перевод в numpy
def to_dense(X):
    try:
        return X.toarray()     # если X sparse
    except:
        return np.asarray(X)   # если X обычный

X_train_np = to_dense(X_train)
X_test_np  = to_dense(X_test)
y_train_np = np.asarray(y_train)
y_test_np  = np.asarray(y_test)

X_train_np.shape, X_test_np.shape

((22638, 8), (5660, 8))

In [33]:
# Baseline self-made RandomForest
my_rf = MyRandomForestRegressor(
    n_estimators=20,
    max_depth=10,
    min_samples_leaf=2,
    max_features="sqrt",
    sample_fraction=0.8,
    random_state=42
)

my_rf.fit(X_train_np, y_train_np)
y_pred_my_rf = my_rf.predict(X_test_np)

print("=== MyRandomForestRegressor Baseline ===")
print("MAE: ", mean_absolute_error(y_test_np, y_pred_my_rf))
print("RMSE:", mean_squared_error(y_test_np, y_pred_my_rf, squared=False))
print("R²:  ", r2_score(y_test_np, y_pred_my_rf))


=== MyRandomForestRegressor Baseline ===
MAE:  224197.65098652637
RMSE: 319867.99886604334
R²:   0.673893793574321


In [34]:
#Улучшенная самописная версия
best = grid_rf.best_params_

my_rf_imp = MyRandomForestRegressor(
    n_estimators=best["reg__n_estimators"],
    max_depth=best["reg__max_depth"],
    min_samples_leaf=best["reg__min_samples_leaf"],
    max_features=best["reg__max_features"],
    sample_fraction=0.8,
    random_state=42
)

my_rf_imp.fit(X_train_np, y_train_np)
y_pred_my_rf_imp = my_rf_imp.predict(X_test_np)

print("=== MyRandomForestRegressor Improved ===")
print("MAE: ", mean_absolute_error(y_test_np, y_pred_my_rf_imp))
print("RMSE:", mean_squared_error(y_test_np, y_pred_my_rf_imp, squared=False))
print("R²:  ", r2_score(y_test_np, y_pred_my_rf_imp))


=== MyRandomForestRegressor Improved ===
MAE:  219021.43383434115
RMSE: 310642.60379269195
R²:   0.6924331581708916


In [36]:
#Финальная таблица
comparison = pd.DataFrame([
    ["sklearn baseline", baseline_results["MAE"], baseline_results["RMSE"], baseline_results["R2"]],
    ["sklearn improved", improved_results_new["MAE"], improved_results_new["RMSE"], improved_results_new["R2"]],
    ["my baseline",
        mean_absolute_error(y_test_np, y_pred_my_rf),
        mean_squared_error(y_test_np, y_pred_my_rf, squared=False),
        r2_score(y_test_np, y_pred_my_rf)],
    ["my improved",
        mean_absolute_error(y_test_np, y_pred_my_rf_imp),
        mean_squared_error(y_test_np, y_pred_my_rf_imp, squared=False),
        r2_score(y_test_np, y_pred_my_rf_imp)]
],
columns=["model", "MAE", "RMSE", "R²"])

comparison


Unnamed: 0,model,MAE,RMSE,R²
0,sklearn baseline,113219.253837,154179.631106,0.924235
1,sklearn improved,112415.876094,152559.185176,0.925819
2,my baseline,224197.650987,319867.998866,0.673894
3,my improved,219021.433834,310642.603793,0.692433
