In [99]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-reg/cleaned_data_reg.csv


In [None]:
#Импортируем основные библиотеки
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [101]:
# Загружаем предварительно очищенный датасет
df = pd.read_csv("/kaggle/input/data-reg/cleaned_data_reg.csv")

df.shape, df.head()

((28298, 9),
       Price Property Type  Bedrooms  Bathrooms        Size Postcode  \
 0  330000.0     Apartment       1.0        1.0  518.000000      E14   
 1  340000.0          Flat       1.0        1.0  887.498269      E14   
 2  340000.0     Apartment       1.0        1.0  934.569040      E14   
 3  340000.0          Flat       1.0        1.0  887.498269      E14   
 4  340000.0          Flat       1.0        1.0  388.000000     SW20   
 
             Area Price_Category  Area_Avg_Price  
 0        Eastern            Low    1.001684e+06  
 1        Eastern            Low    1.001684e+06  
 2        Eastern            Low    1.001684e+06  
 3        Eastern            Low    1.001684e+06  
 4  South Western            Low    1.516724e+06  )

In [None]:
X = df.drop("Price", axis=1)
y = df["Price"]

# 1. признаки категориальные
cat_cols = ["Property Type", "Postcode", "Area", "Price_Category"]
num_cols = [c for c in X.columns if c not in cat_cols]

# 2. Трансформеры
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)

# 3. Модель + pipeline
baseline_model = Pipeline([
    ("preprocess", preprocess),
    ("gb", GradientBoostingRegressor(random_state=42))
])

# 4. Делим датасет на тренировочный и тестовый
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [103]:
# 5. Обучение
baseline_model.fit(X_train, y_train)

y_pred = baseline_model.predict(X_test)


In [104]:
# 6. Метрики
baseline_metrics = {
    "MAE": mean_absolute_error(y_test, y_pred),
    "RMSE": mean_squared_error(y_test, y_pred, squared=False),
    "R2": r2_score(y_test, y_pred)
}

baseline_metrics

{'MAE': 128105.35928235197,
 'RMSE': 163470.67885906852,
 'R2': 0.9148279285072275}

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# 0. FE
df_fe = df.copy()
df_fe["Size_per_Bedroom"] = df_fe["Size"] / (df_fe["Bedrooms"] + 1)
df_fe["Postcode_freq"] = df_fe["Postcode"].map(df_fe["Postcode"].value_counts())
df_fe["BedBath"] = df_fe["Bedrooms"] * df_fe["Bathrooms"]
df_fe["Area_Rank"] = df_fe["Area"].map(df_fe["Area"].value_counts(normalize=True))

df_fe["Price_log"] = np.log1p(df_fe["Price"])

price_thr = np.percentile(df["Price"], 75)
df_fe["IsLuxury"] = (df_fe["Price"] >= price_thr).astype(int)

# 1. X, y
X = df_fe.drop(["Price", "Price_log"], axis=1)
y_reg = df_fe["Price_log"]
y_router = df_fe["IsLuxury"]

cat_cols = ["Property Type", "Postcode", "Area", "Price_Category"]
num_cols = [c for c in X.columns if c not in cat_cols]

# 2. Делим датасет на тренировочный и тестовый
X_train, X_test, y_reg_train, y_reg_test, y_router_train, y_router_test = train_test_split(
    X, y_reg, y_router, test_size=0.2, random_state=42
)

# 3. OneHotEncoder обучаем только на тренировочной выборке
encoder = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", "passthrough", num_cols)
])

encoder.fit(X_train)

# Преобразование данных
X_train_enc = encoder.transform(X_train)
X_test_enc  = encoder.transform(X_test)

# 4. Модель-роутер (классификатор: дешёвое/дорогое жильё)
router_clf = GradientBoostingClassifier(random_state=42)
router_clf.fit(X_train_enc, y_router_train)

# 5. Отдельные модели для дешёвого и дорогого сегмента
cheap_idx = (y_router_train == 0).values
lux_idx   = (y_router_train == 1).values

model_cheap = GradientBoostingRegressor(random_state=42)
model_cheap.fit(X_train_enc[cheap_idx], y_reg_train[cheap_idx])

model_lux = GradientBoostingRegressor(random_state=42)
model_lux.fit(X_train_enc[lux_idx], y_reg_train[lux_idx])

# 6. Предсказание
router_pred = router_clf.predict(X_test_enc)

n = X_test_enc.shape[0]
pred_log = np.zeros(n)

pred_log[router_pred == 0] = model_cheap.predict(X_test_enc[router_pred == 0])
pred_log[router_pred == 1] = model_lux.predict(X_test_enc[router_pred == 1])

# Обратное преобразование логарифма
pred_price   = np.expm1(pred_log)
y_price_test = np.expm1(y_reg_test)

final_metrics = {
    "MAE": mean_absolute_error(y_price_test, pred_price),
    "RMSE": mean_squared_error(y_price_test, pred_price, squared=False),
    "R2": r2_score(y_price_test, pred_price)
}

final_metrics


{'MAE': 110468.4321167536,
 'RMSE': 147511.02843516364,
 'R2': 0.9306468071974487}

In [None]:
import numpy as np

class SimpleTreeRegressor:

    # простое решающее дерево для регрессии
    def __init__(self, max_depth=3, min_samples_split=10):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.feature = None
        self.threshold = None
        self.left = None
        self.right = None
        self.value = None
        
    def fit(self, X, y, depth=0):
        if depth >= self.max_depth or len(y) < self.min_samples_split:
            self.value = np.mean(y)
            return
        
        best_feat, best_thr, best_loss = None, None, float("inf")
        
        # Перебор всех признаков и порогов
        for feat in range(X.shape[1]):
            values = np.unique(X[:, feat])
            for thr in values:
                left = X[:, feat] <= thr
                right = ~left
                if left.sum() == 0 or right.sum() == 0:
                    continue
                    
                loss = ((y[left] - y[left].mean())**2).sum() + \
                       ((y[right] - y[right].mean())**2).sum()
                
                if loss < best_loss:
                    best_feat = feat
                    best_thr = thr
                    best_loss = loss
        
        if best_feat is None:
            self.value = np.mean(y)
            return
        
        self.feature = best_feat
        self.threshold = best_thr
        
        left = X[:, best_feat] <= best_thr
        right = ~left
        
        self.left = SimpleTreeRegressor(self.max_depth, self.min_samples_split)
        self.left.fit(X[left], y[left], depth+1)
        
        self.right = SimpleTreeRegressor(self.max_depth, self.min_samples_split)
        self.right.fit(X[right], y[right], depth+1)
    
    def predict_row(self, x):
        if self.value is not None:
            return self.value
        
        if x[self.feature] <= self.threshold:
            return self.left.predict_row(x)
        else:
            return self.right.predict_row(x)
            
    def predict(self, X):
        return np.array([self.predict_row(row) for row in X])


In [None]:
class SimpleGBR:
    def __init__(self, n_estimators=50, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.init_value = None
        
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        
        # Начальное значение - среднее по таргету
        self.init_value = y.mean()
        prediction = np.ones(len(y)) * self.init_value
        
        for _ in range(self.n_estimators):
            # Остатки
            residual = y - prediction
            
            # Обучаем дерево на остатках
            tree = SimpleTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            
            self.trees.append(tree)

            # Обновляем предсказание
            prediction += self.learning_rate * tree.predict(X)
    
    def predict(self, X):
        X = np.array(X)
        pred = np.ones(X.shape[0]) * self.init_value
        
        for tree in self.trees:
            pred += self.learning_rate * tree.predict(X)
        
        return pred


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df_enc = df.copy()

# низкая кардинальность - оставляем как LabelEncode
for col in ["Property Type", "Area", "Price_Category"]:
    df_enc[col] = df_enc[col].astype("category").cat.codes

# высокая кардинальность - превращаем в частоту
df_enc["Postcode"] = df_enc["Postcode"].map(df_enc["Postcode"].value_counts())

# лог-таргет
df_enc["Price_log"] = np.log1p(df_enc["Price"])

# 1) X/y
X = df_enc.drop(["Price", "Price_log"], axis=1).values
y = df_enc["Price_log"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2) Самописный Gradient Boosting - ускоренная версия
model_base = SimpleGBR(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=2
)

model_base.fit(X_train, y_train)

pred_log = model_base.predict(X_test)
pred = np.expm1(pred_log)

# 3) Метрики
self_metrics = {
    "MAE": mean_absolute_error(np.expm1(y_test), pred),
    "RMSE": mean_squared_error(np.expm1(y_test), pred, squared=False),
    "R2": r2_score(np.expm1(y_test), pred)
}

self_metrics


{'MAE': 141645.68059704316,
 'RMSE': 201097.25223842415,
 'R2': 0.8711068757670604}

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1) FE
df_fe = df.copy()

# 1. Плотность комнат
df_fe["Size_per_Bedroom"] = df_fe["Size"] / (df_fe["Bedrooms"] + 1)

# 2. Частота postcode
df_fe["Postcode_freq"] = df_fe["Postcode"].map(df_fe["Postcode"].value_counts())

# 3. Комнаты * ванны
df_fe["BedBath"] = df_fe["Bedrooms"] * df_fe["Bathrooms"]

# 4. Rank района (относительная популярность)
df_fe["Area_Rank"] = df_fe["Area"].map(df_fe["Area"].value_counts(normalize=True))

# 5. Лог-таргет
df_fe["Price_log"] = np.log1p(df_fe["Price"])

# 2) Encoding категориальных
df_fe["Property Type"] = df_fe["Property Type"].astype("category").cat.codes
df_fe["Area"]          = df_fe["Area"].astype("category").cat.codes
df_fe["Price_Category"] = df_fe["Price_Category"].astype("category").cat.codes

# Postcode уже заменили на частоту

# 3) X / y
X = df_fe.drop(["Price", "Price_log"], axis=1).values
y = df_fe["Price_log"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4) Самописный GB - оптимальные гиперы
model_fe = SimpleGBR(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
)

model_fe.fit(X_train, y_train)

pred_log = model_fe.predict(X_test)
pred = np.expm1(pred_log)

# 5) Метрики
metrics_fe = {
    "MAE": mean_absolute_error(np.expm1(y_test), pred),
    "RMSE": mean_squared_error(np.expm1(y_test), pred, squared=False),
    "R2": r2_score(np.expm1(y_test), pred)
}

metrics_fe


{'MAE': 126373.23926483991,
 'RMSE': 168884.4967511176,
 'R2': 0.9090930586484427}