In [341]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-clean-lon/cleaned_data_reg.csv


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Модели и инструменты
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Метрики регрессии
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Модель для ЛР
from sklearn.linear_model import LinearRegression

pd.set_option('display.max_columns', None)


In [343]:
# Загружаем предварительно очищенный датасет
df = pd.read_csv("/kaggle/input/data-clean-lon/cleaned_data_reg.csv")

df.shape, df.head()

((28298, 9),
       Price Property Type  Bedrooms  Bathrooms        Size Postcode  \
 0  330000.0     Apartment       1.0        1.0  518.000000      E14   
 1  340000.0          Flat       1.0        1.0  887.498269      E14   
 2  340000.0     Apartment       1.0        1.0  934.569040      E14   
 3  340000.0          Flat       1.0        1.0  887.498269      E14   
 4  340000.0          Flat       1.0        1.0  388.000000     SW20   
 
             Area Price_Category  Area_Avg_Price  
 0        Eastern            Low    1.001684e+06  
 1        Eastern            Low    1.001684e+06  
 2        Eastern            Low    1.001684e+06  
 3        Eastern            Low    1.001684e+06  
 4  South Western            Low    1.516724e+06  )

In [344]:
# Проверяем пропуски и типы данных
print(df.info())
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28298 entries, 0 to 28297
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Price           28298 non-null  float64
 1   Property Type   28298 non-null  object 
 2   Bedrooms        28298 non-null  float64
 3   Bathrooms       28298 non-null  float64
 4   Size            28298 non-null  float64
 5   Postcode        28298 non-null  object 
 6   Area            28298 non-null  object 
 7   Price_Category  28298 non-null  object 
 8   Area_Avg_Price  28298 non-null  float64
dtypes: float64(5), object(4)
memory usage: 1.9+ MB
None


Price             0
Property Type     0
Bedrooms          0
Bathrooms         0
Size              0
Postcode          0
Area              0
Price_Category    0
Area_Avg_Price    0
dtype: int64

In [345]:
# Разделяем признаки и целевую переменную
X = df.drop("Price", axis=1)
y = df["Price"]

X.head()


Unnamed: 0,Property Type,Bedrooms,Bathrooms,Size,Postcode,Area,Price_Category,Area_Avg_Price
0,Apartment,1.0,1.0,518.0,E14,Eastern,Low,1001684.0
1,Flat,1.0,1.0,887.498269,E14,Eastern,Low,1001684.0
2,Apartment,1.0,1.0,934.56904,E14,Eastern,Low,1001684.0
3,Flat,1.0,1.0,887.498269,E14,Eastern,Low,1001684.0
4,Flat,1.0,1.0,388.0,SW20,South Western,Low,1516724.0


In [346]:
# Определяем типы признаков
numeric_features = ["Bedrooms", "Bathrooms", "Size", "Area_Avg_Price"]
categorical_features = ["Property Type", "Postcode", "Area", "Price_Category"]

numeric_features, categorical_features

(['Bedrooms', 'Bathrooms', 'Size', 'Area_Avg_Price'],
 ['Property Type', 'Postcode', 'Area', 'Price_Category'])

In [347]:
# Preprocessing: масштабирование + OneHotEncoding
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Pipeline: препроцессинг + линейная регрессия
model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("reg", LinearRegression())
])


In [348]:
# Делим датасет
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((22638, 8), (5660, 8))

In [349]:
# Обучаем модель
model.fit(X_train, y_train)

# Предсказания
y_pred = model.predict(X_test)

# Метрики
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("=== Baseline Linear Regression ===")
print("MAE: ", mae)
print("RMSE:", rmse)
print("R²:  ", r2)

=== Baseline Linear Regression ===
MAE:  127731.28857943449
RMSE: 164012.58765097606
R²:   0.9142622980478154


In [350]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Числовые и категориальные признаки
num = ["Bedrooms", "Bathrooms", "Size", "Area_Avg_Price"]
cat = ["Property Type", "Postcode", "Area", "Price_Category"]

# Новый препроцессор: добавляем PolynomialFeatures только для числовых фичей
preprocessor_poly = ColumnTransformer([
    ("poly", Pipeline([
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures(degree=2, include_bias=False))
    ]), num),
    
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat)
])

ridge_poly_pipe = Pipeline([
    ("prep", preprocessor_poly),
    ("reg", Ridge(random_state=42))
])

params_poly = {
    "reg__alpha": [0.1, 1, 3, 5, 10, 20, 50]
}

grid_poly = GridSearchCV(
    ridge_poly_pipe,
    params_poly,
    scoring="neg_mean_absolute_error",
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_poly.fit(X_train, y_train)


Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [351]:
# Обучаем модель и выводим метрики

best_poly = grid_poly.best_estimator_
pred_poly = best_poly.predict(X_test)

mae_p = mean_absolute_error(y_test, pred_poly)
rmse_p = mean_squared_error(y_test, pred_poly, squared=False)
r2_p = r2_score(y_test, pred_poly)

print("=== Ridge + PolynomialFeatures(degree=2) ===")
print("Best alpha:", grid_poly.best_params_["reg__alpha"])
print("MAE: ", mae_p)
print("RMSE:", rmse_p)
print("R²:  ", r2_p)


=== Ridge + PolynomialFeatures(degree=2) ===
Best alpha: 3
MAE:  126863.2582158791
RMSE: 162351.44414767486
R²:   0.9159902311291376


In [359]:
# Базовый препроцессор: стандартизация + OHE (dense)
preprocessor_base = ColumnTransformer([
    ("num", StandardScaler(), num),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat)
])

In [None]:
X_train_prep = preprocessor_base.fit_transform(X_train)
X_test_prep  = preprocessor_base.transform(X_test)

# Преобразуем в numpy массивы
X_train_prep = np.asarray(X_train_prep)
X_test_prep = np.asarray(X_test_prep)

# Проверка формы
X_train_prep.shape, X_test_prep.shape


((22638, 241), (5660, 241))

In [361]:
# Созадем самописную модель линейной регресии
class MyLinearRegression:
    def fit(self, X, y):
        # Добавляем столбец bias
        X_ = np.c_[np.ones(X.shape[0]), X]
        # нормальное уравнение
        self.w = np.linalg.pinv(X_.T @ X_) @ X_.T @ y
        return self
    
    def predict(self, X):
        X_ = np.c_[np.ones(X.shape[0]), X]
        return X_ @ self.w


In [362]:
#Обучаем модель и выводим показатели

my_lr = MyLinearRegression().fit(X_train_prep, y_train.values)
y_pred_my = my_lr.predict(X_test_prep)

mae_my = mean_absolute_error(y_test, y_pred_my)
rmse_my = mean_squared_error(y_test, y_pred_my, squared=False)
r2_my = r2_score(y_test, y_pred_my)

print("=== MyLinearRegression (Normal Equation) ===")
print("MAE:", mae_my)
print("RMSE:", rmse_my)
print("R²:", r2_my)


=== MyLinearRegression (Normal Equation) ===
MAE: 127731.03228531536
RMSE: 164012.33993344233
R²: 0.9142625570366691


In [None]:
# Числовые и категориальные списки
num = ["Bedrooms", "Bathrooms", "Size", "Area_Avg_Price"]
cat = ["Property Type", "Postcode", "Area", "Price_Category"]

# 1) Стандартизируем числовые признаки и делаем полиномы degree=2
scaler_num = StandardScaler().fit(X_train[num])

X_num_train_scaled = scaler_num.transform(X_train[num])
X_num_test_scaled  = scaler_num.transform(X_test[num])

poly = PolynomialFeatures(degree=2, include_bias=False)
X_num_train_poly = poly.fit_transform(X_num_train_scaled)
X_num_test_poly  = poly.transform(X_num_test_scaled)

# 2) OneHotEncoder для категориальных (dense)
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_cat_train = ohe.fit_transform(X_train[cat])
X_cat_test  = ohe.transform(X_test[cat])

# 3) Склеиваем числовые (полиномы) + категориальные
X_train_my = np.hstack([X_num_train_poly, X_cat_train])
X_test_my  = np.hstack([X_num_test_poly,  X_cat_test])

# Проверка форм
print("X_train_my.shape:", X_train_my.shape)
print("X_test_my.shape :", X_test_my.shape)


X_train_my.shape: (22638, 251)
X_test_my.shape : (5660, 251)


In [None]:
# Самописная линейная регрессия
class MyLinearRegression:
    def fit(self, X, y):
        X_ = np.c_[np.ones(X.shape[0]), X]
        self.w = np.linalg.pinv(X_.T @ X_) @ X_.T @ y
        return self
    def predict(self, X):
        X_ = np.c_[np.ones(X.shape[0]), X]
        return X_ @ self.w

# Самописная Ridge (closed-form с регуляризацией)
class MyRidge:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
    def fit(self, X, y):
        X_ = np.c_[np.ones(X.shape[0]), X]
        n = X_.shape[1]
        A = X_.T @ X_ + self.alpha * np.eye(n)
        self.w = np.linalg.pinv(A) @ X_.T @ y
        return self
    def predict(self, X):
        X_ = np.c_[np.ones(X.shape[0]), X]
        return X_ @ self.w


In [365]:
# Обучаем MyLinearRegression на X_train_my
my_lr_poly = MyLinearRegression().fit(X_train_my, y_train.values)
pred_my_lr_poly = my_lr_poly.predict(X_test_my)

mae_my_lr_poly = mean_absolute_error(y_test, pred_my_lr_poly)
rmse_my_lr_poly = mean_squared_error(y_test, pred_my_lr_poly, squared=False)
r2_my_lr_poly = r2_score(y_test, pred_my_lr_poly)

print("=== MyLinearRegression + Poly ===")
print("MAE:", mae_my_lr_poly)
print("RMSE:", rmse_my_lr_poly)
print("R²:", r2_my_lr_poly)


=== MyLinearRegression + Poly ===
MAE: 126844.56413340954
RMSE: 162410.04711144205
R²: 0.9159295712438008


In [366]:
# Быстрый перебор alpha для MyRidge
alphas = [0.1, 1, 3, 5, 10, 20]
best_alpha = None
best_mae = np.inf
best_model = None

for a in alphas:
    model = MyRidge(alpha=a).fit(X_train_my, y_train.values)
    pred = model.predict(X_test_my)
    mae_tmp = mean_absolute_error(y_test, pred)
    if mae_tmp < best_mae:
        best_mae = mae_tmp
        best_alpha = a
        best_model = model

# Оцениваем лучший MyRidge
pred_best_ridge_my = best_model.predict(X_test_my)
mae_r_my = mean_absolute_error(y_test, pred_best_ridge_my)
rmse_r_my = mean_squared_error(y_test, pred_best_ridge_my, squared=False)
r2_r_my = r2_score(y_test, pred_best_ridge_my)

print("=== MyRidge + Poly (manual alpha search) ===")
print("Best alpha:", best_alpha)
print("MAE:", mae_r_my)
print("RMSE:", rmse_r_my)
print("R²:", r2_r_my)


=== MyRidge + Poly (manual alpha search) ===
Best alpha: 1
MAE: 126831.05205174189
RMSE: 162365.59138898127
R²: 0.9159755893344612


In [None]:
comparison = pd.DataFrame({
    "model": ["Baseline LR (sklearn)", "Ridge+Poly (sklearn)",
              "MyLinear + Poly", "MyRidge + Poly"],
    "MAE": [mae, mae_p, mae_my_lr_poly, mae_r_my],
    "RMSE": [rmse, rmse_p, rmse_my_lr_poly, rmse_r_my],
    "R2": [r2, r2_p, r2_my_lr_poly, r2_r_my]
})
display(comparison)


Unnamed: 0,model,MAE,RMSE,R2
0,Baseline LR (sklearn),127731.288579,164012.587651,0.914262
1,Ridge+Poly (sklearn),126863.258216,162351.444148,0.91599
2,MyLinear + Poly,126844.564133,162410.047111,0.91593
3,MyRidge + Poly,126831.052052,162365.591389,0.915976
