In [50]:
!pip -q install kagglehub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [52]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

import kagglehub

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [54]:
DATASET_NAME = "kevinnadar22/mumbai-house-price-data-70k-entries"

data_dir = kagglehub.dataset_download(DATASET_NAME)
csv_path = next(
    (os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".csv")),
    None,
)

if csv_path is None:
    raise FileNotFoundError("CSV не найден!")

df = pd.read_csv(csv_path)

In [56]:
df = df.drop_duplicates()
df["log_price"] = np.log1p(df["price"])

target = "log_price"
features = [
    "area",
    "bedroom_num",
    "bathroom_num",
    "balcony_num",
    "age",
    "total_floors",
    "property_type",
    "furnished",
    "locality",
]

X = df[features]
y = df[target]

In [58]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((41413, 9), (10354, 9), (41413,), (10354,))

In [60]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=0.2, random_state=42
)

assert (X_train.index == X_train2.index).all()
assert (X_test.index == X_test2.index).all()
print("Данные воспроизводимы")

Данные воспроизводимы


In [64]:
mean_value = float(np.mean(y_train))
median_value = float(np.median(y_train))

y_pred_mean = np.full(shape=y_test.shape, fill_value=mean_value, dtype=float)
y_pred_median = np.full(shape=y_test.shape, fill_value=median_value, dtype=float)

rmse_mean = mean_squared_error(y_test, y_pred_mean) ** 0.5
rmse_median = mean_squared_error(y_test, y_pred_median) ** 0.5

mean_price_inr = float(np.expm1(mean_value))
median_price_inr = float(np.expm1(median_value))

print(f"Базовая модель (среднее): RMSE = {rmse_mean:.4f}")
print(f"Базовая модель (медиана): RMSE = {rmse_median:.4f}")
print(f"Средняя цена в выборке: {mean_price_inr:,.0f} ₨")
print(f"Медианная цена в выборке: {median_price_inr:,.0f} ₨")

Базовая модель (среднее): RMSE = 0.9203
Базовая модель (медиана): RMSE = 0.9207
Средняя цена в выборке: 12,465,149 ₨
Медианная цена в выборке: 12,000,339 ₨


In [71]:
numeric_features = [
    "area",
    "bedroom_num",
    "bathroom_num",
    "balcony_num",
    "age",
    "total_floors",
]
categorical_features = ["property_type", "furnished", "locality"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

model.fit(X_train, y_train)

y_pred_log = model.predict(X_test)
rmse_log = mean_squared_error(y_test, y_pred_log) ** 0.5
mae_log = mean_absolute_error(y_test, y_pred_log)
r2 = r2_score(y_test, y_pred_log)

y_test_inr = np.expm1(y_test)
y_pred_inr = np.expm1(y_pred_log)
rmse_inr = mean_squared_error(y_test_inr, y_pred_inr) ** 0.5
mae_inr = mean_absolute_error(y_test_inr, y_pred_inr)

print(f"Ошибка RMSE (в логарифмах): {rmse_log:.4f}")
print(f"Средняя абсолютная ошибка MAE (в логарифмах): {mae_log:.4f}")
print(f"Коэффициент детерминации R²: {r2:.4f}")
print(f"Ошибка RMSE (в рупиях): {rmse_inr:,.0f} ₨")
print(f"Средняя абсолютная ошибка MAE (в рупиях): {mae_inr:,.0f} ₨")

Ошибка RMSE (в логарифмах): 0.3087
Средняя абсолютная ошибка MAE (в логарифмах): 0.2217
Коэффициент детерминации R²: 0.8875
Ошибка RMSE (в рупиях): 25,211,910 ₨
Средняя абсолютная ошибка MAE (в рупиях): 5,390,856 ₨
