In [1]:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [9]:
DATA_PATH = "../data/data.csv"

df = pd.read_csv(DATA_PATH)

df['date'] = pd.to_datetime(df['date'])

df['sale_year'] = df['date'].dt.year
df['sale_month'] = df['date'].dt.month

df = df.drop(columns=['date'])

# Обрабатываем пропуски
df['condition'] = df['condition'].fillna(df['condition'].median())
df['yr_built'] = df['yr_built'].fillna(df['yr_built'].median())
df['city'] = df['city'].fillna('unknown')

# Более удобные признаки
df['house_age'] = df['sale_year'] - df['yr_built']
df = df.drop(columns=['sale_year'])
df['renovated'] = (df['yr_renovated'] > 0).astype(int)
df = df.drop(columns=['yr_built', 'yr_renovated'])

# Удаляем не используемые признаки
df = df.drop(columns=['street', 'statezip', 'city', 'country'])

# Логарифмируем (log1p = log(x+1), безопасная операция с нулями)
for col in ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']:
    df[col] = np.log1p(df[col])

# Доп параметры
df['sqft_living_sq'] = df['sqft_living'] ** 2
df['bath_bed_ratio'] = df['bathrooms'] / df['bedrooms']
df = df.dropna(subset=['bath_bed_ratio'])
df['total_sqft'] = df['sqft_living'] + df['sqft_basement']
df['bath_bed_ratio'] = df['bathrooms'] / (df['bedrooms'] + 0.01)
df['living_lot_ratio'] = df['sqft_living'] / (df['sqft_lot'] + 0.01)
df['has_basement'] = (df['sqft_basement'] > 0).astype(int)
df['price_per_sqft'] = df['price'] / np.expm1(df['sqft_living'])
df['bedroom_ratio'] = df['bedrooms'] / (df['bedrooms'] + df['bathrooms'] + 0.01)
df['luxury_score'] = df['view'] * df['waterfront'] * df['condition']
df['total_rooms'] = df['bedrooms'] + df['bathrooms']
df['season'] = ((df['sale_month'] % 12 + 3) // 3).astype(int)
df['living_times_view'] = df['sqft_living'] * df['view']
df['age_times_condition'] = df['house_age'] * df['condition']
df['waterfront_view'] = df['waterfront'] * df['view']

Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.7 * IQR
upper_bound = Q3 + 1.7 * IQR
df = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]

y = df['price']
X = df.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=pd.cut(y, bins=9)
)
model = LinearRegression()
model.fit(X_train, y_train)

pred = model.predict(X_test)

print("R2:", r2_score(y_test, pred))
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

R2: 0.9115848643923246
RMSE: 67603.80999529724
