## 1. Импорт пакетов и файла

In [1]:
%matplotlib inline
import warnings

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
# IterativeImputer is experimental: this enabling import must run before it.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

In [2]:
# Load the raw listings table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='real_estate_data.csv')

In [3]:
# Keep only sale listings (listing_type == 1); rental listings are excluded.
is_sale = data['listing_type'] == 1
data = data[is_sale]

In [4]:
# Preprocessing: build the target, engineer features, encode categoricals,
# split, then impute/scale using statistics learned from the training rows only.

# Work on an explicit copy so the column assignments below never write into a
# view of a previously filtered frame.
data = data.copy()

# Convert every price to TRY *before* filtering: the outlier filter reads
# `price_try`, so the column has to exist first.
exchange_rates = {'TRY': 1, 'USD': 34.0, 'EUR': 37.5, 'GBP': 44.5}  # Rates as of 07.09.2025
# Unknown or missing currencies fall back to a rate of 1 (treated as TRY).
data['price_try'] = data['price'] * data['price_currency'].map(exchange_rates).fillna(1)

# Remove outliers: very large or very expensive listings.
data = data[(data['size'] < 1000) & (data['price_try'] < 1e8)].copy()


def _count_rooms(room_count):
    """Return the total room count of an 'N+M' string, or 0 if it cannot be parsed."""
    if not isinstance(room_count, str):
        return 0
    try:
        return sum(int(part) for part in room_count.split('+'))
    except ValueError:
        # Non-integer tokens are treated like a missing value.
        return 0


# New features
address_parts = data['address'].str.split('/')
data['city'] = address_parts.str[0]
data['county'] = address_parts.str[1]
# Kept for analysis only: it is computed from the target, so it is excluded
# from the feature matrix below to avoid target leakage.
data['price_per_m2'] = data['price_try'] / data['size']
data['rooms_total'] = data['room_count'].apply(_count_rooms)
# `% 12` folds December in with January/February so the result is 1..4
# (1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov).
start_month = pd.to_datetime(data['start_date'], format='%m/%d/%y').dt.month
data['season'] = start_month % 12 // 3 + 1

# Log-transform the target
y = np.log1p(data['price_try'])

# Label-encode categorical columns; empty/missing values become 'Unknown'
categorical_cols = ['type', 'sub_type', 'heating_type', 'furnished', 'city', 'county']
for col in categorical_cols:
    data[col] = data[col].replace('', np.nan).fillna('Unknown')
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

# Feature matrix: drop the target, anything derived from it, and raw columns
# that have been replaced by engineered features.
excluded_cols = [
    'price_try', 'price_per_m2',  # target and a feature computed from it
    'room_count',                 # raw string, replaced by rooms_total
    'end_date', 'start_date', 'address', 'price', 'price_currency', 'id',
]
X = data.drop(excluded_cols, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = X_train.copy(), X_test.copy()

# Imputation: fit on the training rows only, then apply the same fitted
# imputer to every frame so no test-set statistics leak into training.
impute_cols = ['size', 'building_age', 'floor_no']
imputer = IterativeImputer()
imputer.fit(X_train[impute_cols])
for frame in (X, X_train, X_test):
    frame[impute_cols] = imputer.transform(frame[impute_cols])

# Standardise numeric features, again with training-set statistics only
numeric_cols = ['size', 'tom', 'building_age', 'total_floor_count', 'floor_no', 'rooms_total']
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
for frame in (X, X_train, X_test):
    frame[numeric_cols] = scaler.transform(frame[numeric_cols])

In [5]:
# XGBoost hyperparameter search with Optuna
def objective(trial):
    """Return the cross-validated MSE of an XGBoost model for one Optuna trial.

    The score is computed with 3-fold cross-validation on the training data
    only. Scoring trials on X_test would tune the hyperparameters on the same
    hold-out set that is later used to report final metrics.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 1.0)
    }
    model = XGBRegressor(**params, random_state=42)
    # scikit-learn scorers are "higher is better", so MSE comes back negated.
    neg_mse = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
    return float(-neg_mse.mean())


# Seed the sampler so repeated runs explore the same trial sequence.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=20)
xgb_params = study.best_params
xgb_model = XGBRegressor(**xgb_params, random_state=42)

In [6]:
# Build the base learners
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)
xgb_model = XGBRegressor(**xgb_params, random_state=42)
lgb_model = LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
)

# Stack the three base learners under a Ridge meta-regressor (3-fold CV)
base_learners = [('rf', rf_model), ('xgb', xgb_model), ('lgb', lgb_model)]
stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=Ridge(alpha=0.5),
    cv=3,
)

In [7]:
# Display name -> estimator, in the order the models are trained and plotted.
models = {}
models['Random Forest'] = rf_model
models['Gradient Boosting (XGBoost)'] = xgb_model
models['LightGBM'] = lgb_model
models['Stacking'] = stacking_model

In [8]:
# Fit every model on the training split and score it on the hold-out split
results, predictions = {}, {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions[name] = y_pred

    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}
    print(f"{name} - MSE: {mse:.2f}, R2: {r2:.2f}")

# Cross-validate the model with the highest hold-out R2 on the full data set
best_model_name, _ = max(results.items(), key=lambda item: item[1]['R2'])
best_model = models[best_model_name]
cv_scores = cross_val_score(best_model, X, y, scoring='r2', cv=5)
print(f"\nCross-Validation R2 for {best_model_name}: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

In [9]:
# Report the model with the highest hold-out R2.
# NOTE: `best_model` is bound here to a (name, metrics) tuple, not an estimator.
best_model = max(
    results.items(),
    key=lambda entry: entry[1]['R2'],
)
print(f"\nBest Model: {best_model[0]} with MSE: {best_model[1]['MSE']:.2f} and R2: {best_model[1]['R2']:.2f}")

In [10]:
# Visualization 1: side-by-side bar plots of MSE and R2 per model
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# One row per panel: (axis, metric key in `results`, title, y-axis label)
panels = (
    (ax1, 'MSE', 'Mean Squared Error (MSE) Comparison', 'MSE'),
    (ax2, 'R2', 'R² Score Comparison', 'R² Score'),
)
for ax, metric, title, ylabel in panels:
    model_names = list(results.keys())
    metric_values = [results[model_name][metric] for model_name in model_names]
    sns.barplot(x=model_names, y=metric_values, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [11]:
# Visualization 2: predicted vs actual log-prices, one panel per model
# Size the grid from the number of models instead of a hard-coded 4, so the
# figure keeps working when models are added or removed.
n_panels = max(len(predictions), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), sharey=True)
axes = np.atleast_1d(axes)  # a single panel comes back as a bare Axes object
fig.suptitle('Predicted vs Actual Prices (Log Scale)')

# End points of the y = x reference line (a perfect prediction)
diagonal = [y_test.min(), y_test.max()]

for ax, (name, y_pred) in zip(axes, predictions.items()):
    ax.scatter(y_test, y_pred, alpha=0.5)
    ax.plot(diagonal, diagonal, 'r--', lw=2)
    ax.set_title(name)
    ax.set_xlabel('Actual Log Price (TRY)')
    ax.set_ylabel('Predicted Log Price (TRY)')
    # The axes stay linear: the plotted values are already log1p-transformed,
    # so a log axis scale would apply a second logarithm.

plt.tight_layout()
plt.show()