## 1. Импорт пакетов и файла

In [1]:
%matplotlib inline
import warnings

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
# IterativeImputer is experimental: this enabling import must run before it is imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

In [2]:
# Load the real-estate dataset; the path is relative to the working directory.
data = pd.read_csv('real_estate_data.csv')

In [3]:
# Preprocessing
# Convert every price to TRY first: the outlier filter below needs 'price_try'.
exchange_rates = {'TRY': 1, 'USD': 34.0, 'EUR': 37.5, 'GBP': 44.5}  # Rates as of 07.09.2025
# Currencies that are missing from the table (including NaN) fall back to a rate of 1.
data['price_try'] = data['price'] * data['price_currency'].map(exchange_rates).fillna(1)

# Remove outliers; .copy() so the column assignments below work on an independent frame.
data = data[(data['size'] < 1000) & (data['price_try'] < 1e8)].copy()

# New features
address_parts = data['address'].str.split('/')
data['city'] = address_parts.str[0]
data['county'] = address_parts.str[1]
# size == 0 would produce +/-inf, which StandardScaler rejects; store NaN instead.
data['price_per_m2'] = (data['price_try'] / data['size']).replace([np.inf, -np.inf], np.nan)
# '2+1' -> 3; values that do not split into a list (e.g. NaN) count as 0 rooms.
data['rooms_total'] = data['room_count'].str.split('+').apply(lambda x: sum(map(int, x)) if isinstance(x, list) else 0)
# month % 12 sends December to 0, so Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
# Without the modulo, December would land in a fifth bucket of its own.
data['season'] = (pd.to_datetime(data['start_date'], format='%m/%d/%y').dt.month % 12) // 3 + 1

# Imputation
# NOTE(review): the imputer and the scaler below are fitted on all rows before the
# train/test split, so test rows influence the fitted statistics.
imputer = IterativeImputer()
data[['size', 'building_age', 'floor_no']] = imputer.fit_transform(data[['size', 'building_age', 'floor_no']])

# Log-transform the target
y = np.log1p(data['price_try'])

# Encoding: empty strings and NaN become the explicit category 'Unknown'.
categorical_cols = ['type', 'sub_type', 'heating_type', 'furnished', 'city', 'county']
label_encoders = {}  # fitted encoder per column, kept so codes can be mapped back to labels
for col in categorical_cols:
    data[col] = data[col].replace('', np.nan).fillna('Unknown')
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Standardize numeric features
scaler = StandardScaler()
numeric_cols = ['size', 'tom', 'building_age', 'total_floor_count', 'floor_no', 'price_per_m2', 'rooms_total']
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Split into sale and rent listings
data_sale = data[data['listing_type'] == 1].drop(['listing_type'], axis=1)
data_rent = data[data['listing_type'] == 2].drop(['listing_type'], axis=1)

# Columns that must not reach the models:
#   - the target itself, plus identifiers / raw text / date columns;
#   - 'price_per_m2', which is computed from the target (target leakage);
#   - 'room_count', the raw string column already summarized by 'rooms_total'.
non_feature_cols = [
    'price_try', 'end_date', 'start_date', 'address', 'price', 'price_currency', 'id',
    'price_per_m2', 'room_count',
]
X_sale = data_sale.drop(non_feature_cols, axis=1)
y_sale = np.log1p(data_sale['price_try'])
X_rent = data_rent.drop(non_feature_cols, axis=1)
y_rent = np.log1p(data_rent['price_try'])

X_train_sale, X_test_sale, y_train_sale, y_test_sale = train_test_split(X_sale, y_sale, test_size=0.2, random_state=42)
X_train_rent, X_test_rent, y_train_rent, y_test_rent = train_test_split(X_rent, y_rent, test_size=0.2, random_state=42)

In [4]:
# XGBoost hyper-parameter search with Optuna (sale listings)
def objective_sale(trial):
    """Return the cross-validated MSE of an XGBoost model for one Optuna trial.

    The score is computed with 3-fold cross-validation on the sale training
    split only, so the held-out test split is never used to pick
    hyper-parameters and stays valid for the final evaluation.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean squared error averaged over the folds (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 1.0)
    }
    model = XGBRegressor(**params, random_state=42)
    # scikit-learn reports MSE negated ("higher is better"); flip the sign back.
    neg_mse = cross_val_score(model, X_train_sale, y_train_sale, cv=3, scoring='neg_mean_squared_error')
    return -neg_mse.mean()

# Seed the sampler so the search is reproducible, like the models (random_state=42).
study_sale = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_sale.optimize(objective_sale, n_trials=20)
xgb_params_sale = study_sale.best_params
# Unfitted estimator configured with the best parameters found; it is fitted later.
xgb_model_sale = XGBRegressor(**xgb_params_sale, random_state=42)

# XGBoost hyper-parameter search with Optuna (rent listings)
def objective_rent(trial):
    """Return the cross-validated MSE of an XGBoost model for one Optuna trial.

    The score is computed with 3-fold cross-validation on the rent training
    split only, so the held-out test split is never used to pick
    hyper-parameters and stays valid for the final evaluation.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean squared error averaged over the folds (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 1.0)
    }
    model = XGBRegressor(**params, random_state=42)
    # scikit-learn reports MSE negated ("higher is better"); flip the sign back.
    neg_mse = cross_val_score(model, X_train_rent, y_train_rent, cv=3, scoring='neg_mean_squared_error')
    return -neg_mse.mean()

# Seed the sampler so the search is reproducible, like the models (random_state=42).
study_rent = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_rent.optimize(objective_rent, n_trials=20)
xgb_params_rent = study_rent.best_params
# Unfitted estimator configured with the best parameters found; it is fitted later.
xgb_model_rent = XGBRegressor(**xgb_params_rent, random_state=42)

In [5]:
# Build the models for sale listings: two standalone learners plus a stacked
# ensemble that combines them with the tuned XGBoost model.
rf_model_sale = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)
lgb_model_sale = LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
)

# (name, estimator) pairs fed to the stack; a Ridge model blends their outputs.
sale_base_learners = [
    ('rf', rf_model_sale),
    ('xgb', xgb_model_sale),
    ('lgb', lgb_model_sale),
]
stacking_model_sale = StackingRegressor(
    estimators=sale_base_learners,
    final_estimator=Ridge(alpha=0.5),
    cv=3,
)

# Build the models for rent listings: two standalone learners plus a stacked
# ensemble that combines them with the tuned XGBoost model.
rf_model_rent = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)
lgb_model_rent = LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
)

# (name, estimator) pairs fed to the stack; a Ridge model blends their outputs.
rent_base_learners = [
    ('rf', rf_model_rent),
    ('xgb', xgb_model_rent),
    ('lgb', lgb_model_rent),
]
stacking_model_rent = StackingRegressor(
    estimators=rent_base_learners,
    final_estimator=Ridge(alpha=0.5),
    cv=3,
)

In [6]:
# Display name -> estimator for the sale segment (insertion order drives report order).
sale_model_names = (
    'Random Forest (Sale)',
    'Gradient Boosting (XGBoost) (Sale)',
    'LightGBM (Sale)',
    'Stacking (Sale)',
)
sale_model_objects = (rf_model_sale, xgb_model_sale, lgb_model_sale, stacking_model_sale)
models_sale = dict(zip(sale_model_names, sale_model_objects))

# Display name -> estimator for the rent segment (insertion order drives report order).
rent_model_names = (
    'Random Forest (Rent)',
    'Gradient Boosting (XGBoost) (Rent)',
    'LightGBM (Rent)',
    'Stacking (Rent)',
)
rent_model_objects = (rf_model_rent, xgb_model_rent, lgb_model_rent, stacking_model_rent)
models_rent = dict(zip(rent_model_names, rent_model_objects))

In [7]:
def _fit_and_evaluate(models, X_train, y_train, X_test, y_test):
    """Fit every model on the training split and score it on the test split.

    Args:
        models: Mapping of display name -> unfitted estimator (fitted in place).
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target.

    Returns:
        A ``(results, predictions)`` pair: ``results`` maps each name to
        ``{'MSE': ..., 'R2': ...}`` and ``predictions`` maps each name to the
        model's predictions for ``X_test``.
    """
    results = {}
    predictions = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results[name] = {'MSE': mse, 'R2': r2}
        predictions[name] = y_pred
        print(f"{name} - MSE: {mse:.2f}, R2: {r2:.2f}")
    return results, predictions


# Train and evaluate the sale models
results_sale, predictions_sale = _fit_and_evaluate(
    models_sale, X_train_sale, y_train_sale, X_test_sale, y_test_sale
)

# Train and evaluate the rent models
results_rent, predictions_rent = _fit_and_evaluate(
    models_rent, X_train_rent, y_train_rent, X_test_rent, y_test_rent
)

# Cross-validation: take the model with the highest hold-out R2 in each segment
# and re-score it with 5-fold CV over that segment's full feature matrix.
best_model_sale_name = max(results_sale.items(), key=lambda entry: entry[1]['R2'])[0]
best_model_sale = models_sale[best_model_sale_name]
cv_scores_sale = cross_val_score(best_model_sale, X_sale, y_sale, cv=5, scoring='r2')

best_model_rent_name = max(results_rent.items(), key=lambda entry: entry[1]['R2'])[0]
best_model_rent = models_rent[best_model_rent_name]
cv_scores_rent = cross_val_score(best_model_rent, X_rent, y_rent, cv=5, scoring='r2')

# Report mean R2 with a +/- two-standard-deviation spread across the folds.
print(f"\nCross-Validation R2 for {best_model_sale_name}: {cv_scores_sale.mean():.2f} (+/- {cv_scores_sale.std() * 2:.2f})")
print(f"Cross-Validation R2 for {best_model_rent_name}: {cv_scores_rent.mean():.2f} (+/- {cv_scores_rent.std() * 2:.2f})")

In [8]:
# Pick the top model per segment by hold-out R2.
# Each best_model_* is a (name, {'MSE': ..., 'R2': ...}) tuple here.
top_sale_name = max(results_sale, key=lambda label: results_sale[label]['R2'])
best_model_sale = (top_sale_name, results_sale[top_sale_name])

top_rent_name = max(results_rent, key=lambda label: results_rent[label]['R2'])
best_model_rent = (top_rent_name, results_rent[top_rent_name])

print(f"\nBest Model (Sale): {best_model_sale[0]} with MSE: {best_model_sale[1]['MSE']:.2f} and R2: {best_model_sale[1]['R2']:.2f}")
print(f"Best Model (Rent): {best_model_rent[0]} with MSE: {best_model_rent[1]['MSE']:.2f} and R2: {best_model_rent[1]['R2']:.2f}")

In [9]:
# Visualization 1 (Sale): side-by-side bar charts of MSE and R2 per model
sale_bar_labels = list(results_sale.keys())
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axis, metric key, panel title, y-axis label) for each panel, left to right.
sale_panels = (
    (ax1, 'MSE', 'Mean Squared Error (MSE) Comparison - Sale', 'MSE'),
    (ax2, 'R2', 'R² Score Comparison - Sale', 'R² Score'),
)
for panel_ax, metric_key, panel_title, panel_ylabel in sale_panels:
    metric_values = [results_sale[label][metric_key] for label in sale_bar_labels]
    sns.barplot(x=sale_bar_labels, y=metric_values, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Visualization 1 (Rent): side-by-side bar charts of MSE and R2 per model
rent_bar_labels = list(results_rent.keys())
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axis, metric key, panel title, y-axis label) for each panel, left to right.
rent_panels = (
    (ax1, 'MSE', 'Mean Squared Error (MSE) Comparison - Rent', 'MSE'),
    (ax2, 'R2', 'R² Score Comparison - Rent', 'R² Score'),
)
for panel_ax, metric_key, panel_title, panel_ylabel in rent_panels:
    metric_values = [results_rent[label][metric_key] for label in rent_bar_labels]
    sns.barplot(x=rent_bar_labels, y=metric_values, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [10]:
# Visualization 2: predicted vs. actual prices, one row of panels per segment
# (row 0 = sale, row 1 = rent), one column per model.
fig, axes = plt.subplots(2, 4, figsize=(20, 10), sharey='row')
fig.suptitle('Predicted vs Actual Prices (Log Scale)')

scatter_rows = (
    (predictions_sale, y_test_sale),
    (predictions_rent, y_test_rent),
)
for row, (segment_predictions, y_true) in enumerate(scatter_rows):
    lo, hi = y_true.min(), y_true.max()
    for col, (name, y_pred) in enumerate(segment_predictions.items()):
        ax = axes[row, col]
        ax.scatter(y_true, y_pred, alpha=0.5)
        # Dashed y = x line: points on it are perfect predictions.
        ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
        # The model names already end in "(Sale)" / "(Rent)"; do not append it again.
        ax.set_title(name)
        ax.set_xlabel('Actual Log Price (TRY)')
        ax.set_ylabel('Predicted Log Price (TRY)')
        # Axes stay linear: the plotted targets are already log1p-transformed,
        # so a log axis scale would apply the logarithm a second time.

plt.tight_layout()
plt.show()