In [107]:
import pandas as pd
import numpy as np
import glob
import string
import re
import pymorphy3
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import (
    BaggingRegressor,
    StackingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, rand_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')


pio.renderers.default = "notebook"  

## Предобработка данных

In [227]:
df = pd.read_csv('real_estate_data.csv')
df.head(10)

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,address,furnished,heating_type,price,price_currency
0,1,Konut,Rezidans,12/10/18,1/9/19,2,30,0.0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3500.0,TRY
1,2,Konut,Daire,2/13/19,,1,14,0.0,20 ve üzeri,20 ve üzeri,1+0,43.0,İstanbul/Kartal/Kordonboyu,,Fancoil,490000.0,TRY
2,3,Konut,Daire,10/9/18,11/8/18,1,30,0.0,1,Yüksek Giriş,2+1,,Tekirdağ/Çorlu/Reşadiye,,Fancoil,155000.0,TRY
3,4,Konut,Rezidans,9/10/18,10/10/18,1,30,3.0,20 ve üzeri,20 ve üzeri,6+1,450.0,İstanbul/Beşiktaş/Levent,,Fancoil,32500000.0,TRY
4,5,Konut,Rezidans,12/10/18,1/9/19,1,30,0.0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,1450000.0,TRY
5,6,Konut,Rezidans,11/9/18,12/9/18,1,30,2.0,10-20 arası,10,1+1,45.0,İstanbul/Maltepe/Altayçeşme,,Fancoil,780000.0,TRY
6,7,Konut,Daire,1/4/19,,2,54,0.0,20 ve üzeri,14,3+1,160.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3750.0,TRY
7,8,Konut,Villa,10/3/18,1/3/19,1,92,0.0,4,,4+1,,İzmir/Urla/M. Fevzi Çakmak,,Fancoil,1500000.0,TRY
8,9,Konut,Daire,2/16/19,,1,11,,2,Kot 2,3+1,140.0,Çanakkale/Ayvacık/Küçükkuyu Bld. (Mıhlı),,Fancoil,1500000.0,TRY
9,10,Konut,Daire,12/26/18,12/26/18,1,0,1.0,1,Asma Kat,2+2,550.0,İstanbul/Fatih/Sarıdemir,,Fancoil,84256.0,GBP


### Описание признаков• type - Тип недвижимости. \
• sub_type - Подгруппа по типу недвижимости. \
• start_date - Дата, когда листинг начинает действовать на рынке. \
• end_date - Дата, когда листинг больше не активен на рынке. \
• listing_type - Тип листинга. \
• tom - Время выхода на рынок. \
• building_age - Возраст здания. \
• total_floor_count - Общее количество этажей в здании. \
• room_count - Количество комнат в квартире.\
• size - Размер здания.\
• address - Адрес.\
• furnished - Фурнитура.\
• currency - Валюта цены на жилье.\
• floor_no - Информация о номере этажа в данном объявлении. \
• price - Цена. \
• heating_type - Различные типы систем отопления.


In [229]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403487 entries, 0 to 403486
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 403487 non-null  int64  
 1   type               403487 non-null  object 
 2   sub_type           403487 non-null  object 
 3   start_date         403487 non-null  object 
 4   end_date           266298 non-null  object 
 5   listing_type       403487 non-null  int64  
 6   tom                403487 non-null  int64  
 7   building_age       376097 non-null  object 
 8   total_floor_count  375466 non-null  object 
 9   floor_no           368191 non-null  object 
 10  room_count         403487 non-null  object 
 11  size               257481 non-null  float64
 12  address            403487 non-null  object 
 13  furnished          0 non-null       float64
 14  heating_type       375517 non-null  object 
 15  price              402772 non-null  float64
 16  pr

In [232]:
df.isna().sum()

id                        0
type                      0
sub_type                  0
start_date                0
end_date             137189
listing_type              0
tom                       0
building_age          27390
total_floor_count     28021
floor_no              35296
room_count                0
size                 146006
address                   0
furnished            403487
heating_type          27970
price                   715
price_currency          715
dtype: int64

In [234]:
df.duplicated().sum()  

0

In [235]:
# Сначала преобразуем столбец в числовой тип, нечисловые значения станут NaN
df['size'] = pd.to_numeric(df['size'], errors='coerce')

# Теперь заполняем пропуски средним значением
df['size'] = df['size'].fillna(round(df['size'].mean()))

df.head(10)

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,address,furnished,heating_type,price,price_currency
0,1,Konut,Rezidans,12/10/18,1/9/19,2,30,0.0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3500.0,TRY
1,2,Konut,Daire,2/13/19,,1,14,0.0,20 ve üzeri,20 ve üzeri,1+0,43.0,İstanbul/Kartal/Kordonboyu,,Fancoil,490000.0,TRY
2,3,Konut,Daire,10/9/18,11/8/18,1,30,0.0,1,Yüksek Giriş,2+1,279.0,Tekirdağ/Çorlu/Reşadiye,,Fancoil,155000.0,TRY
3,4,Konut,Rezidans,9/10/18,10/10/18,1,30,3.0,20 ve üzeri,20 ve üzeri,6+1,450.0,İstanbul/Beşiktaş/Levent,,Fancoil,32500000.0,TRY
4,5,Konut,Rezidans,12/10/18,1/9/19,1,30,0.0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,1450000.0,TRY
5,6,Konut,Rezidans,11/9/18,12/9/18,1,30,2.0,10-20 arası,10,1+1,45.0,İstanbul/Maltepe/Altayçeşme,,Fancoil,780000.0,TRY
6,7,Konut,Daire,1/4/19,,2,54,0.0,20 ve üzeri,14,3+1,160.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3750.0,TRY
7,8,Konut,Villa,10/3/18,1/3/19,1,92,0.0,4,,4+1,279.0,İzmir/Urla/M. Fevzi Çakmak,,Fancoil,1500000.0,TRY
8,9,Konut,Daire,2/16/19,,1,11,,2,Kot 2,3+1,140.0,Çanakkale/Ayvacık/Küçükkuyu Bld. (Mıhlı),,Fancoil,1500000.0,TRY
9,10,Konut,Daire,12/26/18,12/26/18,1,0,1.0,1,Asma Kat,2+2,550.0,İstanbul/Fatih/Sarıdemir,,Fancoil,84256.0,GBP


In [238]:
df['building_age'] = pd.to_numeric(df['building_age'], errors='coerce')
df['building_age'] = df['building_age'].fillna(round(df['building_age'].mean()))

df.head(10)

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,address,furnished,heating_type,price,price_currency
0,1,Konut,Rezidans,12/10/18,1/9/19,2,30,0.0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3500.0,TRY
1,2,Konut,Daire,2/13/19,,1,14,0.0,20 ve üzeri,20 ve üzeri,1+0,43.0,İstanbul/Kartal/Kordonboyu,,Fancoil,490000.0,TRY
2,3,Konut,Daire,10/9/18,11/8/18,1,30,0.0,1,Yüksek Giriş,2+1,279.0,Tekirdağ/Çorlu/Reşadiye,,Fancoil,155000.0,TRY
3,4,Konut,Rezidans,9/10/18,10/10/18,1,30,3.0,20 ve üzeri,20 ve üzeri,6+1,450.0,İstanbul/Beşiktaş/Levent,,Fancoil,32500000.0,TRY
4,5,Konut,Rezidans,12/10/18,1/9/19,1,30,0.0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,1450000.0,TRY
5,6,Konut,Rezidans,11/9/18,12/9/18,1,30,2.0,10-20 arası,10,1+1,45.0,İstanbul/Maltepe/Altayçeşme,,Fancoil,780000.0,TRY
6,7,Konut,Daire,1/4/19,,2,54,0.0,20 ve üzeri,14,3+1,160.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3750.0,TRY
7,8,Konut,Villa,10/3/18,1/3/19,1,92,0.0,4,,4+1,279.0,İzmir/Urla/M. Fevzi Çakmak,,Fancoil,1500000.0,TRY
8,9,Konut,Daire,2/16/19,,1,11,1.0,2,Kot 2,3+1,140.0,Çanakkale/Ayvacık/Küçükkuyu Bld. (Mıhlı),,Fancoil,1500000.0,TRY
9,10,Konut,Daire,12/26/18,12/26/18,1,0,1.0,1,Asma Kat,2+2,550.0,İstanbul/Fatih/Sarıdemir,,Fancoil,84256.0,GBP


In [239]:
df.isna().sum()

id                        0
type                      0
sub_type                  0
start_date                0
end_date             137189
listing_type              0
tom                       0
building_age              0
total_floor_count     28021
floor_no              35296
room_count                0
size                      0
address                   0
furnished            403487
heating_type          27970
price                   715
price_currency          715
dtype: int64

In [241]:
df['room_count'] = df['room_count'].str.extract(r'(\d+)\+').fillna(0).astype(int) + \
                    df['room_count'].str.extract(r'\+(\d+)').fillna(0).astype(int)

In [243]:
df.head(20)

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,address,furnished,heating_type,price,price_currency
0,1,Konut,Rezidans,12/10/18,1/9/19,2,30,0.0,20 ve üzeri,2,3,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3500.0,TRY
1,2,Konut,Daire,2/13/19,,1,14,0.0,20 ve üzeri,20 ve üzeri,1,43.0,İstanbul/Kartal/Kordonboyu,,Fancoil,490000.0,TRY
2,3,Konut,Daire,10/9/18,11/8/18,1,30,0.0,1,Yüksek Giriş,3,279.0,Tekirdağ/Çorlu/Reşadiye,,Fancoil,155000.0,TRY
3,4,Konut,Rezidans,9/10/18,10/10/18,1,30,3.0,20 ve üzeri,20 ve üzeri,7,450.0,İstanbul/Beşiktaş/Levent,,Fancoil,32500000.0,TRY
4,5,Konut,Rezidans,12/10/18,1/9/19,1,30,0.0,20 ve üzeri,2,3,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,1450000.0,TRY
5,6,Konut,Rezidans,11/9/18,12/9/18,1,30,2.0,10-20 arası,10,2,45.0,İstanbul/Maltepe/Altayçeşme,,Fancoil,780000.0,TRY
6,7,Konut,Daire,1/4/19,,2,54,0.0,20 ve üzeri,14,4,160.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3750.0,TRY
7,8,Konut,Villa,10/3/18,1/3/19,1,92,0.0,4,,5,279.0,İzmir/Urla/M. Fevzi Çakmak,,Fancoil,1500000.0,TRY
8,9,Konut,Daire,2/16/19,,1,11,1.0,2,Kot 2,4,140.0,Çanakkale/Ayvacık/Küçükkuyu Bld. (Mıhlı),,Fancoil,1500000.0,TRY
9,10,Konut,Daire,12/26/18,12/26/18,1,0,1.0,1,Asma Kat,4,550.0,İstanbul/Fatih/Sarıdemir,,Fancoil,84256.0,GBP


In [244]:
df['total_floor_count'].unique()

array(['20 ve üzeri', '1', '10-20 arası', '4', '2', '3', '8', '7', '6',
       '5', '10', nan, '9'], dtype=object)

In [245]:
df['total_floor_count'] = df['total_floor_count'].str.replace(' ve üzeri', '')

In [246]:
df['total_floor_count'] = df['total_floor_count'].str.replace('10-20 arası', '15')

In [247]:
df['total_floor_count'].unique()

array(['20', '1', '15', '4', '2', '3', '8', '7', '6', '5', '10', nan, '9'],
      dtype=object)

In [248]:
df['total_floor_count'] = df['total_floor_count'].fillna('1').astype(int)

In [255]:
df['end_date'].unique()

array(['1/9/19', nan, '11/8/18', '10/10/18', '12/9/18', '1/3/19',
       '12/26/18', '11/26/18', '1/18/19', '10/13/18', '11/2/18', '1/4/19',
       '2/11/19', '9/20/18', '1/14/19', '11/13/18', '11/12/18', '12/2/18',
       '2/23/19', '9/17/18', '12/12/18', '2/1/19', '11/6/18', '12/10/18',
       '1/29/19', '12/28/18', '11/9/18', '1/19/19', '11/1/18', '12/14/18',
       '11/4/18', '11/11/18', '10/19/18', '1/25/19', '12/13/18',
       '2/15/19', '2/25/19', '2/3/19', '2/13/19', '1/7/19', '10/20/18',
       '12/7/18', '12/15/18', '11/14/18', '1/21/19', '10/25/18', '2/9/19',
       '1/16/19', '2/6/19', '2/10/19', '11/7/18', '11/17/18', '2/22/19',
       '10/4/18', '11/30/18', '1/12/19', '10/28/18', '12/23/18',
       '1/11/19', '1/24/19', '12/24/18', '10/15/18', '12/27/18',
       '12/19/18', '11/23/18', '12/5/18', '9/10/18', '11/20/18',
       '11/10/18', '9/13/18', '2/8/19', '12/4/18', '12/3/18', '11/19/18',
       '12/20/18', '1/10/19', '12/6/18', '12/30/18', '12/21/18',
       '2/14/19'

In [258]:
df['end_date'] = df['end_date'].fillna('12/12/25')

In [260]:
df.isna().sum()

id                        0
type                      0
sub_type                  0
start_date                0
end_date                  0
listing_type              0
tom                       0
building_age              0
total_floor_count         0
floor_no              35296
room_count                0
size                      0
address                   0
furnished            403487
heating_type          27970
price                   715
price_currency          715
dtype: int64

In [262]:
df['floor_no'].unique()

array(['2', '20 ve üzeri', 'Yüksek Giriş', '10', '14', nan, 'Kot 2',
       'Asma Kat', 'Bahçe katı', '11', '3', '13', '7', '16', 'Müstakil',
       'Zemin Kat', '19', '4', '5', 'En Üst Kat', '8', '15', '1',
       'Giriş Katı', '9', 'Çatı Katı', '12', '17', '6', 'Kot 4', 'Kot 1',
       'Kot 3', '18', 'Teras Kat', 'Komple', 'Bodrum Kat', 1, 2, 3, 12, 4,
       5, 6, 7, 8], dtype=object)

In [264]:
df['floor_no'] = df['floor_no'].str.replace('Kot ', '')
df['floor_no'] = df['floor_no'].str.replace(' ve üzeri', '')
df['floor_no'] = df['floor_no'].str.replace('Yüksek Giriş', '10')
df['floor_no'] = df['floor_no'].str.replace('Asma Kat', '1')
df['floor_no'] = df['floor_no'].str.replace('Bahçe katı', '4')
df['floor_no'] = df['floor_no'].str.replace('Müstakil', '7')
df['floor_no'] = df['floor_no'].str.replace('Zemin Kat', '1')
df['floor_no'] = df['floor_no'].str.replace('En Üst Kat', '10')
df['floor_no'] = df['floor_no'].str.replace('Giriş Katı', '3')
df['floor_no'] = df['floor_no'].str.replace('Çatı Katı', '4')

In [266]:
df['floor_no'] = df['floor_no'].str.replace('Teras', '1')
df['floor_no'] = df['floor_no'].str.replace('Komple', '6')
df['floor_no'] = df['floor_no'].str.replace('Bodrum Kat', '1')
df['floor_no'] = df['floor_no'].str.replace(' Kat', '')
df['floor_no'] = df['floor_no'].fillna('0').astype(int)

In [268]:
df['floor_no'].unique()

array([ 2, 20, 10, 14,  0,  1,  4, 11,  3, 13,  7, 16, 19,  5,  8, 15,  9,
       12, 17,  6, 18])

In [270]:
df = df.drop(columns=['furnished'])

In [272]:
df['heating_type'] = df['heating_type'].fillna('Yok')

In [274]:
df['heating_type'].unique()

array(['Fancoil', 'Yok', 'Kalorifer (Doğalgaz)', 'Kalorifer (Kömür)',
       'Kombi (Elektrikli)', 'Klima', 'Kombi (Doğalgaz)',
       'Merkezi Sistem (Isı Payı Ölçer)', 'Merkezi Sistem',
       'Soba (Kömür)', 'Yerden Isıtma', 'Soba (Doğalgaz)',
       'Güneş Enerjisi', 'Kalorifer (Akaryakıt)', 'Jeotermal',
       'Kat Kaloriferi'], dtype=object)

In [276]:
df['price'].unique()

array([3.5000000e+03, 4.9000000e+05, 1.5500000e+05, ..., 3.1234123e+07,
       1.1500610e+06, 2.2230000e+05])

In [278]:
df['price'] = df['price'].fillna(df['price'].mean)

In [280]:
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df = df.dropna(subset=['price'])

In [282]:
df['price'] = df['price'].apply(lambda x: df['price'].mean()  if x < 0 else x)

In [284]:
df.isna().sum()

id                   0
type                 0
sub_type             0
start_date           0
end_date             0
listing_type         0
tom                  0
building_age         0
total_floor_count    0
floor_no             0
room_count           0
size                 0
address              0
heating_type         0
price                0
price_currency       0
dtype: int64

In [286]:
df['price_currency'].unique()

array(['TRY', 'GBP', 'EUR', 'USD'], dtype=object)

In [288]:
df['price_currency'] = df['price_currency'].fillna('USD')

In [290]:
df.isna().sum()

id                   0
type                 0
sub_type             0
start_date           0
end_date             0
listing_type         0
tom                  0
building_age         0
total_floor_count    0
floor_no             0
room_count           0
size                 0
address              0
heating_type         0
price                0
price_currency       0
dtype: int64

In [292]:
df.describe()

Unnamed: 0,id,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,price
count,402772.0,402772.0,402772.0,402772.0,402772.0,402772.0,402772.0,402772.0,402772.0
mean,201619.644484,1.294425,56.956288,1.050239,5.478844,3.071068,3.668103,279.265418,354642.5
std,116389.822993,0.467795,44.31981,1.228184,4.126416,3.610067,1.167708,7539.060996,4809503.0
min,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
25%,100828.75,1.0,29.0,0.0,3.0,0.0,3.0,100.0,2500.0
50%,201658.5,1.0,40.0,1.0,4.0,2.0,4.0,150.0,199000.0
75%,302380.25,2.0,90.0,1.0,6.0,4.0,4.0,279.0,342000.0
max,403487.0,3.0,180.0,5.0,20.0,20.0,20.0,948235.0,2000000000.0


In [217]:
df.head()

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,address,heating_type,price,price_currency
0,1,Konut,Rezidans,12/10/18,1/9/19,2,30,0.0,20,2,3,90.0,İstanbul/Kartal/Kordonboyu,Fancoil,3500.0,TRY
1,2,Konut,Daire,2/13/19,12/12/25,1,14,0.0,20,20,1,43.0,İstanbul/Kartal/Kordonboyu,Fancoil,490000.0,TRY
2,3,Konut,Daire,10/9/18,11/8/18,1,30,0.0,1,10,3,279.0,Tekirdağ/Çorlu/Reşadiye,Fancoil,155000.0,TRY
3,4,Konut,Rezidans,9/10/18,10/10/18,1,30,3.0,20,20,7,450.0,İstanbul/Beşiktaş/Levent,Fancoil,32500000.0,TRY
4,5,Konut,Rezidans,12/10/18,1/9/19,1,30,0.0,20,2,3,90.0,İstanbul/Kartal/Kordonboyu,Fancoil,1450000.0,TRY


In [294]:
# Приводим к одному типу валюты
exchange_rates = {'USD': 5.3, 'EUR': 6.0, 'GBP': 7.0, 'TRY': 1.0}
for curr in exchange_rates:
    mask = df['price_currency'] == curr
    df.loc[mask, 'price'] = df.loc[mask, 'price'] * exchange_rates[curr]

In [327]:
drop_cols = [
    'id', 'address', 'start_date', 'end_date', 'tom', 'building_age',
    'type', 'price_currency', 'furnished', 'price_per_sqm'
]
X = df.drop(columns=[col for col in drop_cols if col in df.columns])

cat_cols = X.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Вспомогательная функция для оценки
def evaluate_model(y_true, y_pred, name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{name}: MAE = {mae:.2f}, RMSE = {rmse:.2f}, R² = {r2:.4f}")


MAE (Mean Absolute Error) — Средняя абсолютная ошибка \
Среднее значение модулей разностей между истинными и предсказанными значениями. \
Показывает, на сколько в среднем предсказание отличается от реальности (в тех же единицах измерения). \
→ Чем ниже, тем лучше. \
\
RMSE (Root Mean Squared Error) — Корень из среднеквадратичной ошибки \
Квадратный корень из среднего значения квадратов ошибок. \
Более чувствителен к большим ошибкам (штрафует их сильнее, чем MAE). \
→ Чем ниже, тем лучше. \
\
R² (R-squared, Коэффициент детерминации) — Доля объяснённой дисперсии \
Показывает, насколько хорошо модель объясняет изменчивость целевой переменной. \
Диапазон: от -∞ до 1. \
\
R² = 1 — модель идеально предсказывает значения, \
R² = 0 — модель не лучше, чем просто предсказывать среднее, \
R² < 0 — модель хуже, чем константное предсказание. \
→ Чем ближе к 1, тем лучше.

# Ансамблевые методы

# Регрессия

## Стекинг

При обучении данной мета-модели используется приём удерживаемого набора. Сначала набор разделяется на 2 части. Слабые ученики обучаются на первой половине обучающего набора, затем на второй. Затем создаётся новый обучающий набор на основе прогнозов, сделанных на прогнозах первой и второй части набора. Таким образом, на каждый образец из входного набора приходится столько прогнозов, сколько слабых учеников в ансамбле (в примере на картинке три). Мета-модель учится прогнозировать значения на основе нового набора.

In [308]:
stacking_model = StackingRegressor(
    estimators=[
        ('rf', RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)),
        ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42)),
        ('dt', DecisionTreeRegressor(random_state=42))
    ],
    final_estimator=LinearRegression(),
    n_jobs=-1
)

stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
evaluate_model(y_test, y_pred_stack, "Stacking Regressor")

Stacking Regressor: MAE = 71574.30, RMSE = 10795054.45, R² = -2.1425


## Бэггинг

Для начала генерируется несколько бутстрэп-выборок. Бутстрэп - это случайный выбор данных из датасета и представление их в модель, затем данные возвращаются в датасет и процесс повторяется. После модели делают свои прогнозы на основе бутстрэп-выборок. В случае регрессии прогнозы просто усредняются. В случае же классификации применяется голосование.

In [304]:
bagging_model = BaggingRegressor(
    estimator=DecisionTreeRegressor(random_state=42),
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)
bagging_model.fit(X_train, y_train)
y_pred_bag = bagging_model.predict(X_test)
evaluate_model(y_test, y_pred_bag, "Bagging Regressor")

Bagging Regressor: MAE = 4855.86, RMSE = 1185258.34, R² = 0.9621


## Бустинг

Метод бустинга в чём то схож с методом бэггинга: берётся множество одинаковых моделей и объединяется, чтобы получить сильного ученика. Но разница заключается в том, что модели приспосабливаются к данным последовательно, то есть каждая модель будет исправлять ошибки предыдущей.

Базовые модели для бустинга - это модели с низким разбросом и высоким смещением. Например неглубокие деревья решений. Одна из причин такого выбора моделей - они требуют меньше вычислительных затрат. Ещё бустинг (в отличии от бэггинга) нельзя распараллелить.

Существует два наиболее распространённых алгоритма бустинга - адаптивный бустинг и градиентный бустинг.
Мы будем использовать алгоритм градиентного бустинга.

In [302]:
boosting_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

rs_sp = boosting_model.fit(X_train, y_train)
y_pred_boost = boosting_model.predict(X_test)
evaluate_model(y_test, y_pred_boost, "Gradient Boosting")

Gradient Boosting: MAE = 2649.22, RMSE = 89921.96, R² = 0.9998


# Классификация

In [75]:
X = df.drop(columns=['sub_type', 'id', 'start_date', 'end_date', 'address'])
y = df['sub_type']

cat_cols = X.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Стекинг

In [79]:
from sklearn.ensemble import StackingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split

In [54]:
from sklearn.ensemble import StackingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split

stacking_model = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=10, max_depth=4, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=10, max_depth=2, learning_rate=0.1, random_state=42)),
        ('dt', DecisionTreeClassifier(max_depth=2, random_state=42))
    ],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1),
    n_jobs=-1,
    cv=2  
)

stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_stack):.4f}")
print(classification_report(y_test, y_pred_stack))

Accuracy: 0.9167
                     precision    recall  f1-score   support

              Daire       0.94      0.99      0.96     70817
        Komple Bina       0.96      0.72      0.83       533
         Kooperatif       0.00      0.00      0.00        10
Köşk / Konak / Yalı       0.00      0.00      0.00        67
               Loft       0.00      0.00      0.00         5
        Müstakil Ev       0.48      0.20      0.28      1956
       Prefabrik Ev       0.00      0.00      0.00       145
           Rezidans       0.56      0.10      0.17      1526
              Villa       0.62      0.66      0.64      4185
       Yalı Dairesi       0.00      0.00      0.00        36
             Yazlık       0.30      0.01      0.02      1177
        Çiftlik Evi       0.50      0.08      0.14        98

           accuracy                           0.92     80555
          macro avg       0.36      0.23      0.25     80555
       weighted avg       0.89      0.92      0.90     80555



## Бэггинг

In [56]:
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

bagging_model.fit(X_train, y_train)
y_pred_bag = bagging_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_bag):.4f}")
print(classification_report(y_test, y_pred_bag))

Accuracy: 0.9431
                     precision    recall  f1-score   support

              Daire       0.97      0.99      0.98     70817
        Komple Bina       0.92      0.80      0.86       533
         Kooperatif       0.80      0.40      0.53        10
Köşk / Konak / Yalı       0.67      0.43      0.53        67
               Loft       0.50      0.20      0.29         5
        Müstakil Ev       0.65      0.46      0.54      1956
       Prefabrik Ev       0.86      0.81      0.83       145
           Rezidans       0.77      0.42      0.54      1526
              Villa       0.74      0.80      0.77      4185
       Yalı Dairesi       0.71      0.28      0.40        36
             Yazlık       0.62      0.40      0.49      1177
        Çiftlik Evi       0.65      0.47      0.54        98

           accuracy                           0.94     80555
          macro avg       0.74      0.54      0.61     80555
       weighted avg       0.94      0.94      0.94     80555



## Бустинг

In [83]:
boosting_model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
)

rs_sp = boosting_model.fit(X_train, y_train)
y_pred_boost = boosting_model.predict(X_test) 

print(f"Accuracy: {accuracy_score(y_test, y_pred_boost):.4f}")
print(classification_report(y_test, y_pred_boost))

Accuracy: 0.9277
                     precision    recall  f1-score   support

              Daire       0.95      0.99      0.97     70817
        Komple Bina       0.93      0.75      0.83       533
         Kooperatif       0.00      0.00      0.00        10
Köşk / Konak / Yalı       0.34      0.25      0.29        67
               Loft       0.00      0.00      0.00         5
        Müstakil Ev       0.62      0.23      0.33      1956
       Prefabrik Ev       0.76      0.46      0.57       145
           Rezidans       0.72      0.21      0.32      1526
              Villa       0.66      0.76      0.70      4185
       Yalı Dairesi       0.43      0.08      0.14        36
             Yazlık       0.46      0.13      0.20      1177
        Çiftlik Evi       0.41      0.12      0.19        98

           accuracy                           0.93     80555
          macro avg       0.52      0.33      0.38     80555
       weighted avg       0.92      0.93      0.91     80555



# Вывод

Проанализировали Датафрейм, избавились от аномальных и пустых значенний во время предобработки данных. Обучив модели мы получили от них вот такой результат: 

========================================== \
Класификация \
========================================== \
Stacking Classifier: Accuracy: 0.9167 \
Begging Classifier: Accuracy: 0.9431 \
Boosting Classifier: Accuracy: 0.9277 \
========================================== \
Регрессия \
========================================== \
Stacking Regressor: MAE = 71574.30, RMSE = 10795054.45, R² = -2.1425 \
Bagging Regressor: MAE = 4855.86, RMSE = 1185258.34, R² = 0.9621 \
Gradient Boosting: MAE = 2649.22, RMSE = 89921.96, R² = 0.9998 
