# Командная работа
#### Работали: Столяров Артём, Собко Артём



### Описание признаков 

| Признак             | Описание                                                                 |
|---------------------|--------------------------------------------------------------------------|
| `type`              | Тип недвижимости (например, жилая, коммерческая).                        |
| `sub_type`          | Подтип недвижимости (например, квартира, вилла, пентхаус).               |
| `start_date`        | Дата, когда объявление появилось на рынке.                               |
| `end_date`          | Дата, когда объявление было снято с рынка (или "Not expired").            |
| `listing_type`      | Тип объявления: продажа (`Sale`) или аренда (`Rent`).                    |
| `tom`               | Время выхода на рынок (Time On Market) — сколько объект находится в продаже/аренде. |
| `building_age`      | Возраст здания (в годах).                                                |
| `total_floor_count` | Общее количество этажей в здании.                                        |
| `room_count`        | Количество комнат в объекте.                                             |
| `size`              | Площадь объекта (в м²).                                                  |
| `address`           | Адрес расположения объекта.                                              |
| `furnished`         | Наличие мебели (удалён из анализа).                                      |
| `price_currency`    | Валюта цены (TRY, USD, EUR — конвертируется в TRY).                      |
| `floor_no`          | Номер этажа (с нормализацией: цоколь → 0, земля → 1, и т.д.).            |
| `price`             | Цена объекта (в TRY после конвертации).                                  |
| `heating_type`      | Тип системы отопления (кодируется в числовые категории).                 |



In [237]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import (
    BaggingRegressor,
    StackingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier
)
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

####  Загрузка датафрейма

In [238]:
df = pd.read_csv('real_estate_data.csv', sep=',')

####  Предобработка данных

In [239]:
df = df.drop('furnished', axis=1)

df['end_date'] = df['end_date'].fillna("Not expired")

df['total_floor_count'] = df['total_floor_count'].replace({
    '10-20 arası': 15,
    '20 ve üzeri': 30
})
df['total_floor_count'] = pd.to_numeric(df['total_floor_count'], errors='coerce')
df['total_floor_count'].fillna(round(df['total_floor_count'].mean()), inplace=True)
df['total_floor_count'] = df['total_floor_count'].astype(int)

In [240]:
floor_mapping = {
    'Bodrum Kat': 0,
    'Bahçe katı': 1,
    'Müstakil': 1,
    'Yüksek Giriş': 1,
    'Zemin Kat': 1,
    'Kot 2': 1,
    'Giriş Katı': 1,
    'Kot 4': 2,
    'Kot 1': 1,
    'Kot 3': 1,
    '20 ve üzeri': 25,
    'Komple': 25,
    'En Üst Kat': 30,  
    'Çatı Katı': 30,   
    'Asma Kat': 30,
    'Teras Kat': 30
}
df['floor_no'] = df['floor_no'].replace(floor_mapping)
df['floor_no'] = pd.to_numeric(df['floor_no'], errors='coerce')
df['floor_no'].fillna(round(df['floor_no'].mean()), inplace=True)
df['floor_no'] = df['floor_no'].astype(int)

In [241]:
age_mapping = {
    '6-10 arası': 8,
    '11-15 arası': 13,
    '16-20 arası': 18,
    '21-25 arası': 23,
    '26-30 arası': 28,
    '31-35 arası': 33,
    '36-40 arası': 38,
    '40 ve üzeri': 50
}
df['building_age'] = df['building_age'].replace(age_mapping)
df['building_age'] = pd.to_numeric(df['building_age'], errors='coerce')
df['building_age'].fillna(round(df['building_age'].mean()), inplace=True)
df['building_age'] = df['building_age'].astype(int)

In [242]:
def to_try(row):
    if row['price_currency'] == 'EUR':
        return round(row['price'] * 48, 2)
    elif row['price_currency'] == 'USD':
        return round(row['price'] * 41, 2)
    else:
        return round(row['price'], 2)

if 'price_currency' in df.columns and 'price' in df.columns:
    df['price'] = df.apply(to_try, axis=1)
    df['price_currency'] = 'TRY'  

df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['price'] = df['price'].clip(lower=0)
df['price'].fillna(df['price'].mean(), inplace=True)
df['price'] = df['price'].astype(int)

In [243]:

df['room_count'] = df['room_count'].replace('+', None)
mode_val = df['room_count'].mode()
if not mode_val.empty:
    df['room_count'].fillna(mode_val[0], inplace=True)


In [244]:
df['size'] = pd.to_numeric(df['size'], errors='coerce')
df['size'] = df['size'].apply(lambda x: round(df['size'].mean()) if pd.notna(x) and x > 500 else x)
df['size'].fillna(round(df['size'].mean()), inplace=True)
df['size'] = df['size'].astype(int)

In [245]:
for col in ['heating_type', 'price_currency']:
    mode_val = df[col].mode()
    if not mode_val.empty:
        df[col].fillna(mode_val[0], inplace=True)


In [246]:
df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')


In [247]:
categorical_mappings = {
    'type': {'Konut': 'Housing'},
    'sub_type': {'Daire': 'Flat'},
    'listing_type': {1: 'Sale', 2: 'Rent'}, 
    'price_currency': {'TL': 'TRY', 'USD': 'USD', 'EUR': 'EUR'}
}

for col, mapping in categorical_mappings.items():
    if col in df.columns:
        df[col] = df[col].replace(mapping)

In [248]:
df['sub_type'] = pd.Categorical(df['sub_type']).codes

if (df['sub_type'] == -1).any():
    mode_val = df['sub_type'][df['sub_type'] != -1].mode()
    if not mode_val.empty:
        df['sub_type'] = df['sub_type'].replace(-1, mode_val[0])
df['sub_type'] = df['sub_type'].astype(int)

In [249]:
heating_mapping = {
    'Yok': 0,
    'Soba (Kömür)': 1,
    'Soba (Doğalgaz)': 2,
    'Kalorifer (Kömür)': 3,
    'Kalorifer (Doğalgaz)': 4,
    'Kalorifer (Akaryakıt)': 5,
    'Kombi (Doğalgaz)': 6,
    'Kombi (Elektrikli)': 7,
    'Kat Kaloriferi': 8,
    'Merkezi Sistem': 9,
    'Merkezi Sistem (Isı Payı Ölçer)': 10,
    'Yerden Isıtma': 11,
    'Klima': 12,
    'Fancoil': 13,
    'Güneş Enerjisi': 14,
    'Jeotermal': 15
}
df['heating_type'] = df['heating_type'].map(heating_mapping)

if df['heating_type'].isnull().any():
    mode_val = df['heating_type'].mode()
    if not mode_val.empty:
        df['heating_type'].fillna(mode_val[0], inplace=True)
df['heating_type'] = df['heating_type'].astype(int)

In [250]:
# сохранение для дальнейшей работы
df.to_csv('real_estate_data_fix.csv', index=False)

## Регрессия

In [251]:
drop_cols = [
    'id', 'address', 'start_date', 'end_date', 'tom', 'building_age',
    'type', 'price_currency', 'furnished', 'price_per_sqm', 'listing_type'
]
X = df.drop(columns=[col for col in drop_cols if col in df.columns])

y = df['price']

encoders_reg = {}
cat_cols = X.select_dtypes(include=['object']).columns

for col in cat_cols:
    le = LabelEncoder()
    X[col] = X[col].fillna('Unknown').astype(str)
    X[col] = le.fit_transform(X[col])
    encoders_reg[col] = le

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def metricsf(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"средняя абсолютная ошибка = {mae:.2f}, средняя квадратичная ошибка = {rmse:.2f}, коэфф. дисперсии = {r2:.4f}")

### Стекинг регрессор

In [252]:
stacking_model = StackingRegressor(
    estimators=[
        ('dt', RandomForestRegressor(random_state=42)),
        ('rf', BaggingRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
        ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42))
    ],
    final_estimator=LinearRegression(),
    n_jobs=-1
)
stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
metricsf(y_test, y_pred_stack)

средняя абсолютная ошибка = 57201.49, средняя квадратичная ошибка = 2625989.23, коэфф. дисперсии = 0.9154


### Беггинг регрессор

In [253]:
bagging_model = BaggingRegressor(
    estimator=DecisionTreeRegressor(random_state=42),
    n_estimators=10,
    random_state=42,
    n_jobs=-1
)
bagging_model.fit(X_train, y_train)
y_pred_bag = bagging_model.predict(X_test)
metricsf(y_test, y_pred_bag)



средняя абсолютная ошибка = 13894.00, средняя квадратичная ошибка = 2741725.02, коэфф. дисперсии = 0.9078


### Бустинг регрессор

In [254]:
boosting_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
boosting_model.fit(X_train, y_train)
y_pred_boost = boosting_model.predict(X_test)
metricsf(y_test, y_pred_boost)

средняя абсолютная ошибка = 14454.01, средняя квадратичная ошибка = 2609356.66, коэфф. дисперсии = 0.9165


In [None]:
_X = df.drop(columns=['sub_type', 'id', 'start_date', 'end_date', 'address'])
_y = df['sub_type']

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(_X, _y, test_size=0.2, random_state=42)

# кодировка целевой переменной
le_y = LabelEncoder()
y_train_clf = le_y.fit_transform(y_train_clf)
y_test_clf = le_y.transform(y_test_clf)

categorical_columns = [
    'location',
    'heating_type',  
    'price_currency',
    'room_count',
    'listing_type',
    'type',
    'tom'
]

clf_encoders = {}
cat_cols_clf = [col for col in categorical_columns if col in _X.columns]

# кодировка признаков
for col in cat_cols_clf:

    combined = pd.concat([X_train_clf[col], X_test_clf[col]], ignore_index=True).fillna('Unknown').astype(str)
    all_categories = combined.unique()

    X_train_clf[col] = pd.Categorical(
        X_train_clf[col].fillna('Unknown').astype(str),
        categories=all_categories
    ).codes
    
    X_test_clf[col] = pd.Categorical(
        X_test_clf[col].fillna('Unknown').astype(str),
        categories=all_categories
    ).codes

## Классификация

### Стекинг классификатор

In [256]:
stacking_model_clf = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=10, max_depth=4, random_state=42)),
        ('gb', HistGradientBoostingClassifier(max_iter=100,learning_rate=0.1,max_depth=6,random_state=42)),
        ('dt', DecisionTreeClassifier(max_depth=2, random_state=42))
    ],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1),
    n_jobs=-1,
    cv=2  
)
stacking_model_clf.fit(X_train_clf, y_train_clf)
y_pred_stack_clf = stacking_model_clf.predict(X_test_clf)

print(f"acc score: {accuracy_score(y_test_clf, y_pred_stack_clf):.4f}")
print(classification_report(y_test_clf, y_pred_stack_clf))



acc score: 0.9209
              precision    recall  f1-score   support

           0       0.94      0.99      0.97     71061
           1       0.72      0.37      0.49       514
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00        58
           4       0.00      0.00      0.00         3
           5       0.60      0.25      0.35      1916
           6       0.94      0.71      0.81       145
           7       0.73      0.15      0.25      1509
           8       0.65      0.69      0.67      4231
           9       0.00      0.00      0.00        37
          10       0.34      0.08      0.13      1108
          11       0.47      0.16      0.23       103

    accuracy                           0.92     80698
   macro avg       0.45      0.28      0.33     80698
weighted avg       0.91      0.92      0.91     80698



### Бустинг классификатор

In [257]:
boosting_model_clf = HistGradientBoostingClassifier(
   max_iter=100,learning_rate=0.1,max_depth=6,random_state=42
)
boosting_model_clf.fit(X_train_clf, y_train_clf)
y_pred_boost_clf = boosting_model_clf.predict(X_test_clf)
print(f"acc score: {accuracy_score(y_test_clf, y_pred_boost_clf):.4f}")
print(classification_report(y_test_clf, y_pred_boost_clf))

acc score: 0.9176
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     71061
           1       0.68      0.44      0.53       514
           2       0.00      0.00      0.00        13
           3       0.28      0.31      0.30        58
           4       0.00      0.00      0.00         3
           5       0.66      0.20      0.31      1916
           6       0.86      0.86      0.86       145
           7       0.73      0.17      0.28      1509
           8       0.68      0.62      0.65      4231
           9       0.03      0.08      0.05        37
          10       0.50      0.01      0.01      1108
          11       0.24      0.20      0.22       103

    accuracy                           0.92     80698
   macro avg       0.47      0.32      0.35     80698
weighted avg       0.90      0.92      0.90     80698



### Беггинг классификатор

In [258]:
bagging_model_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
bagging_model_clf.fit(X_train_clf, y_train_clf)
y_pred_bag_clf = bagging_model_clf.predict(X_test_clf)
print(f"acc score: {accuracy_score(y_test_clf, y_pred_bag_clf):.4f}")
print(classification_report(y_test_clf, y_pred_bag_clf))

acc score: 0.9412
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     71061
           1       0.80      0.64      0.71       514
           2       0.88      0.54      0.67        13
           3       0.72      0.59      0.65        58
           4       0.50      0.33      0.40         3
           5       0.67      0.48      0.56      1916
           6       0.92      0.92      0.92       145
           7       0.78      0.43      0.55      1509
           8       0.74      0.79      0.76      4231
           9       0.74      0.38      0.50        37
          10       0.58      0.35      0.44      1108
          11       0.79      0.41      0.54       103

    accuracy                           0.94     80698
   macro avg       0.76      0.57      0.64     80698
weighted avg       0.94      0.94      0.94     80698



In [259]:
from joblib import dump

# регрессия 
dump(stacking_model, 'stacking_regressor_price.joblib')
dump(encoders_reg, 'regression_encoders.joblib')
dump(list(X.columns), 'regression_features.joblib')

# классификация
dump(stacking_model_clf, 'stacking_classifier_subtype.joblib')
dump(clf_encoders, 'classification_encoders.joblib')
dump(list(_X.columns), 'classification_features.joblib')
dump(le_y, 'target_encoder_subtype.joblib')

['target_encoder_subtype.joblib']