In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

## COMMON FUNCTIONS

### Data Preparation

In [7]:
def prepare_data(df, target_column):
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y

### Data Splitting

In [8]:
def split_data(df, target, test_size=0.2, random_state=42):
    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
    
df = pd.read_csv("./data/Input_data_filled_encoded.csv")
X_train, X_test, y_train, y_test = split_data(df, target="price")

### Model Evaluation

In [9]:
# Model Evaluation for: LINEAR REGRESSION, DECISION TREE, RANDOM FOREST
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    return {"r2": r2,"mse": mse}

# Model Evaluation for: SUPPORT VECTOR MACHINE, XGBoost
def evaluate_model_scaler(model, scaler, X_test, y_test):
    X_test_scaled = scaler.transform(X_test)  # применяем scaler
    preds = model.predict(X_test_scaled)      # предсказания
    r2 = r2_score(y_test, preds)              # метрики
    rmse = mean_squared_error(y_test, preds, squared=False)
    return {"r2": r2,"rmse": rmse}

### Model Saving

In [10]:
def save_model(model, filename):
    joblib.dump(model, filename)

### Model Loading

In [11]:
def load_model(filename):
    return joblib.load(filename)

### Definition of Overfitting

In [12]:
def overfitting():
    r2_train = r2_score(y_train, model.predict(X_train_scaled))
    r2_test = r2_score(y_test, model.predict(X_test_scaled))
    print("R2 train:", r2_train," R2 test:", r2_test)
    return

# Overfitting, if: R2_train >> R2_test or RMSE_test >> RMSE_train

# MODEL TRAINING

## LINEAR REGRESSION

### Model Training + Evaluation

In [13]:
df = pd.read_csv("./data/Input_data_filled_encoded_std.csv")
df.head(3)

Unnamed: 0,postal_code,price,number_of_bedrooms,living_area,equiped_kitchen,furnished,terrace,garden,swimming_pool,property_type_Apartment_Apartment,...,property_type_Apartment_Studio,property_type_Apartment_Triplex,property_type_House_Bungalow,property_type_House_Chalet,property_type_House_Cottage,property_type_House_Mansion,property_type_House_Master house,property_type_House_Mixed building,property_type_House_Residence,property_type_House_Villa
0,-0.634228,-0.209898,0.232704,-0.446268,-0.664201,-0.140397,0.475381,-1.084134,-0.156896,True,...,False,False,False,False,False,False,False,False,False,False
1,-0.857909,0.168014,0.232704,2.5467,-0.664201,-0.140397,0.475381,0.922331,-0.156896,False,...,False,False,False,False,False,False,False,False,True,False
2,-0.619316,-0.463021,-1.302127,-0.828581,-0.664201,-0.140397,0.475381,-1.084134,-0.156896,True,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Model Training
def train_linear_regression(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model
model = train_linear_regression(X_train, y_train)
# Model Evaluation
print(evaluate_model(model, X_test, y_test))

{'r2': 0.457072387834325, 'mse': 32264780032.228706}


### Model Saving

In [15]:
save_model(model, "linear_model.pkl")

### Model Loading

In [16]:
loaded_model = load_model("linear_model.pkl")

## DECISION TREE

In [17]:
df = pd.read_csv("./data/Input_data_filled_encoded.csv")
df.head(3)

Unnamed: 0,postal_code,price,number_of_bedrooms,living_area,equiped_kitchen,furnished,terrace,garden,swimming_pool,property_type_Apartment_Apartment,...,property_type_Apartment_Studio,property_type_Apartment_Triplex,property_type_House_Bungalow,property_type_House_Chalet,property_type_House_Cottage,property_type_House_Mansion,property_type_House_Master house,property_type_House_Mixed building,property_type_House_Residence,property_type_House_Villa
0,2800,329000,3.0,104,0,0,1,0,0,True,...,False,False,False,False,False,False,False,False,False,False
1,2200,425000,3.0,378,0,0,1,1,0,False,...,False,False,False,False,False,False,False,False,True,False
2,2840,264700,1.0,69,0,0,1,0,0,True,...,False,False,False,False,False,False,False,False,False,False


### Preparation data

In [18]:
X, y = prepare_data(df, "price")

### Splitting Data (train/test)

In [19]:
def split_data(X, y, test_size=0.2, random_state=42):
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

X_train, X_test, y_train, y_test = split_data(X, y)

### Creating a model

In [20]:
def build_decision_tree_model():
    model = DecisionTreeRegressor(
        max_depth=5,            # ограничиваем глубину (борьба с переобучением)
        min_samples_split=10,   # минимальное число выборок для разбиения
        min_samples_leaf=5,     # минимальный размер листа
        random_state=42
    )
    return model
model = build_decision_tree_model()

### Model Training + Evaluation

In [21]:
def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

model = train_model(model, X_train, y_train)
print(evaluate_model(model, X_test, y_test))

{'r2': 0.5229734505728445, 'mse': 28348450773.035313}


### Model Saving

In [22]:
save_model(model, "decision_tree_model.pkl")

### Model Loading

In [23]:
loaded_model = load_model("decision_tree_model.pkl")

## RANDOM FOREST

In [24]:
df = pd.read_csv("./data/Input_data_filled_encoded.csv")
df.head(3)

Unnamed: 0,postal_code,price,number_of_bedrooms,living_area,equiped_kitchen,furnished,terrace,garden,swimming_pool,property_type_Apartment_Apartment,...,property_type_Apartment_Studio,property_type_Apartment_Triplex,property_type_House_Bungalow,property_type_House_Chalet,property_type_House_Cottage,property_type_House_Mansion,property_type_House_Master house,property_type_House_Mixed building,property_type_House_Residence,property_type_House_Villa
0,2800,329000,3.0,104,0,0,1,0,0,True,...,False,False,False,False,False,False,False,False,False,False
1,2200,425000,3.0,378,0,0,1,1,0,False,...,False,False,False,False,False,False,False,False,True,False
2,2840,264700,1.0,69,0,0,1,0,0,True,...,False,False,False,False,False,False,False,False,False,False


### Model Training + Evaluation

In [25]:
def train_random_forest(X_train, y_train,
                        n_estimators=25, # По умолчанию - 200
                        max_depth=None,
                        random_state=42):
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state
    )
    model.fit(X_train, y_train)
    return model

model = train_random_forest(X_train, y_train)
print(evaluate_model(model, X_test, y_test))

{'r2': 0.6725733580549771, 'mse': 19458116224.56909}


### Model Saving

In [26]:
save_model(model, "random_forest_model.pkl")

### Model Loading

In [27]:
loaded_model = load_model("random_forest_model.pkl")

## SUPPORT VECTOR MACHINE

In [28]:
df = pd.read_csv("./data/Input_data_filled_encoded_std.csv")
df.head(3)

Unnamed: 0,postal_code,price,number_of_bedrooms,living_area,equiped_kitchen,furnished,terrace,garden,swimming_pool,property_type_Apartment_Apartment,...,property_type_Apartment_Studio,property_type_Apartment_Triplex,property_type_House_Bungalow,property_type_House_Chalet,property_type_House_Cottage,property_type_House_Mansion,property_type_House_Master house,property_type_House_Mixed building,property_type_House_Residence,property_type_House_Villa
0,-0.634228,-0.209898,0.232704,-0.446268,-0.664201,-0.140397,0.475381,-1.084134,-0.156896,True,...,False,False,False,False,False,False,False,False,False,False
1,-0.857909,0.168014,0.232704,2.5467,-0.664201,-0.140397,0.475381,0.922331,-0.156896,False,...,False,False,False,False,False,False,False,False,True,False
2,-0.619316,-0.463021,-1.302127,-0.828581,-0.664201,-0.140397,0.475381,-1.084134,-0.156896,True,...,False,False,False,False,False,False,False,False,False,False


### Preparation data

In [29]:
X, y = prepare_data(df, "price")

### Train/Test split

In [30]:
def split_data(X, y, test_size=0.2, random_state=42):
    return train_test_split(X, y, test_size=test_size, random_state=random_state)
X_train, X_test, y_train, y_test = split_data(X, y)

### Creating a model

In [31]:
def build_svm_model():
    model = SVR(
        kernel='rbf',
        C=10,          # степень штрафа
        gamma='scale', # параметр ядра
        epsilon=0.1    # допуск отклонений
    )
    return model
model = build_svm_model()

### Scaling + Training + Evaluation

In [32]:
from sklearn.preprocessing import StandardScaler
def train_model(model, X_train, y_train): # Масштабирует признаки и обучает SVR.Возвращает обученный scaler и модель.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    model.fit(X_train_scaled, y_train)
    return scaler, model
scaler, model = train_model(model, X_train, y_train)

print(evaluate_model_scaler(model, scaler, X_test, y_test))

TypeError: got an unexpected keyword argument 'squared'

### Model Saving

In [None]:
def save_model_SVM(model, scaler, filepath="svm_regressor.pkl"):
    joblib.dump({"model": model, "scaler": scaler}, filepath)
    return
save_model_SVM(model, scaler)

### Model Loading

In [None]:
def load_model(filepath="svm_regressor.pkl"):
    saved = joblib.load(filepath)
    return saved["model"], saved["scaler"]

## XGBoost MODEL

In [33]:
df = pd.read_csv("./data/Input_data_filled.csv")
df.head(3)

Unnamed: 0,postal_code,price,number_of_bedrooms,living_area,equiped_kitchen,furnished,terrace,garden,swimming_pool,property_type
0,2800,329000,3.0,104,0,0,1,0,0,Apartment_Apartment
1,2200,425000,3.0,378,0,0,1,1,0,House_Residence
2,2840,264700,1.0,69,0,0,1,0,0,Apartment_Apartment


### Model Building + Training + Evaluation

In [34]:
def build_xgb_model():
    model = XGBRegressor(
    n_estimators=775, # Количество деревьев в ансамбле, чем больше, тем выше качество, но медленнее обучение, типовое значение: 100–1000
    learning_rate=0.0125, # Скорость обучения, Маленькое значение = модель учится медленно, но устойчиво, большое значение = модель быстрее переобучается, Обычно 0.01–0.3
    max_depth=6, # Максимальная глубина каждого дерева, глубокие деревья → модель точнее, но легко переобучается, Мелкие деревья → более обобщающие. Типичный диапазон: 3–10
    subsample=0.8, # Доля строк (обучающих наблюдений), используемая для каждого дерева. 0.8 = каждое дерево обучается на 80% случайно выбранных данных. Помогает бороться с переобучением
    colsample_bytree=0.78, # Доля признаков (features), используемых для построения каждого дерева. 0.8 = использовать 80% случайных колонок на дерево. Помогает снизить переобучение и ускорить обучение
    objective="reg:squarederror",
    random_state=42 # Фиксация случайности. Нужна, чтобы модель каждый раз давала одинаковый результат при одних и тех же данных
    )
    return model

def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

model = build_xgb_model()
model = train_model(model, X_train, y_train)
print(evaluate_model(model, X_test, y_test))

{'r2': 0.7048772316545796, 'mse': 0.271787180919992}


### Model Saving

In [35]:
save_model(model, "price_xgb.pkl")

### Model Loading

In [36]:
loaded_model = load_model("price_xgb.pkl")