In [5]:
# Core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from scipy.stats import randint, uniform

pd.options.display.float_format = '{:.2f}'.format

## IMPORT & EXPLORE

#### FUNCTIONS

In [48]:
def regression_report(dependent_test, 
                      estimator, 
                      independent_test,
                      predictions_plot=True):
    """ 
    This function calculates and prints basic performance metrics for regression evaluation.
    Additionally, it plots predicted vs actual values if `predictions_plot` is True.

    Parameters:
        dependent_test (array-like): Ground truth target values (y_test).
        estimator (model): Trained regression model.
        independent_test (array-like): Test input data (X_test).
        predictions_plot (bool): Whether to plot predicted vs actual values. Default is True.
    """
    
    # Calculate predictions
    predictions = estimator.predict(independent_test)
    
    # Performance Metrics
    print(f'R2 : {metrics.r2_score(dependent_test, predictions)}')
    print(f'MAE : {metrics.mean_absolute_error(dependent_test, predictions)}')
    print(f'MSE : {metrics.mean_squared_error(dependent_test, predictions)}')
    print(f'RMSE : {np.sqrt(metrics.mean_squared_error(dependent_test, predictions))}')
    
    # Plotting Predictions
    if predictions_plot:
        plt.figure(figsize=(12, 6))
        plt.scatter(dependent_test, predictions, alpha=0.6, edgecolor='k', label='Predictions')
        plt.plot([dependent_test.min(), dependent_test.max()],
                 [dependent_test.min(), dependent_test.max()],
                 color='red', linestyle='--', label='Ideal Fit')
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.title('Actual vs Predicted Values')
        plt.legend()
        plt.grid(alpha=0.3)

#### DATA

In [2]:
krisha_almaty_rental = pd.read_csv(r"C:\Users\User\Desktop\DATA SCIENCE\Github\krisha_bot\data\original_source\krisha_almaty_rental.csv.gz")

In [6]:
krisha_almaty_rental_filtered = krisha_almaty_rental[['floor', 'total_floors', 'area_sqm', 'rooms', 'price',
                                                       'full_address_code', 'furniture_code', 'parking_code', 'security_code',
                                                       'bathroom_code']]

In [13]:
krisha_almaty_rental_price_corr = krisha_almaty_rental_filtered.corr()

krisha_almaty_rental_price_corr[['price']].sort_values( by = 'price',
                                                        ascending = False)


Unnamed: 0,price
price,1.0
area_sqm,0.64
rooms,0.57
total_floors,0.49
floor,0.33
security_code,0.11
parking_code,0.02
full_address_code,-0.01
bathroom_code,-0.12
furniture_code,-0.28


## CLEAN & PREPARE

In [14]:
krisha_almaty_rental_filtered = krisha_almaty_rental_filtered.drop(columns = ['parking_code','full_address_code','security_code',
                                                                              'furniture_code'])

In [20]:
krisha_almaty_rental_filtered_independent = krisha_almaty_rental_filtered.drop(columns = ['price'])

krisha_almaty_rental_filtered_dependent = krisha_almaty_rental_filtered['price']


In [85]:
from sklearn.model_selection import train_test_split

krisha_almaty_rental_filtered_independent_train , krisha_almaty_rental_filtered_independent_test , krisha_almaty_rental_filtered_dependent_train , krisha_almaty_rental_filtered_dependent_test = train_test_split(
    krisha_almaty_rental_filtered_independent,
    krisha_almaty_rental_filtered_dependent,
    train_size = 0.95 ,
    random_state = 42
) 

In [86]:
print(f'INDEPENDENT SUBSETS SHAPES (TRAIN - TEST) : {krisha_almaty_rental_filtered_independent_train.shape,krisha_almaty_rental_filtered_independent_test.shape}')

INDEPENDENT SUBSETS SHAPES (TRAIN - TEST) : ((5828, 5), (307, 5))


## MODEL TRAIN

In [87]:
from sklearn import metrics

from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.model_selection import RandomizedSearchCV




#### HGB

In [88]:
krisha_almaty_rental_hgb_param_distributions = {'learning_rate': [0.01, 0.05, 0.1, 0.2],
                                                'max_iter': [100, 300, 500, 1000],
                                                'max_leaf_nodes': [15, 31, 63, 127],
                                                'min_samples_leaf': [10, 20, 50, 100],
                                                'max_depth': [None, 10, 20, 30],
                                                'l2_regularization': [0.0, 0.1, 0.5, 1.0],
                                                'early_stopping': [True, False]}

In [89]:
krisha_almaty_rental_hgb = HistGradientBoostingRegressor()

In [90]:
krisha_almaty_rental_hgb_random_search_cv = RandomizedSearchCV( estimator = krisha_almaty_rental_hgb,
                                                                param_distributions = krisha_almaty_rental_hgb_param_distributions,
                                                                n_iter = 30,
                                                                cv = 7,
                                                                scoring = 'neg_mean_absolute_error',
                                                                verbose = 1 )

In [91]:
krisha_almaty_rental_hgb_random_search_cv.fit( krisha_almaty_rental_filtered_independent_train,
                                               krisha_almaty_rental_filtered_dependent_train )

Fitting 7 folds for each of 30 candidates, totalling 210 fits


In [92]:
krisha_almaty_rental_hgb_random_search_cv.best_params_

{'min_samples_leaf': 20,
 'max_leaf_nodes': 31,
 'max_iter': 1000,
 'max_depth': None,
 'learning_rate': 0.01,
 'l2_regularization': 0.5,
 'early_stopping': False}

In [93]:
krisha_almaty_rental_hgb_random_search_cv.best_score_

-57066.22750138428

#### ExtraTrees

In [94]:
krisha_almaty_rental_extra_trees = ExtraTreesRegressor()

In [95]:
krisha_almaty_rental_extra_trees_param_distributions = {'n_estimators': [100, 300, 500, 800, 1200],
                                                        'max_depth': [None, 10, 20, 30, 50, 70],
                                                        'min_samples_split': [2, 5, 10, 15],
                                                        'min_samples_leaf': [1, 2, 5, 10],
                                                        'max_features': ['sqrt', 'log2'],
                                                        'bootstrap': [True, False]}

In [96]:
krisha_almaty_rental_extra_trees_random_search_cv = RandomizedSearchCV( estimator = krisha_almaty_rental_extra_trees,
                                                                        param_distributions = krisha_almaty_rental_extra_trees_param_distributions,
                                                                        n_iter = 15,
                                                                        cv = 6,
                                                                        scoring = 'neg_mean_absolute_error',
                                                                        verbose = 1 )

In [97]:
krisha_almaty_rental_extra_trees_random_search_cv.fit( krisha_almaty_rental_filtered_independent_train,
                                                       krisha_almaty_rental_filtered_dependent_train )

Fitting 6 folds for each of 15 candidates, totalling 90 fits


In [98]:
krisha_almaty_rental_extra_trees_random_search_cv.best_params_

{'n_estimators': 300,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 20,
 'bootstrap': False}

In [99]:
krisha_almaty_rental_extra_trees_random_search_cv.best_score_

-57940.99954880297

#### Stacking

In [105]:
krisha_almaty_rental_base_learners = [
    ('hgb', HistGradientBoostingRegressor(
        min_samples_leaf=20,
        max_leaf_nodes=31,
        max_iter=1000,
        max_depth=None,
        learning_rate=0.01,
        l2_regularization=0.5,
        early_stopping=False
    )),
    ('etr', ExtraTreesRegressor(
        n_estimators=300,
        min_samples_split=10,
        min_samples_leaf=1,
        max_features='log2',
        max_depth=20,
        bootstrap=False
    ))
]

In [106]:
krisha_almaty_rental_stacking = StackingRegressor( estimators = krisha_almaty_rental_base_learners,
                                                   cv = 5 )

In [107]:
krisha_almaty_rental_stacking

In [108]:
krisha_almaty_rental_stacking.fit( krisha_almaty_rental_filtered_independent_train,
                                   krisha_almaty_rental_filtered_dependent_train )

### EVALUATE

#### HGB

In [112]:
krisha_almaty_rental_hgb_best_estimator = HistGradientBoostingRegressor(
        min_samples_leaf=20,
        max_leaf_nodes=31,
        max_iter=1000,
        max_depth=None,
        learning_rate=0.01,
        l2_regularization=0.5,
        early_stopping=False)

In [123]:
from sklearn.model_selection import cross_val_score

krisha_almaty_rental_hgb_best_estimator_scores = cross_val_score( X = krisha_almaty_rental_filtered_independent_test,
                                                                  y = krisha_almaty_rental_filtered_dependent_test,
                                                                  cv = 10,
                                                                  scoring = "neg_mean_absolute_error",
                                                                  estimator = krisha_almaty_rental_hgb_best_estimator )


In [125]:
krisha_almaty_rental_hgb_best_estimator_scores.mean()

-66926.57812225647

#### ExtraTrees

In [126]:
krisha_almaty_rental_extra_trees_best_estimator = ExtraTreesRegressor(
        n_estimators=300,
        min_samples_split=10,
        min_samples_leaf=1,
        max_features='log2',
        max_depth=20,
        bootstrap=False)

In [127]:
krisha_almaty_rental_extra_trees_best_estimator_scores = cross_val_score( X = krisha_almaty_rental_filtered_independent_test,
                                                                          y = krisha_almaty_rental_filtered_dependent_test,
                                                                          cv = 10,
                                                                          scoring = "neg_mean_absolute_error",
                                                                          estimator = krisha_almaty_rental_extra_trees_best_estimator )


In [128]:
krisha_almaty_rental_extra_trees_best_estimator_scores.mean()

-63703.55563420664

#### Stacking

In [130]:
krisha_almaty_rental_stacking_scores = cross_val_score( X = krisha_almaty_rental_filtered_independent_test,
                                                        y = krisha_almaty_rental_filtered_dependent_test,
                                                        cv = 10,
                                                        scoring = "neg_mean_absolute_error",
                                                        estimator = krisha_almaty_rental_stacking )


In [131]:
krisha_almaty_rental_stacking_scores.mean()

-63315.08310731826

In [133]:
for score in [krisha_almaty_rental_hgb_best_estimator_scores,
              krisha_almaty_rental_extra_trees_best_estimator_scores,
              krisha_almaty_rental_stacking_scores]:

    print(score.mean())

-66926.57812225647
-63703.55563420664
-63315.08310731826


## SAVE

In [134]:
from joblib import dump

dump(krisha_almaty_rental_stacking,
     "krisha_almaty_rental_stacking.joblib")


['krisha_almaty_rental_stacking.joblib']

In [136]:
#### TEST LOADING

from joblib import load

krisha_almaty_rental_stacking = load(r"C:\Users\User\Desktop\DATA SCIENCE\Github\krisha_bot\models\krisha_almaty_rental_stacking.joblib")


In [137]:
krisha_almaty_rental_stacking

In [145]:

test_dict = { 'floor' : [3], 
              'total_floors': [5], 
              'area_sqm' : [55], 
              'rooms' : [3], 
              'bathroom_code' : [0] } 

test_dict2 = { 'floor' : [10], 
              'total_floors': [12], 
              'area_sqm' : [30], 
              'rooms' : [1], 
              'bathroom_code' : [1] } 


In [143]:
krisha_almaty_rental_stacking.predict(pd.DataFrame(test_dict))

array([326694.08489549])

In [146]:
krisha_almaty_rental_stacking.predict(pd.DataFrame(test_dict2))

array([236912.36577887])