# Regression. Part 2

---
Author: Durkin Anatoliy

Updated: 31.03.2025

---
В данном ноутбуке ...

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [18]:
def metrics(true, pred):
    print('R2:', r2_score(true, pred))
    print('MAE:', mean_absolute_error(true, pred))
    print('RMSE:', mean_squared_error(true, pred)**0.5)

In [19]:
df = pd.read_csv('house_price_regression_dataset.csv')

In [20]:
df.head()

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,House_Price
0,1360,2,1,1981,0.599637,0,5,262382.9
1,4272,3,3,2016,4.753014,1,6,985260.9
2,3592,1,2,2016,3.634823,0,9,777977.4
3,966,1,2,1977,2.730667,1,8,229698.9
4,4926,2,1,1993,4.699073,0,8,1041741.0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Square_Footage        1000 non-null   int64  
 1   Num_Bedrooms          1000 non-null   int64  
 2   Num_Bathrooms         1000 non-null   int64  
 3   Year_Built            1000 non-null   int64  
 4   Lot_Size              1000 non-null   float64
 5   Garage_Size           1000 non-null   int64  
 6   Neighborhood_Quality  1000 non-null   int64  
 7   House_Price           1000 non-null   float64
dtypes: float64(2), int64(6)
memory usage: 62.6 KB


In [22]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['House_Price'], axis=1), df['House_Price'], test_size=0.2, random_state=42)

# Модели

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
lr = LinearRegression().fit(X_train, y_train)

In [25]:
metrics(y_test, lr.predict(X_test))

R2: 0.9984263636823408
MAE: 8174.583600008741
RMSE: 10071.4844241387


In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
rf = RandomForestRegressor().fit(X_train, y_train)

In [28]:
metrics(y_test, rf.predict(X_test))

R2: 0.9939675768704473
MAE: 15901.387883254707
RMSE: 19719.10495918623


In [29]:
from sklearn.ensemble import GradientBoostingRegressor

In [30]:
gbr = GradientBoostingRegressor().fit(X_train, y_train)

In [31]:
metrics(y_test, gbr.predict(X_test))

R2: 0.9965090698096019
MAE: 12307.411764828263
RMSE: 15000.712324447199


In [32]:
from catboost import CatBoostRegressor

In [33]:
cb = CatBoostRegressor().fit(X_train, y_train, verbose=False)

In [34]:
metrics(y_test, cb.predict(X_test))

R2: 0.9976469255147913
MAE: 9687.789728016342
RMSE: 12315.701033854577


# Отбор признаков

## Прямой отбор

In [35]:
from sklearn.feature_selection import SequentialFeatureSelector

In [36]:
cb = CatBoostRegressor(verbose=False)
sfs = SequentialFeatureSelector(cb, direction='forward')
sfs.fit(X_train, y_train)

In [37]:
sfs.get_support()

array([ True, False, False,  True,  True, False, False])

In [38]:
sfs.get_params()

{'cv': 5,
 'direction': 'forward',
 'estimator__loss_function': 'RMSE',
 'estimator__verbose': False,
 'estimator': <catboost.core.CatBoostRegressor at 0x29be6fe0770>,
 'n_features_to_select': 'auto',
 'n_jobs': None,
 'scoring': None,
 'tol': None}

In [39]:
sfs.transform(X_test)

array([[4.01200000e+03, 2.01600000e+03, 2.09809241e+00],
       [2.31000000e+03, 1.98800000e+03, 1.36962237e+00],
       [4.70800000e+03, 1.96200000e+03, 1.79297022e+00],
       [4.93200000e+03, 1.97200000e+03, 4.47959818e+00],
       [3.64600000e+03, 1.99400000e+03, 3.98098734e+00],
       [3.58600000e+03, 1.96400000e+03, 2.56842869e+00],
       [4.63800000e+03, 2.00000000e+03, 1.49039905e+00],
       [4.12700000e+03, 1.99200000e+03, 1.02615591e+00],
       [3.78100000e+03, 1.98900000e+03, 3.16407579e+00],
       [4.24300000e+03, 2.00200000e+03, 4.49808849e+00],
       [3.61000000e+03, 1.97900000e+03, 3.43441878e+00],
       [3.06800000e+03, 1.98600000e+03, 2.07591346e+00],
       [7.80000000e+02, 2.00800000e+03, 2.59075926e+00],
       [4.92500000e+03, 1.95900000e+03, 4.89664254e+00],
       [2.56500000e+03, 2.00400000e+03, 1.03802899e+00],
       [8.85000000e+02, 1.96000000e+03, 1.79713210e+00],
       [2.18600000e+03, 1.97800000e+03, 5.39177662e-01],
       [1.70500000e+03, 1.97700

## Последовательный отбор

In [40]:
cb = CatBoostRegressor(verbose=False)
sfs = SequentialFeatureSelector(cb, direction='backward')
sfs.fit(X_train, y_train)

In [41]:
sfs.get_support()

array([ True,  True, False,  True,  True, False, False])

## Исчерпывающий выбор

In [42]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector

In [43]:
cb = CatBoostRegressor(verbose=False)
efs = ExhaustiveFeatureSelector(cb, min_features=1, max_features=7, scoring='r2', cv=5)
efs.fit(X_train, y_train)

Features: 127/127

In [44]:
efs.best_score_

np.float64(0.9970567314053543)

In [45]:
efs.best_feature_names_

('Square_Footage',
 'Num_Bedrooms',
 'Num_Bathrooms',
 'Year_Built',
 'Lot_Size',
 'Garage_Size')

In [46]:
efs.subsets_

{0: {'feature_idx': (0,),
  'cv_scores': array([0.97921123, 0.98034293, 0.98241054, 0.97691594, 0.9807523 ]),
  'avg_score': np.float64(0.9799265873009346),
  'feature_names': ('Square_Footage',)},
 1: {'feature_idx': (1,),
  'cv_scores': array([ 0.00075249,  0.00635901, -0.00178436,  0.00829565,  0.00756045]),
  'avg_score': np.float64(0.004236649931976122),
  'feature_names': ('Num_Bedrooms',)},
 2: {'feature_idx': (2,),
  'cv_scores': array([-0.00850559, -0.00065699, -0.00100347,  0.00019372, -0.01260056]),
  'avg_score': np.float64(-0.004514578001016578),
  'feature_names': ('Num_Bathrooms',)},
 3: {'feature_idx': (3,),
  'cv_scores': array([-0.07221276, -0.18134114, -0.08481621, -0.10551191, -0.12634343]),
  'avg_score': np.float64(-0.11404508809386331),
  'feature_names': ('Year_Built',)},
 4: {'feature_idx': (4,),
  'cv_scores': array([-0.09886572, -0.05185182, -0.06426124,  0.06371277, -0.0870423 ]),
  'avg_score': np.float64(-0.04766166076940139),
  'feature_names': ('Lot_Size

# Pipeline

## Pipeline as transformer

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [48]:
simple_imputer = SimpleImputer(strategy='mean')
scaler = MinMaxScaler()

In [49]:
pipe = Pipeline(steps=[('imputer', simple_imputer), ('scaler', scaler)])

In [50]:
pipe.fit(X_train)

In [51]:
pipe.transform(X_test)

array([[0.78047153, 0.5       , 0.        , ..., 0.35394383, 0.5       ,
        0.44444444],
       [0.40191281, 0.5       , 0.        , ..., 0.19116336, 0.5       ,
        0.33333333],
       [0.9352758 , 0.        , 1.        , ..., 0.28576267, 0.5       ,
        0.77777778],
       ...,
       [0.88701068, 0.75      , 0.5       , ..., 0.64467318, 0.        ,
        0.33333333],
       [0.301379  , 0.        , 0.        , ..., 0.90109175, 1.        ,
        0.77777778],
       [0.59875445, 0.        , 0.        , ..., 0.75955225, 1.        ,
        0.        ]], shape=(200, 7))

## Pipeline as model

In [52]:
model = LinearRegression()

In [53]:
pipe = Pipeline(steps=[('imputer', simple_imputer), ('scaler', scaler), ('model', model)])

In [54]:
pipe.fit(X_train, y_train)

In [55]:
metrics(y_test, pipe.predict(X_test))

R2: 0.9984263636823413
MAE: 8174.583600006591
RMSE: 10071.484424137052


## Обработка разнородных данных

In [56]:
df = pd.read_csv('insurance.csv')

In [57]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [58]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['charges'], axis=1), df['charges'], test_size=0.2, random_state=42)

In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [60]:
col_transformer = ColumnTransformer([('num_preproc', MinMaxScaler(), [x for x in X_train.columns if X_train[x].dtype!='object']),
                                     ('cat_preproc', OneHotEncoder(dtype='int'), [x for x in X_train.columns if X_train[x].dtype=='object'])])

In [61]:
pipe = Pipeline([('preproc', col_transformer), ('LR', LinearRegression())])

In [62]:
pipe.fit(X_train, y_train)

In [63]:
metrics(y_test, pipe.predict(X_test))

R2: 0.7835929767120723
MAE: 4181.194473753648
RMSE: 5796.2846592762735


## Подбор гиперпараметров

In [64]:
from sklearn.model_selection import GridSearchCV

In [65]:
pipe = Pipeline([('preproc', col_transformer), ('CatBoost', CatBoostRegressor(verbose=False))])

In [66]:
pipe.fit(X_train, y_train)

In [67]:
pipe.score(X_test, y_test)

np.float64(0.8650932224197827)

In [68]:
param_grid = {
    "CatBoost__iterations": [1000, 2000],
    "CatBoost__learning_rate": [0.01, 0.05],
    "CatBoost__depth": [3, 5, 7]
}
search = GridSearchCV(pipe, param_grid)

In [69]:
search.fit(X_train, y_train)

In [70]:
search.best_score_

np.float64(0.8508020663055372)

In [71]:
search.best_params_

{'CatBoost__depth': 3,
 'CatBoost__iterations': 1000,
 'CatBoost__learning_rate': 0.01}