In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder, TargetEncoder
from sklearn.model_selection import GridSearchCV, KFold
import category_encoders as ce
# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostRegressor 
from lightgbm import LGBMRegressor as lgb
from xgboost import XGBRegressor as xgb
from sklearn.ensemble import GradientBoostingRegressor

# Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error 


# tunning hyperparamters model
import optuna

In [2]:
path = "/Users/verakabanova/Downloads/house-prices-advanced-regression-techniques/train.csv"

In [3]:
df = pd.read_csv(path)
df


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [700]:
# df[['GarageFinish', 'Street', 'FireplaceQu', 'MasVnrType','BsmtFinType1', 'BsmtExposure', 'CentralAir','GarageType', 'PoolQC','Fence','MiscFeature' ]] = df[['GarageFinish', 'Street', 'FireplaceQu', 'MasVnrType','BsmtFinType1', 'BsmtExposure', 'CentralAir','GarageType', 'PoolQC','Fence','MiscFeature' ]].fillna('no')

In [4]:
missing_percent = (df.isnull().sum() / len(df)) * 100
missing_df = pd.DataFrame({
    'Пропущено_значений': df.isnull().sum(),
    'Процент_пропусков': missing_percent
})

 #процента пропусков
missing_df = missing_df.sort_values('Процент_пропусков', ascending=False)

print("Таблица пропусков (отсортированная):")
print(missing_df.iloc[: 20])

Таблица пропусков (отсортированная):
              Пропущено_значений  Процент_пропусков
PoolQC                      1453          99.520548
MiscFeature                 1406          96.301370
Alley                       1369          93.767123
Fence                       1179          80.753425
MasVnrType                   872          59.726027
FireplaceQu                  690          47.260274
LotFrontage                  259          17.739726
GarageQual                    81           5.547945
GarageFinish                  81           5.547945
GarageType                    81           5.547945
GarageYrBlt                   81           5.547945
GarageCond                    81           5.547945
BsmtFinType2                  38           2.602740
BsmtExposure                  38           2.602740
BsmtCond                      37           2.534247
BsmtQual                      37           2.534247
BsmtFinType1                  37           2.534247
MasVnrArea                 

In [702]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
drop_columns = ["PoolQC","MiscFeature","Alley", "Fence", "MasVnrType","FireplaceQu", "LotFrontage","GarageYrBlt","TotRmsAbvGrd","GarageCars","TotalBsmtSF"]
df_copy = df.drop(drop_columns, axis=1)
print(df_copy.iloc[: 20])

    Id  MSSubClass MSZoning  LotArea Street LotShape LandContour Utilities  \
0    1          60       RL     8450   Pave      Reg         Lvl    AllPub   
1    2          20       RL     9600   Pave      Reg         Lvl    AllPub   
2    3          60       RL    11250   Pave      IR1         Lvl    AllPub   
3    4          70       RL     9550   Pave      IR1         Lvl    AllPub   
4    5          60       RL    14260   Pave      IR1         Lvl    AllPub   
5    6          50       RL    14115   Pave      IR1         Lvl    AllPub   
6    7          20       RL    10084   Pave      Reg         Lvl    AllPub   
7    8          60       RL    10382   Pave      IR1         Lvl    AllPub   
8    9          50       RM     6120   Pave      Reg         Lvl    AllPub   
9   10         190       RL     7420   Pave      Reg         Lvl    AllPub   
10  11          20       RL    11200   Pave      Reg         Lvl    AllPub   
11  12          60       RL    11924   Pave      IR1         Lvl

In [6]:
percent = (df_copy.isnull().sum() / len(df_copy)) * 100
df_new = pd.DataFrame({
    'Пропущено_значений': df_copy.isnull().sum(),
    'Процент_пропусков': percent
})

 #процента пропусков
df_new= df_new.sort_values(by='Процент_пропусков', ascending=False, axis=0)

print("Таблица пропусков (отсортированная):")
print(df_new.iloc[: 20])

Таблица пропусков (отсортированная):
              Пропущено_значений  Процент_пропусков
GarageCond                    81           5.547945
GarageQual                    81           5.547945
GarageFinish                  81           5.547945
GarageType                    81           5.547945
BsmtExposure                  38           2.602740
BsmtFinType2                  38           2.602740
BsmtFinType1                  37           2.534247
BsmtCond                      37           2.534247
BsmtQual                      37           2.534247
MasVnrArea                     8           0.547945
Electrical                     1           0.068493
Id                             0           0.000000
MSSubClass                     0           0.000000
MSZoning                       0           0.000000
LotShape                       0           0.000000
Street                         0           0.000000
LotArea                        0           0.000000
LotConfig                  

In [7]:
X,y = df.drop("SalePrice", axis=1),df["SalePrice"]

In [706]:
# plt.figure(figsize=(20, 18))
# sns.heatmap(number_features.corr(), annot=True, fmt='.2f', cmap='coolwarm')
# plt.title('Корреляция числовых признаков')
# plt.show()

In [707]:
# y =df_copy["SalePrice"]
# plt.figure()
# plt.scatter(range(len(y)), y)
# plt.xlabel("Индекс наблюдения")
# plt.ylabel("Значение целевой переменной")
# plt.title("Scatter plot целевой переменной HasDetections")
# plt.grid(True)
# plt.show()

In [713]:
df_copy.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [714]:
# df_copy["MSZoning"].value_counts()

In [8]:
X_train, X_valid, y_train, y_valid =  train_test_split(X,y, test_size=0.2, random_state=42)

In [719]:
# number_features= X.select_dtypes(include=["int64","float64"]).columns.tolist()
# category_featerus = X.select_dtypes(include="object").columns.tolist()

In [9]:
numerical_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [10]:
drop_columns = ["PoolQC","MiscFeature","Alley", "Fence", "MasVnrType","FireplaceQu", "LotFrontage","GarageYrBlt","TotRmsAbvGrd","GarageCars","TotalBsmtSF"]
imputer = ColumnTransformer(
    transformers= [
        ("numerical_features", SimpleImputer(strategy ="median"), numerical_features),
        ("categorical_features", SimpleImputer(strategy="most_frequent"), categorical_features),
        ("drop", "drop", drop_columns)],
    verbose_feature_names_out=False,
    remainder= 'passthrough'
    
)

In [722]:
# feature_columns = df_copy.columns.drop('SalePrice', errors='ignore')  # или просто data.columns, если все — признаки

# # Словарь: ключ — количество уникальных значений, значение — список имён столбцов
# columns_by_nunique = {}

# for col in feature_columns:
#     n_unique = df_copy[col].nunique(dropna=True)  # dropna=True — по умолчанию, но явно указываем
#     if n_unique not in columns_by_nunique:
#         columns_by_nunique[n_unique] = []
#     columns_by_nunique[n_unique].append(col)

# # Теперь можно создать отдельные списки, например:
# list_3 = columns_by_nunique.get(3, [])
# list_4 = columns_by_nunique.get(4, [])
# list_5 = columns_by_nunique.get(5, [])

In [11]:
encoder_and_scaler =ColumnTransformer(
transformers=[
        ('encoder', ce.CatBoostEncoder(),categorical_features),
        ('scaler', StandardScaler(), numerical_features)
    ],
    verbose_feature_names_out=False,
    remainder= 'passthrough'
)

In [12]:
preprocessor = Pipeline([
    ("imputer", imputer), 
    ("encoder_and_scaler", encoder_and_scaler)
]
)

In [725]:
preprocessor

0,1,2
,steps,"[('imputer', ...), ('encoder_and_scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical_features', ...), ('categorical_features', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('encoder', ...), ('scaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,random_state,
,sigma,
,a,1

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [726]:
X_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [727]:
print(categorical_features)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [13]:
X_train_prep = preprocessor.fit_transform(X_train, y_train)
X_valid_prep = preprocessor.fit_transform(X_valid, y_valid)

In [729]:
# models = {
#     "linereg": LinearRegression(),
#     "Ridge": Ridge(),
#     "Lasso": Lasso(),
#     "RandomForest": RandomForestRegressor(),
#     "GradientBoosting": GradientBoostingRegressor(),
#     "KNN": KNeighborsRegressor(),
#     "XGB": XGBRegressor(),
#     "LGBM": LGBMRegressor()}
# result = {}

# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_valid)
    
#     r2 = r2_score(y_valid, y_pred)
#     rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
#     mae = mean_absolute_error(y_valid, y_pred)
#     rmlse = np.sqrt(mean_squared_log_error(y_valid, y_pred))
    
    
#     result[name]={
#         "model": model,
#         "r2": r2,
#         "rmse": rmse,
#         "mae": mae,
#         "rmlse": rmlse        
#     }
#     print(f" {name} / {r2:.4f} / {rmse:.4f} / {mae:.4f} / {rmlse:.4f} ")
    
# result_df = pd.DataFrame([{
#     "model": name,
#     "r2": metrics["r2"],
#     "rmse": metrics["rmse"],
#     "mae": metrics["mae"],
#     "rmlse": metrics["rmlse"],
# }
# for name, metrics in result.items()
                              
# ]) 

# print(result_df.to_string(index=False))

In [14]:
models = {
    "linereg": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "KNN": KNeighborsRegressor(),
    "XGB": xgb(),
    "LGBM": lgb()#LGBMRegressor()
}

result = {}

for name, model in models.items():
    # Обучаем на ПОДГОТОВЛЕННЫХ данных
    model.fit(X_train_prep, y_train)
    y_pred = model.predict(X_valid_prep)
    
    r2 = r2_score(y_valid, y_pred)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    mae = mean_absolute_error(y_valid, y_pred)
    
    # RMSLE только если все значения > 0
    try:
        rmlse = np.sqrt(mean_squared_log_error(y_valid, y_pred))
    except ValueError:
        rmlse = float('nan')
    
    result[name] = {
        "r2": r2,
        "rmse": rmse,
        "mae": mae,
        "rmlse": rmlse
    }
    print(f"{name} / R2: {r2:.4f} / RMSE: {rmse:.4f} / MAE: {mae:.4f} / RMSLE: {rmlse:.4f}")

linereg / R2: 0.8428 / RMSE: 34728.8882 / MAE: 22138.0353 / RMSLE: 0.1734
Ridge / R2: 0.8427 / RMSE: 34731.6586 / MAE: 22137.1647 / RMSLE: 0.1734
Lasso / R2: 0.8432 / RMSE: 34675.7143 / MAE: 22127.4826 / RMSLE: 0.1738


  model = cd_fast.enet_coordinate_descent(


RandomForest / R2: 0.8799 / RMSE: 30355.6962 / MAE: 18999.9938 / RMSLE: 0.1616
GradientBoosting / R2: 0.9099 / RMSE: 26285.0410 / MAE: 17759.6125 / RMSLE: 0.1461
KNN / R2: 0.6987 / RMSE: 48073.8332 / MAE: 30013.9096 / RMSLE: 0.2347
XGB / R2: 0.8547 / RMSE: 33389.3319 / MAE: 21010.5215 / RMSLE: 0.1692
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001763 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14137
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 79
[LightGBM] [Info] Start training from score 181441.541952
LGBM / R2: 0.8837 / RMSE: 29871.7078 / MAE: 18263.1844 / RMSLE: 0.1514


In [None]:

models = {
    "linereg": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "KNN": KNeighborsRegressor(),
    "XGB": xgb.XGBRegressor(random_state=42, verbosity=0),
    "LGBM": lgb.LGBMRegressor(random_state=42, verbose=-1)
}

result = {}

print(f"{'Модель':<20} {'R2':<10} {'RMSE':<10} {'MAE':<10} {'RMSLE':<10}")


for name, model in models.items():
    try:
        # Обучаем 
        model.fit(X_train_prep, y_train)
        y_pred = model.predict(X_valid_prep)
        
        r2 = r2_score(y_valid, y_pred)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        mae = mean_absolute_error(y_valid, y_pred)
        
        # RMSLE только если все значения > 0
        if (y_valid > 0).all() and (y_pred > 0).all():
            rmlse = np.sqrt(mean_squared_log_error(y_valid, y_pred))
        else:
            rmlse = float('nan')
        
        result[name] = {
            "model": model,
            "r2": r2,
            "rmse": rmse,
            "mae": mae,
            "rmlse": rmlse,
            "predictions": y_pred
        }
        print(f"{name:<20} {r2:<10.4f} {rmse:<10.4f} {mae:<10.4f} {rmlse:<10.4f}")
    except Exception as e:
        print(f"{name:<20} Ошибка: {str(e)[:30]}")



def objective(trial, X_train, y_train, X_test, y_test, model_type):
    """
    Функция для оптимизации гиперпараметров с помощью Optuna
    """
    
    if model_type == "RF":
        model = RandomForestRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            max_depth=trial.suggest_int('max_depth', 3, 15),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            random_state=42,
            n_jobs=-1
        )
        
    elif model_type == 'GradientBoosting':
        model = GradientBoostingRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            max_depth=trial.suggest_int('max_depth', 3, 10),    
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            random_state=42
        )

    elif model_type == 'Cat':
        model = CatBoostRegressor(
            iterations=trial.suggest_int('iterations', 100, 500),
            depth=trial.suggest_int('depth', 3, 10),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 10),
            verbose=False,
            random_state=42
        )
        
    elif model_type == 'LGBM':
        model = lgb.LGBMRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            max_depth=trial.suggest_int('max_depth', 3, 15),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            num_leaves=trial.suggest_int('num_leaves', 20, 100),
            min_child_samples=trial.suggest_int('min_child_samples', 5, 50),
            random_state=42,
            verbose=-1
        )
        
    elif model_type == 'XGB':
        model = xgb.XGBRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            max_depth=trial.suggest_int('max_depth', 3, 15),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            subsample=trial.suggest_float('subsample', 0.5, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
            random_state=42,
            verbosity=0
        )
    else:
        raise ValueError(f"Неизвестный тип модели: {model_type}")
    
    # Обучение модели
    model.fit(X_train, y_train)
    
    # Предсказание и оценка
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    
    return score


models_to_optimize = ['RF', 'GradientBoosting', 'XGB', 'LGBM', 'Cat']



optuna_results = {}

for model_name in models_to_optimize:
    try:
        print(f"\nОптимизация модели: {model_name}")
        
        # Создаем исследование
        study = optuna.create_study(direction='maximize')
        
        # Оптимизируем
        study.optimize(
            lambda trial: objective(
                trial, X_train_prep, y_train, X_valid_prep, y_valid, model_name
            ), 
            n_trials=20,
            show_progress_bar=True
        )
        
        # Сохраняем результаты
        optuna_results[model_name] = {
            'best_value': study.best_value,
            'best_params': study.best_params,
            'study': study
        }
        
        print(f"  Лучший R2: {study.best_value:.4f}")
        print(f"  Лучшие параметры: {study.best_params}")
        
    except Exception as e:
        print(f"  Ошибка при оптимизации {model_name}: {str(e)[:50]}")



best_models = {}

for model_name, result_data in optuna_results.items():
    try:
        print(f"\nОбучение лучшей модели {model_name}...")
        
        if model_name == "RF":
            model = RandomForestRegressor(
                **result_data['best_params'],
                random_state=42,
                n_jobs=-1
            )
        elif model_name == 'GradientBoosting':
            model = GradientBoostingRegressor(
                **result_data['best_params'],
                random_state=42
            )
        elif model_name == 'Cat':
            model = CatBoostRegressor(
                **result_data['best_params'],
                verbose=False,
                random_state=42
            )
        elif model_name == 'LGBM':
            model = lgb.LGBMRegressor(
                **result_data['best_params'],
                random_state=42,
                verbose=-1
            )
        elif model_name == 'XGB':
            model = xgb.XGBRegressor(
                **result_data['best_params'],
                random_state=42,
                verbosity=0
            )
        
        # Обучаем модель
        model.fit(X_train_prep, y_train)
        
        # Делаем предсказания
        y_pred_train = model.predict(X_train_prep)
        y_pred_valid = model.predict(X_valid_prep)
        
        # Считаем метрики
        r2_train = r2_score(y_train, y_pred_train)
        r2_valid = r2_score(y_valid, y_pred_valid)
        rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
        
        # Сохраняем модель
        best_models[f"{model_name}_optimized"] = {
            'model': model,
            'r2_train': r2_train,
            'r2_valid': r2_valid,
            'rmse_valid': rmse_valid,
            'params': result_data['best_params']
        }
        
        print(f"  R2 на обучении: {r2_train:.4f}")
        print(f"  R2 на валидации: {r2_valid:.4f}")
        print(f"  RMSE на валидации: {rmse_valid:.4f}")
        
    except Exception as e:
        print(f"  Ошибка при обучении {model_name}: {str(e)[:50]}")


base_results_df = pd.DataFrame([
    {
        'Model': name,
        'Type': 'Baseline',
        'R2': metrics['r2'],
        'RMSE': metrics['rmse'],
        'MAE': metrics['mae']
    }
    for name, metrics in result.items() if 'r2' in metrics
])

# Создаем DataFrame с результатами оптимизированных моделей
opt_results_df = pd.DataFrame([
    {
        'Model': name,
        'Type': 'Optimized',
        'R2': metrics['r2_valid'],
        'RMSE': metrics['rmse_valid'],
        'MAE': np.nan  
    }
    for name, metrics in best_models.items()
])

# Объединяем результаты
all_results_df = pd.concat([base_results_df, opt_results_df], ignore_index=True)
all_results_df = all_results_df.sort_values('R2', ascending=False).reset_index(drop=True)

print(all_results_df.to_string(index=False))


# Находим лучшую модель
best_result = all_results_df.iloc[0]
best_model_name = best_result['Model']
best_model_type = best_result['Type']


print(f"ЛУЧШАЯ МОДЕЛЬ: {best_model_name} ({best_model_type})")
print(f"R2: {best_result['R2']:.4f}")
print(f"RMSE: {best_result['RMSE']:.4f}")


# Сохраняем лучшую модель
import joblib

if best_model_type == 'Baseline':
    best_model = result[best_model_name]['model']
elif best_model_type == 'Optimized':
    # Убираем суффикс _optimized
    base_name = best_model_name.replace('_optimized', '')
    best_model = best_models[best_model_name]['model']

# Сохраняем модель
joblib.dump(best_model, 'best_model.pkl')
print("Лучшая модель сохранена в файл 'best_model.pkl'")

In [None]:

plt.figure(figsize=(12, 6))

# Базовые модели
base_models = [r for r in all_results_df[all_results_df['Type'] == 'Baseline'].itertuples()]
opt_models = [r for r in all_results_df[all_results_df['Type'] == 'Optimized'].itertuples()]

x_base = np.arange(len(base_models))
x_opt = np.arange(len(opt_models)) + len(base_models) + 0.5

plt.bar(x_base, [m.R2 for m in base_models], width=0.4, label='Базовые', alpha=0.7)
plt.bar(x_opt, [m.R2 for m in opt_models], width=0.4, label='Оптимизированные', alpha=0.7)

plt.xlabel('Модели')
plt.ylabel('R2 Score')
plt.title('Сравнение моделей (чем выше, тем лучше)')
plt.xticks(np.concatenate([x_base, x_opt]), 
           [m.Model for m in base_models] + [m.Model for m in opt_models],
           rotation=45, ha='right')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
