In [627]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder, TargetEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, KFold
import category_encoders as ce
# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error 


# tunning hyperparamters model
import optuna

In [628]:
path = "D:/phase_1_repeat/project_2_house/train.csv"

In [629]:
df = pd.read_csv(path)
df


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [630]:
# df[['GarageFinish', 'Street', 'FireplaceQu', 'MasVnrType','BsmtFinType1', 'BsmtExposure', 'CentralAir','GarageType', 'PoolQC','Fence','MiscFeature' ]] = df[['GarageFinish', 'Street', 'FireplaceQu', 'MasVnrType','BsmtFinType1', 'BsmtExposure', 'CentralAir','GarageType', 'PoolQC','Fence','MiscFeature' ]].fillna('no')

In [631]:
missing_percent = (df.isnull().sum() / len(df)) * 100
missing_df = pd.DataFrame({
    'Пропущено_значений': df.isnull().sum(),
    'Процент_пропусков': missing_percent
})

 #процента пропусков
missing_df = missing_df.sort_values('Процент_пропусков', ascending=False)

print("Таблица пропусков (отсортированная):")
print(missing_df.iloc[: 20])

Таблица пропусков (отсортированная):
              Пропущено_значений  Процент_пропусков
PoolQC                      1453          99.520548
MiscFeature                 1406          96.301370
Alley                       1369          93.767123
Fence                       1179          80.753425
MasVnrType                   872          59.726027
FireplaceQu                  690          47.260274
LotFrontage                  259          17.739726
GarageQual                    81           5.547945
GarageFinish                  81           5.547945
GarageType                    81           5.547945
GarageYrBlt                   81           5.547945
GarageCond                    81           5.547945
BsmtFinType2                  38           2.602740
BsmtExposure                  38           2.602740
BsmtCond                      37           2.534247
BsmtQual                      37           2.534247
BsmtFinType1                  37           2.534247
MasVnrArea                 

In [632]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [633]:
drop_columns = ["PoolQC","MiscFeature","Alley", "Fence", "MasVnrType","FireplaceQu", "LotFrontage","GarageYrBlt","TotRmsAbvGrd","GarageCars","TotalBsmtSF"]
df_copy = df.drop(drop_columns, axis=1)
print(df_copy.iloc[: 20])

    Id  MSSubClass MSZoning  LotArea Street LotShape LandContour Utilities  \
0    1          60       RL     8450   Pave      Reg         Lvl    AllPub   
1    2          20       RL     9600   Pave      Reg         Lvl    AllPub   
2    3          60       RL    11250   Pave      IR1         Lvl    AllPub   
3    4          70       RL     9550   Pave      IR1         Lvl    AllPub   
4    5          60       RL    14260   Pave      IR1         Lvl    AllPub   
5    6          50       RL    14115   Pave      IR1         Lvl    AllPub   
6    7          20       RL    10084   Pave      Reg         Lvl    AllPub   
7    8          60       RL    10382   Pave      IR1         Lvl    AllPub   
8    9          50       RM     6120   Pave      Reg         Lvl    AllPub   
9   10         190       RL     7420   Pave      Reg         Lvl    AllPub   
10  11          20       RL    11200   Pave      Reg         Lvl    AllPub   
11  12          60       RL    11924   Pave      IR1         Lvl

In [634]:
percent = (df_copy.isnull().sum() / len(df_copy)) * 100
df_new = pd.DataFrame({
    'Пропущено_значений': df_copy.isnull().sum(),
    'Процент_пропусков': percent
})

 #процента пропусков
df_new= df_new.sort_values(by='Процент_пропусков', ascending=False, axis=0)

print("Таблица пропусков (отсортированная):")
print(df_new.iloc[: 20])

Таблица пропусков (отсортированная):
              Пропущено_значений  Процент_пропусков
GarageCond                    81           5.547945
GarageQual                    81           5.547945
GarageFinish                  81           5.547945
GarageType                    81           5.547945
BsmtExposure                  38           2.602740
BsmtFinType2                  38           2.602740
BsmtFinType1                  37           2.534247
BsmtCond                      37           2.534247
BsmtQual                      37           2.534247
MasVnrArea                     8           0.547945
Electrical                     1           0.068493
Id                             0           0.000000
MSSubClass                     0           0.000000
MSZoning                       0           0.000000
LotShape                       0           0.000000
Street                         0           0.000000
LotArea                        0           0.000000
LotConfig                  

In [635]:
X,y = df.drop("SalePrice", axis=1),df["SalePrice"]

In [636]:
# plt.figure(figsize=(20, 18))
# sns.heatmap(number_features.corr(), annot=True, fmt='.2f', cmap='coolwarm')
# plt.title('Корреляция числовых признаков')
# plt.show()

In [637]:
# y =df_copy["SalePrice"]
# plt.figure()
# plt.scatter(range(len(y)), y)
# plt.xlabel("Индекс наблюдения")
# plt.ylabel("Значение целевой переменной")
# plt.title("Scatter plot целевой переменной HasDetections")
# plt.grid(True)
# plt.show()

In [638]:
df_copy.groupby("BsmtCond").size()

BsmtCond
Fa      45
Gd      65
Po       2
TA    1311
dtype: int64

In [639]:
df_copy.groupby("BsmtFinType2").size()

BsmtFinType2
ALQ      19
BLQ      33
GLQ      14
LwQ      46
Rec      54
Unf    1256
dtype: int64

In [640]:
df_copy.groupby("GarageQual").size()

GarageQual
Ex       3
Fa      48
Gd      14
Po       3
TA    1311
dtype: int64

In [641]:
df_copy.groupby("BsmtQual").size()

BsmtQual
Ex    121
Fa     35
Gd    618
TA    649
dtype: int64

In [642]:
df_copy.groupby("MasVnrArea").size()

MasVnrArea
0.0       861
1.0         2
11.0        1
14.0        1
16.0        7
         ... 
1115.0      1
1129.0      1
1170.0      1
1378.0      1
1600.0      1
Length: 327, dtype: int64

In [643]:
df_copy.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [644]:
# df_copy["MSZoning"].value_counts()

In [645]:
X_train, X_valid, y_train, y_valid =  train_test_split(X,y, test_size=0.2, random_state=42)

In [646]:
X_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [647]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [648]:
X.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [649]:
# number_features= X.select_dtypes(include=["int64","float64"]).columns.tolist()
# category_featerus = X.select_dtypes(include="object").columns.tolist()

In [650]:
numerical_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [651]:
drop_columns = ["PoolQC","MiscFeature","Alley", "Fence", "MasVnrType","FireplaceQu", "LotFrontage","GarageYrBlt","TotRmsAbvGrd","GarageCars","TotalBsmtSF", 'Exterior2nd', 'LotConfig', 'GarageCond', 'Fence', 'Id', 'GarageArea', 'BsmtCond', 'BldgType', 'KitchenQual', 'ExterQual', 'Alley', 'SaleCondition', 'PavedDrive', 'GarageQual', 'Condition2', 'YearRemodAdd', 'LotFrontage', 'YearBuilt', 'RoofMatl', 'BsmtExposure', 'BsmtUnfSF', 'OverallQual', 'MiscFeature', 'LotShape', 'Condition1', 'Neighborhood', 'BsmtFinType2', 'MasVnrType', 'Heating', 'Foundation', 'LandSlope', 'GarageCars', 'GarageType', 'BsmtFinSF2', 'YrSold', 'MasVnrArea', 'MSZoning', 'MSSubClass', 'Street', 'HeatingQC', 'TotalBsmtSF']
imputer = ColumnTransformer(
    transformers= [
        ("numerical_features", SimpleImputer(strategy ="median"), numerical_features),
        ("categorical_features", SimpleImputer(strategy="most_frequent"), categorical_features),
        ("drop", "drop", drop_columns)],
    verbose_feature_names_out=False,
    remainder= 'passthrough'
    
)

In [652]:
# feature_columns = df_copy.columns.drop('SalePrice', errors='ignore')  # или просто data.columns, если все — признаки

# # Словарь: ключ — количество уникальных значений, значение — список имён столбцов
# columns_by_nunique = {}

# for col in feature_columns:
#     n_unique = df_copy[col].nunique(dropna=True)  # dropna=True — по умолчанию, но явно указываем
#     if n_unique not in columns_by_nunique:
#         columns_by_nunique[n_unique] = []
#     columns_by_nunique[n_unique].append(col)

# # Теперь можно создать отдельные списки, например:
# list_3 = columns_by_nunique.get(3, [])
# list_4 = columns_by_nunique.get(4, [])
# list_5 = columns_by_nunique.get(5, [])

In [653]:
encoder_and_scaler =ColumnTransformer(
transformers=[
        ('encoder', ce.CatBoostEncoder(),categorical_features),
        ('scaler', StandardScaler(), numerical_features)
    ],
    verbose_feature_names_out=False,
    remainder= 'passthrough'
)

In [654]:
preprocessor = Pipeline([
    ("imputer", imputer), 
    ("encoder_and_scaler", encoder_and_scaler)
]
)

In [655]:
preprocessor

0,1,2
,steps,"[('imputer', ...), ('encoder_and_scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical_features', ...), ('categorical_features', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('encoder', ...), ('scaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,random_state,
,sigma,
,a,1

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [656]:
X_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [657]:
print(categorical_features)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [658]:
X_train_prep = preprocessor.fit_transform(X_train, y_train)
X_valid_prep = preprocessor.fit_transform(X_valid, y_valid)

In [659]:
# models = {
#     "linereg": LinearRegression(),
#     "Ridge": Ridge(),
#     "Lasso": Lasso(),
#     "RandomForest": RandomForestRegressor(),
#     "GradientBoosting": GradientBoostingRegressor(),
#     "KNN": KNeighborsRegressor(),
#     "XGB": XGBRegressor(),
#     "LGBM": LGBMRegressor()}
# result = {}

# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_valid)
    
#     r2 = r2_score(y_valid, y_pred)
#     rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
#     mae = mean_absolute_error(y_valid, y_pred)
#     rmlse = np.sqrt(mean_squared_log_error(y_valid, y_pred))
    
    
#     result[name]={
#         "model": model,
#         "r2": r2,
#         "rmse": rmse,
#         "mae": mae,
#         "rmlse": rmlse        
#     }
#     print(f" {name} / {r2:.4f} / {rmse:.4f} / {mae:.4f} / {rmlse:.4f} ")
    
# result_df = pd.DataFrame([{
#     "model": name,
#     "r2": metrics["r2"],
#     "rmse": metrics["rmse"],
#     "mae": metrics["mae"],
#     "rmlse": metrics["rmlse"],
# }
# for name, metrics in result.items()
                              
# ]) 

# print(result_df.to_string(index=False))

In [660]:
# models = {
#     "linereg": LinearRegression(),
#     "Ridge": Ridge(),
#     "Lasso": Lasso(),
#     "RandomForest": RandomForestRegressor(),
#     "GradientBoosting": GradientBoostingRegressor(
#         n_estimators = 253, max_depth = 5, 
#         learning_rate = 0.024850394952947306, 
#         subsample = 0.8433434535775448
#     ),
#     "KNN": KNeighborsRegressor(),
#     "XGB": XGBRegressor(
#         n_estimators = 506, max_depth = 5, 
#         learning_rate = 0.024850394952947306, 
#         subsample = 0.8433434535775448
#     ),
#     "LGBM": LGBMRegressor(
#         n_estimators= 537, max_depth = 4, learning_rate= 0.037258937732582084, num_leaves= 47, subsample= 0.7786260132455978, colsample_bytree= 0.7707995808218282, min_child_samples = 14, reg_alpha = 0.0020899880683935804, reg_lambda = 0.00041689593212457807
#     )
    
# }

# result = {}

# for name, model in models.items():
#     # Обучаем на ПОДГОТОВЛЕННЫХ данных
#     model.fit(X_train_prep, y_train)
#     y_pred = model.predict(X_valid_prep)
    
#     r2 = r2_score(y_valid, y_pred)
#     rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
#     mae = mean_absolute_error(y_valid, y_pred)
    
#     # RMSLE только если все значения > 0
#     try:
#         rmlse = np.sqrt(mean_squared_log_error(y_valid, y_pred))
#     except ValueError:
#         rmlse = float('nan')
    
#     result[name] = {
#         "r2": r2,
#         "rmse": rmse,
#         "mae": mae,
#         "rmlse": rmlse
#     }
#     print(f"{name} / R2: {r2:.4f} / RMSE: {rmse:.4f} / MAE: {mae:.4f} / RMSLE: {rmlse:.4f}")

In [661]:
X_train = preprocessor.transform(X_train)
X_valid = preprocessor.transform(X_valid)

In [662]:

model =GradientBoostingRegressor(
        n_estimators = 253, max_depth = 5, 
        learning_rate = 0.024850394952947306, 
        subsample = 0.8433434535775448
    )
model.fit(X_train, y_train)


0,1,2
,loss,'squared_error'
,learning_rate,0.024850394952947306
,n_estimators,253
,subsample,0.8433434535775448
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,5
,min_impurity_decrease,0.0


In [663]:
y_pred = model.predict(X_valid)
r2_scores = r2_score(y_valid, y_pred)
r2_scores

0.9201617752519062

In [664]:
X_test =  pd.read_csv("D:/phase_1_repeat/project_2_house/test.csv")
X_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [665]:
X_test = preprocessor.transform(X_test)
y_pred_kaggle = model.predict(X_test)
y_pred_kaggle

array([124218.02396843, 155901.24821982, 188357.67858683, ...,
       163715.74620301, 118637.78049064, 200994.57493644], shape=(1459,))

In [666]:
X_test.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,178839.811644,179598.18492,180360.853002,158766.105643,178339.710718,178839.811644,171531.553297,178674.239315,140301.062481,147089.425728,...,1.207184,0.496225,-0.700238,-0.392536,-0.100385,1.924845,-0.058621,-0.156264,-0.065411,1.626532
1,193646.990402,179598.18492,180360.853002,219785.294633,178339.710718,178839.811644,206857.251434,178674.239315,140301.062481,183924.975921,...,-0.660366,2.884599,0.020289,-0.392536,-0.100385,-0.247557,-0.058621,157.66445,-0.065411,1.626532
2,193646.990402,179598.18492,180360.853002,219785.294633,178339.710718,178839.811644,171531.553297,178674.239315,175946.920776,183924.975921,...,0.099164,1.175921,-0.01974,-0.392536,-0.100385,-0.247557,-0.058621,-0.156264,-1.126522,1.626532
3,193646.990402,179598.18492,180360.853002,219785.294633,178339.710718,178839.811644,171531.553297,178674.239315,175946.920776,183924.975921,...,0.04555,2.573072,0.020289,-0.392536,-0.100385,-0.247557,-0.058621,-0.156264,-0.065411,1.626532
4,193646.990402,179598.18492,180360.853002,219785.294633,198589.738228,178839.811644,171531.553297,178674.239315,314866.801941,183924.975921,...,0.206392,-0.825404,0.940963,-0.392536,-0.100385,2.359325,-0.058621,-0.156264,-1.83393,1.626532


In [667]:
X_test_2 = pd.read_csv("D:/phase_1_repeat/project_2_house/test.csv")

In [668]:
submission = pd.DataFrame({
    'Id': X_test_2['Id'],           # ← обязательно из test.csv!
    'SalePrice': y_pred_kaggle
})
submission.set_index('Id', inplace=True)
# submission = submission.reset_index() 
submission.to_csv('submission.csv', index=True)

In [669]:
submission

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,124218.023968
1462,155901.248220
1463,188357.678587
1464,189612.558488
1465,195767.676486
...,...
2915,82163.049393
2916,85292.031279
2917,163715.746203
2918,118637.780491


In [670]:
# check = pd.Series(dict(zip(X.columns, model.feature_importances_))).sort_values(key=lambda x: abs(x), ascending=False)
# check.iloc[:]

In [671]:
# check.iloc[:25]

In [672]:
# check.iloc[25:50]

In [673]:
# check.iloc[50:75]

In [674]:
# feature_names = check.iloc[39:].index.tolist()
# print(feature_names)

In [675]:

# model_cat_boost = CatBoostRegressor(
#     # n_estimators= 81, 
#     # max_depth=6, 
#     # learning_rate =  0.09929108743509919,
#     # l2_leaf_reg =  83.86834789304754,
#     # bagging_temperature= 0.9905878484464076,
#     # random_strength = 0.3972742649694947
# )

# model_cat_boost.fit(X_train_optune, y_train)


In [676]:
# X_valid_optune = preprocessor.transform(X_valid)

In [677]:
# y_pred_cat = model_cat_boost.predict(X_valid_optune)

In [678]:
# r2_catboost = r2_score(y_valid, y_pred_cat)
# r2_catboost

In [679]:
# X_train_optune = preprocessor.transform(X_train)
# X_train_optune

In [680]:
# def objective(trial):
#     # Гиперпараметры для регрессии (улучшенные диапазоны)
#     n_estimators = trial.suggest_int('n_estimators', 100, 1000)
#     max_depth = trial.suggest_int('max_depth', 3, 12)          # LightGBM эффективен с меньшей глубиной
#     learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2, log=True)
#     num_leaves = trial.suggest_int('num_leaves', 20, 256)      # max: ~2^max_depth
#     subsample = trial.suggest_float('subsample', 0.6, 1.0)
#     colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)
#     min_child_samples = trial.suggest_int('min_child_samples', 10, 100)
#     reg_alpha = trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True)      # L1
#     reg_lambda = trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)    # L2

#     model = LGBMRegressor(
#         n_estimators=n_estimators,
#         max_depth=max_depth,
#         learning_rate=learning_rate,
#         num_leaves=num_leaves,
#         subsample=subsample,
#         colsample_bytree=colsample_bytree,
#         min_child_samples=min_child_samples,
#         reg_alpha=reg_alpha,
#         reg_lambda=reg_lambda,
#         random_state=42,
#         verbose=-1
#     )
    
#     # ←←← ИСПРАВЛЕНО: метрика для РЕГРЕССИИ
#     scores = cross_val_score(
#         model, 
#         X_train_optune, 
#         y_train, 
#         cv=5, 
#         scoring='neg_root_mean_squared_error'  # или 'r2'
#     )
    
#     # cross_val_score возвращает отрицательные значения для ошибок
#     return scores.mean()  # это уже -RMSE, но Optuna максимизирует → так правильно

# # Направление: 'maximize', потому что neg_root_mean_squared_error — чем ближе к нулю (меньше ошибка), тем выше значение
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# best_params = study.best_params
# best_score = study.best_value

# print("Лучшие параметры:", best_params)
# print("Лучший CV результат (negative RMSE):", best_score)
# print("Соответствует RMSE ≈", -best_score)