In [1]:
import pandas as pd
import numpy as np

import sklearn
sklearn.set_config(transform_output="pandas")
import warnings
warnings.filterwarnings('ignore')

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import make_column_selector
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler

# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import VotingRegressor, StackingRegressor

from catboost import CatBoostRegressor
from category_encoders import CatBoostEncoder

# Metrics
from sklearn.metrics import root_mean_squared_error


# tunning hyperparamters model
import optuna
import shap

In [2]:
import scipy.stats as stats

path = 'data/train.csv'
train = pd.read_csv(path)
z = np.abs(stats.zscore(train.iloc[:,-1]))
f = (abs(z) >= 2)

train = train.loc[~f]

In [3]:
num_features = train.select_dtypes(exclude='object')
cat_features = train.select_dtypes(include='object')

### Custom

In [4]:
class MasVnrType_modify(BaseEstimator, TransformerMixin): 
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X_copy = X.copy()

        ff = ((X_copy['MasVnrArea'] == 0)  & (X_copy['MasVnrType'] != 'NO'))
        X_copy.loc[ff,'MasVnrArea'] = X_copy['MasVnrArea'].loc[X_copy['MasVnrType'] == 'Stone'].median()
        
        f = ((X_copy['MasVnrArea'] != 0)  & (X_copy['MasVnrType'] == 'NO'))
        X_copy.loc[f,'MasVnrType'] = 'BrkFace'
        
        return X_copy

### Pipeline

In [5]:
cools_to_fill = ['Alley', 'PoolQC','Fence','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2', 'FireplaceQu','GarageType', 'GarageFinish','GarageQual','GarageCond','MasVnrType']

drop_col = ['MiscFeature','Id']

fill_na = ColumnTransformer(
     transformers = [
        ('fillna' , SimpleImputer(strategy='constant', fill_value='NO') , cools_to_fill),
        ('garage_YER' , SimpleImputer(strategy='constant', fill_value= -1) , ['GarageYrBlt']),
        ('mod_electrical', SimpleImputer(strategy='most_frequent'), ['Electrical']),
        ('Median_value', SimpleImputer(strategy='median'), ['LotFrontage']),
        ('MasVn_zero', SimpleImputer(strategy='constant', fill_value = 0 ), ['MasVnrArea']),
        ('drop_col' , 'drop', drop_col)
     ],
     verbose_feature_names_out = False,
     remainder = 'passthrough')

fill_na_2 = ColumnTransformer(
     transformers = [
        ('fill_na_add' , SimpleImputer(strategy='most_frequent'),make_column_selector()),
     ],
     verbose_feature_names_out = False,
     remainder = 'passthrough')

fill_na_p = Pipeline(
    [
        ('fill_na', fill_na),
        ('fill_na 2',fill_na_2),
    ]
)



from custom import AdjustedScaler

MasVnrType = ColumnTransformer(
    transformers = [
        ('MasVnrType_modify', MasVnrType_modify(), ['MasVnrArea','MasVnrType']),
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

target = cat_features.columns.to_list()


my_encoder = ColumnTransformer(
    [
        ('ordinal_encoding', CatBoostEncoder(), target)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

categorical_pipeline = Pipeline(steps=[
    ('cat_encode', CatBoostEncoder()),
    # ('impute', SimpleImputer(strategy='most_frequent')),
])

numeric_pipeline = Pipeline(steps=[
    ('impute', IterativeImputer()),
    ('scale_norm', StandardScaler()),
    # ('scale', AdjustedScaler())
])


pca = PCA(n_components=25, random_state=42)

add_pca = Pipeline(
    [
        ('stage 1', pca),
    ]
)


omega = ['WoodDeckSF',
 'GarageYrBlt',
 'BsmtFullBath',
 'Condition1',
 'GarageType',
 'OpenPorchSF',
 'MSZoning',
 'LotFrontage',
 'BsmtExposure',
 'TotRmsAbvGrd',
 'BsmtFinType1',
 'ScreenPorch',
 'HalfBath',
 'Functional',
 'BsmtQual',
 'Heating',
 'HeatingQC',
 'MasVnrArea',
 'BsmtUnfSF',
 'BsmtCond',
 'CentralAir',
 'Exterior1st',
 'HouseStyle',
 'MasVnrType',
 'GarageQual',
 'Fence',
 'LotConfig',
 'Exterior2nd',
 'SaleType',
 'YrSold',
 'MoSold',
 'LotShape',
 'RoofStyle',
 'LandSlope',
 'Electrical',
 'KitchenAbvGr',
 'LandContour',
 'BldgType',
 'BsmtFinSF2',
 'ExterCond',
 'PavedDrive',
 'PoolQC',
 'GarageCond',
 'Alley',
 'BedroomAbvGr',
 'EnclosedPorch',
 'Foundation',
 'FullBath',
 'BsmtFinType2',
 'Condition2',
 'RoofMatl',
 'Street',
 'MSSubClass',
 'Utilities',
 'LowQualFinSF',
 'PoolArea',
 '3SsnPorch',
 'BsmtHalfBath',
 'MiscVal']

drop_col = ColumnTransformer(
    [
        ('drop_col_2', 'drop', omega)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

preprocessor = Pipeline(
    [
        ('stage 1', fill_na),
        ('Custom',MasVnrType),
        ('Encoding',categorical_pipeline),
        ('Scaling', numeric_pipeline),
        # ('PCA',add_pca),
        ('drop PCA', drop_col)
    ]
)


In [6]:
X, y = train.drop('SalePrice', axis=1), train['SalePrice']
path = 'data/test.csv'
test =  pd.read_csv(path)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

### SHAP

In [80]:
shap.initjs()

model = CatBoostRegressor(iterations=3500, learning_rate=0.04,depth=4, random_seed=42)
model.fit(X, y, verbose=False, plot=False)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

vals = np.abs(shap_values).mean(0)
vals
feature_importance = pd.DataFrame(list(zip(X.columns,vals)),columns=['col_name','power'])
feature_importance.sort_values(by=['power'],ascending=False,inplace=True)


f = feature_importance.power > feature_importance.power.mean()

Unnamed: 0,col_name,power
53,GrLivArea,13449.413295
32,OverallQual,9528.569113
46,TotalBsmtSF,6443.24434
27,Neighborhood,6402.330869
43,BsmtFinSF1,4299.047331
33,OverallCond,3920.441281
35,YearRemodAdd,3113.19416
50,1stFlrSF,2842.727043
20,LotArea,2813.638881
65,GarageArea,2654.203577


In [83]:
feature_importance.loc[~f]['col_name'].to_list()

['WoodDeckSF',
 'GarageYrBlt',
 'BsmtFullBath',
 'Condition1',
 'GarageType',
 'OpenPorchSF',
 'MSZoning',
 'LotFrontage',
 'BsmtExposure',
 'TotRmsAbvGrd',
 'BsmtFinType1',
 'ScreenPorch',
 'HalfBath',
 'Functional',
 'BsmtQual',
 'Heating',
 'HeatingQC',
 'MasVnrArea',
 'BsmtUnfSF',
 'BsmtCond',
 'CentralAir',
 'Exterior1st',
 'HouseStyle',
 'MasVnrType',
 'GarageQual',
 'Fence',
 'LotConfig',
 'Exterior2nd',
 'SaleType',
 'YrSold',
 'MoSold',
 'LotShape',
 'RoofStyle',
 'LandSlope',
 'Electrical',
 'KitchenAbvGr',
 'LandContour',
 'BldgType',
 'BsmtFinSF2',
 'ExterCond',
 'PavedDrive',
 'PoolQC',
 'GarageCond',
 'Alley',
 'BedroomAbvGr',
 'EnclosedPorch',
 'Foundation',
 'FullBath',
 'BsmtFinType2',
 'Condition2',
 'RoofMatl',
 'Street',
 'MSSubClass',
 'Utilities',
 'LowQualFinSF',
 'PoolArea',
 '3SsnPorch',
 'BsmtHalfBath',
 'MiscVal']

### OPTUNE

In [7]:
ml_pipeline = Pipeline(
    [
        # ('preprocessor', preprocessor),
        ('model', RandomForestRegressor())
    ]
)


def objective(trial):

    model_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 800, 5),
        'max_depth' : trial.suggest_int('max_depth', 2, 6, 1),
        'random_state': trial.suggest_int('random_state', 42, 42, 1)
    }
    
    
    ml_pipeline.named_steps['model'].set_params(**model_params)

    cv = KFold(n_splits=6, random_state=42, shuffle=True)
    
    return cross_val_score(ml_pipeline, X, y, cv=cv).mean()

study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=40)

best_params = study.best_params
best_value = study.best_value


print(f"Лучшие параметры: {best_params}")
print(f"Лучшее значение: {best_value}")

[I 2024-02-09 10:40:26,623] A new study created in memory with name: no-name-3cc03aa4-9b57-41b2-8e22-8e324756a55b
[I 2024-02-09 10:40:28,781] Trial 0 finished with value: 0.8257719828111215 and parameters: {'n_estimators': 115, 'max_depth': 5, 'random_state': 42}. Best is trial 0 with value: 0.8257719828111215.
[I 2024-02-09 10:40:37,527] Trial 1 finished with value: 0.7988933405877434 and parameters: {'n_estimators': 560, 'max_depth': 4, 'random_state': 42}. Best is trial 0 with value: 0.8257719828111215.
[I 2024-02-09 10:40:42,777] Trial 2 finished with value: 0.7986054730236357 and parameters: {'n_estimators': 335, 'max_depth': 4, 'random_state': 42}. Best is trial 0 with value: 0.8257719828111215.
[I 2024-02-09 10:40:52,480] Trial 3 finished with value: 0.8268190733956485 and parameters: {'n_estimators': 520, 'max_depth': 5, 'random_state': 42}. Best is trial 3 with value: 0.8268190733956485.
[I 2024-02-09 10:40:54,376] Trial 4 finished with value: 0.7519839770219586 and parameters

Лучшие параметры: {'n_estimators': 775, 'max_depth': 6, 'random_state': 42}
Лучшее значение: 0.8460397729830813


### Training

In [7]:
from xgboost import XGBRegressor

xgb = XGBRegressor(max_death=5, learning_rate = 0.02, n_estimators = 1000)
rfg = RandomForestRegressor(n_estimators=775,max_depth=6,random_state=42)
cbr = CatBoostRegressor(iterations=3000, learning_rate=0.04366383,depth=7, random_seed=42)


STC = StackingRegressor(
    estimators=[
        ('RFR',rfg),
        ('xgboost',xgb),
        ('catboost',cbr)
    ])

ml_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', STC)
    ]
)

In [8]:
ml_pipeline.fit_transform(X_train,y_train)

0:	learn: 56545.8750030	total: 50.1ms	remaining: 2m 30s
1:	learn: 54872.4224671	total: 50.9ms	remaining: 1m 16s
2:	learn: 53308.1341828	total: 51.5ms	remaining: 51.5s
3:	learn: 51822.2085894	total: 52.1ms	remaining: 39s
4:	learn: 50346.7453007	total: 52.7ms	remaining: 31.6s
5:	learn: 48929.3601725	total: 53.3ms	remaining: 26.6s
6:	learn: 47583.2398098	total: 53.8ms	remaining: 23s
7:	learn: 46348.9752558	total: 54.4ms	remaining: 20.3s
8:	learn: 45160.1041653	total: 55ms	remaining: 18.3s
9:	learn: 43993.5062474	total: 55.6ms	remaining: 16.6s
10:	learn: 42821.5169592	total: 56.2ms	remaining: 15.3s
11:	learn: 41736.1007663	total: 56.8ms	remaining: 14.1s
12:	learn: 40750.4391586	total: 57.3ms	remaining: 13.2s
13:	learn: 39759.0459650	total: 57.9ms	remaining: 12.3s
14:	learn: 38855.7895080	total: 58.5ms	remaining: 11.6s
15:	learn: 37921.7214299	total: 59ms	remaining: 11s
16:	learn: 37036.3171305	total: 59.6ms	remaining: 10.5s
17:	learn: 36215.0513963	total: 60.2ms	remaining: 9.96s
18:	learn:

Unnamed: 0,stackingregressor_RFR,stackingregressor_xgboost,stackingregressor_catboost
160,164709.053653,165749.406250,162838.418943
933,207090.609761,189682.093750,190119.361766
1170,149967.518845,165115.468750,170580.741019
356,161854.887092,170157.328125,172656.495965
1195,171000.225068,173357.140625,174870.834682
...,...,...,...
1146,170222.327767,179881.937500,179924.594160
1184,191419.448041,186584.781250,186756.680128
1352,119409.093034,134676.468750,134817.835984
907,237286.273794,249999.218750,249955.100655


In [10]:
root_mean_squared_error(ml_pipeline.predict(X_valid),y_valid)

17760.555680432

### CROSS VALIDATION

In [None]:
models = {'catboost':cbr, 'RFG':rfg, 'XGBoost':xgb, 'STC':STC}

cv = KFold(n_splits=5, random_state=42, shuffle=True)
d = []
for name, model in models.items():
    ml_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        (name, model)
    ]
)
    ml_pipeline.fit(X_train, y_train)

    train = root_mean_squared_error(y_train, ml_pipeline.predict(X_train))
    valid = root_mean_squared_error(y_valid, ml_pipeline.predict(X_valid))
    

    
    cross_validation_result = cross_val_score(
        ml_pipeline,
        X,
        y,
        cv = cv,)
    
    x = [name, train, valid, cross_validation_result.mean()]
    d.append(x)

final = pd.DataFrame(d).rename(columns={0:'Model',1:'train score',2:'valid score',3:'cross validation'}).sort_values('cross validation', ascending = False)

0:	learn: 56545.8750030	total: 30.7ms	remaining: 1m 32s
1:	learn: 54872.4224671	total: 35ms	remaining: 52.4s
2:	learn: 53308.1341828	total: 38.9ms	remaining: 38.9s
3:	learn: 51822.2085894	total: 43.6ms	remaining: 32.7s
4:	learn: 50346.7453007	total: 44.3ms	remaining: 26.5s
5:	learn: 48929.3601725	total: 44.8ms	remaining: 22.4s
6:	learn: 47583.2398098	total: 45.4ms	remaining: 19.4s
7:	learn: 46348.9752558	total: 46ms	remaining: 17.2s
8:	learn: 45160.1041653	total: 46.5ms	remaining: 15.5s
9:	learn: 43993.5062474	total: 47.3ms	remaining: 14.1s
10:	learn: 42821.5169592	total: 47.9ms	remaining: 13s
11:	learn: 41736.1007663	total: 48.5ms	remaining: 12.1s
12:	learn: 40750.4391586	total: 49.3ms	remaining: 11.3s
13:	learn: 39759.0459650	total: 49.8ms	remaining: 10.6s
14:	learn: 38855.7895080	total: 50.4ms	remaining: 10s
15:	learn: 37921.7214299	total: 51ms	remaining: 9.51s
16:	learn: 37036.3171305	total: 51.6ms	remaining: 9.05s
17:	learn: 36215.0513963	total: 52.2ms	remaining: 8.65s
18:	learn: 

In [None]:
final

### LOG END

In [11]:
y_log = np.log(y)
ml_pipeline.fit(X, y_log)

0:	learn: 0.3444042	total: 1.59ms	remaining: 4.77s
1:	learn: 0.3343657	total: 2.45ms	remaining: 3.67s
2:	learn: 0.3252943	total: 3.12ms	remaining: 3.12s
3:	learn: 0.3159482	total: 3.72ms	remaining: 2.79s
4:	learn: 0.3076534	total: 4.33ms	remaining: 2.59s
5:	learn: 0.2994897	total: 4.91ms	remaining: 2.45s
6:	learn: 0.2918900	total: 5.57ms	remaining: 2.38s
7:	learn: 0.2848072	total: 6.26ms	remaining: 2.34s
8:	learn: 0.2773723	total: 6.94ms	remaining: 2.31s
9:	learn: 0.2702842	total: 7.57ms	remaining: 2.26s
10:	learn: 0.2634034	total: 8.19ms	remaining: 2.23s
11:	learn: 0.2566790	total: 8.8ms	remaining: 2.19s
12:	learn: 0.2503881	total: 9.4ms	remaining: 2.16s
13:	learn: 0.2448362	total: 9.99ms	remaining: 2.13s
14:	learn: 0.2391824	total: 10.6ms	remaining: 2.1s
15:	learn: 0.2336368	total: 11.2ms	remaining: 2.08s
16:	learn: 0.2286275	total: 11.8ms	remaining: 2.07s
17:	learn: 0.2240468	total: 12.4ms	remaining: 2.06s
18:	learn: 0.2191586	total: 13ms	remaining: 2.04s
19:	learn: 0.2148651	total:

In [None]:
y_pred = np.exp(ml_pipeline.predict(test))

In [None]:
id = np.arange(1461, 2920)
submission = pd.DataFrame(y_pred_final).rename(columns={0:'SalePrice'}).set_index(id)
submission.to_csv('submission_2.csv')