In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import shap
import joblib
import optuna
# Import the required libraries
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, accuracy_score, mean_squared_log_error, make_scorer
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn import set_config
# Have scikit-learn transformers return pandas DataFrames; the pipeline steps
# further down select columns by name.
set_config(transform_output="pandas")
# Opt in to the pandas "future" behaviour flag for silent downcasting.
# NOTE(review): presumably set to silence the downcasting FutureWarning raised
# by the `replace` calls in the custom transformers — confirm.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Read the raw training and test tables from the local `test_train` folder.
train = pd.read_csv("test_train/train.csv")
test = pd.read_csv("test_train/test.csv")

# Keep the row identifiers and the untransformed target as separate Series.
data_id, y = train['Id'], train['SalePrice']



In [3]:
# Features are every column except the target; the target is modelled on the
# log1p scale.
X = train.drop(columns='SalePrice')
y_log = np.log1p(y)

# Numeric features: int/float columns, skipping `Id` and any column whose
# values are all 0 or 1.
numeric_columns = [
    name
    for name in X.select_dtypes(include=['int64', 'float64']).columns
    if not (name == 'Id' or X[name].isin([0, 1]).all())
]
# Categorical features: every object-dtype column.
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y_log, random_state=42, test_size=0.2
)

In [4]:
class MyTransformer(BaseEstimator, TransformerMixin):
    """Replace known quality/grade codes with integer scores.

    The transformer is stateless: `fit` learns nothing. `transform` finds the
    columns that contain at least one known code and replaces those codes via
    a fixed lookup table. The same table is applied to every matching column;
    values that are not in the table are left as they are.
    """

    def fit(self, X, y=None):
        """Do nothing and return self (required by the transformer API)."""
        return self

    def transform(self, X):
        """Return a copy of `X` with known codes replaced by integers.

        Parameters
        ----------
        X : pandas.DataFrame
            Input features. It is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of `X` in which every value found in the lookup table has
            been replaced, in the columns that contain such values.
        """
        replace_map = {
            # NOTE(review): pandas' default CSV parsing turns the literal
            # 'NA' into a missing value, so the 'NA' key presumably never
            # matches for data loaded with `pd.read_csv` — confirm.
            'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0,
            'Grvl': 2, 'Pave': 1,
            'Y': 2, 'P': 1, 'N': 0,
            'Av': 3, 'Mn': 2, 'No': 1,
            'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1,
            'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1,
            'Fin': 3, 'RFn': 2
        }
        # The codes to look for are exactly the keys of the lookup table, so
        # the two can never drift apart.
        values_to_check = list(replace_map)

        # Work on a copy: a transformer must not modify the caller's frame.
        X = X.copy()

        columns_with_values = [
            col for col in X.columns
            if X[col].isin(values_to_check).any()
        ]
        # Skip the assignment entirely when no column contains a known code.
        if columns_with_values:
            X[columns_with_values] = X[columns_with_values].replace(replace_map)

        return X

class MissingValueReplacer(BaseEstimator, TransformerMixin):
    """Swap the 'Missing' placeholder for 0 in every column."""

    def fit(self, X, y=None):
        """Nothing to learn; present for the scikit-learn transformer API."""
        return self

    def transform(self, X):
        """Return a new frame in which every 'Missing' value is replaced by 0."""
        return X.replace('Missing', 0)

In [5]:
# Columns that are replaced by a target-based encoding.
target_encoding_columns = [
    'MSSubClass', 'MSZoning', 'Neighborhood', 'Condition1',
    'Condition2', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'Heating', 'Electrical',
    'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition',
]

# Ordinal features and the category order handed to the encoder for each one.
# Every list starts with the 'Missing' placeholder.
ordinal_encoding_columns = {
    'LotShape': ['Missing', 'IR3', 'IR2', 'IR1', 'Reg'],
    'LandContour': ['Missing', 'Low', 'HLS', 'Bnk', 'Lvl'],
    'Utilities': ['Missing', 'ELO', 'NoSeWa', 'NoSewr', 'AllPub'],
    'LotConfig': ['Missing', 'FR3', 'FR2', 'CulDSac', 'Corner', 'Inside'],
    'LandSlope': ['Missing', 'Sev', 'Mod', 'Gtl'],
    'BldgType': ['Missing', 'TwnhsI', 'TwnhsE', 'Twnhs', 'Duplex', '2fmCon', '1Fam'],
    'HouseStyle': ['Missing', '1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'],
    'RoofMatl': ['Missing', 'WdShngl', 'WdShake', 'Tar&Grv', 'Roll', 'Metal', 'Membran', 'CompShg', 'ClyTile'],
    'Functional': ['Missing', 'Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
}

# Columns removed before any other processing.
drop_features = ['Id']

# Parallel lists (same order as the dict above): column names and their
# category orders.
ordinal_columns = [*ordinal_encoding_columns]
ordinal_categories = list(ordinal_encoding_columns.values())
# CatBoost regressor used as the final step of `full_pipeline`.
# NOTE(review): the hyper-parameter values look like the result of an earlier
# tuning run — confirm where they came from.
best_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.0011953388786005764,
    depth=10,
    eval_metric='RMSE',
    random_seed=42,
    verbose=100,  # print training progress every 100 iterations
)

In [6]:
# Standardise the numeric features; every other column passes through as is.
my_scaler = ColumnTransformer(
    [('scaler', StandardScaler(), numeric_columns)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Remove the columns listed in `drop_features`; keep everything else.
my_dropper = ColumnTransformer(
    [('drop_features', 'drop', drop_features)],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Encode the categorical features: fixed-order ordinal codes for the ordinal
# columns, target encoding for the listed target-encoding columns. All other
# columns pass through.
preprocessor = ColumnTransformer(
    [
        (
            'ordinal_encoding_columns',
            OrdinalEncoder(categories=ordinal_categories),
            ordinal_columns,
        ),
        (
            'target_encoding_columns',
            TargetEncoder(cols=target_encoding_columns),
            target_encoding_columns,
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Feature preparation; the step order matters:
#   1. drop the `drop_features` columns
#   2. map quality/grade codes to integers
#   3. fill gaps: 0 in numeric columns, 'Missing' in categorical columns
#   4. ordinal-encode / target-encode the configured columns
#   5. turn any remaining 'Missing' placeholder into 0
#   6. standardise the numeric columns
preprocessing_pipeline = Pipeline([
    ('my_dropper', my_dropper),
    ('my_transformer', MyTransformer()),
    ('imputer', ColumnTransformer(
        [
            ('num_imputer', SimpleImputer(fill_value=0, strategy='constant'), numeric_columns),
            ('cat_imputer', SimpleImputer(fill_value='Missing', strategy='constant'), categorical_columns),
        ],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )),
    ('preprocessor', preprocessor),
    ('missing_replacer', MissingValueReplacer()),
    ('my_scaler', my_scaler),
])

# Preprocessing followed by the regressor, so raw rows can be passed straight
# to fit/predict.
full_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', best_model),
])

# Fit the preprocessing on the training split only; the target is passed along
# because the pipeline contains a target encoder.
X_train_processed = preprocessing_pipeline.fit_transform(X_train, y_train)
# Apply the already-fitted preprocessing to the validation split.
X_valid_processed = preprocessing_pipeline.transform(X_valid)
# Processed version of ALL labelled rows (training and validation rows alike).
traintrain = preprocessing_pipeline.transform(X)

In [7]:
# Preprocess the test set with the fitted pipeline and save it to CSV.
azbukatest = preprocessing_pipeline.transform(test)
azbukatest.to_csv('azbukatest.csv', index=False)

# Append the raw (not log-transformed) SalePrice as the last column of the
# processed features of all labelled rows, then save the result to CSV.
X_train_final = pd.concat([traintrain, y], axis=1)
X_train_final.to_csv('processed_train_dataset.csv', index=False)


In [8]:
# Lift the row limit so the whole correlation table is displayed.
pd.set_option('display.max_rows', None)
# Correlation of every processed feature with SalePrice, strongest first by
# absolute value. The last entry is dropped: SalePrice is the final column, so
# that entry is its correlation with itself.
(
    X_train_final.corr()['SalePrice']
    .iloc[:-1]
    .sort_values(key=abs, ascending=False)
)

OverallQual      0.790982
GrLivArea        0.708624
Neighborhood     0.699746
ExterQual        0.682639
KitchenQual      0.659600
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
BsmtQual         0.585207
FullBath         0.560664
GarageFinish     0.549247
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
FireplaceQu      0.520438
YearRemodAdd     0.507101
Foundation       0.504458
GarageType       0.489187
MSSubClass       0.481434
MasVnrArea       0.472614
Fireplaces       0.466929
HeatingQC        0.427649
MasVnrType       0.426141
BsmtFinSF1       0.386420
Exterior1st      0.376955
Exterior2nd      0.376616
BsmtExposure     0.374696
SaleType         0.364324
SaleCondition    0.362844
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
MSZoning         0.312965
BsmtFinType1     0.304908
HalfBath         0.284108
GarageQual       0.273839
LotShape        -0.267759
LotArea          0.263843
GarageCond  

In [9]:
# Restore the default row limit that was lifted for the correlation table.
pd.reset_option('display.max_rows')

In [10]:
# Save the fitted preprocessing pipeline to disk.
# NOTE(review): the pipeline contains the notebook-defined MyTransformer and
# MissingValueReplacer; presumably those classes must be defined or importable
# wherever the file is loaded — verify before reusing it outside this notebook.
joblib.dump(preprocessing_pipeline, 'preprocessing_pipeline.pkl')

['preprocessing_pipeline.pkl']

In [11]:
# Train preprocessing + CatBoost on the training split (log1p target).
# The first step is the same `preprocessing_pipeline` object fitted earlier,
# so it is fitted again here.
full_pipeline.fit(X_train, y_train)

0:	learn: 0.3901423	total: 75.5ms	remaining: 1m 15s
100:	learn: 0.3621385	total: 752ms	remaining: 6.69s
200:	learn: 0.3368210	total: 1.33s	remaining: 5.3s
300:	learn: 0.3141280	total: 1.91s	remaining: 4.45s
400:	learn: 0.2936144	total: 2.48s	remaining: 3.7s
500:	learn: 0.2749352	total: 3.05s	remaining: 3.04s
600:	learn: 0.2579936	total: 3.61s	remaining: 2.4s
700:	learn: 0.2427810	total: 4.17s	remaining: 1.78s
800:	learn: 0.2290759	total: 4.75s	remaining: 1.18s
900:	learn: 0.2167098	total: 5.33s	remaining: 585ms
999:	learn: 0.2055333	total: 5.88s	remaining: 0us


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
# NOTE: despite the "valid" in the names, these are predictions for `test`.
y_valid_pred = full_pipeline.predict(test)
# The model was trained on log1p(SalePrice); expm1 maps predictions back to prices.
y_valid_pred_exp = np.expm1(y_valid_pred)
# Validation target back on the price scale (not used in the cells shown here).
y_valid_exp = np.expm1(y_valid)

In [13]:
# Build the submission table: one predicted price per test-set Id.
submission = pd.DataFrame(
    data={'Id': test['Id'], 'SalePrice': y_valid_pred_exp}
)
# Write it without the index column, then display it.
submission.to_csv(path_or_buf='submission.csv', index=False)
submission


Unnamed: 0,Id,SalePrice
0,1461,138471.202990
1,1462,154868.227936
2,1463,177525.053882
3,1464,180575.891388
4,1465,185455.575327
...,...,...
1454,2915,117410.827602
1455,2916,123678.101269
1456,2917,159106.711860
1457,2918,135080.308280


In [14]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions."""
    squared_log_error = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(squared_log_error)


# Scorer wrapper for model selection. `greater_is_better=False` marks the
# metric as a loss, so scores produced through this scorer are negated.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

def objective(trial):
    """Optuna objective: cross-validated RMSE of one candidate model.

    Samples a model family and its hyper-parameters from `trial`, plugs the
    model into a fresh clone of `full_pipeline`, and scores it with 5-fold
    cross-validation on the training split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the model type and its hyper-parameters.

    Returns
    -------
    float
        Mean RMSE over the folds as a positive number (lower is better), so
        it is suitable for a study created with ``direction='minimize'``.

    Raises
    ------
    ValueError
        If the sampled model type has no implementation below.
    """
    # Function-scope imports keep this cell self-contained.
    from sklearn.base import clone
    from sklearn.linear_model import Lasso

    # Choose the model family to optimise. Only families implemented below
    # are offered: 'XGBoost' used to be listed but had no branch (and xgboost
    # is not imported in this notebook), so sampling it crashed because
    # `model` was never assigned.
    model_type = trial.suggest_categorical(
        'model_type', ['Ridge', 'Lasso', 'CatBoost', 'RandomForest']
    )

    if model_type == 'Ridge':
        alpha = trial.suggest_float('ridge_alpha', 1e-3, 10.0, log=True)
        model = Ridge(alpha=alpha)

    elif model_type == 'Lasso':
        alpha = trial.suggest_float('lasso_alpha', 1e-4, 1.0, log=True)
        model = Lasso(alpha=alpha, max_iter=10000)

    elif model_type == 'CatBoost':
        depth = trial.suggest_int('catboost_depth', 3, 5)  # shallow trees
        learning_rate = trial.suggest_float('catboost_lr', 1e-3, 0.3, log=True)
        iterations = trial.suggest_int('catboost_iterations', 500, 2000)  # more boosting rounds
        # Seeded like the other stochastic models in this notebook.
        model = CatBoostRegressor(depth=depth, learning_rate=learning_rate,
                                  iterations=iterations, random_seed=42, verbose=0)

    elif model_type == 'RandomForest':
        n_estimators = trial.suggest_int('rf_n_estimators', 500, 2000)  # many trees
        max_depth = trial.suggest_int('rf_max_depth', 3, 5)  # shallow trees
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

    else:
        raise ValueError("Unsupported model_type: {!r}".format(model_type))

    # Put the candidate into a clone of the pipeline, so the fitted global
    # `full_pipeline` is not altered by the search.
    candidate = clone(full_pipeline).set_params(model=model)

    # `y_train` is already log1p(SalePrice), so plain RMSE on this scale is
    # the RMSLE of the prices; a log-based metric here would take the
    # logarithm a second time.
    scores = cross_val_score(candidate, X_train, y_train, cv=5,
                             scoring='neg_root_mean_squared_error')

    # scikit-learn reports this loss negated; flip the sign so that a smaller
    # returned value means a better model.
    return -scores.mean()


# Create the study and run the optimisation for 100 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Report the best trial: its objective value and the sampled parameters.
print("Best trial:")
trial = study.best_trial

print(" Value: {}".format(trial.value))
print(" Params: ")
for param_name, param_value in trial.params.items():
    print("    {}: {}".format(param_name, param_value))