In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
sklearn.set_config(transform_output="pandas")

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder, TargetEncoder
from sklearn.model_selection import GridSearchCV, KFold


In [114]:
df = pd.read_csv('train.csv')

In [115]:
test = pd.read_csv('test.csv')

In [116]:
test['SalePrice'] = np.nan

In [117]:
df = pd.concat([df, test], axis=0, ignore_index=True)

In [119]:
pd.set_option('display.max_columns', None)

In [120]:
# shape and data types of the data
print(df.shape)
print(df.dtypes)

(2919, 81)
Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice        float64
Length: 81, dtype: object


In [121]:
# отбор числовых колонок
df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values
print(numeric_cols)

['Id' 'MSSubClass' 'LotFrontage' 'LotArea' 'OverallQual' 'OverallCond'
 'YearBuilt' 'YearRemodAdd' 'MasVnrArea' 'BsmtFinSF1' 'BsmtFinSF2'
 'BsmtUnfSF' 'TotalBsmtSF' '1stFlrSF' '2ndFlrSF' 'LowQualFinSF'
 'GrLivArea' 'BsmtFullBath' 'BsmtHalfBath' 'FullBath' 'HalfBath'
 'BedroomAbvGr' 'KitchenAbvGr' 'TotRmsAbvGrd' 'Fireplaces' 'GarageYrBlt'
 'GarageCars' 'GarageArea' 'WoodDeckSF' 'OpenPorchSF' 'EnclosedPorch'
 '3SsnPorch' 'ScreenPorch' 'PoolArea' 'MiscVal' 'MoSold' 'YrSold'
 'SalePrice']


In [122]:
# отбор нечисловых колонок
df_non_numeric = df.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values
print(non_numeric_cols)

['MSZoning' 'Street' 'Alley' 'LotShape' 'LandContour' 'Utilities'
 'LotConfig' 'LandSlope' 'Neighborhood' 'Condition1' 'Condition2'
 'BldgType' 'HouseStyle' 'RoofStyle' 'RoofMatl' 'Exterior1st'
 'Exterior2nd' 'MasVnrType' 'ExterQual' 'ExterCond' 'Foundation'
 'BsmtQual' 'BsmtCond' 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2'
 'Heating' 'HeatingQC' 'CentralAir' 'Electrical' 'KitchenQual'
 'Functional' 'FireplaceQu' 'GarageType' 'GarageFinish' 'GarageQual'
 'GarageCond' 'PavedDrive' 'PoolQC' 'Fence' 'MiscFeature' 'SaleType'
 'SaleCondition']


In [123]:
# процент пропусков
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))

Id - 0%
MSSubClass - 0%
MSZoning - 0%
LotFrontage - 17%
LotArea - 0%
Street - 0%
Alley - 93%
LotShape - 0%
LandContour - 0%
Utilities - 0%
LotConfig - 0%
LandSlope - 0%
Neighborhood - 0%
Condition1 - 0%
Condition2 - 0%
BldgType - 0%
HouseStyle - 0%
OverallQual - 0%
OverallCond - 0%
YearBuilt - 0%
YearRemodAdd - 0%
RoofStyle - 0%
RoofMatl - 0%
Exterior1st - 0%
Exterior2nd - 0%
MasVnrType - 61%
MasVnrArea - 1%
ExterQual - 0%
ExterCond - 0%
Foundation - 0%
BsmtQual - 3%
BsmtCond - 3%
BsmtExposure - 3%
BsmtFinType1 - 3%
BsmtFinSF1 - 0%
BsmtFinType2 - 3%
BsmtFinSF2 - 0%
BsmtUnfSF - 0%
TotalBsmtSF - 0%
Heating - 0%
HeatingQC - 0%
CentralAir - 0%
Electrical - 0%
1stFlrSF - 0%
2ndFlrSF - 0%
LowQualFinSF - 0%
GrLivArea - 0%
BsmtFullBath - 0%
BsmtHalfBath - 0%
FullBath - 0%
HalfBath - 0%
BedroomAbvGr - 0%
KitchenAbvGr - 0%
KitchenQual - 0%
TotRmsAbvGrd - 0%
Functional - 0%
Fireplaces - 0%
FireplaceQu - 49%
GarageType - 5%
GarageYrBlt - 5%
GarageFinish - 5%
GarageCars - 0%
GarageArea - 0%
GarageQ

In [124]:
# делим df на фичи и таргет
X, y = df.drop('SalePrice', axis=1), df['SalePrice']

In [132]:
# определяем фичи для типов кодирования
ordinal_features = ['LotShape', 'LandSlope', 'ExterQual', 'ExterCond', 
'Foundation', 'BsmtQual', 'BsmtCond',  'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'Electrical', 
'KitchenQual','Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC']


# Настройка one-hot кодирования
onehot_features = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour', 'Utilities', 
'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Heating', 'CentralAir', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'KitchenAbvGr', 'Fireplaces', 'GarageType', 'GarageCars', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


# standard_scaler_features = 

# порядок категорий для ordinal_features
ordinal_categories = [
    ['Reg', 'IR1', 'IR2', 'IR3'],  # Порядок для LotShape
    ['Gtl', 'Mod', 'Sev'], # Порядок для LandSlope
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # ExterQual 
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'],  # ExterCond
    ['PConc', 'CBlock', 'BrkTil', 'Stone', 'Wood', 'Slab'],  # Foundation
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],  # BsmtQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'],  # BsmtCond
    ['Gd', 'Av', 'Mn', 'No', 'NA'],  # BsmtExposure
    ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'], # BsmtFinType1
    ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'], # BsmtFinType2
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'], # HeatingQC
    ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix'], # Electrical 
    ['Ex', 'Gd', 'TA', 'Fa', 'Po'], #  KitchenQual
    ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'], # Functional
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], # FireplaceQu
    ['Fin', 'RFn', 'Unf', 'NA'], # GarageFinish
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], # GarageQual
    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], # GarageCond 
    ['Ex', 'Gd', 'TA', 'Fa', 'NA'], #  PoolQC
]

***Scaler and encoder***

In [133]:
scaler_and_encoder = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_features),
        ('onehot', OneHotEncoder(sparse_output=False), onehot_features)
    ],
    remainder='passthrough'
)


In [134]:
scaler_and_encoder.fit_transform(df)

ValueError: Found unknown categories [nan] in column 5 during fit