In [122]:
# base imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
import category_encoders as ce

# notebook settings
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [150]:
df_train = pd.read_csv('../train.csv')
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


In [33]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [149]:
df_test = pd.read_csv('../test.csv')
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,2Story,4,7,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,546.0,546.0,GasA,Gd,Y,SBrkr,546,546,0,1092,0.0,0.0,1,1,3,1,TA,5,Typ,0,,,,,0.0,0.0,,,Y,0,0,0,0,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,5,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,252.0,Unf,0.0,294.0,546.0,GasA,TA,Y,SBrkr,546,546,0,1092,0.0,0.0,1,1,3,1,TA,6,Typ,0,,CarPort,1970.0,Unf,1.0,286.0,TA,TA,Y,0,24,0,0,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1Story,5,7,1960,1996,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,ALQ,1224.0,Unf,0.0,0.0,1224.0,GasA,Ex,Y,SBrkr,1224,0,0,1224,1.0,0.0,1,0,4,1,TA,7,Typ,1,TA,Detchd,1960.0,Unf,2.0,576.0,TA,TA,Y,474,0,0,0,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,SFoyer,5,5,1992,1992,Gable,CompShg,HdBoard,Wd Shng,,0.0,TA,TA,PConc,Gd,TA,Av,GLQ,337.0,Unf,0.0,575.0,912.0,GasA,TA,Y,SBrkr,970,0,0,970,0.0,1.0,1,0,3,1,TA,6,Typ,0,,,,,0.0,0.0,,,Y,80,32,0,0,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [139]:
df = pd.concat([df_train, df_test], axis=0)
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,Exterior2nd,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Id,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,RoofStyle,RoofMatl,MasVnrType,Electrical,GarageFinish,PavedDrive,PoolQC,MiscFeature
0,60,RL,65.0,8450,Pave,Absence,1Fam,2Story,7,5,2003,2003,VinylSd,VinylSd,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,Absence,Attchd,2003.0,2.0,548.0,TA,TA,0,61,0,0,0,0,Absence,0,2,2008,WD,Normal,208500.0,,,,,,,,,,,,,,,,,
1,20,RL,80.0,9600,Pave,Absence,1Fam,1Story,6,8,1976,1976,MetalSd,MetalSd,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,2.0,460.0,TA,TA,298,0,0,0,0,0,Absence,0,5,2007,WD,Normal,181500.0,,,,,,,,,,,,,,,,,
2,60,RL,68.0,11250,Pave,Absence,1Fam,2Story,7,5,2001,2002,VinylSd,VinylSd,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,2.0,608.0,TA,TA,0,42,0,0,0,0,Absence,0,9,2008,WD,Normal,223500.0,,,,,,,,,,,,,,,,,
3,70,RL,60.0,9550,Pave,Absence,1Fam,2Story,7,5,1915,1970,Wd Sdng,Wd Shng,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,3.0,642.0,TA,TA,0,35,272,0,0,0,Absence,0,2,2006,WD,Abnorml,140000.0,,,,,,,,,,,,,,,,,
4,60,RL,84.0,14260,Pave,Absence,1Fam,2Story,8,5,2000,2000,VinylSd,VinylSd,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,3.0,836.0,TA,TA,192,84,0,0,0,0,Absence,0,12,2008,WD,Normal,250000.0,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Twnhs,2Story,4,7,1970,1970,CemntBd,CmentBd,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,546.0,546.0,GasA,Gd,Y,546,546,0,1092,0.0,0.0,1,1,3,1,TA,5,Typ,0,,,,0.0,0.0,,,0,0,0,0,0,0,,0,6,2006,WD,Normal,,2915.0,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Gable,CompShg,,SBrkr,,Y,,
1455,160,RM,21.0,1894,Pave,,TwnhsE,2Story,4,5,1970,1970,CemntBd,CmentBd,0.0,TA,TA,CBlock,TA,TA,No,Rec,252.0,Unf,0.0,294.0,546.0,GasA,TA,Y,546,546,0,1092,0.0,0.0,1,1,3,1,TA,6,Typ,0,,CarPort,1970.0,1.0,286.0,TA,TA,0,24,0,0,0,0,,0,4,2006,WD,Abnorml,,2916.0,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Gable,CompShg,,SBrkr,Unf,Y,,
1456,20,RL,160.0,20000,Pave,,1Fam,1Story,5,7,1960,1996,VinylSd,VinylSd,0.0,TA,TA,CBlock,TA,TA,No,ALQ,1224.0,Unf,0.0,0.0,1224.0,GasA,Ex,Y,1224,0,0,1224,1.0,0.0,1,0,4,1,TA,7,Typ,1,TA,Detchd,1960.0,2.0,576.0,TA,TA,474,0,0,0,0,0,,0,9,2006,WD,Abnorml,,2917.0,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,Gable,CompShg,,SBrkr,Unf,Y,,
1457,85,RL,62.0,10441,Pave,,1Fam,SFoyer,5,5,1992,1992,HdBoard,Wd Shng,0.0,TA,TA,PConc,Gd,TA,Av,GLQ,337.0,Unf,0.0,575.0,912.0,GasA,TA,Y,970,0,0,970,0.0,1.0,1,0,3,1,TA,6,Typ,0,,,,0.0,0.0,,,80,32,0,0,0,0,MnPrv,700,7,2006,WD,Normal,,2918.0,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,Gable,CompShg,,SBrkr,,Y,,Shed


In [41]:
df.to_csv('../concatenated_train_test.csv', index=False)

### 1. Знакомимся с датасетом и принимаем решение по поводу тектовых данных

In [24]:
# создаем список названий колонок, где тип данных object
list_of_obj_values = df.select_dtypes(include='object').columns.tolist()
len(list_of_obj_values) # 43 колонки из 81

43

In [None]:
# создадим словарь, где будет количество уникальных значений в каждом столбце
# в том числе НаН будет
unique_values_count = {col: df[col].nunique(dropna=False) for col in list_of_obj_values}
unique_values_count

In [None]:
for col in list_of_obj_values:
    total = len(df[col])  # Общее количество строк
    nan_count = df[col].isna().sum()  # Количество NaN
    nan_percentage = (nan_count / total) * 100  # Процентное количество NaN
    print(f"Фича '{col}':")
    print(f"NaN абсолютное кол-во: {nan_count}")
    print(f"NaN в процентах: {nan_percentage:.2f}%")
    print(f"Значения, которые принимает фича:")
    print(df[col].value_counts(dropna=False))
    print("\n" + "="*50 + "\n")  # Разделитель для удобства чтения

In [114]:
# список колонок, которые решили удалить
columns_to_drop = [
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'RoofStyle',
    'RoofMatl',
    'MasVnrType',
    'Electrical',
    'GarageFinish',
    'PavedDrive',
    'PoolQC',
    'MiscFeature',
    'Id'
]

In [39]:
# удаляем колонки сразу из обоих датасетов
df_working = df.drop(columns=columns_to_drop)
df_working

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,Exterior2nd,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,1Fam,2Story,7,5,2003,2003,VinylSd,VinylSd,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,2.0,548.0,TA,TA,0,61,0,0,0,0,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,1Fam,1Story,6,8,1976,1976,MetalSd,MetalSd,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,2.0,460.0,TA,TA,298,0,0,0,0,0,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,1Fam,2Story,7,5,2001,2002,VinylSd,VinylSd,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,2.0,608.0,TA,TA,0,42,0,0,0,0,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,1Fam,2Story,7,5,1915,1970,Wd Sdng,Wd Shng,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,3.0,642.0,TA,TA,0,35,272,0,0,0,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,1Fam,2Story,8,5,2000,2000,VinylSd,VinylSd,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,3.0,836.0,TA,TA,192,84,0,0,0,0,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Twnhs,2Story,4,7,1970,1970,CemntBd,CmentBd,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,546.0,546.0,GasA,Gd,Y,546,546,0,1092,0.0,0.0,1,1,3,1,TA,5,Typ,0,,,,0.0,0.0,,,0,0,0,0,0,0,,0,6,2006,WD,Normal,
1455,2916,160,RM,21.0,1894,Pave,,TwnhsE,2Story,4,5,1970,1970,CemntBd,CmentBd,0.0,TA,TA,CBlock,TA,TA,No,Rec,252.0,Unf,0.0,294.0,546.0,GasA,TA,Y,546,546,0,1092,0.0,0.0,1,1,3,1,TA,6,Typ,0,,CarPort,1970.0,1.0,286.0,TA,TA,0,24,0,0,0,0,,0,4,2006,WD,Abnorml,
1456,2917,20,RL,160.0,20000,Pave,,1Fam,1Story,5,7,1960,1996,VinylSd,VinylSd,0.0,TA,TA,CBlock,TA,TA,No,ALQ,1224.0,Unf,0.0,0.0,1224.0,GasA,Ex,Y,1224,0,0,1224,1.0,0.0,1,0,4,1,TA,7,Typ,1,TA,Detchd,1960.0,2.0,576.0,TA,TA,474,0,0,0,0,0,,0,9,2006,WD,Abnorml,
1457,2918,85,RL,62.0,10441,Pave,,1Fam,SFoyer,5,5,1992,1992,HdBoard,Wd Shng,0.0,TA,TA,PConc,Gd,TA,Av,GLQ,337.0,Unf,0.0,575.0,912.0,GasA,TA,Y,970,0,0,970,0.0,1.0,1,0,3,1,TA,6,Typ,0,,,,0.0,0.0,,,80,32,0,0,0,0,MnPrv,700,7,2006,WD,Normal,


In [37]:
# колонки, в которых НаНы не НаНы, а просто - нет!
not_nan_list = [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature'
]

In [53]:
# получим еще раз колонки, где тип данных object
list_of_obj_values = df_working.select_dtypes(include='object').columns.tolist()
len(list_of_obj_values) # 27 из 65

27

In [49]:
# получим список колонок, где надо заменить НаНы на какое-то осмысленное отрицательное значение
nan_replace_list = [x for x in not_nan_list if x in list_of_obj_values]

# значение, которым мы заменяем
replace_value = 'Absence'

# преобразование по замене значений на осмысленное отрицание, но не NaN
df_working[nan_replace_list] = df_working[nan_replace_list].fillna(replace_value)

In [55]:
# unique_values_count = {col: df_working[col].nunique(dropna=False) for col in list_of_obj_values}

In [None]:
# посмотрели, сколько НаНов в текстовых столбцах
pd.DataFrame(data={'NaN_count': df_working[list_of_obj_values].isna().sum(), 'data_type':df_working[list_of_obj_values].dtypes})

In [None]:
# посмотрели, сколько НаНов во всех столбцах
pd.DataFrame(data={'NaN_count': df_working.isna().sum(), 'data_type':df_working.dtypes})

In [80]:
df_working.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,Exterior2nd,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Absence,1Fam,2Story,7,5,2003,2003,VinylSd,VinylSd,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,Absence,Attchd,2003.0,2.0,548.0,TA,TA,0,61,0,0,0,0,Absence,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,Absence,1Fam,1Story,6,8,1976,1976,MetalSd,MetalSd,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,2.0,460.0,TA,TA,298,0,0,0,0,0,Absence,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,Absence,1Fam,2Story,7,5,2001,2002,VinylSd,VinylSd,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,2.0,608.0,TA,TA,0,42,0,0,0,0,Absence,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,Absence,1Fam,2Story,7,5,1915,1970,Wd Sdng,Wd Shng,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,3.0,642.0,TA,TA,0,35,272,0,0,0,Absence,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,Absence,1Fam,2Story,8,5,2000,2000,VinylSd,VinylSd,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,3.0,836.0,TA,TA,192,84,0,0,0,0,Absence,0,12,2008,WD,Normal,250000.0


In [81]:
# список для заполнения нанов для object колонок - самым частым
object_list = [
    'MSZoning',
    'Exterior1st',
    'Exterior2nd',
    'KitchenQual',
    'Functional',
    'SaleType'
]

# список для заполнения нанов для числовых колонок
# numbers_list = [
#     # 'LotFrontage',# 486
#     # 'MasVnrArea',# 23 вопросик по этой фиче
#     #'BsmtFinSF1',# 1
#     #'BsmtFinSF2',# 1
#     #'BsmtUnfSF',# 1
#     #'TotalBsmtSF',# 1
#     #'BsmtFullBath',# 2
#     #'BsmtHalfBath',# 2
#     # 'GarageYrBlt',# 159
#     #'GarageCars',# 1
#     #'GarageArea'# 1
# ]

numbers_list_to_mean = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageCars',
    'GarageArea',
    'MasVnrArea'
]

numbers_list_to_median = [
    'LotFrontage',
    'GarageYrBlt'
]

# колонки, которые мы решаем, что нам не нужны и мы можем удалить
drop_features = ['Id']

In [84]:
# разделим суммарный датасет на X и y
X_total, y_total = df_working.drop('SalePrice', axis=1), df_working['SalePrice']

In [99]:
X_total.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,Exterior2nd,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,Absence,1Fam,2Story,7,5,2003,2003,VinylSd,VinylSd,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,Absence,Attchd,2003.0,2.0,548.0,TA,TA,0,61,0,0,0,0,Absence,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,Absence,1Fam,1Story,6,8,1976,1976,MetalSd,MetalSd,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,2.0,460.0,TA,TA,298,0,0,0,0,0,Absence,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,Absence,1Fam,2Story,7,5,2001,2002,VinylSd,VinylSd,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,2.0,608.0,TA,TA,0,42,0,0,0,0,Absence,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,Absence,1Fam,2Story,7,5,1915,1970,Wd Sdng,Wd Shng,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,3.0,642.0,TA,TA,0,35,272,0,0,0,Absence,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,Absence,1Fam,2Story,8,5,2000,2000,VinylSd,VinylSd,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,3.0,836.0,TA,TA,192,84,0,0,0,0,Absence,0,12,2008,WD,Normal


In [85]:
# создаем имютер для заполнения НаНов

my_imputer = ColumnTransformer(
    transformers = [
        ('drop_features', 'drop', drop_features),
        ('num_imputer1', SimpleImputer(strategy='mean'), numbers_list_to_mean),
        ('num_imputer2', SimpleImputer(strategy='median'), numbers_list_to_median),
        ('cat_imputer', SimpleImputer(strategy='most_frequent'), object_list)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough',
    force_int_remainder_cols=False
)    

In [89]:
filled_data_total = my_imputer.fit_transform(X=X_total)

In [100]:
filled_data_total.head()

Unnamed: 0,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MasVnrArea,LotFrontage,GarageYrBlt,MSZoning,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,MSSubClass,LotArea,Street,Alley,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleCondition
0,706.0,0.0,150.0,856.0,1.0,0.0,2.0,548.0,196.0,65.0,2003.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,8450,Pave,Absence,1Fam,2Story,7,5,2003,2003,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,856,854,0,1710,2,1,3,1,8,0,Absence,Attchd,TA,TA,0,61,0,0,0,0,Absence,0,2,2008,Normal
1,978.0,0.0,284.0,1262.0,0.0,1.0,2.0,460.0,0.0,80.0,1976.0,RL,MetalSd,MetalSd,TA,Typ,WD,20,9600,Pave,Absence,1Fam,1Story,6,8,1976,1976,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,1262,0,0,1262,2,0,3,1,6,1,TA,Attchd,TA,TA,298,0,0,0,0,0,Absence,0,5,2007,Normal
2,486.0,0.0,434.0,920.0,1.0,0.0,2.0,608.0,162.0,68.0,2001.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,11250,Pave,Absence,1Fam,2Story,7,5,2001,2002,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,920,866,0,1786,2,1,3,1,6,1,TA,Attchd,TA,TA,0,42,0,0,0,0,Absence,0,9,2008,Normal
3,216.0,0.0,540.0,756.0,1.0,0.0,3.0,642.0,0.0,60.0,1998.0,RL,Wd Sdng,Wd Shng,Gd,Typ,WD,70,9550,Pave,Absence,1Fam,2Story,7,5,1915,1970,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,961,756,0,1717,1,0,3,1,7,1,Gd,Detchd,TA,TA,0,35,272,0,0,0,Absence,0,2,2006,Abnorml
4,655.0,0.0,490.0,1145.0,1.0,0.0,3.0,836.0,350.0,84.0,2000.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,14260,Pave,Absence,1Fam,2Story,8,5,2000,2000,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,1145,1053,0,2198,2,1,4,1,9,1,TA,Attchd,TA,TA,192,84,0,0,0,0,Absence,0,12,2008,Normal


In [104]:
pd.DataFrame(data={'NaN_count': filled_data_total.isna().sum(), 'data_type':filled_data_total.dtypes})

Unnamed: 0,NaN_count,data_type
BsmtFinSF1,0,float64
BsmtFinSF2,0,float64
BsmtUnfSF,0,float64
TotalBsmtSF,0,float64
BsmtFullBath,0,float64
BsmtHalfBath,0,float64
GarageCars,0,float64
GarageArea,0,float64
MasVnrArea,0,float64
LotFrontage,0,float64


In [94]:
df_working

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,Exterior2nd,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Absence,1Fam,2Story,7,5,2003,2003,VinylSd,VinylSd,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,Absence,Attchd,2003.0,2.0,548.0,TA,TA,0,61,0,0,0,0,Absence,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,Absence,1Fam,1Story,6,8,1976,1976,MetalSd,MetalSd,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,2.0,460.0,TA,TA,298,0,0,0,0,0,Absence,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,Absence,1Fam,2Story,7,5,2001,2002,VinylSd,VinylSd,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,2.0,608.0,TA,TA,0,42,0,0,0,0,Absence,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,Absence,1Fam,2Story,7,5,1915,1970,Wd Sdng,Wd Shng,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,3.0,642.0,TA,TA,0,35,272,0,0,0,Absence,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,Absence,1Fam,2Story,8,5,2000,2000,VinylSd,VinylSd,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,3.0,836.0,TA,TA,192,84,0,0,0,0,Absence,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,Absence,Twnhs,2Story,4,7,1970,1970,CemntBd,CmentBd,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,546.0,546.0,GasA,Gd,Y,546,546,0,1092,0.0,0.0,1,1,3,1,TA,5,Typ,0,Absence,Absence,,0.0,0.0,Absence,Absence,0,0,0,0,0,0,Absence,0,6,2006,WD,Normal,
1455,2916,160,RM,21.0,1894,Pave,Absence,TwnhsE,2Story,4,5,1970,1970,CemntBd,CmentBd,0.0,TA,TA,CBlock,TA,TA,No,Rec,252.0,Unf,0.0,294.0,546.0,GasA,TA,Y,546,546,0,1092,0.0,0.0,1,1,3,1,TA,6,Typ,0,Absence,CarPort,1970.0,1.0,286.0,TA,TA,0,24,0,0,0,0,Absence,0,4,2006,WD,Abnorml,
1456,2917,20,RL,160.0,20000,Pave,Absence,1Fam,1Story,5,7,1960,1996,VinylSd,VinylSd,0.0,TA,TA,CBlock,TA,TA,No,ALQ,1224.0,Unf,0.0,0.0,1224.0,GasA,Ex,Y,1224,0,0,1224,1.0,0.0,1,0,4,1,TA,7,Typ,1,TA,Detchd,1960.0,2.0,576.0,TA,TA,474,0,0,0,0,0,Absence,0,9,2006,WD,Abnorml,
1457,2918,85,RL,62.0,10441,Pave,Absence,1Fam,SFoyer,5,5,1992,1992,HdBoard,Wd Shng,0.0,TA,TA,PConc,Gd,TA,Av,GLQ,337.0,Unf,0.0,575.0,912.0,GasA,TA,Y,970,0,0,970,0.0,1.0,1,0,3,1,TA,6,Typ,0,Absence,Absence,,0.0,0.0,Absence,Absence,80,32,0,0,0,0,MnPrv,700,7,2006,WD,Normal,


In [95]:
filled_data_total

Unnamed: 0,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MasVnrArea,LotFrontage,GarageYrBlt,MSZoning,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,MSSubClass,LotArea,Street,Alley,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleCondition
0,706.0,0.0,150.0,856.0,1.0,0.0,2.0,548.0,196.0,65.0,2003.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,8450,Pave,Absence,1Fam,2Story,7,5,2003,2003,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,856,854,0,1710,2,1,3,1,8,0,Absence,Attchd,TA,TA,0,61,0,0,0,0,Absence,0,2,2008,Normal
1,978.0,0.0,284.0,1262.0,0.0,1.0,2.0,460.0,0.0,80.0,1976.0,RL,MetalSd,MetalSd,TA,Typ,WD,20,9600,Pave,Absence,1Fam,1Story,6,8,1976,1976,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,1262,0,0,1262,2,0,3,1,6,1,TA,Attchd,TA,TA,298,0,0,0,0,0,Absence,0,5,2007,Normal
2,486.0,0.0,434.0,920.0,1.0,0.0,2.0,608.0,162.0,68.0,2001.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,11250,Pave,Absence,1Fam,2Story,7,5,2001,2002,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,920,866,0,1786,2,1,3,1,6,1,TA,Attchd,TA,TA,0,42,0,0,0,0,Absence,0,9,2008,Normal
3,216.0,0.0,540.0,756.0,1.0,0.0,3.0,642.0,0.0,60.0,1998.0,RL,Wd Sdng,Wd Shng,Gd,Typ,WD,70,9550,Pave,Absence,1Fam,2Story,7,5,1915,1970,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,961,756,0,1717,1,0,3,1,7,1,Gd,Detchd,TA,TA,0,35,272,0,0,0,Absence,0,2,2006,Abnorml
4,655.0,0.0,490.0,1145.0,1.0,0.0,3.0,836.0,350.0,84.0,2000.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,14260,Pave,Absence,1Fam,2Story,8,5,2000,2000,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,1145,1053,0,2198,2,1,4,1,9,1,TA,Attchd,TA,TA,192,84,0,0,0,0,Absence,0,12,2008,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,0.0,546.0,546.0,0.0,0.0,0.0,0.0,0.0,21.0,1979.0,RM,CemntBd,CmentBd,TA,Typ,WD,160,1936,Pave,Absence,Twnhs,2Story,4,7,1970,1970,TA,TA,CBlock,TA,TA,No,Unf,Unf,GasA,Gd,Y,546,546,0,1092,1,1,3,1,5,0,Absence,Absence,Absence,Absence,0,0,0,0,0,0,Absence,0,6,2006,Normal
1455,252.0,0.0,294.0,546.0,0.0,0.0,1.0,286.0,0.0,21.0,1970.0,RM,CemntBd,CmentBd,TA,Typ,WD,160,1894,Pave,Absence,TwnhsE,2Story,4,5,1970,1970,TA,TA,CBlock,TA,TA,No,Rec,Unf,GasA,TA,Y,546,546,0,1092,1,1,3,1,6,0,Absence,CarPort,TA,TA,0,24,0,0,0,0,Absence,0,4,2006,Abnorml
1456,1224.0,0.0,0.0,1224.0,1.0,0.0,2.0,576.0,0.0,160.0,1960.0,RL,VinylSd,VinylSd,TA,Typ,WD,20,20000,Pave,Absence,1Fam,1Story,5,7,1960,1996,TA,TA,CBlock,TA,TA,No,ALQ,Unf,GasA,Ex,Y,1224,0,0,1224,1,0,4,1,7,1,TA,Detchd,TA,TA,474,0,0,0,0,0,Absence,0,9,2006,Abnorml
1457,337.0,0.0,575.0,912.0,0.0,1.0,0.0,0.0,0.0,62.0,1979.0,RL,HdBoard,Wd Shng,TA,Typ,WD,85,10441,Pave,Absence,1Fam,SFoyer,5,5,1992,1992,TA,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,TA,Y,970,0,0,970,1,0,3,1,6,0,Absence,Absence,Absence,Absence,80,32,0,0,0,0,MnPrv,700,7,2006,Normal


In [108]:
list_of_obj_values = filled_data_total.select_dtypes(include='object').columns.tolist()
len(list_of_obj_values) # 27 колонок из 63

27

In [None]:
unique_values_count = {col: filled_data_total[col].nunique(dropna=True) for col in list_of_obj_values}
unique_values_count

In [110]:
# списки колонок для применения разных энкодеров
one_hot_list = [
    'Street',
    'Alley',
    'CentralAir'
]

target_list = [
    'MSZoning',
    'Exterior1st',
    'Exterior2nd',
    'KitchenQual',
    'Functional',
    'SaleType',
    'BldgType',
    'HouseStyle',
    'ExterQual',
    'ExterCond',
    'Foundation',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Heating',
    'HeatingQC',
    'FireplaceQu',
    'GarageType',
    'GarageQual',
    'GarageCond',
    'Fence',
    'SaleCondition'
]

In [125]:
filled_data_total

Unnamed: 0,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MasVnrArea,LotFrontage,GarageYrBlt,MSZoning,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,MSSubClass,LotArea,Street,Alley,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleCondition
0,706.0,0.0,150.0,856.0,1.0,0.0,2.0,548.0,196.0,65.0,2003.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,8450,Pave,Absence,1Fam,2Story,7,5,2003,2003,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,856,854,0,1710,2,1,3,1,8,0,Absence,Attchd,TA,TA,0,61,0,0,0,0,Absence,0,2,2008,Normal
1,978.0,0.0,284.0,1262.0,0.0,1.0,2.0,460.0,0.0,80.0,1976.0,RL,MetalSd,MetalSd,TA,Typ,WD,20,9600,Pave,Absence,1Fam,1Story,6,8,1976,1976,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,1262,0,0,1262,2,0,3,1,6,1,TA,Attchd,TA,TA,298,0,0,0,0,0,Absence,0,5,2007,Normal
2,486.0,0.0,434.0,920.0,1.0,0.0,2.0,608.0,162.0,68.0,2001.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,11250,Pave,Absence,1Fam,2Story,7,5,2001,2002,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,920,866,0,1786,2,1,3,1,6,1,TA,Attchd,TA,TA,0,42,0,0,0,0,Absence,0,9,2008,Normal
3,216.0,0.0,540.0,756.0,1.0,0.0,3.0,642.0,0.0,60.0,1998.0,RL,Wd Sdng,Wd Shng,Gd,Typ,WD,70,9550,Pave,Absence,1Fam,2Story,7,5,1915,1970,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,961,756,0,1717,1,0,3,1,7,1,Gd,Detchd,TA,TA,0,35,272,0,0,0,Absence,0,2,2006,Abnorml
4,655.0,0.0,490.0,1145.0,1.0,0.0,3.0,836.0,350.0,84.0,2000.0,RL,VinylSd,VinylSd,Gd,Typ,WD,60,14260,Pave,Absence,1Fam,2Story,8,5,2000,2000,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,1145,1053,0,2198,2,1,4,1,9,1,TA,Attchd,TA,TA,192,84,0,0,0,0,Absence,0,12,2008,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,0.0,546.0,546.0,0.0,0.0,0.0,0.0,0.0,21.0,1979.0,RM,CemntBd,CmentBd,TA,Typ,WD,160,1936,Pave,Absence,Twnhs,2Story,4,7,1970,1970,TA,TA,CBlock,TA,TA,No,Unf,Unf,GasA,Gd,Y,546,546,0,1092,1,1,3,1,5,0,Absence,Absence,Absence,Absence,0,0,0,0,0,0,Absence,0,6,2006,Normal
1455,252.0,0.0,294.0,546.0,0.0,0.0,1.0,286.0,0.0,21.0,1970.0,RM,CemntBd,CmentBd,TA,Typ,WD,160,1894,Pave,Absence,TwnhsE,2Story,4,5,1970,1970,TA,TA,CBlock,TA,TA,No,Rec,Unf,GasA,TA,Y,546,546,0,1092,1,1,3,1,6,0,Absence,CarPort,TA,TA,0,24,0,0,0,0,Absence,0,4,2006,Abnorml
1456,1224.0,0.0,0.0,1224.0,1.0,0.0,2.0,576.0,0.0,160.0,1960.0,RL,VinylSd,VinylSd,TA,Typ,WD,20,20000,Pave,Absence,1Fam,1Story,5,7,1960,1996,TA,TA,CBlock,TA,TA,No,ALQ,Unf,GasA,Ex,Y,1224,0,0,1224,1,0,4,1,7,1,TA,Detchd,TA,TA,474,0,0,0,0,0,Absence,0,9,2006,Abnorml
1457,337.0,0.0,575.0,912.0,0.0,1.0,0.0,0.0,0.0,62.0,1979.0,RL,HdBoard,Wd Shng,TA,Typ,WD,85,10441,Pave,Absence,1Fam,SFoyer,5,5,1992,1992,TA,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,TA,Y,970,0,0,970,1,0,3,1,6,0,Absence,Absence,Absence,Absence,80,32,0,0,0,0,MnPrv,700,7,2006,Normal


In [123]:
# создадим базовые энкодеры
my_encoder = ColumnTransformer(
    transformers=[
        ('one_hot_encoding', OneHotEncoder(sparse_output=False), one_hot_list),
        ('target_encoding', ce.TargetEncoder(), target_list)
    ],
    verbose_feature_names_out=False,
    remainder='passthrough',
    force_int_remainder_cols=False
)

In [126]:
encoded_total = my_encoder.fit_transform(filled_data_total, y_total)

In [135]:
encoded_total

Unnamed: 0,Street_Grvl,Street_Pave,Alley_Absence,Alley_Grvl,Alley_Pave,CentralAir_N,CentralAir_Y,MSZoning,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,BldgType,HouseStyle,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,FireplaceQu,GarageType,GarageQual,GarageCond,Fence,SaleCondition,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MasVnrArea,LotFrontage,GarageYrBlt,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,191004.994787,213732.900971,214432.460317,212116.023891,183429.147059,173401.836622,185763.807377,210051.764045,231633.510246,184034.896256,225230.442040,202688.478964,183632.62090,165652.295908,235413.720096,184694.690287,182021.195378,214914.429150,141331.482609,202892.656322,187489.836003,187885.735294,187596.837998,175202.219533,706.0,0.0,150.0,856.0,1.0,0.0,2.0,548.0,196.0,65.0,2003.0,60,8450,7,5,2003,2003,856,854,0,1710,2,1,3,1,8,0,0,61,0,0,0,0,0,2,2008
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,191004.994787,149422.177338,149803.173014,139962.511565,183429.147059,173401.836622,185763.807377,175985.477961,144341.313466,184034.896256,149805.714511,202688.478964,183632.62090,257688.946518,161573.068222,184694.690287,182021.195378,214914.429150,205723.488818,202892.656322,187489.836003,187885.735294,187596.837998,175202.219533,978.0,0.0,284.0,1262.0,0.0,1.0,2.0,460.0,0.0,80.0,1976.0,20,9600,6,8,1976,1976,1262,0,0,1262,2,0,3,1,6,1,298,0,0,0,0,0,0,5,2007
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,191004.994787,213732.900971,214432.460317,212116.023891,183429.147059,173401.836622,185763.807377,210051.764045,231633.510246,184034.896256,225230.442040,202688.478964,183632.62090,192788.676169,235413.720096,184694.690287,182021.195378,214914.429150,205723.488818,202892.656322,187489.836003,187885.735294,187596.837998,175202.219533,486.0,0.0,434.0,920.0,1.0,0.0,2.0,608.0,162.0,68.0,2001.0,60,11250,7,5,2001,2002,920,866,0,1786,2,1,3,1,6,1,0,42,0,0,0,0,0,9,2008
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,191004.994787,149841.645891,164108.128685,212116.023891,183429.147059,173401.836622,185763.807377,210051.764045,144341.313466,184034.896256,132291.239323,140759.818182,213240.86856,165652.295908,161573.068222,184694.690287,182021.195378,156858.871375,226351.415789,134091.162791,187489.836003,187885.735294,187596.837998,146537.060693,216.0,0.0,540.0,756.0,1.0,0.0,3.0,642.0,0.0,60.0,1998.0,70,9550,7,5,1915,1970,961,756,0,1717,1,0,3,1,7,1,0,35,272,0,0,0,0,2,2006
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,191004.994787,213732.900971,214432.460317,212116.023891,183429.147059,173401.836622,185763.807377,210051.764045,231633.510246,184034.896256,225230.442040,202688.478964,183632.62090,206643.420767,235413.720096,184694.690287,182021.195378,214914.429150,205723.488818,202892.656322,187489.836003,187885.735294,187596.837998,175202.219533,655.0,0.0,490.0,1145.0,1.0,0.0,3.0,836.0,350.0,84.0,2000.0,60,14260,8,5,2000,2000,1145,1053,0,2198,2,1,4,1,9,1,192,84,0,0,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,1.0,1.0,0.0,0.0,0.0,1.0,126316.830413,230862.986650,229209.403952,139962.511565,183429.147059,173401.836622,140013.033016,210051.764045,144341.313466,184034.896256,149805.714511,140759.818182,183632.62090,165652.295908,170670.576744,184694.690287,182021.195378,156858.871375,141331.482609,103490.949750,103490.949750,103490.949750,187596.837998,175202.219533,0.0,0.0,546.0,546.0,0.0,0.0,0.0,0.0,0.0,21.0,1979.0,160,1936,4,7,1970,1970,546,546,0,1092,1,1,3,1,5,0,0,0,0,0,0,0,0,6,2006
1455,0.0,1.0,1.0,0.0,0.0,0.0,1.0,126316.830413,230862.986650,229209.403952,139962.511565,183429.147059,173401.836622,181959.256233,210051.764045,144341.313466,184034.896256,149805.714511,140759.818182,183632.62090,165652.295908,146889.669190,184694.690287,182021.195378,142362.876168,141331.482609,163199.881551,187489.836003,187885.735294,187596.837998,146537.060693,252.0,0.0,294.0,546.0,0.0,0.0,1.0,286.0,0.0,21.0,1970.0,160,1894,4,5,1970,1970,546,546,0,1092,1,1,3,1,6,0,0,24,0,0,0,0,0,4,2006
1456,0.0,1.0,1.0,0.0,0.0,0.0,1.0,191004.994787,213732.900971,214432.460317,139962.511565,183429.147059,173401.836622,185763.807377,175985.477961,144341.313466,184034.896256,149805.714511,140759.818182,183632.62090,165652.295908,161573.068222,184694.690287,182021.195378,214914.429150,205723.488818,134091.162791,187489.836003,187885.735294,187596.837998,146537.060693,1224.0,0.0,0.0,1224.0,1.0,0.0,2.0,576.0,0.0,160.0,1960.0,20,20000,5,7,1960,1996,1224,0,0,1224,1,0,4,1,7,1,474,0,0,0,0,0,0,9,2006
1457,0.0,1.0,1.0,0.0,0.0,0.0,1.0,191004.994787,163077.450481,164108.128685,139962.511565,183429.147059,173401.836622,185763.807377,142156.210608,144341.313466,184034.896256,225230.442040,202688.478964,183632.62090,206643.420767,235413.720096,184694.690287,182021.195378,142362.876168,141331.482609,103490.949750,103490.949750,103490.949750,148751.125281,175202.219533,337.0,0.0,575.0,912.0,0.0,1.0,0.0,0.0,0.0,62.0,1979.0,85,10441,5,5,1992,1992,970,0,0,970,1,0,3,1,6,0,80,32,0,0,0,0,700,7,2006


In [132]:
# столбцы для нормирования
numeric_columns = encoded_total.select_dtypes(include=['number']).columns  # Выбираем только числовые столбцы
columns_to_scale = [col for col in numeric_columns if encoded_total[col].max() > 1]


In [136]:
my_scaler = ColumnTransformer(
    transformers=[
        ('standard_scale', StandardScaler(), columns_to_scale)
    ],
    verbose_feature_names_out=False,
    remainder='passthrough',
    force_int_remainder_cols=False
)

In [137]:
full_prepared_total = my_scaler.fit_transform(encoded_total)

In [138]:
full_prepared_total

Unnamed: 0,MSZoning,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,BldgType,HouseStyle,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,FireplaceQu,GarageType,GarageQual,GarageCond,Fence,SaleCondition,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MasVnrArea,LotFrontage,GarageYrBlt,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Street_Grvl,Street_Pave,Alley_Absence,Alley_Grvl,Alley_Pave,CentralAir_N,CentralAir_Y
0,0.393191,1.097101,1.129034,0.579156,0.24989,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,-0.508241,1.492421,0.293401,0.110108,0.963091,-0.955493,0.540480,0.322070,0.298704,0.450863,-0.214778,0.580907,-0.29313,-0.934863,-0.444328,1.087023,-0.249895,0.306528,0.348900,0.525202,-0.191815,0.998954,0.067331,-0.217879,0.646183,-0.507284,1.046258,0.896833,-0.773861,1.207379,-0.101197,0.413547,0.781366,1.232599,0.169927,-0.207698,0.986849,-0.924311,-0.740760,0.200006,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,0.393191,-1.095788,-1.085498,-0.766638,0.24989,-0.268332,0.339821,-0.237108,-0.694228,0.302765,-0.803606,0.381578,0.164226,2.516713,-0.549169,0.293401,0.110108,0.963091,0.678737,0.540480,0.322070,0.298704,0.450863,-0.214778,1.178112,-0.29313,-0.629896,0.477111,-0.819679,3.822419,0.306528,-0.059792,-0.572250,0.511940,-0.086940,-0.873616,-0.072044,-0.063185,2.188279,0.154764,-0.395604,0.261075,-0.785025,-0.101197,-0.471891,0.781366,-0.756321,0.169927,-0.207698,-0.287758,0.623632,1.614879,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.446925,-0.602962,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,0.393191,1.097101,1.129034,0.579156,0.24989,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,0.383646,1.492421,0.293401,0.110108,0.963091,0.678737,0.540480,0.322070,0.298704,0.450863,-0.214778,0.097873,-0.29313,-0.288516,-0.299076,1.087023,-0.249895,0.306528,0.627553,0.334828,-0.051064,0.918517,0.067331,0.137197,0.646183,-0.507284,0.980221,0.848965,-0.610718,1.235375,-0.101197,0.563755,0.781366,1.232599,0.169927,-0.207698,-0.287758,0.623632,-0.740760,-0.081209,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.393191,-1.081485,-0.595336,0.579156,0.24989,-0.268332,0.339821,1.325041,-0.694228,0.302765,-1.245160,-0.751066,1.963481,-0.508241,-0.549169,0.293401,0.110108,-0.703473,1.202261,-1.211765,0.322070,0.298704,0.450863,-1.222534,-0.494941,-0.29313,-0.047275,-0.671283,1.087023,-0.249895,1.619961,0.785457,-0.572250,-0.426400,0.797862,0.302568,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.506205,0.978742,-0.101197,0.427382,-1.027363,-0.756321,0.169927,-0.207698,0.349546,0.623632,-0.740760,-0.184815,3.874967,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,0.393191,1.097101,1.129034,0.579156,0.24989,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,0.839008,1.492421,0.293401,0.110108,0.963091,0.678737,0.540480,0.322070,0.298704,0.450863,-0.214778,0.468931,-0.29313,-0.161068,0.211573,1.087023,-0.249895,1.619961,1.686437,1.387486,0.699608,0.878299,0.067331,0.518903,1.355551,-0.507284,0.947203,0.753229,-0.037170,1.671651,-0.101197,1.378042,0.781366,1.232599,1.385655,-0.207698,1.624153,0.623632,0.776967,0.540424,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,2.132012,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,-2.225331,1.681209,1.635368,-0.766638,0.24989,-0.268332,-3.024908,1.325041,-0.694228,0.302765,-0.803606,-0.751066,0.164226,-0.508241,-0.297636,0.293401,0.110108,-0.703473,-0.955493,-1.991095,-3.470626,-3.698044,0.450863,-0.214778,-0.969192,-0.29313,-0.033619,-1.147889,-0.819679,-0.249895,-2.320339,-2.196138,-0.572250,-2.256162,0.033715,2.419700,-1.043937,-1.481920,1.289758,-0.043346,-0.682812,-1.564083,0.488807,-0.101197,-0.807883,-1.027363,1.232599,0.169927,-0.207698,-0.925062,-0.924311,-0.740760,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.078505,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1455,-2.225331,1.681209,1.635368,-0.766638,0.24989,-0.268332,0.060016,1.325041,-0.694228,0.302765,-0.803606,-0.751066,0.164226,-0.508241,-0.955144,0.293401,0.110108,-1.119600,-0.955493,-0.470421,0.322070,0.298704,0.450863,-1.222534,-0.415899,-0.29313,-0.607138,-1.147889,-0.819679,-0.249895,-1.006906,-0.867888,-0.572250,-2.256162,-0.328249,2.419700,-1.049263,-1.481920,-0.507284,-0.043346,-0.682812,-1.564083,0.488807,-0.101197,-0.807883,-1.027363,1.232599,0.169927,-0.207698,-0.287758,-0.924311,-0.740760,-0.347624,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.815344,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1456,0.393191,1.097101,1.129034,-0.766638,0.24989,-0.268332,0.339821,-0.237108,-0.694228,0.302765,-0.803606,-0.751066,0.164226,-0.508241,-0.549169,0.293401,0.110108,0.963091,0.678737,-1.211765,0.322070,0.298704,0.450863,-1.222534,1.718232,-0.29313,-1.276243,0.390868,1.087023,-0.249895,0.306528,0.478938,-0.572250,4.265298,-0.730432,-0.873616,1.246808,-0.772552,1.289758,-0.373528,0.561757,0.164209,-0.785025,-0.101197,-0.546995,-1.027363,-0.756321,1.385655,-0.207698,0.349546,0.623632,3.006130,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1457,0.393191,-0.630166,-0.595336,-0.766638,0.24989,-0.268332,0.339821,-1.788388,-0.694228,0.302765,1.097914,0.381578,0.164226,0.839008,1.492421,0.293401,0.110108,-1.119600,-0.955493,-1.991095,-3.470626,-3.698044,-2.214307,-0.214778,-0.229272,-0.29313,0.032381,-0.317233,-0.819679,3.822419,-2.320339,-2.196138,-0.572250,-0.332566,0.033715,0.655424,0.034605,-0.772552,-0.507284,0.683057,0.370284,-0.483263,-0.785025,-0.101197,-1.049006,-1.027363,-0.756321,0.169927,-0.207698,-0.287758,-0.924311,-0.108374,-0.229217,-0.359601,-0.103331,-0.285935,-0.06315,1.144312,0.289914,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [141]:
full_prepared_with_y = pd.concat([full_prepared_total, y_total], axis=1)

In [142]:
full_prepared_with_y

Unnamed: 0,MSZoning,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,BldgType,HouseStyle,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,FireplaceQu,GarageType,GarageQual,GarageCond,Fence,SaleCondition,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MasVnrArea,LotFrontage,GarageYrBlt,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,Street_Grvl,Street_Pave,Alley_Absence,Alley_Grvl,Alley_Pave,CentralAir_N,CentralAir_Y,SalePrice
0,0.393191,1.097101,1.129034,0.579156,0.24989,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,-0.508241,1.492421,0.293401,0.110108,0.963091,-0.955493,0.540480,0.322070,0.298704,0.450863,-0.214778,0.580907,-0.29313,-0.934863,-0.444328,1.087023,-0.249895,0.306528,0.348900,0.525202,-0.191815,0.998954,0.067331,-0.217879,0.646183,-0.507284,1.046258,0.896833,-0.773861,1.207379,-0.101197,0.413547,0.781366,1.232599,0.169927,-0.207698,0.986849,-0.924311,-0.740760,0.200006,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0,208500.0
1,0.393191,-1.095788,-1.085498,-0.766638,0.24989,-0.268332,0.339821,-0.237108,-0.694228,0.302765,-0.803606,0.381578,0.164226,2.516713,-0.549169,0.293401,0.110108,0.963091,0.678737,0.540480,0.322070,0.298704,0.450863,-0.214778,1.178112,-0.29313,-0.629896,0.477111,-0.819679,3.822419,0.306528,-0.059792,-0.572250,0.511940,-0.086940,-0.873616,-0.072044,-0.063185,2.188279,0.154764,-0.395604,0.261075,-0.785025,-0.101197,-0.471891,0.781366,-0.756321,0.169927,-0.207698,-0.287758,0.623632,1.614879,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.446925,-0.602962,0.0,1.0,1.0,0.0,0.0,0.0,1.0,181500.0
2,0.393191,1.097101,1.129034,0.579156,0.24989,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,0.383646,1.492421,0.293401,0.110108,0.963091,0.678737,0.540480,0.322070,0.298704,0.450863,-0.214778,0.097873,-0.29313,-0.288516,-0.299076,1.087023,-0.249895,0.306528,0.627553,0.334828,-0.051064,0.918517,0.067331,0.137197,0.646183,-0.507284,0.980221,0.848965,-0.610718,1.235375,-0.101197,0.563755,0.781366,1.232599,0.169927,-0.207698,-0.287758,0.623632,-0.740760,-0.081209,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0,223500.0
3,0.393191,-1.081485,-0.595336,0.579156,0.24989,-0.268332,0.339821,1.325041,-0.694228,0.302765,-1.245160,-0.751066,1.963481,-0.508241,-0.549169,0.293401,0.110108,-0.703473,1.202261,-1.211765,0.322070,0.298704,0.450863,-1.222534,-0.494941,-0.29313,-0.047275,-0.671283,1.087023,-0.249895,1.619961,0.785457,-0.572250,-0.426400,0.797862,0.302568,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.506205,0.978742,-0.101197,0.427382,-1.027363,-0.756321,0.169927,-0.207698,0.349546,0.623632,-0.740760,-0.184815,3.874967,-0.103331,-0.285935,-0.06315,-0.089592,-1.552184,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0,140000.0
4,0.393191,1.097101,1.129034,0.579156,0.24989,-0.268332,0.339821,1.325041,0.933597,0.302765,1.097914,0.381578,0.164226,0.839008,1.492421,0.293401,0.110108,0.963091,0.678737,0.540480,0.322070,0.298704,0.450863,-0.214778,0.468931,-0.29313,-0.161068,0.211573,1.087023,-0.249895,1.619961,1.686437,1.387486,0.699608,0.878299,0.067331,0.518903,1.355551,-0.507284,0.947203,0.753229,-0.037170,1.671651,-0.101197,1.378042,0.781366,1.232599,1.385655,-0.207698,1.624153,0.623632,0.776967,0.540424,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,2.132012,0.157646,0.0,1.0,1.0,0.0,0.0,0.0,1.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,-2.225331,1.681209,1.635368,-0.766638,0.24989,-0.268332,-3.024908,1.325041,-0.694228,0.302765,-0.803606,-0.751066,0.164226,-0.508241,-0.297636,0.293401,0.110108,-0.703473,-0.955493,-1.991095,-3.470626,-3.698044,0.450863,-0.214778,-0.969192,-0.29313,-0.033619,-1.147889,-0.819679,-0.249895,-2.320339,-2.196138,-0.572250,-2.256162,0.033715,2.419700,-1.043937,-1.481920,1.289758,-0.043346,-0.682812,-1.564083,0.488807,-0.101197,-0.807883,-1.027363,1.232599,0.169927,-0.207698,-0.925062,-0.924311,-0.740760,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.078505,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0,
1455,-2.225331,1.681209,1.635368,-0.766638,0.24989,-0.268332,0.060016,1.325041,-0.694228,0.302765,-0.803606,-0.751066,0.164226,-0.508241,-0.955144,0.293401,0.110108,-1.119600,-0.955493,-0.470421,0.322070,0.298704,0.450863,-1.222534,-0.415899,-0.29313,-0.607138,-1.147889,-0.819679,-0.249895,-1.006906,-0.867888,-0.572250,-2.256162,-0.328249,2.419700,-1.049263,-1.481920,-0.507284,-0.043346,-0.682812,-1.564083,0.488807,-0.101197,-0.807883,-1.027363,1.232599,0.169927,-0.207698,-0.287758,-0.924311,-0.740760,-0.347624,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,-0.815344,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0,
1456,0.393191,1.097101,1.129034,-0.766638,0.24989,-0.268332,0.339821,-0.237108,-0.694228,0.302765,-0.803606,-0.751066,0.164226,-0.508241,-0.549169,0.293401,0.110108,0.963091,0.678737,-1.211765,0.322070,0.298704,0.450863,-1.222534,1.718232,-0.29313,-1.276243,0.390868,1.087023,-0.249895,0.306528,0.478938,-0.572250,4.265298,-0.730432,-0.873616,1.246808,-0.772552,1.289758,-0.373528,0.561757,0.164209,-0.785025,-0.101197,-0.546995,-1.027363,-0.756321,1.385655,-0.207698,0.349546,0.623632,3.006130,-0.702843,-0.359601,-0.103331,-0.285935,-0.06315,-0.089592,1.026753,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0,
1457,0.393191,-0.630166,-0.595336,-0.766638,0.24989,-0.268332,0.339821,-1.788388,-0.694228,0.302765,1.097914,0.381578,0.164226,0.839008,1.492421,0.293401,0.110108,-1.119600,-0.955493,-1.991095,-3.470626,-3.698044,-2.214307,-0.214778,-0.229272,-0.29313,0.032381,-0.317233,-0.819679,3.822419,-2.320339,-2.196138,-0.572250,-0.332566,0.033715,0.655424,0.034605,-0.772552,-0.507284,0.683057,0.370284,-0.483263,-0.785025,-0.101197,-1.049006,-1.027363,-0.756321,0.169927,-0.207698,-0.287758,-0.924311,-0.108374,-0.229217,-0.359601,-0.103331,-0.285935,-0.06315,1.144312,0.289914,-1.363569,0.0,1.0,1.0,0.0,0.0,0.0,1.0,


In [143]:
train_prepared = full_prepared_with_y.iloc[:1460, :]

In [151]:
train_prepared.to_csv('../train_prepared_1st.csv', index=False)

In [147]:
test_prepared = full_prepared_with_y.iloc[1460:, :-1]

In [152]:
test_prepared.to_csv('../test_prepared_1st.csv', index=False)