In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer

# Load only the columns this walkthrough needs: two string-valued columns,
# three numeric columns, and SalePrice (used as the target further down).
cols_to_use = [
    "BsmtQual", "FireplaceQu",                   # string-valued
    "LotFrontage", "MasVnrArea", "GarageYrBlt",  # numeric
    "SalePrice",                                 # target
]

data = pd.read_csv("houseprice.csv", usecols=cols_to_use)

# Preview the first rows.
data.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [2]:
# Column means over the full dataset (computed before the train/test split).
(
    data[["LotFrontage", "MasVnrArea"]]
    .mean()
    .to_dict()
)

{'LotFrontage': 70.04995836802665, 'MasVnrArea': 103.68526170798899}

In [4]:
# Hold out 30% of the rows for testing; random_state pins the split.
# SalePrice is the target, every other loaded column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="SalePrice"),
    data["SalePrice"],
    test_size=0.3,
    random_state=0,
)

# Shapes of the two feature frames.
(X_train.shape, X_test.shape)

((1022, 5), (438, 5))

In [5]:
# Fraction of missing values per feature in the training split.
X_train.isna().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [6]:
# Median imputer with no explicit `variables`; the following cell inspects
# which columns it selected during fit.
imputer = MeanMedianImputer(imputation_method="median")
# Fit on the training split only.
imputer.fit(X_train)

In [7]:
# Columns the fitted imputer selected.
imputer.variables_

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [8]:
# Apply the fitted imputer to both splits (train first, then test),
# then preview the transformed training frame.
X_train_t, X_test_t = map(imputer.transform, (X_train, X_test))
X_train_t.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,69.0,573.0,Gd,,1998.0
682,69.0,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,,1979.0
1384,60.0,0.0,TA,,1939.0
1100,60.0,0.0,TA,,1930.0


In [9]:
# Missing-value fraction of the imputed columns after the transform.
X_train_t.loc[:, imputer.variables_].isna().mean()

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

In [11]:
# Mean imputation restricted to an explicit list of columns.
imputer = MeanMedianImputer(
    variables=["LotFrontage", "MasVnrArea"],
    imputation_method="mean",
).fit(X_train)

imputer.variables_

['LotFrontage', 'MasVnrArea']

In [12]:
# Chain two imputers so different columns get different strategies:
# median for LotFrontage / MasVnrArea, mean for GarageYrBlt.
pipe = Pipeline(
    [
        (
            "median_imputer",
            MeanMedianImputer(
                imputation_method="median",
                variables=["LotFrontage", "MasVnrArea"],
            ),
        ),
        (
            "mean_imputer",
            MeanMedianImputer(
                imputation_method="mean",
                variables=["GarageYrBlt"],
            ),
        ),
    ]
)
pipe.fit(X_train)

In [13]:
# Fill values held by the fitted median step.
pipe["median_imputer"].imputer_dict_

{'LotFrontage': 69.0, 'MasVnrArea': 0.0}

In [14]:
# Transform the training split with the fitted pipeline and display the result.
train_t = pipe.transform(X_train)
train_t

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,69.0,573.0,Gd,,1998.000000
682,69.0,0.0,Gd,Gd,1996.000000
960,50.0,0.0,TA,,1978.012397
1384,60.0,0.0,TA,,1939.000000
1100,60.0,0.0,TA,,1930.000000
...,...,...,...,...,...
763,82.0,673.0,Gd,Gd,1999.000000
835,60.0,0.0,Gd,,1996.000000
1216,68.0,0.0,,,1978.000000
559,69.0,18.0,Gd,TA,2003.000000


In [15]:
# Missing-value fraction per column after the numeric imputation pipeline.
train_t.isnull().mean()

LotFrontage    0.000000
MasVnrArea     0.000000
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.000000
dtype: float64

In [16]:
from feature_engine.imputation import ArbitraryNumberImputer

# Replace missing LotFrontage values with the fixed number -999.
imputer = ArbitraryNumberImputer(
    variables=["LotFrontage"],
    arbitrary_number=-999,
)
imputer.fit_transform(X_train)

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,-999.0,573.0,Gd,,1998.0
682,-999.0,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,,
1384,60.0,0.0,TA,,1939.0
1100,60.0,0.0,TA,,1930.0
...,...,...,...,...,...
763,82.0,673.0,Gd,Gd,1999.0
835,60.0,0.0,Gd,,1996.0
1216,68.0,0.0,,,1978.0
559,-999.0,18.0,Gd,TA,2003.0


In [17]:
# Per-column fill numbers supplied as a dict. The imputer is only constructed
# here; it is not fitted in this cell.
imputer = ArbitraryNumberImputer(
    imputer_dict={
        "LotFrontage": -999,
        "GarageYrBlt": -1,
    }
)

In [18]:
from feature_engine.imputation import CategoricalImputer

# "frequent" strategy with no explicit `variables`; show the fill value
# learned for each column the imputer selected.
imputer = CategoricalImputer(imputation_method="frequent").fit(X_train)
imputer.imputer_dict_

{'BsmtQual': 'TA', 'FireplaceQu': 'Gd'}

In [19]:
# Same "frequent" strategy, limited to BsmtQual. Constructed only, not fitted.
imputer = CategoricalImputer(
    imputation_method="frequent",
    variables=["BsmtQual"],
)

In [20]:
# Default CategoricalImputer settings; show the fill value learned per column.
imputer = CategoricalImputer().fit(X_train)
imputer.imputer_dict_

{'BsmtQual': 'Missing', 'FireplaceQu': 'Missing'}

In [21]:
# Default-strategy imputer limited to BsmtQual. The instance is neither
# assigned to a name nor fitted; the cell only displays it.
CategoricalImputer(variables=["BsmtQual"])

In [33]:
# Two categorical strategies in one pipeline: the "frequent" method for
# BsmtQual, the default method for FireplaceQu.
pipe = Pipeline(
    [
        (
            "imputer_mode",
            CategoricalImputer(
                imputation_method="frequent",
                variables=["BsmtQual"],
            ),
        ),
        (
            "imputer_missing",
            CategoricalImputer(variables=["FireplaceQu"]),
        ),
    ]
)
pipe.fit(X_train)

In [34]:
# Apply the fitted categorical pipeline to the training split. The pipeline
# only lists BsmtQual and FireplaceQu as variables.
pipe.transform(X_train)

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,,573.0,Gd,Missing,1998.0
682,,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,Missing,
1384,60.0,0.0,TA,Missing,1939.0
1100,60.0,0.0,TA,Missing,1930.0
...,...,...,...,...,...
763,82.0,673.0,Gd,Gd,1999.0
835,60.0,0.0,Gd,Missing,1996.0
1216,68.0,0.0,TA,Missing,1978.0
559,,18.0,Gd,TA,2003.0


In [35]:
# Fill value held by the fitted default-strategy step.
pipe["imputer_missing"].imputer_dict_

{'FireplaceQu': 'Missing'}

In [36]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder configured for dense output, with transforms returned as
# pandas objects.
ohe = OneHotEncoder(
    handle_unknown="infrequent_if_exist",
    sparse_output=False,
)
ohe = ohe.set_output(transform="pandas")

In [39]:
# Impute the two categorical columns, then one-hot encode them.
# Only the categorical columns are passed in, so the encoder never sees
# the numeric features.
pipe = Pipeline(
    [
        (
            "imputer_mode",
            CategoricalImputer(
                imputation_method="frequent",
                variables=["BsmtQual"],
            ),
        ),
        (
            "imputer_missing",
            CategoricalImputer(variables=["FireplaceQu"]),
        ),
        ("ohe", ohe),
    ]
)
pipe.fit_transform(X_train[["FireplaceQu", "BsmtQual"]])

Unnamed: 0,FireplaceQu_Ex,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_Missing,FireplaceQu_Po,FireplaceQu_TA,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA
64,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
682,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
960,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1384,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1100,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
763,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
835,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1216,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
559,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [25]:
from feature_engine.imputation import AddMissingIndicator
# Append indicator columns to the training split; no `variables` are given.
# NOTE(review): missing_only=True presumably limits indicators to columns
# that contain NaN at fit time — confirm against the feature_engine docs.
imputer = AddMissingIndicator(missing_only=True)
imputer.fit_transform(X_train)

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,LotFrontage_na,MasVnrArea_na,BsmtQual_na,FireplaceQu_na,GarageYrBlt_na
64,,573.0,Gd,,1998.0,1,0,0,1,0
682,,0.0,Gd,Gd,1996.0,1,0,0,0,0
960,50.0,0.0,TA,,,0,0,0,1,1
1384,60.0,0.0,TA,,1939.0,0,0,0,1,0
1100,60.0,0.0,TA,,1930.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
763,82.0,673.0,Gd,Gd,1999.0,0,0,0,0,0
835,60.0,0.0,Gd,,1996.0,0,0,0,1,0
1216,68.0,0.0,,,1978.0,0,0,1,1,0
559,,18.0,Gd,TA,2003.0,1,0,0,0,0


In [26]:
# Add missing-value indicators only for the two listed columns.
imputer = AddMissingIndicator(variables=["BsmtQual","LotFrontage"])
imputer.fit_transform(X_train)

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,BsmtQual_na,LotFrontage_na
64,,573.0,Gd,,1998.0,0,1
682,,0.0,Gd,Gd,1996.0,0,1
960,50.0,0.0,TA,,,0,0
1384,60.0,0.0,TA,,1939.0,0,0
1100,60.0,0.0,TA,,1930.0,0,0
...,...,...,...,...,...,...,...
763,82.0,673.0,Gd,Gd,1999.0,0,0
835,60.0,0.0,Gd,,1996.0,0,0
1216,68.0,0.0,,,1978.0,1,0
559,,18.0,Gd,TA,2003.0,0,1


In [27]:
# Full imputation pipeline. The indicator step runs first so it sees the
# original NaNs before the later steps fill them in.
pipe = Pipeline(
    [
        ("missing_ind", AddMissingIndicator()),
        (
            "imputer_mode",
            CategoricalImputer(
                imputation_method="frequent",
                variables=["FireplaceQu", "BsmtQual"],
            ),
        ),
        (
            "imputer_median",
            MeanMedianImputer(
                imputation_method="median",
                variables=["LotFrontage", "MasVnrArea", "GarageYrBlt"],
            ),
        ),
    ]
)
pipe.fit_transform(X_train)

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,LotFrontage_na,MasVnrArea_na,BsmtQual_na,FireplaceQu_na,GarageYrBlt_na
64,69.0,573.0,Gd,Gd,1998.0,1,0,0,1,0
682,69.0,0.0,Gd,Gd,1996.0,1,0,0,0,0
960,50.0,0.0,TA,Gd,1979.0,0,0,0,1,1
1384,60.0,0.0,TA,Gd,1939.0,0,0,0,1,0
1100,60.0,0.0,TA,Gd,1930.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
763,82.0,673.0,Gd,Gd,1999.0,0,0,0,0,0
835,60.0,0.0,Gd,Gd,1996.0,0,0,0,1,0
1216,68.0,0.0,TA,Gd,1978.0,0,0,1,1,0
559,69.0,18.0,Gd,TA,2003.0,1,0,0,0,0


In [28]:
# Fill values held by the fitted median step.
pipe["imputer_median"].imputer_dict_

{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}

In [29]:
from sklearn.preprocessing import StandardScaler

In [31]:
# Median-impute the numeric columns, then standardize them. set_output makes
# the pipeline return pandas objects from its transforms.
pipe = Pipeline(
    [
        (
            "imputer_median",
            MeanMedianImputer(
                imputation_method="median",
                variables=["LotFrontage", "MasVnrArea", "GarageYrBlt"],
            ),
        ),
        ("scaler", StandardScaler()),
    ]
)
pipe = pipe.set_output(transform="pandas")

In [32]:
# Fit and apply the impute-then-scale pipeline on the numeric columns only.
pipe.fit_transform(X_train[["LotFrontage","MasVnrArea","GarageYrBlt"]])

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt
64,-0.026172,2.593218,0.820202
682,-0.026172,-0.568617,0.737916
960,-0.938578,-0.568617,0.038486
1384,-0.458365,-0.568617,-1.607232
1100,-0.458365,-0.568617,-1.977519
...,...,...,...
763,0.598106,3.145021,0.861345
835,-0.458365,-0.568617,0.737916
1216,-0.074194,-0.568617,-0.002657
559,-0.026172,-0.469292,1.025917
