In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer

# Columns read from the CSV. The last entry, 'SalePrice', is the target and is
# separated from the features before the train/test split.
cols_to_use = [
    'MSSubClass', 'LotFrontage', 'LotArea',
    'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
    'MoSold', 'YrSold',
    'SalePrice',
]
# Load only the columns listed in cols_to_use.
data = pd.read_csv('houseprice.csv', usecols=cols_to_use)

# Find variables with missing data.
# The threshold is "> 0": the previous "> 1" check silently skipped any
# column that had exactly one missing value.
missing_counts = data.isnull().sum()
for var, n_missing in missing_counts[missing_counts > 0].items():
    print(var, n_missing)

LotFrontage 259
MasVnrArea 8
GarageYrBlt 81


In [2]:
# Preview the first rows of the loaded frame (features plus the SalePrice target).
data.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000


In [3]:
# Take the target out of the feature list. The membership guard keeps this
# cell re-runnable: an unconditional list.remove() raises ValueError the
# second time the cell is executed, because 'SalePrice' is already gone.
if 'SalePrice' in cols_to_use:
    cols_to_use.remove('SalePrice')

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[cols_to_use],
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

# Display (rows, columns) of each split.
X_train.shape, X_test.shape

((1022, 36), (438, 36))

In [4]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index.
# Only the feature frames are re-indexed here; y_train / y_test keep their
# original labels.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [5]:
# KNN imputer: each missing value is filled from the 8 nearest rows, weighted
# by inverse distance, using the NaN-aware Euclidean metric. No missing-value
# indicator columns are appended.
imputer = KNNImputer(
    n_neighbors=8,
    weights="distance",
    metric="nan_euclidean",
    add_indicator=False,
)

# Return DataFrames (with column names) from transform() instead of arrays.
imputer.set_output(transform="pandas")

# Learn the neighbours from the training split only.
imputer.fit(X_train)

In [6]:
# Impute both splits with the imputer fitted on the training data.
train_t, test_t = (imputer.transform(frame) for frame in (X_train, X_test))

In [7]:
# Display the imputed training frame (pandas truncates the rendering of large frames).
train_t

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60.0,73.849884,9375.0,7.0,5.0,1997.0,1998.0,573.0,739.0,0.0,...,645.0,576.0,36.0,0.0,0.0,0.0,0.0,0.0,2.0,2009.0
1,120.0,37.966017,2887.0,6.0,5.0,1996.0,1997.0,0.0,1003.0,0.0,...,431.0,307.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,2008.0
2,20.0,50.000000,7207.0,5.0,7.0,1958.0,2008.0,0.0,696.0,0.0,...,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2010.0
3,50.0,60.000000,9060.0,6.0,5.0,1939.0,1950.0,0.0,204.0,0.0,...,280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,2009.0
4,30.0,60.000000,8400.0,2.0,5.0,1920.0,1950.0,0.0,290.0,0.0,...,246.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,60.0,82.000000,9430.0,8.0,5.0,1999.0,1999.0,673.0,1163.0,0.0,...,856.0,0.0,128.0,0.0,0.0,180.0,0.0,0.0,7.0,2009.0
1018,20.0,60.000000,9600.0,4.0,7.0,1950.0,1995.0,0.0,442.0,0.0,...,436.0,290.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2010.0
1019,90.0,68.000000,8930.0,6.0,5.0,1978.0,1978.0,0.0,0.0,0.0,...,539.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010.0
1020,120.0,43.286342,3196.0,7.0,5.0,2003.0,2004.0,18.0,0.0,0.0,...,420.0,143.0,20.0,0.0,0.0,0.0,0.0,0.0,10.0,2006.0


In [8]:
# Rows of the raw training frame where MasVnrArea is missing (before imputation).
X_train.loc[X_train["MasVnrArea"].isnull(), "MasVnrArea"]

420   NaN
490   NaN
642   NaN
824   NaN
921   NaN
Name: MasVnrArea, dtype: float64

In [9]:
# The same rows after imputation: the mask comes from the raw frame, the values
# from the imputed frame.
train_t.loc[X_train["MasVnrArea"].isnull(), "MasVnrArea"]

420    127.323004
490     57.519619
642      1.767345
824    259.219897
921    136.168649
Name: MasVnrArea, dtype: float64

In [10]:
# Mean of MasVnrArea in the raw training data (pandas' mean skips NaN by default).
X_train["MasVnrArea"].mean()

103.55358898721731

In [11]:
# Mean of MasVnrArea after imputation, for comparison with the raw mean.
train_t["MasVnrArea"].mean()

103.61643690358332

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [13]:
# Re-create the raw (un-imputed) split for the pipeline below.
# The target is filtered out of the feature list here instead of relying on an
# earlier cell having already removed it from cols_to_use; otherwise a re-run
# of the first cell would leak SalePrice into the features.
feature_cols = [col for col in cols_to_use if col != 'SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols],  # just the features
    data['SalePrice'],  # the target
    test_size=0.3,  # the percentage of obs in the test set
    random_state=0)  # for reproducibility

# Display (rows, columns) of each split.
X_train.shape, X_test.shape

((1022, 36), (438, 36))

In [14]:
# Pipeline: impute -> standardise -> Lasso regression.
# NOTE(review): the imputer runs before the scaler, so neighbour distances are
# computed on unscaled features - confirm this ordering is intended.
knn_step = KNNImputer(n_neighbors=5, weights="distance", add_indicator=False)
lasso_step = Lasso(max_iter=2000)

pipe = Pipeline(
    steps=[
        ("imputer", knn_step),
        ("scaler", StandardScaler()),
        ("regressor", lasso_step),
    ]
)

In [16]:
# Hyper-parameters to search, addressed as "<step name>__<parameter>".
# 4 * 2 * 2 * 3 = 48 candidate settings, each evaluated with 10-fold CV.
param_grid = {
    "imputer__n_neighbors": [5, 10, 13, 15],
    "imputer__weights": ["uniform", "distance"],
    "imputer__add_indicator": [True, False],
    "regressor__alpha": [10, 100, 200],
}

# Exhaustive search scored by R^2, using all available cores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=10,
    scoring="r2",
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [17]:
# Mean cross-validated R^2 of the best parameter combination.
grid_search.best_score_

0.8243245469452933

In [18]:
# Parameter combination that achieved the best cross-validated score.
# NOTE(review): if the selected alpha sits at the edge of the searched range,
# consider widening the regressor__alpha grid.
grid_search.best_params_

{'imputer__add_indicator': True,
 'imputer__n_neighbors': 5,
 'imputer__weights': 'distance',
 'regressor__alpha': 200}

In [19]:
# R^2 of the refit best pipeline on the held-out test split.
grid_search.score(X_test,y_test)

0.7298837148786135