# 2. Mean or Median Imputation

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
# Load the house-price training CSV into a DataFrame.
# NOTE(review): the path is absolute; adjust it if the file lives elsewhere.
dataset=pd.read_csv("/content/kaggle_house_pred_train.csv")

In [None]:
# Print each column's non-null count and dtype to spot columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Get the numeric columns that need to be imputed

In [None]:
# Gather the dtype of every non-object column, one entry per column, so the
# length below equals the number of such columns.
# NOTE: despite its name, this list holds dtypes, not column labels.
numeric_cols = [col_dtype for col_dtype in dataset.dtypes if col_dtype != 'object']
len(numeric_cols)

38

In [None]:
# Labels of all numeric columns, selected directly by dtype kind.
# The earlier form passed `numeric_cols` back into `include=`, so the cell only
# worked the first time it was run: on a re-run `numeric_cols` already held
# column labels rather than dtypes. Selecting on "number" makes the cell
# idempotent and independent of execution order.
numeric_cols = dataset.select_dtypes(include="number").columns

In [None]:
# Plain Python list holding the labels of the numeric columns.
lst = dataset[numeric_cols].columns.to_list()
type(lst)

list

In [None]:
# Display the selected column labels.
numeric_cols

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [None]:
# Numeric columns that contain at least one missing value.
missing_cols = [col for col in numeric_cols if dataset[col].isna().any()]
missing_cols

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# Split the dataset into training and test data

In [None]:
# Hold out 30% of the rows as a test set. The features are the numeric columns
# without the target; the target is SalePrice. A fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset[numeric_cols].drop(columns='SalePrice'),
    dataset['SalePrice'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

(1022, 37)

# Identify Normally Distributed and Skewed Features

In [None]:
# Columns with missing values whose training-set skewness exceeds 0.5 in
# magnitude (skewed in either direction); these are imputed with the median.
# The previous condition (`skew > 0.5 or skew < 0.5`) was true for every value
# except exactly 0.5, so it never filtered anything; the lower bound must be
# negative, which `abs(...) > 0.5` expresses directly.
median_list = [feat for feat in missing_cols if abs(X_train[feat].skew()) > 0.5]

In [None]:
# Display the columns selected for median imputation.
median_list

(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], LotFrontage    1.884780
 MasVnrArea     2.758202
 GarageYrBlt   -0.666976
 dtype: float64)

In [None]:
# Median-impute the columns in `median_list`; every other column is passed
# through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('median_imputer', SimpleImputer(strategy='median'), median_list)],
    remainder='passthrough',
)

In [None]:
# Fit on the training split only, so the imputation statistics are learned
# without looking at the test split.
preprocessor.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('median_imputer',
                                 SimpleImputer(strategy='median'),
                                 ['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])])

In [None]:
# Fill values learned by the median imputer, one per imputed column.
preprocessor.named_transformers_['median_imputer'].statistics_


array([  69.,    0., 1979.])

In [None]:
# Apply the fitted imputation to the training split.
# NOTE(review): this rebinds X_train to the transformed result, so the raw
# training frame is no longer available under this name; re-running the cell
# would transform already-transformed data — confirm before re-executing.
X_train = preprocessor.transform(X_train)

In [None]:
# Apply the imputation fitted on the training split to the test split.
X_test = preprocessor.transform(X_test)

# Sanity check: the share of NaN entries left in the test matrix should be 0.
np.isnan(X_test).mean()

0.0

In [None]:
# Inspect the fitted transformers, including which columns fell into the
# passthrough remainder.
preprocessor.transformers_


[('median_imputer',
  SimpleImputer(strategy='median'),
  ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']),
 ('remainder',
  'passthrough',
  [0,
   1,
   3,
   4,
   5,
   6,
   7,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36])]

In [None]:
# Labels of the numeric columns as they appear in the full dataset.
cols_to_use = dataset.loc[:, numeric_cols].columns
cols_to_use

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [None]:
# Labels of the columns the ColumnTransformer passes through untouched, in
# their original order: every numeric column that is neither imputed nor the
# target (SalePrice was dropped before the split).
# Deriving them from the column names replaces a hard-coded list of positional
# indices copied from an earlier output, which would silently go stale if the
# imputed columns or the dataset's column order changed.
remainder_cols = [col for col in lst if col not in median_list and col != 'SalePrice']
remainder_cols

['Id',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [None]:
# Re-attach column labels to the transformed training data for inspection:
# the imputed columns come first, followed by the passthrough columns.
pd.DataFrame(data=X_train, columns=[*median_list, *remainder_cols]).head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,69.0,573.0,1998.0,65.0,60.0,9375.0,7.0,5.0,1997.0,1998.0,...,645.0,576.0,36.0,0.0,0.0,0.0,0.0,0.0,2.0,2009.0
1,69.0,0.0,1996.0,683.0,120.0,2887.0,6.0,5.0,1996.0,1997.0,...,431.0,307.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,2008.0
2,50.0,0.0,1979.0,961.0,20.0,7207.0,5.0,7.0,1958.0,2008.0,...,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2010.0
3,60.0,0.0,1939.0,1385.0,50.0,9060.0,6.0,5.0,1939.0,1950.0,...,280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,2009.0
4,60.0,0.0,1930.0,1101.0,30.0,8400.0,2.0,5.0,1920.0,1950.0,...,246.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009.0
