In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# these are the objects we need to impute missing data
# with sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# to split the datasets
from sklearn.model_selection import train_test_split


In [2]:
# Columns kept for this demo: a mix of categorical and numerical
# variables, plus the target. The order is preserved because it later
# determines the column order of the feature matrix.
cols_to_use = [
    # categorical
    'BsmtQual',
    'FireplaceQu',
    'MSZoning',
    # numerical
    'BsmtUnfSF',
    'LotFrontage',
    'MasVnrArea',
    # categorical
    'Street',
    'Alley',
    # target
    'SalePrice',
]


In [3]:
# Read the House Prices dataset, keeping only the columns
# selected for the demo.
data = pd.read_csv(
    'houseprice.csv',
    usecols=cols_to_use,
)

# report (n_rows, n_columns), then preview the first rows
print(data.shape)
data.head()


(1460, 9)


Unnamed: 0,MSZoning,LotFrontage,Street,Alley,MasVnrArea,BsmtQual,BsmtUnfSF,FireplaceQu,SalePrice
0,RL,65.0,Pave,,196.0,Gd,150,,208500
1,RL,80.0,Pave,,0.0,Gd,284,TA,181500
2,RL,68.0,Pave,,162.0,Gd,434,TA,223500
3,RL,60.0,Pave,,0.0,TA,540,Gd,140000
4,RL,84.0,Pave,,350.0,Gd,490,TA,250000


In [4]:
# Let's check the fraction of missing values in each variable.
# It is printed explicitly because a notebook cell only displays the value
# of its last expression, and that slot is taken by `data.dtypes` below;
# as a bare expression the result would be computed and silently dropped.
print(data.isnull().mean())

# data type of each variable
data.dtypes
# The categorical variables Alley, BsmtQual and FireplaceQu contain missing data.
# Next, let's separate the data into training and testing sets.

MSZoning        object
LotFrontage    float64
Street          object
Alley           object
MasVnrArea     float64
BsmtQual        object
BsmtUnfSF        int64
FireplaceQu     object
SalePrice        int64
dtype: object

In [5]:
# First let's remove the target from the list of feature names.
# The list is rebuilt instead of mutated with `.remove()`: `.remove()` raises
# ValueError when 'SalePrice' is already gone, which made this cell fail
# whenever it was run a second time. Rebinding leaves `cols_to_use` in the
# same state after the first run and is a no-op on later runs.
cols_to_use = [col for col in cols_to_use if col != 'SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    data[cols_to_use],   # just the features
    data['SalePrice'],   # the target
    test_size=0.3,       # the fraction of obs in the test set
    random_state=0,      # for reproducibility
)

# (n_rows, n_columns) of the train and test feature sets
X_train.shape, X_test.shape


((1022, 8), (438, 8))

In [6]:
# Let's look at the fraction of missing values per feature in the train set.
# It is printed explicitly: this cell ends with assignments, so as a bare
# expression the result would be evaluated and silently discarded.
print(X_train.isnull().mean())

# For the demo, we impute categorical variables with the most frequent
# category and numerical variables with the mean.

# First we need to make lists indicating which features
# will be imputed with each method.
features_numeric = ['BsmtUnfSF', 'LotFrontage', 'MasVnrArea']
features_categoric = ['BsmtQual', 'FireplaceQu', 'MSZoning',
                      'Street', 'Alley']

In [7]:
# Pair each imputer with the list of features it handles inside a
# ColumnTransformer, and fit it on the train set in the same statement
# (`fit` returns the fitted transformer itself, so `preprocessor` ends up
# bound to the fitted object).
preprocessor = ColumnTransformer(
    transformers=[
        # numerical features: replace missing values with the mean
        ('numeric_imputer',
         SimpleImputer(strategy='mean'),
         features_numeric),
        # categorical features: replace missing values with the most
        # frequent category
        ('categoric_imputer',
         SimpleImputer(strategy='most_frequent'),
         features_categoric),
    ],
).fit(X_train)

# the (name, transformer, columns) triplets we passed in can be
# explored like this:
preprocessor.transformers



[('numeric_imputer',
  SimpleImputer(),
  ['BsmtUnfSF', 'LotFrontage', 'MasVnrArea']),
 ('categoric_imputer',
  SimpleImputer(strategy='most_frequent'),
  ['BsmtQual', 'FireplaceQu', 'MSZoning', 'Street', 'Alley'])]

In [8]:
# Each fitted imputer stores the values it learned in its `statistics_`
# attribute; here we corroborate them against the train set.
# Intermediate results are printed explicitly, because a notebook cell only
# displays the value of its last expression.

# means learned by the numeric imputer (in the order of features_numeric) ...
print(preprocessor.named_transformers_['numeric_imputer'].statistics_)
# ... and the means computed directly on the train set
print(X_train[features_numeric].mean())

# most frequent categories learned by the categoric imputer
# (in the order of features_categoric) ...
print(preprocessor.named_transformers_['categoric_imputer'].statistics_)
# ... and the modes computed directly on the train set
X_train[features_categoric].mode()

Unnamed: 0,BsmtQual,FireplaceQu,MSZoning,Street,Alley
0,TA,Gd,RL,Pave,Pave


In [9]:
# And now we can impute the data.
# NOTE: this rebinds X_train / X_test to the transformed output, so the
# cells above that expect the original DataFrames (e.g. `.isnull()`,
# selection by column name) cannot be re-run after this one without first
# re-creating the train/test split.
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

# See how the result of the imputation contains all 8 features
# (the 3 numerical ones followed by the 5 categorical ones, i.e. the order
# of the transformers), not just 3 columns. The transformed output carries
# no column labels, so we put them back to preview the result.
pd.DataFrame(X_train,
             columns=features_numeric + features_categoric).head()

Have a nice day!