In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv(r"C:\Users\Bhakti Gajipara\OneDrive\Documents\machine learing\train.csv")
test = pd.read_csv(r"C:\Users\Bhakti Gajipara\OneDrive\Documents\machine learing\test.csv")

In [3]:
print("Shape of train dataframe",train.shape)
print("Shape of test dataframe",test.shape)

Shape of train dataframe (1460, 81)
Shape of test dataframe (1459, 80)


In [4]:
x_train = train.drop(columns = "SalePrice",axis = 1)
y_train = train["SalePrice"]
x_test = test.copy()
print("Shape of x_train",x_train.shape)
print("Shape of y_train",y_train.shape)
print("Shape of x_test",x_test.shape)

Shape of x_train (1460, 80)
Shape of y_train (1460,)
Shape of x_test (1459, 80)


# Missing value imputation

In [5]:
isnull_sum = x_train.isnull().sum()
isnull_sum

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [6]:
num_var = x_train.select_dtypes(include = ["int64","float64"]).columns
num_var_miss = [var for var in num_var if isnull_sum[var] > 0]
num_var_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [7]:
cat_var = x_train.select_dtypes(include = ["O"]).columns
cat_var_miss = [var for var in cat_var if isnull_sum[var] > 0]
cat_var_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [8]:
num_var_mean = ['LotFrontage']
num_var_median = ['MasVnrArea', 'GarageYrBlt']
cat_var_mod = ['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu']
cat_var_miss = ['GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [9]:
num_var_mean_imputer = Pipeline(steps = [("imputer",SimpleImputer(strategy = "mean"))])
num_var_median_imputer = Pipeline(steps = [("imputer",SimpleImputer(strategy = "median"))])
num_var_mod_imputer = Pipeline(steps = [("imputer",SimpleImputer(strategy = "most_frequent"))])
num_var_missing_imputer = Pipeline(steps = [("imputer",SimpleImputer(strategy = "constant",fill_value="Missing"))])

In [10]:
preprocessor = ColumnTransformer(transformers=[("mean_imputer",num_var_mean_imputer,num_var_mean),
                                ("mediun_imputer",num_var_median_imputer,num_var_median),
                                ("mod_imputer",num_var_mod_imputer,cat_var_mod),
                                ("miss_imputer",num_var_mod_imputer,cat_var_miss)])

In [11]:
preprocessor.fit(x_train)

In [12]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('mediun_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mod_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('miss_impute

In [14]:
preprocessor.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([70.04995837])

In [15]:
preprocessor.named_transformers_["mod_imputer"].named_steps["imputer"].statistics_

array(['Grvl', 'BrkFace', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [16]:
x_train_clean = preprocessor.transform(x_train)
x_test_clean = preprocessor.transform(x_test)

In [17]:
x_train_clean

array([[65.0, 196.0, 2003.0, ..., 'Gd', 'MnPrv', 'Shed'],
       [80.0, 0.0, 1976.0, ..., 'Gd', 'MnPrv', 'Shed'],
       [68.0, 162.0, 2001.0, ..., 'Gd', 'MnPrv', 'Shed'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'Gd', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'Gd', 'MnPrv', 'Shed'],
       [75.0, 0.0, 1965.0, ..., 'Gd', 'MnPrv', 'Shed']], dtype=object)

In [18]:
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('mediun_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mod_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('miss_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0, 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 27, 28, 29, 34, 36, 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 61, 62, 65, 66, 67, 68, 69, 70, 71, 75, 76, 77, 78, 79])]

In [19]:
x_train_clean_miss = pd.DataFrame(x_test_clean,columns = num_var_mean+num_var_median+cat_var_mod+cat_var_miss)

In [21]:
x_train_clean_miss.isnull().sum().sum()

0

In [22]:
train["Alley"].value_counts()

Alley
Grvl    50
Pave    41
Name: count, dtype: int64

In [23]:
x_train_clean_miss["Alley"].value_counts()

Alley
Grvl    1422
Pave      37
Name: count, dtype: int64

In [24]:
x_train_clean_miss["MiscFeature"].value_counts()

MiscFeature
Shed    1454
Gar2       3
Othr       2
Name: count, dtype: int64