In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [11]:
# Load the training and test sets, using the "Id" column as the row index
X_full = pd.read_csv("train.csv", index_col="Id")
X_test_full = pd.read_csv("test.csv", index_col="Id")

# Discard rows that have no target value, then split the target off from the predictors
X_full = X_full.dropna(axis=0, subset=["SalePrice"])
y = X_full["SalePrice"]
X_full = X_full.drop(columns=["SalePrice"])

# Restrict both frames to their non-object (numerical) columns
X = X_full.select_dtypes(exclude=["object"])
X_test = X_test_full.select_dtypes(exclude=["object"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
train_X, val_X, train_y, val_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)


### Handling missing values

In [12]:
# Dimensions of the training split as (rows, columns)
print(train_X.shape)

# Per-column tally of missing entries; only columns with at least one are shown
missing_val_count_by_column = train_X.isna().sum()
print(missing_val_count_by_column[missing_val_count_by_column.gt(0)])

(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [13]:
# Add up the per-column counts to get the overall number of missing entries
tot_missing = missing_val_count_by_column.sum()
print(f"Total missing values in training data: {tot_missing}")

# Fraction of training rows that are missing in the most incomplete column
worst_col_missing_share = missing_val_count_by_column.max() / len(train_X)
print(f"Percentage of missing values in col with highest amount of missing values intraining data: {worst_col_missing_share:.2%}")

Total missing values in training data: 276
Percentage of missing values in col with highest amount of missing values intraining data: 18.15%


In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# define a function to get the MAE for a given model
def get_mae(train_X=train_X, val_X=val_X, train_y=train_y, val_y=val_y,
            n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Note: the data defaults are bound to the notebook-level splits that exist
    when this cell runs (Python evaluates default values once, at definition
    time), so re-run this cell after re-creating the splits.

    Args:
        train_X: Training predictors.
        val_X: Validation predictors.
        train_y: Training target.
        val_y: Validation target.
        n_estimators: Number of trees in the forest (default 100).
        random_state: Seed passed to the forest (default 0).

    Returns:
        Mean absolute error between ``val_y`` and the predictions for ``val_X``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

Since there are relatively few missing entries in the data (in fact, the column with the most missing values has only about 18% of its entries missing), dropping columns with missing values is unlikely to yield good results, as we would be throwing away a lot of valuable data.

But let's test that anyway.

### Drop columns with missing values

In [15]:
# Names of the training columns that contain at least one missing entry
cols_with_missing = train_X.columns[train_X.isnull().any()].tolist()

# Remove those columns from both splits so they keep the same set of features
reduced_train_X = train_X.drop(columns=cols_with_missing)
reduced_val_X = val_X.drop(columns=cols_with_missing)

In [16]:
# Validation MAE of the model trained without the incomplete columns
print("MAE from dropping columns with missing values:")
dropped_columns_mae = get_mae(reduced_train_X, reduced_val_X, train_y, val_y)
print(dropped_columns_mae)

MAE from dropping columns with missing values:
17837.82570776256


### Using imputation

In [18]:
from sklearn.impute import SimpleImputer
# Imputer with default settings: fill values are learned on the training split
# only, then applied unchanged to the validation split.
my_imputer = SimpleImputer()

# The imputer output carries no labels, so rebuild each frame with the original
# row index and column names. Keeping the Id index (rather than a fresh
# RangeIndex) keeps the frames aligned with train_y / val_y for index-based ops.
imputed_train_X = pd.DataFrame(
    my_imputer.fit_transform(train_X), index=train_X.index, columns=train_X.columns
)
imputed_val_X = pd.DataFrame(
    my_imputer.transform(val_X), index=val_X.index, columns=val_X.columns
)

print("MAE from Imputation:")
print(get_mae(imputed_train_X, imputed_val_X, train_y, val_y))

MAE from Imputation:
18062.894611872147


In [19]:
# let us impute with the median instead of the mean
# The medians are learned on the training split only and reused for validation.
final_imputer = SimpleImputer(strategy="median")

# The imputer output carries no labels, so rebuild each frame with the original
# row index and column names. Keeping the Id index (rather than a fresh
# RangeIndex) keeps the frames aligned with train_y / val_y for index-based ops.
final_train_X = pd.DataFrame(
    final_imputer.fit_transform(train_X), index=train_X.index, columns=train_X.columns
)
final_val_X = pd.DataFrame(
    final_imputer.transform(val_X), index=val_X.index, columns=val_X.columns
)



In [20]:
# Train the random forest on the median-imputed training split
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(final_train_X, train_y)

# Predict on the validation split and report the mean absolute error
preds_valid = model.predict(final_val_X)
print("MAE (Your approach):", mean_absolute_error(val_y, preds_valid), sep="\n")

MAE (Your approach):
17791.59899543379


In [21]:
# Preprocess the test data with the imputer already fitted on the training split
# (transform only, never re-fit on test data). The frame is rebuilt with the
# original Id index and column names so predictions can be matched back to rows.
final_test_X = pd.DataFrame(
    final_imputer.transform(X_test), index=X_test.index, columns=X_test.columns
)

# Get test predictions
test_preds = model.predict(final_test_X)

## Dealing with categorical variables

Categorical variables: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
