In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

## LOAD DATA TO PANDAS DATAFRAME

Goal: build a model that, given the housing data, predicts the sale price of a house.

In [4]:
# Relative location of the Iowa housing CSV file.
csv_file_path = './csv/iowa_housing.csv'

In [5]:
# Read the raw table, using the 'Id' column as the row index.
X = pd.read_csv(csv_file_path, index_col='Id')

# Split the target off the feature table: pop() returns the SalePrice column
# and removes it from X in the same step.
y = X.pop('SalePrice')

# Keep only the columns whose dtype is not 'object' (drops the non-numerical ones).
X = X.select_dtypes(exclude=['object'])

In [6]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# random_state pins the shuffle so that re-running the notebook produces the
# same split; without it every execution partitions the rows differently and
# the scores computed later are not comparable between runs.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

## HANDLE MISSING VALUES

Find columns with missing values

In [9]:
# isnull().any() yields one boolean per column (True if the training split has
# at least one null there); use it as a mask to pick out those column names.
cols_with_missing_value = train_X.columns[train_X.isnull().any()].tolist()

Method 1: Drop columns with missing values

In [10]:
# Remove every column listed in cols_with_missing_value from both splits,
# so the train and test feature sets still line up afterwards.
reduced_train_X, reduced_test_X = (
    frame.drop(columns=cols_with_missing_value) for frame in (train_X, test_X)
)

Method 2: Imputation

Imputation fills in each missing value with an estimate derived from the values that are present, for example the mean of that column.

In [14]:
from sklearn.impute import SimpleImputer

# Mean imputation: with its default settings SimpleImputer replaces each
# missing entry with the mean of its column.
my_imputer = SimpleImputer()

# Fit on the training split only, then apply the fitted imputer to the test
# split. The imputer's output carries no column names, so the original labels
# are supplied while wrapping the results back into DataFrames.
imputed_train_X = pd.DataFrame(my_imputer.fit_transform(train_X), columns=train_X.columns)
imputed_test_X = pd.DataFrame(my_imputer.transform(test_X), columns=test_X.columns)

We can also add a flag column for each column that had missing values, recording which entries were imputed.

In [16]:
# Extend each split with one boolean indicator column per affected feature,
# named "<feature>Missing", marking the rows where that feature is null.
# assign() returns a new frame, so train_X / test_X themselves stay untouched.
train_X_plus = train_X.assign(
    **{col + 'Missing': train_X[col].isnull() for col in cols_with_missing_value}
)
test_X_plus = test_X.assign(
    **{col + 'Missing': test_X[col].isnull() for col in cols_with_missing_value}
)

# Impute the extended frames the same way as before: fit on the training
# split, apply to the test split, and restore the column labels while
# wrapping the results back into DataFrames.
imputed_train_X_plus = pd.DataFrame(
    my_imputer.fit_transform(train_X_plus), columns=train_X_plus.columns
)
imputed_test_X_plus = pd.DataFrame(
    my_imputer.transform(test_X_plus), columns=test_X_plus.columns
)

print(imputed_train_X_plus.head())

   MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0        60.0    70.537981  10304.0          5.0          7.0     1976.0   
1        50.0    50.000000   8405.0          5.0          8.0     1900.0   
2        30.0    51.000000   6120.0          2.0          3.0     1936.0   
3        20.0   110.000000  14977.0          8.0          5.0     2006.0   
4        20.0    70.537981   9991.0          4.0          4.0     1976.0   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  EnclosedPorch  \
0        1976.0        44.0       381.0         0.0  ...            0.0   
1        1950.0         0.0       241.0       391.0  ...          112.0   
2        1950.0         0.0         0.0         0.0  ...            0.0   
3        2007.0       304.0      1350.0         0.0  ...            0.0   
4        1993.0         0.0      1116.0         0.0  ...            0.0   

   3SsnPorch  ScreenPorch  PoolArea  MiscVal  MoSold  YrSold  \
0        0.0          0.0   

Train Model

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Random forest with 150 trees. random_state fixes the forest's internal
# randomness so that fitting on the same data always yields the same model;
# without it the predictions and any score computed from them vary per run.
model = RandomForestRegressor(n_estimators=150, random_state=0)
# Train on the imputed training features (including the missing-value flags).
model.fit(imputed_train_X_plus, train_y)

In [18]:
# Predict on the held-out features, then report the mean absolute error
# between the true targets and those predictions.
predict_y = model.predict(imputed_test_X_plus)
print("MAE: ", mean_absolute_error(y_true=test_y, y_pred=predict_y))

MAE:  16293.503333333334
