In [1]:
#import libraries and functions
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the competition data; the 'Id' column becomes the row index.
X_full = pd.read_csv(
    '~/Desktop/Kaggle/practice/Intro_Machine_Learning/House_Price_Competition/train.csv',
    index_col='Id',
)
X_test_full = pd.read_csv(
    '~/Desktop/Kaggle/practice/Intro_Machine_Learning/House_Price_Competition/test.csv',
    index_col='Id',
)

# Discard rows that have no target, then split the target off from the predictors.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Keep only the columns whose dtype is not 'object'.
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Hold out 20% of the training rows as a validation set (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

### METHOD 1: Drop columns with missing values

In [2]:
# Helper for comparing different missing-value approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and return its validation MAE.

    Each missing-value approach passes in its own preprocessed
    ``X_train`` / ``X_valid``; the model settings (100 trees,
    ``random_state=0``) are the same on every call.

    Returns the mean absolute error between ``y_valid`` and the model's
    predictions for ``X_valid``.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [3]:
# Names of the columns that contain at least one missing value in the training split
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both the training and the validation split
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

In [4]:
# Validation MAE for Method 1 (columns with missing values dropped)
mae_drop_columns = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print('MAE:', mae_drop_columns)

MAE: 17837.82570776256


### METHOD 2: Imputation

In [5]:
# Imputation with SimpleImputer's default settings.
# The imputer is fitted on the training split only and then applied unchanged
# to the validation split.
my_imputer = SimpleImputer()

# The imputer output is wrapped back into DataFrames. Restore both the column
# names and the original row index in the constructor: without index=..., the
# frames would get a fresh 0..n-1 index and no longer be label-aligned with
# y_train / y_valid.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

In [6]:
# Validation MAE for Method 2 (imputed values instead of dropped columns)
mae_imputation = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
print('MAE:', mae_imputation)

MAE: 18062.894611872147


### CONCLUSION:
Method 1 performed better than Method 2 (it has the lower MAE). This is likely because filling with the mean value makes less sense than filling with zeros, the most frequent value, or some other method. For example, the column `GarageYrBlt` has missing values, which means some houses do not have a garage. Thus, it makes more sense to fill those entries with 0.

### Generate Test Prediction: 

#### Part A

In [7]:
# Preprocess the training and validation data for the final model.

# Median imputation, fitted on the training split only and reused for validation.
final_imputer = SimpleImputer(strategy='median')

# Rebuild DataFrames from the imputer output, restoring the column names and the
# original row index so the frames stay label-aligned with y_train / y_valid
# (a bare pd.DataFrame(array) would reset the index to 0..n-1).
final_X_train = pd.DataFrame(
    final_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
final_X_valid = pd.DataFrame(
    final_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

# Define and fit the model (same settings as score_dataset: 100 trees, fixed seed).
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)

# Predict on the validation split.
preds_val = model.predict(final_X_valid)

# Report the validation MAE.
final_MAE = mean_absolute_error(y_valid, preds_val)
print('MAE:', final_MAE)

MAE: 17791.59899543379


#### Part B 

In [10]:
# Preprocess the test data with the imputer that was fitted on the training split.
# Column names and the row index are restored: the model was fitted on a frame
# with named columns, so predicting on a frame with default integer column
# labels makes scikit-learn warn about invalid feature names.
final_X_test = pd.DataFrame(
    final_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Get test predictions.
preds_test = model.predict(final_X_test)

# Save output in the submission format: one row per test Id with its predicted SalePrice.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('missing_values_submission.csv', index=False)