In [457]:
import pandas as pd                                     # Data analysis tool
import numpy as np                                      # Package for scientific computing
from sklearn.model_selection import train_test_split    # Splits arrays or matrices into random train and test subsets
from sklearn.model_selection import KFold               # Cross-validator
from sklearn.model_selection import cross_validate      # Evaluate metrics by cross-validation
from sklearn.model_selection import GridSearchCV        # Search over specified parameter values for an estimator
from sklearn.compose import ColumnTransformer           # Applies transformers to columns of DataFrames
from sklearn.pipeline import Pipeline                   # Helps building a chain of transforms and estimators
from sklearn.impute import SimpleImputer                # Imputation transformer for completing missing values
from sklearn.preprocessing import OneHotEncoder         # Encode categorical features
from sklearn.metrics import mean_absolute_error         # One of many statistical measures of error
from xgboost import XGBRegressor                        # Our model estimator

In [458]:
# Read the labelled training data and the unlabelled test data, indexed by 'Id'
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Separate the target (SalePrice) from the predictors without touching X_full
y = X_full['SalePrice'].copy()
X = X_full.drop(columns=['SalePrice'])

In [459]:
# Sanity check: dimensions of the predictors, then of the target, one per line
print(X.shape, y.shape, sep='\n')

(1460, 79)
(1460,)


In [460]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train_full, X_valid_full, y_train, y_valid = split_parts

In [461]:
# Print a per-column summary of the predictors (non-null counts and dtypes)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [462]:

# Count missing entries per column, keep only the columns that have any,
# and list them from most to fewest missing values
null_counts = X.isna().sum()
missing_values = null_counts[null_counts > 0].sort_values(ascending=False)
print(missing_values)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64


In [463]:
# Per-column dtype and distinct-value count, computed once on the training split
train_dtypes = X_train_full.dtypes
train_unique_counts = X_train_full.nunique()

# Categorical features: object-typed columns with at most 15 distinct values
categorical_cols = [
    name for name in X_train_full.columns
    if train_dtypes[name] == 'object' and train_unique_counts[name] <= 15
]

# Numeric features: columns stored as int64 or float64
numeric_cols = [
    name for name in X_train_full.columns
    if train_dtypes[name] in ['int64', 'float64']
]

# Restrict every split to the selected columns (categorical first, then numeric);
# copies are taken so later changes cannot leak back into the *_full frames
my_columns = categorical_cols + numeric_cols
X_train = X_train_full.loc[:, my_columns].copy()
X_valid = X_valid_full.loc[:, my_columns].copy()
X_test = X_test_full.loc[:, my_columns].copy()

In [464]:
# Numeric columns: fill missing entries with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing entries with the literal 'NA', then one-hot
# encode; handle_unknown='ignore' avoids errors on categories unseen during fit
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
column_routes = [
    ('num', numerical_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [513]:
# Baseline estimator: XGBoost with default hyperparameters, quiet logging, fixed seed
model = XGBRegressor(random_state=0, verbosity=0)

# Bundle preprocessing and the estimator so both are fitted and applied together
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the training split, then predict the held-out validation split
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_valid)

# Report the mean absolute error on the validation split
validation_mae = mean_absolute_error(y_valid, preds)
print('MAE:', validation_mae)

MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.38955479452
MAE: 17158.

In [514]:
# Shuffled K-fold splitter (default number of splits) with a fixed seed
kfold = KFold(shuffle=True, random_state=42)

# Cross-validate the whole pipeline on the training split, scoring each fold
# with negative mean absolute error
scores = cross_validate(
    my_pipeline,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=kfold,
)

# The scorer returns negated MAE, so flip the sign before averaging
fold_mae = -scores['test_score']
print('Average MAE score:', fold_mae.mean())

Average MAE score: 18553.972603175098


In [469]:
# Final estimator with hand-picked hyperparameters
final_model = XGBRegressor(
    n_estimators=600,
    max_depth=5,
    min_child_weight=0.0001,
    learning_rate=0.1,
    verbosity=0,
    random_state=0,
)

# Reuse the same preprocessing in front of the final estimator
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('final_model', final_model),
])

# NOTE(review): this fits on X_train / y_train only (the 80% split); consider
# refitting on all labelled rows before producing the submission.
final_pipeline.fit(X_train, y_train)

# Predict sale prices for the test set
final_prediction = final_pipeline.predict(X_test)

In [470]:
# Pair each test-set Id with its predicted SalePrice and write them to disk;
# index=False keeps the frame's own row index out of the file
submission_columns = {
    'Id': X_test.index,
    'SalePrice': final_prediction,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)