The "Housing Prices" competition is a regression problem: given a set of features describing a house, we predict its sale price.

Submissions are evaluated on the Root Mean Squared Error (RMSE) between the logarithm of the predicted price and the logarithm of the observed sale price.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # graph

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Remember the working directory, then print the full path of every file
# found beneath it. (On Kaggle the data would live under '/kaggle/input'.)
wdir = os.getcwd()

for folder, _, names_in_folder in os.walk(wdir):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))

D:\Projects\Kaggle-Projects\Competitions\Housing Prices\housing-prices_categorical-variables.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\housing-prices_intermediate-ML.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\housing-prices_missing-values.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\housing-prices_pipelines-cross-validation-XGboost.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\submission.csv
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\.ipynb_checkpoints\housing-prices_categorical-variables-checkpoint.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\.ipynb_checkpoints\housing-prices_intermediate-ML-checkpoint.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\.ipynb_checkpoints\housing-prices_missing-values-checkpoint.ipynb
D:\Projects\Kaggle-Projects\Competitions\Housing Prices\.ipynb_checkpoints\housing-prices_pipelines-cross-validation-XGboost-checkpoint.ipynb
D:\Projects\Kaggl

# Step 1: Gather the Data

### Checking the format of the data

In [2]:
# Load both competition CSVs, using the "Id" column as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in ("input/train.csv", "input/test.csv")
)

# train_data.head()
# test_data.head()

# Step 2: Prepare the Data

In [3]:
# The SalePrice column is the prediction target;
# every other column of the training frame is a predictor.
y = train_data['SalePrice']
X_full = train_data.drop(columns=['SalePrice'])

In [4]:
# Simplest way to handle missing values: discard every column that has one.
na_flags = X_full.isnull().any(axis=0)
cols_with_na = [name for name, has_na in na_flags.items() if has_na]
X_reduced = X_full.drop(columns=cols_with_na)

print(cols_with_na)
print("X_reduced.shape = ", X_reduced.shape)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
def has_low_cardinality(df, col, max_cardinality=10):
    """Return True if ``df[col]`` is a text column with few distinct values.

    A column counts as categorical when its dtype is ``object`` or one of
    the pandas string dtypes. Checking only for ``object`` would silently
    skip string-typed columns, which some pandas versions/configurations
    produce when reading text data.

    Args:
        df: DataFrame containing the column.
        col: Label of the column to inspect.
        max_cardinality: Exclusive upper bound on the number of unique
            (non-missing) values. Defaults to 10, the original threshold.

    Returns:
        bool: True when the column is categorical and has fewer than
        ``max_cardinality`` unique values, False otherwise.
    """
    dtype = df[col].dtype
    is_categorical = (pd.api.types.is_object_dtype(dtype)
                      or pd.api.types.is_string_dtype(dtype))
    if not is_categorical:
        # Skip the unique count for columns that cannot qualify anyway.
        return False
    return bool(df[col].nunique() < max_cardinality)

# Categorical predictors: text columns with only a handful of distinct values.
categorical_cols = [
    name for name in X_reduced.columns
    if has_low_cardinality(X_reduced, name)
]
print(categorical_cols)

# Numerical predictors: columns stored as 64-bit integers or floats.
numerical_cols = [
    name for name, dtype in X_reduced.dtypes.items()
    if dtype in ('int64', 'float64')
]

# Restrict the feature matrix to the selected columns (categorical first);
# copy so later changes never touch X_reduced.
my_cols = [*categorical_cols, *numerical_cols]
X = X_reduced[my_cols].copy()

# Divide data into training and validation subsets
from sklearn.model_selection import train_test_split
# 80% of the rows go to training, 20% to validation; the fixed seed keeps
# the split reproducible between runs.
split_parts = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
X_reduced.shape =  (1460, 60)
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']


## Defining the Preprocessing Pipeline

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing values with a constant
# (per the scikit-learn docs the default fill value for numeric data is 0).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fitting are ignored at
# transform time instead of raising an error.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

## Select the model

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Candidate 1: random forest with 100 trees and a fixed seed.
model_rfr = RandomForestRegressor(random_state=0, n_estimators=100)

from xgboost import XGBRegressor

# Candidate 2: gradient-boosted trees. The large tree budget combined with
# a small learning rate is meant to be cut short by early stopping at fit time.
xgb_settings = dict(n_estimators=1000, learning_rate=0.05, n_jobs=4)
model_xgb = XGBRegressor(**xgb_settings)

ModuleNotFoundError: No module named 'xgboost'

## Create and Evaluate the Pipeline

In [None]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model_xgb)])

# The eval_set handed to XGBoost bypasses the pipeline's preprocessing step,
# so the validation features have to be transformed by hand. The preprocessor
# is fitted on the training rows only, so nothing leaks from validation.
X_valid_transformed = preprocessor.fit(X_train).transform(X_valid)

# Configure early stopping on the estimator rather than through fit():
# XGBoost deprecated the fit() keyword in 1.6 and recent releases reject it.
my_pipeline.set_params(model__early_stopping_rounds=5)

my_pipeline.fit(X_train, y_train,
                model__eval_set=[(X_valid_transformed, y_valid)],
                model__verbose=True)

# NOTE: the validation set also drives early stopping, so the score below
# is likely somewhat optimistic.
predictions = my_pipeline.predict(X_valid)
# scikit-learn's documented argument order is (y_true, y_pred).
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

In [None]:
from sklearn.model_selection import cross_val_score

# Fresh model with a fixed number of trees.
# NOTE(review): 123 presumably comes from the early-stopping run — confirm.
cv_model = XGBRegressor(n_estimators=123, learning_rate=0.05, n_jobs=4)
final_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('model', cv_model)])

# 3-fold cross-validation over the full training data, 4 parallel jobs.
neg_mae_per_fold = cross_val_score(final_pipeline, X, y,
                                   cv=3,
                                   n_jobs=4,
                                   scoring='neg_mean_absolute_error',
                                   verbose=True)

# sklearn reports *negative* MAE (higher is better); flip the sign back.
scores = -1 * neg_mae_per_fold

print("MAE scores:\n", scores)