In [1]:
import numpy as np
import pandas as pd
from math import sqrt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn_pandas import CategoricalImputer

XGBoostLibraryNotFound: Cannot find XGBoost Library in the candidate path, did you install compilers and run build.sh in root path?
List of candidates:
C:\Users\David\anaconda3\envs\kaggle\Library\mingw-w64\bin\xgboost.dll
C:\Users\David\anaconda3\envs\kaggle\lib\site-packages\xgboost\../../windows/x64/Release/xgboost.dll
C:\Users\David\anaconda3\envs\kaggle\lib\site-packages\xgboost\./windows/x64/Release/xgboost.dll

In [2]:
# Root of the data tree, relative to this notebook; every read/write below is built from it.
data_dir = "../data"

In [3]:
# Load the interim training frame, then show its shape and the first two rows.
df = pd.read_parquet(
    f"{data_dir}/interim/train.parq",
    engine='pyarrow',
)
display(df.shape)
df.head(n=2)

(1460, 69)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,...,ScreenPorch,MoSold,YrSold,SaleType,SaleCondition,SalePrice,total_rooms,total_area_house,total_area_miscellaneous,total_area
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65,8450,Reg,Lvl,Inside,Gtl,CollgCr,Norm,...,0,2,2008,WD,Normal,208500,11.5,2566.0,2319.0,4885.0
2,20,RL,80,9600,Reg,Lvl,FR2,Gtl,Veenker,Feedr,...,0,5,2007,WD,Normal,181500,8.5,2524.0,2020.0,4544.0


In [4]:
# Columns that get cast to pandas 'category' dtype further down.
categorical_cols = [
    'SaleCondition',
    'MSSubClass',
    'BsmtFinType1',
    'MSZoning',
    'YrSold',
    'LotConfig',
    'SaleType',
    'Condition1',
    'ExterQual',
    'ExterCond',
    'HeatingQC',
    'HouseStyle',
    'GarageType',
    'Neighborhood',
    'BsmtFinType2',
    'LotShape',
    'Exterior2nd',
    'GarageFinish',
    'Fireplaces',
    'YearRemodAdd',
    'BsmtQual',
    'KitchenQual',
    'YearBuilt',
    'LandContour',
    'BsmtExposure',
    'RoofStyle',
    'Exterior1st',
    'Foundation',
    'MasVnrType',
    'BldgType',
    'LandSlope',
    'BsmtCond',
    'CentralAir',
    'Electrical',
    'Functional',
    'GarageYrBlt',
    'GarageQual',
    'GarageCond',
    'PavedDrive',
]

# NOTE(review): this list includes the target 'SalePrice', which a later cell
# pops off the feature frame, and nothing visible in this notebook reads the
# list -- confirm whether it is still needed.
numeric_cols = [
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'OpenPorchSF',
    'WoodDeckSF',
    'OverallQual',
    '1stFlrSF',
    '2ndFlrSF',
    'LotArea',
    'OverallCond',
    'BsmtFinSF2',
    'TotalBsmtSF',
    'BsmtFinSF1',
    'MasVnrArea',
    'GrLivArea',
    'BsmtUnfSF',
    'SalePrice',
    'EnclosedPorch',
    'GarageArea',
    'MoSold',
    'LotFrontage',
    'BsmtFullBath',
    'BsmtHalfBath',
    'TotRmsAbvGrd',
    'GarageCars',
    'ScreenPorch',
    'total_rooms',
    'total_area_house',
    'total_area_miscellaneous',
]

In [5]:
# Give every listed column the pandas 'category' dtype so select_dtypes('category') picks it up later.
df = df.astype({column: 'category' for column in categorical_cols})

In [6]:
# Separate the target from the features without touching `df` itself.
y = df['SalePrice'].copy()
X = df.drop(columns='SalePrice')

In [7]:
# Derive the two feature groups from the dtypes actually present in X
# (the target has already been removed from X at this point).
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='category').columns

In [8]:
# 80/20 hold-out split; the fixed seed keeps the partition identical across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Categorical branch: fill missing values with each column's most frequent
# level, then one-hot encode. handle_unknown='ignore' lets the encoder accept
# levels at predict time that were not present when it was fitted.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [10]:
# Numeric branch: impute, then standardise. The imputer is left at its default
# strategy here; the grid search below overrides it through the
# 'preprocessor__num__imputer__strategy' parameter.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
    ]
)

In [11]:
# Route each feature group through its own branch. The branch names 'num' and
# 'cat' are what the grid-search parameter keys refer to.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

In [None]:
# Single-model search: the final 'clf' slot is a placeholder that each grid
# below fills with a concrete regressor.
# NOTE(review): the following cell reassigns both `pipeline` and `parameters`,
# so this cell only has an effect if it is run on its own.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # 0.95 * (1 - 0.95) is the variance of a 0/1 column that takes one value
    # 95% of the time; it is used here as the VarianceThreshold cutoff.
    ('variance_drop', VarianceThreshold(threshold=(0.95 * (1 - 0.95)))),
    ('clf', 'passthrough'),
])
parameters = [
    # Random forest candidates.
    {
        'preprocessor__num__imputer__strategy': ['mean', 'median'],
        # Seeded so repeated searches give the same scores, matching the
        # seeded XGBRegressor below.
        'clf': [RandomForestRegressor(random_state=0)],
        'clf__n_estimators': range(5, 510, 100),
        'clf__max_depth': [2, 4],
    },
    # Gradient-boosted tree candidates.
    {
        'preprocessor__num__imputer__strategy': ['mean', 'median'],
        'clf': [XGBRegressor(random_state=0)],
        'clf__n_estimators': range(200, 300, 20),
        'clf__max_depth': range(2, 6, 1),
        'clf__learning_rate': np.arange(0.04, 0.1, 0.005),
    },
]
# Recorded result of an earlier run:
# best = {clf: XGBRegressor, n_estimators: 900, max_depth: 2, learning_rate: 0.09}
# NOTE(review): n_estimators=900 is outside the XGB range searched above
# (200-280), so this note presumably predates the current grid -- confirm.

In [12]:
# Ensemble search: the 'voting' slot starts as a placeholder and is filled by
# the grid with a VotingRegressor that combines a random forest and XGBoost.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('variance_drop', VarianceThreshold(threshold=(0.95 * (1 - 0.95)))),
        ('voting', 'passthrough'),
    ]
)

# One grid; keys use scikit-learn's '<step>__<param>' addressing, so
# 'voting__rf__*' and 'voting__xgb__*' tune the two ensemble members.
parameters = [
    dict(
        preprocessor__num__imputer__strategy=['mean', 'median'],
        voting=[
            VotingRegressor([
                ('rf', RandomForestRegressor(random_state=0)),
                ('xgb', XGBRegressor(random_state=0)),
            ])
        ],
        voting__rf__n_estimators=range(5, 510, 100),
        voting__rf__max_depth=[2, 4],
        voting__xgb__n_estimators=range(200, 400, 50),
        voting__xgb__max_depth=range(2, 5, 1),
        voting__xgb__learning_rate=np.arange(0.04, 0.1, 0.01),
    )
]

In [13]:
# Total model fits the search will perform: candidates x 5 CV folds
# (the 5 mirrors cv=5 passed to GridSearchCV below).
len(ParameterGrid(parameters)) * 5

8640

In [14]:
def rmsle(y_true, y_pred):
    """Root mean squared error between log(y_true) and log(y_pred).

    Predictions are floored at the smallest positive normal float before the
    log is taken, so a zero or negative prediction yields a very large but
    finite error instead of NaN/-inf. Positive predictions pass through
    unchanged apart from being cast to float64.

    Parameters
    ----------
    y_true : array-like of observed targets (expected to be positive).
    y_pred : array-like of predicted targets.

    Returns
    -------
    float
        The RMSE computed on the log scale.
    """
    floor = np.finfo(float).tiny
    safe_pred = np.maximum(np.asarray(y_pred, dtype=float), floor)
    return sqrt(mean_squared_error(np.log(y_true), np.log(safe_pred)))


# greater_is_better=False tells scikit-learn this is a loss: the scorer
# reports the negated value so that "higher is better" ranking still selects
# the candidate with the lowest error.
custom_scoring = make_scorer(
    score_func=rmsle,
    greater_is_better=False,
)

In [15]:
# Exhaustive 5-fold search over `parameters`, scored with the log-scale RMSE
# scorer and parallelised across all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring=custom_scoring,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

In [16]:
# Run the search on the training split only. The recorded output of this cell
# shows 8640 fits taking about 146 minutes on 4 workers.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 20.2min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 29.0min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 39.1min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 62.1min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 76.4min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 90.7min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 105.3min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 138.7min
[Parallel(n_jobs=-1)]: Done 8640 out of 8640 | elapsed: 146.1min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'Bsm...
                                                                               tree_method=None,
                                               

In [17]:
# Hold-out check: predict the validation split with the refit best pipeline
# (GridSearchCV.predict delegates to best_estimator_) and report the RMSE of
# the log prices.
preds = grid_search.predict(X_valid)
sqrt(mean_squared_error(y_true=np.log(y_valid), y_pred=np.log(preds)))

0.13355019382485286

In [18]:
# Mean cross-validated score of the winning candidate. The scorer was built
# with greater_is_better=False, so abs() is applied to show it as an error.
# NOTE(review): the trailing number differs from this cell's recorded output
# (0.1440...), so it is presumably the result of an earlier run -- confirm.
abs(grid_search.best_score_) # 0.1309148235996779

0.14408640146128998

In [19]:
# Show the hyper-parameter combination that achieved the best CV score.
grid_search.best_params_

{'preprocessor__num__imputer__strategy': 'median',
 'voting': VotingRegressor(estimators=[('rf',
                              RandomForestRegressor(max_depth=4,
                                                    n_estimators=205,
                                                    random_state=0)),
                             ('xgb',
                              XGBRegressor(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=0.08000000000000002,
                                           max_delta_step=None, max_depth=3,
                                           min_child_weight=None, mi

In [None]:
# List, per ensemble member, the features that received zero importance.
#
# The fitted pipeline ends in a 'voting' step, not 'clf', and a VotingRegressor
# exposes no feature_importances_ of its own, so each fitted member is
# inspected instead. Feature names are rebuilt from the fitted preprocessing
# steps because the model sees one-hot-encoded, variance-filtered columns,
# not the raw `df.columns`.
best_pipeline = grid_search.best_estimator_
fitted_onehot = (
    best_pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
)
# scikit-learn >= 1.0 calls this get_feature_names_out; older releases only
# provide get_feature_names.
onehot_names = (
    getattr(fitted_onehot, 'get_feature_names_out', None)
    or fitted_onehot.get_feature_names
)
# The ColumnTransformer emits the 'num' block first, then the 'cat' block.
# NOTE(review): assumes the numeric imputer dropped no column (it would only
# do so for an all-missing column); a length mismatch raises IndexError below.
encoded_names = np.array(
    list(numeric_features) + list(onehot_names(list(categorical_features))),
    dtype=object,
)
# Keep only the columns that survived the VarianceThreshold step.
kept_names = encoded_names[best_pipeline.named_steps['variance_drop'].get_support()]

fitted_members = best_pipeline.named_steps['voting'].named_estimators_
for member_name, member in fitted_members.items():
    for name, importance in zip(kept_names, member.feature_importances_):
        if importance <= 0:
            print(f"{member_name}: {name}")

In [None]:
# Plot importances for the fitted XGBoost member of the voting ensemble.
# The pipeline's last step is 'voting' (there is no 'clf' step), and
# plot_importance needs the XGBoost model itself, not the ensemble wrapper.
# NOTE(review): the model was fitted on an unnamed matrix, so the axis labels
# will presumably be positional (f0, f1, ...) rather than column names.
plot_importance(
    grid_search.best_estimator_.named_steps['voting'].named_estimators_['xgb']
);

### Submit

In [23]:
# Load the interim test frame (no SalePrice column), then show its shape and first two rows.
X_test = pd.read_parquet(
    f"{data_dir}/interim/test.parq",
    engine='pyarrow',
)
display(X_test.shape)
X_test.head(n=2)

(1459, 68)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,...,EnclosedPorch,ScreenPorch,MoSold,YrSold,SaleType,SaleCondition,total_rooms,total_area_house,total_area_miscellaneous,total_area
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80,11622,Reg,Lvl,Inside,Gtl,NAmes,Feedr,...,0,120,6,2010,WD,Normal,6.0,1778.0,1886.0,3664.0
1462,20,RL,81,14267,IR1,Lvl,Corner,Gtl,NAmes,Norm,...,0,0,6,2010,WD,Normal,7.5,2658.0,2070.0,4728.0


In [42]:
# Apply the same 'category' cast that was applied to the training frame so the
# preprocessor sees matching dtypes.
X_test = X_test.astype({column: 'category' for column in categorical_cols})

In [44]:
# Score the test set with the refit best pipeline (GridSearchCV.predict
# delegates to best_estimator_).
preds_test = grid_search.predict(X_test)

In [45]:
# Build the two-column submission table (Id taken from the test frame's index)
# and write it without the positional row index.
submission_columns = {
    'Id': X_test.index,
    'SalePrice': preds_test,
}
output = pd.DataFrame(submission_columns)
output.to_csv(
    f"{data_dir}/processed/submission.csv",
    index=False,
)