In [1]:
import numpy  as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [14]:
def score_dataset(X, y, model=XGBRegressor()):
    """Return the cross-validated RMSLE of ``model`` on ``(X, y)``.

    Columns of dtype ``category`` or ``object`` are label-encoded with
    ``factorize`` before scoring. The encoding is applied to a copy, so the
    DataFrame passed in by the caller is left untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; may contain categorical/object columns.
    y : array-like
        Target values, passed straight to ``cross_val_score``.
    model : estimator, optional
        Estimator handed to ``cross_val_score``. The default instance is
        created once, when this function is defined.

    Returns
    -------
    float
        Square root of the mean (over 5 folds) mean squared log error.
    """
    # Encode on a copy: assigning into the argument itself would overwrite the
    # caller's categorical columns with integer codes as a side effect.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error).
    # The scorer returns the *negated* MSLE, hence the sign flip before sqrt.
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

# Load the Ames housing data and split it into features (X) and target (y).
# `df` keeps every column, including SalePrice, for the feature cells below.
df = pd.read_csv('ames.csv')
X = df.drop(columns=["SalePrice"])
y = df["SalePrice"].copy()

In [4]:
# Mathematical-transform features, built in one shot from the raw columns.
porch_area_columns = [
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "Threeseasonporch",
    "ScreenPorch",
]
X_1 = pd.DataFrame({
    # Living area relative to lot size
    "LiveLotRatio": df["GrLivArea"] / df["LotArea"],
    # Above-ground floor area per above-ground room
    "Spaciousness": (df["FirstFlrSF"] + df["SecondFlrSF"]) / df["TotRmsAbvGrd"],
    # Combined deck and porch area (left-to-right sum of the five columns)
    "TotalOutsideSF": (
        df[porch_area_columns[0]]
        + df[porch_area_columns[1]]
        + df[porch_area_columns[2]]
        + df[porch_area_columns[3]]
        + df[porch_area_columns[4]]
    ),
})

In [5]:
# Interaction feature: one-hot encode BldgType (columns prefixed "Bldg"),
# then scale each indicator row-wise by that house's GrLivArea.
X_2 = (
    pd.get_dummies(df["BldgType"], prefix="Bldg")
    .mul(df["GrLivArea"], axis=0)
)

In [7]:
# Count feature: how many of the outdoor-area columns are non-zero per house.
porch_columns = [
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "Threeseasonporch",
    "ScreenPorch",
]
has_porch_type = df[porch_columns] > 0
X_3 = pd.DataFrame({"PorchTypes": has_porch_type.sum(axis=1)})

In [8]:
# Inspect the distinct MSSubClass labels before splitting them on "_".
df["MSSubClass"].unique()

array(['One_Story_1946_and_Newer_All_Styles', 'Two_Story_1946_and_Newer',
       'One_Story_PUD_1946_and_Newer',
       'One_and_Half_Story_Finished_All_Ages', 'Split_Foyer',
       'Two_Story_PUD_1946_and_Newer', 'Split_or_Multilevel',
       'One_Story_1945_and_Older', 'Duplex_All_Styles_and_Ages',
       'Two_Family_conversion_All_Styles_and_Ages',
       'One_and_Half_Story_Unfinished_All_Ages',
       'Two_Story_1945_and_Older', 'Two_and_Half_Story_All_Ages',
       'One_Story_with_Finished_Attic_All_Ages',
       'PUD_Multilevel_Split_Level_Foyer',
       'One_and_Half_Story_PUD_All_Ages'], dtype=object)

In [9]:
# Break-down feature: keep only the text before the first "_" in MSSubClass.
subclass_parts = df["MSSubClass"].str.split("_", n=1, expand=True)
X_4 = pd.DataFrame({"MSClass": subclass_parts[0]})

In [12]:
# Group-transform feature: each house gets the median GrLivArea of its
# Neighborhood, aligned back to the original row index.
living_area_by_neighborhood = df.groupby("Neighborhood")["GrLivArea"]
X_5 = pd.DataFrame({
    "MedNhbdArea": living_area_by_neighborhood.transform("median"),
})

In [15]:
# Attach every engineered feature frame to the base features, then score.
engineered_frames = [X_1, X_2, X_3, X_4, X_5]
X_new = X.join(engineered_frames)
score_dataset(X_new, y)

0.13865658128932104