In this repo, I will use Feature Engineer techniques, I have learned in Feature Engineering practices, to improve predictive power for House Prices problems

### Set up notebook environment

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

#set Matplotlib defaults
plt.style.use('seaborn-v0_8-whitegrid')
plt.rc('figure', autolayout=True)
plt.rc(
    "axes",
    labelweight='bold',
    labelsize='large',
    titleweight='bold',
    titlesize=14,
    titlepad=10,)

In [5]:
pd.set_option('display.max_columns', None) #showing all columns
pd.set_option("display.max_rows", None) #show all rows

In [6]:
#loading description file
discription_file = "./house-prices-advanced-regression-techniques/data_description.txt"
with open(discription_file) as f:
    contents = f.read()
    print(contents)

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM

#### loading data

In [7]:
df_pretrain = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv", index_col="Id")
df_pretest = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv", index_col="Id")
df = pd.concat([df_pretrain, df_pretest])
print(df.shape)
df.head(5)

(2920, 80)


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


#### preprocessing data

In [8]:
#Examine the features
#the loop will go through each features and it's values
#enter space to keep go through each features, enter 'break' to exit
print(f"There are {df.shape[1]} columns")
for col in df.columns:
    enter = input()
    if enter == "break": #exit loop
        break
    else:
        print(f"{col} - {df[col].dtype}:", df[col].unique())

There are 80 columns

MSSubClass - int64: [ 60  20  70  50 190  45  90 120  30  85  80 160  75 180  40]

MSZoning - object: ['RL' 'RM' 'C (all)' 'FV' 'RH']
break


Investting every columns, we see there is typos in Exterior2nd mismatching with Exterior1st, and wrong assigned dtype - MSSubClass should be object not interger type, and those missing values (nan value) in some categorical columns, is actually absence, likes house with no basements, no pool, no garage, we will treat them right for better performance, for those in continous and discrete columns we need to impute

In [14]:
def cleaning_df(df): #function to clean our df
    df['Exterior2nd'] = df['Exterior2nd'].replace({"Brk Cmn" : "BrkComn"})
    #Some values of GarageYrBlt are corrupt, so we'll replace them with the year the house was built
    df['GarageYrBlt'] = df['GarageYrBlt'].where(df['GarageYrBlt'] <= 2010, df.YearBuilt)
    
    
    df.rename(columns={
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    }, inplace=True,
    )
    return df

In [19]:

# The nominative (unordered) categorical features
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", 
                "LotConfig", "Neighborhood", "Condition1", "Condition2", 
                "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", 
                "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation",
                "Heating", "CentralAir", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]

# assigned levels, use list in python, they ordered
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(10))

#mapping, use dictionary which is ordered of collections of lists
ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

#add None level for missing values, cause they are actually house without those features
ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}


def encoding_df(df):
    from pandas.api.types import CategoricalDtype #use pandas calls the categories levels ordered
    # Nominal categories
    for name in features_nom:
        df[name] = df[name].astype('category')
        # Add a None category for missing values
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    #Ordinal categories
    for col_name, levels in ordered_levels.items():
        df[col_name] = df[col_name].astype(CategoricalDtype(levels, ordered=True))
    return df

In [20]:
def imputing_df(df):
    #We'll impute 0 for missing numeric values and "None" for missing categorical values
    for name in df.select_dtypes("number"):
        df[name] = df[name].fillna(0)
    for name in df.select_dtypes("category"):
        df[name] = df[name].fillna("None")
    return df

In [21]:
def preprocessing_df(df, df_pretrain, df_pretest):
    df = cleaning_df(df)
    df = encoding_df(df)
    df = imputing_df(df)
    # Reform splits
    df_train = df.loc[df_pretrain.index, :]
    df_test = df.loc[df_pretest.index, :]
    return df_train, df_test

In [22]:
#loading data
df_train, df_test = preprocessing_df(df, df_pretrain, df_pretest)

In [23]:
display(df_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2920 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   MSSubClass        2920 non-null   category
 1   MSZoning          2920 non-null   category
 2   LotFrontage       2920 non-null   float64 
 3   LotArea           2920 non-null   int64   
 4   Street            2920 non-null   category
 5   Alley             2920 non-null   category
 6   LotShape          2920 non-null   category
 7   LandContour       2920 non-null   category
 8   Utilities         2920 non-null   category
 9   LotConfig         2920 non-null   category
 10  LandSlope         2920 non-null   category
 11  Neighborhood      2920 non-null   category
 12  Condition1        2920 non-null   category
 13  Condition2        2920 non-null   category
 14  BldgType          2920 non-null   category
 15  HouseStyle        2920 non-null   category
 16  OverallQual       2920 n

None

### Baseline Model
Baseline model will be used to compared with our later model with new features

In [24]:
# import libraries
from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

In [25]:
# We can reuse this scoring function anytime we want to try out a new feature set.
def score_dataset(X, y, model=XGBRegressor()): #function compute the RMSLE scores for our models
    # Label encoding for categoricals
    #
    # Label encoding is good for XGBoost and RandomForest, but one-hot
    # would be better for models like Lasso or Ridge. The `cat.codes`
    # attribute holds the category levels.
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

In [26]:
X = df_train.copy()
y = X.pop("SalePrice")
# This baseline score helps us to know 
# whether some set of features we've assembled has actually led to any improvement or not
baseline_score = score_dataset(X, y)

print(f"Baseline score: {baseline_score:.5f} RMSLE")

Baseline score: 0.14086 RMSLE


### Feature Utility Scores

Use mutual information to compute a utility score for a feature, giving us an indication of how much potential the feature has


In [27]:
def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [28]:
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

In [29]:
#get mi score for our features
X = df_train.copy()
y = X.pop("SalePrice")

mi_scores = make_mi_scores(X, y)
mi_scores

LotArea             4.828130
GrLivArea           3.399592
TotalBsmtSF         2.783034
BsmtUnfSF           2.727304
FirstFlrSF          2.702582
BsmtFinSF1          2.443508
GarageArea          1.981941
SecondFlrSF         1.745278
WoodDeckSF          1.134494
YearBuilt           1.112709
OpenPorchSF         1.012888
Neighborhood        1.007957
GarageYrBlt         0.939186
YearRemodAdd        0.919880
LotFrontage         0.834882
OverallQual         0.783158
BsmtFinSF2          0.611578
MSSubClass          0.595348
TotRmsAbvGrd        0.563421
MasVnrArea          0.543058
GarageCars          0.502129
Exterior2nd         0.491375
MoSold              0.482945
Exterior1st         0.469554
EnclosedPorch       0.466962
BsmtFinType1        0.466359
BsmtQual            0.458979
KitchenQual         0.455592
GarageFinish        0.451102
FireplaceQu         0.423499
ExterQual           0.416505
Foundation          0.376681
FullBath            0.367266
GarageType          0.366470
YrSold        