# Housing Prices Validation Test

## Import Libraries

In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Import Dataset

In [59]:
df = pd.read_csv("data/train.csv")

In [60]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Split Training and Test Set

In [109]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [110]:
train_set.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
254,255,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,145000
1066,1067,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2009,WD,Normal,178000
638,639,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,5,2008,WD,Normal,85000
799,800,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2007,WD,Normal,175000
380,381,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,127000


In [82]:
test_set.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
892,893,20,RL,70.0,8414,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2006,WD,Normal,154500
1105,1106,60,RL,98.0,12256,Pave,,IR1,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,325000
413,414,30,RM,56.0,8960,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,3,2010,WD,Normal,115000
522,523,50,RM,50.0,5000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,10,2006,WD,Normal,159000
1036,1037,20,RL,89.0,12898,Pave,,IR1,HLS,AllPub,...,0,,,,0,9,2009,WD,Normal,315500


## Clean Dataset and Preprocessing

### Split Numerical and Categorical Variables

In [111]:
num_feat = train_set.select_dtypes(include=['float', 'int'])
cat_feat = train_set.select_dtypes(include=['object'])

In [84]:
print(len(num_feat.columns) + len(cat_feat.columns))

81


### Impute Null Values and Drop Unecessary Columns

In [85]:
# Examine null values for numerical data
missing_num = num_feat.isnull().sum()
nan_num = missing_num[missing_num > 0]
nan_num

LotFrontage    217
MasVnrArea       6
GarageYrBlt     64
dtype: int64

In [86]:
# Examine null values for categorical data
missing_cat = cat_feat.isnull().sum()
nan_cat = missing_cat[missing_cat > 0]
nan_cat

Alley           1094
MasVnrType         6
BsmtQual          28
BsmtCond          28
BsmtExposure      28
BsmtFinType1      28
BsmtFinType2      28
Electrical         1
FireplaceQu      547
GarageType        64
GarageFinish      64
GarageQual        64
GarageCond        64
PoolQC          1162
Fence            935
MiscFeature     1122
dtype: int64

In [87]:
def feat_null_pct(num):
    missing = num.isnull().sum()
    nans = missing[missing > 0].sort_values(ascending=False)
    for col in nans.index:
        print(f"{col} Null Values: {round(100 - train_set[col].value_counts().sum()/len(train_set) * 100, 2)}% of the dataset")

feat_null_pct(num_feat)

LotFrontage Null Values: 18.58% of the dataset
GarageYrBlt Null Values: 5.48% of the dataset
MasVnrArea Null Values: 0.51% of the dataset


In [88]:
feat_null_pct(cat_feat)

PoolQC Null Values: 99.49% of the dataset
MiscFeature Null Values: 96.06% of the dataset
Alley Null Values: 93.66% of the dataset
Fence Null Values: 80.05% of the dataset
FireplaceQu Null Values: 46.83% of the dataset
GarageType Null Values: 5.48% of the dataset
GarageFinish Null Values: 5.48% of the dataset
GarageQual Null Values: 5.48% of the dataset
GarageCond Null Values: 5.48% of the dataset
BsmtQual Null Values: 2.4% of the dataset
BsmtCond Null Values: 2.4% of the dataset
BsmtExposure Null Values: 2.4% of the dataset
BsmtFinType1 Null Values: 2.4% of the dataset
BsmtFinType2 Null Values: 2.4% of the dataset
MasVnrType Null Values: 0.51% of the dataset
Electrical Null Values: 0.09% of the dataset


In [112]:
# Drop the columns with large proportions of Null Values and Unique Id
train_set.drop(['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1, inplace=True)
test_set.drop(['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1, inplace=True)

# Update numerical and categorical variable split
num_feat = train_set.select_dtypes(include=['int', 'float'])
cat_feat = train_set.select_dtypes(include=['object'])

num_feat_test = test_set.select_dtypes(include=['int', 'float'])
cat_feat_test = test_set.select_dtypes(include=['object'])

In [113]:
num_feat.shape

(1168, 37)

In [114]:
cat_feat.shape

(1168, 39)

In [115]:
from sklearn.impute import SimpleImputer

# Impute Numerical Columns
imputer_num = SimpleImputer(missing_values=np.nan, strategy='mean')
num_feat = pd.DataFrame(imputer_num.fit_transform(num_feat), columns=num_feat.columns)

imputer_num_test = SimpleImputer(missing_values=np.nan, strategy='mean')
num_feat_test = pd.DataFrame(imputer_num.fit_transform(num_feat_test), columns=num_feat_test.columns)

In [116]:
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_feat = pd.DataFrame(cat_imputer.fit_transform(cat_feat), columns=cat_feat.columns)

cat_imputer_test = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_feat_test = pd.DataFrame(cat_imputer.fit_transform(cat_feat_test), columns=cat_feat_test.columns)

In [117]:
print(f"Null Numerical Data: {num_feat.isnull().any().sum() + num_feat_test.isnull().any().sum()}")
print(f"Null Categorical Data: {cat_feat.isnull().any().sum() + cat_feat_test.isnull().any().sum()}")

Null Numerical Data: 0
Null Categorical Data: 0


### Encoding Categorical Data

In [118]:
# Concatenating the training and test categorical features to avoid mismatching the columns
cat_concat = pd.concat([cat_feat, cat_feat_test], axis=0)

In [119]:
print(cat_feat.shape, cat_feat_test.shape, cat_concat.shape)

(1168, 39) (292, 39) (1460, 39)


In [120]:
cat_concat.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

In [128]:
# Encode categorical columns while avoiding dummy variable trap
def get_dummy_no_trap(cat_data):
    copy_cat = cat_data.copy()
    cat_out = cat_data
    first = True
    for col in copy_cat.columns:
        temp_cat = pd.get_dummies(copy_cat[col], drop_first=True)
        copy_cat.drop([col], axis=1, inplace=True)
        if first:
            cat_out = temp_cat.copy()
            first = False
        else:
            cat_out = pd.concat([cat_out, temp_cat], axis=1)
    cat_out = pd.concat([copy_cat, cat_out], axis=1)
    
    return cat_out

In [130]:
cat_copy = cat_concat.copy()

In [132]:
encoded_cat = get_dummy_no_trap(cat_copy)

In [133]:
encoded_cat.shape

(1460, 200)

In [134]:
encoded_cat.head()

Unnamed: 0,FV,RH,RL,RM,Pave,IR2,IR3,Reg,HLS,Low,...,ConLI,ConLw,New,Oth,WD,AdjLand,Alloca,Family,Normal,Partial
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


In [139]:
print(encoded_cat.columns)

Index(['FV', 'RH', 'RL', 'RM', 'Pave', 'IR2', 'IR3', 'Reg', 'HLS', 'Low',
       ...
       'ConLI', 'ConLw', 'New', 'Oth', 'WD', 'AdjLand', 'Alloca', 'Family',
       'Normal', 'Partial'],
      dtype='object', length=200)


In [168]:
# Resplit the rows
cat_feat = encoded_cat.iloc[:1168]
cat_feat_test = encoded_cat.iloc[1168:]

In [169]:
cat_feat

Unnamed: 0,FV,RH,RL,RM,Pave,IR2,IR3,Reg,HLS,Low,...,ConLI,ConLw,New,Oth,WD,AdjLand,Alloca,Family,Normal,Partial
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1164,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1165,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1166,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


In [170]:
cat_feat_test

Unnamed: 0,FV,RH,RL,RM,Pave,IR2,IR3,Reg,HLS,Low,...,ConLI,ConLw,New,Oth,WD,AdjLand,Alloca,Family,Normal,Partial
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0,0,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
288,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
289,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
290,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0


## Finalize the Dataset

In [172]:
# Recombine training set
train_set = pd.concat([num_feat, cat_feat], axis=1)
train_set = train_set.loc[:, ~train_set.columns.duplicated()]

# Recombine test set
test_set = pd.concat([num_feat_test, cat_feat_test], axis=1)
test_set = test_set.loc[:, ~test_set.columns.duplicated()]

In [173]:
train_set.shape

(1168, 177)

In [174]:
test_set.shape

(292, 177)

## Implement XGBoost Regressor

In [175]:
X_train = train_set.drop(['SalePrice'], axis=1)
y_train = train_set['SalePrice']
X_test = test_set.drop(['SalePrice'], axis=1)
y_test = test_set['SalePrice']

In [176]:
X_train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,ConLI,ConLw,New,Oth,WD,AdjLand,Alloca,Family,Normal,Partial
0,20.0,70.0,8400.0,5.0,6.0,1957.0,1957.0,0.0,922.0,0.0,...,0,0,0,0,1,0,0,0,1,0
1,60.0,59.0,7837.0,6.0,7.0,1993.0,1994.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,1,0
2,30.0,67.0,8777.0,5.0,7.0,1910.0,1950.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,1,0
3,50.0,60.0,7200.0,5.0,7.0,1937.0,1950.0,252.0,569.0,0.0,...,0,0,0,0,1,0,0,0,1,0
4,50.0,50.0,5000.0,5.0,6.0,1924.0,1950.0,0.0,218.0,0.0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,20.0,78.0,9317.0,6.0,5.0,2006.0,2006.0,0.0,24.0,0.0,...,0,0,0,0,1,0,0,0,1,0
1164,50.0,65.0,7804.0,4.0,3.0,1928.0,1950.0,0.0,622.0,0.0,...,0,0,0,0,1,0,0,0,1,0
1165,20.0,60.0,8172.0,5.0,7.0,1955.0,1990.0,0.0,167.0,0.0,...,0,0,0,0,1,0,0,0,1,0
1166,50.0,55.0,7642.0,7.0,8.0,1918.0,1998.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,1,0


In [177]:
y_train

0       145000.0
1       178000.0
2        85000.0
3       175000.0
4       127000.0
          ...   
1163    176432.0
1164    135000.0
1165    115000.0
1166    189950.0
1167    174000.0
Name: SalePrice, Length: 1168, dtype: float64

In [178]:
from xgboost import XGBRegressor
xgb_reg = XGBRegressor()

n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]
base_score=[0.25,0.5,0.75,1]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

In [179]:
from sklearn.model_selection import RandomizedSearchCV

# Set up the random search with 5-fold cross validation
random_cv = RandomizedSearchCV(estimator=xgb_reg,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [180]:
random_cv.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [186]:
best_model = random_cv.best_estimator_
y_pred = best_model.predict(X_test)

## Evaluate Model

In [189]:
from sklearn.metrics import mean_squared_error

final_mse = mean_squared_error(y_test, y_pred)
final_rmse = np.sqrt(final_mse)

print(final_rmse)

30406.876432047124
