In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide display / plotting defaults.
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid", context="talk")

# Raw training table; kept untouched so later cells can refer back to it.
house_data_train = pd.read_csv('data/final.csv')

# TODO: Remove addition of _ord features from analysis.ipynb
# Working frame = raw table minus every column whose name contains "_ord".
# drop() returns a new frame, so house_data_train itself is not modified.
df = house_data_train.drop(
	columns=[col for col in house_data_train.columns if "_ord" in col]
)


In [None]:
# (rows, columns) of the working frame after the "_ord" columns were dropped.
df.shape

In [None]:
# Column names of the working frame (target "SalePrice" is still included here).
df.columns

In [None]:
# Repeated shape check; df has not been modified since the shape was last displayed.
df.shape

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns="SalePrice")
# Target in log1p space for ALL rows (the train/validation split happens later);
# predictions are mapped back with np.expm1 when the submission is built.
y_train_log = np.log1p(df["SalePrice"])

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split. The seed is fixed so that Restart & Run All reproduces the
# same validation rows; without it every run reshuffles the split and the reported
# validation RMSE values are not comparable between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y_train_log, test_size=0.2, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

from housing_pipeline import (
	Log1pFeatureImputer,
	LotFrontageNeighborhoodImputer,
	MeaningfullNAImputer,
	BooleanFeaturesImputer,
	SFImputer,
	GarageFeaturesImputer,
	BsmtBathImputer,
	MasVnrAreaImputer,
	HousingOrdinalEncoder,
	HousingNominalOneHotEncoder,
	QuadraticFeaturesImputer
)

# Columns handed to MeaningfullNAImputer below.
# NOTE(review): presumably a missing value in these columns means "feature absent"
# rather than "unknown" — confirm against housing_pipeline.
impute_NA_features = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature", "MasVnrType", "Electrical"]
# Ordinal columns mapped to their category levels; handed to HousingOrdinalEncoder below.
# The level lists read from worst/absent to best — presumably the encoder uses the
# list position as the encoded value (TODO confirm in housing_pipeline).
# NOTE(review): 'KitchenQual' lists an "NA" level although it is not part of
# impute_NA_features above — confirm that this is intended.
ord_cat_features = {
	'OverallQual': [1,2,3,4,5,6,7,8,9,10],
	'OverallCond': [1,2,3,4,5,6,7,8,9,10],
	'KitchenQual': ["NA","Po","Fa","TA","Gd","Ex"],
	'ExterQual': ["Po","Fa","TA","Gd","Ex"],
	'ExterCond': ["Po","Fa","TA","Gd","Ex"],
	'BsmtQual': ["NA","Po","Fa","TA","Gd","Ex"],
	'BsmtCond': ["NA","Po","Fa","TA","Gd","Ex"],
	'BsmtExposure': ["NA","No","Mn","Av","Gd"],
	'BsmtFinType1': ["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
	'BsmtFinType2': ["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
	'HeatingQC': ["Po","Fa","TA","Gd","Ex"],
	'FireplaceQu': ["NA","Po","Fa","TA","Gd","Ex"],
	'GarageQual': ["NA","Po","Fa","TA","Gd","Ex"],
	'GarageCond': ["NA","Po","Fa","TA","Gd","Ex"],
	'PoolQC': ["NA","Po", "Fa","TA","Gd","Ex"],
}
# Nominal (unordered) categorical columns; handed to HousingNominalOneHotEncoder below.
nom_cat_features = ["MSSubClass", "MSZoning", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "CentralAir", "Electrical", "Functional", "GarageType", "PavedDrive", "Fence", "SaleType", "SaleCondition", "Street", "GarageFinish", "MiscFeature"]

# Preprocessing chain shared by every model: project-specific imputers/encoders from
# housing_pipeline run in the order listed, followed by a RobustScaler. The step name
# "scaler" is what the model grids address as "prep__scaler".
custom_pipelines = Pipeline(
	steps=[
		("Log1pFeatureImputer", Log1pFeatureImputer(["LotArea", "GrLivArea"])),
		("LotFrontageNeighborhoodImputer", LotFrontageNeighborhoodImputer()),
		("MeaningfullNAImputer", MeaningfullNAImputer(impute_NA_features)),
		("BooleanFeaturesImputer", BooleanFeaturesImputer()),
		("TotalSFImputer", SFImputer()),
		("GarageFeaturesImputer", GarageFeaturesImputer()),
		("BsmtBathImputer", BsmtBathImputer()),
		("MasVnrAreaImputer", MasVnrAreaImputer()),
		("HousingOrdinalEncoder", HousingOrdinalEncoder(ord_cat_features)),
		("HousingNominalOneHotEncoder", HousingNominalOneHotEncoder(nom_cat_features)),
		("QuadraticFeaturesImputer", QuadraticFeaturesImputer()),
		("scaler", RobustScaler()),
	]
)



# Checking the dataset after preprocessing pipeline

In [None]:
def null_mask(X):
	"""Return a boolean mask, shaped like ``X``, that is True where a value is missing.

	pandas objects are answered with their own ``isna()`` (so the result stays a
	DataFrame/Series); anything else is converted to a NumPy array first. Numeric
	arrays are checked with ``np.isnan``; object / mixed arrays fall back to
	``pd.isna``, which also recognises ``None``.
	"""
	if not isinstance(X, (pd.DataFrame, pd.Series)):
		values = np.asarray(X)
		is_numeric = np.issubdtype(values.dtype, np.number)
		return np.isnan(values) if is_numeric else pd.isna(values)
	return X.isna()

def null_count(mask):
	"""Return the total number of True entries in ``mask`` as a plain Python int.

	Accepts a pandas DataFrame/Series or anything ``np.asarray`` can convert.
	"""
	is_pandas = isinstance(mask, (pd.DataFrame, pd.Series))
	flags = mask.to_numpy() if is_pandas else np.asarray(mask)
	return int(flags.sum())

# Separate copy of the feature frame for the step-by-step pipeline check below
# (presumably so that transformers mutating their input cannot alter X — confirm).
X_cur = X.copy()
def check_pipeline(X: pd.DataFrame, pipelines, is_fit=False):
	"""Run ``X`` through each step of ``pipelines`` and report missing values after every step.

	Parameters
	----------
	X : input feature frame; rebound locally to each step's output.
	pipelines : object exposing ``.steps`` as ``(name, transformer)`` pairs.
	is_fit : if True the steps are assumed to be fitted already and only
		``transform`` is called; otherwise each step is ``fit_transform``-ed,
		which fits the step objects in place.

	Prints one summary line per step (name, output shape, total null count) and,
	when the output is a DataFrame, up to ten columns that still contain nulls.
	Returns nothing.
	"""
	for name, step in pipelines.steps:
		apply_step = step.transform if is_fit else step.fit_transform
		X = apply_step(X)

		mask = null_mask(X)
		print(f"{name:>20} | shape={np.asarray(X).shape} | nulls={null_count(mask)}")

		# Per-column detail is only available while the output is still a DataFrame.
		if not isinstance(X, pd.DataFrame):
			continue
		nulls_by_column = X.isna().sum()
		bad_cols = nulls_by_column[nulls_by_column > 0].sort_values(ascending=False)
		if len(bad_cols):
			print("   columns with nulls:", bad_cols.head(10).to_dict())

# is_fit=False: every step is fit_transform-ed on the full feature frame, so the step
# objects held in custom_pipelines are fitted in place by this call.
check_pipeline(X_cur, custom_pipelines, is_fit=False)

No null values remain after the preprocessing pipeline, so it is safe to train models.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR

def rmse(y_true, y_pred):
	"""Root-mean-squared error between ``y_true`` and ``y_pred``."""
	mse = mean_squared_error(y_true, y_pred)
	return np.sqrt(mse)

# greater_is_better=False makes the scorer report the negated RMSE, so the sign of
# best_score_ has to be flipped when it is read back.
rmse_scorer = make_scorer(rmse, greater_is_better=False)
# Shuffled 5-fold split with a fixed seed, shared by every model in the comparison.
cv = KFold(shuffle=True, n_splits=5, random_state=42)

In [None]:

# Template pipeline for the model comparison: shared preprocessing followed by a
# placeholder estimator. Each grid below replaces the "model" step with its own estimator.
base = Pipeline([
	("prep", custom_pipelines),
	("model", Ridge())
])

# One parameter grid per candidate model, keyed by a short model name.
#   "model"        -> single-element list holding the estimator that replaces the "model" step
#   "model__*"     -> hyper-parameters of that estimator
#   "prep__scaler" -> replacement for the "scaler" step of the nested preprocessing
#                     pipeline; "passthrough" skips scaling (used for the tree ensembles)
models_and_grids = {
	"ridge": {
		"model": [Ridge(random_state=42, max_iter=20000)],
		"model__alpha": [1.0, 3.0, 10.0, 30.0, 100.0],
		"prep__scaler": [RobustScaler()],
	},
	"lasso": {
		"model": [Lasso(random_state=42, max_iter=20000)],
		"model__alpha": [1e-4, 3e-4, 1e-3, 3e-3, 1e-2],
		"prep__scaler": [RobustScaler()],
	},
	"elasticnet": {
		"model": [ElasticNet(random_state=42, max_iter=20000)],
		"model__alpha": [1e-4, 1e-3, 1e-2],
		"model__l1_ratio": [0.1, 0.5, 0.9],
		"prep__scaler": [RobustScaler()],
	},
	"gbr": {
		"model": [GradientBoostingRegressor(random_state=42)],
		"model__n_estimators": [1000, 2000],
		"model__learning_rate": [0.03, 0.05],
		"model__max_depth": [2, 3],
		"model__subsample": [0.7, 1.0],
		"prep__scaler": ["passthrough"],
	},
	"rf": {
		"model": [RandomForestRegressor(random_state=42, n_jobs=-1)],
		"model__n_estimators": [500, 1000],
		"model__max_depth": [None, 10, 20],
		"model__min_samples_leaf": [1, 2, 5],
		"prep__scaler": ["passthrough"],
	},
	"svr": {
		"model": [SVR()],
		"model__C": [3, 10, 30],
		"model__gamma": ["scale", 0.01, 0.03],
		"model__epsilon": [0.05, 0.1],
		"prep__scaler": [RobustScaler()],
	},
}

# One row per model: (name, CV RMSE, hold-out RMSE, best parameters).
results = []
# Best pipeline found for each model name.
best_estimators = {}

for name, grid in models_and_grids.items():
	print(f"Training {name} with {grid}")
	search = GridSearchCV(estimator=base, param_grid=grid, scoring=rmse_scorer, cv=cv, n_jobs=-1, verbose=0)
	search.fit(X_train, y_train)

	# The scorer reports negated RMSE, so flip the sign to get the CV RMSE.
	cv_rmse = -search.best_score_

	# Score the best configuration on the hold-out split.
	val_pred = search.predict(X_val)
	val_rmse = rmse(y_val, val_pred)

	best_estimators[name] = search.best_estimator_
	results.append((name, cv_rmse, val_rmse, search.best_params_))

# Summary table, best hold-out RMSE first.
results_df = (
	pd.DataFrame(results, columns=["model", "cv_rmse", "val_rmse", "best_params"])
	.sort_values("val_rmse")
)
results_df

In [None]:
# Take the best ridge pipeline from the comparison and predict the hold-out split.
# NOTE(review): 'ridge' is hard-coded here rather than read from the top row of results_df.
model = best_estimators['ridge']
val_pred = model.predict(X_val)

In [None]:
# Predicted vs. actual target (log1p space) on the hold-out split.
fig, ax = plt.subplots()
sns.scatterplot(x=y_val, y=val_pred, ax=ax)

# Dashed y = x reference line: a perfect prediction falls exactly on it, which makes
# it possible to judge "on the diagonal" and spot outliers by eye.
lo = min(np.min(y_val), np.min(val_pred))
hi = max(np.max(y_val), np.max(val_pred))
ax.plot([lo, hi], [lo, hi], linestyle="--", color="red", label="perfect prediction")

ax.set_xlabel('y_val')
ax.set_ylabel('y_pred')
ax.set_title('Predicted vs. actual log1p(SalePrice) on the validation split')
ax.legend();

Most of the predictions lie close to the diagonal, which is good. However, there are some outliers that need further investigation.

# Param tuning

Selecting Ridge for hyperparameter tuning, as it appears to be the best-performing model in the comparison above.

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Ridge

# Finer-grained tuning of Ridge with 10 shuffled folds (fixed seed).
cv = KFold(n_splits=10, shuffle=True, random_state=42)

base = Pipeline([
	("prep", custom_pipelines),
	("model", Ridge(random_state=42))
])

# Only alpha and solver are searched. tol and max_iter were removed from the grid:
# they only influence Ridge's iterative solvers, and "svd" / "cholesky" are direct
# solvers ("auto" resolves to a direct solver as well for dense input). Searching them
# multiplied the number of fits by four (1200 instead of 300) while producing identical
# scores for every duplicate candidate.
param_grid = {
	"model__alpha": [0.1, 0.5, 1.0, 3.0, 10.0, 30.0, 100.0, 200, 500, 1000],
	"model__solver": ["svd", "cholesky", "auto"],
}

# rmse_scorer reports negated RMSE, so best_score_ must be sign-flipped when read.
# refit=True retrains the best configuration on all of X_train, so `grid` can be
# used directly for prediction afterwards.
grid = GridSearchCV(
	estimator=base,
	param_grid=param_grid,
	scoring=rmse_scorer,
	cv=cv,
	n_jobs=-1,
	refit=True,
	return_train_score=True
)

grid.fit(X_train, y_train)

In [None]:
# best_score_ is the negated RMSE produced by rmse_scorer, hence the sign flip.
print(f"Best cv score: {-grid.best_score_}")
print(f"Best params: {grid.best_params_}")


In [None]:
# Hold-out RMSE (log1p space) of the tuned Ridge pipeline; grid predicts with its refit best estimator.
val_pred = grid.predict(X_val)
rmse(y_val, val_pred)

# Making first model for submission

In [None]:
# Feature columns the final model is trained on (the test frame is aligned to these later).
X.columns

In [None]:
from sklearn.impute import SimpleImputer

# Preprocessing for the submission model: the same steps as the tuning pipeline plus a
# median SimpleImputer right before scaling.
# NOTE(review): the extra imputer is presumably there to absorb NaNs that only occur
# in the test data — confirm.
custom_pipelines = Pipeline(
	steps=[
		("Log1pFeatureImputer", Log1pFeatureImputer(["LotArea", "GrLivArea"])),
		("LotFrontageNeighborhoodImputer", LotFrontageNeighborhoodImputer()),
		("MeaningfullNAImputer", MeaningfullNAImputer(impute_NA_features)),
		("BooleanFeaturesImputer", BooleanFeaturesImputer()),
		("TotalSFImputer", SFImputer()),
		("GarageFeaturesImputer", GarageFeaturesImputer()),
		("BsmtBathImputer", BsmtBathImputer()),
		("MasVnrAreaImputer", MasVnrAreaImputer()),
		("HousingOrdinalEncoder", HousingOrdinalEncoder(ord_cat_features)),
		("HousingNominalOneHotEncoder", HousingNominalOneHotEncoder(nom_cat_features)),
		("QuadraticFeaturesImputer", QuadraticFeaturesImputer()),
		("simple_imputer", SimpleImputer(strategy="median")),
		("scaler", RobustScaler()),
	]
)

# Hyper-parameters are written out by hand here rather than read from grid.best_params_.
model = Ridge(alpha=10, solver='svd', tol=0.0001, max_iter=20000, random_state=42)

base = Pipeline(steps=[("prep", custom_pipelines), ("model", model)])

# Fitting on entire train dataset
base.fit(X, y_train_log)

In [None]:
# In-sample RMSE (log1p space): the model is scored on the same rows it was fitted on,
# so this is a sanity check rather than an estimate of generalisation error.
y_pred = base.predict(X)
rmse(y_train_log, y_pred)

# Test dataset predictions

In [None]:
# Load the test set and split off the 'Id' column, which is needed again only for
# the submission file.
df_test = pd.read_csv('data/test.csv')
id_col = df_test['Id']
df_test = df_test.drop(columns='Id')

In [None]:
# Preview of the test features after 'Id' was removed.
df_test.head()

In [None]:
import warnings

# Align the test frame to the training feature columns (same names, same order).
# reindex() silently creates an all-NaN column for every training feature that is
# absent from the test file, and the median imputer in the pipeline would then hide
# that, so surface the mismatch explicitly before predicting.
missing_in_test = X.columns.difference(df_test.columns)
if len(missing_in_test):
	warnings.warn(
		f"{len(missing_in_test)} training feature column(s) are missing from the test "
		f"data and will be filled with NaN: {list(missing_in_test)}"
	)
X_test = df_test.reindex(columns=X.columns)

In [None]:
# Preview of the Ids that will label the submission rows.
id_col.head()

In [None]:
# Preview of the test features after alignment to the training columns.
X_test.head()

In [None]:
# Despite the name, y_test holds model PREDICTIONS for the test set, in log1p space
# (the model was fitted on log1p(SalePrice)).
y_test = base.predict(X_test)

In [None]:
# Raw predictions, still in log1p space.
y_test

In [None]:
# Submission table: test Ids next to predicted prices. np.expm1 undoes the log1p
# applied to the target before training.
submission = id_col.to_frame(name="Id").assign(SalePrice=np.expm1(y_test))

In [None]:
from pathlib import Path

# Write the submission file. The target directory is created first because to_csv
# does not create missing parent directories and would fail on a fresh checkout.
submission_path = Path('submissions/first_attempt.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)