In [2]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer, StandardScaler

# List every file available under the Kaggle input directory, one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Let pandas display up to 100 rows before truncating the output.
pd.set_option('display.max_rows', 100)

In [3]:
# Directory holding the competition CSV files.
PATH = '../input/house-prices-advanced-regression-techniques'

# Training data, indexed by house Id, split into the target and the features.
train = pd.read_csv(f'{PATH}/train.csv', index_col="Id")
y_train = train['SalePrice']
X_train = train.drop(columns=['SalePrice'])

# Features to predict for the submission, and the sample submission file.
X_test = pd.read_csv(f'{PATH}/test.csv', index_col="Id")
submission_data = pd.read_csv(f'{PATH}/sample_submission.csv')

In [4]:
def plot_predictions(y_true, y_pred):
    """Print regression metrics and draw three diagnostic plots.

    Printed metrics: RMSLE, MSE, RMSE and MAE of ``y_pred`` against ``y_true``.

    Plots, each shown with ``plt.show()``:
      1. observed vs. predicted scatter with a red identity line drawn over
         the value range that both inputs cover;
      2. histogram of the errors (``y_pred - y_true``) with a red line at 0;
      3. error vs. observed value scatter with a red line at 0.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Raises
    ------
    ValueError
        Propagated from ``mean_squared_log_error`` if it rejects the inputs.
    """
    # Use plain 1-D arrays so the subtraction below is positional. Two pandas
    # Series with different indexes would otherwise be aligned by label, and
    # lists (no ``.tolist()``) are now accepted as well.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    mse = mean_squared_error(y_true, y_pred)
    print(
        f"""
        RMSLE: {np.sqrt(mean_squared_log_error(y_true, y_pred))}
        MSE: {mse}
        RMSE: {mse**0.5}
        MAE: {mean_absolute_error(y_true, y_pred)}
        """
    )

    # End points of the identity line: the range covered by both inputs.
    max_preds = min([y_pred.max(), y_true.max()])
    min_preds = max([y_pred.min(), y_true.min()])
    print(max_preds, min_preds)

    # Observed vs. predicted
    plt.figure(figsize=(8,8))
    sns.scatterplot(x=y_pred, y=y_true)
    sns.lineplot(x=[min_preds,max_preds], y=[min_preds, max_preds], color='red')
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()

    # Error distribution. axvline spans the full height of the axes, so the
    # zero marker no longer depends on a hard-coded bar count.
    errors = y_pred - y_true
    plt.subplots(figsize=(12, 8))
    sns.histplot(errors)
    plt.axvline(x=0, color='red')
    plt.show()

    # Error as a function of the observed value
    p_df = (
        pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
        .assign(error=errors)
        .sort_values(by='y_true')
    )

    plt.subplots(figsize=(12, 8))
    sns.scatterplot(data=p_df, x='y_true', y='error')
    plt.hlines(y=0, xmin=0, xmax=p_df['y_true'].max(), color='red')
    plt.show()

### Encode the different feature types & impute missing values

In [5]:
# Ordered categorical columns and their category scales, listed from worst to
# best. "None" is the placeholder the imputer inserts for missing values, so it
# is the lowest level of every scale. Keeping each scale next to its column
# name avoids pairing two long lists by position.
quality_scale = ["None", "Po", "Fa", "TA", "Gd", "Ex"]
basement_finish_scale = ["None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]

ordinal_scales = {
    "ExterQual": quality_scale,
    "ExterCond": quality_scale,
    "BsmtQual": quality_scale,
    "BsmtCond": quality_scale,
    "HeatingQC": quality_scale,
    "KitchenQual": quality_scale,
    "FireplaceQu": quality_scale,
    "GarageQual": quality_scale,
    "GarageCond": quality_scale,
    "PoolQC": quality_scale,
    "LotShape": ["None", "Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["None", "Sev", "Mod", "Gtl"],
    "BsmtExposure": ["None", "No", "Mn", "Av", "Gd"],
    "BsmtFinType1": basement_finish_scale,
    "BsmtFinType2": basement_finish_scale,
    # Maj2 is placed below Maj1, matching how Min2 sits below Min1 on this
    # scale (the data description lists Maj2 after Maj1, i.e. as the worse one).
    "Functional": ["None", "Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["None", "Unf", "RFn", "Fin"],
    "PavedDrive": ["None", "N", "P", "Y"],
    "Utilities": ["None", "NoSeWa", "NoSewr", "AllPub"],
    "Electrical": ["None", "Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["None", "MnWw", "GdWo", "MnPrv", "GdPrv"],
}

ordinal_categorical_features = list(ordinal_scales)
# One category list per ordinal column, in the same order as the columns.
ordinal_categories = [list(ordinal_scales[feature]) for feature in ordinal_categorical_features]

categorical_features = list(X_train.select_dtypes(include=["object"]))
# Unordered categorical columns. A list comprehension keeps the DataFrame's
# column order; a set difference would change order between kernel sessions
# and with it the column order of the encoded matrix.
nominative_categorical_features = [
    feature for feature in categorical_features if feature not in ordinal_scales
]

numerical_features = list(X_train.select_dtypes(exclude=["object"]))

### Scale the ordinal data after encoding, fit the logarithm of the prices, impute numerical values with the median

In [6]:
# Numerical columns: fill gaps with the median, then scale to [0, 1].
numeric_preprocessor = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', MinMaxScaler())
])

# The keyword that requests a dense one-hot matrix is `sparse_output` in
# current scikit-learn and `sparse` in older releases; try the current name
# first and fall back so the notebook runs on both.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Unordered categorical columns: missing values become the category "None",
# then each category becomes its own indicator column. Categories not seen
# during fitting are ignored at transform time.
one_hot_preprocessor = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="constant", fill_value="None")),
    ('one-hot', one_hot_encoder)
])

# Ordered categorical columns: missing values become "None", categories are
# mapped to their rank in `ordinal_categories`, and ranks are scaled to [0, 1].
ordinal_preprocessor = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="constant", fill_value="None")),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
    ('scale', MinMaxScaler())
])

# Route each group of columns through its own preprocessing pipeline.
full_preprocessor = ColumnTransformer(transformers=[
    ('numerical', numeric_preprocessor, numerical_features),
    ('one_hot', one_hot_preprocessor, nominative_categorical_features),
    ('ordinal', ordinal_preprocessor, ordinal_categorical_features)
])

In [14]:
# Linear regression on the k features that score best in a univariate F-test.
lm_pipeline = Pipeline(steps=[
    ('processor', full_preprocessor),
    ('selectkbest', SelectKBest(f_regression)),
    ('model', LinearRegression())
])

# Numbers of features to try.
hyper_params = {'selectkbest__k': [110, 120, 130, 140, 150, 160, 170, 180]}

# Shuffled 5-fold split with a fixed seed so the search is repeatable.
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

# Grid search over k. `folds` is passed as the cv scheme; previously `cv = 5`
# was used and the seeded, shuffled split above was never applied.
model_cv = GridSearchCV(estimator = lm_pipeline,
                        param_grid = hyper_params,
                        scoring= 'neg_root_mean_squared_error',
                        cv = folds,
                        verbose = 1,
                        return_train_score=True)

# Fit on log prices, so the RMSE optimised by the search is measured on the log scale.
model_cv.fit(X_train, np.log(y_train))

# Predictions come back on the log scale; exponentiate to get prices.
training_predictions = np.exp(model_cv.predict(X_train))
# plot_predictions(y_train, training_predictions)

# An earlier run (made with the unshuffled cv = 5 split) printed RMSLE: 0.1454135108843468
print(f'RMSLE: {np.sqrt(mean_squared_log_error(y_train, training_predictions))}')

In [13]:
# Display the hyperparameter values selected by the fitted grid search `model_cv`.
model_cv.best_params_

In [9]:
# Predict the competition test set. The model outputs log prices, so the
# exponential brings them back to the price scale expected in the submission.
submission_predictions = np.exp(model_cv.predict(X_test))

# Write one row per house: its Id and the predicted SalePrice.
submission = pd.DataFrame(
    data={
        'Id': X_test.index,
        'SalePrice': submission_predictions,
    }
)
submission.to_csv('submission.csv', index=False)
print("submission.csv has been saved.")

In [None]:
# Random-forest pipeline: preprocess, drop constant columns, standardise,
# reduce dimensionality with PCA, then fit the forest.
# `full_preprocessor` and `PCA()` replace the names `cat_num_preprocessor`
# and `pca`, which are not defined anywhere in this notebook.
final_pipe = make_pipeline(
    full_preprocessor,
    VarianceThreshold(),
    StandardScaler(),
    PCA(),
    RandomForestRegressor()
)

# make_pipeline names each step after its lower-cased class name, which is
# what the `pca__` / `randomforestregressor__` prefixes refer to.
pipe_params = {
    'pca__n_components': [5, 15, 30, 60],
    'randomforestregressor__max_depth': [10, 20, 40],
    'randomforestregressor__min_samples_split': [2, 3, 5],
    'randomforestregressor__min_samples_leaf': [1, 2, 5]
}

# The search is only configured here; it is not fitted in this cell.
clf = GridSearchCV(final_pipe, pipe_params, cv = 10, scoring='neg_root_mean_squared_error',
                   error_score='raise')

# Hold out 20% of the training data. The split uses X_train / y_train (the
# names `X` and `y` are not defined in this notebook) and writes to new names
# so the full training set and the competition test set `X_test` stay intact
# and the cell can be re-run safely.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [111]:
# Grid search for the linear model over a small number of selected features.

# step-1: create a cross-validation scheme (shuffled, seeded 5-fold split)
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

# step-2: specify range of hyperparameters to tune
hyper_params = {'selectkbest__k': [10, 20, 40]}

# step-3: perform grid search
# 3.1 specify model
# The preprocessing step is `full_preprocessor`; the name `full_processor`
# used here before is not defined in this notebook.
lm_pipeline = Pipeline(steps=[
    ('processor', full_preprocessor),
    ('selectkbest', SelectKBest(f_regression)),
    ('model', LinearRegression())
])

# 3.2 call GridSearchCV() with the scheme from step-1 (previously `cv = 5`
# was passed and `folds` was never used)
model_cv = GridSearchCV(estimator = lm_pipeline,
                        param_grid = hyper_params,
                        scoring= 'neg_root_mean_squared_error',
                        cv = folds,
                        verbose = 1,
                        return_train_score=True)

# fit the model on log prices
model_cv.fit(X_train, np.log(y_train))

In [112]:
# Display the hyperparameter values selected by the fitted grid search `model_cv`.
model_cv.best_params_

In [None]:
# NOTE(review): this cell repeats the random-forest / PCA search setup that
# already appears earlier in the notebook; one of the two copies can be deleted.

# Random-forest pipeline: preprocess, drop constant columns, standardise,
# reduce dimensionality with PCA, then fit the forest.
# `full_preprocessor` and `PCA()` replace the names `cat_num_preprocessor`
# and `pca`, which are not defined anywhere in this notebook.
final_pipe = make_pipeline(
    full_preprocessor,
    VarianceThreshold(),
    StandardScaler(),
    PCA(),
    RandomForestRegressor()
)

# make_pipeline names each step after its lower-cased class name, which is
# what the `pca__` / `randomforestregressor__` prefixes refer to.
pipe_params = {
    'pca__n_components': [5, 15, 30, 60],
    'randomforestregressor__max_depth': [10, 20, 40],
    'randomforestregressor__min_samples_split': [2, 3, 5],
    'randomforestregressor__min_samples_leaf': [1, 2, 5]
}

# The search is only configured here; it is not fitted in this cell.
clf = GridSearchCV(final_pipe, pipe_params, cv = 10, scoring='neg_root_mean_squared_error',
                   error_score='raise')

# Hold out 20% of the training data. The split uses X_train / y_train (the
# names `X` and `y` are not defined in this notebook) and writes to new names
# so the full training set and the competition test set `X_test` stay intact
# and the cell can be re-run safely.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [37]:
def rmsle_metric(y_true, y_pred):
    """Return the root mean squared logarithmic error of y_pred against y_true."""
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


def RMSLE_loss_func(model, X, y_true):
    """Return the RMSLE of ``model``'s predictions for ``X`` against ``y_true``.

    ``model`` must already be fitted and must predict on the same scale as
    ``y_true``. The value is a loss: lower is better.
    """
    y_pred = model.predict(X)
    return rmsle_metric(y_true, y_pred)


# make_scorer expects a metric with the signature (y_true, y_pred); it calls
# estimator.predict itself. Passing RMSLE_loss_func, whose signature is
# (model, X, y_true), made every call to the scorer fail.
# greater_is_better=False makes the scorer return the negated RMSLE.
score = make_scorer(rmsle_metric, greater_is_better=False)

# `lm_pipeline` itself is never fitted (GridSearchCV fits clones of it), and
# the notebook trains on log prices. TransformedTargetRegressor fits its own
# clone of the pipeline on log(y) and exponentiates the predictions, so they
# are on the price scale and always positive, as RMSLE requires.
price_scale_model = TransformedTargetRegressor(
    regressor=lm_pipeline, func=np.log, inverse_func=np.exp
)
price_scale_model.fit(X_train, y_train)

res = score(price_scale_model, X_train, y_train)
res

In [None]:
# NOTE(review): `training_predictions` is the float array returned by np.exp(...)
# in an earlier cell, and NumPy does not accept a float array as an index, so
# this expression raises IndexError. Presumably a boolean mask (a comparison on
# the predictions) was intended — confirm the condition and fix, or delete the cell.
training_predictions[training_predictions]

In [87]:
# 5-fold cross-validated negative mean absolute error of the linear pipeline
# on the untransformed prices.
# Other scoring strings tried here: 'neg_mean_squared_log_error',
# 'neg_root_mean_squared_error'; the custom `score` scorer can be passed too.
scores = cross_val_score(
    lm_pipeline,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores

In [40]:
# Encode the training features and report the size of the resulting matrix.
# The preprocessing object is `full_preprocessor`; the name `full_processor`
# used here before is not defined in this notebook.
enc_X = full_preprocessor.fit_transform(X_train)
print(enc_X.shape)

In [49]:
# Random forest on the preprocessed features, fitted on the raw prices.
# RandomForestRegressor is already imported in the first cell, so the repeated
# import was dropped. The preprocessing step is `full_preprocessor`
# (`full_processor` is not defined in this notebook). The fixed random_state
# makes the fitted forest the same on every run.
rfr_pipeline = Pipeline(steps=[
    ('processor', full_preprocessor),
    ('model', RandomForestRegressor(random_state=100))
])

rfr_pipeline.fit(X_train, y_train)

# Pipeline.score delegates to the final estimator's score method. It is
# evaluated here on the data the model was fitted on, so it is not an
# estimate of performance on unseen houses.
print(f'model score on training data: {rfr_pipeline.score(X_train, y_train)}')

In [91]:
# Names of the columns produced by the fitted preprocessor, in output order
# (numerical, one-hot, ordinal), so they line up with the model's inputs.
fitted_processor = rfr_pipeline.named_steps['processor']

num_features = list(fitted_processor.transformers_[0][2])

# One-hot encoding emits one column per category, so the encoded names come
# from the fitted encoder rather than from the list of input columns.
one_hot_input_columns = list(fitted_processor.transformers_[1][2])
fitted_one_hot_encoder = fitted_processor.named_transformers_['one_hot'].named_steps['one-hot']
one_hot_features = list(fitted_one_hot_encoder.get_feature_names_out(one_hot_input_columns))

ordinal_features = list(fitted_processor.transformers_[2][2])

feature_names = num_features + one_hot_features + ordinal_features

In [65]:
# Feature importances of the whole forest. Indexing the fitted forest with [1]
# returned a single tree from the ensemble and therefore only that tree's
# importances.
rfr_pipeline.named_steps['model'].feature_importances_

In [9]:
# Seeded forest so repeated runs of the search give the same result.
regressor = RandomForestRegressor(random_state=100)

# X_train still contains text columns and missing values, which the forest
# cannot fit directly, so the search runs over preprocessing + forest. The
# 'model__' prefix routes each parameter to the forest step of the pipeline.
rfr_search_pipeline = Pipeline(steps=[
    ('processor', full_preprocessor),
    ('model', regressor)
])

gs = GridSearchCV(cv=5, error_score=np.nan, estimator=rfr_search_pipeline,
# dictionaries containing values to try for the parameters
param_grid={'model__max_depth'   : [ 2,  5,  7, 10],
            'model__n_estimators': [20, 30, 50, 75]})
gs.fit(X_train, y_train)

# grid search has finished, now echo the results to the screen
# Strip the pipeline step prefix so the keys are plain forest parameter names.
the_best_parameters = {
    name.split('__', 1)[1]: value for name, value in gs.best_params_.items()
}
print("The best parameters are ",the_best_parameters)




In [40]:
# Coefficients of the linear model refitted by the grid search.
# `lm_pipeline` itself is never fitted (GridSearchCV fits clones of it), so
# reading `coef_` from it fails; the fitted copy is `model_cv.best_estimator_`.
# There is one coefficient per feature kept by the 'selectkbest' step.
model_cv.best_estimator_.named_steps['model'].coef_

In [9]:
# Sale price distribution for each overall quality rating.
data = pd.concat([y_train, X_train['OverallQual']], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(
    data=data,
    x='OverallQual',
    y="SalePrice",
)
# Fix the price axis to the range 0 to 800000.
fig.axis(ymin=0, ymax=800000);

In [33]:
# lm_pipeline.fit(X_train, np.log(y_train))
# plot_predictions(y_train, np.exp(lm_pipeline.predict(X_train)))

In [11]:
# Pairwise correlations between the numerical features. Text columns are
# dropped first: recent pandas raises when .corr() meets non-numeric columns
# instead of ignoring them.
corr = X_train.select_dtypes(exclude=["object"]).corr()
corr_values = corr.unstack()
corr_values = corr_values.loc[corr_values!=1] # These features correlate with themselves
# Strongly correlated pairs; each pair is listed twice, once per ordering.
corr_values[corr_values > 0.8].sort_values(ascending=False)

In [12]:
# Heat map of the correlations between all numerical columns, target included.
train = pd.concat([X_train, y_train], axis=1)
plt.subplots(figsize=(12, 9))
# Text columns are dropped before .corr(); recent pandas raises on them.
sns.heatmap(train.select_dtypes(exclude=["object"]).corr(), vmin=-1, vmax=1)
plt.show()

In [20]:
# Annotated heat map of the 10 columns with the largest correlation to
# SalePrice (SalePrice itself is one of them).
# Text columns are dropped before .corr(); recent pandas raises on them.
corr = train.select_dtypes(exclude=["object"]).corr()
best_corr_cols = corr.nlargest(10, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train[best_corr_cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=best_corr_cols.values, xticklabels=best_corr_cols.values)
plt.show()

In [21]:
# Pairwise scatter plots of the columns most correlated with SalePrice.
sns.set()
# `height` is the current name of the per-facet size argument; seaborn
# releases that provide histplot (used earlier in this notebook) no longer
# accept the old `size` keyword.
sns.pairplot(train[best_corr_cols], height = 2.5)
plt.show();

In [36]:
# Explain where the model works at end and why it doesn't work for expensive houses for example
# Try out not using different features with grid search
# Try combining features.

'''
Cross validation feature selection 
use random forest regressor or decision tree

Extract feature names from one hot
Apply to coef_ and sort by impact


# titanic code
# get column names
col_names = num_col.copy()
col_names.extend(
    tree_pipeline.named_steps['preprocess'].transformers_[1][1].named_steps['one-hot'].get_feature_names_out()
)

plt.figure(figsize=(50, 10))
plot_tree(
    tree_pipeline['model'],
    filled=True, 
    rounded=True, 
    feature_names=col_names, 
    class_names=['Not Survived', 'Survived']
    );
    
'''