In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Read the competition's test.csv into `test` and display the resulting frame.
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
test

In [None]:
# Read the competition's train.csv into `train` and display the resulting frame.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
train

In [None]:
# Read sample_submission.csv; its 'SalePrice' column is overwritten with model
# predictions further down before the frame is written back out.
sample_submission = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_submission

# Data Cleaning

In [None]:
# Count missing values per column and keep the 20 columns with the most nulls.
# reset_index() turns the column names into a regular column called 'index'.
null_value_columns = (
    train.isnull()
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)
null_value_columns

Let's iterate through the columns of this particular dataset to obtain the datatypes of the columns

In [None]:
# Map each high-null column to the Python type of its first NON-missing value.
# Looking at iloc[0] directly would report `float` whenever the first entry is
# NaN, even for string-valued columns.
data_types = {}
for col in list(null_value_columns['index']):
    non_null_values = train[col].dropna()
    # A column with no observed values has no meaningful type.
    data_types[col] = type(non_null_values.iloc[0]) if not non_null_values.empty else None
data_types

Let's remove the two columns with the most missing values, PoolQC and MiscFeature.

In [None]:
# Drop the two sparsest columns from both tables.
# errors='ignore' keeps the cell re-runnable: running it a second time would
# otherwise raise KeyError because the columns are already gone.
sparse_columns = ['PoolQC', 'MiscFeature']
train = train.drop(columns=sparse_columns, errors='ignore')
test = test.drop(columns=sparse_columns, errors='ignore')

For the below features, it may be appropriate to impute based on the mode.

In [None]:
train['Alley'].value_counts()

Impute values in Alley with the mode.

In [None]:
# Every entry that is not one of the two observed categories (i.e. a missing
# value) is replaced with 'Grvl' in both tables.
alley_levels = ['Grvl', 'Pave']
train.loc[~train['Alley'].isin(alley_levels), 'Alley'] = 'Grvl'
test.loc[~test['Alley'].isin(alley_levels), 'Alley'] = 'Grvl'
train['Alley']

In [None]:
train['Fence'].value_counts()

Impute values in Fence with the mode.

In [None]:
# Replace anything outside the four known fence categories with 'MnPrv'.
fence_levels = ['MnPrv', 'GdPrv', 'GdWo', 'MnWw']
train.loc[~train['Fence'].isin(fence_levels), 'Fence'] = 'MnPrv'
train['Fence'].head(10)

In [None]:
# Same replacement on the test table: unknown/missing fence values become 'MnPrv'.
test.loc[~test['Fence'].isin(['MnPrv', 'GdPrv', 'GdWo', 'MnWw']), 'Fence'] = 'MnPrv'
test['Fence'].head(10)

In [None]:
train['FireplaceQu'].value_counts()

Impute values in FireplaceQu with the mode.

In [None]:
# `col` is also read by the following cell, so it stays a notebook-level name.
col = 'FireplaceQu'
fireplace_levels = ['Gd', 'TA', 'Fa', 'Ex', 'Po']
# Anything that is not a known quality level (i.e. missing) becomes 'Gd'.
train.loc[~train[col].isin(fireplace_levels), col] = 'Gd'
train[col].head(10)

In [None]:
# Relies on `col` having been set by the previous cell.
# Unknown/missing quality levels in the test table become 'Gd'.
test.loc[~test[col].isin(['Gd', 'TA', 'Fa', 'Ex', 'Po']), col] = 'Gd'
test[col].head(10)

In [None]:
# Snapshot of the training column names as a plain Python list.
the_columns_train = train.columns.tolist()
the_columns_train

In [None]:
# Snapshot of the test column names as a plain Python list.
the_columns_test = test.columns.tolist()
the_columns_test

The LotFrontage column contains values of the quantitative continuous type. Let's impute the null values.

In [None]:
# A more general cleaning approach: record the column names of both tables so
# the imputation pipeline below can classify every column by type.
train_cols = train.columns.tolist()
test_cols = test.columns.tolist()
from sklearn.impute import SimpleImputer

def data_cleaning_pipeline(table, columns):
    """Impute missing values in ``table`` and return a new DataFrame.

    Each column named in ``columns`` is classified by the type of its first
    non-missing value: ``np.int64``/``np.float64`` columns are numeric, ``str``
    columns are categorical, and anything else is left out of the result.
    Categorical columns are imputed with their most frequent value and numeric
    columns with their median.

    Parameters
    ----------
    table : pandas.DataFrame
        The table to clean. Imputation statistics are computed from this
        table itself.
    columns : iterable of str
        Column names of ``table`` to consider.

    Returns
    -------
    pandas.DataFrame
        Categorical columns first (in input order), followed by the numeric
        columns (in input order), with a fresh default index. The two imputed
        arrays are stacked into a single NumPy array, so numeric values may
        come back with ``object`` dtype; cast before doing arithmetic.
    """
    string_columns = []
    numerical_columns = []
    for col in columns:
        the_feature = table[col].dropna()
        if the_feature.empty:
            # Nothing observed: no type to infer and nothing to impute from.
            continue
        first_value = the_feature.iloc[0]
        if isinstance(first_value, (np.int64, np.float64)):
            numerical_columns.append(col)
        elif isinstance(first_value, str):
            string_columns.append(col)

    # Impute the table that was passed in. Reading the notebook-level `train`
    # here would make the "cleaned" test set a copy of the training rows.
    imp_categorical = SimpleImputer(strategy="most_frequent")
    table_categorical = imp_categorical.fit_transform(table[string_columns])

    imp_numerical = SimpleImputer(strategy="median")
    table_numerical = imp_numerical.fit_transform(table[numerical_columns])

    stacked = np.hstack((table_categorical, table_numerical))
    return pd.DataFrame(stacked, columns=string_columns + numerical_columns)

# Clean both tables with the shared pipeline, then refresh the column-name
# lists, because the pipeline reorders columns (categorical first, numeric last).
train = data_cleaning_pipeline(train, train_cols)
test = data_cleaning_pipeline(test, test_cols)
the_columns_train, the_columns_test = train.columns.tolist(), test.columns.tolist()

In [None]:
test.iloc[:, 41]

Let's remove outliers in the label, which is the sale price.

In [None]:
# Descriptive statistics of the label; the '25%' and '75%' entries are used by
# the next cell to derive the outlier bounds. The explicit float cast is applied
# before describe() so the quartiles are computed on numeric values.
summary_stats = train['SalePrice'].astype(float).describe()
summary_stats

In [None]:
# Tukey fences: keep only rows whose SalePrice lies strictly inside
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
first_quartile = summary_stats.loc['25%']
third_quartile = summary_stats.loc['75%']
lower_bound_outlier = first_quartile - 1.5 * (third_quartile - first_quartile)
upper_bound_outlier = third_quartile + 1.5 * (third_quartile - first_quartile)
inside_fences = (train['SalePrice'] > lower_bound_outlier) & (train['SalePrice'] < upper_bound_outlier)
train = train.loc[inside_fences]

Perform one-hot encoding on the categorical variables.

In [None]:
# One-hot encode the first 41 columns (the categorical block) of each table.
train = pd.get_dummies(train, columns=the_columns_train[0:41])
test = pd.get_dummies(test, columns=the_columns_test[0:41])

# The two tables are encoded independently, so a category that appears in only
# one of them yields a dummy column the other lacks. Give the test table
# exactly the training feature columns, in the same order: dummies unseen in
# test are filled with 0 and dummies unseen in train are dropped.
feature_columns = [c for c in train.columns if c != 'SalePrice']
test = test.reindex(columns=feature_columns, fill_value=0)

the_columns_train = list(train.columns)
the_columns_test = list(test.columns)

Perform feature selection using lasso, or l1 penalty.

In [None]:
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

from sklearn.linear_model import LassoCV

y = train['SalePrice'].astype('float')
X = train.drop(columns = ['SalePrice'])

# SalePrice is a continuous target, so the L1-penalised selector has to be a
# regressor. LinearSVC is a classifier and would treat every distinct price as
# its own class. LassoCV picks the penalty strength by cross-validation;
# features whose coefficient is shrunk to zero are discarded by SelectFromModel.
lasso_selector = LassoCV(cv=5, max_iter=10000, random_state=0).fit(X, y)
model = SelectFromModel(lasso_selector, prefit=True)
X_new_train = model.transform(X)

# The selector indexes features by position, so the test matrix must have the
# same columns in the same order as X.
test_aligned = test.reindex(columns=X.columns, fill_value=0)
X_new_test = model.transform(test_aligned)

In [None]:
len(X_new_train[1, :])

Let's explore the distribution of the label, which is the sale price

Apply the standard scaler to normalize each of the quantitative continuous columns.

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the mean and standard deviation from the training features only ...
X_new_train = scaler.fit_transform(X_new_train)
# ... and reuse them for the test features. Re-fitting on the test set would
# put the two matrices on different scales and leak test statistics.
X_new_test = scaler.transform(X_new_test)

In [None]:
from sklearn.preprocessing import RobustScaler
# Fit a RobustScaler on the training matrix and overwrite X_new_train with the
# rescaled values. The fitted `transformer` is reused for the test matrix in a
# later cell.
# NOTE(review): X_new_train was already standardised above, so this is a second
# rescaling of the same features — confirm that both are intended.
transformer = RobustScaler()
X_new_train = transformer.fit_transform(X_new_train)

In [None]:
X_new_train

In [None]:
# Apply the scaler fitted on the training matrix; re-fitting it on the test set
# would scale the two matrices differently.
X_new_test = transformer.transform(X_new_test)

# Exploratory Data Analysis

Figure 1. Pairplot depicting the associations between the quantitative variables and the label, which is the sale price. All of these ten variables are associated with sale price, except for the id.

In [None]:
# Columns 0-9 of `train` plus the label, plotted against each other.
set_of_quantitative_columns1 = train.columns[0:10].tolist() + ['SalePrice']
sns.pairplot(train[set_of_quantitative_columns1])

Figure 2. Pairplot depicting the associations between the quantitative variables and the label, which is the sale price. All of these eleven variables are associated with sale price.

In [None]:
# Columns 10-20 of `train` plus the label, plotted against each other.
set_of_quantitative_columns2 = train.columns[10:21].tolist() + ['SalePrice']
sns.pairplot(train[set_of_quantitative_columns2])

Figure 3. Pairplot depicting the associations between the quantitative variables and the label, which is the sale price. All of these ten variables are associated with sale price.

In [None]:
# Columns 21-30 of `train` plus the label, plotted against each other.
set_of_quantitative_columns3 = train.columns[21:31].tolist() + ['SalePrice']
sns.pairplot(train[set_of_quantitative_columns3])

# Model Development

Build a least squares linear regression model to predict sale price of homes.

RMSE log difference cross validation function.

In [None]:
def rmse(actual, predicted):
    """Root mean squared error between the natural logs of two value arrays.

    Both arguments must expose ``astype`` (NumPy array or pandas Series); they
    are cast to float before the logarithm is taken.
    """
    log_actual = np.log(actual.astype('float'))
    log_predicted = np.log(predicted.astype('float'))
    squared_log_error = (log_actual - log_predicted) ** 2
    return np.sqrt(np.mean(squared_log_error))

In [None]:
from sklearn.model_selection import KFold
from sklearn.base import clone

def cross_validate_rmse(model, X, y, grid = None):
    """Five-fold cross-validated log-RMSE of ``model``.

    Parameters
    ----------
    model : estimator
        Any scikit-learn style estimator. It is cloned first, so the object
        passed in is never refitted by this function.
    X : array
        Feature matrix that supports ``X[row_indices, :]`` indexing.
    y : array-like
        Target values; wrapped in a ``pandas.Series`` for positional indexing.
    grid : dict, optional
        Hyperparameter distributions. When given, a ``RandomizedSearchCV`` over
        this grid is additionally fit on each validation fold and the best
        parameters found per fold are collected.

    Returns
    -------
    float
        Mean validation ``rmse`` over the five folds, when ``grid`` is None.
    (float, list of dict)
        The mean validation ``rmse`` and the per-fold best parameters, when
        ``grid`` is given.
    """
    run_search = grid is not None
    if run_search:
        # Imported here because the notebook only imports it in a later cell.
        from sklearn.model_selection import RandomizedSearchCV

    model = clone(model)
    five_fold = KFold(n_splits=5)
    y = pd.Series(y)
    rmse_values = []
    best_params = []
    for tr_ind, va_ind in five_fold.split(X):
        model.fit(X[tr_ind,:], y.iloc[tr_ind])
        rmse_values.append(rmse(y.iloc[va_ind], model.predict(X[va_ind,:])))
        if run_search:
            # Search over the grid that was passed in, not a notebook global.
            rf_random = RandomizedSearchCV(estimator = model, param_distributions = grid, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)
            # The search is deliberately fit on the validation fold's rows.
            rf_random.fit(X[va_ind,:], y.iloc[va_ind])
            best_params.append(rf_random.best_params_)

    if run_search:
        return np.mean(rmse_values), best_params
    return np.mean(rmse_values)

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares baseline: report the training error and the 5-fold CV error.
reg = LinearRegression().fit(X_new_train, y)
actual, predicted = y, reg.predict(X_new_train)
print("RMSE on the training set:", rmse(actual, predicted))
print("Cross Validation Accuracy RMSE: ", cross_validate_rmse(reg, X_new_train, y))

Lasso regression model, with l1 regularization for effective l1 norm ball feature selection.

In [None]:
from sklearn.linear_model import Lasso
# Lasso with a fixed penalty strength: report the training error and the 5-fold CV error.
lasso = Lasso(alpha=205).fit(X_new_train, y)
actual, predicted = y, lasso.predict(X_new_train)
print("RMSE on the training set:", rmse(actual, predicted))
print("Cross Validation Accuracy RMSE: ", cross_validate_rmse(lasso, X_new_train, y))

In [None]:
# Sweep the lasso penalty strength and record the cross-validated RMSE per alpha.
alpha_value_cval_errors1 = {}
alphas = np.linspace(1, 210, num= 400)
for a in alphas:
    # cross_validate_rmse clones and fits the model itself, so no separate
    # fit on the full training set is needed here.
    lasso = Lasso(alpha = a, max_iter = 10000)
    alpha_value_cval_errors1[a] = cross_validate_rmse(lasso, X_new_train, y)

# Derive the best alpha from the recorded errors. A running minimum that
# starts at 0 can never be beaten by a non-negative RMSE, which would leave
# min_alpha stuck at its initial value.
cval_by_alpha = pd.Series(alpha_value_cval_errors1)
min_alpha = cval_by_alpha.idxmin()
min_rmse = cval_by_alpha.min()
print(min_alpha, min_rmse)

In [None]:
# The figure size must be configured before the plot is drawn to affect it.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Take x and y from the same dict so they always match, even if `alphas` has
# been reassigned by another cell since the sweep ran.
sns.scatterplot(x=list(alpha_value_cval_errors1.keys()), y=list(alpha_value_cval_errors1.values()))
plt.xlabel('Alpha')
plt.ylabel('Cross Validation RMSE')
plt.show()

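Ridge regression model, with l2 regularization, which constrains the coefficients to an L2 norm ball: it shrinks them toward zero but, unlike lasso, does not set them exactly to zero, so it does not perform feature selection.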

In [None]:
from sklearn.linear_model import Ridge
# Ridge with a fixed penalty strength: report the training error and the 5-fold CV error.
ridge = Ridge(alpha=58).fit(X_new_train, y)
actual, predicted = y, ridge.predict(X_new_train)
print("RMSE on the training set:", rmse(actual, predicted))
print("Cross Validation Accuracy RMSE: ", cross_validate_rmse(ridge, X_new_train, y))

In [None]:
# Sweep the ridge penalty strength and record the cross-validated RMSE per alpha.
alpha_value_cval_errors2 = {}
alphas = np.linspace(1, 60, num= 200)
for a in alphas:
    # cross_validate_rmse clones and fits the model itself, so no separate
    # fit on the full training set is needed here.
    ridge = Ridge(alpha = a, max_iter = 10000)
    alpha_value_cval_errors2[a] = cross_validate_rmse(ridge, X_new_train, y)

# Report the alpha that achieved the minimum, not the loop variable `a`, which
# simply holds the last alpha tried.
cval_by_alpha = pd.Series(alpha_value_cval_errors2)
min_alpha = cval_by_alpha.idxmin()
min_rmse = cval_by_alpha.min()
print(min_alpha, min_rmse)


As the value of alpha increases, the cross validation rmse increases. This ensures that with a large number of features, the linear regression model with l2 regularization is able to generalize well to unseen data.

In [None]:
# The figure size must be configured before the plot is drawn to affect it.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Take x and y from the same dict so they always match, even if `alphas` has
# been reassigned by another cell since the sweep ran.
sns.scatterplot(x=list(alpha_value_cval_errors2.keys()), y=list(alpha_value_cval_errors2.values()))
plt.xlabel('Alpha')
plt.ylabel('Cross Validation RMSE')
plt.show()

Thus, an alpha value of 30 is the ideal regularization strength for the ridge regression model, i.e. the value that best limits overfitting.

SGD regressor, with an l2 penalty.

In [None]:
from sklearn.linear_model import SGDRegressor
# Linear model trained by stochastic gradient descent with an l2 penalty:
# report the training error and the 5-fold CV error.
sgd_regressor = SGDRegressor(penalty='l2', tol=1e-3, max_iter=10000)
actual = y
predicted = sgd_regressor.fit(X_new_train, actual).predict(X_new_train)
print("RMSE on the training set:", rmse(actual, predicted))
print("Cross Validation Accuracy RMSE: ", cross_validate_rmse(sgd_regressor, X_new_train, y))

Random Forest of Regression Trees. One benefit of this type of model is that it generalizes well to unseen data (low bias as well as low variance): it bags many trees, each fit to a bootstrapped sample of the data and to random subsets of the features, and averaging them reduces the variance of the ensemble.

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Default random forest with a fixed seed: report the training error and the 5-fold CV error.
rf = RandomForestRegressor(random_state=0).fit(X_new_train, y)
actual, predicted = y, rf.predict(X_new_train)
print("RMSE on the training set:", rmse(actual, predicted))
print("Cross Validation Accuracy RMSE: ", cross_validate_rmse(rf, X_new_train, y))

Error on the training set as well as on the cross validation set after using RandomizedSearchCV (a randomized hyperparameter search) to tune the random forest of regression trees model.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor;
# current scikit-learn releases no longer accept the string 'auto'.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


Random hyperparameter grid used to find the best set of hyperparameters for the random forest model.

In [None]:
# Random forest with hand-set hyperparameters: report the training error and
# the 5-fold CV error.
rf2_settings = {
    'n_estimators': 280,
    'min_samples_split': 2,
    'min_samples_leaf': 0.2,
    'max_features': 'sqrt',
    'max_depth': 6,
    'bootstrap': True,
}
rf2 = RandomForestRegressor(**rf2_settings)

rf2.fit(X_new_train, y)
actual, predicted = y, rf2.predict(X_new_train)
print("RMSE on the training set:", rmse(actual, predicted))
print("Cross Validation Accuracy RMSE: ", cross_validate_rmse(rf2, X_new_train, y))

In [None]:
#Average out the parameters which were fit to the validation set in the cross validation accuracy function.

dict_lst = [{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}, 
{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}, 
{'n_estimators': 2000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 50, 'bootstrap': True}, 
{'n_estimators': 2000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 50, 'bootstrap': True}, 
{'n_estimators': 1400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}]

def average_rf_parameters(dictionary_lst):
    """Combine several hyperparameter dictionaries into one consensus dictionary.

    For every key found in ``dictionary_lst``:

    * if all of its values are real numbers (booleans excluded), the result is
      their mean truncated to ``int``;
    * otherwise (strings, booleans, ``None``, mixed types) the result is the
      most common value, with ties going to the value seen first.

    Parameters
    ----------
    dictionary_lst : list of dict
        Parameter dictionaries, e.g. the ``best_params_`` of several searches.

    Returns
    -------
    dict
        One entry per key, in first-seen key order. Empty when
        ``dictionary_lst`` is empty.
    """
    from collections import Counter
    from numbers import Real

    # Collect the values per key from the argument itself, not from a notebook
    # global, preserving the order in which keys are first seen.
    values_by_key = {}
    for dictionary in dictionary_lst:
        for key, value in dictionary.items():
            values_by_key.setdefault(key, []).append(value)

    average_dict = {}
    for key, values in values_by_key.items():
        # bool is a subclass of int, so it has to be excluded explicitly.
        all_numeric = all(
            isinstance(value, Real) and not isinstance(value, bool)
            for value in values
        )
        if all_numeric:
            # Divide by the actual number of values instead of a fixed 5.
            average_dict[key] = int(sum(values) / len(values))
        else:
            average_dict[key] = Counter(values).most_common(1)[0][0]
    return average_dict

rf_parameters = average_rf_parameters(dict_lst)

In [None]:
rf_parameters

In [None]:
# NOTE(review): the SGD model fitted here is not used for the predictions
# below, which come from `ridge` — confirm whether this fit is still wanted.
sgd_regressor = SGDRegressor(max_iter=100000000, tol=1e-3, penalty = 'l2')
actual = y
sgd_regressor.fit(X_new_train, actual)
ridge_regression_predictions = ridge.predict(X_new_test)
# Keep exactly one prediction per submission row. Unconditionally dropping the
# last prediction loses a real row whenever the test matrix already has the
# right length.
ridge_regression_predictions = ridge_regression_predictions[:len(sample_submission)]
sample_submission['SalePrice'] = ridge_regression_predictions

In [None]:
# Unregularised ridge (alpha = 0) on the full, unselected feature matrix.
ridge= Ridge(alpha = 0)
ridge.fit(X, y)
# Predict on exactly the columns the model was trained on, in the same order;
# dummy columns missing from the test table are filled with 0.
test_features = test.reindex(columns=X.columns, fill_value=0)
ridge_regression_predictions = ridge.predict(test_features)
# Keep exactly one prediction per submission row instead of always dropping
# the last one.
ridge_regression_predictions = ridge_regression_predictions[:len(sample_submission)]
sample_submission['SalePrice'] = ridge_regression_predictions

In [None]:
# Refit the lasso on the selected, scaled features and predict the test set.
lasso = Lasso(alpha = 205)
lasso.fit(X_new_train, y)
lasso_regression_predictions = lasso.predict(X_new_test)
# Keep exactly one prediction per submission row instead of always dropping
# the last one.
lasso_regression_predictions = lasso_regression_predictions[:len(sample_submission)]
sample_submission['SalePrice'] = lasso_regression_predictions

The random forest of regression trees model with optimized hyperparameters performs the best on the unseen data(the test set). This is because the random forest of regression trees combines weak regression trees that tend to overfit to the training set(low bias), but also minimizes the variance. Minimizing the variance is achieved through bagging, which is the process of bootstrapping the set of features and fitting these regression trees to data sampled with replacement from the original sample. Thus, the random forest model minimizes the bias, as well as the variance, bypassing the bias variance tradeoff through bagging.

In [None]:
# Refit the tuned random forest and predict the test set; these predictions
# are the ones left in `sample_submission` when it is written out.
rf2.fit(X_new_train, y)
random_forest_regression_predictions = rf2.predict(X_new_test)
# Keep exactly one prediction per submission row instead of always dropping
# the last one.
random_forest_regression_predictions = random_forest_regression_predictions[:len(sample_submission)]
sample_submission['SalePrice'] = random_forest_regression_predictions

In [None]:
sample_submission['SalePrice'].head(20)

In [None]:
# Write the submission frame to the Kaggle working directory; index=False keeps
# the row index out of the CSV.
sample_submission.to_csv('/kaggle/working/sample_submission15.csv', index = False)