In [None]:
# Installation with Pip Installer:
# !pip3 install numpy pandas scikit-learn statsmodels matplotlib
# !pip install tk
# !pip install descartes
# !pip install geopandas
# !pip install DateTime
# !pip install haversine
# !pip install xgboost
# !pip install bayesian-optimization

# Importing some standard python libraries:
import copy
import math
import pandas as pd
import numpy as np
import statsmodels.api as sm
import sklearn as sk
import matplotlib as mp
import pycaret as pyc
from pycaret.regression import *
import random
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LassoCV,RidgeCV,ElasticNetCV
from math import sqrt
import matplotlib.pyplot as plt
from pandas import Series
from numpy.random import randn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PolynomialFeatures,RobustScaler
from sklearn.model_selection import GridSearchCV,KFold,RandomizedSearchCV,StratifiedKFold,cross_val_score
%matplotlib inline

In [None]:
# Folder that holds train.csv and test.csv. Kept in one place so the location
# only has to be edited once when the notebook is run on another machine.
DATA_DIR = "C:\\Users\\divya\\Documents\\QUEEN'S SMITH MMA\\Predictive Modelling - MMA 867\\Assignment 867\\Assignment 1\\house-prices-advanced-regression-techniques"

# read_csv already defaults to a comma separator.
data_train = pd.read_csv(DATA_DIR + "\\train.csv")
data_test = pd.read_csv(DATA_DIR + "\\test.csv")

# Lift pandas' truncation limits so wide/long frames are rendered in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Checking the number of rows and columns in data.
print("Shape of Train Data: ", data_train.shape)
print("Shape of Test Data: ", data_test.shape)

# Display first few rows of data.
# A cell only renders its last bare expression, so both previews are passed to
# display() explicitly; otherwise the train preview is silently discarded.
display(data_train.head())
display(data_test.head())


In [None]:
# Print the training frame's column listing: each column's dtype and non-null
# count, which gives a first look at where values are missing.
data_train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training
# frame; rendered because it is the cell's last expression.
data_train.describe()

In [None]:
# Pairwise correlations between the numeric columns of the training data,
# followed by each column's correlation with SalePrice (strongest positive first).
# Restricting to numeric columns explicitly keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer drops string columns on its own and raises instead.
corr = data_train.select_dtypes(include='number').corr()
corr['SalePrice'].sort_values(ascending=False)

In [None]:
# Annotated correlation heatmap limited to the n columns whose correlation
# with SalePrice is largest (SalePrice itself is among them).
n = 20
top_features = corr.nlargest(n, 'SalePrice').index
plt.figure(figsize=(32, 10))
sns.set(font_scale=1.0)
top_heatmap = sns.heatmap(
    data_train[top_features].corr(),
    annot=True,
    fmt='.02f',
    square=True,
    cbar=False,
)
top_heatmap.set(title="Heatmap of Highly correlated values")

In [None]:
# Correlation Matrix: Pearson correlations between all numeric columns,
# drawn as a heatmap with the upper triangle (and diagonal) masked out.

f, ax = plt.subplots(figsize=(50, 50))
# Only numeric columns can be correlated; selecting them explicitly avoids the
# error pandas >= 2.0 raises when corr() meets string columns.
mat = data_train.select_dtypes(include='number').corr('pearson')
# np.triu marks the diagonal and everything above it, so each pair is shown once.
mask = np.triu(np.ones_like(mat, dtype=bool))
cmap = sns.color_palette("flare", as_cmap=True)
sns.set(font_scale = 1.6)
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(mat, mask=mask, cmap=cmap, vmax=1, center=0, annot = True,
            square=True, linewidths=.5, cbar_kws={"shrink": .8}, ax=ax).set_title("Heatmap for All Variables")
plt.show()

In [None]:
# Plotting copy of the training data with the Id column removed.
train_plot = data_train.drop(columns=['Id']).copy()

def plot_grid(data, fig_size, grid_size, plot_type, target = ''):
    """
    Draw a grid of seaborn plots, one subplot per selected column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted.
    fig_size : tuple
        Figure size, forwarded to ``plt.figure(figsize=...)``.
    grid_size : tuple
        ``(rows, columns)`` of the subplot grid. It must provide at least one
        cell for every column that gets plotted.
    plot_type : str
        Which plot to draw in each cell:

        * ``'histplot'``    - histogram with KDE of every non-object column
        * ``'boxplot'``     - box plot of every non-object column
        * ``'scatterplot'`` - every non-object column (except ``target``)
          against ``target``
        * ``'boxplot_cat'`` - ``target`` per category of every object column,
          boxes ordered by descending median of ``target``
    target : str, optional
        Name of the target column. Required for ``'scatterplot'`` and
        ``'boxplot_cat'``; ignored by the other plot types.

    Raises
    ------
    ValueError
        If ``plot_type`` is not one of the supported names, or if the grid has
        fewer cells than there are columns to plot.
    KeyError
        If ``plot_type`` needs a target and ``target`` is not a column of ``data``.
    """
    supported_types = ('histplot', 'boxplot', 'scatterplot', 'boxplot_cat')
    if plot_type not in supported_types:
        # Previously an unknown name fell through every branch and produced an
        # empty figure without any hint of what went wrong.
        raise ValueError(
            "Unknown plot_type {!r}; expected one of {}".format(plot_type, supported_types)
        )

    needs_target = plot_type in ('scatterplot', 'boxplot_cat')
    if needs_target and target not in data.columns:
        # Same exception type the later drop()/groupby lookup would raise, but
        # raised before a figure is created and with an actionable message.
        raise KeyError(
            "plot_type {!r} requires `target` to be a column of `data`; got {!r}".format(plot_type, target)
        )

    # Decide which columns get a subplot.
    if plot_type == 'boxplot_cat':
        column_names = data.select_dtypes(include = 'object').columns
    elif plot_type == 'scatterplot':
        column_names = data.drop(target, axis = 1).select_dtypes(exclude = 'object').columns
    else:
        column_names = data.select_dtypes(exclude = 'object').columns

    n_rows, n_cols = grid_size[0], grid_size[1]
    if len(column_names) > n_rows * n_cols:
        # add_subplot would raise ValueError part-way through drawing; fail early instead.
        raise ValueError(
            "grid_size {}x{} has {} cells but {} columns need plotting".format(
                n_rows, n_cols, n_rows * n_cols, len(column_names)
            )
        )

    fig = plt.figure(figsize = fig_size)
    for i, column_name in enumerate(column_names):
        # Each seaborn call below draws on the subplot that was just added.
        fig.add_subplot(n_rows, n_cols, i + 1)
        if plot_type == 'histplot':
            plot = sns.histplot(data[column_name], kde = True, color = 'red', stat = 'count')
        elif plot_type == 'boxplot':
            plot = sns.boxplot(x = data[column_name], color = 'red')
        elif plot_type == 'scatterplot':
            plot = sns.scatterplot(x = data[column_name], y = data[target], color = 'red')
        else:  # 'boxplot_cat'
            # Order the boxes by the median of the target, highest first.
            sort = data.groupby([column_name])[target].median().sort_values(ascending = False)
            plot = sns.boxplot(x = data[column_name], y = data[target], order = sort.index, palette = 'Reds')
        plot.set_xlabel(column_name, fontsize = 12)
    plt.tight_layout()

In [None]:

# Scatter every numeric feature of the training data against SalePrice.
plot_grid(
    train_plot,
    fig_size=(20, 40),
    grid_size=(12, 4),
    plot_type='scatterplot',
    target='SalePrice',
)

In [None]:
# One SalePrice scatter plot per feature, in the same order as before.
# A single loop replaces five copy-pasted cells that differed only in the x column.
for scatter_feature in ('GrLivArea', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'OpenPorchSF'):
    data_train.plot.scatter(x=scatter_feature, y='SalePrice')

In [None]:
sns.set_style('whitegrid')

# Left panel: histogram of SalePrice; right panel: probability plot of SalePrice.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (24, 6), gridspec_kw={'width_ratios': [3, 2]})

sns.histplot(data_train.SalePrice, kde = True, color = 'red', stat = 'count', ax = ax1)
ax1.set_title('Histogram of SalePrice', fontsize = 16)

# probplot accepts a Matplotlib Axes directly as `plot`, so the empty
# sns.lineplot(ax=ax2) call that was only used to obtain the axes is not needed.
stats.probplot(data_train.SalePrice, plot = ax2)
ax2.set_title('Probability Plot of SalePrice', fontsize = 16)
# Recolour the two lines probplot adds: the first is the sample points,
# the second is the fitted reference line.
ax2.get_lines()[0].set_color('red')
ax2.get_lines()[1].set_color('black')

In [None]:
# Keep the target column (SalePrice) as its own Series; the bare `.shape`
# below is the cell's last expression, so the notebook renders it.
train_y = data_train['SalePrice']
train_y.shape 

In [None]:
# Predictors only: the training frame without SalePrice, and the test frame as-is.
train_cleaning = data_train.drop(columns=['SalePrice']).copy()
test_cleaning = data_test.copy()
# Stack train on top of test so both are cleaned identically.
# ignore_index=True gives the stacked frame a unique 0..n-1 index; without it
# every train row shares its label with a test row, which makes label-based
# selection or alignment on this frame ambiguous.
train_test = pd.concat([train_cleaning, test_cleaning], ignore_index=True)

# Per-column count and percentage of missing values, plus the column dtype.
# The count is computed once and reused for the percentage.
missing_count = train_test.isnull().sum().sort_values(ascending = False)
missing_percent = (missing_count / train_test.shape[0]) * 100
missing_values = pd.concat([missing_count, missing_percent, train_test.dtypes],
                           axis = 1, keys = ['Values missing', 'Percent of missing', 'Type'])
# Show only the columns that actually have missing values.
missing_values[missing_values['Values missing'] > 0]


In [None]:
# Columns whose missing entries are replaced by the literal string 'None'.
replace_none = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 
                'GarageType', 'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType2', 'BsmtFinType1', 'MasVnrType', 
                'Exterior2nd', 'Exterior1st']
train_test[replace_none] = train_test[replace_none].fillna('None')

# Columns whose missing entries are replaced by the column's most frequent value.
replace_mode = ['MasVnrArea','GarageCars',  'GarageArea', 'Functional', 
                'Utilities','KitchenQual', 'SaleType', 'Electrical']

for col_name in replace_mode:
    # Assign the filled column back. Calling an inplace method on
    # train_test[col_name] acts on a temporary object and does not update the
    # frame once pandas Copy-on-Write is active.
    train_test[col_name] = train_test[col_name].fillna(train_test[col_name].mode()[0])

# Columns whose missing entries are replaced by 0.
replace_zero = ['BsmtFinSF2', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtHalfBath', 'BsmtFullBath', 'GarageYrBlt']
train_test[replace_zero] = train_test[replace_zero].fillna(0)

# MSZoning: most frequent zoning within the same MSSubClass. A group with no
# known zoning at all has no mode, so it falls back to the overall mode
# instead of failing on mode()[0].
overall_zoning_mode = train_test['MSZoning'].mode()[0]
train_test['MSZoning'] = train_test.groupby('MSSubClass')['MSZoning'].transform(
    lambda x: x.fillna(x.mode()[0] if x.notna().any() else overall_zoning_mode)
)
# LotFrontage: median frontage of the same Neighborhood.
train_test['LotFrontage'] = train_test.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# Remaining missing values per column, largest first.
train_test.isnull().sum().sort_values(ascending = False)

In [None]:
# Plotting copy of the combined frame without Id, then a histogram (with KDE)
# of each of its numeric columns.
train_test_plot = train_test.drop(columns=['Id']).copy()
plot_grid(
    train_test_plot,
    fig_size=(20, 36),
    grid_size=(12, 4),
    plot_type='histplot',
)

In [None]:
# Box plot of each numeric column of the combined frame.
plot_grid(
    train_test_plot,
    fig_size=(20, 40),
    grid_size=(12, 4),
    plot_type='boxplot',
)

In [None]:
from scipy.stats import norm

# log1p-transform the target; the transformed Series keeps the name of train_y.
train_y_log = np.log1p(train_y)

fig, ax = plt.subplots(1,2, figsize= (15,5))
fig.suptitle("qq-plot & distribution SalePrice ", fontsize= 10)

# Left panel: Q-Q plot of the transformed target against a t distribution.
sm.qqplot(train_y_log, stats.t, distargs=(4,),fit=True, line="45", ax = ax[0])

# Right panel: density histogram with KDE, plus a fitted normal curve.
# sns.distplot is deprecated; histplot (already used elsewhere in this
# notebook) draws the histogram/KDE and the normal fit is overlaid explicitly.
sns.histplot(train_y_log, kde = True, stat = 'density', ax = ax[1])
fit_mean, fit_std = norm.fit(train_y_log)
x_low, x_high = ax[1].get_xlim()
fit_grid = np.linspace(x_low, x_high, 200)
ax[1].plot(fit_grid, norm.pdf(fit_grid, fit_mean, fit_std), color = 'black')
plt.show()

# Feature Engineering

In [None]:
# Column groupings by dtype, taken before the engineered columns are added.
train_test_num = train_test.select_dtypes(include=['float64', 'int64']).columns  # numerical column names
train_test_cat = train_test.select_dtypes(include=['object'])                    # categorical columns
train_test_cat.head()  # not rendered: only the cell's last expression is shown

# --- Interaction features -------------------------------------------------
# Combined porch / deck area.
porch_sf = (
    train_test['OpenPorchSF']
    + train_test['3SsnPorch']
    + train_test['EnclosedPorch']
    + train_test['ScreenPorch']
    + train_test['WoodDeckSF']
)
train_test['TotalPorchSF'] = porch_sf

# Finished basement + both floors + porches + garage.
total_sf = (
    train_test['BsmtFinSF1']
    + train_test['BsmtFinSF2']
    + train_test['1stFlrSF']
    + train_test['2ndFlrSF']
    + train_test['TotalPorchSF']
    + train_test['GarageArea']
)
train_test['TotalSF'] = total_sf

# Half baths count as 0.5 of a bathroom.
bathrooms = (
    train_test['FullBath']
    + (0.5 * train_test['HalfBath'])
    + train_test['BsmtFullBath']
    + (0.5 * train_test['BsmtHalfBath'])
)
train_test['TotalBathrooms'] = bathrooms

train_test['TotalRms'] = train_test['TotRmsAbvGrd'] + train_test['TotalBathrooms']

# Years between construction and sale.
train_test['age'] = train_test['YrSold'] - train_test['YearBuilt']

# --- Indicator features (1 / 0) --------------------------------------------
# 1 when the source column is greater than zero, otherwise 0.
positive_flags = {
    'HasPool': 'PoolArea',
    'Has2ndFloor': '2ndFlrSF',
    'HasFireplace': 'Fireplaces',
}
for flag_name, source_column in positive_flags.items():
    train_test[flag_name] = (train_test[source_column] > 0).astype('int64')

# 1 when the remodel year differs from the build year.
train_test['WasRemod'] = (train_test['YearRemodAdd'] != train_test['YearBuilt']).astype('int64')


train_test.shape

In [None]:
# splitting train and test data
train_new= train_test.iloc[:len(train_y), :].reset_index(drop=True)
test_new = train_test.iloc[len(train_new):, :].reset_index(drop=True)

# train_new = train_test[1460,:]
# train_new = train_test[:,1461]

train_new = pd.concat([train_new, train_y_log], axis = 1)
# train_new = pd.concat([train_new, train_y], axis = 1)

train_new.head()

In [None]:
# Write the processed train and test frames to CSV, without the index column.
for frame_to_save, csv_path in (
    (train_new, "C:\\Users\\divya\\Documents\\QUEEN'S SMITH MMA\\Predictive Modelling - MMA 867\\Assignment 867\\Assignment 1\\house-prices-advanced-regression-techniques\\train_new.csv"),
    (test_new, "C:\\Users\\divya\\Documents\\QUEEN'S SMITH MMA\\Predictive Modelling - MMA 867\\Assignment 867\\Assignment 1\\house-prices-advanced-regression-techniques\\test_new.csv"),
):
    frame_to_save.to_csv(csv_path, index=False)

In [None]:
# Add an all-NaN SalePrice column to the test frame and preview the result.
test_new = test_new.assign(SalePrice=np.nan)
test_new.head()

In [None]:
# Write the combined (train + test) feature frame to CSV, without the index column.
combined_csv_path = "C:\\Users\\divya\\Documents\\QUEEN'S SMITH MMA\\Predictive Modelling - MMA 867\\Assignment 867\\Assignment 1\\house-prices-advanced-regression-techniques\\train_test.csv"
train_test.to_csv(combined_csv_path, index=False)