<a href="https://www.kaggle.com/code/anandparmar/house-price-prediction-ensemble-and-bagging?scriptVersionId=103666171" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split from its explicit dataset directory.  The previous
# version reused `dirname`, the loop variable left over from the os.walk()
# listing cell, which only points at the right folder when that folder happens
# to be the last one visited (and is undefined if that cell was not run).
DATA_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"
data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

In [None]:
!cat /kaggle/input/house-prices-advanced-regression-techniques/data_description.txt

In [None]:
# Groups of related attributes (column names), kept together for convenience.
# Each list can be used directly as a column selector, e.g. data[locality].
locality = ['MSZoning', 'Street', 'Alley', 'Neighborhood', 'Condition1', 'Condition2']
lot = ['LotFrontage', 'LotArea', 'LotShape']
masonry = ['MasVnrType', 'MasVnrArea', 'Foundation']
land = ['LandContour', 'LandSlope']
condition = ['OverallCond', 'ExterCond', 'BsmtCond', 'GarageCond', 'HeatingQC']
quality = ['OverallQual', 'ExterQual', 'BsmtQual', 'GarageQual', 'KitchenQual', 'PoolQC', 'HeatingQC', 'FireplaceQu']
house = ['MSSubClass', 'BldgType', 'HouseStyle', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd']
utilities = ['Utilities', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'PoolArea']
basement = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
# Spelled exactly like the same columns in `utilities` and `quality`; the
# previous 'FirePlaces' / 'FirePlaceQu' spellings match no column.
fireplace = ['Fireplaces', 'FireplaceQu']
garage = ['GarageType', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'GarageYrBlt']
porch = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
pool = ['PoolArea', 'PoolQC']
history = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
roof = ['RoofStyle', 'RoofMatl']
exterior = ['Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond']
misc = ['Fence', 'Foundation', 'MiscFeature', 'MiscVal']
# The target column is named SalePrice (it is read as data.SalePrice elsewhere
# in this notebook), not 'SellPrice'.
sell = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']

In [None]:
print("data shape: ", data.shape)
data.head()

In [None]:
# Bucket SalePrice into five ordered price bands and store the band as a column.
price_edges = [0, 130000, 250000, 330000, 500000, 1000000]
price_labels = ["Low", "Med", "High", "Very High", "Exceptional"]
sale_binned = pd.cut(data["SalePrice"], bins=price_edges, labels=price_labels)
data["SalePrice_binned"] = sale_binned

# One sub-frame per price band.
low_price = data.loc[data["SalePrice_binned"] == "Low"]
med_price = data.loc[data["SalePrice_binned"] == "Med"]
high_price = data.loc[data["SalePrice_binned"] == "High"]
very_high_price = data.loc[data["SalePrice_binned"] == "Very High"]
exceptional_price = data.loc[data["SalePrice_binned"] == "Exceptional"]

# Box plot and histogram of the raw sale price, side by side.
subs = make_subplots(rows=1, cols=2, subplot_titles=["Sale Price BoxPlot", "Sale Price Distribution"])
subs.add_trace(go.Box(x=data["SalePrice"]), row=1, col=1)
subs.add_trace(go.Histogram(x=data["SalePrice"]), row=1, col=2)
subs.show()

# Number of houses falling into each price band.
band_figure = px.histogram(sale_binned, title="Sale Price Binned")
band_figure.show()

# Data Cleaning

In [None]:
def show_missing_values(data):
    """Return the per-column null counts of *data*, restricted to columns with nulls.

    The result is a one-column DataFrame (column label ``0``) indexed by the
    original column names.
    """
    null_counts = pd.DataFrame(data.isnull().sum())
    has_nulls = null_counts[0] > 0
    return null_counts.loc[has_nulls]
show_missing_values(data)

## 1. Locality

In [None]:
locality_data = data[locality].copy()
locality_data.head()

In [None]:
locality_data.isnull().sum()

In [None]:
def clean_locality(locality_data):
    """Return a copy of the locality columns with every NaN replaced by "None".

    Alley is supposed to have the classes 'Grvl', 'Pave' and 'None', but the
    'None' class was entered as NaN, which is why Alley shows a huge number of
    missing values (visible in its frequency counts).  Mapping NaN back to
    "None" restores that class.  The input frame is not modified.
    """
    filled = locality_data.fillna(value="None")
    return filled

In [None]:
data[locality] = clean_locality(locality_data)

In [None]:
# Frequency histogram of every locality attribute on a 2x3 grid, filled row by row.
subs = make_subplots(rows=2, cols=3, subplot_titles=locality)
for position, attr in enumerate(locality):
    grid_row, grid_col = divmod(position, 3)
    subs.add_trace(go.Histogram(x=locality_data[attr]), row=grid_row + 1, col=grid_col + 1)
subs.show()

## 2. Lot

In [None]:
lot_data = data[lot].copy()
lot_data.head()

In [None]:
lot_data.isnull().sum()

In [None]:
# Lot shape overview in three panels: number of lots per shape, then the average
# LotFrontage and the average LotArea per shape (histfunc="avg").
subplots = make_subplots(1, 3, subplot_titles=("LotShape Count", "LotShape vs LotFrontage", "LotShape vs LotArea"))
subplots.add_trace(go.Histogram(x=lot_data.LotShape), row=1, col=1)
subplots.add_trace(go.Histogram(x=lot_data.LotShape, y=lot_data.LotFrontage, histfunc="avg"), row=1, col=2)
subplots.add_trace(go.Histogram(x=lot_data.LotShape, y=lot_data.LotArea, histfunc="avg"), row=1, col=3)

In [None]:
# One LotFrontage-vs-LotArea scatter per lot shape on a 2x2 grid.
# The subplot titles are derived from the shapes actually plotted, so a title can
# never describe a different shape than the data drawn under it.  The previous
# hard-coded titles only matched when unique() happened to return the shapes in
# that exact order, and indexing past the array failed with fewer than four shapes.
lotShapes = lot_data.LotShape.unique()[:4]  # the grid has room for four shapes
shape_titles = ["Regular" if shape == "Reg" else str(shape) for shape in lotShapes]
subs = make_subplots(2, 2, subplot_titles=shape_titles)

for idx, shape in enumerate(lotShapes):
    r, c = divmod(idx, 2)
    filtered_data = lot_data[lot_data.LotShape == shape]
    subs.add_trace(go.Scatter(x=filtered_data.LotFrontage, y=filtered_data.LotArea, mode="markers"), row=r + 1, col=c + 1)
subs.show()

In [None]:
def clean_lot(lot_data):
    """Impute missing LotFrontage values with the column mean.

    The frame is updated in place and the same object is returned.
    """
    frontage_mean = lot_data["LotFrontage"].mean()
    lot_data["LotFrontage"] = lot_data["LotFrontage"].fillna(frontage_mean)
    return lot_data

In [None]:
# From the visualizations above there appear to be two LotFrontage outliers
# (LotFrontage == 313) and one LotArea outlier (LotArea >= 215000); remove them.
# Rows with a missing LotFrontage are kept on purpose: `NaN < 313` is False, so
# the plain comparison used before also threw away every row that clean_lot()
# is meant to impute.
lot_keep = (lot_data.LotFrontage.isna() | (lot_data.LotFrontage < 313)) & (lot_data.LotArea < 215000)
lot_data = lot_data[lot_keep].copy()

data_keep = (data.LotFrontage.isna() | (data.LotFrontage < 313)) & (data.LotArea < 215000)
# .copy() so the column assignment below writes to an independent frame, not a slice.
data = data[data_keep].copy()

lot_data = clean_lot(lot_data)
data[lot] = lot_data
#visualizing results
subplots = make_subplots(1, 2, subplot_titles=("LotShape VS LotFrontage", "LotShape VS LotArea"))
subplots.add_trace(go.Histogram(x=lot_data.LotShape, y = lot_data.LotFrontage, histfunc='avg'), row=1, col=1)
subplots.add_trace(go.Histogram(x=lot_data.LotShape, y=lot_data.LotArea, histfunc="avg"), row=1, col=2)

## 3. Masonry

In [None]:
masonry_data = data[masonry].copy()
print("Missing values: ", masonry_data.isnull().sum())

print("Unique values for Masonry Vaneer Type: ", masonry_data.MasVnrType.unique())

In [None]:
masonry_data.columns

In [None]:
def clean_masonry(masonry_data):
    """Impute the masonry veneer columns in place and return the same frame.

    Missing MasVnrArea values get the column mean; missing MasVnrType values
    get the most frequent type, which is also printed.
    """
    area_mean = masonry_data["MasVnrArea"].mean()
    masonry_data["MasVnrArea"] = masonry_data["MasVnrArea"].fillna(area_mean)

    most_common_type = masonry_data["MasVnrType"].mode()[0]
    print("Mode: ", most_common_type)
    masonry_data["MasVnrType"] = masonry_data["MasVnrType"].fillna(most_common_type)
    return masonry_data

In [None]:
data[masonry] = clean_masonry(masonry_data)

In [None]:
# Masonry overview on a 3x3 grid (seven panels used); titles are assigned row by row.
subs = make_subplots(3,3, subplot_titles=("Vaneer Type", "Vaneer Type Vs avg SalePrice", "Foundation", "Vaneer Area vs SalePrice", "Foundation vs SalePrice", "Foundation vs Vaneer Area", "Vaneer Type Vs Vaneer Area"))
# Row 1: veneer type counts, average SalePrice per veneer type, Foundation histogram.
subs.add_trace(go.Histogram(x=masonry_data.MasVnrType), row=1, col=1)
subs.add_trace(go.Histogram(x=masonry_data.MasVnrType, y=data.SalePrice, histfunc="avg"), row=1, col=2)
# No histfunc is given for this trace even though y is passed.
subs.add_trace(go.Histogram(x=masonry_data.Foundation, y=masonry_data.MasVnrArea), row=1, col=3)
# Row 2: veneer area against SalePrice, then per-foundation averages of SalePrice and veneer area.
subs.add_trace(go.Scatter(x=masonry_data.MasVnrArea, y=data.SalePrice, mode="markers"), row=2, col=1)
subs.add_trace(go.Histogram(x=masonry_data.Foundation, y=data.SalePrice, histfunc="avg"), row=2, col=2)
subs.add_trace(go.Histogram(x=masonry_data.Foundation, y=masonry_data.MasVnrArea, histfunc="avg"), row=2, col=3)
# Row 3: average veneer area per veneer type.
subs.add_trace(go.Histogram(x=masonry_data.MasVnrType, y=masonry_data.MasVnrArea, histfunc="avg"), row=3, col=1)
subs.show()

## 4. Basement

In [None]:
basement_data = data[basement].copy()
basement_data.head()

In [None]:
basement_data.isnull().sum()

They all seem to have almost the same number of missing samples. Is there any relation between them?

In [None]:
def clean_basement(basement_data):
    """Fill the basement columns in place and return the same frame.

    Missing categorical entries become the string "None"; missing square-foot
    figures become 0.
    """
    placeholder_by_column = {
        'BsmtQual': "None",
        'BsmtCond': "None",
        'BsmtExposure': "None",
        'BsmtFinType2': "None",
        'BsmtFinType1': "None",
        'BsmtFinSF1': 0,
        'BsmtFinSF2': 0,
        'BsmtUnfSF': 0,
        'TotalBsmtSF': 0,
    }
    for column, placeholder in placeholder_by_column.items():
        basement_data[column] = basement_data[column].fillna(placeholder)
    return basement_data

In [None]:
data[basement] = clean_basement(basement_data)

In [None]:
basement_data.columns

In [None]:
# Average SalePrice for each value of every basement attribute, one panel per
# attribute on a 3x3 grid filled row by row.
subs = make_subplots(rows=3, cols=3, subplot_titles=basement)
for position, attr in enumerate(basement):
    grid_row, grid_col = divmod(position, 3)
    subs.add_trace(
        go.Histogram(x=basement_data[attr], y=data.SalePrice, histfunc="avg"),
        row=grid_row + 1,
        col=grid_col + 1,
    )
subs.show()

## 5. Utilities

In [None]:
utilities_data = data[utilities].copy()
utilities_data.isnull().sum()

In [None]:
def clean_utilities(utilities_data):
    """Fill missing Utilities and Electrical entries with each column's mode.

    The frame is updated in place and the same object is returned.
    """
    for column in ("Utilities", "Electrical"):
        most_common = utilities_data[column].mode()[0]
        utilities_data[column] = utilities_data[column].fillna(most_common)
    return utilities_data

In [None]:
data[utilities] = clean_utilities(utilities_data)

In [None]:
# Frequency histogram of every utilities attribute on a 3x3 grid, filled row by row.
subs = make_subplots(rows=3, cols=3, subplot_titles=utilities)
for position, attr in enumerate(utilities):
    grid_row, grid_col = divmod(position, 3)
    subs.add_trace(go.Histogram(x=utilities_data[attr]), row=grid_row + 1, col=grid_col + 1)
subs.show()

## 6. Quality

In [None]:
qual_data = data[quality].copy()
qual_data.isnull().sum()

In [None]:
qual_data.PoolQC.unique() # other two also have same results, they have missing values where there is no Pool, Garage or Fireplace

In [None]:
def clean_quality(qual_data):
    """Return the quality columns with every missing entry replaced by "None".

    A new frame is returned; the argument is left unchanged.
    """
    return qual_data.fillna("None")

In [None]:
data[quality] = clean_quality(qual_data)

In [None]:
# Frequency histogram of every quality attribute.  The grid is 3x3 and is filled
# row by row; panels beyond the last attribute stay empty.
subs = make_subplots(rows=3, cols=3, subplot_titles=quality)
for position, attr in enumerate(quality):
    grid_row, grid_col = divmod(position, 3)
    subs.add_trace(go.Histogram(x=qual_data[attr]), row=grid_row + 1, col=grid_col + 1)
subs.show()

## 7. Garage

In [None]:
garage_data = data[garage].copy()
garage_data.isnull().sum()

In [None]:
def clean_garage(garage_data):
    """Impute the garage columns in place and return the same frame.

    * GarageYrBlt: missing years get the most frequent year.  When the column
      holds no value at all there is no mode, and the column is left untouched
      instead of raising.
    * Categorical garage columns: missing entries become the string "None".
    * GarageCars / GarageArea: missing entries become 0.
    """
    # GarageYrBlt is numeric, so it is deliberately not in the categorical list:
    # it must never receive the "None" string placeholder.
    garage_cats = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive']
    garage_nums = ['GarageCars', 'GarageArea']

    year_mode = garage_data["GarageYrBlt"].mode()
    if not year_mode.empty:
        garage_data["GarageYrBlt"] = garage_data["GarageYrBlt"].fillna(year_mode[0])
    garage_data[garage_cats] = garage_data[garage_cats].fillna("None")
    garage_data[garage_nums] = garage_data[garage_nums].fillna(0)
    return garage_data

In [None]:
data[garage] = clean_garage(garage_data)

## 8. Misc Features

In [None]:
misc_data = data[misc]
misc_data.isnull().sum()

In [None]:
def clean_misc(misc_data):
    """Return the miscellaneous columns with each missing entry set to "None".

    The argument itself is not modified.
    """
    cleaned = misc_data.fillna("None")
    return cleaned

In [None]:
data[misc] = clean_misc(misc_data)

In [None]:
# Frequency histogram of every miscellaneous attribute on a 2x2 grid, filled row by row.
subs = make_subplots(rows=2, cols=2, subplot_titles=misc)
for position, attr in enumerate(misc):
    grid_row, grid_col = divmod(position, 2)
    subs.add_trace(go.Histogram(x=misc_data[attr]), row=grid_row + 1, col=grid_col + 1)
subs.show()

## 9. Exterior

In [None]:
ext_data = data[exterior]
ext_data.isnull().sum()

In [None]:
ext_data.head()

In [None]:
def clean_exterior(ext_data):
    """Return the exterior columns with missing entries replaced by each column's mode.

    The modes are computed over the columns named in the module-level
    ``exterior`` list.  A new frame is returned.
    """
    ext_mode = ext_data[exterior].mode()
    replacements = {
        column: ext_mode[column][0]
        for column in ('Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond')
    }
    return ext_data.fillna(replacements)

In [None]:
data[exterior] = clean_exterior(ext_data)

# Analysis
In this section, we ask three questions:
1. Why: Why do some houses have low prices while others have high prices? We compare them using the best predictors of SalePrice.
2. Which: Which features affect the data?
3. How: How do these features affect the data? Do they cause the price to increase or decrease, or do they change other variables?

## 1. Low Price houses
Let's take a look at what low-price houses look like. It will also help us validate that the data is correct for this price class.

In [None]:
# House-description columns of the low-price band together with their SalePrice.
# The price is taken from low_price itself: assigning data.SalePrice relied on
# index alignment with `data`, which has had rows removed since low_price was
# built, so every removed row ended up with a NaN price here.
housedata_low = low_price[house + ["SalePrice"]].copy()
housedata_low.head()

In [None]:
import plotly.figure_factory as ff

# Correlate only the numeric columns.  The frame also carries non-numeric
# columns, which DataFrame.corr() rejects on pandas >= 2.0 rather than silently
# skipping them; selecting by dtype behaves the same on old and new versions.
df_corr = housedata_low.select_dtypes(include="number").corr()

x = list(df_corr.columns)
y = list(df_corr.index)
z = np.array(df_corr)

# Annotated heatmap of the correlation matrix, cell labels rounded to 2 decimals.
fig = ff.create_annotated_heatmap(
    z,
    x = x,
    y = y ,
    annotation_text = np.around(z, decimals=2),
    hoverinfo='z',
    colorscale='Viridis'
    )
fig.show()

We can see that the key predictors are all related to area, which means that, in general, the greater the area, the higher the price of the house. Beyond that, several of the variables describing area are correlated with one another.

**Key Predictors:** TotalRoomsAboveGround, BedroomsAboveGround, GroundLivingArea, 1stFloorSquareFeetArea

**Interrelated Variables:** 
1. TotalRoomsAboveGround: 1stFloorSF, 2ndFloorSF, GrLivArea, FullBath, Bedroom, Kitchen
2. Bedroom: 1st and 2nd FloorSF, GrLivArea, FullBath
3. GroundLivingArea: MSSubClass, 1st and 2nd FloorSF
4. 1stFloorSF: none

**We will only look out for these variables:** 1stFloorSF, 2ndFloorSF, GrLivArea, FullBath, Bedroom, Kitchen

In [None]:
# Four panels in one row: histograms for the two room-count features, scatter
# plots against SalePrice for the two area features.
features = ["TotRmsAbvGrd", "BedroomAbvGr", "1stFlrSF", "GrLivArea"]
panel_titles = [name + " VS SalePrice" for name in features]
subs = make_subplots(rows=1, cols=4, subplot_titles=panel_titles)

for column_index, attr in enumerate(features, start=1):
    if column_index <= 2:
        trace = go.Histogram(y=housedata_low.SalePrice, x=housedata_low[attr])
    else:
        trace = go.Scatter(x=housedata_low.SalePrice, y=housedata_low[attr], mode="markers")
    subs.add_trace(trace, row=1, col=column_index)
subs.show()

It seems that Ground Living Area and 1st Floor Square Feet area have a positive correlation with Sale Price.

Looking closely at data

In [None]:
# Average of each area feature per total-room count, three panels in one row.
features = ["1stFlrSF", "2ndFlrSF", "GrLivArea"]
panel_titles = [name + " VS Total Rooms" for name in features]
subs = make_subplots(rows=1, cols=3, subplot_titles=panel_titles)

for column_index, attr in enumerate(features, start=1):
    area_by_rooms = go.Histogram(x=housedata_low.TotRmsAbvGrd, y=housedata_low[attr], histfunc="avg")
    subs.add_trace(area_by_rooms, row=1, col=column_index)
subs.show()

After looking at the data closely, we can see that the features describing house properties are highly interrelated: Total Rooms is related to the features describing area, and also to Bedrooms, Bath and Kitchen.
Thankfully, decision trees can do automatic feature selection. That's why we will leave the rest as it is.

### The same can be done between data of other SalePrice classes and other feature groups to derive maximum knowledge about the data
For now, we will not focus on that; instead, we will focus on fitting the model using Random Forest and boosting techniques such as AdaBoost and XGBoost.

# Fitting the Model

In [None]:
data = data.drop("SalePrice_binned", axis=1)

In [None]:
# Split the column names into categorical (object / category dtype) and
# numerical (everything else).
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
categorical_features = []
numerical_features = []
for column, dtype in data.dtypes.items():
    if dtype == "object" or dtype == "category":
        categorical_features.append(column)
    else:
        numerical_features.append(column)
print(categorical_features)

Before we begin encoding the categorical variables, let's check whether any variables share the same category names.

In [None]:
# Collect every distinct category of every categorical feature into one flat
# list; a category name shared by several features appears once per feature.
alist = []
for feature in categorical_features:
    alist.extend(data[feature].unique())
print("Total new columns made when encoded: ", len(alist))

In [None]:
from collections import Counter

# Category names that occur under more than one feature, with their counts.
counts = dict(Counter(alist))
duplicates = {}
for name, occurrences in counts.items():
    if occurrences > 1:
        duplicates[name] = occurrences
print(duplicates)

And we do have variables with the same category/class names. Actually, lots of them.

What do we do? When encoding, let's prefix each category with its respective variable name,

so that it becomes `variable1_Fa` and `variable2_Fa`, and we can tell which category belongs to which variable.

In [None]:
#to cross check number of columns after encoding
total_features = 81 #including numerical and categorical features
total_categories = 257 #summing up all categories in all categorical features
total_categorical_features = 43 #all categorical features MSZoning, Condition1 etc.
print("Correct number of features after transformation: ", total_features + total_categories - total_categorical_features)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# enc = OneHotEncoder()
# enc.fit(data[categorical_features])

def transform_features(data):
    """One-hot encode the categorical columns of *data* and append the result.

    A fresh OneHotEncoder is fitted on ``data[categorical_features]``
    (``categorical_features`` is the module-level list of categorical column
    names).  Every encoded column is named ``<feature>_<category>`` so that
    identical category labels coming from different features stay distinct.

    The encoded columns are added to *data* in place; the original categorical
    columns are not removed.  Returns the same frame.
    """
    enc = OneHotEncoder()
    fitted_values = enc.fit_transform(data[categorical_features]).toarray()

    # f-string formatting also accepts non-string categories.  The previous
    # string concatenation raised for those, and the bare `except` swallowed
    # the error, leaving fewer names than encoded columns so the assignment
    # below failed with an unrelated-looking shape error.
    cats = [
        f"{feature}_{category}"
        for feature, categories in zip(categorical_features, enc.categories_)
        for category in categories
    ]
    data[cats] = fitted_values
    return data

In [None]:
# Encode a copy of the training frame, then drop the raw categorical columns.
transformed_data = transform_features(data.copy())
transformed_data = transformed_data.drop(categorical_features, axis=1)
# Zero-filled placeholders for encoded columns the training frame may lack
# (presumably categories that only occur in the test data - verify).  A column
# is only added when it is really absent, so an encoded column that does exist
# is never overwritten with zeros.
for placeholder in ['Functional_Sev', 'RoofStyle_Shed', 'MiscFeature_Gar2', 'MSZoning_None', 'KitchenQual_None']:
    if placeholder not in transformed_data.columns:
        transformed_data[placeholder] = 0

In [None]:
print(transformed_data.shape)
transformed_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the SalePrice target, then hold out 30% of the
# rows as a test set (fixed seed).
x = transformed_data.drop(columns="SalePrice")
y = transformed_data["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.30)

## Random Forest

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
# Random forest limited to depth 5, with out-of-bag scoring enabled.
regr = RandomForestRegressor(max_depth=5,oob_score=True, bootstrap=True)
# Fit on the whole training split; this fitted `regr` is what gets scored on the test split.
regr.fit(X_train, y_train)
# 5-fold cross-validation on the training split; the per-fold fitted estimators
# are kept (return_estimator=True) so their feature importances can be inspected.
output = cross_validate(regr, X_train, y_train, cv=5, return_estimator=True, verbose=1)

In [None]:
regr.score(X_test, y_test)

In [None]:
# One importance table per cross-validation estimator, indexed by feature name
# and sorted from most to least important.
importances = [
    pd.DataFrame(
        {"importance": estimator.feature_importances_},
        index=X_train.columns,
    ).sort_values("importance", ascending=False)
    for estimator in output["estimator"]
]

In [None]:
print("Avaregae Score: ", sum(output['test_score'])/len(output['test_score']))    

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost with 10 boosting rounds, using the random forest `regr` as the base learner.
# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator` - confirm against the installed version before upgrading.
ada_regr = AdaBoostRegressor(base_estimator=regr, random_state=0, n_estimators=10)
ada_regr.fit(X_train, y_train)

In [None]:
ada_regr.score(X_test, y_test)

In [None]:
#cross-validation
output = cross_validate(ada_regr, X_train, y_train, cv=5, return_estimator=True, verbose=1)
print("Avaregae Score: ", sum(output['test_score'])/len(output['test_score']))    

## Gradient Boosting (scikit-learn `GradientBoostingRegressor`)

In [None]:
from sklearn import ensemble
# This is scikit-learn's GradientBoostingRegressor (the xgboost library is not
# used): 500 trees of depth 5, learning rate 0.01, squared-error loss.
# `reg` is a second name bound to the same estimator object.
xg_regr = reg = ensemble.GradientBoostingRegressor(n_estimators=500, max_depth=5, learning_rate=0.01, loss="squared_error")
xg_regr.fit(X_train, y_train)

In [None]:
xg_regr.score(X_test, y_test)

In [None]:
#cross-validation
output = cross_validate(xg_regr, X_train, y_train, cv=5, return_estimator=True, verbose=1)
print("Avaregae Score: ", sum(output['test_score'])/len(output['test_score']))    

# Submission

In [None]:
test_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
def clean_data(data):
    """Apply every attribute-group cleaner to *data* and return the frame.

    Each cleaner receives the columns of its group and its result is written
    back into the same columns of *data*.  The groups are processed in a fixed
    order: locality, lot, masonry, basement, utilities, quality, garage, misc,
    exterior.
    """
    cleaning_steps = (
        (locality, clean_locality),
        (lot, clean_lot),
        (masonry, clean_masonry),
        (basement, clean_basement),
        (utilities, clean_utilities),
        (quality, clean_quality),
        (garage, clean_garage),
        (misc, clean_misc),
        (exterior, clean_exterior),
    )
    for columns, cleaner in cleaning_steps:
        data[columns] = cleaner(data[columns])
    return data

In [None]:
test_data = clean_data(test_data)
test_data.shape

In [None]:
show_missing_values(test_data)

In [None]:
# Columns still missing values in the test data after clean_data(): fill each
# one with its own most frequent value.
mode_filled_columns = ['BsmtFullBath', 'BsmtHalfBath', 'Functional', 'SaleType']
modes = test_data[mode_filled_columns].mode()

fill_values = {column: modes[column][0] for column in mode_filled_columns}
test_data = test_data.fillna(fill_values)
show_missing_values(test_data)

In [None]:
transformed_test_data = transform_features(test_data.copy())
transformed_test_data.drop(categorical_features, axis=1, inplace=True)
print(transformed_test_data.shape)
transformed_test_data.head()

In [None]:
# Column names present in exactly one of the two encoded frames.
a_set = set(transformed_data.columns)
b_set = set(transformed_test_data.columns)

missing_columns = list(a_set.symmetric_difference(b_set))
missing_columns

In [None]:
# Sort the differing columns by the frame they are absent from.
test_columns = transformed_test_data.columns
train_columns = transformed_data.columns
not_in_test_data = [name for name in missing_columns if name not in test_columns]
not_in_data = [
    name
    for name in missing_columns
    if name in test_columns and name not in train_columns
]

print(not_in_data)

# SalePrice is the target, so it must not be added to the test features; every
# other column missing from the test frame is added and filled with 0.
if 'SalePrice' in not_in_test_data:
    not_in_test_data.remove('SalePrice')
transformed_test_data[not_in_test_data] = 0

In [None]:
# Present the test features in exactly the column layout the model was fitted
# on.  The one-hot columns of the test frame are created in a different order
# than the training ones, and scikit-learn either matches features by position
# or rejects a frame whose column names/order differ from fit time.  Columns the
# model never saw are dropped; any that are still missing are filled with 0.
results = xg_regr.predict(transformed_test_data.reindex(columns=X_train.columns, fill_value=0))

In [None]:
px.histogram(results)

In [None]:
# Build the submission table (test-set Id plus predicted SalePrice) and write it
# to submission.csv without the index column.
sub_frame = pd.DataFrame({"Id": transformed_test_data.Id, "SalePrice": results})
sub_frame.to_csv('submission.csv', index=False)