<a href="https://www.kaggle.com/code/callumbrown123/house-price-predictor?scriptVersionId=158094289" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/house-prices-advanced-regression-techniques/train.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load Revelant Libraries
import csv
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import seaborn as sns
import tensorflow as tf

In [3]:
# Read in data from csv file
df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")

In [4]:
# Generate descriptive features of data
print(df['SalePrice'].describe())


# generate information about columns with missing values
cols_with_missing = [col for col in df.columns
                     if df[col].isnull().any()]

print(df.columns)
print(cols_with_missing)

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'Kitche

In [5]:
# drop missing values in SalePrice column
df.dropna(axis=0, subset=['SalePrice'], inplace=True)

# Set X and y variables
y = df['SalePrice']
df.drop(['SalePrice'], axis=1, inplace=True)

# Remove categorical data from X variables
X = df.select_dtypes(exclude=['object'])
print(X.columns)

# split data into training and testing
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')


In [6]:
# import simple imputer to impute average values in for missing values
from sklearn.impute import SimpleImputer

myimputer = SimpleImputer()

# impute data for missing values in train_X data frame
imputed_train_X = pd.DataFrame(myimputer.fit_transform(train_X))
imputed_val_X = pd.DataFrame(myimputer.fit_transform(val_X))

# replace columns names
imputed_train_X.columns = train_X.columns
imputed_val_X.columns = val_X.columns

In [7]:
# define get_mae function to compute the mean absolute error when training 
# the data using a random decision forest

def get_mae(max_leaf_nodes, imputed_train_X, imputed_val_X, train_y, val_y):
    model = RandomForestRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(imputed_train_X, train_y)
    preds_val = model.predict(imputed_val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

# run a while loop to determine the optimal height of the random decision forest
# by computing the mae when the model is trained with different numbers of leaf nodes

opt_leaf_val = [3, get_mae(3, imputed_train_X, imputed_val_X, train_y, val_y)]
l = 4
while l < 500:
    my_mae = get_mae(l, imputed_train_X, imputed_val_X, train_y, val_y)
    if my_mae < opt_leaf_val[1]:
        opt_leaf_val = [l, my_mae]
    l = l + 20
    
print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(l, my_mae))

print(opt_leaf_val[0])


Max leaf nodes: 504  		 Mean Absolute Error:  16602
484


In [8]:
# train the model with the optimal number of leaf nodes to minimize the mae

boston_model = RandomForestRegressor(random_state = 1, max_leaf_nodes=opt_leaf_val[0])

boston_model.fit(imputed_train_X, train_y)

boston_model_mae = mean_absolute_error(boston_model.predict(imputed_val_X), val_y)

print(boston_model_mae)


16943.07042824603
