<a href="https://www.kaggle.com/code/exceededdose/house-price-pred?scriptVersionId=190782813" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
tqdm.pandas()

from scipy.stats import norm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Show up to 999 columns of a wide frame, but cap printed rows at 20.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 20)

### Data Visualisation

##### Let's now spend some time doing what is arguably the most important step - understanding the data.

Understanding the distribution of various numeric variables
If there is some obvious multicollinearity going on, this is the first place to catch it
Here's where you'll also identify if some predictors directly have a strong association with the outcome variable

In [None]:
# Load the training split of the Kaggle house-prices dataset and preview its first rows.
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage before the dtypes are downcast.
df.info()

In [None]:
# Shrink numeric columns to the smallest dtype that still holds their values.
for column in df.columns:
    current_dtype = df[column].dtype
    if current_dtype == "float":
        df[column] = pd.to_numeric(df[column], downcast="float")
    elif current_dtype == "int64":
        df[column] = pd.to_numeric(df[column], downcast="integer")

In [None]:
# Same summary after downcasting, to compare the reported memory usage.
df.info()

##### Benefit: downcasting increases processing speed and decreases the size of the data in memory

We were able to reduce the memory usage from 924.0+ KB to 596.1+ KB.

### Basic Checks

#### Checking for missing values in the dataframe

In [None]:
# Heatmap of missing cells (features on the y-axis, rows of the dataset on the x-axis).
# The whole frame is inspected: looking only at df.head() would hide every column
# whose first five rows happen to be populated.
plt.figure(figsize=(12, 16))
sns.heatmap(df.isna().T, cbar=False, xticklabels=False)
plt.title('Missing values per feature')
plt.show()

In [None]:
# Missing-value count per column as a frame with columns 'index' (feature name) and 0 (count).
null_df = df.isna().sum().to_frame().reset_index()
# Display only the features that actually have missing values, with readable headers.
null_df.loc[null_df[0] != 0].rename(columns={'index': 'Feature', 0: 'Count of missing values'})

###### - There are no values for MiscFeature
    - Drop the column since it does not have any values

In [None]:
# Drop MiscFeature. Re-assigning with errors='ignore' keeps the cell re-runnable:
# an in-place drop raises KeyError the second time because the column is already gone.
df = df.drop(columns=['MiscFeature'], errors='ignore')

In [None]:
# Per column: its name, how many distinct values it has, and the distinct values themselves.
temp_dict = {'Feature': [], 'Len': [], 'Values': []}
for x in df.columns:
    distinct_values = df[x].unique()
    temp_dict['Feature'].append(x)
    temp_dict['Len'].append(len(distinct_values))
    temp_dict['Values'].append(distinct_values)

In [None]:
# Turn the per-column summary collected in temp_dict into a DataFrame (one row per feature).
data_field = pd.DataFrame(temp_dict)

#### Checking the length, unique values, missing values, data types, and percentage of missing values

In [None]:
# Order features by number of distinct values, then attach the missing-value counts
# from null_df (its 'index' column is the feature name, column 0 the count).
data_field = (
    data_field
    .sort_values(by='Len', ascending=False)
    .reset_index(drop=True)
    .merge(null_df, left_on='Feature', right_on='index')
    .drop(columns='index')
    .rename(columns={0: 'Missing Values'})
)

In [None]:
# Look up each feature's dtype directly in df.dtypes (indexed by column name)
# instead of a row-by-row apply with a progress bar.
data_field['Data Types'] = data_field['Feature'].map(df.dtypes)

In [None]:
# Share of rows (in percent) that are missing for each feature, computed column-wise.
data_field['Percentage of missing values'] = data_field['Missing Values'] / len(df) * 100

In [None]:
# Preview the per-feature summary table.
data_field.head()

#### Missing Data

In [None]:
# Summary rows (and feature names) of non-object columns that contain missing values.
is_numeric_with_gaps = (data_field['Missing Values'] != 0) & (data_field['Data Types'] != 'object')
missing_col_num_df = data_field[is_numeric_with_gaps]
missing_col_num = missing_col_num_df['Feature'].tolist()

In [None]:
# Summary rows of the numeric columns that contain missing values.
missing_col_num_df

In [None]:
# Replace missing values in numeric columns with the column mean, since the missing
# percentage is not too high. The result is assigned back to the column: calling
# fillna(..., inplace=True) on the df[x] selection is a chained assignment that pandas
# warns about and that no longer updates df under copy-on-write.
for x in missing_col_num:
    df[x] = df[x].fillna(df[x].mean())

In [None]:
# Object-typed (categorical) columns with missing values, least affected first.
is_categorical_with_gaps = (data_field['Missing Values'] != 0) & (data_field['Data Types'] == 'object')
missing_col_cat = data_field[is_categorical_with_gaps].sort_values(by='Percentage of missing values')
missing_col_cat

In [None]:
# Categorical columns with less than 10 % missing values.
missing_col_cat_less10_df = missing_col_cat[missing_col_cat['Percentage of missing values'] < 10]
missing_col_cat_less10 = missing_col_cat_less10_df['Feature'].tolist()

# Categorical columns with 10 % or more missing values. Using >= makes the two groups
# exhaustive; with "< 10" and "> 10" a column at exactly 10 % would land in neither
# group and never be imputed.
missing_col_cat_great10_df = missing_col_cat[missing_col_cat['Percentage of missing values'] >= 10]
missing_col_cat_great10 = missing_col_cat_great10_df['Feature'].tolist()

In [None]:
# Summary rows of the categorical columns with less than 10% missing values.
missing_col_cat_less10_df

In [None]:
# Features with less than 10% missing values: fill the gaps with the most frequent
# value, since the missing percentage is not too high.
for x in missing_col_cat_less10:
    most_frequent = df[x].mode()[0]
    df[x] = df[x].fillna(most_frequent)

In [None]:
# Summary rows of the categorical columns with more than 10% missing values.
missing_col_cat_great10_df

In [None]:
# Checking the relationship of the high-missing-value columns with the target.
# One bar chart per column; missing entries are shown as their own "No value" bar.
n_plot_cols = 2
# Ceil division so the grid always has room for every column (at least one row).
n_plot_rows = max(1, -(-len(missing_col_cat_great10) // n_plot_cols))
plt.figure(figsize=(20, 5 * n_plot_rows))
for plot_count, x in enumerate(missing_col_cat_great10, start=1):
    plot_data = df[[x, 'SalePrice']].fillna('No value')
    # Size the palette from the column being plotted; previously it was computed once,
    # before the loop, from whatever `x` an earlier cell had left behind.
    pal = sns.color_palette("mako", plot_data[x].nunique())
    plt.subplot(n_plot_rows, n_plot_cols, plot_count)
    plt.title("Average SalePrice per " + x)
    sns.barplot(data=plot_data, x=x, y='SalePrice', estimator=np.mean, palette=pal)
plt.tight_layout()
plt.show()


###### Based on the chart above, the columns with missing values have a significant effect on the SalePrice value. The null values will be imputed as "No value".

In [None]:
# Features with more than 10% missing values: the bar charts in this notebook plot the
# missing entries as a separate "No value" group and the stated conclusion is to impute
# them as "No value". Do exactly that, instead of overwriting them with the mode (which
# would merge the missing group into the most frequent category).
for x in missing_col_cat_great10:
    df[x] = df[x].fillna('No value')

In [None]:
# Summary of the dataset after imputation; the non-null counts should now equal the row count.
df.info()

## EDA

#### Correlation Plotting

In [None]:
# Target (first entry) plus the predictors selected for the correlation study.
cols = [
    'SalePrice',
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
    'YearRemodAdd',
]
print(cols)

In [None]:
# Pairwise plots of the selected columns to eyeball their relationships with each other and with SalePrice.
sns.pairplot(df[cols], height = 1.5)

In [None]:
# Skewness and kurtosis of SalePrice, raw vs. log1p-transformed, and the relative change in percent.
sale_price = df['SalePrice']
log_sale_price = np.log1p(sale_price)
dist_df = pd.DataFrame(
    {
        'Normal': [sale_price.skew(), sale_price.kurt()],
        'log': [log_sale_price.skew(), log_sale_price.kurt()],
    },
    index=['Skewness', 'Kurtosis'],
)
dist_df['% change'] = (dist_df['log'] / dist_df['Normal'] - 1) * 100
dist_df

###### Implementing log-transformation in the dataset lowers the skewness and kurtosis.

#### Distribution of saleprice before and after log-transformation

#### Before

In [None]:
# Distribution of raw SalePrice with a fitted normal curve.
# sns.distplot is deprecated; histplot plus an explicit normal fit draws the same
# ingredients (density histogram, KDE, fitted normal pdf).
sale_price = df['SalePrice']
ax = sns.histplot(sale_price, stat='density', kde=True)
mu, sigma = norm.fit(sale_price)
grid = np.linspace(sale_price.min(), sale_price.max(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black', label='Normal fit')
ax.set_title('SalePrice distribution')
ax.legend()
plt.show()

In [None]:
# Probability plot of raw SalePrice; points far from the reference line indicate a poor fit.
stats.probplot(df['SalePrice'], plot=plt)
plt.show()

#### After

In [None]:
# Distribution of log1p(SalePrice) with a fitted normal curve.
# sns.distplot is deprecated; histplot plus an explicit normal fit draws the same
# ingredients (density histogram, KDE, fitted normal pdf).
log_sale_price = np.log1p(df['SalePrice'])
ax = sns.histplot(log_sale_price, stat='density', kde=True)
mu, sigma = norm.fit(log_sale_price)
grid = np.linspace(log_sale_price.min(), log_sale_price.max(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black', label='Normal fit')
ax.set_title('log1p(SalePrice) distribution')
ax.legend()
plt.show()

In [None]:
# Probability plot of log1p(SalePrice), to compare against the plot of the raw values.
stats.probplot(np.log1p(df['SalePrice']), plot=plt)
plt.show()

###### Based on the probability plot, SalePrice fits a normal distribution better when it is log-transformed

##### Focusing on the "before" distribution, we can see that it is right-skewed. Therefore, there may be outliers in the dataset

#### Checking for outliers

In [None]:
# One box plot per selected column on a 4x2 grid, to spot outliers.
sns.set()
plt.figure(figsize=(20, 25))
for plot_count, x in enumerate(cols, start=1):
    plt.subplot(4, 2, plot_count)
    plt.title(x)
    sns.boxplot(df[x])

In [None]:
# The skewness/kurtosis comparison (dist_df) was already computed earlier in this
# notebook from the same, unchanged SalePrice column; show it again here instead of
# recomputing identical numbers in a copy-pasted cell.
dist_df

##### Based on the chart above, saleprice above 500k can be considered as an outlier

### Implementing log-transformation in the dataset lowers the skewness and kurtosis.

###### Skewness, Kurtosis and Outliers 

###### Skewness: 
Skewness is a measure of the asymmetry of the probability distribution of a real-valued random variable about its mean.

If skewness is less than -1 or greater than 1, the distribution is highly skewed. 
If skewness is between -1 and -0.5 or between 0.5 and 1, the distribution is moderately skewed. 
If skewness is between -0.5 and 0.5, the distribution is approximately symmetric.

##### Kurtosis: 
Kurtosis is a statistical measure that defines how heavily the tails of a distribution differ from the tails of a normal distribution. In other words, kurtosis identifies whether the tails of a given distribution contain extreme values.

A normal distribution has kurtosis exactly 3 (excess kurtosis exactly 0). Any distribution with kurtosis ≈3 (excess ≈0) is called mesokurtic. A distribution with kurtosis <3 (excess kurtosis <0) is called platykurtic. Compared to a normal distribution, its tails are shorter and thinner, and often its central peak is lower and broader. A distribution with kurtosis >3 (excess kurtosis >0) is called leptokurtic. Compared to a normal distribution, its tails are longer and fatter, and often its central peak is higher and sharper. 

##### Outliers: 
They are data records that differ dramatically from all others, they distinguish themselves in one or more characteristics. In other words, an outlier is a value that escapes normality and can (and probably will) cause anomalies in the results obtained through algorithms and analytical systems.

##### Summary:
1. Columns to remove because they are also correlated with other columns
    - GarageArea
    - 1stFlrSF
    - TotRmsAbvGrd
    - Id -> unique column
2. Log-transformation decreases the skewness and kurtosis values of the dataset
3. The distribution of saleprice shows that above 500k is an outlier

In [None]:
import statsmodels.api as sm

In [None]:
# Right-hand side of the regression formula: every selected column except the target.
# cols also contains 'SalePrice'; leaving it in would regress SalePrice on itself and
# produce a trivially perfect fit.
all_columns = "+".join(col for col in cols if col != 'SalePrice')

In [None]:
# Formula string ("SalePrice~<terms>") that is handed to sm.OLS.from_formula in a later cell.
my_formula = "SalePrice~"+all_columns

In [None]:
# Build an ordinary-least-squares model from the formula on the full frame and fit it.
lm = sm.OLS.from_formula(formula = my_formula, data = df)
result = lm.fit()

In [None]:
# Regression report of the fitted OLS model.
result.summary()

In [None]:
# Build the modelling frame in one chain so every step works on a fresh object
# (assigning into a filtered slice, as before, triggers SettingWithCopyWarning):
#   1. drop the columns judged redundant/correlated plus the Id column,
#   2. filter out rows with SalePrice of 500k or more (treated as outliers),
#   3. log-transform the target,
#   4. one-hot encode the categorical columns.
columns_to_remove = ['GarageArea', '1stFlrSF', 'TotRmsAbvGrd', 'Id']
df_clean = (
    df
    .drop(columns=columns_to_remove, errors='ignore')
    .loc[lambda frame: frame['SalePrice'] < 500000]
    .assign(SalePrice=lambda frame: np.log1p(frame['SalePrice']))
    .pipe(pd.get_dummies)
)

In [None]:
# Preview of the feature matrix: df_clean without the SalePrice target (df_clean itself is not modified).
df_clean.drop('SalePrice', axis = 1)

### Data Model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is the (log-transformed) SalePrice column; everything else is a feature.
y = df_clean['SalePrice']
X = df_clean.drop(columns='SalePrice')

# 75/25 split with a fixed seed so the hold-out set is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

class Model:
    """Fit an estimator on the notebook's train split and record its train/test scores.

    Every evaluated model adds one entry to the class-level ``scores`` dict, which is
    later turned into the model-comparison table.
    """

    # Shared by all instances on purpose: one list per table column, aligned by position.
    scores = {'Model':[], 'r2_score-train':[], 'mse-train':[], 'r2_score-test':[], 'mse-test':[]}

    def __init__(self, model, model_name):
        """Store the estimator and the label under which its scores are recorded.

        Args:
            model: estimator object exposing ``fit(X, y)`` and ``predict(X)``.
            model_name: label used for this model in ``Model.scores``.
        """
        self.model = model
        self.model_name = model_name

    def predict(self):
        """Fit the estimator, score it on both splits and record the result.

        Reads the module-level ``train_x``, ``train_y``, ``test_x`` and ``test_y``
        created by the train/test split cell, and the ``r2_score`` /
        ``mean_squared_error`` functions imported in the notebook.
        Returns nothing; the metrics are printed and stored via ``performance``.
        """
        self.model.fit(train_x, train_y)

        # Scores on the data the model was fitted on.
        pred_train = self.model.predict(train_x)
        r2_train = r2_score(train_y, pred_train)
        mse_train = mean_squared_error(train_y, pred_train)

        # Scores on the hold-out data.
        pred_test = self.model.predict(test_x)
        r2_test = r2_score(test_y, pred_test)
        mse_test = mean_squared_error(test_y, pred_test)

        self.performance(r2_train, mse_train, r2_test, mse_test)

    def performance(self, r2_train, mse_train, r2_test, mse_test):
        """Record the four metrics under ``self.model_name`` and print them.

        If an entry with the same model name already exists (e.g. the cell was run
        again), it is replaced instead of duplicated, so ``Model.scores`` always
        holds at most one row per model name.
        """
        # Remove a stale entry for this model name from every column list.
        if self.model_name in Model.scores['Model']:
            stale_position = Model.scores['Model'].index(self.model_name)
            for column_values in Model.scores.values():
                del column_values[stale_position]

        Model.scores['Model'].append(self.model_name)
        Model.scores['r2_score-test'].append(r2_test)
        Model.scores['r2_score-train'].append(r2_train)
        Model.scores['mse-test'].append(mse_test)
        Model.scores['mse-train'].append(mse_train)

        print("**Training**")
        print(f'r2_score: {r2_train}')
        print(f'mse: {mse_train}')
        print("==========================================================")
        print("**Test**")
        print(f'r2_score: {r2_test}')
        print(f'mse: {mse_test}')

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a plain linear regression and record its scores under the label 'Regression'.
model = Model(LinearRegression(), 'Regression')
model.predict()

### XGBRegressor

In [None]:
from xgboost import XGBRegressor

# Fit XGBoost with its default hyper-parameters and record its scores.
model = Model(XGBRegressor(), 'XGBRegressor')
model.predict()

### CatBoost Regressor

In [None]:
from catboost import CatBoostRegressor
# Fit CatBoost with default hyper-parameters and record its scores.
# verbose=0 silences the per-iteration training log, which would otherwise bury the
# train/test metrics printed by Model under a long wall of output.
model = Model(CatBoostRegressor(verbose=0), 'CatBoostRegressor')
model.predict()

### Model Performance Summary

In [None]:
# Comparison table of the models evaluated so far, best test R² first.
# NOTE(review): Ridge and Lasso are evaluated in later cells, so they are not in this table.
performance_df = (
    pd.DataFrame(Model.scores)
    .sort_values(by='r2_score-test', ascending=False)
    .reset_index(drop=True)
)
performance_df

In [None]:
### Ridge

In [None]:
from sklearn.linear_model import Ridge

# Fit a ridge regression (alpha=1.0) and record its scores under the label 'Ridge'.
model = Model(Ridge(alpha=1.0), 'Ridge')
model.predict()

In [None]:
### Lasso

In [None]:
from sklearn.linear_model import Lasso

# Fit a lasso regression and record its scores under the label 'Lasso'.
# alpha must be positive for this to be a lasso at all: with alpha=0 the L1 penalty
# vanishes (plain least squares) and scikit-learn advises against using Lasso that way.
# 0.001 is a small starting value suited to the log-scale target; tune as needed.
model = Model(Lasso(alpha=0.001, max_iter=10000), 'Lasso')
model.predict()

# Using AutoML

#### This part only needs to be run once, since its sole purpose is to search for the most optimized parameters

In [None]:
import tpot

from tpot import TPOTRegressor
from sklearn.model_selection import RepeatedStratifiedKFold

# The RepeatedStratifiedKFold splitter that used to be built here was never passed to
# TPOT (and stratified folds do not apply to a continuous target), so it is dropped;
# TPOT keeps using its own default cross-validation.
# NOTE: `tpot` is rebound from the imported module to the regressor instance below;
# later cells rely on `tpot` being the regressor.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state = 1, n_jobs = -1)

tpot.fit(train_x, train_y)

In [None]:
# Score the pipeline found by TPOT on the hold-out split, then export it as a Python script.
print(tpot.score(test_x, test_y))
tpot.export('HousePricePred - AutoML.py')

In [None]:
# Second, longer TPOT search (15 generations instead of 5); rebinding `tpot` replaces the earlier regressor.
tpot = TPOTRegressor(generations=15, population_size=50, verbosity=2, random_state = 1, n_jobs = -1)

tpot.fit(train_x, train_y)
# Score on the hold-out split and export the resulting pipeline under a separate file name.
print(tpot.score(test_x, test_y))
tpot.export('HousePricePred - AutoML gen15.py')

In [None]:
# Scores of the exported AutoML pipelines, in the same column layout as Model.scores.
scores = {'Model':[], 'r2_score-train':[], 'mse-train':[], 'r2_score-test':[], 'mse-test':[]}
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLarsCV, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from tpot.export_utils import set_param_recursive

# Pipeline exported by TPOT.
# Average CV score on the training set was: -0.013978187312131077
# The exported code used LassoLarsCV(normalize=True). The `normalize` argument has been
# removed from recent scikit-learn releases; scaling the inputs with
# StandardScaler(with_mean=False) in front of the estimator is the replacement
# scikit-learn recommends. NOTE(review): scores may differ slightly from the original export.
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=make_pipeline(StandardScaler(with_mean=False), LassoLarsCV())),
    RandomForestRegressor(bootstrap=True, max_features=0.8500000000000001, min_samples_leaf=3, min_samples_split=17, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

exported_pipeline.fit(train_x, train_y)

# Training-split scores.
results = exported_pipeline.predict(train_x)
r2_train = r2_score(train_y, results)
mse_train = mean_squared_error(train_y, results)

# Hold-out-split scores.
results = exported_pipeline.predict(test_x)
r2_test = r2_score(test_y, results)
mse_test = mean_squared_error(test_y, results)

scores['Model'].append('AutoML - 15 generation')
scores['r2_score-train'].append(r2_train)
scores['mse-train'].append(mse_train)
scores['r2_score-test'].append(r2_test)
scores['mse-test'].append(mse_test)

print("**Training**")
print(f'r2_score: {r2_train}')
print(f'mse: {mse_train}')
print("==========================================================")
print("**Test**")
print(f'r2_score: {r2_test}')
print(f'mse: {mse_test}')

In [None]:
# `scores` is deliberately NOT re-created here: it already holds the result of the
# previous AutoML cell, and resetting it would silently drop that row from the final
# comparison table.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLarsCV, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from tpot.export_utils import set_param_recursive

# Pipeline exported by TPOT.
# Average CV score on the training set was: -0.015860838312438157
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RidgeCV()),
    RandomForestRegressor(bootstrap=True, max_features=0.45, min_samples_leaf=3, min_samples_split=20, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

exported_pipeline.fit(train_x, train_y)

# Training-split scores.
results = exported_pipeline.predict(train_x)
r2_train = r2_score(train_y, results)
mse_train = mean_squared_error(train_y, results)

# Hold-out-split scores.
results = exported_pipeline.predict(test_x)
r2_test = r2_score(test_y, results)
mse_test = mean_squared_error(test_y, results)

# NOTE(review): the TPOT searches in this notebook use 5 and 15 generations; confirm
# which run this "10 generation" label refers to.
automl_label = 'AutoML - 10 generation'
# Replace an existing entry with the same label so re-running this cell stays idempotent.
if automl_label in scores['Model']:
    stale_position = scores['Model'].index(automl_label)
    for column_values in scores.values():
        del column_values[stale_position]

scores['Model'].append(automl_label)
scores['r2_score-train'].append(r2_train)
scores['mse-train'].append(mse_train)
scores['r2_score-test'].append(r2_test)
scores['mse-test'].append(mse_test)

print("**Training**")
print(f'r2_score: {r2_train}')
print(f'mse: {mse_train}')
print("==========================================================")
print("**Test**")
print(f'r2_score: {r2_test}')
print(f'mse: {mse_test}')

In [None]:
# Final comparison table: the earlier models plus the AutoML pipelines, best test R² first.
# Keeping only the latest row per model name makes the cell safe to re-run; without it
# every re-run would append the AutoML rows to performance_df again.
performance_df = (
    pd.concat([performance_df, pd.DataFrame(scores)])
    .drop_duplicates(subset='Model', keep='last')
    .sort_values(by=['r2_score-test'], ascending=False)
    .reset_index(drop=True)
)
performance_df['Dimension-Reduction'] = "None"
performance_df