# Housing regression - Gradient Booster and Variations

---
## Import libraries and files 💾

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error

In [2]:
# Load the training set; the 'Id' column is used as the row index
# rather than being kept as a regular column.
housing = pd.read_csv(
    'train.csv',
    index_col='Id',
)

In [3]:
# (number of rows, number of columns) of the raw training frame
housing.shape

(1460, 80)

In [4]:
# Preview the first rows of the training frame
housing.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Optional pre-checks (currently disabled) -- uncomment a line to run it.

# Count the rows whose target 'SalePrice' is missing:
#housing['SalePrice'].isna().sum()

# Drop the rows that have no target value:
#housing = housing.dropna(subset=['SalePrice'])

# Count fully duplicated rows:
#housing.duplicated().sum()

# Remove the duplicated rows:
#housing = housing.drop_duplicates()

In [6]:
# Per-column dtype and non-null count
housing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuilt    

## Feature Engineering

#### Note: fit feature selection on the training data only, then apply the same selection to the test data. Feature selection worsened the results.

#Feature Engineering

def add_engineered_features(df):
    """Return a new frame with combined area / bathroom / age features.

    Adds ``SF``, ``TotalBath``, ``HouseAge`` and ``YearsSinceRemodel`` and drops
    the raw columns they are computed from. The input frame is not modified.

    If the engineered columns are already present (for example because the
    cell is executed a second time), ``df`` is returned unchanged instead of
    failing with a ``KeyError`` on the already-dropped source columns.
    """
    engineered_cols = ['SF', 'TotalBath', 'HouseAge', 'YearsSinceRemodel']
    if set(engineered_cols).issubset(df.columns):
        return df

    source_cols = ['1stFlrSF', '2ndFlrSF', 'FullBath', 'HalfBath',
                   'YrSold', 'YearBuilt', 'YearRemodAdd']
    return (
        df.assign(
            SF=lambda d: d['1stFlrSF'] + d['2ndFlrSF'],
            TotalBath=lambda d: d['FullBath'] + 0.5 * d['HalfBath'],
            HouseAge=lambda d: d['YrSold'] - d['YearBuilt'],
            YearsSinceRemodel=lambda d: d['YrSold'] - d['YearRemodAdd'],
        )
        .drop(columns=source_cols)
    )


# NOTE: every frame that is later passed to a fitted pipeline (including the
# competition test set) has to go through the same function; otherwise its
# columns will not match the ones the pipeline was fitted on.
housing = add_engineered_features(housing)

# Show a preview only instead of dumping the whole frame
housing.head()

#### Better to run everything without feature engineering first

## Handling of columns with a high number of NaN

#### Dropping columns worsens results

In [7]:
# The ten columns with the most missing values, largest count first
housing.isna().sum().sort_values(ascending=False).head(10)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
MasVnrType      872
FireplaceQu     690
LotFrontage     259
GarageYrBlt      81
GarageCond       81
GarageType       81
dtype: int64

In [8]:
# Disabled: drop the columns with a high number of missing values
# (LotFrontage in this list is numeric, not categorical).
#housing = housing.drop(['FireplaceQu', 'Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature', 'LotFrontage'], axis=1)

## Splitting the data

In [9]:
# Separate the target from the features; `housing` itself stays untouched
y = housing['SalePrice'].copy()
X = housing.drop(columns='SalePrice')

# Hold out 20 % of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Setting up Preprocessor

#### I use the MinMaxScaler on all three pipelines

In [10]:
# Column names grouped by dtype: numeric features vs. everything else
num_columns = X.select_dtypes(include="number").columns
cat_columns = X.select_dtypes(exclude="number").columns

# Numerical branch: missing values are replaced by the column median
num_pipe = make_pipeline(SimpleImputer(strategy="median"))

# Categorical branch: missing values are replaced by the most frequent value,
# then the column is one-hot encoded (categories unseen during fit are ignored)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent").set_output(transform="pandas"),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Send each column group through its own branch
preprocessor1 = make_column_transformer(
    (num_pipe, num_columns),
    (cat_pipe, cat_columns),
)

### Preprocessor with ordinal encoding

In [11]:
# building the pipeline
cat_columns = X.select_dtypes(exclude="number").columns
num_columns = X.select_dtypes(include="number").columns

# Level lists shared by several columns. OrdinalEncoder assigns the integer
# code by position, so the first level of each list is encoded as 0.
quality_levels = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmt_fin_levels = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', "NA"]

# Single source of truth: ordinal column -> its ordered levels.
# Keeping column and levels in one mapping means they cannot drift apart,
# which was possible with separate, position-matched lists.
ordinal_categories = {
    'ExterQual': quality_levels,
    'ExterCond': quality_levels,
    'BsmtQual': quality_levels,
    'BsmtCond': quality_levels,
    'BsmtExposure': ['Gd', 'Av', 'Mn', 'No', "NA"],
    'BsmtFinType1': bsmt_fin_levels,
    'KitchenQual': quality_levels,
    'LandSlope': ["Gtl", "Mod", "Sev", "NA"],
    'BsmtFinType2': bsmt_fin_levels,
    'HeatingQC': quality_levels,
    'GarageFinish': ["Fin", "RFn", "Unf", "NA"],
    'GarageQual': quality_levels,
    'GarageCond': quality_levels,
}

# subdividing categorical features into ordinals and non-ordinals.
ordinal_list = list(ordinal_categories)

# Selecting through `cat_columns` first raises a KeyError if a listed ordinal
# column is not among the categorical columns.
ord_columns = X[cat_columns][ordinal_list].columns
non_ord_columns = X[cat_columns].drop(ordinal_list, axis=1).columns

# create numerical pipeline
num_pipe = make_pipeline(
    SimpleImputer(strategy="median"))

# create non ordinal pipeline
non_ord_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent").set_output(transform="pandas"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)

# create ordinal pipeline
# NOTE(review): missing values are replaced by the most frequent level before
# encoding, so the trailing "NA" level is presumably never produced -- confirm
# whether missing values should instead be imputed with the constant "NA".
ord_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent").set_output(transform="pandas"),
    OrdinalEncoder(categories=[ordinal_categories[col] for col in ord_columns])
)

preprocessor2 = make_column_transformer(
    (num_pipe, num_columns),
    (ord_pipe, ord_columns),
    (non_ord_pipe, non_ord_columns)
)

## Gradient Boosting

In [12]:
# Baseline model: one-hot preprocessing -> min-max scaling -> gradient boosting
gb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor1),
        ('scaler', MinMaxScaler()),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ]
)

# Train on the training split
gb_pipeline.fit(X_train, y_train)

In [13]:
# Prediction on test data
y_test_pred = gb_pipeline.predict(X_test)

# Calculating errors on the held-out test split
mae = round(mean_absolute_error(y_true=y_test, y_pred=y_test_pred))

# RMSE is computed as the square root of the MSE: the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and is no longer accepted from 1.6 on.
rmse = round(mean_squared_error(y_true=y_test, y_pred=y_test_pred) ** 0.5)

mape = round(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred), 3)

r2 = round(r2_score(y_true=y_test, y_pred=y_test_pred), 3)

msle_test = round(mean_squared_log_error(y_true=y_test, y_pred=y_test_pred), 5)

# Storing the errors in a dictionary (the second entry is the *root* of the
# mean squared error and is labelled as such)
errors = {
    "Mean Absolute Error": mae,
    "Root Mean Squared Error": rmse,
    "Mean Absolute Percentage Error": mape,
    "R Squared": r2,
    "Mean Squared Log Error": msle_test
}

errors

{'Mean Absolute Error': 16532,
 'Mean Squared Error': 26217,
 'Mean Absolute Percentage Error': 0.098,
 'R Squared': 0.91,
 'Mean Squared Log Error': 0.01843}

In [14]:
# MSLE on the training split, rounded to five decimals
y_test_pred_train = gb_pipeline.predict(X_train)

msle_train = round(
    mean_squared_log_error(y_train, y_test_pred_train),
    5,
)
msle_train

0.00702

#### With ordinal encoding

In [15]:
# Same model as before, but fed by the preprocessor with ordinal encoding
gb_pipeline_ord = Pipeline(
    steps=[
        ('preprocessor', preprocessor2),
        ('scaler', MinMaxScaler()),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ]
)

# Train on the training split
gb_pipeline_ord.fit(X_train, y_train)

In [16]:
# Prediction on test data
y_test_pred = gb_pipeline_ord.predict(X_test)

# Calculating errors on the held-out test split
mae = round(mean_absolute_error(y_true=y_test, y_pred=y_test_pred))

# RMSE is computed as the square root of the MSE: the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and is no longer accepted from 1.6 on.
rmse = round(mean_squared_error(y_true=y_test, y_pred=y_test_pred) ** 0.5)

mape = round(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred), 3)

r2 = round(r2_score(y_true=y_test, y_pred=y_test_pred), 3)

msle = round(mean_squared_log_error(y_true=y_test, y_pred=y_test_pred), 5)

# Storing the errors in a dictionary (the second entry is the *root* of the
# mean squared error and is labelled as such)
errors = {
    "Mean Absolute Error": mae,
    "Root Mean Squared Error": rmse,
    "Mean Absolute Percentage Error": mape,
    "R Squared": r2,
    "Mean Squared Log Error": msle
}

errors

{'Mean Absolute Error': 16584,
 'Mean Squared Error': 26875,
 'Mean Absolute Percentage Error': 0.098,
 'R Squared': 0.906,
 'Mean Squared Log Error': 0.01906}

In [17]:
# MSLE on the training split, rounded to five decimals
y_test_pred_train = gb_pipeline_ord.predict(X_train)

msle_train = round(
    mean_squared_log_error(y_train, y_test_pred_train),
    5,
)
msle_train

0.00669

#### With GridSearch

In [18]:
# Pipeline to tune: ordinal preprocessing -> min-max scaling -> gradient boosting
gb_pipeline_ord = Pipeline(
    steps=[
        ('preprocessor', preprocessor2),
        ('scaler', MinMaxScaler()),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ]
)

# Search space. 'pipeline-1' is the auto-generated name of the first
# (numerical) branch of the column transformer.
# Each parameter currently has a single value, i.e. one candidate in total.
param_grid = {
    'preprocessor__pipeline-1__simpleimputer__strategy': ["median"],
    'regressor__n_estimators': [100],
    'regressor__max_depth': [2],
    'regressor__min_samples_split': [20],
    # Further options, currently disabled. Note that recent scikit-learn
    # releases name the losses 'squared_error' / 'absolute_error'.
    #'regressor__learning_rate': [0.1],
    #'regressor__subsample': [0.9],
    #'regressor__loss': ['ls', 'lad']
}

# 5-fold cross-validated grid search
gb_search_ord = GridSearchCV(
    estimator=gb_pipeline_ord,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)

gb_search_ord.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [19]:
# Prediction on test data
y_test_pred = gb_search_ord.predict(X_test)

# Calculating errors on the held-out test split
mae = round(mean_absolute_error(y_true=y_test, y_pred=y_test_pred))

# RMSE is computed as the square root of the MSE: the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and is no longer accepted from 1.6 on.
rmse = round(mean_squared_error(y_true=y_test, y_pred=y_test_pred) ** 0.5)

mape = round(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred), 3)

r2 = round(r2_score(y_true=y_test, y_pred=y_test_pred), 3)

msle = round(mean_squared_log_error(y_true=y_test, y_pred=y_test_pred), 5)

# Storing the errors in a dictionary (the second entry is the *root* of the
# mean squared error and is labelled as such)
errors = {
    "Mean Absolute Error": mae,
    "Root Mean Squared Error": rmse,
    "Mean Absolute Percentage Error": mape,
    "R Squared": r2,
    "Mean Squared Log Error": msle
}

errors

{'Mean Absolute Error': 18354,
 'Mean Squared Error': 28950,
 'Mean Absolute Percentage Error': 0.112,
 'R Squared': 0.891,
 'Mean Squared Log Error': 0.02294}

In [20]:
# MSLE on the training split, rounded to five decimals
y_test_pred_train = gb_search_ord.predict(X_train)

msle_train = round(
    mean_squared_log_error(y_train, y_test_pred_train),
    5,
)
msle_train

0.01205

In [21]:
# Parameter combination selected by the grid search
gb_search_ord.best_params_

{'preprocessor__pipeline-1__simpleimputer__strategy': 'median',
 'regressor__max_depth': 2,
 'regressor__min_samples_split': 20,
 'regressor__n_estimators': 100}

### With Variance Threshold

In [22]:
# Ordinal preprocessing -> min-max scaling -> removal of zero-variance
# (constant) features -> gradient boosting
gb_pipeline_var_ord = Pipeline(
    steps=[
        ('preprocessor', preprocessor2),
        ('scaler', MinMaxScaler()),
        ('feature_selector', VarianceThreshold(threshold=0)),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ]
)

# Train on the training split
gb_pipeline_var_ord.fit(X_train, y_train)

In [23]:
# Prediction on test data
y_test_pred = gb_pipeline_var_ord.predict(X_test)

# Calculating errors on the held-out test split
mae = round(mean_absolute_error(y_true=y_test, y_pred=y_test_pred))

# RMSE is computed as the square root of the MSE: the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and is no longer accepted from 1.6 on.
rmse = round(mean_squared_error(y_true=y_test, y_pred=y_test_pred) ** 0.5)

mape = round(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred), 3)

r2 = round(r2_score(y_true=y_test, y_pred=y_test_pred), 3)

msle = round(mean_squared_log_error(y_true=y_test, y_pred=y_test_pred), 5)

# Storing the errors in a dictionary (the second entry is the *root* of the
# mean squared error and is labelled as such)
errors = {
    "Mean Absolute Error": mae,
    "Root Mean Squared Error": rmse,
    "Mean Absolute Percentage Error": mape,
    "R Squared": r2,
    "Mean Squared Log Error": msle
}

errors

{'Mean Absolute Error': 16584,
 'Mean Squared Error': 26875,
 'Mean Absolute Percentage Error': 0.098,
 'R Squared': 0.906,
 'Mean Squared Log Error': 0.01906}

In [24]:
# MSLE on the training split, rounded to five decimals
y_test_pred_train = gb_pipeline_var_ord.predict(X_train)

msle_train = round(
    mean_squared_log_error(y_train, y_test_pred_train),
    5,
)
msle_train

0.00669

In [25]:
# Pipeline to tune for the variance-threshold variant.
# The 'feature_selector' step is included so that the searched model really is
# the variance-threshold pipeline its name refers to; without it this cell
# tunes the plain ordinal pipeline.
gb_pipeline_var_ord = Pipeline([
    ('preprocessor', preprocessor2),
    ('scaler', MinMaxScaler()),
    ('feature_selector', VarianceThreshold(threshold=0)),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Search space. 'pipeline-1' is the auto-generated name of the first
# (numerical) branch of the column transformer.
# Each parameter currently has a single value, i.e. one candidate in total.
param_grid = {
    'preprocessor__pipeline-1__simpleimputer__strategy': ["mean"],
    'regressor__n_estimators': [200],
    'regressor__max_depth': [7],
    'regressor__min_samples_split': [20],
    # Further options, currently disabled. Note that recent scikit-learn
    # releases name the losses 'squared_error' / 'absolute_error'.
    #'regressor__learning_rate': [0.01, 0.1, 0.2],
    #'regressor__subsample': [0.7, 0.9, 1.0],
    #'regressor__loss': ['ls', 'lad']
}

# 5-fold cross-validated grid search
gb_search_var_ord = GridSearchCV(gb_pipeline_var_ord, param_grid, cv=5, verbose=1)

gb_search_var_ord.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [26]:
# Prediction on test data
y_test_pred = gb_search_var_ord.predict(X_test)

# Calculating errors on the held-out test split
mae = round(mean_absolute_error(y_true=y_test, y_pred=y_test_pred))

# RMSE is computed as the square root of the MSE: the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and is no longer accepted from 1.6 on.
rmse = round(mean_squared_error(y_true=y_test, y_pred=y_test_pred) ** 0.5)

mape = round(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred), 3)

r2 = round(r2_score(y_true=y_test, y_pred=y_test_pred), 3)

msle = round(mean_squared_log_error(y_true=y_test, y_pred=y_test_pred), 5)

# Storing the errors in a dictionary (the second entry is the *root* of the
# mean squared error and is labelled as such)
errors = {
    "Mean Absolute Error": mae,
    "Root Mean Squared Error": rmse,
    "Mean Absolute Percentage Error": mape,
    "R Squared": r2,
    "Mean Squared Log Error": msle
}

errors

{'Mean Absolute Error': 16372,
 'Mean Squared Error': 27181,
 'Mean Absolute Percentage Error': 0.1,
 'R Squared': 0.904,
 'Mean Squared Log Error': 0.02084}

### Feature Selection - K Best

In [27]:
# Ordinal preprocessing -> min-max scaling -> keep the 20 features with the
# highest univariate f_regression score -> gradient boosting
gb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor2),
        ('scaler', MinMaxScaler()),
        ('feature_selector', SelectKBest(f_regression, k=20)),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ]
)

# Train on the training split
gb_pipeline.fit(X_train, y_train)

In [28]:
# Prediction on test data
y_test_pred = gb_pipeline.predict(X_test)

# Calculating errors on the held-out test split
mae = round(mean_absolute_error(y_true=y_test, y_pred=y_test_pred))

# RMSE is computed as the square root of the MSE: the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and is no longer accepted from 1.6 on.
rmse = round(mean_squared_error(y_true=y_test, y_pred=y_test_pred) ** 0.5)

mape = round(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred), 3)

r2 = round(r2_score(y_true=y_test, y_pred=y_test_pred), 3)

msle = round(mean_squared_log_error(y_true=y_test, y_pred=y_test_pred), 5)

# Storing the errors in a dictionary (the second entry is the *root* of the
# mean squared error and is labelled as such)
errors = {
    "Mean Absolute Error": mae,
    "Root Mean Squared Error": rmse,
    "Mean Absolute Percentage Error": mape,
    "R Squared": r2,
    "Mean Squared Log Error": msle
}

errors

{'Mean Absolute Error': 19269,
 'Mean Squared Error': 30021,
 'Mean Absolute Percentage Error': 0.118,
 'R Squared': 0.883,
 'Mean Squared Log Error': 0.02625}

In [29]:
# MSLE on the training split, rounded to five decimals
y_test_pred_train = gb_pipeline.predict(X_train)

msle_train = round(
    mean_squared_log_error(y_train, y_test_pred_train),
    5,
)
msle_train

0.01189

### Feature Selection - Select from Model

In [30]:
# Ordinal preprocessing -> min-max scaling -> model-based feature selection
# -> gradient boosting.
# The selector's inner estimator is seeded as well: left unseeded, the set of
# selected features (and therefore every downstream number) can change from
# one run to the next.
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor2),
    ('scaler', MinMaxScaler()),
    ('feature_selector', SelectFromModel(GradientBoostingRegressor(random_state=42))),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Fit the pipeline to the training data
gb_pipeline.fit(X_train, y_train)

In [31]:
# Prediction on test data
y_test_pred = gb_pipeline.predict(X_test)

# Calculating errors on the held-out test split
mae = round(mean_absolute_error(y_true=y_test, y_pred=y_test_pred))

# RMSE is computed as the square root of the MSE: the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and is no longer accepted from 1.6 on.
rmse = round(mean_squared_error(y_true=y_test, y_pred=y_test_pred) ** 0.5)

mape = round(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred), 3)

r2 = round(r2_score(y_true=y_test, y_pred=y_test_pred), 3)

msle = round(mean_squared_log_error(y_true=y_test, y_pred=y_test_pred), 5)

# Storing the errors in a dictionary (the second entry is the *root* of the
# mean squared error and is labelled as such)
errors = {
    "Mean Absolute Error": mae,
    "Root Mean Squared Error": rmse,
    "Mean Absolute Percentage Error": mape,
    "R Squared": r2,
    "Mean Squared Log Error": msle
}

errors

{'Mean Absolute Error': 17172,
 'Mean Squared Error': 27899,
 'Mean Absolute Percentage Error': 0.101,
 'R Squared': 0.899,
 'Mean Squared Log Error': 0.01971}

In [32]:
# MSLE on the training split, rounded to five decimals
y_test_pred_train = gb_pipeline.predict(X_train)

msle_train = round(
    mean_squared_log_error(y_train, y_test_pred_train),
    5,
)
msle_train

0.00878

## Uploading and preprocessing test data

In [33]:
# Load the competition test set; the 'Id' column is used as the row index
test = pd.read_csv(
    'test.csv',
    index_col='Id',
)

In [34]:
# Predict prices for the competition test set.
# NOTE(review): `gb_pipeline` is whichever pipeline was assigned to that name
# most recently in the notebook -- confirm it is the model meant for submission.
test_predictions = gb_pipeline.predict(test)

# Build the submission from the index and the predictions instead of writing
# the predictions into `test`, so `test` keeps its feature-only columns and
# the cell can be re-run safely.
submission = pd.DataFrame({
    'Id': test.index.to_numpy(),
    'SalePrice': test_predictions,
})
submission

Unnamed: 0,Id,SalePrice
0,1461,126659.994061
1,1462,153868.199306
2,1463,177846.470993
3,1464,182240.283262
4,1465,199231.793979
...,...,...
1454,2915,83977.906677
1455,2916,84289.335025
1456,2917,180405.283906
1457,2918,122400.675919


In [35]:
# Write the submission file; index=False keeps the row index out of the CSV
submission.to_csv('housing_regression_submission5.csv', index=False)