In [None]:
! pip install catboost

In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
 
from scipy.stats import skew
 
import warnings
warnings.filterwarnings("ignore")
 
pd.set_option('display.max_columns', None)

In [None]:
# Single place to change when the data lives somewhere else.
# NOTE: the folder name ends with a space in the original path; it is kept as-is.
DATA_DIR = '/content/drive/My Drive/Kaggle house pricing '

# Raw training data (includes the SalePrice target) and the unlabeled test data.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Stack train (without the target) on top of test so that both receive exactly
# the same preprocessing. ignore_index=True gives the combined frame a unique
# 0..n-1 index; without it the train and test row labels overlap, which makes
# any label-based alignment on `data` ambiguous.
data = pd.concat(
    [train.drop(columns="SalePrice"), test],
    axis=0,
    ignore_index=True,
)

# Target, kept as a single-column DataFrame.
y = train[['SalePrice']]

In [None]:
# Remove the columns that are almost entirely missing (over 80% NaN).
high_missing_cols = [
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
]
data = data.drop(columns=high_missing_cols)

In [None]:
 # I'll drop some features due to multicollinearity, low class representation, etc
# Columns discarded for multicollinearity, low class representation, etc.
to_drop = [
    'Id', 'YrSold', 'MoSold', 'Utilities', 'Street', 'Condition2',
    'RoofMatl', 'Heating', 'LowQualFinSF', '3SsnPorch', 'PoolArea', 'MiscVal',
]
data = data.drop(columns=to_drop)

In [None]:
 #Get list of categorical features
# Names of every object-dtype (string) column still present in `data`.
categorical_cols = data.select_dtypes(include='object').columns
categorical_cols

Index(['MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

In [None]:
# Get list of numeric columns
numeric_cols = data.select_dtypes(include=np.number).columns
numeric_cols

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', 'ScreenPorch'],
      dtype='object')

In [None]:
# For these features a missing value is filled with the explicit category
# 'None' instead of being imputed.
none_cols = ['FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual',
             'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']

# Fill through the frame itself and re-assign. The previous
# `data[col].replace(..., inplace=True)` mutated a temporary column selection,
# which is deprecated and does nothing once pandas Copy-on-Write is enabled.
data = data.fillna({col: 'None' for col in none_cols})

In [None]:
 #Fill missing categorical columns with the mode
# Most frequent value per categorical column (first row of the mode table),
# used to fill whatever is still missing in those columns.
categorical_modes = data[categorical_cols].mode().iloc[0]
data[categorical_cols] = data[categorical_cols].fillna(categorical_modes)

In [None]:
 # Handle missing values
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Impute missing numeric values with IterativeImputer (default settings),
# fitted on the combined train+test frame.
imputer = IterativeImputer()
numeric_imputed = imputer.fit_transform(data[numeric_cols])
data[numeric_cols] = numeric_imputed

In [None]:
 # Label encode categorical features
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes. fit_transform refits the
# encoder on every column, so codes are assigned independently per column.
encoder = LabelEncoder()
data[categorical_cols] = data[categorical_cols].apply(encoder.fit_transform)

In [None]:
# Rank numeric features by the magnitude of their skewness.
# The absolute value is taken BEFORE sorting so that strongly left-skewed
# features rank alongside strongly right-skewed ones; sorting the signed values
# first would push negative skews to the bottom of the list.
skew_features = data[numeric_cols].apply(skew).abs().sort_values(ascending=False)
skew_features.head(10)  # Displaying top ten skewed features

LotArea          12.822431
KitchenAbvGr      4.302254
BsmtFinSF2        4.146034
EnclosedPorch     4.003891
ScreenPorch       3.946694
BsmtHalfBath      3.931148
MasVnrArea        2.602112
OpenPorchSF       2.535114
WoodDeckSF        1.842433
LotFrontage       1.563371
dtype: float64

In [None]:
# Keep only the features whose absolute skewness exceeds 1 ...
high_skew = skew_features[skew_features > 1]
# ... and remember their column names.
skew_index = high_skew.index
# Apply log(1 + x) to all of those columns in one step.
data[skew_index] = data[skew_index].apply(np.log1p)

In [None]:
 # Creating new features  based on previous observations...
# --- Aggregate size / count features -----------------------------------------
# NOTE(review): this runs after the log1p step, so some inputs (e.g. BsmtFinSF2,
# the porch areas, BsmtHalfBath, per the skew table above) are on a log scale
# while others are raw; the sums mix both scales. Confirm this is intended.
data['TotalSF'] = data['BsmtFinSF1'] + data['BsmtFinSF2'] + data['1stFlrSF'] + data['2ndFlrSF']
data['TotalBathrooms'] = (
    data['FullBath'] + (0.5 * data['HalfBath'])
    + data['BsmtFullBath'] + (0.5 * data['BsmtHalfBath'])
)
data['TotalPorchSF'] = data['OpenPorchSF'] + data['EnclosedPorch'] + data['ScreenPorch'] + data['WoodDeckSF']
data['YearBlRm'] = data['YearBuilt'] + data['YearRemodAdd']

# --- Merged quality / condition scores ---------------------------------------
# NOTE(review): the quality columns hold LabelEncoder codes at this point, which
# follow sorted class labels rather than the quality ranking; verify that adding
# them yields a meaningful score.
data['TotalExtQual'] = data['ExterQual'] + data['ExterCond']
data['TotalBsmQual'] = data['BsmtQual'] + data['BsmtCond'] + data['BsmtFinType1'] + data['BsmtFinType2']
data['TotalGrgQual'] = data['GarageQual'] + data['GarageCond']
data['TotalQual'] = (
    data['OverallQual'] + data['TotalExtQual'] + data['TotalBsmQual']
    + data['TotalGrgQual'] + data['KitchenQual'] + data['HeatingQC']
)

# --- Quality x size interactions ---------------------------------------------
data['QualGr'] = data['TotalQual'] * data['GrLivArea']
data['QualBsm'] = data['TotalBsmQual'] * (data['BsmtFinSF1'] + data['BsmtFinSF2'])
data['QualPorch'] = data['TotalExtQual'] * data['TotalPorchSF']
data['QualExt'] = data['TotalExtQual'] * data['MasVnrArea']
data['QualGrg'] = data['TotalGrgQual'] * data['GarageArea']
# NOTE(review): QlLivArea is the same product as QualGr; kept so the feature
# set is unchanged.
data['QlLivArea'] = (data['GrLivArea'] * data['TotalQual'])
data['QualSFNg'] = data['QualGr'] * data['Neighborhood']

# --- Binary "has_<feature>" flags --------------------------------------------
# 1 when the source column is strictly positive, else 0. Computed with a
# vectorized comparison instead of a per-row Python lambda.
binary_column = ['2ndFlrSF', 'QualGrg', 'Fireplaces', 'QualBsm', 'QualPorch', 'TotalPorchSF']
for col in binary_column:
    col_name = 'has_' + col
    data[col_name] = (data[col] > 0).astype(int)

In [None]:
# `data` was built as train rows followed by test rows, so the first
# len(train) rows are the training features. Deriving the boundary from
# `train` avoids a hard-coded row count, and .copy() makes X / X_test
# independent frames that can be assigned into without touching `data`.
n_train = len(train)
X = data.iloc[:n_train, :].copy()
X_test = data.iloc[n_train:, :].copy()

In [None]:
 # Scale the dataset
from sklearn.preprocessing import RobustScaler
 
# Numeric columns to scale.
cols = X.select_dtypes('number').columns

# Fit the scaler on the training features only, then apply the same fitted
# transform to the test features.
scaler = RobustScaler()
X[cols] = scaler.fit_transform(X[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labeled rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from xgboost import XGBRegressor
# XGBoost baseline with default hyper-parameters, seeded, using all cores.
xg = XGBRegressor(
    n_jobs=-1,
    random_state=42,
)

In [None]:
 from sklearn.metrics import mean_absolute_error

In [None]:
from catboost import CatBoostRegressor
# CatBoost baseline with default hyper-parameters, seeded, training log silenced.
cat = CatBoostRegressor(
    verbose=False,
    thread_count=-1,
    random_state=42,
)

In [None]:
from lightgbm import LGBMRegressor
# LightGBM baseline with default hyper-parameters, seeded, using all cores.
lg = LGBMRegressor(
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit each baseline on the training split and report its mean absolute error
# on the validation split.
baselines = [
    ("Model name: CatBoostRegressor", cat),
    ("Model name: XGBoostRegressor", xg),
    ("Model name: LGBMRegressor", lg),
]
for header, model in baselines:
    model.fit(X_train, y_train)
    print(header)
    val_mae = mean_absolute_error(y_val, model.predict(X_val))
    print("Model score:", val_mae)

Model name: CatBoostRegressor
Model score: 14888.545379453079
Model name: XGBoostRegressor
Model score: 16401.73058468893
Model name: LGBMRegressor
Model score: 16192.512272685179


In [None]:
# Import Ensemble models present in the scikit library
 
from sklearn.ensemble import VotingRegressor, StackingRegressor

 
    STACKINGREGRESSOR 
 
 Stack of estimators with a final regressor.
 
Stacked generalization consists of stacking the outputs of the individual estimators and using a regressor to compute the final prediction. Stacking makes it possible to exploit the strengths of each individual estimator by using their outputs as the input of a final estimator.
 
Note that estimators_ are fitted on the full X while final_estimator_ is trained using cross-validated predictions of the base estimators using cross_val_predict.
 
    VOTINGREGRESSOR 
A voting regressor is an ensemble meta-estimator that fits several base regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction.

In [None]:
 
# Build a VotingRegressor and a StackingRegressor over the same three base
# learners.


def _base_estimators():
    """Return fresh, unfitted (name, estimator) pairs for the three base models."""
    return [
        ("CatBoost", CatBoostRegressor(verbose=False, random_state=42)),
        ("lgbm", LGBMRegressor(n_jobs=-1, random_state=42)),
        ("Xgboost", XGBRegressor(random_state=42, n_jobs=-1)),
    ]


# Voting ensemble over the three base models.
vc = VotingRegressor(estimators=_base_estimators())

# Stacked ensemble: `cat` is passed as the final estimator; passthrough=True
# and 5-fold CV are set explicitly.
stack = StackingRegressor(
    estimators=_base_estimators(),
    final_estimator=cat,
    passthrough=True,
    cv=5,
)

In [None]:

# Checking their performances
# (This line was bare prose inside the code cell, which is a SyntaxError; it is
# now a comment.)

# Fit each ensemble on the training split and report validation MAE.
vc.fit(X_train, y_train)
print("Model name: VotingRegressor")
print("Model score:", mean_absolute_error(y_val, vc.predict(X_val)))

stack.fit(X_train, y_train)
print("Model name: StackingRegressor")
print("Model score:", mean_absolute_error(y_val, stack.predict(X_val)))

Model name: VotingRegressor
Model score: 15091.104341056787
Model name: StackingRegressor
Model score: 15183.614122451601


In [None]:
# CatBoost predictions on the test data, written out as a submission CSV.
# (The DataFrame call is now at column 0; the stray leading space raised an
# IndentationError.)
cat_pred = cat.predict(X_test)
pd.DataFrame(
    {"Id": test["Id"],
     "SalePrice": cat_pred}
).to_csv("cat-submission9999.csv", index=False)

In [None]:
# LightGBM predictions on the test data, written out as a submission CSV.
# (The DataFrame call is now at column 0; the stray leading space raised an
# IndentationError.)
lg_pred = lg.predict(X_test)
pd.DataFrame(
    {"Id": test["Id"],
     "SalePrice": lg_pred}
).to_csv("lg-submission9999.csv", index=False)

In [None]:
# VotingRegressor predictions on the test data, written out as a submission CSV.
# (The DataFrame call is now at column 0; the stray leading space raised an
# IndentationError.)
vc_pred = vc.predict(X_test)
pd.DataFrame(
    {"Id": test["Id"],
     "SalePrice": vc_pred}
).to_csv("vc-submission9999.csv", index=False)

In [None]:
# StackingRegressor predictions on the test data, written out as a submission CSV.
# (The DataFrame call is now at column 0; the stray leading space raised an
# IndentationError.)
stack_pred = stack.predict(X_test)
pd.DataFrame(
    {"Id": test["Id"],
     "SalePrice": stack_pred}
).to_csv("stack-submission9999.csv", index=False)

In [None]:

# Weighted blend of the two ensembles: 70% VotingRegressor, 30% StackingRegressor.
vc_stack = 0.7 * vc_pred + 0.3 * stack_pred

vc_stack_submission = pd.DataFrame({"Id": test["Id"], "SalePrice": vc_stack})
vc_stack_submission.to_csv("vc_stack-submission9999.csv", index=False)

In [None]:

# Weighted blend: 60% CatBoost, 40% VotingRegressor.
vc_cat = 0.6 * cat_pred + 0.4 * vc_pred

vc_cat_submission = pd.DataFrame({"Id": test["Id"], "SalePrice": vc_cat})
vc_cat_submission.to_csv("vc_cat-submission9999.csv", index=False)

In [None]:
 
# Plain average of the CatBoost and VotingRegressor predictions.
vc_cat1 = cat_pred*0.5 + vc_pred*0.5

# Write the averaged predictions (vc_cat1). The cell previously wrote `vc_cat`,
# i.e. the 60/40 blend, into this file by mistake.
pd.DataFrame(
    {"Id": test["Id"],
     "SalePrice": vc_cat1}
).to_csv("vc_cat-submission999912.csv", index = False)

In [None]:

# Final blend: 70% of the 60/40 blend (vc_cat) plus 30% of the plain average (vc_cat1).
final = vc_cat*0.7 + vc_cat1*0.3

# Write the final blend. The cell previously wrote `vc_cat` here, so `final`
# was computed but never saved.
pd.DataFrame(
    {"Id": test["Id"],
     "SalePrice": final}
).to_csv("vc_cat+vc_cat1-submission999912.csv", index = False)