![My Title](input/housesbanner.png)

# House Prices: Advanced Regression Techniques

This notebook focuses on feature selection and model building.

In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

# import plotly_express as px

from scipy import stats
from scipy.stats import norm, skew 

# import pandas_profiling
from sklearn.feature_selection import SelectKBest,SelectFromModel,RFE,chi2,mutual_info_classif,f_classif

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC,LinearRegression
 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import lightgbm as lgb

from sklearn.linear_model import Lasso,Ridge
# Managing Warnings 
import warnings
warnings.filterwarnings('ignore')

# Plot the Figures Inline
%matplotlib inline

# Show every column when displaying wide frames. Use the documented
# top-level `pd.set_option`; `pd.pandas` only resolves through an incidental
# self-reference inside the pandas package.
pd.set_option('display.max_columns', None)

# Read Data

In [2]:
# Load the pre-processed train/test frames and report their shapes.
df_train, df_test = map(
    pd.read_csv,
    ('./output/train_modified.csv', './output/test_modified.csv'),
)
for frame in (df_train, df_test):
    print(frame.shape)

df_train.head()

(1460, 81)
(1459, 80)


Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,208500,0.235294,0.75,0.150685,0.03342,1.0,0.5,1.0,1.0,0.0,1.0,0.0,0.208333,0.25,0.285714,0.0,0.714286,0.666667,0.5,0.036765,0.098361,0.2,0.142857,0.857143,0.866667,0.25,0.1225,0.666667,1.0,0.4,0.5,1.0,1.0,0.333333,0.125089,1.0,0.0,0.064212,0.140098,0.2,0.0,1.0,1.0,0.11978,0.413559,0.0,0.259231,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.5,1.0,0.0,0.6,0.166667,0.036765,0.666667,0.5,0.38646,1.0,1.0,1.0,0.0,0.111517,0.0,0.0,0.0,0.0,1.0,1.0,0.25,0.0,0.090909,0.5,1.0,0.8
1,2,181500,0.0,0.75,0.202055,0.038795,1.0,0.5,1.0,1.0,0.0,0.5,0.0,1.0,0.125,0.285714,0.0,0.285714,0.555556,0.875,0.227941,0.52459,0.2,0.142857,0.571429,0.533333,0.75,0.0,1.0,1.0,0.2,0.5,1.0,0.25,0.0,0.173281,1.0,0.0,0.121575,0.206547,0.2,0.0,1.0,1.0,0.212942,0.0,0.0,0.17483,0.0,0.5,0.666667,0.0,0.375,0.333333,1.0,0.333333,1.0,0.333333,1.0,0.166667,0.227941,0.666667,0.5,0.324401,1.0,1.0,1.0,0.347725,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.25,0.0,0.363636,0.25,1.0,0.8
2,3,223500,0.235294,0.75,0.160959,0.046507,1.0,0.5,0.0,1.0,0.0,1.0,0.0,0.208333,0.25,0.285714,0.0,0.714286,0.666667,0.5,0.051471,0.114754,0.2,0.142857,0.857143,0.866667,0.25,0.10125,0.666667,1.0,0.4,0.5,1.0,0.5,0.333333,0.086109,1.0,0.0,0.185788,0.150573,0.2,0.0,1.0,1.0,0.134465,0.41937,0.0,0.273549,0.333333,0.0,0.666667,0.5,0.375,0.333333,0.666667,0.333333,1.0,0.333333,1.0,0.166667,0.051471,0.666667,0.5,0.428773,1.0,1.0,1.0,0.0,0.076782,0.0,0.0,0.0,0.0,1.0,1.0,0.25,0.0,0.727273,0.5,1.0,0.8
3,4,140000,0.294118,0.75,0.133562,0.038561,1.0,0.5,0.0,1.0,0.0,0.0,0.0,0.25,0.25,0.285714,0.0,0.714286,0.666667,0.5,0.669118,0.606557,0.2,0.142857,0.928571,1.0,0.75,0.0,1.0,1.0,0.0,1.0,0.25,1.0,0.0,0.038271,1.0,0.0,0.231164,0.123732,0.2,0.5,1.0,1.0,0.143873,0.366102,0.0,0.26055,0.333333,0.0,0.333333,0.0,0.375,0.333333,0.666667,0.416667,1.0,0.333333,0.4,0.833333,0.058824,1.0,0.75,0.45275,1.0,1.0,1.0,0.0,0.063985,0.492754,0.0,0.0,0.0,1.0,1.0,0.25,0.0,0.090909,0.0,1.0,0.0
4,5,250000,0.235294,0.75,0.215753,0.060576,1.0,0.5,0.0,1.0,0.0,0.5,0.0,0.625,0.25,0.285714,0.0,0.714286,0.777778,0.5,0.058824,0.147541,0.2,0.142857,0.857143,0.866667,0.25,0.21875,0.666667,1.0,0.4,0.5,1.0,0.0,0.333333,0.116052,1.0,0.0,0.20976,0.187398,0.2,0.0,1.0,1.0,0.186095,0.509927,0.0,0.351168,0.333333,0.0,0.666667,0.5,0.5,0.333333,0.666667,0.583333,1.0,0.333333,1.0,0.166667,0.058824,0.666667,0.75,0.589563,1.0,1.0,1.0,0.224037,0.153565,0.0,0.0,0.0,0.0,1.0,1.0,0.25,0.0,1.0,0.5,1.0,0.8


# Feature Selection

## Using scikit-learn methods

In [3]:
# Base model whose feature importances drive the default selector. The seed is
# fixed so the set of selected columns is reproducible across kernel restarts.
estimator=RandomForestRegressor(n_estimators=100, random_state=42)
def FeatureSelection(X_train,y_train,selector=None):
    """Fit a feature selector and return the reduced training frame.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Candidate features; column names are used to label the output.
    y_train : array-like
        Target values passed to the selector's fit step.
    selector : scikit-learn selector, optional
        Any unfitted selector exposing ``fit_transform`` and ``get_support``
        (for example ``RFE(estimator, n_features_to_select=8, step=1)``).
        Defaults to ``SelectFromModel(estimator)`` using the module-level
        random forest.

    Returns
    -------
    X_train_new_df : pandas.DataFrame
        The selected columns only, keeping the index of ``X_train``.
    selector
        The fitted selector, reusable to transform other frames that have the
        same columns as ``X_train``.
    cols : numpy.ndarray
        Names of the selected columns.
    """
    if selector is None:
        selector = SelectFromModel(estimator)

    X_train_new = selector.fit_transform(X_train, y_train)
    # get_support() is a boolean mask aligned with the input columns.
    cols = X_train.columns.values[selector.get_support()]
    X_train_new_df = pd.DataFrame(X_train_new, index=X_train.index, columns=cols)
    return X_train_new_df,selector,cols

# Separate the target from the candidate features, then keep only the columns
# chosen by the selector.
# NOTE(review): only 'SalePrice' is dropped, so the 'Id' column is still among
# the candidate features — confirm this is intended.
y = df_train['SalePrice']
X, selector, cols = FeatureSelection(df_train.drop(columns='SalePrice'), y)
print(X.shape)

(1460, 8)


## Using highly correlated features

In [4]:
# df_train_corr = df_train.corr()
# X=df_train.drop('SalePrice',axis=1)
# y=df_train['SalePrice']

# top_correlated_features=df_train_corr[df_train_corr['SalePrice']>0.1]['SalePrice'].sort_values(ascending=False)
# cols=top_correlated_features.index.values[1:]
# X=X[cols]
# X.shape

# Train Models

In [5]:
def RMSE(y_real,y_pred):
    """Root-mean-squared error between log1p-transformed values.

    Both inputs are passed through ``abs`` first, so negative values are
    scored by their magnitude rather than failing inside ``log1p``.
    The result is rounded to 4 decimal places.
    """
    log_real = np.log1p(abs(y_real))
    log_pred = np.log1p(abs(y_pred))
    squared_error = mean_squared_error(log_real, log_pred)
    return round(np.sqrt(squared_error), 4)

# Hold out 20% of the training data for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True,random_state=42)

# Candidate regressors, scored in this order with the same hyper-parameters
# for each. Every model is fitted exactly once inside the loop below.
models = [
    ('LinearRegression', LinearRegression()),
    ('Lasso', Lasso(alpha=0.1)),
    ('Ridge', Ridge(alpha=.5)),
    ('RandomForestRegressor', RandomForestRegressor(random_state=10)),
    ('GradientBoostingRegressor', GradientBoostingRegressor(n_estimators=100, max_depth=4)),
    ('LGBMRegressor', LGBMRegressor()),
]

# Collect plain records and build the frame once: DataFrame.append was removed
# in pandas 2.0, and this also gives each row a distinct index.
results = []
for model_name, reg in models:
    reg = reg.fit(X_train, y_train)
    y_pred = reg.predict(X_val)
    rmse = RMSE(y_val, y_pred)
    results.append({'Model': model_name, 'RMSE': rmse})

df_result = pd.DataFrame(results, columns=['Model','RMSE'])

df_result.sort_values(by='RMSE').head(10)

Unnamed: 0,Model,RMSE
0,RandomForestRegressor,0.1658
0,GradientBoostingRegressor,0.1663
0,LGBMRegressor,0.1697
0,Ridge,0.2884
0,LinearRegression,0.3297
0,Lasso,0.3297


# Hyperparameter Tuning

In [6]:
# from sklearn.datasets import load_boston
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [50,  100],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100,  1000]
# }

# rf = RandomForestRegressor()
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, verbose = False)

# grid_search.fit(X_train, y_train)

# reg = grid_search.best_estimator_.fit(X_train,y_train)
# y_pred = grid_search.best_estimator_.predict(X_val)
# rmse = RMSE(y_val,y_pred)
# df_result = df_result.append(pd.DataFrame([['Tuning RandomForestRegressor', rmse]], columns=df_result.columns))

# df_result.sort_values(by='RMSE').head(10)

In [7]:
# grid_search.best_estimator_

# Submission
Submissions are evaluated on **Root-Mean-Squared-Error (RMSE)** between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

In [10]:
# Reduce the test set with the selector fitted on the training data, and wrap
# the result in a DataFrame with the selected column names so it matches the
# named columns of X that the final model is fitted on.
X_test = pd.DataFrame(selector.transform(df_test), index=df_test.index, columns=cols)

# Refit on the full selected training set (train + validation rows). The seed
# is fixed so the submission file is reproducible.
reg = GradientBoostingRegressor(n_estimators=100, max_depth=4, random_state=42).fit(X, y)

y_pred = reg.predict(X_test)

submission = pd.DataFrame({"Id": df_test["Id"],"SalePrice": y_pred})
submission.to_csv('./output/submission.csv', index=False)

print("Mission was done!!!")

Mission was done!!!
