In [None]:
library(ggplot2)
library(survival)
library(plyr)
library(dplyr)
library(stringr)
library(data.table)
library(tidyr)
library(corrplot)
library(Metrics)
library(caret)
library(dummies)
library(rpart)
library(rpart.plot)
library(e1071)
library(randomForest)
library(glmnet)
library(gbm)
library(Matrix)
library(iterators)
library(xgboost)
library(parallel)
library(parallelMap) 
library(caretEnsemble)
library(ensembleR)
library(caTools)
library(mlbench)
library(party)
library(ranger)
library(lars)

#  Reading in data

In [None]:
# Read the competition test and train CSVs, then stack them into one data
# frame; `data_split` records which file each row came from.
test <- read.csv("../input/house-prices-advanced-regression-techniques/test.csv")
train <- read.csv("../input/house-prices-advanced-regression-techniques/train.csv")

df <- bind_rows(
  mutate(train, data_split = "train"),
  mutate(test, data_split = "test")
)

In [None]:
# Show per-column summary statistics for the combined train + test frame.
summary(df)

In [None]:
# Mean-impute missing values, one column at a time.
# Only numeric/logical columns are processed: mean() of a character or factor
# column returns NA with a warning, so looping over those columns did nothing
# useful. seq_along() also stays safe if the frame has zero columns.
# NOTE(review): every numeric column is imputed, including SalePrice if it is
# still present here -- presumably that fills the test rows with the training
# mean; confirm this is intended.
for (i in seq_along(df)) {
  if (is.numeric(df[[i]]) || is.logical(df[[i]])) {
    df[is.na(df[[i]]), i] <- mean(df[[i]], na.rm = TRUE)
  }
}


# Plotting outcome variable

In [None]:
# Keep only the training rows and plot the raw SalePrice distribution.
train <- filter(df, data_split == "train")
ggplot(data = train, mapping = aes(x = SalePrice)) +
  geom_histogram()

The sale price is skewed to the right; this distribution can be improved through a transformation.

# Transforming outcome variable and feature engineering

In [None]:
# Log-transform the target and derive aggregate features from the raw columns.
t1.price <- log(df$SalePrice)
df <- df %>%
  mutate(
    Log_price = t1.price,
    # NOTE(review): TotalBsmtSF is presumably already the sum of the three
    # other basement columns, so this total may double-count. Left unchanged
    # because the outlier thresholds applied later are on this scale.
    Basementareasquarefeet = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + TotalBsmtSF,
    Garageage = YrSold - GarageYrBlt,
    Houseage = YrSold - YearBuilt,
    # "1" means the house WAS remodelled, i.e. the remodel year differs from
    # the build year. The original test used `==`, which inverted the flag.
    Remodelled = ifelse(YearRemodAdd != YearBuilt, "1", "0"),
    Porchareasqt = WoodDeckSF + OpenPorchSF + EnclosedPorch + ScreenPorch +
      X3SsnPorch,
    Firstandsecondflrsqft = X1stFlrSF + X2ndFlrSF,
    # Half baths count as 0.5.
    Totalnumberofbathrooms = BsmtFullBath + FullBath +
      (BsmtHalfBath * 0.5) + (HalfBath * 0.5),
    # Compare numerically: `YrSold >= "2008"` coerced the year to a string
    # and compared lexically.
    Postfinancialcrisis = ifelse(YrSold >= 2008, 1, 0)
  )

# Dropping unneeded columns after feature engineering and imbalanced categorical variables

In [None]:
# Drop columns that are no longer needed. Each name is listed exactly once
# (the original list repeated Electrical, SaleType, Exterior1st and
# Exterior2nd). subset() still errors if any listed column is absent.
df <- subset(df, select = -c(
  # raw inputs of the engineered features, and the untransformed target
  BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF,
  WoodDeckSF, OpenPorchSF, EnclosedPorch, ScreenPorch, X3SsnPorch,
  X1stFlrSF, X2ndFlrSF,
  BsmtFullBath, BsmtHalfBath, FullBath, HalfBath,
  YearRemodAdd, YearBuilt, GarageYrBlt, YrSold,
  SalePrice,
  # remaining columns the notebook discards
  PoolQC, MiscFeature, Utilities, Alley, PoolArea, MiscVal, LowQualFinSF,
  Exterior1st, Exterior2nd, HeatingQC, Fence, SaleType,
  GarageCond, GarageQual, GarageFinish, GarageType,
  Electrical, LandSlope, Functional, Heating,
  BsmtFinType2, BsmtFinType1, BsmtExposure, BsmtCond, BsmtQual,
  Condition2, Condition1, KitchenQual,
  MSZoning, Street, RoofMatl, MasVnrType, FireplaceQu
))


In [None]:
# Show per-column summary statistics again, now for the reduced column set.
summary(df)

# Plotting distribution of continuous variables

In [None]:
# Refresh the training subset (it now carries the engineered columns) and
# plot the log-transformed target.
train <- filter(df, data_split == "train")
ggplot(data = train, mapping = aes(x = Log_price)) +
  geom_histogram()

The sale price distribution has improved slightly after the log transformation; the outliers on the left are to be removed.

In [None]:
# Histogram of the combined basement area feature (train + test rows).
ggplot(data = df, mapping = aes(x = Basementareasquarefeet)) +
  geom_histogram()


In [None]:
# Histogram of first + second floor area (train + test rows).
ggplot(data = df, mapping = aes(x = Firstandsecondflrsqft)) +
  geom_histogram()


In [None]:
# Histogram of lot area (train + test rows).
ggplot(data = df, mapping = aes(x = LotArea)) +
  geom_histogram()

In [None]:
# Histogram of the combined porch/deck area feature (train + test rows).
ggplot(data = df, mapping = aes(x = Porchareasqt)) +
  geom_histogram()

In [None]:
# Histogram of the month of sale (train + test rows).
ggplot(data = df, mapping = aes(x = MoSold)) +
  geom_histogram()

Some of the distributions plotted above have outliers, which can skew the predictions; these outliers are excluded in the next step.

# Converting variables to numeric and excluding outliers

In [None]:
# Coerce the four area features to double in one pass.
area_cols <- c(
  "Basementareasquarefeet",
  "Firstandsecondflrsqft",
  "LotArea",
  "Porchareasqt"
)
df[area_cols] <- lapply(df[area_cols], as.numeric)

In [None]:
# Remove outlying observations from the TRAINING rows only.
# The original chain of filters ran on the combined frame, so test rows that
# exceeded a threshold were silently dropped as well; an observation to be
# predicted should never be discarded for being unusual. Test rows now always
# pass (`TRUE | NA` is TRUE); training rows must satisfy every condition, and
# a training row with an NA in any condition is still dropped, as before.
df <- df %>%
  filter(
    data_split == "test" |
      (Firstandsecondflrsqft <= 3000 &
        Basementareasquarefeet > 500 & Basementareasquarefeet <= 4500 &
        Log_price >= 11 &
        LotArea <= 28000 &
        GarageArea > 0 & GarageArea < 1200 &
        MasVnrArea <= 500 &
        LotFrontage <= 120 &
        Porchareasqt <= 750 &
        Houseage >= 0)
  )

# Correlation of continuous variables

In [None]:

# Correlation matrix of the numeric features over the training rows.
# The rows are taken from the current `df` rather than from the earlier
# `train` snapshot, which was created before the outlier filter and therefore
# no longer matches the data that is modelled.
trainnumerical <- subset(
  df,
  subset = data_split == "train",
  select = c(
    LotFrontage, LotArea, MasVnrArea, GarageCars, GarageArea,
    Totalnumberofbathrooms, Log_price, Basementareasquarefeet, Garageage,
    Houseage, Porchareasqt, Firstandsecondflrsqft, OverallQual, OverallCond,
    GrLivArea, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces
  )
)
# Coefficients are rounded to one decimal before plotting.
corr <- round(cor(trainnumerical), 1)

corrplot(corr, method = 'color', order = 'alphabet')

* Garage area and the age of the house have a strong negative correlation with the log of the house sale price.
* Overall house condition, GrLivArea, and first and second floor square feet have a strong positive correlation with the house sale price.
* There are strong correlations amongst some variables, e.g. garage year built and garage age, or house age and year built; one variable of each such pair will be dropped.

# Splitting out training and testing sets

In [None]:
# Split the cleaned frame back into its training and test parts.
train1 <- filter(df, data_split == "train")
test <- filter(df, data_split == "test")

# Creating validating set from training set

In [None]:
# Hold out 25% of the training rows as a validation set.
set.seed(123)
train1 <- subset(train1, select = -c(data_split))

# sample.split() takes a vector with one element per observation and returns
# a logical mask of the same length. Passing the whole data frame (as before)
# produced a mask with one entry per COLUMN, which subset() then recycled down
# the rows, giving a repeating pattern instead of a random 75/25 split.
# Row indices are used as the vector so that every observation has a distinct
# label and the split is a plain random one.
# The mask is no longer stored in a variable called `sample`, which masked
# base::sample().
split_mask <- sample.split(seq_len(nrow(train1)), SplitRatio = 0.75)
train2 <- subset(train1, split_mask == TRUE)
validation <- subset(train1, split_mask == FALSE)

# Predictors and (Id, target) of the validation rows, kept separately.
validationx <- subset(validation, select = -c(Log_price))
validationy <- subset(validation, select = c(Id, Log_price))
validationy$Log_price <- as.numeric(validationy$Log_price)

# LM modelling and validation

In [None]:

# Ordinary least squares on all predictors, scored by validation RMSE.
# Id is a row identifier, not a feature, so it is left out of the fit (the
# lasso cell already excludes it; the models are compared on RMSE, so they
# should see the same predictors). predict() ignores the extra Id column in
# `validationx`.
model1 <- lm(
  Log_price ~ .,
  data = train2[, setdiff(names(train2), "Id"), drop = FALSE]
)
pred1 <- predict(model1, validationx)
result <- rmse(validationy$Log_price, pred1)
result


# Random forest modelling and validation

In [None]:
# Random forest (500 trees) on all predictors, scored by validation RMSE.
# Id is a row identifier, not a feature, so it is left out of the fit, to
# match the lasso cell, which already excludes it.
model2 <- randomForest(
  Log_price ~ .,
  data = train2[, setdiff(names(train2), "Id"), drop = FALSE],
  ntree = 500
)
pred2 <- predict(model2, validationx)
result1 <- rmse(validationy$Log_price, pred2)
result1

# XG boosting modelling and validation

In [None]:

# Gradient-boosted trees on an all-numeric design matrix, scored by
# validation RMSE.
trainx <- subset(train2, select = -c(Log_price))
trainy <- subset(train2, select = c(Log_price))

# NOTE(review): as.numeric() turns a character column into NA (with a
# warning) and a factor column into its integer codes; confirm which of the
# two the categorical columns are at this point.
trainx <- data.frame(lapply(trainx, as.numeric))
trainy <- data.frame(lapply(trainy, as.numeric))

# `validationxm` keeps every column (Id included) because the lasso cell
# reuses it.
validationxm <- data.frame(lapply(validationx, as.numeric))

# Id is a row identifier, not a feature: leave it out of both matrices so the
# model sees the same predictors as the lasso cell.
xgb_features <- setdiff(names(trainx), "Id")
trainxm <- as.matrix(trainx[, xgb_features, drop = FALSE])
trainym <- as.matrix(trainy)
validationxg <- as.matrix(validationxm[, xgb_features, drop = FALSE])

model3 <- xgboost(
  data = trainxm,
  label = trainym,
  booster = "gbtree",
  # "reg:linear" is the deprecated name of the squared-error objective.
  objective = "reg:squarederror",
  eval_metric = "rmse",
  # Spelled out in full; `nround` relied on partial argument matching.
  nrounds = 100,
  max_depth = 3,
  early_stopping_rounds = 40
)

pred3 <- predict(model3, validationxg)
result2 <- rmse(validationy$Log_price, pred3)
# Display the score, as the other model cells do.
result2

# Lasso regression modelling and validation

In [None]:
# Lasso regression (glmnet, alpha = 1): lambda is tuned by 5-fold
# cross-validation, then the chosen model is scored on the validation rows.

# All-numeric copies of the training frame and the validation targets.
train3 <- data.frame(lapply(train2, as.numeric))
validationy <- data.frame(lapply(validationy, as.numeric))

# Resampling scheme and tuning grid.
set.seed(123)
control <- trainControl(method = "cv", number = 5)
Grid_la_reg <- expand.grid(
  alpha = 1,
  lambda = seq(from = 0.001, to = 0.1, by = 0.0002)
)

# Fit on every column except the identifier and the target.
lasso_model <- train(
  x = train3[, !(names(train3) %in% c("Id", "Log_price")), drop = FALSE],
  y = train3$Log_price,
  method = "glmnet",
  trControl = control,
  tuneGrid = Grid_la_reg
)

# Validation RMSE.
pred4 <- predict(lasso_model, validationxm)
result3 <- rmse(validationy$Log_price, pred4)
result3


Model performance using RMSE evaluation is in this order: XGBoost, linear modelling, lasso, random forest.

# Exporting data for final modelling



In [None]:
# Re-stack the cleaned training and test rows, label each row's origin in
# `type`, and write the result to disk for the final modelling step.
dffinal <- bind_rows(
  mutate(train1, type = "train"),
  mutate(test, type = "test")
)
write.csv(dffinal, file = "dffinalrevised.csv")
