In [3]:
library(ggplot2)
library(survival)
library(plyr)
library(dplyr)
library(stringr)
library(data.table)
library(tidyr)
library(corrplot)
library(Metrics)
library(caret)
library(dummies)
library(rpart)
library(rpart.plot)
library(e1071)
library(randomForest)
library(glmnet)
library(gbm)
library(Matrix)
library(iterators)
library(parallel)
library(xgboost)
library(parallel)
library(parallelMap) 
library(caretEnsemble)
library(ensembleR)
library(caTools)
library(mlbench)
library(party)
library(ranger)
library(lars)

# Reading in cleaned data file

In [4]:
df <- read.csv("../input/housepricesfinalcleaneddf/dffinalrevised.csv")

# Splitting dataframe into test and train

In [5]:
train <- df %>% filter(type =="train")
test <- df %>% filter(type =="test")

# Dropping unneeded colums

In [6]:
test <-subset(test,select = -c(Log_price))
test <-subset(test,select = -c(type))
train <-subset(train,select = -c(type))
test <-subset(test,select = -c(ExterCond))
train <-subset(train,select = -c(ExterCond))

# Fitting LM and predicting

In [7]:
model1 = lm(Log_price ~., data = train)
pred1 <- predict(model1,test)
model1pred<- data.frame('Id' = test$Id, 'Saleprice' = pred1)
write.csv(model1pred, file = "model1pred.csv")
summary(model1pred)

       Id         Saleprice    
 Min.   :1461   Min.   :10.99  
 1st Qu.:1817   1st Qu.:11.77  
 Median :2174   Median :11.98  
 Mean   :2181   Mean   :12.02  
 3rd Qu.:2535   3rd Qu.:12.25  
 Max.   :2919   Max.   :13.09  

# Fitting RF and predicting

In [8]:
model2 <- randomForest(Log_price ~.,data = train,ntree = 500)
pred2 <- predict(model2,test)
model2pred <- data.frame('Id' = test$Id, 'Saleprice' = pred2)
write.csv(model2pred, file = "model2pred.csv")
summary(model2pred)

       Id         Saleprice    
 Min.   :1461   Min.   :11.26  
 1st Qu.:1817   1st Qu.:11.80  
 Median :2174   Median :11.97  
 Mean   :2181   Mean   :12.02  
 3rd Qu.:2535   3rd Qu.:12.21  
 Max.   :2919   Max.   :12.99  

# Fitting XGboost and predicting

In [9]:

trainx <-subset(train,select = -c(Log_price))
trainy <- subset(train, select = c(Log_price))

trainx <- data.frame(lapply(trainx, as.numeric))
trainy <- data.frame(lapply(trainy, as.numeric))

testx <-subset(test)
testx <- data.frame(testx)
testx <- data.frame(lapply(testx, as.numeric))

trainx <- as.matrix(trainx)
trainy <- as.matrix(trainy)
testx <- as.matrix(testx)

model3 <- xgboost(data = trainx, 
                  booster="gbtree",
                  objective = "reg:linear",
                  eval_metric = "rmse",
                  nround = 100,
                  max.depth = 3,
                  label = trainy,
                  early_stopping_rounds = 40)

pred3 <- predict(model3,testx)
model3pred <- data.frame('Id' = test$Id, 'Salesprice' = pred3)
write.csv(model3pred, file = "model3pred.csv")
summary(model3pred)

[1]	train-rmse:8.084595 
Will train until train_rmse hasn't improved in 40 rounds.

[2]	train-rmse:5.666029 
[3]	train-rmse:3.972733 
[4]	train-rmse:2.787571 
[5]	train-rmse:1.958262 
[6]	train-rmse:1.378321 
[7]	train-rmse:0.973147 
[8]	train-rmse:0.690701 
[9]	train-rmse:0.494691 
[10]	train-rmse:0.359132 
[11]	train-rmse:0.267003 
[12]	train-rmse:0.205587 
[13]	train-rmse:0.165575 
[14]	train-rmse:0.140960 
[15]	train-rmse:0.125695 
[16]	train-rmse:0.116497 
[17]	train-rmse:0.110683 
[18]	train-rmse:0.106977 
[19]	train-rmse:0.104814 
[20]	train-rmse:0.102373 
[21]	train-rmse:0.100742 
[22]	train-rmse:0.099202 
[23]	train-rmse:0.097379 
[24]	train-rmse:0.096191 
[25]	train-rmse:0.094919 
[26]	train-rmse:0.093841 
[27]	train-rmse:0.092282 
[28]	train-rmse:0.091308 
[29]	train-rmse:0.090286 
[30]	train-rmse:0.089684 
[31]	train-rmse:0.088561 
[32]	train-rmse:0.087289 
[33]	train-rmse:0.086085 
[34]	train-rmse:0.085005 
[35]	train-rmse:0.084700 
[36]	train-rmse:0.084205 
[37]	train-rms

       Id         Salesprice   
 Min.   :1461   Min.   :10.95  
 1st Qu.:1817   1st Qu.:11.78  
 Median :2174   Median :11.98  
 Mean   :2181   Mean   :12.02  
 3rd Qu.:2535   3rd Qu.:12.24  
 Max.   :2919   Max.   :13.21  

# Fitting lasso and predicting

In [10]:
train1 <-subset(train)
train1 <- data.frame(lapply(train1, as.numeric))

test1 <- data.frame(test)
test1 <- data.frame(lapply(test1, as.numeric))

# Model Building :Lasso Regression
set.seed(123)
control = trainControl(method ="cv", number = 5)
Grid_la_reg = expand.grid(alpha = 1,
              lambda = seq(0.001, 0.1, by = 0.0002))
  
# Training lasso regression model
lasso_model = train(x = subset(train1, select =-c(Id,Log_price)),
                    y = train1$Log_price,
                    method = "glmnet",
                    trControl = control,
                    tuneGrid = Grid_la_reg
                    )
pred4 <- predict(lasso_model,test1)
model4pred <- data.frame('Id' = test$Id, 'Salesprice' = pred4)
write.csv(model4pred, file = "model4pred.csv")
summary(model4pred)

       Id         Salesprice   
 Min.   :1461   Min.   :10.99  
 1st Qu.:1817   1st Qu.:11.77  
 Median :2174   Median :11.99  
 Mean   :2181   Mean   :12.02  
 3rd Qu.:2535   3rd Qu.:12.24  
 Max.   :2919   Max.   :13.06  

# Blending the different models

Since XGBoost and linear modelling performed better in the validation, I am multiplying the output the output of RF and lasso by some factor then getting the average prediction

In [None]:
lasso <- summary(model4pred)
lasso
xgboost <- summary(model3pred)
xgboost
rf <- summary(model2pred)
rf
lm <- summary(model1pred)
lm
model2improved = model2pred$Saleprice*1.005
rfimproved <- summary(model2improved)
model2improved

write.csv(model4pred, file = "lasso1.csv")
write.csv(model3pred, file = "xgboost1.csv")
write.csv(model1pred, file = "lm1.csv")
write.csv(model2improved, file = "rfimproved1.csv")
