-
Notifications
You must be signed in to change notification settings - Fork 0
/
House_Price.Rmd
132 lines (98 loc) · 3.82 KB
/
House_Price.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
Let us solve a regression problem using a neural network.
# The implementation demonstrates a neural network (NN) using the
# House Prices dataset from Kaggle.
# The primary aim of this code is to implement a neural network for the House Price problem.
# Therefore, only 5 independent features are used. A robust implementation must also consider
# feature engineering, data cleaning, and cross-validation.
require(data.table)
require(stringr)
require(lubridate)
require(zoo)
require(lightgbm)
# Load the Kaggle House Price data and keep only the columns used by the
# network. To keep the neural network example simple, the model uses the
# target plus five predictors (chosen by learning from other Kagglers):
#   target:     "SalePrice"
#   predictors: "OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars",
#               "FullBath"
# test.csv is reduced to the same predictors; it is not asked for SalePrice.
train <- read.csv("train.csv")[c(
  "OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars",
  "FullBath", "SalePrice"
)]
test <- read.csv("test.csv")[setdiff(names(train), "SalePrice")]

# Unscaled copy of the training data; its SalePrice range is needed later to
# convert scaled predictions back to prices.
train_o <- train
# DATA CLEANING ----
# Inspect every selected column of both data sets in one go. summary() on a
# data frame also reports the number of NAs per column, which is how gaps in
# the data show up. (test has no SalePrice column, so there is nothing to
# inspect for it there.)
summary(train)
summary(test)

# Replace missing values in test with the median of the affected column.
# The median is computed from the data rather than typed in by hand, so it
# stays correct if the input file changes; columns without NAs are unchanged.
test[] <- lapply(test, function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
})

# Confirm that no missing values remain in test.
summary(test)

# Unscaled copy of the training data, taken before normalization; its
# SalePrice range is used to scale predictions back to prices.
train_o <- train
#SCALING OR NORMALIZATION
# Normalization brings all the values into the required range.
# For this problem, the range is 0 to 1. Therefore, after scaling,
# all the values in the selected dataset should fall between 0 and 1.
# A user-defined function to scale
# Min-max scale a numeric vector to the range [0, 1].
#
# Args:
#   x: numeric vector.
# Returns:
#   A vector of the same length as x with (x - min) / (max - min) applied.
#   Missing values are ignored when finding min and max and stay NA in the
#   result. A constant vector maps to zeros (instead of 0 / 0 = NaN), and an
#   empty or all-NA vector is returned unchanged.
UDF <- function(x) {
  if (all(is.na(x))) {
    # Nothing to scale; also avoids min()/max() warnings on empty input.
    return(x)
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (span == 0) {
    # Every observed value is identical: avoid dividing by zero.
    return(x - lo)
  }
  (x - lo) / span
}
# Scale test with the *training* ranges, not its own: the network learns on
# features scaled by the training min/max, so new data must go through the
# same mapping or identical raw values would reach the model as different
# inputs. The ranges are taken before train itself is overwritten below.
# Test values outside the training range can land slightly outside [0, 1].
feature_cols <- colnames(test)
train_min <- vapply(train[feature_cols], min, numeric(1), na.rm = TRUE)
train_max <- vapply(train[feature_cols], max, numeric(1), na.rm = TRUE)
test <- as.data.frame(
  scale(test, center = train_min, scale = train_max - train_min)
)

# Scale every training column (predictors and SalePrice) to [0, 1].
# lapply() works column by column without coercing the data frame to a matrix.
train <- as.data.frame(lapply(train, UDF))
# Splitting the data ----
# Fix the random seed so the split below is the same on every run; without
# it each run trains on different rows and results cannot be reproduced.
set.seed(123)

# 60% of the training rows are used to fit the model; the remaining 40% are
# kept in test.wp as a hold-out set.
index <- sample(nrow(train), round(0.6 * nrow(train)))
train.wp <- train[index, ]
test.wp <- train[-index, ]
# MODEL ----
library(neuralnet)

# Build the formula "SalePrice ~ <every other column of train>".
allVars <- colnames(train)
predictorVars <- paste(allVars[allVars != "SalePrice"], collapse = "+")
form <- as.formula(paste("SalePrice~", predictorVars))

# Fit the network on the training split: two hidden layers (4 and 2 units)
# and a linear output, as appropriate for regression.
nn_model <- neuralnet(
  formula = form,
  train.wp,
  hidden = c(4, 2),
  linear.output = TRUE
)

# Print the network's outputs for the training rows (the fitted weights
# themselves are stored in nn_model$weights), then draw the network.
nn_model$net.result
plot(nn_model)

# PREDICTION ----
# Run the scaled test features through the fitted network; the predictions
# are in prediction1$net.result.
prediction1 <- compute(nn_model, test)
str(prediction1)
# UDF: Convert the scaled values to original
# Convert min-max scaled predictions back to the original SalePrice scale.
#
# Args:
#   prediction: either a list with a net.result element (as returned by
#     compute()) or the scaled values themselves. Defaults to the global
#     prediction1, which is what the function always read before, so calls
#     without an argument keep working.
#   sale_price: unscaled SalePrice values that define the original range.
#     Defaults to train_o$SalePrice.
# Returns:
#   The predictions mapped back via scaled * (max - min) + min, with the same
#   shape as the scaled input.
UDF_2 <- function(prediction = prediction1, sale_price = train_o$SalePrice) {
  # Accept the whole compute() result as well as the bare values.
  scaled <- prediction
  if (is.list(prediction) && !is.null(prediction[["net.result"]])) {
    scaled <- prediction[["net.result"]]
  }
  lo <- min(sale_price, na.rm = TRUE)
  hi <- max(sale_price, na.rm = TRUE)
  scaled * (hi - lo) + lo
}
# Map the scaled network output back to prices with the helper defined above
# instead of repeating its formula here.
ActualPrediction <- UDF_2(prediction1)

# Predictions are continuous, so a frequency table would list almost every
# value once; a five-number summary is the useful sanity check.
summary(as.vector(ActualPrediction))

# Build the submission. Ids are generated to match the number of predictions
# rather than a fixed 1461:2919 range, so a different number of test rows
# does not break data.frame().
# NOTE(review): assumes the test Ids are consecutive and start at 1461, as
# the original hard-coded range implied - confirm against the Id column of
# test.csv.
submit.df <- data.frame(
  Id = 1460L + seq_len(NROW(ActualPrediction)),
  SalePrice = as.vector(ActualPrediction)
)
write.csv(submit.df, file = "Submission_20171130_4.csv", row.names = FALSE)
# Correlation heat map of the selected variables ----
library(reshape2)
library(ggplot2)

# Pairwise correlations between the six selected columns, rounded to two
# decimals for display.
mydata <- train[c(
  "OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars",
  "FullBath", "SalePrice"
)]
train_ <- round(cor(mydata), digits = 2)
head(train_)

# Reshape the correlation matrix to long form (one row per variable pair:
# Var1, Var2, value) so that it can be drawn as tiles.
melted_train <- melt(train_)
head(melted_train)

ggplot(melted_train, aes(x = Var1, y = Var2, fill = value)) + geom_tile()

head(mydata)