# House_Price.Rmd

# Let us solve a regression problem using a neural network (NN).
# The implementation demonstrates an NN using the
# Housing Price dataset from Kaggle.

# The primary aim of this code is to implement a neural network for the House Price problem.
# Therefore, only 5 independent features were used. A robust implementation must also consider
# feature engineering, data cleaning, and cross-validation.

require(data.table)
require(stringr)
require(lubridate)
require(zoo)
require(lightgbm)


# Load the training and test sets from CSV files in the working directory.
train <- read.csv(file = "train.csv", header = TRUE)
test <- read.csv(file = "test.csv", header = TRUE)


# This implementation of Kaggle's House Price problem
# only considers 6 columns to simplify the neural network implementation:
# the target plus five features (selected by learning from other Kagglers).
# 1. "SalePrice" (the target),
# 2. "OverallQual",
# 3. "GrLivArea",
# 4. "TotalBsmtSF",
# 5. "GarageCars",
# 6. "FullBath"

# Keep only the modelling columns: the five predictors in both sets, plus the
# SalePrice target in the training set.
train <- train[c("OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars",
                 "FullBath", "SalePrice")]

test <- test[c("OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars",
               "FullBath")]

# Snapshot of the unscaled training data; its SalePrice range is needed later
# to convert scaled predictions back to prices.
train_o <- train
# DATA CLEANING:

## Check the training data. summary() adds an "NA's" entry for any column that
## contains missing values.
summary(train$SalePrice) # CLEAN
summary(train$OverallQual) # CLEAN
summary(train$GrLivArea) # CLEAN
summary(train$TotalBsmtSF) # CLEAN
summary(train$GarageCars) # CLEAN
summary(train$FullBath)

## Check the test data. SalePrice is not summarised here: `test` only keeps the
## five predictor columns, so test$SalePrice would be NULL.
summary(test$OverallQual) # CLEAN
summary(test$GrLivArea) # CLEAN
summary(test$TotalBsmtSF)
summary(test$GarageCars)
summary(test$FullBath) # CLEAN

## Explicit count of missing values per test column, so the columns that need
## imputation are visible at a glance.
colSums(is.na(test))

# Replace missing values with the column median. The median is computed from
# the observed (non-missing) values rather than hard-coded, so the imputation
# stays correct if the input file changes.
summary(test$TotalBsmtSF)
test$TotalBsmtSF[is.na(test$TotalBsmtSF)] <-
  median(test$TotalBsmtSF, na.rm = TRUE)

summary(test$GarageCars)
test$GarageCars[is.na(test$GarageCars)] <-
  median(test$GarageCars, na.rm = TRUE)

# Snapshot of the unscaled training data; its SalePrice range is needed later
# to convert scaled predictions back to prices.
train_o <- train




# SCALING OR NORMALIZATION
# Normalization brings all the values into the required range.
# For this problem, the range is 0 to 1: min-max scaling maps the minimum of
# the reference data to 0 and its maximum to 1.

# Min-max scale a numeric vector to the range [0, 1].
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE (default), missing values are ignored when computing the
#          minimum and maximum and stay NA in the result. If FALSE, any NA in
#          `x` makes the whole result NA (the behaviour of the plain formula).
#
# Returns a vector the same length as `x`. A constant vector has no spread, so
# it is mapped to 0 instead of producing NaN from a division by zero.
UDF <- function(x, na.rm = TRUE) {
  if (length(x) == 0L || all(is.na(x))) {
    # Nothing to scale; also avoids the Inf/-Inf warnings from min()/max().
    return(as.numeric(x))
  }

  lo <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lo

  if (!is.na(span) && span == 0) {
    # Every observed value is identical: x - lo is 0 (NA stays NA).
    return(x - lo)
  }

  (x - lo) / span
}

# Scale the test features with the TRAINING minimum and range, so both data
# sets go through the same transformation the network is fitted on. Scaling
# the test set by its own min/max would map identical raw values to different
# inputs. Test values outside the training range land slightly outside [0, 1].
# This must run before `train` is overwritten with its scaled version below.
feature_names <- colnames(test)
feature_min <- vapply(train[feature_names], min, numeric(1))
feature_range <- vapply(train[feature_names], max, numeric(1)) - feature_min
feature_range[feature_range == 0] <- 1 # constant column: avoid dividing by 0
test <- as.data.frame(Map(
  function(column, lo, span) (column - lo) / span,
  test, feature_min, feature_range
))

# Scale every training column (predictors and SalePrice) to [0, 1].
train <- as.data.frame(apply(train, 2, UDF))
# Splitting the data: 60% of the scaled training rows are used to fit the
# model, the remaining 40% are held out.

# Fix the RNG seed so the random split (and any later random draws in this
# session) is reproducible from run to run.
set.seed(42)
index <- sample(nrow(train), size = round(0.6 * nrow(train)))

train.wp <- train[index, ]
test.wp <- train[-index, ]


# MODEL

library(neuralnet)

# Build the model formula: SalePrice explained by every other column of train.
allVars <- colnames(train)
predictorVars <- allVars[allVars != "SalePrice"]
predictorVars <- paste(predictorVars, collapse = "+")
form <- as.formula(paste("SalePrice", predictorVars, sep = "~"))

# Prediction model: fit the network on the 60% training split, with hidden
# layer sizes 4 and 2 and linear.output = TRUE.
nn_model <- neuralnet(
  formula = form,
  data = train.wp,
  hidden = c(4, 2),
  linear.output = TRUE
)

# Network output on the training split (fitted values, not the weights),
# followed by a plot of the fitted network.
nn_model$net.result
plot(nn_model)

# PREDICTION

# Run the scaled test features through the fitted network and inspect the
# structure of the returned object.
prediction1 <- compute(nn_model, covariate = test)
str(prediction1)
# Convert min-max scaled predictions back to the original scale.
#
# Arguments:
#   prediction  Either the object returned by compute() (a list carrying a
#               `net.result` element) or the scaled predictions themselves
#               (numeric vector or matrix).
#   reference   Unscaled values whose minimum and maximum defined the scaling.
#               Defaults to the SalePrice column of the global `train_o`.
#
# Returns the predictions on the scale of `reference`, in the same shape as
# the scaled input.
UDF_2 <- function(prediction, reference = train_o$SalePrice) {
  # Use the argument that was passed in, not a global prediction object.
  has_net_result <- is.list(prediction) &&
    !is.null(prediction[["net.result"]])
  scaled <- if (has_net_result) prediction[["net.result"]] else prediction

  lo <- min(reference)
  hi <- max(reference)
  scaled * (hi - lo) + lo
}

# Map the scaled network outputs back to the SalePrice scale of train_o.
ActualPrediction <- UDF_2(prediction1)

table(ActualPrediction)

# Assemble the submission (Ids 1461 to 2919) and write it to disk.
submit.df <- data.frame(Id = 1461:2919, SalePrice = ActualPrediction)
write.csv(submit.df, file = "Submission_20171130_4.csv", row.names = FALSE)

# Heat map of the pairwise correlations among the selected variables.
mydata <- train[c("OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars",
                  "FullBath", "SalePrice")]

# Correlation matrix, rounded to two decimals.
train_ <- round(cor(mydata), digits = 2)
head(train_)
library(reshape2)
# Reshape the matrix to long format for ggplot.
melted_train <- melt(train_)
head(melted_train)
library(ggplot2)
ggplot(melted_train, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()
head(mydata)