## Importing the necessary libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the previously downloaded training set; the path is relative to the notebook's working directory.
train = pd.read_csv('train.csv')

In [4]:
# Load the previously downloaded test set; the path is relative to the notebook's working directory.
test = pd.read_csv('test.csv')

## Data Cleaning 

In [5]:
# Determine which distinct column dtypes occur in the training set.
# A set is used for de-duplication, so the order of the displayed list is arbitrary.
list(set(train.dtypes))

[dtype('float64'), dtype('int64'), dtype('O')]

In [6]:
# Keep only the int64/float64 columns of the training set; the object-typed columns
# reported by the dtype check above are left out of the modelling data.
train_numeric_values = train.select_dtypes(include = ['int64','float64'])

In [7]:
# Drop the 'Id' column from the numeric training frame: it is only a row identifier
# and is not used as a model feature.
# The result is re-assigned instead of mutated in place, and errors='ignore' makes the
# cell safe to re-run (a second in-place drop would raise KeyError because 'Id' is gone).
train_numeric_values = train_numeric_values.drop(columns='Id', errors='ignore')

In [8]:
# Preview the first rows of the numeric training frame (the 'Id' column has already been dropped).
train_numeric_values.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000


In [9]:
# Keep only the int64/float64 columns of the test set; columns of any other dtype are excluded.
test_numeric_values = test.select_dtypes(include = ['int64','float64'])

In [10]:
# Drop the 'Id' column from the numeric test frame: it is only a row identifier
# and is not used as a model feature (the Ids are still available in `test`).
# The result is re-assigned instead of mutated in place, and errors='ignore' makes the
# cell safe to re-run (a second in-place drop would raise KeyError because 'Id' is gone).
test_numeric_values = test_numeric_values.drop(columns='Id', errors='ignore')

In [11]:
# Preview the first rows of the numeric test frame (the 'Id' column has already been dropped).
test_numeric_values.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,730.0,140,0,0,0,120,0,0,6,2010
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,482.0,212,34,0,0,0,0,0,3,2010
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,470.0,360,36,0,0,0,0,0,6,2010
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,506.0,0,82,0,0,144,0,0,1,2010


## Data splitting

In [12]:
# 'SalePrice' is the target and is stored in `y_train`; every remaining numeric
# column is a predictor and is stored in `X_train`.
y_train = train_numeric_values['SalePrice']
X_train = train_numeric_values.drop(columns='SalePrice')

## Filling the null values

In [13]:
# Use the numeric test columns as the prediction features.
# Take an explicit copy: a plain assignment would only alias `test_numeric_values`,
# so any later in-place change to `X_test` (e.g. filling nulls) would silently
# modify `test_numeric_values` as well.
X_test = test_numeric_values.copy()

In [14]:
# Fill the null values with column means computed on the TRAINING features only.
# The same training statistics are applied to the test features so that both sets are
# preprocessed identically and no information from the test set enters the pipeline.
# `fillna` aligns the means to the frames by column name; results are re-assigned
# rather than written in place, so re-running the cell is harmless.
train_column_means = X_train.mean()
X_train = X_train.fillna(train_column_means)
X_test = X_test.fillna(train_column_means)

## Building model using RandomForestRegressor

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
# Random-forest regressor with scikit-learn's default hyper-parameters.
# A fixed random_state makes the bootstrap sampling and feature selection repeatable,
# so the predictions and scores below are identical after Restart & Run All.
model = RandomForestRegressor(random_state=42)

In [17]:
# Fit the random-forest regressor on the mean-imputed numeric training features and the SalePrice target.
model.fit(X_train,y_train)

## Prediction using the model

In [18]:
# Predict on the test features with the fitted random-forest regressor and store
# the result as a one-column DataFrame named 'Predicted'.
predictions = pd.DataFrame({'Predicted': model.predict(X_test)})

In [19]:
# One-column DataFrame holding the 'Id' of every house in the test set.
HouseId = test['Id'].to_frame()

In [20]:
# Place the house Ids and the random-forest predictions side by side to form a single
# DataFrame giving the predicted price for each house Id, then preview it.
RDT = pd.concat([HouseId, predictions], axis='columns')
RDT.head()

Unnamed: 0,Id,Predicted
0,1461,126761.66
1,1462,154240.0
2,1463,183940.9
3,1464,179863.32
4,1465,201618.0


In [21]:
# Training-set R^2 of the random-forest regressor, expressed as a percentage.
# `score` on a regressor returns the coefficient of determination (not a classification
# accuracy), and it is evaluated here on the same data the model was fitted on, so it
# says nothing about performance on unseen houses.
rf_train_r2 = model.score(X_train, y_train)
RFC_Accuracy = round(rf_train_r2 * 100, 4)
RFC_Accuracy

97.9423

## Building model using the DecisionTree Regressor

In [22]:
from sklearn.tree import DecisionTreeRegressor

In [23]:
# Decision-tree regressor with scikit-learn's default hyper-parameters.
# A fixed random_state pins the feature permutation used when choosing splits,
# so the fitted tree (and the exported predictions) are repeatable across runs.
model1 = DecisionTreeRegressor(random_state=42)

In [24]:
# Fit the decision-tree regressor on the mean-imputed numeric training features and the SalePrice target.
model1.fit(X_train,y_train)

## Prediction

In [25]:
# Decision-tree predictions for the test features as a one-column 'Predicted' DataFrame.
# NOTE: `Predictions` (capital P) is a separate variable from the random-forest
# `predictions` frame created earlier.
Predictions = pd.DataFrame({'Predicted': model1.predict(X_test)})

In [26]:
# Pair every house Id with its decision-tree prediction and preview the result.
DTC = pd.concat([HouseId, Predictions], axis='columns')
DTC.head()

Unnamed: 0,Id,Predicted
0,1461,134800.0
1,1462,164900.0
2,1463,208900.0
3,1464,181000.0
4,1465,213500.0


In [27]:
# Training-set R^2 of the decision-tree regressor, expressed as a percentage.
# `score` on a regressor returns the coefficient of determination (not a classification
# accuracy). It is evaluated on the data the tree was fitted on, so a near-perfect
# value is not evidence of good performance on unseen houses.
dt_train_r2 = model1.score(X_train, y_train)
DTC_Accuracy = round(dt_train_r2 * 100, 4)
DTC_Accuracy

99.9996

## Linear Regression Model

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
# Instantiate a linear-regression model with scikit-learn's default settings.
LR = LinearRegression()

In [30]:
# Fit the linear model on the training data, then predict on the test features and
# round the one-column 'Predicted' frame to zero decimal places.
LR.fit(X_train, y_train)
pred1 = pd.DataFrame({'Predicted': LR.predict(X_test)}).round(0)

In [31]:
# Pair every house Id with its rounded linear-regression prediction and preview the result.
Linear_Reg = pd.concat([HouseId, pred1], axis='columns')
Linear_Reg.head()

Unnamed: 0,Id,Predicted
0,1461,116736.0
1,1462,151923.0
2,1463,172781.0
3,1464,199311.0
4,1465,195766.0


In [32]:
# Training-set R^2 of the linear-regression model, expressed as a percentage.
# `score` on a regressor returns the coefficient of determination (not a classification
# accuracy), and it is evaluated here on the same data the model was fitted on.
lr_train_r2 = LR.score(X_train, y_train)
LR_Accuracy = round(lr_train_r2 * 100, 4)
LR_Accuracy

81.3125

## Sample submission

In [33]:
# Rebuild the Id + decision-tree prediction frame that is exported below
# (same inputs as the earlier DTC cell) and preview it.
DTC = pd.concat([HouseId, Predictions], axis='columns')
DTC.head()

Unnamed: 0,Id,Predicted
0,1461,134800.0
1,1462,164900.0
2,1463,208900.0
3,1464,181000.0
4,1465,213500.0


In [34]:
# Write the decision-tree frame (columns 'Id' and 'Predicted') to 'House Pricing.csv' without the index.
# NOTE(review): if this file is meant as a competition submission, the required target
# column name may differ from 'Predicted' (e.g. 'SalePrice') — confirm against the expected format.
# NOTE(review): only training-set scores were computed above; confirm the choice of the
# decision tree with a held-out or cross-validated evaluation before relying on this export.
DTC.to_csv('House Pricing.csv', index=False)