In [124]:
# Importing Dependancies
import pandas as pd
pd.set_option('display.max_rows', 500)

In [125]:
# Reading Traing Data
home_data = pd.read_csv("train.csv")

In [144]:
home_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1451 entries, 0 to 1459
Data columns (total 61 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1451 non-null   int64  
 1   MSZoning       1451 non-null   object 
 2   LotArea        1451 non-null   int64  
 3   Street         1451 non-null   object 
 4   LotShape       1451 non-null   object 
 5   LandContour    1451 non-null   object 
 6   Utilities      1451 non-null   object 
 7   LotConfig      1451 non-null   object 
 8   LandSlope      1451 non-null   object 
 9   BldgType       1451 non-null   object 
 10  HouseStyle     1451 non-null   object 
 11  OverallQual    1451 non-null   int64  
 12  OverallCond    1451 non-null   int64  
 13  YearBuilt      1451 non-null   int64  
 14  YearRemodAdd   1451 non-null   int64  
 15  RoofStyle      1451 non-null   object 
 16  RoofMatl       1451 non-null   object 
 17  Exterior1st    1451 non-null   object 
 18  Exterior

In [127]:
# Checking Object Columns for type of values inside
home_data.MasVnrType

0       BrkFace
1          None
2       BrkFace
3          None
4       BrkFace
         ...   
1455       None
1456      Stone
1457       None
1458       None
1459       None
Name: MasVnrType, Length: 1460, dtype: object

In [128]:
# NaN values are very less in these columns,Dropping Nan Rows
home_data.dropna(subset =['Electrical','MasVnrType','MasVnrArea'],inplace=True)

In [129]:
# Checking for Null values
home_data.isna().sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       257
LotArea             0
Street              0
Alley            1361
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          0
MasVnrArea          0
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          0
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [130]:
# Preprocessing Function to clean data

def preprocessing(data):
 
  #Dropping all columns with Nan Values
  data.drop(['Id','LotFrontage','Alley','FireplaceQu','PoolQC','Fence','MiscFeature','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','GarageType','GarageYrBlt','GarageFinish','GarageQual','GarageCond'],axis=1,inplace=True,errors='ignore')
  
 #Dropped "Neighborhood","Condition1","Condition2","" coz of lots of values for onehotencoding
  data.drop(['Neighborhood','Condition1','Condition2'],axis=1,inplace=True,errors='ignore')

In [131]:
# Applying the preprocessing function on train_data
preprocessing(home_data)

In [132]:
# Taking insight for Null values
home_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1451 entries, 0 to 1459
Data columns (total 61 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1451 non-null   int64  
 1   MSZoning       1451 non-null   object 
 2   LotArea        1451 non-null   int64  
 3   Street         1451 non-null   object 
 4   LotShape       1451 non-null   object 
 5   LandContour    1451 non-null   object 
 6   Utilities      1451 non-null   object 
 7   LotConfig      1451 non-null   object 
 8   LandSlope      1451 non-null   object 
 9   BldgType       1451 non-null   object 
 10  HouseStyle     1451 non-null   object 
 11  OverallQual    1451 non-null   int64  
 12  OverallCond    1451 non-null   int64  
 13  YearBuilt      1451 non-null   int64  
 14  YearRemodAdd   1451 non-null   int64  
 15  RoofStyle      1451 non-null   object 
 16  RoofMatl       1451 non-null   object 
 17  Exterior1st    1451 non-null   object 
 18  Exterior

In [133]:
# Make a copy to ensure there will be no data loss
abt = home_data.copy()

# Create feature columns
# Drop identifier column
X = abt.drop(['SalePrice'],axis=1)
# One hot encode
X = pd.get_dummies(X)
# Create target columns
y = home_data.SalePrice

In [134]:
# Get info to look at no of columns after One-Hot-Encoding
X.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1451 entries, 0 to 1459
Columns: 183 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(1), int64(33), uint8(149)
memory usage: 607.9 KB


In [135]:
# Train_Test_Split
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(X,y,random_state=0)


In [136]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
home_price_model = DecisionTreeRegressor()
# Fitting data in the model
home_price_model.fit(train_X,train_y)

DecisionTreeRegressor()

In [137]:
# Checking Regression Loss - mean_absolute_error
from sklearn.metrics import mean_absolute_error
# Predictions on test_data from train_test_split
val_predictions = home_price_model.predict(test_X)
# Applying mean_absolute_error
print(mean_absolute_error(test_y,val_predictions))

25886.225895316806


In [138]:
#Model is ready to use
#Now loading test data and applying preprocessing

In [145]:
# Load Test Data
test_data_1 = pd.read_csv("test.csv")

In [152]:
# The test data has different no of columns than the given train data
# It has Extraa Columns
# Keeping only those columns which matches with the Training Dataset as model is built on Training Dataset

home_data_columns = list(home_data.columns)
# Excluding the Target Column
home_data_columns.remove("SalePrice") 

In [153]:
# Matching Columns and Leaving Extra columns
test_data = test_data_1[home_data_columns]

In [157]:
# Creating a copy to avoid anamolies
test_abt = test_data.copy()
# Dropping Nan
test_abt.dropna(inplace=True)
# Applying Preprocesing Function
preprocessing(test_abt)
# One-Hot-Encoding
test_abt = pd.get_dummies(test_abt)

In [160]:
# Matching Columns After One-Hot-Encoding in order to fit in the model
test_abt = test_abt.reindex(labels=X.columns,axis=1)

In [168]:
# Handeling Nan for the newly created Columns
test_abt.fillna(0,inplace=True)

In [169]:
# Getting insight on the Test Data
test_abt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1431 entries, 0 to 1458
Columns: 183 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(22), int64(25), uint8(136)
memory usage: 726.7 KB


In [170]:
# Making Prediction on the Test Data
yhat_test = home_price_model.predict(test_abt)

In [171]:
# Having a look at Presiction
yhat_test

array([123000., 177000., 197500., ..., 157000., 110000., 205000.])

In [173]:
# Making a Submission
# Creating Dataset in desired format
submission = pd.DataFrame([test_data_1['Id'], yhat_test]).T
submission.columns = ['Id', 'SalePrice']

In [176]:
# Final Submission
submission.to_csv('Home_Price_submission.csv', index=False)