### Import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# load the dataset
df = pd.read_csv("D:/Imarticus/Dataset/HousePrices.csv")
df

Unnamed: 0,Id,Dwell_Type,Zone_Class,LotFrontage,LotArea,Road_Type,Alley,Property_Shape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Property_Sale_Price
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068,942,60,RL,,8755,Pave,,IR1,Lvl,AllPub,...,0,,GdPrv,,0,6,2009,WD,Normal,214000
2069,943,90,RL,63.0,7711,Pave,,IR1,Lvl,AllPub,...,0,,,,0,8,2007,Oth,Abnorml,150000
2070,944,90,RL,313.0,25000,Pave,,Reg,Low,AllPub,...,0,,,,0,6,2007,WD,Normal,143000
2071,945,20,RL,52.0,14375,Pave,,IR1,Lvl,NoSeWa,...,0,,,,0,1,2009,COD,Abnorml,137500


### EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2073 entries, 0 to 2072
Data columns (total 81 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   2073 non-null   int64  
 1   Dwell_Type           2073 non-null   int64  
 2   Zone_Class           2073 non-null   object 
 3   LotFrontage          1753 non-null   float64
 4   LotArea              2073 non-null   int64  
 5   Road_Type            2073 non-null   object 
 6   Alley                129 non-null    object 
 7   Property_Shape       2073 non-null   object 
 8   LandContour          2073 non-null   object 
 9   Utilities            2073 non-null   object 
 10  LotConfig            2073 non-null   object 
 11  LandSlope            2073 non-null   object 
 12  Neighborhood         2073 non-null   object 
 13  Condition1           2073 non-null   object 
 14  Condition2           2073 non-null   object 
 15  Dwelling_Type        2073 non-null   o

In [4]:
# finding column names that have null values
[(i,df[i].isnull().sum()) for i in df.columns if df[i].isnull().sum()>0]

[('LotFrontage', 320),
 ('Alley', 1944),
 ('MasVnrType', 14),
 ('MasVnrArea', 14),
 ('BsmtQual', 59),
 ('BsmtCond', 59),
 ('BsmtExposure', 61),
 ('BsmtFinType1', 59),
 ('BsmtFinType2', 60),
 ('Electrical', 1),
 ('FireplaceQu', 988),
 ('GarageType', 113),
 ('GarageYrBlt', 113),
 ('GarageFinish', 113),
 ('GarageQual', 113),
 ('GarageCond', 113),
 ('PoolQC', 2065),
 ('Fence', 1669),
 ('MiscFeature', 1993)]

In [5]:
# Creating new age and remodage columns 
df["Age"] = df["YearBuilt"] - df["YrSold"]
df["RemodAge"] = df["YearRemodAdd"] - df["YrSold"]

In [6]:
# dropping unwanted columns
df.drop(["Id", "YearBuilt", "YearRemodAdd", "MoSold", "GarageYrBlt"], axis = 1, inplace = True)

In [7]:
columns = []
for i in df.columns:
    columns.append(i)
print(columns)

['Dwell_Type', 'Zone_Class', 'LotFrontage', 'LotArea', 'Road_Type', 'Alley', 'Property_Shape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'Dwelling_Type', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal

In [8]:
# filling missing values
df["MasVnrType"].fillna(value = df["MasVnrType"].mode()[0], inplace = True)
df["LotFrontage"].fillna(value = round(df["LotFrontage"].mean(), 1), inplace = True)
df["MasVnrArea"].fillna(value = round(df["MasVnrArea"].mean(), 1), inplace = True)
df["Electrical"].fillna(value = df["Electrical"].mode()[0], inplace = True)
df.Alley.fillna("None",inplace=True)
df.BsmtQual.fillna("None",inplace=True)
df.BsmtCond.fillna("None",inplace=True)
df.BsmtExposure.fillna('None',inplace=True)
df.BsmtFinType1.fillna("None",inplace=True)
df.BsmtFinType2.fillna("None",inplace=True)
df.FireplaceQu.fillna("None",inplace=True)
df.GarageType.fillna("None",inplace=True)
df.GarageFinish.fillna("None",inplace=True)
df.GarageQual.fillna("None",inplace=True)
df.GarageCond.fillna("None",inplace=True)
df.PoolQC.fillna("None",inplace=True)
df.MiscVal.fillna("None",inplace=True)
df.Fence.fillna("None",inplace=True)

In [9]:
# sns.heatmap(data = df.corr())

In [10]:
# dropping columns with low correlation
df.drop(["Dwell_Type", "OverallCond", "BsmtFinSF2", "LowQualFinSF", "BsmtHalfBath", "BedroomAbvGr", "KitchenAbvGr", "EnclosedPorch",
"ScreenPorch", "PoolArea", "MiscVal", "YrSold", ], axis = 1, inplace = True)

In [11]:
obj_col = df.select_dtypes(include=['object']).columns
obj_col

Index(['Zone_Class', 'Road_Type', 'Alley', 'Property_Shape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'Dwelling_Type', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
       'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object')

In [12]:
float_col = df.select_dtypes(include=['float64']).columns
float_col

Index(['LotFrontage', 'MasVnrArea'], dtype='object')

In [13]:
int_col = df.select_dtypes(include=['int64']).columns
int_col

Index(['LotArea', 'OverallQual', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
       'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF', '3SsnPorch', 'Property_Sale_Price', 'Age',
       'RemodAge'],
      dtype='object')

In [14]:
# encoding ordinal data
LE = OrdinalEncoder()
df[['OverallQual', 'ExterQual', 'ExterCond',  'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'HeatingQC', 
    'KitchenQual', 'Functional', 'GarageQual', 'GarageCond','FireplaceQu', 'PoolQC']] = LE.fit_transform(df[['OverallQual', 
                                                                       'ExterQual', 'ExterCond',  'BsmtQual',
                                                        'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'HeatingQC', 
                                            'KitchenQual', 'Functional', 'GarageQual', 'GarageCond','FireplaceQu', 'PoolQC']])

In [15]:
# dropping columns after encoding
df.drop(['OverallQual', 'ExterQual', 'ExterCond',  'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'HeatingQC', 
    'KitchenQual', 'Functional', 'GarageQual', 'GarageCond','FireplaceQu', 'PoolQC'], axis = 1, inplace = True)

In [16]:
# one hot encoding 
df = pd.get_dummies(df, columns=['Alley', 'Zone_Class', 'Road_Type',  'Property_Shape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'Dwelling_Type', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 
       'Foundation', 'Heating', 'CentralAir', 'Electrical',
         'GarageType',
       'GarageFinish', 'PavedDrive',  "MiscFeature", 
         'SaleType', 'SaleCondition', 'Fence'], drop_first = True)
df.sample(10)

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,...,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None
1125,60.0,10434,0.0,0,1005,1005,1005,0,1005,0,...,1,0,0,0,1,0,0,0,0,1
1686,82.0,9622,0.0,104,0,816,816,0,816,1,...,1,0,0,0,1,0,0,0,0,1
236,65.0,8773,78.0,24,1390,1414,1414,0,1414,0,...,1,0,0,0,1,0,0,0,0,1
1093,71.0,9230,166.0,661,203,864,1200,0,1200,1,...,1,0,0,0,1,0,0,1,0,0
1157,34.0,5001,166.0,904,410,1314,1314,0,1314,1,...,1,0,0,0,1,0,0,0,0,1
583,75.0,13500,0.0,0,1237,1237,1521,1254,2775,0,...,1,0,0,0,1,0,0,0,0,1
674,80.0,9200,0.0,892,244,1136,1136,0,1136,1,...,1,0,0,0,1,0,0,0,0,1
797,57.0,7677,0.0,570,203,773,773,0,773,0,...,1,0,0,0,0,0,0,0,0,1
196,79.0,9416,205.0,1126,600,1726,1726,0,1726,1,...,0,0,0,0,0,1,0,0,0,1
434,21.0,1890,0.0,495,135,630,630,0,630,1,...,1,0,0,0,1,0,0,0,0,1


In [17]:
X = df.drop(labels= ["Property_Sale_Price"] , axis = 1)
Y = df["Property_Sale_Price"]

### Splitting train and test data

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 2)

### Using robust scaler

In [19]:
from sklearn.preprocessing import RobustScaler
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20,random_state=0)
st_x= RobustScaler()    
X_train= st_x.fit_transform(X_train)    
X_test= st_x.transform(X_test)

### Linear Regression

In [20]:
model = LinearRegression()

In [21]:
model.fit(X_train, Y_train)

LinearRegression()

In [22]:
model.score(X_train, Y_train)

0.9017555701370908

In [23]:
model.score(X_test, Y_test)

0.7784901938843792