In [1]:
import numpy as np
import pandas as pd 
import sklearn

In [2]:
# Read the training CSV and discard the "Id" column in a single expression,
# so the frame bound to DF_House never contains it.
DF_House = pd.read_csv("train.csv").drop(columns=["Id"])
DF_House.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Missing-data handling, in two passes:
#   1. drop every column that has more than 200 missing entries;
#   2. drop every remaining row that still contains a missing value.
null_counts = DF_House.isnull().sum()
sparse_columns = null_counts.index[null_counts.values > 200]
DF_House.drop(columns=sparse_columns, inplace=True)
DF_House.dropna(inplace=True)
# Display the per-column count of missing values left after both passes.
DF_House.isnull().sum().values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [4]:
#Replacing strings with numerical values
# "MSZoning" holds strings, so its dtype is used as the marker for every
# column that has to be converted to integer codes.
obj_to_replace = DF_House["MSZoning"].dtype

for column in DF_House.columns:
    if DF_House[column].dtype == obj_to_replace:
        # pd.factorize(sort=True) assigns each distinct value its position in
        # the sorted list of distinct values -- the same numbering the previous
        # np.unique + replace loop produced -- but does it in one pass per
        # column and always returns an integer array. The old loop called
        # Series.replace once per category and depended on replace silently
        # downcasting an object column to integers, which pandas has deprecated.
        # NOTE: a missing value would be encoded as -1; rows with missing
        # values are dropped earlier in the notebook, so none are expected.
        codes, _ = pd.factorize(DF_House[column], sort=True)
        # `codes` is a plain ndarray, so it is assigned positionally and is not
        # affected by gaps in the frame's index.
        DF_House[column] = codes

DF_House.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,8450,1,3,3,0,4,0,5,...,0,0,0,0,0,2,2008,8,4,208500
1,20,3,9600,1,3,3,0,2,0,24,...,0,0,0,0,0,5,2007,8,4,181500
2,60,3,11250,1,0,3,0,4,0,5,...,0,0,0,0,0,9,2008,8,4,223500
3,70,3,9550,1,0,3,0,0,0,6,...,272,0,0,0,0,2,2006,8,0,140000
4,60,3,14260,1,0,3,0,2,0,15,...,0,0,0,0,0,12,2008,8,4,250000


In [5]:
# Append a constant column of 1.0 named "bias" (one entry per row); a weight
# fitted for this column acts as an intercept term.
DF_House["bias"] = np.ones(len(DF_House.index))
DF_House.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,bias
0,60,3,8450,1,3,3,0,4,0,5,...,0,0,0,0,2,2008,8,4,208500,1.0
1,20,3,9600,1,3,3,0,2,0,24,...,0,0,0,0,5,2007,8,4,181500,1.0
2,60,3,11250,1,0,3,0,4,0,5,...,0,0,0,0,9,2008,8,4,223500,1.0
3,70,3,9550,1,0,3,0,0,0,6,...,0,0,0,0,2,2006,8,0,140000,1.0
4,60,3,14260,1,0,3,0,2,0,15,...,0,0,0,0,12,2008,8,4,250000,1.0


In [6]:
#Dividing the data
# Fixed seed: without it the shuffle -- and therefore the split and every
# metric computed from it -- changes on each Restart & Run All.
SEED = 42
# Number of shuffled rows held out for testing.
TEST_SIZE = 100

# Shuffle all rows reproducibly, then renumber the index 0..n-1.
DF_House = DF_House.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Positional split: everything except the last TEST_SIZE rows is training data.
DF_House_train = DF_House.iloc[:-TEST_SIZE]
DF_House_test = DF_House.iloc[-TEST_SIZE:]

# Target vector is "SalePrice"; the feature matrix is every other column.
Y_train = DF_House_train["SalePrice"].values
X_train = DF_House_train.drop(columns=["SalePrice"]).values
Y_test = DF_House_test["SalePrice"].values
X_test = DF_House_test.drop(columns=["SalePrice"]).values

print(X_train.shape)
print(np.mean(Y_train))

(1238, 74)
187130.53069466882


In [7]:
# Fit scikit-learn's LinearRegression on the training split, predict the
# held-out rows and report the mean absolute error of those predictions.
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# fit() returns the estimator itself, so construction and training are chained.
LR = LinearRegression().fit(X_train, Y_train)
print(f"--> The weights are: \n {LR.coef_}")

Y_predict = LR.predict(X_test)
print(f"--> The predictions are: \n {Y_predict}")
print(f"--> The Real Prices are: \n {Y_test}")

# Mean absolute error between the real and the predicted prices.
print(f"--> The MAE = \n {mean_absolute_error(Y_test,Y_predict)}")






--> The weights are: 
 [-1.43874894e+02 -1.74314650e+03  3.94258355e-01  4.67466942e+04
 -1.41884604e+03  4.45172539e+03 -4.93015316e+04  3.74060361e+02
  6.34074622e+03  3.63445903e+02 -2.45953540e+02 -8.52130062e+03
 -5.81059103e+02 -1.21285164e+03  1.24819135e+04  4.81365905e+03
  2.25554527e+02  2.47180644e+01  2.71191117e+03  4.53472346e+03
 -1.14196482e+03  5.01723735e+02  4.24104613e+03  2.95289855e+01
 -7.41332547e+03 -5.18766814e+02  1.91583559e+03 -9.26644538e+03
  2.99565111e+03 -3.40827555e+03 -1.29732352e+03  2.53994896e+00
  2.95566953e+02  4.39554331e+00 -3.16945331e+00  3.76603893e+00
 -4.09535352e+03 -6.61776686e+02  1.95658557e+03 -1.74454135e+02
  1.55829603e+01  2.00216436e+01 -4.00302311e+00  3.16015807e+01
  6.62542189e+03  7.61503563e+02  1.88940529e+03 -3.35690554e+03
 -3.78372029e+03 -1.90370913e+04 -8.31611970e+03  2.94481394e+03
  4.07773053e+03  4.38800523e+03  6.39183442e+02 -1.23766040e+02
 -5.18857233e+02  1.34605905e+04  1.16197010e+00 -5.64048755e+02
  

In [8]:
# Linear regression with NumPy only: the weight vector is the pseudo-inverse
# of the training matrix multiplied by the training targets.
X_train_pinv = np.linalg.pinv(X_train)
W = X_train_pinv @ Y_train
print(f"--> The weights are: {W}")

# Predictions for the held-out rows are the product of their features and W.
Y_predict = X_test.dot(W)
print(f"--> The predictions are: \n {Y_predict}")
print(f"--> The Real Prices are: \n {Y_test}")

# Mean absolute error between predictions and real prices.
Error = np.abs(Y_predict - Y_test).mean()
print(f"--> The MAE = \n {Error}")



--> The weights are: [-1.43874894e+02 -1.74314650e+03  3.94258355e-01  4.67466942e+04
 -1.41884604e+03  4.45172539e+03 -4.93015316e+04  3.74060361e+02
  6.34074622e+03  3.63445903e+02 -2.45953540e+02 -8.52130062e+03
 -5.81059103e+02 -1.21285164e+03  1.24819135e+04  4.81365905e+03
  2.25554527e+02  2.47180644e+01  2.71191117e+03  4.53472346e+03
 -1.14196482e+03  5.01723735e+02  4.24104613e+03  2.95289855e+01
 -7.41332547e+03 -5.18766814e+02  1.91583559e+03 -9.26644538e+03
  2.99565111e+03 -3.40827555e+03 -1.29732352e+03  2.53993054e+00
  2.95566953e+02  4.39552489e+00 -3.16947173e+00  3.76605735e+00
 -4.09535352e+03 -6.61776686e+02  1.95658557e+03 -1.74454135e+02
  1.55829391e+01  2.00216224e+01 -4.00304433e+00  3.16016019e+01
  6.62542189e+03  7.61503563e+02  1.88940529e+03 -3.35690554e+03
 -3.78372029e+03 -1.90370913e+04 -8.31611970e+03  2.94481394e+03
  4.07773053e+03  4.38800523e+03  6.39183442e+02 -1.23766040e+02
 -5.18857233e+02  1.34605905e+04  1.16197010e+00 -5.64048755e+02
  1.