In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [31]:
class MultiLinearReg:
    """Multiple linear regression fitted by ordinary least squares.

    The model is ``y = intercept_ + X @ coef_``. Both attributes are ``None``
    until :meth:`fit` has been called, so :meth:`predict` must only be used
    on a fitted instance.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    def fit(self,X_train,y_train):
        """Estimate the intercept and the feature weights.

        Parameters
        ----------
        X_train : array-like, shape (n_samples, n_features)
            Training features (2-D).
        y_train : array-like, shape (n_samples,)
            Training targets.

        Returns
        -------
        tuple
            ``(intercept_, coef_)`` as stored on the instance.
        """
        features = np.asarray(X_train, dtype=float)
        targets = np.asarray(y_train, dtype=float)

        # Prepend a column of ones so the first fitted weight is the intercept.
        design = np.insert(features, 0, 1.0, axis=1)

        # Calculate the coefficients. Solve the least-squares problem directly
        # instead of inverting X^T X: the explicit inverse fails on collinear
        # features and loses precision on ill-conditioned ones, whereas lstsq
        # returns the minimum-norm solution in those cases.
        betas, *_ = np.linalg.lstsq(design, targets, rcond=None)

        self.intercept_ = betas[0]
        self.coef_ = betas[1:]
        return self.intercept_,self.coef_

    def predict(self,X_test):
        """Return predictions ``X_test @ coef_ + intercept_`` for a fitted model."""
        y_pred = np.dot(X_test,self.coef_) + self.intercept_
        return y_pred

In [11]:
# Read the housing data set from disk and preview its first rows.
housing_csv = "data/housing_data.csv"
data = pd.read_csv(housing_csv)
data.head()


Unnamed: 0,BuildingClass,StreetSize,LotArea,BuildingType,OverallCondition,YearRemodeled,LivingAreaSize,Bathroom,Bedroom,Kitchen,TotalRooms,Fireplaces,GarageCars,GarageArea,SaleCondition,SalePrice
0,60.0,65.0,8450.0,1Family,5.0,2003.0,1710.0,2.0,3.0,1.0,8.0,0.0,2.0,548.0,Normal,208500
1,20.0,80.0,9600.0,1Family,8.0,,1262.0,2.0,3.0,1.0,6.0,1.0,2.0,460.0,Normal,181500
2,60.0,68.0,11250.0,1Family,5.0,,1786.0,2.0,3.0,1.0,6.0,1.0,2.0,608.0,Normal,223500
3,70.0,60.0,9550.0,1Family,5.0,1970.0,1717.0,1.0,3.0,1.0,7.0,1.0,3.0,642.0,Abnormal,140000
4,60.0,84.0,14260.0,1Family,5.0,2000.0,2198.0,2.0,4.0,,9.0,1.0,3.0,836.0,Normal,250000


In [12]:
# Keep only the non-text columns: object/category columns cannot be used by
# the regression below. Rebinding `data` (rather than dropping in place) keeps
# this cell safe to re-run and avoids a discarded intermediate expression.
data = data.select_dtypes(exclude=["object", "category"])
data.head()

Unnamed: 0,BuildingClass,StreetSize,LotArea,OverallCondition,YearRemodeled,LivingAreaSize,Bathroom,Bedroom,Kitchen,TotalRooms,Fireplaces,GarageCars,GarageArea,SalePrice
0,60.0,65.0,8450.0,5.0,2003.0,1710.0,2.0,3.0,1.0,8.0,0.0,2.0,548.0,208500
1,20.0,80.0,9600.0,8.0,,1262.0,2.0,3.0,1.0,6.0,1.0,2.0,460.0,181500
2,60.0,68.0,11250.0,5.0,,1786.0,2.0,3.0,1.0,6.0,1.0,2.0,608.0,223500
3,70.0,60.0,9550.0,5.0,1970.0,1717.0,1.0,3.0,1.0,7.0,1.0,3.0,642.0,140000
4,60.0,84.0,14260.0,5.0,2000.0,2198.0,2.0,4.0,,9.0,1.0,3.0,836.0,250000


In [14]:
# Select the target by name instead of by position so the split does not
# silently break if columns are added, removed or reordered upstream.
target_col = "SalePrice"
X = data.drop(columns=target_col)
Y = data[target_col]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=24
)

In [15]:
# Count the missing values in each training feature.
X_train.isna().sum()

BuildingClass        52
StreetSize          216
LotArea              57
OverallCondition     36
YearRemodeled        54
LivingAreaSize       59
Bathroom             37
Bedroom              49
Kitchen              55
TotalRooms           46
Fireplaces           56
GarageCars           41
GarageArea           48
dtype: int64

In [16]:
# Names of the feature columns, in frame order.
cols = list(X_train.columns)
print(cols)

['BuildingClass', 'StreetSize', 'LotArea', 'OverallCondition', 'YearRemodeled', 'LivingAreaSize', 'Bathroom', 'Bedroom', 'Kitchen', 'TotalRooms', 'Fireplaces', 'GarageCars', 'GarageArea']


In [17]:
# Impute missing values with the TRAINING means only, so no information from
# the test split leaks into preprocessing.
# Fill at frame level and rebind: `frame[col].fillna(..., inplace=True)` is
# chained assignment on a temporary column object, which triggers pandas
# warnings and does nothing at all under Copy-on-Write.
train_means = X_train[cols].mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)


In [19]:
# Count the missing values in the training target.
y_train.isna().sum()


0

In [20]:
# Re-count the missing values per training feature after imputation.
X_train.isna().sum()


BuildingClass       0
StreetSize          0
LotArea             0
OverallCondition    0
YearRemodeled       0
LivingAreaSize      0
Bathroom            0
Bedroom             0
Kitchen             0
TotalRooms          0
Fireplaces          0
GarageCars          0
GarageArea          0
dtype: int64

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
std = StandardScaler().fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
# Reference model: scikit-learn's linear regression, fitted on the training
# split and used to predict the held-out split.
LinReg = LinearRegression().fit(X_train, y_train)
Y_prediction = LinReg.predict(X_test)

In [25]:
from sklearn.metrics import mean_squared_error


In [26]:
# Test-set error of the scikit-learn model: squared error and its square root.
mse = mean_squared_error(y_true=y_test, y_pred=Y_prediction)
RMSE = np.sqrt(mse)
print("MSE:", mse)
print("RMSE", RMSE)

MSE: 2294101060.0785193
RMSE 47896.775048833086


In [27]:
from sklearn.metrics import r2_score


In [28]:
# Coefficient of determination of the scikit-learn predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=Y_prediction)
print("R²:", r2)

R²: 0.7071557275442308


In [32]:
# Instantiate the hand-written OLS regressor defined earlier in this notebook.
MLROLS = MultiLinearReg()

In [33]:
# Fit the custom model on the scaled training data; fit() returns
# (intercept_, coef_), which the notebook displays as this cell's output.
MLROLS.fit(X_train,y_train)

(181249.09491193743,
 array([ -7136.30617446,  -1564.56818221,   4379.95150549,   -376.37014906,
         15862.70971414,  25355.10515745,   5934.92597073, -11867.03730717,
         -7965.7838044 ,  11168.27633411,   9524.96179133,  14058.5367313 ,
          8647.72019483]))

In [34]:
# Coefficients of the scikit-learn model, shown for comparison with the
# custom model's fitted coefficients.
LinReg.coef_

array([ -7136.30617446,  -1564.56818221,   4379.95150549,   -376.37014906,
        15862.70971414,  25355.10515745,   5934.92597073, -11867.03730717,
        -7965.7838044 ,  11168.27633411,   9524.96179133,  14058.5367313 ,
         8647.72019483])

In [35]:
# Intercept of the scikit-learn model, shown for comparison with the
# custom model's fitted intercept.
LinReg.intercept_

181249.09491193737