In [25]:
import pandas as pd
from pprint import pprint
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import copy
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score as sklearn_r2_score
import time
from sklearn.linear_model import LinearRegression
import numpy as np

In [57]:


dataset = pd.read_csv("housing.csv")
# dataset.head()
# dataset = dataset.fillna(dataset.median())
X = dataset.drop(columns=["median_house_value", 'ocean_proximity'])
y = dataset['median_house_value']



# pd.get_dummies(dataset['ocean_proximity'])
X = pd.concat([X, pd.get_dummies(dataset['ocean_proximity'], drop_first=True)], axis=1)

M, N = X.shape
X['total_bedrooms'] = X['total_bedrooms'].fillna(X['total_bedrooms'].median())


columns = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
scaler = StandardScaler()
X[columns] =  scaler.fit_transform(X[columns])
y = StandardScaler().fit_transform(dataset[['median_house_value']]).reshape((-1))



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X.isna().sum())
X.head()
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

print(y)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
INLAND                0
ISLAND                0
NEAR BAY              0
NEAR OCEAN            0
dtype: int64
[ 2.12963148  1.31415614  1.25869341 ... -0.99274649 -1.05860847
 -1.01787803]


In [58]:
model = LinearRegression()

start = time.time()
reg = model.fit(X_train, y_train)
time_taken = time.time() - start

y_train_prediction = reg.predict(X_train)
y_test_prediction = reg.predict(X_test)


print("Time taken: ", time_taken)
print()

print("On training dataset:")
print("MSE - ", mean_squared_error(y_train, y_train_prediction)/2)
print("RMSE - ", root_mean_squared_error(y_train, y_train_prediction)/2**0.5)
print("MAE - ", mean_absolute_error(y_train, y_train_prediction))
print("R2-Score - ", sklearn_r2_score(y_train, y_train_prediction))
print()

print("On test dataset:")
print("MSE - ", mean_squared_error(y_test, y_test_prediction)/2)
print("RMSE - ", root_mean_squared_error(y_test, y_test_prediction)/2**0.5)
print("MAE - ", mean_absolute_error(y_test, y_test_prediction))
print("R2-Score - ", sklearn_r2_score(y_test, y_test_prediction))



Time taken:  0.012998342514038086

On training dataset:
MSE -  0.17755556968690311
RMSE -  0.42137343258314597
MAE -  0.4309052334205368
R2-Score -  0.6470480227253683

On test dataset:
MSE -  0.17774504173074568
RMSE -  0.4215981993922005
MAE -  0.4339793296167354
R2-Score -  0.6393611711434393


In [59]:
X_new = np.concatenate([np.ones((X_train.shape[0], 1)), X_train], axis=1).astype(np.float64)
# X_new = X_new[:1]



# print(np.isnan(X_new).sum())
# print((X_new.T@X_new).tolist())

params = np.linalg.inv(X_new.T@X_new)@X_new.T@y_train

In [60]:
print(params)

print(reg.intercept_,reg.coef_)

[ 0.11360791 -0.46013883 -0.46118261  0.12062252 -0.11101941  0.3842319
 -0.36631703  0.1401925   0.64651846 -0.3559672   1.17761699 -0.05256867
  0.02749749]
0.11360790688485484 [-0.46013883 -0.46118261  0.12062252 -0.11101941  0.3842319  -0.36631703
  0.1401925   0.64651846 -0.3559672   1.17761699 -0.05256867  0.02749749]


In [61]:
W = params[1:]
b = params[0]

mse = mean_squared_error(y_test, X_test@W+b)
print(mse)

0.35549008346149125
