In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
print("The train columns are:")
li = train.columns.tolist()
for i in range(len(li)):
  print(li[i])

print("\n")

print("The test columns are:")
li2 = test.columns.tolist()
for i in range(len(li2)):
  print(li2[i])


The train columns are:
Unnamed: 0
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15


The test columns are:
Unnamed: 0
id
date
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15


In [5]:


train_df = train.drop(columns=["Unnamed: 0", "zipcode"])
test_df  = test.drop(columns=["Unnamed: 0", "zipcode", "id", "date"])


train_df['price'] = train_df["price"] / 1000
test_df['price']  = test_df["price"] / 1000

X_train = train_df.drop(columns=["price"])
y_train = train_df["price"]

X_test = test_df.drop(columns=["price"])
y_test = test_df["price"]
scaler = StandardScaler()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test  = scaler.transform(X_test)







In [6]:
def closed_form(X, y):
    ones = np.ones((X.shape[0], 1))
    X = np.concatenate([ones, X], axis=1)
    theta = np.linalg.inv(X.T @ X) @ X.T @ y
    return theta

def pred(X, theta):
    ones = np.ones((X.shape[0], 1))
    X = np.concatenate([ones, X], axis=1)
    return X @ theta


In [7]:

theta = closed_form(scaled_X_train, y_train)
feat = X_train.columns.tolist()


for i in range(len(feat)):
    print(f"{feat[i]}: {theta[i+1]}")
print()
print(f"Intercept: {theta[0]}")


bedrooms: -20.87667924977909
bathrooms: 60.18715963689467
sqft_living: 13.799741372531088
sqft_lot: 11.192067728941304
floors: 8.922040345468229
waterfront: 63.690796070497825
view: 48.34391232232531
condition: 12.920954505967757
grade: 85.8786371556937
sqft_above: 84.09946632064
sqft_basement: 37.08640535642442
yr_built: -67.64311741342645
yr_renovated: 17.271379530344788
lat: 78.3757369320784
long: -1.0352030837351087
sqft_living15: 45.57765781263865
sqft_lot15: -12.930090977794698

Intercept: 520.4148340000011


In [8]:



train_pred = pred(scaled_X_train, theta)
train_mse = mean_squared_error(y_train, train_pred)
train_r2  = r2_score(y_train, train_pred)
print(f"Training MSE: {train_mse}")
print(f"Training R²: {train_r2}")



Training MSE: 32586.062536693073
Training R²: 0.7169805244571578


In [10]:

test_pred = pred(scaled_X_test, theta)
test_mse = mean_squared_error(y_test, test_pred)
test_r2  = r2_score(y_test, test_pred)
print(f"\nTesting MSE: {test_mse}")
print(f"Testing R²: {test_r2}")




Testing MSE: 57071.09572029485
Testing R²: 0.6576972330664212


The results of resuts in q2 were training MSE 31486.167775794882 and Training R² was 0.7265334318706018 and testing MSE was 57628.154705670415 and Testing R² was 0.6543560876120953. The results in closed form was Training MSE as 32586.062536693073 and Training R² as 0.7169805244571578 and Testing MSE as 57071.09572029485
Testing R² as 0.6576972330664212.

These results are very similar and this is because the closed form implementation is correct and the small difference are mainly due to precision and rounding.