In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read both splits from CSV files given by relative path.
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Show the column names of each split, one per line, so the two schemas
# can be compared by eye.
print("The train columns are:")
li = train.columns.tolist()
for column_name in li:
    print(column_name)

print("\n")

print("The test columns are:")
li2 = test.columns.tolist()
for column_name in li2:
    print(column_name)


The train columns are:
Unnamed: 0
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15


The test columns are:
Unnamed: 0
id
date
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15


In [3]:


# Columns removed from both splits; the test split additionally drops
# "id" and "date".
dropped_from_both = ["Unnamed: 0", "zipcode"]
train_df = train.drop(columns=dropped_from_both)
test_df  = test.drop(columns=dropped_from_both + ["id", "date"])

# Rescale the target by 1/1000 in both splits.
train_df = train_df.assign(price=train_df["price"] / 1000)
test_df  = test_df.assign(price=test_df["price"] / 1000)

# Targets as plain arrays, features as everything except the target.
y_train = train_df["price"].values
y_test  = test_df["price"].values
X_train = train_df.drop(columns=["price"])
X_test  = test_df.drop(columns=["price"])

# Single-feature inputs (sqft_living only) used for the polynomial fits.
X_train_2 = X_train["sqft_living"].values
X_test_2  = X_test["sqft_living"].values




In [4]:
def closed_form(X, y):
    """Fit ordinary least-squares weights, with the intercept first.

    A column of ones is prepended to ``X``, so the first entry of the result
    is the bias term and the remaining entries line up with the columns of
    ``X``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features) or (n_samples,)
        Feature matrix without the bias column. A 1-D input is treated as a
        single feature column.
    y : array-like, shape (n_samples,)
        Target values.

    Returns
    -------
    numpy.ndarray
        Coefficients ``[bias, w_1, ..., w_n_features]`` minimising
        ``||[1, X] @ theta - y||``.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        # A flat vector is one feature observed n_samples times.
        X = X.reshape(-1, 1)
    ones = np.ones((X.shape[0], 1))
    X = np.concatenate([ones, X], axis=1)
    # Solve the least-squares problem directly rather than forming
    # inv(X.T @ X): the explicit inverse raises LinAlgError when columns are
    # collinear and squares the condition number, which matters for highly
    # correlated inputs such as successive polynomial powers. For a
    # rank-deficient design lstsq returns the minimum-norm solution.
    theta, *_ = np.linalg.lstsq(X, y, rcond=None)
    return theta

def pred(X, theta):
    """Evaluate a linear model whose first coefficient is the intercept.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features) or (n_samples,)
        Feature matrix without the bias column. A 1-D input is treated as a
        single feature column.
    theta : array-like, shape (n_features + 1,)
        Coefficients ordered ``[bias, w_1, ..., w_n_features]``.

    Returns
    -------
    numpy.ndarray
        ``[1, X] @ theta``, one prediction per row of ``X``.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        # Without this, concatenating the (n, 1) bias column with a flat
        # vector fails on mismatched dimensions.
        X = X.reshape(-1, 1)
    ones = np.ones((X.shape[0], 1))
    X = np.concatenate([ones, X], axis=1)
    return X @ theta


In [5]:
def polyRegressionFeat(X, p):
    """Build polynomial features by stacking the powers ``X**1 .. X**p``.

    Each power is computed element-wise and the results are joined
    column-wise with ``np.column_stack``, lowest power first.
    """
    powers = [X ** degree for degree in range(1, p + 1)]
    return np.column_stack(powers)

In [6]:
# Per-degree results, read by the summary table cell:
#   li1 -> polynomial degree p
#   li2 -> train MSE,  li3 -> train R^2
#   li4 -> test MSE,   li5 -> test R^2
li1, li2, li3, li4, li5 = [], [], [], [], []

for degree in range(1, 6):
    # Expand sqft_living into powers 1..degree, then standardise using
    # statistics fitted on the training features only.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(polyRegressionFeat(X_train_2, degree))
    test_features  = scaler.transform(polyRegressionFeat(X_test_2, degree))

    # Fit on the training split and predict on both splits.
    weights = closed_form(train_features, y_train)
    fitted_train = pred(train_features, weights)
    fitted_test  = pred(test_features, weights)

    li1.append(degree)
    li2.append(mean_squared_error(y_train, fitted_train))
    li3.append(r2_score(y_train, fitted_train))
    li4.append(mean_squared_error(y_test, fitted_test))
    li5.append(r2_score(y_test, fitted_test))



In [7]:
# Assemble the per-degree metrics into one table: a column per metric,
# a row per polynomial degree.
metric_names = ("p", "Train MSE", "Train R2", "Test MSE", "Test R2")
metric_values = (li1, li2, li3, li4, li5)
df = pd.DataFrame(dict(zip(metric_names, metric_values)))

print(df)

   p     Train MSE  Train R2      Test MSE     Test R2
0  1  57947.526161  0.496709  8.857598e+04    0.468736
1  2  54822.665116  0.523849  7.179168e+04    0.569406
2  3  53785.194716  0.532860  9.983348e+04    0.401216
3  4  52795.774758  0.541453  2.509793e+05   -0.505331
4  5  52626.111955  0.542927  2.865728e+07 -170.881541


As p increases, the number of features increases and the model fits the training data better, so the train MSE decreases and the train R^2 increases. The test MSE, however, decreases initially at p=2 and then keeps increasing as p grows. Similarly, the test R^2 increases initially at p=2 but then keeps decreasing, becoming negative for p=4 and p=5. Therefore, beyond p=2 the model is unable to generalize and is overfitting.