In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
print("The train columns are:")
li = train.columns.tolist()
for i in range(len(li)):
  print(li[i])

print("\n")

print("The test columns are:")
li2 = test.columns.tolist()
for i in range(len(li2)):
  print(li2[i])


The train columns are:
Unnamed: 0
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15


The test columns are:
Unnamed: 0
id
date
price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_above
sqft_basement
yr_built
yr_renovated
zipcode
lat
long
sqft_living15
sqft_lot15


In [3]:
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

train_df = train.drop(columns=["Unnamed: 0", "zipcode"])
test_df  = test.drop(columns=["Unnamed: 0", "zipcode", "id", "date"])

train_df['price'] = train_df["price"] / 1000
test_df['price']  = test_df["price"] / 1000

X_train = train_df.drop(columns=["price"])
y_train = train_df["price"].values

X_test = test_df.drop(columns=["price"])
y_test = test_df["price"].values



In [4]:
feat = X_train.columns.tolist()

scalar = StandardScaler()
scaled_X_train = scalar.fit_transform(X_train)
scaled_X_test  = scalar.transform(X_test)


ones_arr1 = np.ones((scaled_X_train.shape[0], 1))
X_train_final = np.concatenate([ones_arr1, scaled_X_train], axis=1)

ones_arr2 = np.ones((scaled_X_test.shape[0], 1))
X_test_final = np.concatenate([ones_arr2, scaled_X_test], axis=1)

In [5]:
n = X_train_final.shape[0]
a = 0.1
iterations = 100
lam = [0, 1, 10, 100, 1000, 10000]


li1 = []
li2 = []
li3 = []
li4 = []
li5 = []
li6 = []

In [9]:
for l in lam:
    theta = np.zeros(X_train_final.shape[1])
    for j in range(iterations):
        prediction = X_train_final @ theta
        err = prediction - y_train
        loss = (1 / n) * np.sum(err ** 2) + (l / n) * np.sum(theta[1:] ** 2)
        grad = (2 / n) * (X_train_final.T @ err)
        reg = (2 * l / n) * theta
        reg[0] = 0
        grad = grad + reg
        theta = theta - (a * grad)

    pred1 = X_train_final @ theta
    pred2 = X_test_final @ theta

    train_mse = mean_squared_error(y_train, pred1)
    train_r2  = r2_score(y_train, pred1)
    test_mse  = mean_squared_error(y_test, pred2)
    test_r2   = r2_score(y_test, pred2)

    li1.append(l)
    li2.append(train_mse)
    li3.append(train_r2)
    li4.append(test_mse)
    li5.append(test_r2)

df = pd.DataFrame({
    "Lambda": li1,
    "Train MSE": li2,
    "Train R2": li3,
    "Test MSE": li4,
    "Test R2": li5
})

print(df)

    Lambda      Train MSE       Train R2       Test MSE        Test R2
0        0   3.148643e+04   7.265311e-01   5.763896e+04   6.542913e-01
1        1   3.848518e+04   6.657450e-01   6.959025e+04   5.826095e-01
2       10   5.671627e+65  -4.925973e+60   6.333819e+65  -3.798917e+60
3      100  2.858133e+262 -2.482372e+257  3.191015e+262 -1.913917e+257
4     1000            inf           -inf            inf           -inf
5        0   3.148647e+04   7.265308e-01   5.763964e+04   6.542872e-01
6        1   3.148661e+04   7.265296e-01   5.764827e+04   6.542355e-01
7       10   3.149069e+04   7.264941e-01   5.772810e+04   6.537566e-01
8      100   3.172611e+04   7.244495e-01   5.865355e+04   6.482060e-01
9     1000   3.848518e+04   6.657450e-01   6.959025e+04   5.826095e-01
10   10000   7.835374e+04   3.194749e-01   1.216202e+05   2.705427e-01


In [7]:
np.random.seed(42)
N = 1000
X = np.random.uniform(-2, 2, N)
e = np.random.normal(0, 2, N)
Y = 1 + 2 * X + e

ones = np.ones((N, 1))
X_final = np.column_stack([ones, X.reshape(-1, 1)])

n = X_final.shape[0]
a = 0.01
iterations = 1000
lam2 = [0, 1, 10, 100, 1000, 10000]


li7 = []
li8 = []
li9 = []
li10 = []

In [10]:
for l2 in lam2:
    theta = np.zeros(X_final.shape[1])
    for j in range(iterations):
        prediction = X_final @ theta
        err = prediction - Y
        loss = (1 / n) * np.sum(err ** 2) + (l2 / n) * np.sum(theta[1:] ** 2)
        grad = (2 / n) * (X_final.T @ err)
        reg = (2 * l2 / n) * theta
        reg[0] = 0
        grad = grad + reg
        theta = theta - (a * grad)

    pred1 = X_final @ theta
    mse = mean_squared_error(Y, pred1)
    r2  = r2_score(Y, pred1)

    li7.append(l2)
    li8.append(theta[1])
    li9.append(mse)
    li10.append(r2)

In [11]:
#part 3
df2 = pd.DataFrame({
    "Lambda": li7,
    "Slope": li8,
    "MSE": li9,
    "R2": li10
})

print(df2)

   Lambda         Slope           MSE            R2
0       0  1.922607e+00  3.899872e+00  5.638856e-01
1       1  1.109371e+00  4.802053e+00  4.629966e-01
2      10  2.307882e-01  7.804391e+00  1.272515e-01
3     100 -1.283823e+10  2.250936e+20 -2.517174e+19
4       0  1.922607e+00  3.899872e+00  5.638856e-01
5       1  1.921199e+00  3.899874e+00  5.638853e-01
6      10  1.908616e+00  3.900139e+00  5.638558e-01
7     100  1.791295e+00  3.923394e+00  5.612552e-01
8    1000  1.109371e+00  4.802053e+00  4.629966e-01
9   10000  2.307882e-01  7.804391e+00  1.272515e-01


With increase in lamda, the slope is shrinking and MSE is increasing and R^2 is decreasing. I believe this is happening because with lambda penalization increases and makes some coeffients close to 0