In [None]:
import tensorflow as tf
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
import kagglehub
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Fixed seed so the train/test partition (and every metric computed from it)
# is identical on each Restart & Run All.
RANDOM_STATE = 42

# Fetch the Kaggle dataset; the returned value is used below as the directory
# that contains data.csv.
path = kagglehub.dataset_download("shree1992/housedata")

print("Path to dataset files:", path)
# Parse the CSV once. df2 is kept as an independent copy of the raw frame
# (it is not used by the cells visible in this notebook).
df = pd.read_csv(path + "/data.csv")
df2 = df.copy()

# Features: every column except the target and the columns dropped here.
X = df.drop(["price", "street", "city", "statezip", "country", "date", "yr_built", "yr_renovated","waterfront"], axis=1)
y = df["price"]

# Hold out 10% of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE
)
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

# Standardize features with statistics learned from the training split only;
# after this X_train / X_test are NumPy arrays, while Y_train / Y_test stay
# pandas Series.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Rescale the target by 1e6 (i.e. price expressed in millions).
Y_train = Y_train/1e6
Y_test = Y_test/1e6


In [None]:

# Starting point for gradient descent: one zero weight per feature column of
# X_train and a length-1 zero bias.
n_features = X_train.shape[1]
w = np.zeros((n_features,))
b = np.zeros((1,))

def linear_func(X_train, w, b):
  """Evaluate the linear model: dot product of the inputs with w, plus b.

  X_train may be a single sample or a matrix of samples; whatever
  np.dot(X_train, w) yields is shifted by b using NumPy broadcasting.
  """
  weighted_sum = np.dot(X_train, w)
  return weighted_sum + b

def mean_squared_error_cost_func(X_train, Y_train, w, b):
  """Return the halved mean squared error of the linear model w.x + b.

  Computes (1 / (2*m)) * sum_i ((w . x_i + b) - y_i)**2 with an explicit
  per-sample loop, where m is the number of rows of X_train.

  Args:
    X_train: 2-D array-like of samples, one row per sample.
    Y_train: targets, one per row of X_train. NumPy arrays, lists and pandas
      Series are all accepted.
    w: weight vector with one entry per feature.
    b: bias (scalar or length-1 array).

  Returns:
    The cost, with the same shape as b broadcast against a scalar (a
    length-1 array when b is a length-1 array).
  """
  # Convert to plain arrays so that row i of X is always paired with target i
  # by position. Indexing a pandas Series with [i] is a label lookup, which
  # misaligns (or raises KeyError) when the index is not 0..m-1, as is
  # typically the case after a shuffled train/test split.
  features = np.asarray(X_train)
  targets = np.asarray(Y_train)
  m = features.shape[0]
  summation = 0
  for i in range(m):
    residual = (np.dot(w, features[i]) + b) - targets[i]
    summation = summation + residual**2
  MSE = (1/(2*m))*(summation)
  return MSE

Non-Vectorized

In [None]:
# Learning rate; mean_squared_error_cost_func_deriv reads it as a global.
alpha = 0.0001

def mean_squared_error_cost_func_deriv(X_train, Y_train, w, b):
  """Perform one gradient-descent step on the MSE cost using explicit loops.

  Despite the name, this does not return the derivatives: it computes the
  gradients of (1/(2m)) * sum((w.x_i + b - y_i)**2) with respect to w and b
  and returns the parameters after one step of size `alpha` (module level).

  Both gradients are evaluated at the incoming (w, b), i.e. the update is
  simultaneous, so the result matches the vectorized implementation.

  Args:
    X_train: 2-D array-like of samples, one row per sample.
    Y_train: targets, one per row (pandas Series, ndarray or list).
    w: current weight vector.
    b: current bias (scalar or length-1 array).

  Returns:
    (new_w, new_b) as new objects; the inputs w and b are not modified.
  """
  # Positional arrays: keeps row i of X paired with target i regardless of
  # any pandas index carried by Y_train.
  X_train_np = np.asarray(X_train)
  Y_train_np = np.asarray(Y_train)
  m = X_train_np.shape[0]

  grad_w = 0
  grad_b = 0
  for i in range(m):
    # Residual of sample i under the *current* parameters; reused for both
    # gradients so that b's gradient does not see the already-updated w.
    error = (np.dot(w, X_train_np[i]) + b) - Y_train_np[i]
    grad_w = grad_w + error * X_train_np[i]
    grad_b = grad_b + error

  new_w = w - alpha * (grad_w / m)
  new_b = b - alpha * (grad_b / m)
  return new_w, new_b

def gradient_descent(alpha, w, b, X_train, Y_train, num_iters=1000, log_every=100):
  """Run repeated gradient-descent steps and return the final (w, b).

  Args:
    alpha: learning rate. NOTE(review): it is accepted for interface
      compatibility but not forwarded; the step function
      mean_squared_error_cost_func_deriv reads the module-level `alpha`.
    w: initial weight vector.
    b: initial bias.
    X_train: 2-D array of samples, one row per sample.
    Y_train: targets, one per row of X_train.
    num_iters: number of update steps (default 1000, the previous fixed value).
    log_every: print progress every this many iterations (default 100);
      0 or None disables logging.

  Returns:
    (w, b) after num_iters steps.
  """
  # Positional view of the targets for the logged metric.
  targets = np.asarray(Y_train)
  for i in range(num_iters):
    w, b = mean_squared_error_cost_func_deriv(X_train, Y_train, w, b)
    if log_every and i % log_every == 0:
      # Include the bias so the logged error is that of the actual model.
      preds = np.dot(X_train, w) + b
      MSE = np.mean((preds - targets)**2)
      print(f"Iteration {i}, MSE: {MSE}, w: {w}, b: {b}")
  return w, b

# Train with the loop-based step function defined in this cell.
W, B = gradient_descent(alpha=alpha, w=w, b=b, X_train=X_train, Y_train=Y_train)

def predict(x, W, B):
  """Predict one value per row of x with the linear model, row by row.

  Args:
    x: 2-D array of samples, one row per sample.
    W: weight vector.
    B: bias (scalar or length-1 array).

  Returns:
    1-D float array with x.shape[0] predictions.

  Raises:
    ValueError: if the model output for a row is not a single value.
  """
  n_samples = x.shape[0]
  y_pred = np.zeros(n_samples)
  for i in range(n_samples):
    # linear_func can return a length-1 array (e.g. when B has shape (1,)).
    # Extract the scalar explicitly: assigning a size-1 array to a single
    # element relies on a conversion NumPy has deprecated.
    y_pred[i] = np.asarray(linear_func(x[i], W, B)).item()
  return y_pred


# Predictions on the held-out split using the parameters trained above.
y_pred = predict(x=X_test, W=W, B=B)




Vectorized

In [None]:
# Step size; read as a global by mean_squared_error_cost_func_deriv below.
alpha = 0.0001

def mean_squared_error_cost_func_deriv(X_train, Y_train, w, b):
    """Apply one vectorized gradient-descent step to w and b, in place.

    Despite the name, the gradients are not returned: they are computed for
    the cost (1/(2m)) * sum((X.w + b - y)**2), scaled by the module-level
    `alpha`, and subtracted from `w` and `b` with augmented assignment.
    For NumPy arrays this modifies the caller's objects; the same objects
    are also returned.

    Args:
        X_train: 2-D array of samples, one row per sample.
        Y_train: targets, one per row of X_train.
        w: weight vector (updated in place).
        b: bias array (updated in place).

    Returns:
        The tuple (w, b) after the update.
    """
    n_samples = X_train.shape[0]
    residuals = (np.dot(X_train, w) + b) - Y_train

    # Gradients of the halved MSE with respect to the weights and the bias.
    grad_w = (1 / n_samples) * np.dot(X_train.T, residuals)
    grad_b = (1 / n_samples) * np.sum(residuals)

    # Augmented assignment on purpose: keeps the in-place semantics.
    w -= alpha * grad_w
    b -= alpha * grad_b
    return w, b

def gradient_descent(alpha, w, b, X_train, Y_train, num_iters=1000, log_every=100):
  """Run repeated gradient-descent steps and return the final (w, b).

  Args:
    alpha: learning rate. NOTE(review): it is accepted for interface
      compatibility but not forwarded; the step function
      mean_squared_error_cost_func_deriv reads the module-level `alpha`.
    w: initial weight vector.
    b: initial bias.
    X_train: 2-D array of samples, one row per sample.
    Y_train: targets, one per row of X_train.
    num_iters: number of update steps (default 1000, the previous fixed value).
    log_every: print progress every this many iterations (default 100);
      0 or None disables logging.

  Returns:
    (w, b) as returned by the last step.
  """
  # Positional view of the targets for the logged metric.
  targets = np.asarray(Y_train)
  for i in range(num_iters):
    w, b = mean_squared_error_cost_func_deriv(X_train, Y_train, w, b)
    if log_every and i % log_every == 0:
      # Include the bias so the logged error is that of the actual model.
      preds = np.dot(X_train, w) + b
      MSE = np.mean((preds - targets)**2)
      # Report the local MSE; the lowercase name `mse` is not defined here.
      print(f"Iteration {i}, MSE: {MSE}, w: {w}, b: {b}")
  return w, b

# Train with the vectorized step defined in this cell. That step updates the
# arrays it is given in place, so w and b themselves change during this call.
W, B = gradient_descent(alpha=alpha, w=w, b=b, X_train=X_train, Y_train=Y_train)

def predict(x, W, B):
  """Predict one value per row of x with the linear model x.W + B.

  Vectorized: a single matrix-vector product replaces the per-row loop, and
  no size-1 array is assigned into a scalar slot (a conversion NumPy has
  deprecated).

  Args:
    x: 2-D array of samples, one row per sample.
    W: weight vector with one entry per column of x.
    B: bias (scalar or length-1 array).

  Returns:
    1-D float array with x.shape[0] predictions.
  """
  scores = np.dot(x, W) + B
  # Normalise to a flat float vector regardless of B being a scalar or (1,).
  return np.asarray(scores, dtype=float).reshape(x.shape[0])


# Predictions on the held-out split using the parameters trained above.
y_pred = predict(x=X_test, W=W, B=B)

In [None]:
from sklearn.metrics import mean_squared_error
# Test-set error of the trained model (targets were rescaled by 1e6 earlier,
# so the error is in those rescaled units), followed by the raw targets, the
# predictions and the learned parameters.
y_true = Y_test.values
mse = mean_squared_error(y_true, y_pred)
print("Mean Squared Error:", mse)
print(y_true, "BREAK", y_pred, sep="\n")
print(W, B)

# Scatter the first (standardized) feature column against the training
# targets and overlay the model's line along that feature.
X_feature = X_train[:, 0]
x_range = np.linspace(X_feature.min(), X_feature.max(), 100)

# Use the trained parameters W / B. The initial-parameter globals w / b only
# hold trained values when the training step happened to update them in place.
# The line shows the prediction as this feature varies with every other
# feature held at 0.
y_line = W[0] * x_range + B

fig, ax = plt.subplots()
ax.scatter(X_feature, Y_train, color='blue', label='Training Data')
ax.plot(x_range, y_line, color='red', label='Linear Regression Line')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Y')
ax.set_title('Linear Regression (Feature 1)')
ax.legend()
plt.show()
