In [None]:
# importing libraries
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# print array values in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

In [None]:
# Problem 1 & 2

def compute_loss(X,y,theta):
    """Return the halved mean squared error of the linear model ``X.dot(theta)``.

    Parameters
    ----------
    X : ndarray
        Design matrix, one row per sample.
    y : ndarray
        Target values, one per row of ``X``.
    theta : ndarray
        Parameter vector, one entry per column of ``X``.

    Returns
    -------
    float
        ``sum((X.dot(theta) - y) ** 2) / (2 * len(y))``.
    """
    # Normalise by the size of the set being scored. Relying on a notebook-level
    # sample count would scale a validation loss by the training-set size.
    n_samples = len(y)
    residuals = X.dot(theta) - y  # predictions minus targets
    return np.sum(np.square(residuals)) / (2 * n_samples)

def gradient_descent(X_train,X_test,Y_train,Y_test,theta,alpha,iterations):
    """Fit linear-regression parameters with full-batch gradient descent.

    Each iteration takes one update step on the training set and then records
    the loss of the updated parameters on the training and on the validation
    set (both obtained from ``compute_loss``).

    Parameters
    ----------
    X_train, X_test : ndarray
        Design matrices of the training and validation sets.
    Y_train, Y_test : ndarray
        Target vectors matching ``X_train`` and ``X_test``.
    theta : ndarray
        Starting parameter vector; it is rebound, never modified in place.
    alpha : float
        Learning rate.
    iterations : int
        Number of update steps.

    Returns
    -------
    tuple
        ``(loss_history_train, loss_history_test, theta)`` - the two
        per-iteration loss arrays and the final parameter vector.
    """
    # The sample count comes from the data passed in, so the step size does not
    # depend on whichever notebook cell last assigned a global.
    n_train = len(Y_train)
    loss_history_train = np.zeros(iterations)
    loss_history_test = np.zeros(iterations)
    for i in range(iterations):
        errors = X_train.dot(theta) - Y_train
        # Rebinding (instead of `-=`) leaves the caller's initial array untouched
        theta = theta - (alpha / n_train) * X_train.transpose().dot(errors)
        loss_history_train[i] = compute_loss(X_train,Y_train,theta)
        loss_history_test[i] = compute_loss(X_test,Y_test,theta)
    return loss_history_train, loss_history_test, theta

def binary_map(x):
    """Translate a Series of 'yes'/'no' strings into 1/0 through ``Series.map``.

    Entries that are neither 'yes' nor 'no' have no mapping and come back as NaN.
    """
    yes_no_codes = {'yes': 1, 'no': 0}
    return x.map(yes_no_codes)

In [None]:
# Problem 1.a
# Linear-regression model of the housing price built on the numeric columns only
# Inputs: area, bedrooms, bathrooms, stories, parking

# Load the price plus the five input columns and go straight to a plain array.
# usecols keeps the CSV's own column order, so column 0 is the price only if
# "price" is the first of these columns in the file - TODO confirm.
housing = pd.read_csv(
    "Housing.csv",
    usecols = ["price","area","bedrooms","bathrooms","stories","parking"],
).to_numpy()

# 80/20 training/validation split (the split itself is seeded by random_state)
np.random.seed(0)
train, test = train_test_split(housing, random_state = 42, test_size = 0.2, train_size = 0.8)

m = train.shape[0] # Number of rows in the training set

# Column 0 is used as the output, every remaining column as an input
Y_train, X_train = train[:,0], train[:,1:]
Y_test, X_test = test[:,0], test[:,1:]

# Prepend the all-ones column X0 to X_train and X_test
X0 = np.ones((X_train.shape[0],1))
X_train = np.concatenate((X0, X_train), axis=1)
X0 = np.ones((X_test.shape[0],1))
X_test = np.concatenate((X0, X_test), axis=1)

In [None]:
# Calculating theta
# One parameter per column of the design matrix (all-ones column included); the
# length is read from X_train so it cannot drift from the chosen inputs.
theta = np.zeros(X_train.shape[1])
alpha = 1e-10  # very small step - presumably needed because the inputs are unscaled here
iterations = 1000
loss_history_train, loss_history_test, theta = gradient_descent(X_train,X_test,Y_train,Y_test,theta,alpha,iterations)
print("Theta for X= ", theta)

In [None]:
# Plotting the Loss vs Iterations
# Training and validation loss curves on one set of axes
plt.rcParams['axes.facecolor'] = 'lightyellow'
plt.rcParams['figure.figsize'] = [10, 4]
plt.figure()
for curve, colour, name in (
    (loss_history_train, 'blue', "Training Loss"),
    (loss_history_test, 'red', "Validation Loss"),
):
    plt.plot(curve, color=colour, label=name)
plt.grid(linestyle='-.', linewidth='1')
plt.xlabel("Number of Iterations", color='red')
plt.ylabel("J", color='red')
plt.title("Training and Validation Losses vs Iteration")
plt.legend();

In [None]:
# Problem 1.b
# Linear-regression model of the housing price built on numeric and yes/no columns
# Inputs: area, bedrooms, bathrooms, stories, mainroad, guestroom, basement, hotwaterheating, airconditioning, parking, prefarea

# Load the price plus the eleven input columns.
# usecols keeps the CSV's own column order, so column 0 is the price only if
# "price" is the first of these columns in the file - TODO confirm.
housing = pd.read_csv(
    "Housing.csv",
    usecols = ["price","area","bedrooms","bathrooms","stories","mainroad","guestroom","basement","hotwaterheating","airconditioning","parking","prefarea"],
)

# Encode the yes/no columns as 1/0, then drop down to a plain array
binarylist = ["mainroad","guestroom","basement","hotwaterheating","airconditioning","prefarea"]
housing[binarylist] = housing[binarylist].apply(binary_map)
housing = housing.to_numpy()

# 80/20 training/validation split (the split itself is seeded by random_state)
np.random.seed(0)
train, test = train_test_split(housing, random_state = 42, test_size = 0.2, train_size = 0.8)

m = train.shape[0] # Number of rows in the training set

# Column 0 is used as the output, every remaining column as an input
Y_train, X_train = train[:,0], train[:,1:]
Y_test, X_test = test[:,0], test[:,1:]

# Prepend the all-ones column X0 to X_train and X_test
X0 = np.ones((X_train.shape[0],1))
X_train = np.concatenate((X0, X_train), axis=1)
X0 = np.ones((X_test.shape[0],1))
X_test = np.concatenate((X0, X_test), axis=1)

In [None]:
# Calculating theta
# One parameter per column of the design matrix (all-ones column included); the
# length is read from X_train so it cannot drift from the chosen inputs.
theta = np.zeros(X_train.shape[1])
alpha = 1e-10  # very small step - presumably needed because the inputs are unscaled here
iterations = 1000
loss_history_train, loss_history_test, theta = gradient_descent(X_train,X_test,Y_train,Y_test,theta,alpha,iterations)
print("Theta for X=: ", theta)

In [None]:
# Plotting the Loss vs Iterations
# Training and validation loss curves on one set of axes
plt.rcParams['figure.figsize'] = [10, 4]
plt.figure()
for curve, colour, name in (
    (loss_history_train, 'blue', "Training Loss"),
    (loss_history_test, 'red', "Validation Loss"),
):
    plt.plot(curve, color=colour, label=name)
plt.grid(linestyle='-.', linewidth='1')
plt.xlabel("Number of Iterations", color='red')
plt.ylabel("J", color='red')
plt.title("Training and Validation Losses vs Iteration")
plt.legend();

In [None]:
# Problem 2.a
# Repeat 1.a with scaled data (standardization and normalization)

# Reads the price and the five numeric inputs and converts them to an array
housing = pd.read_csv("Housing.csv", usecols = ["price","area","bedrooms","bathrooms","stories","parking"])
housing = housing.to_numpy()

# Splits the training/validation set
np.random.seed(0)
train, test = train_test_split(housing, train_size = 0.8, test_size = 0.2, random_state = 42)

m = len(train) # Number of rows in the training set

# Feature Scaling
# Each scaler learns its parameters from the training rows only and is then
# applied unchanged to the validation rows. Re-fitting on the validation set
# would scale the two sets differently and make their losses incomparable.
# The whole array is scaled, so the output column (column 0) is scaled too.

# Standardization
scaler = StandardScaler()
std_train = scaler.fit_transform(train)
std_test = scaler.transform(test)

# Normalization (validation values may fall slightly outside [0, 1])
scaler = MinMaxScaler()
norm_train = scaler.fit_transform(train)
norm_test = scaler.transform(test)

# Normalization: Splitting the inputs and output
Y_train_norm = norm_train[:,0]
X_train_norm = norm_train[:,1:]
Y_test_norm = norm_test[:,0]
X_test_norm = norm_test[:,1:]

# Adding X0 to X_train_norm and X_test_norm
X0 = np.ones((len(X_train_norm),1))
X_train_norm = np.hstack((X0, X_train_norm))
X0 = np.ones((len(X_test_norm),1))
X_test_norm = np.hstack((X0, X_test_norm))


# Standardization: Splitting the inputs and output
Y_train_std = std_train[:,0]
X_train_std = std_train[:,1:]
Y_test_std = std_test[:,0]
X_test_std = std_test[:,1:]

# Adding X0 to X_train_std and X_test_std
X0 = np.ones((len(X_train_std),1))
X_train_std = np.hstack((X0, X_train_std))
X0 = np.ones((len(X_test_std),1))
X_test_std = np.hstack((X0, X_test_std))

In [None]:
# Calculating theta (normalized data)
# One parameter per column of the design matrix (all-ones column included); the
# length is read from X_train_norm so it cannot drift from the chosen inputs.
theta = np.zeros(X_train_norm.shape[1])
alpha = 0.01
iterations = 1000
loss_history_train_norm, loss_history_test_norm, theta = gradient_descent(X_train_norm,X_test_norm,Y_train_norm,Y_test_norm,theta,alpha,iterations)
print("Normalization: Theta for X: ", theta)

In [None]:
# Calculating theta (standardized data)
# One parameter per column of the design matrix (all-ones column included); the
# length is read from X_train_std so it cannot drift from the chosen inputs.
theta = np.zeros(X_train_std.shape[1])
alpha = 0.01
iterations = 300
loss_history_train_std, loss_history_test_std, theta = gradient_descent(X_train_std,X_test_std,Y_train_std,Y_test_std,theta,alpha,iterations)
print("Standardization: Theta for X: ", theta)

In [None]:
# Plotting the Loss vs Iterations
# Two figures, drawn identically: normalization first, then standardization
plt.rcParams['figure.figsize'] = [10, 4]
for train_curve, test_curve in (
    (loss_history_train_norm, loss_history_test_norm),
    (loss_history_train_std, loss_history_test_std),
):
    plt.figure()
    plt.plot(train_curve, color='blue', label="Training Loss")
    plt.plot(test_curve, color='red', label="Validation Loss")
    plt.grid(linestyle='-.', linewidth='1')
    plt.xlabel("Number of Iterations", color='red')
    plt.ylabel("J", color='red')
    plt.title("Training and Validation Losses vs Iteration")
    plt.legend()

In [None]:
# Problem 2.b
# Repeat 1.b with scaled data (standardization and normalization)

# Reads the price and the eleven inputs, encoding the yes/no columns as 1/0
housing = pd.read_csv("Housing.csv", usecols = ["price","area","bedrooms","bathrooms","stories","mainroad","guestroom","basement","hotwaterheating","airconditioning","parking","prefarea"])
binarylist = ["mainroad","guestroom","basement","hotwaterheating","airconditioning","prefarea"]
housing[binarylist] = housing[binarylist].apply(binary_map)

# Splits the training/validation set
np.random.seed(0)
train, test = train_test_split(housing, train_size = 0.8, test_size = 0.2, random_state = 42)

m = len(train) # Number of rows in the training set


# Feature Scaling
# Each scaler learns its parameters from the training rows only and is then
# applied unchanged to the validation rows. Re-fitting on the validation set
# would scale the two sets differently and make their losses incomparable.
# The whole table is scaled, so the output column (column 0) is scaled too.

# Standardization
scaler = StandardScaler()
std_train = scaler.fit_transform(train)
std_test = scaler.transform(test)

# Normalization (validation values may fall slightly outside [0, 1])
scaler = MinMaxScaler()
norm_train = scaler.fit_transform(train)
norm_test = scaler.transform(test)

# Normalization: Splitting the inputs and output
Y_train_norm = norm_train[:,0]
X_train_norm = norm_train[:,1:]
Y_test_norm = norm_test[:,0]
X_test_norm = norm_test[:,1:]

# Adding X0 to X_train_norm and X_test_norm
X0 = np.ones((len(X_train_norm),1))
X_train_norm = np.hstack((X0, X_train_norm))
X0 = np.ones((len(X_test_norm),1))
X_test_norm = np.hstack((X0, X_test_norm))


# Standardization: Splitting the inputs and output
Y_train_std = std_train[:,0]
X_train_std = std_train[:,1:]
Y_test_std = std_test[:,0]
X_test_std = std_test[:,1:]

# Adding X0 to X_train_std and X_test_std
X0 = np.ones((len(X_train_std),1))
X_train_std = np.hstack((X0, X_train_std))
X0 = np.ones((len(X_test_std),1))
X_test_std = np.hstack((X0, X_test_std))

In [None]:
# Calculating theta (normalized data)
# One parameter per column of the design matrix (all-ones column included); the
# length is read from X_train_norm so it cannot drift from the chosen inputs.
theta = np.zeros(X_train_norm.shape[1])
alpha = 0.01
iterations = 200
loss_history_train_norm, loss_history_test_norm, theta = gradient_descent(X_train_norm,X_test_norm,Y_train_norm,Y_test_norm,theta,alpha,iterations)
print("Normalization: Theta for X: ", theta)

In [None]:
# Calculating theta (standardized data)
# One parameter per column of the design matrix (all-ones column included); the
# length is read from X_train_std so it cannot drift from the chosen inputs.
theta = np.zeros(X_train_std.shape[1])
alpha = 0.01
iterations = 200
loss_history_train_std, loss_history_test_std, theta = gradient_descent(X_train_std,X_test_std,Y_train_std,Y_test_std,theta,alpha,iterations)
print("Standardization: Theta for X: ", theta)

In [None]:
# Plotting the Loss vs Iterations
# Two figures, drawn identically: normalization first, then standardization
plt.rcParams['figure.figsize'] = [10, 4]
for train_curve, test_curve in (
    (loss_history_train_norm, loss_history_test_norm),
    (loss_history_train_std, loss_history_test_std),
):
    plt.figure()
    plt.plot(train_curve, color='blue', label="Training Loss")
    plt.plot(test_curve, color='red', label="Validation Loss")
    plt.grid(linestyle='-.', linewidth='1')
    plt.xlabel("Number of Iterations", color='red')
    plt.ylabel("J", color='red')
    plt.title("Training and Validation Losses vs Iteration")
    plt.legend()

In [None]:
# Problem 3

def compute_loss_train(X,y,theta,lam=1.0):
    """Return the L2-regularized halved mean squared error of ``X.dot(theta)``.

    ``J = (sum((X.dot(theta) - y) ** 2) + lam * sum(theta[1:] ** 2)) / (2 * len(y))``

    Parameters
    ----------
    X : ndarray
        Design matrix, one row per sample.
    y : ndarray
        Target values, one per row of ``X``.
    theta : ndarray
        Parameter vector, one entry per column of ``X``.
    lam : float, optional
        Weight of the penalty term. The default of 1.0 keeps the weighting the
        squared parameters had before this argument existed.

    Returns
    -------
    float
        The regularized loss.
    """
    n_samples = len(y)  # size of the set being scored, not a notebook-level global
    residuals = X.dot(theta) - y
    # Only theta[1:] is penalised. theta[0] is left out entirely: adding it
    # unsquared lets a negative value lower the loss, and can make it negative.
    penalty = lam * np.sum(np.square(theta[1:]))
    return (np.sum(np.square(residuals)) + penalty) / (2 * n_samples)

def compute_loss_test(X,y,theta):
    """Return the halved mean squared error of ``X.dot(theta)`` with no penalty term.

    Parameters
    ----------
    X : ndarray
        Design matrix, one row per sample.
    y : ndarray
        Target values, one per row of ``X``.
    theta : ndarray
        Parameter vector, one entry per column of ``X``.

    Returns
    -------
    float
        ``sum((X.dot(theta) - y) ** 2) / (2 * len(y))``.
    """
    # Normalise by the size of the set being scored. Relying on a notebook-level
    # sample count would scale a validation loss by the training-set size.
    n_samples = len(y)
    residuals = X.dot(theta) - y
    return np.sum(np.square(residuals)) / (2 * n_samples)

In [None]:
# Problem 3.a
# Repeat problem 2.a

# Reads the price and the five numeric inputs
housing = pd.read_csv("Housing.csv", usecols = ["price","area","bedrooms","bathrooms","stories","parking"])

# Splits the training/validation set
np.random.seed(0)
train, test = train_test_split(housing, train_size = 0.8, test_size = 0.2, random_state = 42)

m = len(train) # Number of rows in the training set

# Feature Scaling
# Each scaler learns its parameters from the training rows only and is then
# applied unchanged to the validation rows. Re-fitting on the validation set
# would scale the two sets differently and make their losses incomparable.
# The whole table is scaled, so the output column (column 0) is scaled too.

# Standardization
scaler = StandardScaler()
std_train = scaler.fit_transform(train)
std_test = scaler.transform(test)

# Normalization (validation values may fall slightly outside [0, 1])
scaler = MinMaxScaler()
norm_train = scaler.fit_transform(train)
norm_test = scaler.transform(test)

# Normalization: Splitting the inputs and output
Y_train_norm = norm_train[:,0]
X_train_norm = norm_train[:,1:]
Y_test_norm = norm_test[:,0]
X_test_norm = norm_test[:,1:]

# Adding X0 to X_train_norm and X_test_norm
X0 = np.ones((len(X_train_norm),1))
X_train_norm = np.hstack((X0, X_train_norm))
X0 = np.ones((len(X_test_norm),1))
X_test_norm = np.hstack((X0, X_test_norm))

# Standardization: Splitting the inputs and output
Y_train_std = std_train[:,0]
X_train_std = std_train[:,1:]
Y_test_std = std_test[:,0]
X_test_std = std_test[:,1:]

# Adding X0 to X_train_std and X_test_std
X0 = np.ones((len(X_train_std),1))
X_train_std = np.hstack((X0, X_train_std))
X0 = np.ones((len(X_test_std),1))
X_test_std = np.hstack((X0, X_test_std))

In [None]:
# Calculating theta (normalized data)
# NOTE(review): this calls the plain gradient_descent, so no penalty term enters
# the update or the recorded losses; compute_loss_train / compute_loss_test are
# not used here - confirm whether Problem 3 was meant to train with regularization.
# The length of theta is read from X_train_norm (one parameter per column).
theta = np.zeros(X_train_norm.shape[1])
alpha = 0.01
iterations = 500
loss_history_train_norm, loss_history_test_norm, theta = gradient_descent(X_train_norm,X_test_norm,Y_train_norm,Y_test_norm,theta,alpha,iterations)
print("Normalization: Theta for X: ", theta)

In [None]:
# Calculating theta (standardized data)
# NOTE(review): this calls the plain gradient_descent, so no penalty term enters
# the update or the recorded losses; compute_loss_train / compute_loss_test are
# not used here - confirm whether Problem 3 was meant to train with regularization.
# The length of theta is read from X_train_std (one parameter per column).
theta = np.zeros(X_train_std.shape[1])
alpha = 0.01
iterations = 200
loss_history_train_std, loss_history_test_std, theta = gradient_descent(X_train_std,X_test_std,Y_train_std,Y_test_std,theta,alpha,iterations)
print("Standardization: Theta for X: ", theta)

In [None]:
# Plotting the Loss vs Iterations
# Two figures, drawn identically: normalization first, then standardization
plt.rcParams['figure.figsize'] = [10, 4]
for train_curve, test_curve in (
    (loss_history_train_norm, loss_history_test_norm),
    (loss_history_train_std, loss_history_test_std),
):
    plt.figure()
    plt.plot(train_curve, color='blue', label="Training Loss")
    plt.plot(test_curve, color='red', label="Validation Loss")
    plt.grid(linestyle='-.', linewidth='1')
    plt.xlabel("Number of Iterations", color='red')
    plt.ylabel("J", color='red')
    plt.title("Training and Validation Losses vs Iteration")
    plt.legend()

In [None]:
# Problem 3.b
# Repeat problem 2.b

# Reads the price and the eleven inputs, encoding the yes/no columns as 1/0
housing = pd.read_csv("Housing.csv", usecols = ["price","area","bedrooms","bathrooms","stories","mainroad","guestroom","basement","hotwaterheating","airconditioning","parking","prefarea"])
binarylist = ["mainroad","guestroom","basement","hotwaterheating","airconditioning","prefarea"]
housing[binarylist] = housing[binarylist].apply(binary_map)

# Splits the training/validation set
np.random.seed(0)
train, test = train_test_split(housing, train_size = 0.8, test_size = 0.2, random_state = 42)

m = len(train) # Number of rows in the training set

# Feature Scaling
# Each scaler learns its parameters from the training rows only and is then
# applied unchanged to the validation rows. Re-fitting on the validation set
# would scale the two sets differently and make their losses incomparable.
# The whole table is scaled, so the output column (column 0) is scaled too.

# Standardization
scaler = StandardScaler()
std_train = scaler.fit_transform(train)
std_test = scaler.transform(test)

# Normalization (validation values may fall slightly outside [0, 1])
scaler = MinMaxScaler()
norm_train = scaler.fit_transform(train)
norm_test = scaler.transform(test)

# Normalization: Splitting the inputs and output
Y_train_norm = norm_train[:,0]
X_train_norm = norm_train[:,1:]
Y_test_norm = norm_test[:,0]
X_test_norm = norm_test[:,1:]

# Adding X0 to X_train_norm and X_test_norm
X0 = np.ones((len(X_train_norm),1))
X_train_norm = np.hstack((X0, X_train_norm))
X0 = np.ones((len(X_test_norm),1))
X_test_norm = np.hstack((X0, X_test_norm))

# Standardization: Splitting the inputs and output
Y_train_std = std_train[:,0]
X_train_std = std_train[:,1:]
Y_test_std = std_test[:,0]
X_test_std = std_test[:,1:]

# Adding X0 to X_train_std and X_test_std
X0 = np.ones((len(X_train_std),1))
X_train_std = np.hstack((X0, X_train_std))
X0 = np.ones((len(X_test_std),1))
X_test_std = np.hstack((X0, X_test_std))

In [None]:
# Calculating theta (normalized data)
# NOTE(review): this calls the plain gradient_descent, so no penalty term enters
# the update or the recorded losses; compute_loss_train / compute_loss_test are
# not used here - confirm whether Problem 3 was meant to train with regularization.
# The length of theta is read from X_train_norm (one parameter per column).
theta = np.zeros(X_train_norm.shape[1])
alpha = 0.01
iterations = 200
loss_history_train_norm, loss_history_test_norm, theta = gradient_descent(X_train_norm,X_test_norm,Y_train_norm,Y_test_norm,theta,alpha,iterations)
print("Normalization: Theta for X: ", theta)

In [None]:
# Calculating theta (standardized data)
# NOTE(review): this calls the plain gradient_descent, so no penalty term enters
# the update or the recorded losses; compute_loss_train / compute_loss_test are
# not used here - confirm whether Problem 3 was meant to train with regularization.
# The length of theta is read from X_train_std (one parameter per column).
theta = np.zeros(X_train_std.shape[1])
alpha = 0.01
iterations = 200
loss_history_train_std, loss_history_test_std, theta = gradient_descent(X_train_std,X_test_std,Y_train_std,Y_test_std,theta,alpha,iterations)
print("Standardization: Theta for X: ", theta)

In [None]:
# Plotting the Loss vs Iterations
# Two figures, drawn identically: normalization first, then standardization
plt.rcParams['figure.figsize'] = [10, 4]
for train_curve, test_curve in (
    (loss_history_train_norm, loss_history_test_norm),
    (loss_history_train_std, loss_history_test_std),
):
    plt.figure()
    plt.plot(train_curve, color='blue', label="Training Loss")
    plt.plot(test_curve, color='red', label="Validation Loss")
    plt.grid(linestyle='-.', linewidth='1')
    plt.xlabel("Number of Iterations", color='red')
    plt.ylabel("J", color='red')
    plt.title("Training and Validation Losses vs Iteration")
    plt.legend()