In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import MinMaxScaler, StandardScaler

warnings.filterwarnings('ignore')

#Import the training data from the file
housing = pd.DataFrame(pd.read_csv("Housing.csv"))
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [49]:
# List of variables to map to numerical values from strings:
varlist = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
           
# Defining the map function
def binary_map(x):
    return x.map({'yes': 1, "no": 0})
       
#Figure out how many datappoints are in the set:
m = len(housing)
print("Total number of data points:", m)

# Applying the function to the housing list
housing[varlist] = housing[varlist].apply(binary_map)

# Check the housing dataframe now
housing.head()


Total number of data points: 545


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [50]:
#Splitting the Data into Training and Testing Sets
from sklearn.model_selection import train_test_split
# We specify this so that the train and test data set always have the same rows, respec
np.random.seed(0)
df_train, df_test = train_test_split(housing, train_size = 0.7, test_size = 0.3, random_state=5)
print("Shape of training set:", df_train.shape)
print("Shape of validation set:", df_test.shape)
m = len(df_train)
k = len(df_test)


Shape of training set: (381, 13)
Shape of validation set: (164, 13)


In [51]:
num_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking','price']
df_Newtrain = df_train[num_vars]
df_Newtest = df_test[num_vars]

#Original columns
x1A = df_Newtrain.values[:, 0] # get input values for area
x2A = df_Newtrain.values[:, 1] # get input values for bedrooms
x3A = df_Newtrain.values[:, 2] # get input values for bathroom
x4A = df_Newtrain.values[:, 3] # get input values for stories
x5A = df_Newtrain.values[:, 4] # get input values for parking
yA = df_Newtrain.values[:, 5] # get output values (price)
print(df_Newtrain.head(), "\n")
#Input matrix creation for normalized part a matrix

#Reshaping and adding a column of ones to represent the "variable" multiplied against theta0 for each input
X_0A = np.ones((m, 1))
X_1A = x1A.reshape(m, 1)
X_2A = x2A.reshape(m, 1)
X_3A = x3A.reshape(m, 1)
X_4A = x4A.reshape(m, 1)
X_5A = x5A.reshape(m, 1)
xA = np.hstack((X_0A, X_1A, X_2A, X_3A, X_4A, X_5A))
print(xA, "\n")

     area  bedrooms  bathrooms  stories  parking    price
122  6254         4          2        1        1  5950000
9    5750         3          2        4        1  9800000
295  2325         3          1        2        0  4200000
299  7000         3          1        1        3  4200000
524  3264         2          1        1        0  2380000 

[[1.000e+00 6.254e+03 4.000e+00 2.000e+00 1.000e+00 1.000e+00]
 [1.000e+00 5.750e+03 3.000e+00 2.000e+00 4.000e+00 1.000e+00]
 [1.000e+00 2.325e+03 3.000e+00 1.000e+00 2.000e+00 0.000e+00]
 ...
 [1.000e+00 3.512e+03 2.000e+00 1.000e+00 1.000e+00 1.000e+00]
 [1.000e+00 6.420e+03 3.000e+00 1.000e+00 1.000e+00 0.000e+00]
 [1.000e+00 5.800e+03 2.000e+00 1.000e+00 1.000e+00 0.000e+00]] 



In [52]:
#Normalize
df_MinMaxA = df_Newtrain
scaler1 = MinMaxScaler()
df_MinMaxA[num_vars] = scaler1.fit_transform(df_MinMaxA[num_vars])
#Normalization columns
x1AM = df_MinMaxA.values[:, 0] # get input values for area
x2AM = df_MinMaxA.values[:, 1] # get input values for bedrooms
x3AM = df_MinMaxA.values[:, 2] # get input values for bathroom
x4AM = df_MinMaxA.values[:, 3] # get input values for stories
x5AM = df_MinMaxA.values[:, 4] # get input values for parking
yAM = df_MinMaxA.values[:, 5] # get output values (price)
print(df_MinMaxA.head(), "\n")

#Input matrix creation for normalized part a matrix
#Reshaping and adding a column of ones to represent the "variable" multiplied against theta0 for each input
X_0AM = np.ones((m, 1))
X_1AM = x1AM.reshape(m, 1)
X_2AM = x2AM.reshape(m, 1)
X_3AM = x3AM.reshape(m, 1)
X_4AM = x4AM.reshape(m, 1)
X_5AM = x5AM.reshape(m, 1)
xAM = np.hstack((X_0AM, X_1AM, X_2AM, X_3AM, X_4AM, X_5AM))
print(xAM, "\n")

         area  bedrooms  bathrooms   stories   parking     price
122  0.330036       0.6   0.333333  0.000000  0.333333  0.363636
9    0.293907       0.4   0.333333  1.000000  0.333333  0.696970
295  0.048387       0.4   0.000000  0.333333  0.000000  0.212121
299  0.383513       0.4   0.000000  0.000000  1.000000  0.212121
524  0.115699       0.2   0.000000  0.000000  0.000000  0.054545 

[[1.         0.33003584 0.6        0.33333333 0.         0.33333333]
 [1.         0.29390681 0.4        0.33333333 1.         0.33333333]
 [1.         0.0483871  0.4        0.         0.33333333 0.        ]
 ...
 [1.         0.1334767  0.2        0.         0.         0.33333333]
 [1.         0.34193548 0.4        0.         0.         0.        ]
 [1.         0.29749104 0.2        0.         0.         0.        ]] 



In [53]:
#Standardize
df_StandardA = df_Newtrain
scaler2 = StandardScaler()
df_StandardA[num_vars] = scaler2.fit_transform(df_StandardA[num_vars])
#Standardization Columns
x1AS = df_StandardA.values[:, 0] # get input values for area
x2AS = df_StandardA.values[:, 1] # get input values for bedrooms
x3AS = df_StandardA.values[:, 2] # get input values for bathroom
x4AS = df_StandardA.values[:, 3] # get input values for stories
x5AS = df_StandardA.values[:, 4] # get input values for parking
yAS = df_StandardA.values[:, 5] # get output values (price)
print(df_StandardA.head(), "\n")

#Input matrix creation for normalized part a matrix
#Reshaping and adding a column of ones to represent the "variable" multiplied against theta0 for each input
X_0AS = np.ones((m, 1))
X_1AS = x1AS.reshape(m, 1)
X_2AS = x2AS.reshape(m, 1)
X_3AS = x3AS.reshape(m, 1)
X_4AS = x4AS.reshape(m, 1)
X_5AS = x5AS.reshape(m, 1)
xAS = np.hstack((X_0AS, X_1AS, X_2AS, X_3AS, X_4AS, X_5AS))
print(xAS, "\n")

         area  bedrooms  bathrooms   stories   parking     price
122  0.487017  1.395593   1.413364 -0.922077  0.285155  0.525337
9    0.250209  0.025178   1.413364  2.455916  0.285155  2.491088
295 -1.359050  0.025178  -0.588464  0.203921 -0.858468 -0.368187
299  0.837530  0.025178  -0.588464 -0.922077  2.572402 -0.368187
524 -0.917855 -1.345237  -0.588464 -0.922077 -0.858468 -1.297451 

[[ 1.          0.48701693  1.39559317  1.41336417 -0.92207665  0.28515545]
 [ 1.          0.25020916  0.02517823  1.41336417  2.45591569  0.28515545]
 [ 1.         -1.35904995  0.02517823 -0.58846389  0.2039208  -0.85846799]
 ...
 [ 1.         -0.80133008 -1.34523671 -0.58846389 -0.92207665  0.28515545]
 [ 1.          0.56501314  0.02517823 -0.58846389 -0.92207665 -0.85846799]
 [ 1.          0.273702   -1.34523671 -0.58846389 -0.92207665 -0.85846799]] 



In [55]:
#PROBLEM: These should be a min-max normalized dataset and a non-normalized dataset, but they are both stanadardized

print(df_MinMaxA.head(), "\n")

print (df_Newtrain.head())

         area  bedrooms  bathrooms   stories   parking     price
122  0.487017  1.395593   1.413364 -0.922077  0.285155  0.525337
9    0.250209  0.025178   1.413364  2.455916  0.285155  2.491088
295 -1.359050  0.025178  -0.588464  0.203921 -0.858468 -0.368187
299  0.837530  0.025178  -0.588464 -0.922077  2.572402 -0.368187
524 -0.917855 -1.345237  -0.588464 -0.922077 -0.858468 -1.297451 

         area  bedrooms  bathrooms   stories   parking     price
122  0.487017  1.395593   1.413364 -0.922077  0.285155  0.525337
9    0.250209  0.025178   1.413364  2.455916  0.285155  2.491088
295 -1.359050  0.025178  -0.588464  0.203921 -0.858468 -0.368187
299  0.837530  0.025178  -0.588464 -0.922077  2.572402 -0.368187
524 -0.917855 -1.345237  -0.588464 -0.922077 -0.858468 -1.297451


In [47]:
#Credit for the following compute_cost function:
#Author: Hamed Tabkhi
#Institution: University of North Carolina at Charlotte, ECE dept.
#Accessed on: 9/17/2021
#From: "LinearRegression.pdf"
 
def compute_cost(X, y, theta):
 """
 Compute cost for linear regression.
 Input Parameters
 ----------------
 X : 2D array where each row represent the training example and each column represent
 m= number of training examples
 n= number of features (including X_0 column of ones)
 y : 1D array of labels/target value for each traing example. dimension(1 x m)
 theta : 1D array of fitting parameters or weights. Dimension (1 x n)
 Output Parameters
 -----------------
 J : Scalar value.
 """
 predictions = X.dot(theta)
 errors = np.subtract(predictions, y)
 sqrErrors = np.square(errors)
 J = 1 / (2 * m) * np.sum(sqrErrors)
 return J

In [46]:
#This cost function adds parameter penalties
def regularized_compute_cost(X, y, theta, lamda):
    """
    Compute cost for linear regression.
    Input Parameters
    ----------------
    X : 2D array where each row represent the training example and each column represent
    m= number of training examples
    n= number of features (including X_0 column of ones)
    y : 1D array of labels/target value for each traing example. dimension(1 x m)
    theta : 1D array of fitting parameters or weights. Dimension (1 x n)
    Output Parameters
    lamda: Arbitrary scalar to determine the amount a parameter is penalized.
    -----------------
    J : Scalar value.
    """
    predictions = X.dot(theta)
    errors = np.subtract(predictions, y)
    sqrErrors = np.square(errors)
    sqrThetas = (theta.transpose()).dot(theta)-np.square(theta[0])
    J = 1 / (2 * m) * (np.sum(sqrErrors)+lamda*sqrThetas)
    return J

In [45]:
#Credit for the following gradient_descent function:
#Author: Hamed Tabkhi
#Institution: University of North Carolina at Charlotte, ECE dept.
#Accessed on: 9/17/2021
#From: "LinearRegression.pdf"

def gradient_descent(X, y, theta, alpha, iterations):
 """
 Compute cost for linear regression.
 Input Parameters
 ----------------
 X : 2D array where each row represent the training example and each column represent
 m= number of training examples
 n= number of features (including X_0 column of ones)
 y : 1D array of labels/target value for each traing example. dimension(m x 1)
 theta : 1D array of fitting parameters or weights. Dimension (1 x n)
 alpha : Learning rate. Scalar value
 iterations: No of iterations. Scalar value.
 Output Parameters
 -----------------
 theta : Final Value. 1D array of fitting parameters or weights. Dimension (1 x n)
 cost_history: Conatins value of cost for each iteration. 1D array. Dimansion(m x 1)
 """
 cost_history = np.zeros(iterations)
 for i in range(iterations):
     predictions = X.dot(theta)
     errors = np.subtract(predictions, y)
     sum_delta = (alpha / m) * X.transpose().dot(errors);
     theta = theta - sum_delta;
     cost_history[i] = compute_cost(X, y, theta)
 return theta, cost_history

In [44]:
#My version of the gradient_descent to accomadate a threshold instead of a fixed amount of iterations.
def regularized_gradient_descent(X, y, theta, alpha, iterations_limit, threshold, lamda):
    """
    Compute cost for linear regression.
    Input Parameters
    ----------------
    X : 2D array where each row represent the training example and each column represent
    m= number of training examples
    n= number of features (including X_0 column of ones)
    y : 1D array of labels/target value for each traing example. dimension(m x 1)
    theta : 1D array of fitting parameters or weights. Dimension (1 x n)
    alpha : Learning rate. Scalar value
    iterations_limit: Sets a maximum on how many iterations can be performed. Scalar value.
    threshold: Stops the descent once cost falls below this values. Scalar value.
    Output Parameters
    -----------------
    theta : Final Value. 1D array of fitting parameters or weights. Dimension (1 x n)
    cost_history: Conatins value of cost for each iteration. 1D array. Dimansion(m x 1)
    iterations: the total iterations of gradient descent accumulated. Scalar Values
    """
    iterations = 0
    deviation = 10000000 #Assume enourmous deviation(cost) to begin with
    cost_history = np.empty([0,1])
    
    while iterations<iterations_limit and deviation>threshold:
        oldTheta = theta;
        predictions = X.dot(theta)
        errors = np.subtract(predictions, y)
        sum_delta = (alpha / m) * X.transpose().dot(errors);
        
        theta = theta*(1-alhpa*lamda/m) - sum_delta;
        cost_history = np.vstack((cost_history, compute_cost(X, y, theta)))
        iterations+=1
        thetadiff = theta-oldTheta
        deviation = np.sqrt((thetadiff.transpose()).dot(thetadiff))
        #print(deviation)
    
    cost_history = cost_history.transpose()

    return theta, cost_history, iterations

In [43]:
#This versions of gradient descent introduces parameter penalties
def regularized_gradient_descent(X, y, theta, alpha, iterations_limit, threshold):
    """
    Compute cost for linear regression.
    Input Parameters
    ----------------
    X : 2D array where each row represent the training example and each column represent
    m= number of training examples
    n= number of features (including X_0 column of ones)
    y : 1D array of labels/target value for each traing example. dimension(m x 1)
    theta : 1D array of fitting parameters or weights. Dimension (1 x n)
    alpha : Learning rate. Scalar value
    iterations_limit: Sets a maximum on how many iterations can be performed. Scalar value.
    threshold: Stops the descent once cost falls below this values. Scalar value.
    Output Parameters
    -----------------
    theta : Final Value. 1D array of fitting parameters or weights. Dimension (1 x n)
    cost_history: Conatins value of cost for each iteration. 1D array. Dimansion(m x 1)
    iterations: the total iterations of gradient descent accumulated. Scalar Values
    """
    iterations = 0
    deviation = 10000000 #Assume enourmous deviation(cost) to begin with
    cost_history = np.empty([0,1])
    
    while iterations<iterations_limit and deviation>threshold:
        oldTheta = theta;
        predictions = X.dot(theta)
        errors = np.subtract(predictions, y)
        sum_delta = (alpha / m) * X.transpose().dot(errors);
        theta = theta - sum_delta;
        cost_history = np.vstack((cost_history, compute_cost(X, y, theta)))
        
        iterations+=1
        thetadiff = theta-oldTheta
        deviation = np.sqrt((thetadiff.transpose()).dot(thetadiff))
        #print(deviation)
    
    cost_history = cost_history.transpose()

    return theta, cost_history, iterations