# Linear Regression Practice 

This noteook will implement gradient descent using multivariate linear regression to predict wine quality. It will provide a full walkthrough of how I implemented the linear regression model, followed by an analysis of the performance of the model. 

In [1]:
# Scientific and vector computation for python
import numpy as np

# Plotting library
from tabulate import tabulate
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D  # needed to plot 3-D surfaces

# tells matplotlib to embed plots within the notebook
%matplotlib inline

### Loading the Data and perform feature scaling

First, we load the data and use feature normalization to ensure gradient descent converges much more quickly

In [2]:
# Load data
data = np.loadtxt(open("data/wineQuality.csv", "rb"), delimiter=",", skiprows=1)

# Create feature matrix and output variables
# Here, X denotes the feature matrix and y is the output
X = data[:, :-1]
y = data[:, -1]

# print out some data points
sample = data[:10, :]
table = [column for column in sample]
print(tabulate(table, headers=["X[0]", "X[1]", "X[2]", "X[3]", "X[4]", "X[5]", "X[6]", "X[7]", "X[8]", "X[9]", "X[10]", "quality (1-10)" ]))


  X[0]    X[1]    X[2]    X[3]    X[4]    X[5]    X[6]    X[7]    X[8]    X[9]    X[10]    quality (1-10)
------  ------  ------  ------  ------  ------  ------  ------  ------  ------  -------  ----------------
   7.4    0.7     0        1.9   0.076      11      34  0.9978    3.51    0.56      9.4                 5
   7.8    0.88    0        2.6   0.098      25      67  0.9968    3.2     0.68      9.8                 5
   7.8    0.76    0.04     2.3   0.092      15      54  0.997     3.26    0.65      9.8                 5
  11.2    0.28    0.56     1.9   0.075      17      60  0.998     3.16    0.58      9.8                 6
   7.4    0.7     0        1.9   0.076      11      34  0.9978    3.51    0.56      9.4                 5
   7.4    0.66    0        1.8   0.075      13      40  0.9978    3.51    0.56      9.4                 5
   7.9    0.6     0.06     1.6   0.069      15      59  0.9964    3.3     0.46      9.4                 5
   7.3    0.65    0        1.2   0.065      15

In [3]:
# This function returns a normalized version of the feature matrix
def featureNormalization(featureMatrix=X):
    # create mu and sigma vector
    # mu[x] denotes the mean value of column x
    # sigma[x] denotes the standard deviation of column x
    X_normalized = X.copy()
    mu = np.zeros(X.shape[1])
    sigma = np.zeros(X.shape[1])
    
    # set the values of mu and sigma
    mu = np.mean(X, axis = 0)
    sigma = np.std(X, axis = 0)
    X_normalized = (X - mu) / sigma
    
    # return normalized feature matrix, mu and sigma vector
    return X_normalized, mu, sigma

In [4]:
# call featureNormalization on the data
X_normalized, mu, sigma = featureNormalization(X)

print("Computed mean vector: ", mu)
print("\nComputed sigma vector: ", sigma)

Computed mean vector:  [ 8.31963727  0.52782051  0.27097561  2.5388055   0.08746654 15.87492183
 46.46779237  0.99674668  3.3111132   0.65814884 10.42298311]

Computed sigma vector:  [1.74055180e+00 1.79003704e-01 1.94740214e-01 1.40948711e+00
 4.70505826e-02 1.04568856e+01 3.28850367e+01 1.88674370e-03
 1.54338181e-01 1.69453967e-01 1.06533430e+00]


Finally, before we use the feature matrix to compute the cost function, we must add the intercept term

In [5]:
# Add intercept term to X
m = y.size
X = np.concatenate([np.ones((m, 1)), X_normalized], axis=1)


### Cost Function

Next, we must implement the cost function for our multivariate linear regression model. This function computes the average of all the results of our linear hypothesis with inputs from our feature matrix compared to the actual output of our dataset. 

In [6]:
# Cost function for multivariate linear regression
# Parameters: X = feature matrix, y = output, theta = parameter vector
# Returns: cost = the computed cost of fitting data points using theta, the parameter vector
def costFunction(X, y, theta):
    
    # number of training examples
    n = y.shape[0]
    cost = 0
    
    # hypothesis
    hypothesis = np.dot(X, theta)
    # vectorized implementation of cost
    cost = (1/(2*m)) * np.dot((hypothesis - y).T, (hypothesis - y))
    
    return cost

### Gradient Descent Algorithm

Using the cost function defined above, we now implement the gradient descent algorithm to train the model to find the optimal values for our parameter vector.

In [8]:
# Gradient Descent algorithm
# Parameters: X
# Returns: theta: The learned parameter vector
#          costVector: a list containign the cost function after each iteration
def gradientDescent(X, y, theta, alpha, iterations):
    
    # copy the theta vector to be updated by gradient descent
    theta = theta.copy()
    costVector = []
    
    for i in range(iterations):
        theta = theta - (alpha / m) * (np.dot(X, theta) - y).dot(X)
        costVector.append(costFunction(X, y, theta))
    
    return theta, costVector

alpha = 0.01
iterations = 1000

theta = np.zeros(X[1].shape)
theta, costVector = gradientDescent(X, y, theta, alpha, iterations)
print(costVector)

[15.888342020222296, 15.574855653509339, 15.267665735762387, 14.966644639723452, 14.671667376140315, 14.382611536842878, 14.099357239166116, 13.821787071681936, 13.549786041203404, 13.283241521026152, 13.022043200373162, 12.76608303500994, 12.515255198998709, 12.269456037560781, 12.028584021017817, 11.79253969978332, 11.561225660376763, 11.334546482433618, 11.112408696685664, 10.894720743886268, 10.681392934656685, 10.472337410229898, 10.267468104069119, 10.066700704339317, 9.86995261721002, 9.677142930969074, 9.48819238092714, 9.303023315093547, 9.121559660604586, 8.943726890885985, 8.769451993531794, 8.598663438882358, 8.431291149284698, 8.267266469018992, 8.10652213487525, 7.94899224736493, 7.7946122425524615, 7.643318864492245, 7.495050138256741, 7.3497453435422635, 7.207344988838791, 7.067790786150881, 6.931025626257067, 6.79699355449539, 6.665639747062902, 6.536910487817688, 6.410753145571811, 6.287116151864198, 6.165948979202615, 6.047202119764173, 5.930827064544132, 5.816776282