In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20.0, 10.0)
from mpl_toolkits.mplot3d import Axes3D

In [0]:
"""cost function to compute the error in our model

features - numpy matrix of all X variables
targets - numpy matrix of all Y variables
coefficients - numpy matrix of the weights of each Xi in X
return - the cost (error) in our model as a floating point value
"""
def _cost(features, targets, coefficients):
  hypothesis = features.dot(coefficients)
  loss = hypothesis - targets
  num_rows = len(targets)
  cost = np.sum(loss ** 2) / (2 * num_rows)
  return cost

In [0]:
def train(features, targets, coefficients, learning_rate, iterations):
  """Training model for gradient descent multivariate linear regression.

  features - numpy matrix of all X variables
  targets - numpy matrix of all Y variables
  coefficients - numpy matrix of the starting weights of each Xi in X
  learning_rate - the gradient descent step size
  iterations - the number of times the model recalculates its coefficients
  return - the final coefficients as an array
  return - a list holding the cost after each iteration (length == iterations)
  """
  # Built up with append so its length always follows `iterations`
  # (the previous pre-sized list referenced an undefined name, `epochs`).
  cost_history = []
  num_rows = len(targets)

  for _ in range(iterations):
    # Predicted target values under the current coefficients
    hypothesis = features.dot(coefficients)
    # Difference between predicted and actual target values
    loss = hypothesis - targets

    # Gradient of the cost: one partial derivative per coefficient
    gradient = features.T.dot(loss) / num_rows
    # Step against the gradient. This rebinds the local name to a new array,
    # so the caller's initial coefficients are left untouched.
    coefficients = coefficients - learning_rate * gradient

    # Record the cost *after* this iteration's update
    cost_history.append(_cost(features, targets, coefficients))

  return coefficients, cost_history

In [0]:
def accuracy(targets, predictions):
  """Computes the root mean square error and R^2 score of our model.

  targets - the actual values from test data
  predictions - the estimated values from the model based off the most recent coefficients
  return - root mean square error as a scalar - lower is better
  return - R^2 score as a scalar - higher is better; 1.0 is a perfect fit and
           the value is negative when the model does worse than always
           predicting the mean of targets

  If every target is identical, the R^2 denominator is zero and the returned
  score is not finite.
  """
  num_rows = len(targets)

  # np.sum reduces over every element, so the results are scalars even when
  # the inputs are (n, 1) column vectors; the builtin sum() would only reduce
  # the first axis and hand back an array in that case.
  sq_diff_from_pred = np.sum((targets - predictions) ** 2)
  sq_diff_from_mean = np.sum((targets - np.mean(targets)) ** 2)

  # Root mean square error of model
  rmse = np.sqrt(sq_diff_from_pred / num_rows)

  # R^2 score of model: 1 - (residual sum of squares / total sum of squares)
  R2_accuracy = 1 - (sq_diff_from_pred / sq_diff_from_mean)

  return rmse, R2_accuracy

In [0]:
# Colab-only step: this import is unavailable outside Google Colab.
# Presumably used to upload student.csv, which the next cell reads from the
# working directory -- confirm before running in another environment.
from google.colab import files
files.upload()

In [12]:
# Load the student scores dataset from student.csv in the working directory
data = pd.read_csv('student.csv')
# Reference: https://mubaris.com/2017/09/28/linear-regression-from-scratch/

# Print the (rows, columns) shape, then preview the first rows as the cell output
print(data.shape)
data.head()

(1000, 3)


Unnamed: 0,Math,Reading,Writing
0,48,68,63
1,62,81,72
2,79,80,78
3,76,83,79
4,59,64,62


In [0]:
# Pull each score column out as a plain numpy array.
# NOTE: `math` shadows the stdlib module name; it is kept because the cells
# below refer to these arrays by these exact names.
math = data['Math'].values
read = data['Reading'].values
write = data['Writing'].values

# Plotting the scores as a 3D scatter plot
fig = plt.figure()
# Create the 3D axes through the figure. Constructing Axes3D(fig) directly no
# longer attaches the axes to the figure in current matplotlib releases, which
# leaves an empty plot.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(math, read, write, color='#ef1234')
ax.set_xlabel('Math')
ax.set_ylabel('Reading')
ax.set_zlabel('Writing')
plt.show()

In [17]:
# Design matrix: an intercept column of ones followed by the two predictors
# (math and reading scores), one row per student.
m = len(math)
x0 = np.ones(m)
X = np.vstack((x0, math, read)).T

# Start every coefficient at zero; the writing score is the regression target
B = np.zeros(3, dtype=int)
Y = np.array(write, copy=True)

# Gradient descent step size
alpha = 0.0001

# Cost of the all-zero model, as a baseline before any training
inital_cost = _cost(X, Y, B)
print(inital_cost)

2470.11


In [30]:
# Run 100000 iterations of gradient descent.
# `train` is the gradient-descent routine defined in this notebook; the name
# `gradient_descent` used here previously is not defined anywhere in it.
newCoeff, cost_history = train(X, Y, B, alpha, 100000)

# New Values of B
print(newCoeff)

# Cost at every 10000th iteration, to show how the cost falls during training
print(cost_history[::10000])

[-0.47889172  0.09137252  0.90144884]
[18.076027940915417, 10.495847089517708, 10.492737077485666, 10.48986767223389, 10.487220259164712, 10.484777663803007, 10.48252404038077, 10.480444769041412, 10.478526360996828, 10.47675637102204]


In [34]:
# Predict with the fitted coefficients on the same X used for training,
# then report RMSE and R^2 on separate lines.
Y_pred = np.dot(X, newCoeff)
rmse, R2_score = accuracy(targets=Y, predictions=Y_pred)
print(rmse, R2_score, sep='\n')

4.577143972727789
0.9097223273061553
