[Reference](https://medium.com/@hunter-j-phillips/a-simple-introduction-to-multiple-linear-regression-in-python-6f2335d0dcbe)

In [1]:
import torch

torch.manual_seed(5)
torch.set_printoptions(precision=2)

# create ones for the bias | 1000 ones
X0 = torch.ones(1000).reshape(-1,1)

# create values for the first feature | 1000 numbers from -250 to 250
X1 = (500*(torch.rand(1000) - 0.5)).reshape(-1,1)

# create values for the second feature | 1000 numbers from -250 to 250
X2 = (500*(torch.rand(1000) - 0.5)).reshape(-1,1)

# stack data together, X0 = X[:,0], X1 = X[:,1], X2 = X[:,2]
X = torch.hstack((X0, X1,X2))

# normal distribution with a mean of 0 and std of 10
normal = torch.distributions.Normal(loc=0, scale=10)

# output
Y = ((6*X[:,2] + 3*X[:,1] + 2*X[:,0]) + normal.sample(torch.ones(1000).shape)).reshape(-1,1)

In [2]:
import plotly.express as px

fig = px.scatter_3d(x=X[:,1].flatten(),
                    y=X[:,2].flatten(),
                    z=Y.flatten())

fig.update_traces(marker_size=3)
fig.update_layout(scene = dict(xaxis_title='X<sub>1</sub>',
                               yaxis_title='X<sub>2</sub>',
                               zaxis_title='Y'))

In [3]:
# split the data
Xtrain, Xtest = X[:800], X[800:]
Ytrain, Ytest = Y[:800], Y[800:]

In [4]:
# line of best fit
def model(w, X):
  """
    Inputs:
      w: array of weights | (num features, 1)
      X: array of inputs  | (n samples, num features)

    Output:
      returns the output of X@w | (n samples, 1)
  """

  return torch.matmul(X, w)

In [5]:
# mean squared error (MSE)
def MSE(Yhat, Y):
  """
    Inputs:
      Yhat: array of predictions | (n samples, 1)
      Y: array of expected outputs | (n samples, 1)
    Output:
      returns the loss of the model, which is a scalar
  """
  return torch.mean((Yhat-Y)**2) # mean((error)^2)

In [6]:
# optimizer
def gradient_descent(w):
  """
    Inputs:
      w: array of weights | (num features, 1)

    Global Variables / Constants:
      X: array of inputs  | (n samples, num features)
      Y: array of expected outputs | (n samples, 1)
      lr: learning rate to scale the gradient

    Output:
      returns the updated weights
  """

  n = X.shape[0]

  return w - (lr * 2/n) * (torch.matmul(-Y.T, X) + torch.matmul(torch.matmul(w.T, X.T), X)).reshape(w.shape)

In [7]:
torch.manual_seed(5)
w = torch.rand(size=(3, 1))
w

tensor([[0.83],
        [0.13],
        [0.91]])

In [8]:
import plotly.graph_objects as go
def plot_model(x1_range, x2_range):
  """
    Inputs:
      x1_range: x1-axis range [low, high]
      x2_range: x2-axis range [low, high]

    Global Variables:
      Xtrain: array of inputs | (n train samples, num features)
      Ytrain: array of expected outputs | (n train samples, 1)
      Xtest:  array of inputs | (n test samples, num features)
      Xtrain: array of expected outputs | (n test samples, 1)

    Output:
      prints plane of best fit
  """

  # meshgrid of possible combinations of (X1, X2)
  X1_plot, X2_plot = torch.meshgrid(torch.arange(x1_range[0], x1_range[1], 5),
                                    torch.arange(x2_range[0], x2_range[1], 5))
  X0_plot = torch.ones(X1_plot.shape)

  # stack together each point (X1, X2) = (X, Y)
  X_plot = torch.hstack((X0_plot.reshape(-1,1),
                         X1_plot.reshape(-1,1),
                         X2_plot.reshape(-1,1)))

  # all possible model predictions (Yhat = Z)
  Yhat = model(w, X_plot)

  # model's plane of best fit
  fig = go.Figure(data=[go.Mesh3d(x=X_plot[:,1].flatten(),
                                  y=X_plot[:,2].flatten(),
                                  z=Yhat.flatten(),
                                  color='orange',
                                  opacity=0.50)])

  # training data
  fig.add_scatter3d(x=Xtrain[:,1].flatten(),
                    y=Xtrain[:,2].flatten(),
                    z=Ytrain.flatten(),
                    mode="markers",
                    marker=dict(size=3),
                    name="train")

  # test data
  fig.add_scatter3d(x=Xtest[:,1].flatten(),
                    y=Xtest[:,2].flatten(),
                    z=Ytest.flatten(),
                    mode="markers",
                    marker=dict(size=3),
                    name="test")

  # name axes
  fig.update_layout(scene = dict(xaxis_title='X<sub>1</sub>',
                                 yaxis_title='X<sub>2</sub>',
                                 zaxis_title='Y'))

  fig.show()

plot_model([-250,250], [-250,250])


torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3526.)



In [9]:
MSE(model(w,Xtrain), Ytrain)

tensor(655078.81)

In [10]:
torch.manual_seed(5)
w = torch.rand(size=(3, 1))

lr = 0.00004
epochs = 50000

# update the weights 1000 times
for i in range(0, epochs):
  # update the weights
  w = gradient_descent(w)

  # print the new values every 10 iterations
  if (i+1) % 10000 == 0:
    print("epoch:", i+1)
    print("weights:", w)
    print("Train MSE:", MSE(model(w,Xtrain), Ytrain))
    print("Test MSE:", MSE(model(w,Xtest), Ytest))
    print("="*10)

plot_model([-250,250], [-250,250])

epoch: 10000
weights: tensor([[1.39],
        [3.00],
        [6.00]])
Train MSE: tensor(99.79)
Test MSE: tensor(93.91)
epoch: 20000
weights: tensor([[1.64],
        [3.00],
        [6.00]])
Train MSE: tensor(99.53)
Test MSE: tensor(94.10)
epoch: 30000
weights: tensor([[1.76],
        [3.00],
        [6.00]])
Train MSE: tensor(99.45)
Test MSE: tensor(94.23)
epoch: 40000
weights: tensor([[1.81],
        [3.00],
        [6.00]])
Train MSE: tensor(99.42)
Test MSE: tensor(94.30)
epoch: 50000
weights: tensor([[1.84],
        [3.00],
        [6.00]])
Train MSE: tensor(99.41)
Test MSE: tensor(94.33)


In [11]:
def NormalEquation(X, Y):
  """
    Inputs:
      X: array of input values | (n samples, num features)
      Y: array of expected outputs | (n samples, 1)

    Output:
      returns the optimized weights | (num features, 1)
  """

  return torch.inverse(X.T @ X) @ X.T @ Y

In [12]:
w = NormalEquation(Xtrain,Ytrain)

In [13]:
MSE(model(w, Xtrain), Ytrain), MSE(model(w, Xtest), Ytest)

(tensor(99.30), tensor(95.36))