In [46]:
import copy
import plotly
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import sklearn
from   sklearn import linear_model


# **Regression revisited: 1 independent, 2 dependent**

Last time we did linear regression with 1 independent variable, x, and 1 dependent, y.  Now let's add a second dependent variable, z.

### We'll reuse this function from last time, to plot point clouds and their means.



In [49]:
def plot_cloud_with_mean(A,
                         point_cloud_name='',
                         plot_title='',
                         axis_limits=None,
                         return_plotting_data=False,
                         show_plot=True):
  """
  Build (and optionally display) a 3D scatter plot of a point cloud and its mean.

  :param A: array whose rows are points; columns 0, 1, 2 are read as x, y, z
  :param point_cloud_name: text appended to the legend labels of both traces
  :param plot_title: title given to the figure layout
  :param axis_limits: optional range (e.g. [-8, 8]) applied to all three axes;
    when given, the scene is also drawn with a cube aspect mode
  :param return_plotting_data: if True, return the trace list and the layout
  :param show_plot: if True, create the figure and display it
  :return: (data, layout) when return_plotting_data is True, otherwise None
  """
  # COORDINATES OF THE MEAN POINT (ONE VALUE PER COLUMN OF A)
  center_x, center_y, center_z = A.mean(axis=0)

  # BLUE MARKERS: THE POINT CLOUD ITSELF
  cloud_markers = go.Scatter3d(
      x=A[:, 0],
      y=A[:, 1],
      z=A[:, 2],
      mode='markers',
      marker={'size': 10, 'color': 'blue'},
      showlegend=True,
      name="point cloud" + point_cloud_name,
  )

  # RED MARKER: THE MEAN OF THE CLOUD
  mean_marker = go.Scatter3d(
      x=[center_x],
      y=[center_y],
      z=[center_z],
      mode='markers',
      marker={'size': 10, 'color': 'red'},
      showlegend=True,
      name=f'mean(point cloud{point_cloud_name})',
  )

  # ALL TRACES, IN DRAWING ORDER
  data = [cloud_markers, mean_marker]

  # LAYOUT: ALWAYS A TITLE; A FIXED CUBIC SCENE ONLY WHEN LIMITS WERE SUPPLIED
  layout_options = {'title': plot_title}
  if axis_limits is not None:
    layout_options['scene'] = {
        'xaxis': {'range': axis_limits},
        'yaxis': {'range': axis_limits},
        'zaxis': {'range': axis_limits},
        'aspectmode': 'cube',
    }
  layout = go.Layout(**layout_options)

  # DISPLAY ON REQUEST
  if show_plot:
    go.Figure(data=data, layout=layout).show()

  # HAND BACK THE RAW PLOTTING DATA ON REQUEST
  return (data, layout) if return_plotting_data else None


## Step 1: Player 1 chooses a point cloud

For convenience let's take the one we've already been working with.

In [51]:
# BUILD THE POINT CLOUD: ONE ROW PER POINT, COLUMNS ARE (X, Y, Z).
# EVERY COORDINATE OF THE BASE POINTS BELOW IS SHIFTED UP BY 2.
base_points = np.array([[ 3,  2,  4],
                        [ 2, -1,  2],
                        [-5, -4, -2],
                        [-6, -2, -4]])

# AS BEFORE, WORK WITH FLOATS RATHER THAN INTEGERS
A = (2 + base_points).astype(float)

# DRAW THE CLOUD AND ITS MEAN, WITH EVERY AXIS RUNNING FROM -8 TO 8
plot_cloud_with_mean(A, axis_limits=[-8, 8])


## Step 2: Player 1 draws a line parallel to the x-axis, passing through the center point.

In [72]:

# FETCH THE TRACES AND LAYOUT OF THE ORIGINAL PLOT, WITHOUT DISPLAYING IT
data, layout = plot_cloud_with_mean(
    A,
    axis_limits=[-8, 8],
    show_plot=False,
    return_plotting_data=True,
)

# DASHED LINE PARALLEL TO THE X-AXIS, HELD AT THE MEAN Y AND MEAN Z OF THE CLOUD
mean_x, mean_y, mean_z = A.mean(axis=0)
trace_horizontal_line = go.Scatter3d(
    x=[-8, 8],
    y=[mean_y] * 2,
    z=[mean_z] * 2,
    mode='lines',
    line={'color': 'black', 'dash': 'dash'},
    name='average height',
)

# APPEND THE LINE TO THE ORIGINAL TRACES, THEN DISPLAY
data_player1 = [*data, trace_horizontal_line]
fig = go.Figure(data=data_player1, layout=layout)
fig.show()

## Step 3: Player 1 draws a line from each point in the cloud to a point on the dotted line (specifically, the point with the same x-coordinate)

The sum of the squared lengths of these lines is the "starting" quantity that Player 2 wants to reduce.  In this case it's

$$\text{SumOfSquares} = 26.5625 + 4.0625 + 11.5625 + 16.5625 = 58.75$$

In [75]:
# BUILD THE "BASELINE GUESS" CLOUD: KEEP EVERY X-COORDINATE OF A, BUT OVERWRITE
# THE Y-COLUMN WITH mean_y AND THE Z-COLUMN WITH mean_z
B = A.copy()
B[:, 1:3] = A[:, 1:3].mean(axis=0)

# KEEP AN INDEPENDENT COPY OF THIS FOR LATER USE, CALLED B_baseline
B_baseline = B.copy()

# WRITE A FUNCTION TO GENERATE TRACES FOR THE JOINING LINES
def residual_traces(cloud0, cloud1):
  """
  Build one red line segment per point, joining matching rows of two clouds.

  :param cloud0: an m x n matrix whose rows represent points in a point cloud
  :param cloud1: a matrix whose row q is the partner of row q of cloud0
  :return: a list of m line traces; trace q runs from cloud0[q] to cloud1[q]
  (columns 0, 1, 2 are used as x, y, z), and its legend label reports the
  norm of the difference between the two full rows
  """
  segments = []
  for row in range(cloud0.shape[0]):
    start, stop = cloud0[row], cloud1[row]
    # LENGTH OF THE SEGMENT, SHOWN IN THE LEGEND
    length = np.linalg.norm(start - stop)
    segment = go.Scatter3d(
        x=[start[0], stop[0]],
        y=[start[1], stop[1]],
        z=[start[2], stop[2]],
        textposition='top left',
        mode='lines',
        line={'color': 'red'},
        name=f'vertical offset= {length}',
    )
    segments.append(segment)
  return segments

# COMBINE THE PLOT WITH THE VISUAL LINES
data_residual = residual_traces(A,B)
data_player1_vlines = data_player1 + data_residual
fig = go.Figure(data = data_player1_vlines, layout=layout)

# CALCULATE THE SQUARED DISTANCE FROM EACH POINT TO ITS "GUESSED" POINT
# (only the y and z columns are compared; the x-coordinates agree by construction)
differences = [np.linalg.norm(A[row,[1,2]]-B[row,[1,2]]) for row in range(A.shape[0])]
differences_sq = np.array(differences)**2
differences_sq = np.round(differences_sq,4)

# MODIFY TITLE
# List one term per point in the cloud, so that the terms shown in the title
# really do add up to the total (the cloud has more than three points).
squared_terms = '+'.join(f'{term}' for term in differences_sq)
fig.update_layout(title=f'SumOfSquaredLengths = {squared_terms} = {differences_sq.sum()}')

# DISPLAY
fig.show()

# Step 4: Player 2 draws a new line

The line is given by equation
\begin{align*}
y &= m_1 x + b_1 \\
z &= m_2 x + b_2 \\
\end{align*}
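**If the data lay perfectly along the dotted line** then we could think of $m_1$ as the rate at which $y$ increases as we increase $x$.  We think of $b_1$ as the value of $y$ when $x = 0$.  Likewise, $m_2$ is the rate at which $z$ increases as we increase $x$, and $b_2$ is the value of $z$ when $x = 0$.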

Let's eyeball the values for $m_1, m_2, b_1, b_2$.  First rotate the plot so that (i) the x-axis runs left-right across your field of vision, and (ii) the z-axis runs up and down. Then you'll see that the y-coordinates of the blue points tend to increase as the x-coordinate increases.  Let's guess the rate of increase at about 1.  You can use the same rotation trick to guess the rate of increase in the z-coordinate.  Let's choose 1 for that, too.  The y-intercept $b_1$ looks close to 0, and the z-intercept $b_2$ looks close to 2.5.  So we will guess that 
\begin{align*}
y &= 1 x + 0 \\
z &= 1 x + 2.5 \\
\end{align*}
We can rephrase these formulae in terms of matrix vector multiplication:
\begin{align*}
[y,z] = [x] * [1, 1] + [0, 2.5]
\end{align*}
If we have a whole sequence of x-values, $(x_0, ..., x_n)$, we can write the sequence of y- and z-values in almost exactly the same way **by using matrix multiplication**

\begin{align*}
\left[\begin{array}{cc}y_0 & z_0 \\ \vdots & \vdots \\y_n & z_n\end{array}\right]
=
\left[\begin{array}{c}x_0 \\ \vdots \\ x_n\end{array}\right]
* 
\left[\begin{array}{cc}1 & 1\end{array}\right]
+
\left[\begin{array}{cc}0 & 2.5 \\ \vdots & \vdots \\ 0 & 2.5\end{array}\right]
\end{align*}

### Let's write a function to do this so we don't have to do it by hand
Let's write a function that will take a sequence of x-values, and complete it to a sequence of x,y,z values (recording the output as the rows of a matrix)

In [60]:


# WRITE A FUNCTION TO "ADD" Y AND Z COORDINATES TO EACH X

def extend_x_to_xyz(x,slopes=(1,1),intercepts=(0,0)):
  """
  Complete a sequence of x-values to points (x, y, z) on the guessed line
      y = slopes[0] * x + intercepts[0]
      z = slopes[1] * x + intercepts[1]

  Input:
    x          : a list or vector of x values (any shape; it is flattened into
                 a single column)
    slopes     : the slopes of the line; flattened into a row vector, so a
                 list, a tuple, or a one-column array all work
    intercepts : the intercepts of the line; flattened into a row vector
  Output: an array B such that 
    (i ) the first column of B equals x, 
    (ii) each row of B is a point on the line that we have guessed
  """
  # The defaults are tuples rather than lists so that the default values can
  # never be modified by accident between calls.
  x           = np.array(x).astype(float).reshape(-1,1)     # convert x to a column vector of floats
  slopes      = np.array(slopes).reshape(1,-1)              # convert the slopes into a row vector
  intercepts  = np.array(intercepts).reshape(1,-1)          # convert the intercepts into a row vector
  yz          = np.matmul(x, slopes) + intercepts           # APPLY THE MATRIX FORMULA (intercepts are added to every row)
  xyz         = np.concatenate((x,yz),axis=1)               # concatenate the x and yz matrices
  return xyz

These equations determine a line.  For each point in our cloud, let's find the point on this line that shares the same x-coordinate.  How do we do this?  Well, pick a point $(x_i, y_i, z_i)$.  We already know that the x-coordinate should be $x_i$, *and* we have formulas for y and z!



In [61]:
# X-COORDINATES OF THE CLOUD, KEPT AS A COLUMN
x_coords = A[:,[0]]

# FOR EACH X, THE POINT ON THE GUESSED LINE (y = 1x + 0, z = 1x + 2.5)
xyz_coords_guesscloud = extend_x_to_xyz(x_coords, slopes=[1, 1], intercepts=[0, 2.5])

# SHOW EACH ARRAY UNDER ITS LABEL
for label, values in [("x coordinates", x_coords),
                      ("xyz coordinates", xyz_coords_guesscloud)]:
  print(label)
  display(values)

x coordinates


array([[ 5.],
       [ 4.],
       [-3.],
       [-4.]])

xyz coordinates


array([[ 5. ,  5. ,  7.5],
       [ 4. ,  4. ,  6.5],
       [-3. , -3. , -0.5],
       [-4. , -4. , -1.5]])

### Let's plot.



In [62]:
# PLOT

# RECALCULATE THE POINTS ON THE GUESSED LINE
x_coords = A[:,[0]]
xyz_coords_guesscloud = extend_x_to_xyz( x_coords, 
                                        slopes=[1,1], 
                                        intercepts=[0,2.5])

# GET THE PLOTTING DATA AND LAYOUT FOR THE ORIGINAL POINT CLOUD
data, layout = plot_cloud_with_mean(A, axis_limits = [-8,8], return_plotting_data=True, show_plot=False)

# CREATE A NEW DASHED LINE (WE'LL DO THIS BY JOINING TWO POINTS)
line_endpoints_x = [-8,8]
line_endpoints_xyz = extend_x_to_xyz(   line_endpoints_x, 
                                        slopes=[1,1], 
                                        intercepts=[0,2.5])
# This is Player 2's guessed line, not the horizontal "average height" line,
# so it gets its own legend label.
trace_guess_line = go.Scatter3d(   
                            x=line_endpoints_xyz[:,0], 
                            y=line_endpoints_xyz[:,1],
                            z=line_endpoints_xyz[:,2],
                            mode='lines',
                            line=dict(color='black',dash='dash'),
                            name='guessed line')
data_player2 = data + [trace_guess_line]

# COMBINE THE PLOT WITH THE OFFSET LINES
vline_traces_player2 = residual_traces(A,xyz_coords_guesscloud)
data_player2 = data_player2 + vline_traces_player2

# CREATE FIGURE
fig = go.Figure(data = data_player2, layout=layout)

# CALCULATE THE SQUARED DISTANCE FROM EACH POINT TO ITS "GUESSED" POINT
# (only the y and z columns are compared; the x-coordinates agree by construction)
differences = [np.linalg.norm(A[row,[1,2]]-xyz_coords_guesscloud[row,[1,2]]) for row in range(A.shape[0])]
differences_sq = np.array(differences)**2
differences_sq = np.round(differences_sq,4)

# MODIFY TITLE
# List one term per point in the cloud, so that the terms shown in the title
# really do add up to the total (the cloud has more than three points).
squared_terms = '+'.join(f'{term}' for term in differences_sq)
fig.update_layout(title=f'SumOfSquaredLengths = {squared_terms} = {differences_sq.sum()}')

# DISPLAY
fig.show()

This is better - but still not best!

# Step 5: Linear regression parachutes into the game, and wins!

In [71]:
# FORMAT THE DATA
# Why the extra brackets?  They keep each selection two-dimensional: the package
# we use to run linear regression wants the inputs as a matrix with one row per
# point (here, a single *column vector* of x values).
x_column = A[:,[0]]
# The two dependent variables, y and z, side by side: one row per point, two columns.
yz_matrix = A[:,[1,2]]

# FORMAT THE REGRESSOR
reg       =   sklearn.linear_model.LinearRegression()
reg.fit(x_column, yz_matrix)

# EXTRACT SLOPE AND INTERCEPT
# With two dependent variables there is one slope and one intercept for each
# of y and z; extend_x_to_xyz reshapes both into row vectors before using them.
slopes_linreg     =   reg.coef_
intercepts_linreg =   reg.intercept_

# GET THE PROJECTED POINT CLOUD
x_coords = A[:,[0]]
xyz_coords_guesscloud = extend_x_to_xyz(  x_coords, 
                                          slopes=slopes_linreg, 
                                          intercepts=intercepts_linreg)

# MAKE A COPY FOR FUTURE USE
B_linreg  =   copy.deepcopy(xyz_coords_guesscloud)

# GET THE PLOTTING DATA AND LAYOUT FOR THE ORIGINAL POINT CLOUD
data, layout = plot_cloud_with_mean(A, axis_limits = [-8,8], return_plotting_data=True, show_plot=False)

# CREATE A NEW DASHED LINE (WE'LL DO THIS BY JOINING TWO POINTS)
fit_line_endpoints_x = np.array([-8,8]).reshape(-1,1)
xyz_coords_dashline = extend_x_to_xyz(  fit_line_endpoints_x, 
                                        slopes=slopes_linreg, 
                                        intercepts=intercepts_linreg)
# Show the two endpoints of the fitted line (one row per endpoint: x, y, z)
print(xyz_coords_dashline)
# This is the regression line, not the horizontal "average height" line,
# so it gets its own legend label.
trace_fit_line            = go.Scatter3d(   
                            x=xyz_coords_dashline[:,0], 
                            y=xyz_coords_dashline[:,1],
                            z=xyz_coords_dashline[:,2],
                            mode='lines',
                            line=dict(color='black',dash='dash'),
                            name='best fit line')
data_player2 = data + [trace_fit_line]

# COMBINE THE PLOT WITH THE VISUAL LINES
vline_traces_player2 = residual_traces(A, xyz_coords_guesscloud)
data_player2 = data_player2 + vline_traces_player2

# CREATE FIGURE
fig = go.Figure(data = data_player2, layout=layout)

# CALCULATE THE SQUARED DISTANCE FROM EACH POINT TO ITS "GUESSED" POINT
# (only the y and z columns are compared; the x-coordinates agree by construction)
differences = [np.linalg.norm(A[row,[1,2]]-B_linreg[row,[1,2]]) for row in range(A.shape[0])]
differences_sq = np.array(differences)**2
differences_sq = np.round(differences_sq,4)

# MODIFY TITLE
# List one term per point in the cloud, so that the terms shown in the title
# really do add up to the total (the cloud has more than three points).
squared_terms = '+'.join(f'{term}' for term in differences_sq)
fig.update_layout(title=f'SumOfSquaredLengths = {squared_terms} = {differences_sq.sum()}')

# DISPLAY
fig.show()

[[-8.         -2.97692308 -4.53846154]
 [ 8.          4.03846154  7.76923077]]


### Notice: the best fit line passes through the mean

We can see this in the example above.  Interestingly, the best fit line *always* passes through the mean (a fact which takes some real work to prove).

# Lessons from this example

* Mathematically, it's similar to the 2d example
  * we use formulas similar to $y = mx + b$
  * we connect "original" points to points on a line
    * in particular, we send each point in the original cloud to the point on the line that shares the same x-coordinate