In [30]:
import copy
import plotly
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import sklearn
from   sklearn import linear_model


# What does the mean of a point cloud look like?

Let's look at a matrix `A` and compute its mean.

In [31]:
#  BUILD THE POINT CLOUD: ONE POINT PER ROW, WITH 2 ADDED TO EVERY ENTRY
A = np.array([[ 3,  2],
              [ 2, -1],
              [-5,  1]]) + 2

#   A WAS BUILT FROM INTEGERS, SO IT HAS int TYPE.  THAT IS A PROBLEM: A FLOAT
#   STORED INTO AN int ARRAY CANNOT KEEP ITS FRACTIONAL PART.  WE DEMONSTRATE
#   ON A COPY (B) SO THAT A ITSELF IS LEFT UNTOUCHED.

B = A.copy()
B[0, 0] = 0.2   # WE ASK FOR 0.2 IN THE TOP LEFT CORNER ...
print(B)        # ... AND THE PRINTOUT SHOWS WHAT WAS ACTUALLY STORED



[[ 0  4]
 [ 4  1]
 [-3  3]]


In [32]:
#   THE 0.2 WE ASSIGNED CAME BACK AS ZERO BECAUSE THE ARRAY HOLDS INTEGERS.
#   TO AVOID THIS IN ALL FUTURE CASES, SWITCH A TO FLOAT TYPE.

A = A.astype(float)

#   AVERAGE OVER THE ROWS (axis=0): THIS GIVES ONE MEAN PER COLUMN
A_mean = np.mean(A, axis=0)

print('Matrix A', A, 'The mean of matrix A', A_mean, sep='\n')

Matrix A
[[ 5.  4.]
 [ 4.  1.]
 [-3.  3.]]
The mean of matrix A
[2.         2.66666667]


We'll think of the rows of `A` as a point cloud.  The mean lies at the "center" of the cloud.

In [33]:
def plot_cloud_with_mean( A, 
                         point_cloud_name = '',
                         plot_title = '', 
                         axis_limits = None, 
                         return_plotting_data = False,
                         show_plot = True):
  """
  Scatter-plot a 2D point cloud (blue markers) together with its mean (one
  red marker).

  :param A: array whose rows are the points; it must have exactly two columns
  (column 0 is plotted as x, column 1 as y)
  :param point_cloud_name: string appended to "point cloud" in both legend labels
  :param plot_title: title of the figure
  :param axis_limits: optional range (eg [-8,8]) used for BOTH the x and the y
  axis; when None the layout sets no axis ranges
  :param return_plotting_data: if True, return the traces and the layout
  :param show_plot: if True, build the figure and display it
  :return: the tuple (data, layout) when return_plotting_data is True, where
  data is the list [cloud trace, mean trace]; otherwise None
  """
  # THE MEAN OF THE CLOUD: AVERAGE X COORDINATE AND AVERAGE Y COORDINATE
  centroid_x, centroid_y = A.mean(axis=0)

  # TRACE FOR THE POINT CLOUD
  cloud_trace = go.Scatter(
      x=A[:,0],
      y=A[:,1],
      mode='markers',
      marker=dict(size=10, color='blue'),
      showlegend=True,
      name="point cloud" + point_cloud_name,
  )

  # TRACE FOR THE MEAN
  mean_trace = go.Scatter(
      x=[centroid_x],
      y=[centroid_y],
      mode='markers',
      marker=dict(size=10, color='red'),
      showlegend=True,
      name=f'mean(point cloud{point_cloud_name})',
  )

  # ALL THE TRACES, CLOUD FIRST
  data = [cloud_trace, mean_trace]

  # LAYOUT: ALWAYS A TITLE, PLUS IDENTICAL X AND Y RANGES WHEN LIMITS ARE GIVEN
  layout_options = dict(title=plot_title)
  if axis_limits is not None:
    layout_options['xaxis'] = dict(range=axis_limits)
    layout_options['yaxis'] = dict(range=axis_limits)
  layout = go.Layout(**layout_options)

  # OPTIONALLY COMBINE TRACES AND LAYOUT INTO A FIGURE AND DISPLAY IT
  if show_plot:
    go.Figure(data=data, layout=layout).show()

  # OPTIONALLY HAND THE TRACES AND THE LAYOUT BACK TO THE CALLER
  return (data, layout) if return_plotting_data else None

plot_cloud_with_mean(A, axis_limits=[-8,8])

# Why does it look that way?

Notice that the mean really does seem to lie at the "center" of the cloud.  Why is this?  To get a better sense of why, let's see what happens when we set all the x (or all the y) coordinates to zero.


In [None]:
#   CLONE A, THEN OVERWRITE THE WHOLE FIRST COLUMN (THE X COORDINATES) WITH 0
B = A.copy()
B[:, 0] = 0

#   PLOT THE FLATTENED CLOUD WITH THE PLOTTING FUNCTION DEFINED EARLIER
plot_cloud_with_mean(B, axis_limits=[-8,8], plot_title='flattened x coordinates')

In [None]:
#   CLONE A, THEN OVERWRITE THE WHOLE SECOND COLUMN (THE Y COORDINATES) WITH 0
B = A.copy()
B[:, 1] = 0

#   PLOT THE FLATTENED CLOUD WITH THE PLOTTING FUNCTION DEFINED EARLIER
plot_cloud_with_mean(B, axis_limits=[-8,8], plot_title='flattened y coordinates')

In essence, the mean vector `A_mean` lies at the center of the point cloud because its x-coordinate is the mean of the x-coordinates, its y-coordinate is the mean of the y-coordinates, etc.

# What happens when you add [-2,-4] to each point in the cloud (ie each row of A)?


You "translate" the cloud!

In [None]:
# TRANSLATE THE CLOUD: ADDING A 1 x 2 ROW VECTOR TO A ADDS IT TO EVERY ROW.
# THE SUM IS A NEW ARRAY, SO A ITSELF IS NOT MODIFIED.
offset_row_vector = np.array([[-2,-4]])
B = A + offset_row_vector

# SHOW THE TWO INPUTS AND THE RESULT
print('matrix A')
display(A)

print('\nrow vector offset')
display(offset_row_vector)

print('\nmatrix A + offset')
display(B)

# PLOT THE ORIGINAL CLOUD, THEN THE TRANSLATED CLOUD
# (EACH CALL ALSO RETURNS ITS TRACES AND LAYOUT, WHICH WE STORE)
data_A = plot_cloud_with_mean(A, axis_limits=[-8,8], point_cloud_name=' (original)', return_plotting_data=True)
data_offset = plot_cloud_with_mean(B, axis_limits=[-8,8], point_cloud_name=' (offset)', return_plotting_data=True)


matrix A


array([[ 5.,  4.],
       [ 4.,  1.],
       [-3.,  3.]])


row vector offset


array([[-2, -4]])


matrix A + offset


array([[ 3.,  0.],
       [ 2., -3.],
       [-5., -1.]])

# What happens when you add a column vector [[-4],[-2],[1]] to each column of A?

Something a bit crazier - play around with the offset column vector to see what happens!

In [None]:
# ADD A 3 x 1 COLUMN VECTOR TO A: ITS i-TH ENTRY IS ADDED TO EVERY ENTRY IN ROW i.
# THE SUM IS A NEW ARRAY, SO A ITSELF IS NOT MODIFIED.
offset_col_vector = np.array([[-4],[-2],[1]])
B = A + offset_col_vector

# SHOW THE TWO INPUTS AND THE RESULT
print('matrix A')
display(A)

print('\ncolumn vector offset')
display(offset_col_vector)

print('\nmatrix A + column offset')
display(B)

# PLOT THE ORIGINAL CLOUD, THEN THE MODIFIED CLOUD
# (EACH CALL ALSO RETURNS ITS TRACES AND LAYOUT, WHICH WE STORE)
data_A = plot_cloud_with_mean(A, axis_limits=[-8,8], point_cloud_name=' (original)', return_plotting_data=True)
data_offset = plot_cloud_with_mean(B, axis_limits=[-8,8], point_cloud_name=' (offset)', return_plotting_data=True)


matrix A


array([[ 5.,  4.],
       [ 4.,  1.],
       [-3.,  3.]])


column vector offset


array([[-4],
       [-2],
       [ 1]])


matrix A + column offset


array([[ 1.,  0.],
       [ 2., -1.],
       [-2.,  4.]])

# What happens when you multiply every entry in the matrix by a constant?

You'll "stretch" and "shrink" the cloud accordingly.

In [None]:
# SCALE THE CLOUD: MULTIPLY EVERY ENTRY OF A BY THE SAME CONSTANT.
# THE PRODUCT IS A NEW ARRAY, SO A ITSELF IS NOT MODIFIED.
constant = 0.5
B = A * constant

print('matrix A')
display(A)

print(f'\nmatrix {constant}*A')
display(B)

# PLOT THE ORIGINAL CLOUD, THE SCALED CLOUD, AND THE SCALED CLOUD HALVED ONCE MORE
plot_cloud_with_mean(A, axis_limits=[-8,8], point_cloud_name=' (original)')
plot_cloud_with_mean(B, axis_limits=[-8,8], point_cloud_name=f' * {constant}')
plot_cloud_with_mean(B/2, axis_limits=[-8,8], point_cloud_name=f' * {constant/2}')


matrix A


array([[ 5.,  4.],
       [ 4.,  1.],
       [-3.,  3.]])


matrix 0.5*A


array([[ 2.5,  2. ],
       [ 2. ,  0.5],
       [-1.5,  1.5]])

# The total variance of a point cloud (ok to skip this and return)

The total variance of a point cloud is the average squared distance from a point in the cloud to the mean of the cloud.

In [None]:
# DISTANCE FROM EVERY POINT (ROW OF A) TO THE MEAN.  pairwise_distances COMPARES
# THE ROWS OF TWO MATRICES, SO THE MEAN IS FIRST RESHAPED INTO A ONE-ROW MATRIX.
distances = sklearn.metrics.pairwise_distances(A, A_mean.reshape(1, -1))
print(distances)

[[3.2829526 ]
 [2.60341656]
 [5.01109879]]


In [None]:
# SQUARE EVERY DISTANCE (AN ENTRY-WISE SQUARE, *NOT* A MATRIX PRODUCT)
distances_squared = np.square(distances)
print(distances_squared)

[[10.77777778]
 [ 6.77777778]
 [25.11111111]]


In [None]:
#  AVERAGE THE SQUARED DISTANCES: THIS AVERAGE IS THE TOTAL VARIANCE
variance = np.mean(distances_squared)
print(f'the point clouds variance is = {variance}')

the point clouds variance is = 14.222222222222223


In [None]:
# ANOTHER ROUTE TO THE TOTAL VARIANCE: TAKE THE VARIANCE OF EACH COORDINATE
# (IE OF EACH COLUMN OF A) SEPARATELY, THEN ADD THOSE VARIANCES UP
variance_alt = np.sum(np.var(A, axis=0))
print(f'the value in the preceding cell should be {variance_alt}')

the value in the preceding cell should be 14.222222222222221


Notice that the values in the preceding two cells aren't quite identical.  This is because numpy makes very tiny numerical errors when it runs computations, and these errors will be slightly different depending on what sequence of operations you perform.  If numpy had perfect precision, the two numbers would be the same.

## Caveat lector

There are multiple **DIFFERENT** definitions of variance out there, and people don't always talk about them.   **Mathematicians** tend to define variance as it is defined above, ie, as 
$$
\frac{\text{SUM(squared distances to the mean)}}
  {\text{number of points}}
$$
This is also the kind of variance that people have in mind when they talk about **variance explained by a predictor (ie function approximator)**, which is why we focus on it.  

**Statisticians** by contrast sometimes define variance as
$$
\frac{\text{SUM(squared distances to the mean)}}
  {\text{number of points - 1}}
$$
The same goes for **standard deviation**: mathematicians divide by $n$,  statisticians (often) divide by $n-1$.

Both groups have their reasons; subtracting 1 from the denominator tends to produce more conservative predictions (which statisticians like), and *not* subtracting one makes everything much easier to reason about and interpret (which mathematicians like).

There is also a definition of variance in terms of certain so-called **covariance matrices**.  This is something different from either of the two preceding quantities.  



Just be sure you know what the other person means when you talk with them about it!  

# But what does total variance LOOK like?


It looks like the average of the squared lengths of the edges in the following diagram.

In [34]:
def plot_cloud_with_variance(A, title='', axis_limits = None):
  """
  Plot a 2D point cloud (blue markers), its mean (one red marker), and a red
  line segment joining every point to the mean, then display the figure.

  :param A: array whose rows are the points; it must have exactly two columns
  (column 0 is plotted as x, column 1 as y)
  :param title: title of the figure
  :param axis_limits: optional range (eg [-8,8]) used for BOTH the x and the y
  axis; when None the layout sets no axis ranges
  :return: None (the figure is shown as a side effect)
  """
  # RECALCULATE THE MEAN X AND Y COORDINATES
  mean_x, mean_y = A.mean(axis=0)
  # GENERATE PLOTTING DATA FOR THE POINT CLOUD  
  trace_cloud = go.Scatter(x=A[:,0], y=A[:,1], mode='markers', marker=dict(size=10,color='blue'), showlegend=True, name = "point cloud")
  # GENERATE PLOTTING DATA FOR THE MEAN (BUILT EXACTLY ONCE)
  trace_mean = go.Scatter(x=[mean_x], y=[mean_y], mode='markers', showlegend=True, marker=dict(size=10,color='red'), name = "mean(point cloud)")
  # GENERATE PLOTTING DATA FOR THE "RESIDUAL LINES": ONE SEGMENT PER POINT,
  # RUNNING FROM THAT POINT TO THE MEAN (KEPT OUT OF THE LEGEND)
  traces_residual = [ go.Scatter( x=[A[p,0], mean_x], 
                                  y=[A[p,1], mean_y],
                                  mode='lines',
                                  line=dict(color='red'),
                                  showlegend=False)
                     for p in range(A.shape[0])
                    ]
  # COMBINE ALL THE TRACES INTO A SINGLE LIST
  data = [trace_cloud, trace_mean] + traces_residual
  # GENERATE A "LAYOUT" OBJECT
  if axis_limits is None:
    layout = go.Layout(title=title)
  else:
    layout = go.Layout(title=title, xaxis=dict(range=axis_limits), yaxis=dict(range=axis_limits))
  # COMBINE THE PLOTTING DATA WITH THE LAYOUT OBJECT, AND PLOT  
  fig = go.Figure(data = data,layout = layout)
  fig.show()

plot_cloud_with_variance(A,title='variance = AVERAGE(squared-length of the red lines)',axis_limits=[-8,8])

# **Translation preserves variance**

## Because translation preserves distance to the mean!

Recall that "translating" means adding a constant offset row to each row of the matrix.

Play around with this picture to convince yourself (visually) that adding an offset row doesn't change the variance.

In [None]:
# TRANSLATE THE CLOUD: ADDING A 1 x 2 ROW VECTOR TO A ADDS IT TO EVERY ROW.
# THE SUM IS A NEW ARRAY, SO A ITSELF IS NOT MODIFIED.
offset_row_vector = np.array([[-2,-4]])
B = A + offset_row_vector

# SHOW THE TWO INPUTS AND THE RESULT
print('matrix A')
display(A)

print('\nrow vector offset')
display(offset_row_vector)

print('\nmatrix A + offset')
display(B)

# PLOT THE TRANSLATED CLOUD TOGETHER WITH THE LINES JOINING ITS POINTS TO ITS MEAN
plot_cloud_with_variance(B, axis_limits=[-8,8], title='variance = AVERAGE(squared-length of the red lines)')

matrix A


array([[ 5.,  4.],
       [ 4.,  1.],
       [-3.,  3.]])


row vector offset


array([[-2, -4]])


matrix A + offset


array([[ 3.,  0.],
       [ 2., -3.],
       [-5., -1.]])

# Exercises

Convince yourself with pictures that the following operations **can** change the variance of a point cloud

* scaling the point cloud by a constant
* adding a constant column vector to each column

Convince yourself with pictures that the following operations **do not** change the variance of a point cloud

* adding a constant row to each row
* (extra) rotating the cloud in the plane

# **The Regression Game**


Here's a game people like to play.  We're going to draw some points and (red) lines in the plane.  Player 1 will give the points (no rules on how - they can be anywhere) and draw some vertical lines (according to a certain formula).


Player 2 will then draw some vertical lines (also according to a set of rules).  **The goal of Player 2 is to minimize the sum of the squared lengths of these lines.**  There is a jackpot at the end, and Player 2 gets to keep X% of the pot, where X is the percent by which Player 2 reduced the sum of squares.  


Let's see how this plays out in practice. 

## Step 1: Player 1 chooses a point cloud

For convenience let's take a cloud almost identical to the one we've been working with (only the third point is different).

In [35]:
# DEFINE THE ARRAY AND, AS BEFORE, CONVERT IT TO FLOAT TYPE RIGHT AWAY
A = (2 + np.array([[ 3,  2],
                   [ 2, -1],
                   [-5, -4]])).astype(float)

# PLOT THE CLOUD
plot_cloud_with_mean(A, axis_limits=[-8,8])


## Step 2: Player 1 draws a horizontal line whose height is the mean height of the cloud.

In [36]:

# TRACES AND LAYOUT OF THE PLAIN CLOUD-WITH-MEAN PLOT (NOT DISPLAYED YET)
data, layout = plot_cloud_with_mean(A, show_plot=False, return_plotting_data=True, axis_limits=[-8,8])

# DASHED HORIZONTAL LINE WHOSE HEIGHT IS THE MEAN OF THE Y COORDINATES;
# IT SPANS THE SAME X RANGE AS THE AXIS LIMITS
mean_y = A[:,1].mean()
trace_horizontal_line = go.Scatter(
    x=[-8, 8],
    y=[mean_y] * 2,
    mode='lines',
    line=dict(color='black', dash='dash'),
    name='average height',
)

# ADD THE LINE TO THE TRACES, THEN PLOT
data_player1 = [*data, trace_horizontal_line]
fig = go.Figure(data=data_player1, layout=layout)
fig.show()

## Step 3: Player 1 draws a vertical line from each point to the horizontal mean line.

The sum of squares of these lines is the "starting" quantity that Player 2 wants to reduce.  In this case it's

$$\text{SumOfSquares} = 3^2 + 0^2 + 3^2 = 18$$

In [37]:
# FLATTEN A COPY OF THE CLOUD ONTO THE MEAN LINE: THE X COORDINATES ARE KEPT,
# AND EVERY Y COORDINATE IS REPLACED BY THE MEAN OF THE Y COORDINATES
B = A.copy()
B[:,1] = A[:,1].mean()

# KEEP A SEPARATE COPY UNDER THE NAME B_baseline FOR LATER USE
B_baseline = B.copy()

# A FUNCTION THAT GENERATES THE TRACES FOR THE VERTICAL LINES
def residual_traces(cloud0, cloud1):
  """
  Build one red line segment per point, joining each point of cloud0 to the
  point stored in the same row of cloud1.

  :param cloud0: an m x n matrix whose rows represent points in a point cloud
  :param cloud1: an m x n matrix whose rows represent points in a point cloud 
  :return: a list of m traces; trace q joins row q of cloud0 to row q of
  cloud1, using column 0 as x and column 1 as y.  The legend label of each
  trace reports the Euclidean distance between the two rows, which is the
  vertical offset whenever the two points differ only in their y coordinate.
  """
  data_residual = []
  for q in range(cloud0.shape[0]):
    start_point, end_point = cloud0[q], cloud1[q]
    # LENGTH OF THE SEGMENT, SHOWN IN THE LEGEND
    segment_length = np.linalg.norm(start_point - end_point)
    segment = go.Scatter(
        x=[start_point[0], end_point[0]],
        y=[start_point[1], end_point[1]],
        textposition='top left',
        mode='lines',
        line=dict(color='red'),
        name=f'vertical offset= {segment_length}',
    )
    data_residual.append(segment)
  return data_residual

# COMBINE THE PLOT WITH THE VERTICAL LINES
data_residual = residual_traces(A,B)
data_player1_vlines = data_player1 + data_residual
fig = go.Figure(data = data_player1_vlines, layout=layout)

# MODIFY TITLE: ONE "d^2" TERM PER POINT, WHERE d IS THAT POINT'S VERTICAL
# DIFFERENCE (ROUNDED TO 3 DECIMALS), SO THIS WORKS FOR ANY NUMBER OF POINTS
differences = np.round(A[:,1]-B[:,1],3)
squared_terms = '+'.join(f'{d}^2' for d in differences)
fig.update_layout(title=f'SumOfSquaredLengths = {squared_terms} = {np.sum(differences**2)}')

# DISPLAY (ONCE - A SECOND fig.show() WOULD RENDER THE SAME FIGURE AGAIN)
fig.show()

# Step 4: Player 2 draws a new line

The line is given by equation
$$
y = mx + b
$$
where $m$ is slope and $b$ is the y-intercept.

Let's eyeball these values.

In [38]:
# PLAYER 2'S EYEBALLED LINE: SLOPE 4/6 AND Y-INTERCEPT 0
slope, intercept = 4/6, 0

# A FUNCTION THAT RETURNS THE Y VALUE, GIVEN X

def guess_y(x):
  """
  Evaluate the line y = slope * x + intercept at x.

  slope and intercept are read from the notebook's global variables when the
  function is called, so the result follows whatever values they hold then.

  :param x: an x-coordinate, or a numpy array of x-coordinates
  :return: the y-value(s) of the point(s) on the line with x-coordinate(s) x
  """
  rise = slope * x
  return rise + intercept

And find the points on this line that lie directly above or below each point.  We can do this because y is a function of x, and we know the x-coordinate of each point.



In [39]:
# KEEP EVERY POINT'S X COORDINATE, AND LOOK UP THE HEIGHT OF THE GUESSED LINE THERE
x_coords = A[:,0]
y_coords = guess_y(x_coords)

# B = THE POINTS ON THE LINE: FIRST COLUMN X, SECOND COLUMN THE LINE'S HEIGHT
B = np.column_stack((x_coords, y_coords))
print(B)

[[ 5.          3.33333333]
 [ 4.          2.66666667]
 [-3.         -2.        ]]


Now combine the new and old points, and plot.

In [40]:
# PLOT

# GET THE PLOTTING DATA AND LAYOUT FOR THE ORIGINAL POINT CLOUD
data, layout = plot_cloud_with_mean(A, axis_limits = [-8,8], return_plotting_data=True, show_plot=False)

# CREATE A DASHED LINE FOR PLAYER 2'S GUESS (WE'LL DO THIS BY JOINING TWO POINTS)
# NOTE: THE VARIABLE KEEPS THE NAME trace_horizontal_line, BUT THIS LINE IS
# SLOPED, SO ITS LEGEND LABEL MUST NOT SAY 'average height'
line_endpoints_x = np.array([-8,8])
line_endpoints_y = guess_y(line_endpoints_x)
trace_horizontal_line     = go.Scatter(   
                            x=line_endpoints_x, 
                            y=line_endpoints_y,
                            mode='lines',
                            line=dict(color='black',dash='dash'),
                            name="player 2's line")
data_player2 = data + [trace_horizontal_line]

# COMBINE THE PLOT WITH THE VERTICAL LINES
vline_traces_player2 = residual_traces(A,B)
data_player2 = data_player2 + vline_traces_player2

# CREATE FIGURE
fig = go.Figure(data = data_player2, layout=layout)

# MODIFY TITLE: ONE "d^2" TERM PER POINT, WHERE d IS THAT POINT'S VERTICAL
# DIFFERENCE (ROUNDED TO 3 DECIMALS), SO THIS WORKS FOR ANY NUMBER OF POINTS
differences = np.round(A[:,1]-B[:,1],3)
squared_terms = '+'.join(f'{d}^2' for d in differences)
fig.update_layout(title=f'SumOfSquaredLengths = {squared_terms} = {np.sum(differences**2)}')

# DISPLAY
fig.show()

This is better - but still not best!

# Step 5: Linear regression parachutes into the game, and wins!

In [41]:
# FORMAT THE DATA
x_column = A[:,[0]] # why the extra brackets?  because the package we use to run linear regression wants *column vectors*
y_column = A[:,[1]] # why the extra brackets?  because the package we use to run linear regression wants *column vectors*

# FIT THE REGRESSOR
reg       =   sklearn.linear_model.LinearRegression()
reg.fit(x_column, y_column)

# EXTRACT SLOPE AND INTERCEPT
# NOTE: THESE TWO ASSIGNMENTS OVERWRITE THE GLOBAL slope AND intercept THAT
# guess_y READS, SO guess_y DESCRIBES THE REGRESSION LINE FROM HERE ON
slope     =   reg.coef_[0][0]
intercept =   reg.intercept_[0]

# GET THE PROJECTED POINT CLOUD: SAME X COORDINATES, PREDICTED Y COORDINATES
B         =   copy.deepcopy(A)
B[:,[1]]  =   reg.predict(x_column)

# MAKE A COPY FOR FUTURE USE
B_linreg  =   copy.deepcopy(B)

# GET THE PLOTTING DATA AND LAYOUT FOR THE ORIGINAL POINT CLOUD
data, layout = plot_cloud_with_mean(A, axis_limits = [-8,8], return_plotting_data=True, show_plot=False)

# CREATE A DASHED LINE FOR THE REGRESSION LINE (WE'LL DO THIS BY JOINING TWO POINTS)
# THE ENDPOINTS ARE COMPUTED EXPLICITLY FROM THE FITTED SLOPE AND INTERCEPT, SO
# THIS PLOT DOES NOT RELY ON guess_y PICKING UP THE OVERWRITTEN GLOBALS
line_endpoints_x = np.array([-8,8])
line_endpoints_y = slope * line_endpoints_x + intercept
trace_horizontal_line     = go.Scatter(   
                            x=line_endpoints_x, 
                            y=line_endpoints_y,
                            mode='lines',
                            line=dict(color='black',dash='dash'),
                            name='linear regression line')
data_player2 = data + [trace_horizontal_line]

# COMBINE THE PLOT WITH THE VERTICAL LINES
vline_traces_player2 = residual_traces(A,B)
data_player2 = data_player2 + vline_traces_player2

# CREATE FIGURE
fig = go.Figure(data = data_player2, layout=layout)

# MODIFY TITLE: ONE "d^2" TERM PER POINT, WHERE d IS THAT POINT'S VERTICAL
# DIFFERENCE (ROUNDED TO 3 DECIMALS), SO THIS WORKS FOR ANY NUMBER OF POINTS
differences = np.round(A[:,1]-B[:,1],3)
squared_terms = '+'.join(f'{d}^2' for d in differences)
fig.update_layout(title=f'SumOfSquaredLengths = {squared_terms} = {np.sum(differences**2)}')

# DISPLAY
fig.show()

### Notice: the best fit line passes through the mean

As we can see in this example, the best fit line passes through the mean of the point cloud.  Interestingly, it *always* passes through the mean (a fact which takes some real work to prove).

# **Linear regression always wins**

### (At this game, at least)

Said another way, linear regression is guaranteed to reduce the sum of squares the most.  In fact, that's the **definition** of the regression line.

# **Definition** (these are all the same)
# - proportion of variance explained
# - coefficient of determination
# - $R^2$

The **proportion of variance explained** by Player 2, also called $R^2$, is the percent reduction in sum of squares.  In symbols, 
$$
R^2 =
\frac{\text{SumOfSquares(Player1)} -\text{SumOfSquares(Player2)}}
{\text{SumOfSquares(Player1)}}
$$

# Caveat lector: different people mean different things when they use this word.

If we calculate $R^2$ using the formula above, we'll get the same result as the automatic sklearn function.

In [47]:
# VERTICAL GAPS BETWEEN THE TRUE HEIGHTS AND THE HEIGHTS EACH PLAYER PROPOSED
differences_baseline = A[:,1] - B_baseline[:,1]
differences_linreg = A[:,1] - B_linreg[:,1]

# SUM OF SQUARES FOR PLAYER 1 (THE MEAN LINE) AND FOR LINEAR REGRESSION
ss_baseline = np.sum(np.square(differences_baseline))
ss_linreg = np.sum(np.square(differences_linreg))

# R^2 BY HAND: THE FRACTION BY WHICH THE SUM OF SQUARES WAS REDUCED
R2_byhand = (ss_baseline-ss_linreg) / ss_baseline

# R^2 FROM SKLEARN: TRUE HEIGHTS FIRST, PREDICTED HEIGHTS SECOND
R2_auto = sklearn.metrics.r2_score(A[:,[1]], B_linreg[:,[1]])

print(R2_byhand, R2_auto, sep='\n')

0.8421052631578948
0.8421052631578948


However, there are different definitions out there (just check the wikipedia page), and even when people agree in broad strokes what the definition should be, they can differ on important details.  See, for example, the difference between `'uniform_average'` and `'variance_weighted'` in the documentation for `sklearn.metrics.r2_score`.  **The definition we use above corresponds to** `'variance_weighted'` (with a single output column, as in our example, the two options give the same value).