In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Select seaborn's 'paper' plotting context
sns.set_context('paper')

# Hands-On Activity 6.1: Collections of Random Variables

## Objectives

+ To practice with the joint probability mass function.

## Joint probability mass function of two discrete random variables

Take two discrete random variables $X$ and $Y$.
Say that $X$ takes $5$ values, $0,1,\dots,4$, and $Y$ takes $10$ values, $0,1,\dots,9$.
Then, you can think of the joint probability mass function of $X$ and $Y$ as the $5\times 10$ matrix:
$$
A_{ij} = p(X=i, Y=j).
$$
Let's make up such a matrix to play with:

In [None]:
# Fix the seed of the global random number generator so that everybody
# running this notebook obtains exactly the same numbers
np.random.seed(12345)
# Fill a 5 x 10 matrix (rows: values of X, columns: values of Y) with
# uniform random entries in [0, 1)
A = np.random.random_sample((5, 10))
print(A)

In [None]:
# Divide every entry by the grand total so that the entries add up to one
total_mass = A.sum()
A = A / total_mass
print('A = ', A)
print('Sum of A_ij = {0:1.2f}'.format(A.sum()))

Now we have a matrix that corresponds to a proper joint probability mass function.

Okay, this is great: we can sample from this. Let's now find the probability mass function of just $X$.
Remember that you need to marginalize:
$$
p(x) = \sum_{y} p(x,y) = \sum_{y} A_{xy}.
$$
This is easy:

In [None]:
# Marginalize out Y: adding along axis 1 (the columns) leaves one entry per value of X
p_x = A.sum(axis=1)
print('pmf of just X:', p_x)

In [None]:
# Sanity check: the entries of a pmf must add up to one
total_p_x = p_x.sum()
print('sum of p_x = {0:1.2f}'.format(total_p_x))

With this you can easily find the expectation of $X$:

In [None]:
# E[X] = sum_x x * p(x), with x running over the values 0, 1, ..., 4
x_values = np.arange(5)
E_X = (x_values * p_x).sum()
print('E[X] = {0:1.2f}'.format(E_X))

Similarly for the variance of $X$:

In [None]:
# Second moment of X: E[X^2] = sum_x x^2 * p(x)
x_values = np.arange(5)
E_X2 = (x_values ** 2 * p_x).sum()
# Variance from the two moments: V[X] = E[X^2] - (E[X])^2
V_X = E_X2 - E_X ** 2
print('V[X] = {0:1.2f}'.format(V_X))

Let's do the same for $Y$:

In [None]:
# Marginalize out X: adding along axis 0 (the rows) leaves one entry per value of Y
p_y = A.sum(axis=0)
print('pmf of just Y:', p_y)
# Y takes the values 0, 1, ..., 9
y_values = np.arange(10)
# First moment: E[Y] = sum_y y * p(y)
E_Y = (y_values * p_y).sum()
print('E[Y] = {0:1.2f}'.format(E_Y))
# Second moment and variance: V[Y] = E[Y^2] - (E[Y])^2
E_Y2 = (y_values ** 2 * p_y).sum()
V_Y = E_Y2 - E_Y ** 2
print('V[Y] = {0:1.2f}'.format(V_Y))

Alright, we have found all the individual statistics.
Let's now find the covariance of the two random variables.
Remember the formula:
$$
\mathbb{C}[X,Y] = \sum_{x,y} (x-\mathbb{E}[X])(y-\mathbb{E}[Y])p(x,y).
$$
Here we go:

In [None]:
# We will loop over all the possible (x, y) pairs and accumulate
# C[X, Y] = sum_{x, y} (x - E[X]) * (y - E[Y]) * p(x, y)
C_XY = 0.0 # Keeping track of the sum
for x in range(5):
    for y in range(10):
        # Each term must be weighted by the joint probability p(x, y) = A[x, y];
        # without the weight this would just be an unweighted sum of products.
        # The += means add to the left hand side
        C_XY += (x - E_X) * (y - E_Y) * A[x, y]
print('C[X, Y] = {0:1.2f}'.format(C_XY))

We see that $X$ and $Y$ are negatively correlated.
If only we could sample from them to visualize it...
How can we do this? We cannot just sample $X$ and then $Y$ without thinking about it.
We need to sample $X$ and $Y$ together.
Basically, we need to sample a pair of indices $(i,j)$ with probability $A_{ij}$.
This is like sampling from a categorical with $5\times 10 = 50$ different labels $c_0 = (0,0), c_1 = (0,1), \dots, c_{49} = (4, 9)$ each with a probability $A_{0,0}, A_{0,1}, \dots, A_{4,9}$.
So, let's define this categorical.
The categorical will sample a label $c_k$ and then we will turn this label to $i$ and $j$ by:
$$
i = \lfloor k / 10 \rfloor,
$$
where $\lfloor \cdot \rfloor$ stands for the integer part, and
$$
j = k \mod 10,
$$
where $\mod 10$ gives the remainder of the division by $10$.

In [None]:
import scipy.stats as st

# Categorical random variable over the flattened (i, j) labels: label k = 10 * i + j
# carries probability A[i, j], because A.flatten() lays the matrix out row by row.
# The name is passed by keyword: the first positional parameter of rv_discrete is the
# lower support bound `a`, so a positional string would never be used as the name.
XY = st.rv_discrete(name='Joint XY', values=(np.arange(A.size), A.flatten()))

# Let's now write a function that samples X and Y using a sample from XY
def sample_X_and_Y():
    """
    Draw a single joint sample of X and Y.

    One flat label is drawn from the categorical XY and then split into
    a pair: the quotient of the division by 10 (the value of X) and the
    remainder of that division (the value of Y).
    """
    label = XY.rvs()
    # divmod gives (integer division, remainder) in one call
    return divmod(label, 10)

In [None]:
# Let's try it out - take 10 samples
for _ in range(10):
    x, y = sample_X_and_Y()
    # Field 0 is x and field 1 is y; using {0:d} in both places would print x twice
    print('x = {0:d}, y = {1:d}'.format(x, y))

Alright, let's now collect all these samples we take so that we can visualize them:

In [None]:
num_samples = 20
# 1D integer arrays to be filled in with the samples.
# The builtin `int` is used as dtype because the `np.int` alias was removed from NumPy,
# and np.empty is the documented way to allocate an uninitialized array.
x_samples = np.empty(num_samples, dtype=int)
y_samples = np.empty(num_samples, dtype=int)
for n in range(num_samples):
    # Each call returns one joint (x, y) pair; store both halves at position n
    x_samples[n], y_samples[n] = sample_X_and_Y()

In [None]:
# Ok, now let's do a scatter plot of the joint samples
fig, ax = plt.subplots(dpi=150)
ax.scatter(x_samples, y_samples)
# Label the axes so that the figure can be read on its own
ax.set_xlabel('$x$')
ax.set_ylabel('$y$');  # the trailing ; hides the text repr of the last call

## Questions

+ Write code that finds the variance of $X+Y$.
+ Modify your code to find the variance of $3X + 5Y$.
+ Write code that finds the expectation of the function $f(X,Y) = XY^3$.