In [156]:
import numpy as np
import scipy.stats

Implement basic probability concepts for random variables of various distributions (mean, variance, L_p norm, covariance)

I. Normal Distribution

In [194]:
#Draw 50 samples from a random variable of normal distribution with mean = 0 and standard deviation = 1:
sample_normal = np.random.normal(0,1,50)
print(sample_normal)

[ 0.71766628  1.10740402  1.51835073 -0.02241346 -0.23634159  0.9946374
 -0.22834492  1.45660728  0.08772058  1.32614504 -2.2440173  -0.519985
  0.47802962  0.53307528 -0.42249049 -1.19492438 -1.68665846  0.42986169
 -0.20341079  0.17946266  0.38775923  0.93423199 -1.69897645 -0.44289532
  0.17525638 -1.25521169  1.00693698  0.75029857 -0.37313003  0.72609735
  0.18568003  1.06499742  2.16285669  1.6917077   1.24543738  0.31175796
 -0.50983365 -0.9941878  -3.60355749  1.48631732 -0.00961469  0.57609367
 -0.68282047 -0.28364228  2.81091833 -0.9435521  -0.18071856 -2.06716149
  0.01092944 -0.78472049]


1) Implement basic probability concepts for samples drawn from the standard normal distribution using numpy functions:

In [203]:
#Compute sample mean of sample_normal using numpy:
np.mean(sample_normal)

0.075352562859296146

Questions: 

1)Why is it reasonable to use sample mean for the estimation of the mean of the random variable? Hint: Law of large numbers.

2)Intuitively, the larger number of samples, the more accurate the sample mean is as an estimation for the mean of the given random variable. Can you quantify the "accuracy"?
For example, assume that the given random variable has variance bounded by 2, how many samples are needed for the estimation error to be within 0.01 with probability 0.9? Hint: Chebyshev's inequality. (**)

In [195]:
#Compute sample variance of sample_normal using numpy:
np.var(sample_normal, ddof = 1)

1.400494565442884

Question: What does "ddof" value represent for?

2) Implement basic probability concepts for samples drawn from the standard normal distribution using self-defined functions:

In [196]:
#Compute sample mean of a sample:
def mean(ar):
    return np.sum(ar)/len(ar)

In [220]:
#Compute sample variance of a sample:
def variance(ar):
    mean = np.sum(ar)/len(ar)
    ar_demean = ar-mean
    ar_demean_squared = np.square(ar_demean)
    return np.sum(ar_demean_squared)/(len(ar)-1)

Question: To compute the sample variance, why is the denominator in the last step is (the number of the sample - 1), instead of (the number of the sample)? Hint: Bessel correction.

In [198]:
#Compute L^p norm of a sample:
def L_p(ar,p):
    ar_abs_power_p = np.power(np.abs(ar),p)
    return (np.sum(ar_abs_power_p)/len(ar))**(1/p)

In [199]:
#Compute L^infinity norm of a sample:
def L_infinity(ar):
    ar_abs = np.abs(ar)
    return np.max(ar_abs)

In [200]:
#Compute covariance of two input samples:
def cov(x,y):
    if len(x) == len(y):
        mean_x = np.mean(x)
        mean_y = np.mean(y)
        return np.sum((x - mean_x)*(y-mean_y))/(len(x)-1)
    else:
        print('The lengths of the two input vectors do not match.')

Question: Why is the denominator in the last step is (the number of the sample - 1), instead of (the number of the sample)?

In [202]:
#Use the self-defined function to compute sample mean:
print(mean(sample_normal))

0.0753525628593


In [221]:
#Use the self-defined function to compute sample variance:
print(variance(sample_normal))

1.40049456544


In [207]:
#Use the self-defined function to compute L^3 norm of the sample:
print(L_p(sample_normal-mean(sample_normal),3))

1.41991993661


In [206]:
#Use the self-defined function to compute L^infinity norm of the sample:
print(L_infinity(sample_normal))

3.60355748749


In [None]:
#Use the self-defined function to compute the covariance of two samples:
sample_1 = sample_normal[1:11]
sample_2 = sample_normal[11:21]
print(cov(sample_1,sample_2))

II. Bernoulli Distribution

In [210]:
#Draw 50 samples from a random variable of Bernoulli distribution taking values 1 and -1 with probablity 1/2:
sample_bernoulli = np.random.binomial(1,1/2,50)
print(sample_bernoulli)

[1 1 0 0 1 1 0 1 1 0 1 1 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1
 0 1 0 0 0 1 0 0 1 1 0 0 0]


In [217]:
#Compute sample mean of sample_bernoulli using numpy and self_defined function:
print(np.mean(sample_bernoulli))
print(mean(sample_bernoulli))

0.48
0.48


Question: 
Can you quantify the accuracy of the sample mean as estimations for the mean of the random variable in this case? Does the same estimate (**) hold? Is there any better estimate available in this case? Hint: Hoeffding’s inequality.  

In [222]:
#Compute sample variance of sample_bernoulli using numpy and self_defined function:
print(np.var(sample_bernoulli,ddof = 1))
print(variance(sample_bernoulli))

0.254693877551
0.254693877551


In [223]:
#Use the self-defined function to compute L^4 norm of the sample:
print(L_p(sample_normal-mean(sample_bernoulli),4))

1.77778303014


In [224]:
#Use the self-defined function to compute L^infinity norm of the sample:
print(L_infinity(sample_bernoulli))

1


In [226]:
#Use the self-defined function to compute the covariance of two samples:
sample_1 = sample_bernoulli[1:20]
sample_2 = sample_bernoulli[21:40]
print(cov(sample_1,sample_2))

-0.0672514619883
