In [None]:
import subprocess

import emcee
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process import kernels
from sklearn.preprocessing import StandardScaler


%matplotlib inline

## Exercise 1 - Latin Hypercube Design

In [None]:
def generate_lhs(npoints, ndim, seed):
    """
    Generate a maximin Latin-hypercube sample (LHS) with the given number of
    points, dimensions, and random seed.

    The design is computed by the R package ``lhs`` (``maximinLHS``), so an
    ``R`` executable with that package installed must be available on the
    PATH.

    Parameters
    ----------
    npoints : int
        Number of design points; must be >= 1.
    ndim : int
        Number of input dimensions; must be >= 1.
    seed : int
        Seed handed to R's ``set.seed``.

    Returns
    -------
    numpy.ndarray
        Float array parsed from the table R prints, one row per line of
        output (one design point per row).

    Raises
    ------
    ValueError
        If an argument is not integer-valued, or if ``npoints`` or ``ndim``
        is smaller than 1.
    subprocess.CalledProcessError
        If the R process exits with a non-zero status.

    """

    # The arguments are pasted into R source code below, so insist on plain
    # integers before anything is executed: this gives a clear Python error
    # for bad input and prevents arbitrary text from being run as R code.
    checked = {}
    for name, value in (('npoints', npoints), ('ndim', ndim), ('seed', seed)):
        try:
            as_int = int(value)
        except (TypeError, ValueError, OverflowError):
            raise ValueError(
                '{} must be an integer, got {!r}'.format(name, value)
            )
        if as_int != value:
            raise ValueError(
                '{} must be an integer, got {!r}'.format(name, value)
            )
        checked[name] = as_int

    for name in ('npoints', 'ndim'):
        if checked[name] < 1:
            raise ValueError(
                '{} must be at least 1, got {}'.format(name, checked[name])
            )

    proc = subprocess.run(
        ['R', '--slave'],
        input="""
        library('lhs')
        set.seed({seed})
        write.table(maximinLHS({npoints}, {ndim}), col.names=FALSE, row.names=FALSE)
        """.format(**checked).encode(),
        stdout=subprocess.PIPE,
        check=True
    )

    # R prints one design point per line with whitespace-separated
    # coordinates; convert that text table into a float array.
    lhs = np.array(
        [l.split() for l in proc.stdout.splitlines()],
        dtype=float
    )

    return lhs


#### Create and plot design matrix below
##### 20 points, 2 dimensions, set the seed to 80

In [None]:
# Exercise: call generate_lhs with the settings given in the heading above
# (20 points, 2 dimensions, seed 80) and store the result in `design`.
design = 

In [None]:
# Exercise: fill in x and y with the two input coordinates of the design
# so that each design point is drawn once.
plt.scatter(x = , y = )
plt.title('Latin Hypercube Design')
plt.xlabel('Input 1')
plt.ylabel('Input 2')
# Ending on a statement (not an expression) keeps the notebook from echoing
# the return value of the last matplotlib call.
pass

## Exercise 2 - Toy GP Example

#### Part a - Mean and Variance Estimate

In [None]:
def truth(x):
    """Toy 'computer model': a linear trend plus an oscillation, 3x + cos(5x)."""
    trend = 3 * x
    wiggle = np.cos(5 * x)
    return trend + wiggle


# Five equally spaced training inputs on [-1, 1] and the model output there.
design = np.linspace(-1, 1, 5)
model_data = truth(design)

In [None]:
# Show the model output at each of the training design points.
plt.scatter(design, model_data)
plt.gca().set(
    title='Computer Model Output at Design Points',
    xlabel='Design',
    ylabel='Model Output',
);

In [None]:
# Width of the design range [-1, 1]; used as the initial RBF length scale.
ptp = 2

# Constant amplitude (starting at 1) times a squared-exponential kernel whose
# length scale may vary between 0.1x and 10x its starting value.
kernel = kernels.ConstantKernel(1.) * kernels.RBF(
    length_scale=ptp,
    length_scale_bounds=np.outer(ptp, (.1, 10)),
)

In [None]:
# Fit the GP to the design runs. The 1-D design is reshaped into a single
# column because the regressor is given a 2-D input array. alpha is left at
# the library default rather than being forced to 0.
gp = (
    GPR(kernel=kernel, copy_X_train=False, n_restarts_optimizer=0)
    .fit(X=design.reshape(-1, 1), y=model_data)
)

In [None]:
# Exercise: build the inputs at which the GP will be evaluated (the
# "in-between" points). Later cells index X[:,0], so X has to be a 2-D
# array with the inputs in its first column.
X =

In [None]:
# Exercise: pass the prediction inputs as X. With return_cov=True the call
# hands back the predictive mean together with the predictive covariance.
mean, cov = gp.predict(return_cov=True,X=)

In [None]:
#Set up the figure by first plotting the output at the design points
plt.scatter(x=design,y=model_data,color = 'black',label = 'Design Output')
plt.title('Computer Model Output at Design Points')
plt.xlabel('Design')
plt.ylabel('Model Output')


#Add the mean, upper 95% quantile, and lower 95% quantile of the GP predictions at all the in-between points
plt.plot(x=X, y = ,color= 'blue',label = 'GP Mean')
top_var =
bot_var = 
plt.fill_between(X[:,0], bot_var, top_var, where=top_var >= bot_var, facecolor='lightgray', interpolate=True)

plt.plot(X,truth(X),color='black',label = 'Truth')
plt.legend(loc='best', fontsize=12)

pass

#### Part b - Random Draws

In [None]:
# Exercise: get the upper and lower 95% quantiles of the GP predictions at
# all the in-between points (same expressions as in Part a).
top_var = 
bot_var = 

# Backdrop: design points, the 95% band, and the true function.
plt.scatter(x=design,y=model_data,color = 'black',label = 'Design Output')
plt.title('Computer Model Output at Design Points')
plt.xlabel('Design')
plt.ylabel('Model Output')
plt.fill_between(X[:,0], bot_var, top_var, where=top_var >= bot_var, facecolor='lightgray', interpolate=True)
plt.plot(X,truth(X),color='black',label = 'Truth')

ndraws = 10
# One colour per draw, spread evenly across the rainbow colormap.
colors = cm.rainbow(np.linspace(0, 1, ndraws))

# Exercise: get [ndraws] random draws from the predictive distribution of the
# GP at all of the in-between points. The loop below reads column i as the
# i-th draw, so rand_draw needs one column per draw.
rand_draw = 
for i in range(ndraws):
    # Each draw is shown as its own dotted curve.
    plt.plot(X,rand_draw[:,i],color = colors[i],linestyle = ":")

plt.legend(loc='best', fontsize=14)

# Ending on a statement keeps the notebook from echoing the legend object.
pass


## Exercise 3 - Principal Component Analysis

In [None]:
# Load the data matrix for the PCA exercise from a text file.
# NOTE(review): presumably one row per observation and one column per
# variable - confirm against scores.txt.
scores = np.loadtxt('scores.txt')

In [None]:
# Standardise the columns of `scores`, then project onto whitened principal
# components computed with a full SVD. Both transformers are created with
# copy=False, which requests in-place operation, so `scores` itself may be
# modified by this cell.
scaler = StandardScaler(copy=False)
pca = PCA(svd_solver='full', whiten=True, copy=False)
Z = pca.fit_transform(X=scaler.fit_transform(X=scores))

Plot the cumulative fraction of variance explained. How many PCs would you recommend using?

In [None]:
# Exercise: compute the cumulative fraction of variance explained by the
# principal components, one entry per number of components kept.
F_r = 

In [None]:
# Assuming F_r[0] is the fraction explained by the first component alone,
# entry k-1 belongs to "k components", so the x-axis must start at 1 to
# agree with its label.
n_components = range(1, len(F_r) + 1)
plt.plot(n_components, F_r, '-o')
plt.title('Fraction of Variance Explained')
plt.xlabel('Number of Components')
plt.ylabel('F_r')
# Ending on a statement keeps the notebook from echoing the label object.
pass

Find the correlation between the first two principal components

In [None]:
# Exercise: compute the correlation between the first two principal
# components.
# NOTE(review): presumably these are the first two columns of Z - confirm.
corr = 
# Bare name on the last line so the notebook displays the value.
corr

Plot the second principal component against the first

In [None]:
# Exercise: pass the first principal component as x and the second as y.
plt.scatter()
plt.title('Principal Components')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
# Ending on a statement keeps the notebook from echoing the label object.
pass

## Exercise 4 - Bayes Rule

In [None]:
# NOTE(review): presumably left over from generating the exercise data.
# On a fresh kernel `data` does not exist yet (the next cell loads it from
# this very file), so only write the file when there is something to write
# instead of failing with a NameError.
if 'data' in globals():
    np.savetxt('unknown_mean.txt', data)

In [None]:
# Observation variance sigma_y^2, treated as known in this exercise.
known_var = 5

# Read the observations and take a first look at their distribution.
data = np.loadtxt(fname='unknown_mean.txt')
plt.hist(data);

We know the data are Normally distributed, with known variance $\sigma_y^2 = 5$ but unknown mean $\theta$. We place a Normal prior on $\theta$, with mean $\mu$ and variance $\tau$. Thus we have the following setup:

Likelihood:
\begin{align}
    y_i\mid\theta&\overset{iid}{\sim} N(\theta,\sigma_y^2)\quad\text{for}\ \ i\in\{1,\ldots,n\}
\end{align}
Prior:
\begin{align}
	\theta &\sim N(\mu,\tau)
\end{align}

Goal: Explore different priors for $\theta$ to see how the posterior of $\theta$ responds. Try combinations of the following values:

$\mu\in \{0,3,10\}$

$\tau\in \{1,10,0.1\}$

Report the mean and variance for the posterior of each pair, and plot histograms of 10,000 draws from the posterior for 2-3 pairs.

#### Method 1: MCMC via python package emcee

Fill in missing pieces to calc_lnprior and calc_lnlike. These functions calculate the log of the prior and likelihood, respectively. For example, for the prior, you need to calculate the log of the pdf at theta for given mean mu and variance tau.

Hint 1: The likelihood of all data points is the product over all $n$ individual likelihoods. What is the log likelihood of all data points?


Hint 2: If done correctly:

calc_lnprior(theta=2, mu = 0, tau = 1) = -2.919

calc_lnlike(y=data,theta=2,sigma_y_sqrd=5)=-157.55

In [None]:
#Calculate the log of the prior of theta
def calc_lnprior(theta,mu,tau):
    """
    Log of the Normal prior density of theta, with prior mean mu and prior
    variance tau.

    Exercise stub: the return expression is left blank on purpose, so until
    it is filled in the function returns None.
    """
    #Put log of proper normal pdf here
    return 

#Calculate the log of the likelihood of {y_1, y_2,...,y_n}
def calc_lnlike(theta, y,sigma_y_sqrd):
    """
    Log likelihood of all observations y for mean theta and known variance
    sigma_y_sqrd.

    Exercise stub: the return expression is left blank on purpose, so until
    it is filled in the function returns None.
    """
    #Put log of proper normal likelihood pdf in here
    return 

def lnposterior(theta,mu,tau,y,sigma_y_sqrd):
    """
    Unnormalised log posterior of theta: log prior plus log likelihood.

    Returns -inf, without evaluating the likelihood, whenever the log prior
    is not finite.
    """
    log_prior = calc_lnprior(theta, mu, tau)

    if np.isfinite(log_prior):
        return log_prior + calc_lnlike(theta=theta, y=y, sigma_y_sqrd=sigma_y_sqrd)

    return -np.inf

The code below sets up the sampler. We provide the number of walkers nwalkers (chosen by us), the number of parameters ndim (1 in our case, since we just have $\theta$), and the posterior function, which takes as its first argument the parameters we're inferring. "args" is a tuple of all the remaining arguments to the posterior function.

In [None]:
# Prior mean and variance used for this MCMC run. They have to exist before
# the sampler is built because they are baked into `args`; edit them here to
# explore the other (mu, tau) pairs listed above.
mu = 0
tau = 1

# One parameter (theta), explored by 200 walkers.
ndim, nwalkers = 1, 200
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnposterior, args=(mu, tau, data, known_var))

# Starting position of every walker, drawn uniformly from [0, 1).
p0 = np.random.rand(nwalkers, ndim)

$nsteps$ (called `nsamples` in the code below) is the number of steps each walker will run, while $nburnin$ is the number of "burn-in" or "warmup" steps for each walker. Our total number of samples will be $nwalkers\times (nsteps-nburnin)$ (recall we want 10,000 total samples). After running each walker for $nsteps$ and discarding the first $nburnin$, we reshape the output chain so each row is a draw from the posterior.

In [None]:
# Steps taken by each walker, and how many leading steps to drop as burn-in.
nsamples, nburnin = 100, 50

# Advance every walker from its starting position p0.
out_post = sampler.run_mcmc(p0, nsamples)

# The chain is indexed (walker, step, parameter): drop the burn-in steps and
# stack what is left so that each row is one posterior draw.
samples = sampler.chain[:, nburnin:].reshape(-1, ndim)

Use the samples to find posterior means and variances

In [None]:
# Histogram of the retained posterior draws.
plt.hist(samples);

#### Method 2: Direct computation of the posterior

We can find the analytic posterior in this case.

First, recall the likelihood and prior

\begin{align}
	y_i\mid\theta&\overset{iid}{\sim} N(\theta,\sigma_y^2)\\
	\theta &\sim N(\mu,\tau)
\end{align}

Then, we apply Bayes Rule and do a little algebra

\begin{align}
	p(\theta\mid\mathbf{y}) &\propto p(\mathbf{y}\mid\theta)p(\theta)\\
        		&\propto \left[\prod\limits_{i=1}^np(y_i\mid\theta)\right]p(\theta)\\
               	&\propto \left[\prod\limits_{i=1}^n(2\pi\sigma_y^2)^{-1/2}e^{-\frac{1}{2\sigma_y^2}(y_i - \theta)^2}\right](2\pi\tau)^{-1/2}e^{-\frac{1}{2\tau}(\theta-\mu)^2}\\
	&\vdots\\
	\theta\mid\mathbf{y}&\sim N\left(\left(\frac{\bar{y}}{\sigma_y^2}+\frac{\mu}{\tau}\right)\left(\frac{1}{\sigma_y^2} + \frac{1}{\tau}\right)^{-1},\left(\frac{1}{\sigma_y^2} + \frac{1}{\tau}\right)^{-1} \right)
\end{align}

Edit the code below to calculate the posterior mean and variance. Compare to above for the two pairs for which you plotted the posterior histogram.

In [None]:
# Exercise: choose the prior mean and prior variance to study.
mu = 
tau = 
# Exercise: posterior variance from the closed-form result above.
post_var = 
print(post_var)
# Exercise: posterior mean from the closed-form result above.
post_mean = 
print(post_mean)
# 10,000 direct draws from the Normal posterior; np.random.normal expects a
# standard deviation, hence the square root of the variance.
post_draws = np.random.normal(post_mean,np.sqrt(post_var),10000)