diff --git a/GPflowOpt/acquisition/acquisition.py b/GPflowOpt/acquisition/acquisition.py index 6353cc3..907ef8a 100644 --- a/GPflowOpt/acquisition/acquisition.py +++ b/GPflowOpt/acquisition/acquisition.py @@ -34,16 +34,19 @@ class Acquisition(Parameterized): In Bayesian Optimization this function is typically optimized over the optimization domain to determine the next point for evaluation. - An object of this class holds a list of GPflow models. For single objective optimization this is typically a - single model. Subclasses implement a build_acquisition function which computes the acquisition function (usually - from the predictive distribution) using TensorFlow. Each model is automatically optimized when an acquisition object - is constructed or when set_data is called. + An object of this class holds a list of GPflow models. Subclasses implement a build_acquisition function + which computes the acquisition function (usually from the predictive distribution) using TensorFlow. + Each model is automatically optimized when an acquisition object is constructed or when set_data is called. - Acquisition functions can be combined through addition or multiplication to construct joint criteria - (for instance for constrained optimization) + Acquisition functions can be combined through addition or multiplication to construct joint criteria + (for instance, for constrained optimization). """ def __init__(self, models=[], optimize_restarts=5): + """ + :param models: list of GPflow models representing our beliefs about the problem + :param optimize_restarts: number of optimization restarts to use when training the models + """ super(Acquisition, self).__init__() self._models = ParamList([DataScaler(m) for m in np.atleast_1d(models).tolist()]) self._default_params = list(map(lambda m: m.get_free_state(), self._models)) @@ -56,10 +59,11 @@ def _optimize_models(self): """ Optimizes the hyperparameters of all models that the acquisition function is based on. - It is called after initialization and set_data(), and before optimizing the acquisition function itself. + It is called automatically during initialization and each time set_data() is called. + When using the high-level :class:`~.BayesianOptimizer` class, calling set_data() is taken care of. For each model the hyperparameters of the model at the time it was passed to __init__() are used as initial - point and optimized. If optimize_restarts was configured to values larger than one additional randomization + point and optimized. If optimize_restarts is set to >1, additional randomization steps are performed. As a special case, if optimize_restarts is set to zero, the hyperparameters of the models are not optimized. @@ -82,14 +86,15 @@ def _optimize_models(self): best_idx = np.argmin([r.fun for r in runs]) model.set_state(runs[best_idx].x) - def build_acquisition(self): + def build_acquisition(self, Xcand): raise NotImplementedError def enable_scaling(self, domain): """ Enables and configures the :class:`.DataScaler` objects wrapping the GP models. + :param domain: :class:`.Domain` object, the input transform of the data scalers is configured as a transform - from domain to the unit cube with the same dimensionality. + from domain to the unit cube with the same dimensionality. """ n_inputs = self.data[0].shape[1] assert (domain.size == n_inputs) @@ -103,11 +108,11 @@ def set_data(self, X, Y): Update the training data of the contained models.
Automatically triggers a hyperparameter optimization step by calling _optimize_all() and an update of pre-computed quantities by calling setup(). - Consider Q to be the the sum of the output dimensions of the contained models, Y should have a minimum of + Let Q be the sum of the output dimensions of all contained models, Y should have a minimum of Q columns. Only the first Q columns of Y are used while returning the scalar Q :param X: input data N x D - :param Y: Responses N x M (M >= Q) + :param Y: output data N x R (R >= Q) :return: Q (sum of output dimensions of contained models) """ num_outputs_sum = 0 @@ -120,23 +125,30 @@ def set_data(self, X, Y): model.Y = Ypart self._optimize_models() + + # Only call setup for the high-level acquisition function if self.highest_parent == self: self.setup() return num_outputs_sum @property def models(self): + """ + The GPflow models representing our beliefs of the optimization problem. + + :return: list of GPflow models + """ return self._models @property def data(self): """ - Property for accessing the training data of the models. + The training data of the models. Corresponds to the input data X which is the same for every model, and column-wise concatenation of the Y data over all models - :return: X, Y tensors (if in tf_mode) or X, Y numpy arrays. + :return: tuple X, Y of tensors (if in tf_mode) or numpy arrays. """ if self._tf_mode: return self.models[0].X, tf.concat(list(map(lambda model: model.Y, self.models)), 1) @@ -153,7 +165,10 @@ def constraint_indices(self): def objective_indices(self): """ Method returning the indices of the model outputs which are objective functions. - By default all outputs are objectives + + By default all outputs are objectives. + + :return: indices to the objectives, size R """ return np.setdiff1d(np.arange(self.data[1].shape[1]), self.constraint_indices()) @@ -161,17 +176,18 @@ def feasible_data_index(self): """ Returns a boolean array indicating which data points are considered feasible (according to the acquisition function(s) ) and which not. - By default all data is considered feasible - :return: boolean ndarray, N + + By default all data is considered feasible. + + :return: logical indices to the feasible data points, size N """ return np.ones(self.data[0].shape[0], dtype=bool) def setup(self): """ - Method triggered after calling set_data(). - - Override for pre-calculation of quantities used later in - the evaluation of the acquisition function for candidate points + Pre-calculation of quantities used later in the evaluation of the acquisition function for candidate points. + + Automatically triggered by :meth:`~.Acquisition.set_data`. """ pass @@ -179,6 +195,9 @@ def setup(self): def evaluate_with_gradients(self, Xcand): """ AutoFlow method to compute the acquisition scores for candidates, also returns the gradients. + + :return: acquisition scores, size N x 1 + the gradients of the acquisition scores, size N x D """ acq = self.build_acquisition(Xcand) return acq, tf.gradients(acq, [Xcand], name="acquisition_gradient")[0] @@ -187,6 +206,8 @@ def evaluate_with_gradients(self, Xcand): def evaluate(self, Xcand): """ AutoFlow method to compute the acquisition scores for candidates, without returning the gradients.
+ + :return: acquisition scores, size N x 1 """ return self.build_acquisition(Xcand) @@ -198,7 +219,6 @@ def __add__(self, other): >>> a2 = GPflowOpt.acquisition.ProbabilityOfFeasibility(m2) >>> type(a1 + a2) - """ if isinstance(other, AcquisitionSum): return AcquisitionSum([self] + other.operands.sorted_params) @@ -212,7 +232,6 @@ def __mul__(self, other): >>> a2 = GPflowOpt.acquisition.ProbabilityOfFeasibility(m2) >>> type(a1 * a2) - """ if isinstance(other, AcquisitionProduct): return AcquisitionProduct([self] + other.operands.sorted_params) @@ -221,12 +240,11 @@ def __mul__(self, other): class AcquisitionAggregation(Acquisition): """ - Special acquisition implementation for aggregating multiple others, using a TensorFlow reduce operation. + Aggregates multiple acquisition functions, using a TensorFlow reduce operation. """ def __init__(self, operands, oper): """ - Constructor :param operands: list of acquisition objects :param oper: a tf.reduce operation (e.g., tf.reduce_sum) for aggregating the returned scores of each operand. """ @@ -310,11 +328,12 @@ def __mul__(self, other): class MCMCAcquistion(AcquisitionSum): """ - Acquisition object to apply MCMC over the hyperparameters of the models. The models of the acquisition object passed - into an object of this class is optimized with MLE, and then sampled with HMC. These hyperparameter samples are then - set in copies of the acquisition. + Apply MCMC over the hyperparameters of an acquisition function (= over the hyperparameters of the contained models). + + The models passed into an object of this class are optimized with MLE, and then further sampled with HMC. + These hyperparameter samples are then set in copies of the acquisition. - To compute the acquisition, the predictions of the acquisition copies are averaged. + For evaluating the underlying acquisition function, the predictions of the acquisition copies are averaged. """ def __init__(self, acquisition, n_slices, **kwargs): assert isinstance(acquisition, Acquisition) diff --git a/GPflowOpt/acquisition/ei.py b/GPflowOpt/acquisition/ei.py index eb21c62..90507cb 100644 --- a/GPflowOpt/acquisition/ei.py +++ b/GPflowOpt/acquisition/ei.py @@ -45,7 +45,7 @@ class ExpectedImprovement(Acquisition): } This acquisition function is the expectation of the improvement over the current best observation - w.r.t. the predictive distribution. The definition is closely related to the Probability of Improvement, + w.r.t. the predictive distribution. The definition is closely related to the :class:`.ProbabilityOfImprovement`, but adds a multiplication with the improvement w.r.t the current best observation to the integral. .. 
math:: @@ -53,6 +53,9 @@ class ExpectedImprovement(Acquisition): """ def __init__(self, model): + """ + :param model: GPflow model (single output) representing our belief of the objective + """ super(ExpectedImprovement, self).__init__(model) assert (isinstance(model, Model)) self.fmin = DataHolder(np.zeros(1)) @@ -74,4 +77,4 @@ def build_acquisition(self, Xcand): normal = tf.contrib.distributions.Normal(candidate_mean, tf.sqrt(candidate_var)) t1 = (self.fmin - candidate_mean) * normal.cdf(self.fmin) t2 = candidate_var * normal.prob(self.fmin) - return tf.add(t1, t2, name=self.__class__.__name__) \ No newline at end of file + return tf.add(t1, t2, name=self.__class__.__name__) diff --git a/GPflowOpt/acquisition/lcb.py b/GPflowOpt/acquisition/lcb.py index bb50fd5..faa20de 100644 --- a/GPflowOpt/acquisition/lcb.py +++ b/GPflowOpt/acquisition/lcb.py @@ -27,6 +27,10 @@ class LowerConfidenceBound(Acquisition): """ def __init__(self, model, sigma=2.0): + """ + :param model: GPflow model (single output) representing our belief of the objective + :param sigma: See formula, the higher the more exploration + """ super(LowerConfidenceBound, self).__init__(model) self.sigma = sigma diff --git a/GPflowOpt/acquisition/pof.py b/GPflowOpt/acquisition/pof.py index 049646f..86d9650 100644 --- a/GPflowOpt/acquisition/pof.py +++ b/GPflowOpt/acquisition/pof.py @@ -29,21 +29,18 @@ class ProbabilityOfFeasibility(Acquisition): Bayesian Optimization with black-box expensive constraints. Key reference: - + :: - - @article{parr2012infill, - title={Infill sampling criteria for surrogate-based optimization with constraint handling}, - author={Parr, JM and Keane, AJ and Forrester, Alexander IJ and Holden, CME}, - journal={Engineering Optimization}, - volume={44}, - number={10}, - pages={1147--1166}, - year={2012}, - publisher={Taylor & Francis} - } - - The acquisition function measures the probability of the latent function being smaller than 0 for a candidate point. + + @article{Schonlau:1997, + title={Computer experiments and global optimization}, + author={Schonlau, Matthias}, + year={1997}, + publisher={University of Waterloo} + } + + The acquisition function measures the probability of the latent function + being smaller than a threshold for a candidate point. .. math:: \\alpha(\\mathbf x_{\\star}) = \\int_{-\\infty}^{0} \\, p(f_{\\star}\\,|\\, \\mathbf x, \\mathbf y, \\mathbf x_{\\star} ) \\, d f_{\\star} @@ -51,11 +48,10 @@ class ProbabilityOfFeasibility(Acquisition): def __init__(self, model, threshold=0.0, minimum_pof=0.5): """ - - :param model: GPflow model (single output) for computing the PoF - :param threshold: threshold value. Observed values lower than this value are considered valid - :param minimum_pof: minimum pof score required for a point to be valid. For more information, see docstring - of feasible_data_index + :param model: GPflow model (single output) representing our belief of the constraint + :param threshold: Observed values lower than the threshold are considered valid + :param minimum_pof: minimum pof score required for a point to be valid. + For more information, see docstring of feasible_data_index """ super(ProbabilityOfFeasibility, self).__init__(model) self.threshold = threshold @@ -66,18 +62,19 @@ def constraint_indices(self): def feasible_data_index(self): """ - Returns a boolean array indicating which points are feasible (True) and which are not (False) + Returns a boolean array indicating which points are feasible (True) and which are not (False). 
+ Answering the question *which points are feasible?* is slightly troublesome in case noise is present. Directly relying on the noisy data and comparing it to self.threshold does not make much sense. - Instead, we rely on the model belief. More specifically, we evaluate the PoF (score between 0 and 1). + Instead, we rely on the model belief using the PoF (a probability between 0 and 1). As the implementation of the PoF corresponds to the cdf of the (normal) predictive distribution in a point evaluated at the threshold, requiring a minimum pof of 0.5 implies the mean of the predictive distribution is below the threshold, hence it is marked as feasible. A minimum pof of 0 marks all points valid. Setting it to 1 results in all invalid. - :return: boolean ndarray, size N + + :return: boolean ndarray (size N) """ - # In pred = self.evaluate(self.data[0]) return pred.ravel() > self.minimum_pof diff --git a/GPflowOpt/acquisition/poi.py b/GPflowOpt/acquisition/poi.py index 6eee12e..cef81ed 100644 --- a/GPflowOpt/acquisition/poi.py +++ b/GPflowOpt/acquisition/poi.py @@ -32,6 +32,9 @@ class ProbabilityOfImprovement(Acquisition): """ def __init__(self, model): + """ + :param model: GPflow model (single output) representing our belief of the objective + """ super(ProbabilityOfImprovement, self).__init__(model) self.fmin = DataHolder(np.zeros(1)) self.setup() diff --git a/GPflowOpt/bo.py b/GPflowOpt/bo.py index bf506d2..7ef539f 100644 --- a/GPflowOpt/bo.py +++ b/GPflowOpt/bo.py @@ -25,28 +25,31 @@ class BayesianOptimizer(Optimizer): """ - A Bayesian Optimizer. - - Like other optimizers, this optimizer is constructed for optimization over a domain. Additionally, it is configured - with a separate optimizer for the acquisition function. + A traditional Bayesian optimization framework implementation. + + Like other optimizers, this optimizer is constructed for optimization over a domain. + Additionally, it is configured with a separate optimizer for the acquisition function. """ def __init__(self, domain, acquisition, optimizer=None, initial=None, scaling=True, hyper_draws=None): """ - :param domain: Domain object defining the optimization space - :param acquisition: Acquisition object representing a utility function optimized over the domain - :param optimizer: (optional) Optimizer object used to optimize acquisition. If not specified, SciPyOptimizer - is used. This optimizer will run on the same domain as the BayesianOptimizer object. - :param initial: (optional) Design object used as initial set of candidates evaluated before the optimization - loop runs. Note that if the underlying model already some data from an initial design, it is augmented with the - evaluations obtained by evaluating the points as specified by the design - :param scaling: (boolean, default true) if set to true, the outputs are normalized, and the inputs are - scaled to a unit cube. This only affects model training: calls to acquisition.data, as well as - returned optima are unscaled (see :class:`.DataScaler` for more details.) - :param hyper_draws: (optional) Enable marginalization of model hyperparameters. By default, point estimates are - used. If this parameter set to n, n hyperparameter draws from the likelihood distribution are obtained using - Hamiltonian MC (see GPflow documentation for details) for each model. The acquisition score is computed for - each draw, and averaged. + :param Domain domain: The optimization space. + :param Acquisition acquisition: The acquisition function to optimize over the domain. 
+ :param Optimizer optimizer: (optional) optimization approach for the acquisition function. + If not specified, :class:`~.optim.SciPyOptimizer` is used. + This optimizer will run on the same domain as the :class:`.BayesianOptimizer` object. + :param Design initial: (optional) The initial design of candidates to evaluate + before the optimization loop runs. Note that if the underlying model already contains some data from + an initial design, it is augmented with the evaluations obtained by evaluating + the points as specified by the design. + :param bool scaling: (default: true) if set to true, the outputs are normalized, and the inputs are + scaled to a unit cube. This only affects model training: calls to acquisition.data, as well as + returned optima are unscaled (see :class:`~.DataScaler` for more details). + :param int hyper_draws: (optional) Enable marginalization of model hyperparameters. By default, point estimates are + used. If this parameter is set to n, n hyperparameter draws from the likelihood distribution + are obtained for each model using Hamiltonian MC + (see the `GPflow documentation `_ for details). + The acquisition score is computed for each draw, and averaged. """ assert isinstance(acquisition, Acquisition) assert hyper_draws is None or hyper_draws > 0 @@ -65,9 +68,10 @@ def __init__(self, domain, acquisition, optimizer=None, initial=None, scaling=Tr def _update_model_data(self, newX, newY): """ - Update the underlying models of the acquisition function with new data - :param newX: samples (# new samples x indim) - :param newY: values obtained by evaluating the objective and constraint functions (# new samples x # targets) + Update the underlying models of the acquisition function with new data. + + :param newX: samples, size N x D + :param newY: values obtained by evaluating the objective and constraint functions, size N x R """ assert self.acquisition.data[0].shape[1] == newX.shape[-1] assert self.acquisition.data[1].shape[1] == newY.shape[-1] @@ -78,13 +82,17 @@ def _update_model_data(self, newX, newY): def _evaluate_objectives(self, X, fxs): """ - Evaluates a list of n functions on X. Returns a ndarray, with the number of columns equal to sum(Q0,...Qn-1) + Evaluates a list of n functions on X. + + Returns a matrix, size N x sum(Q0,...Qn-1) with Qi the number of columns obtained by evaluating the i-th function. - :param X: input points, 2D ndarray, N x D - :param fxs: 1D ndarray of (expensive) functions - :return: tuple: (0) 2D ndarray (# new samples x sum(Q0,...Qn-1)). Evaluations - (1) 2D ndarray (# new samples x 0): Bayesian Optimizer is gradient-free, however calling - optimizer of the parent class expects a gradient. Will be discarded further on. + + :param X: input points, size N x D + :param fxs: functions, size n + :return: tuple: + (0) the evaluations Y, size N x sum(Q0,...Qn-1). + (1) Not used, size N x 0. The Bayesian optimizer is gradient-free, however the optimizer of the parent class + expects a gradient. It will be discarded further on. """ if X.size > 0: evaluations = np.hstack(map(lambda f: f(X), fxs)) @@ -97,6 +105,7 @@ def _create_bo_result(self, success, message): """ Analyzes all data evaluated during the optimization, and return an OptimizeResult. Outputs of constraints are used to remove all infeasible points. + :param success: Optimization successful?
(True/False) :param message: return message :return: OptimizeResult object @@ -125,15 +134,18 @@ def _create_bo_result(self, success, message): def optimize(self, objectivefx, n_iter=20): """ - Run Bayesian optimization for a number of iterations. Before the loop is initiated, first all points retrieved - by get_initial() are evaluated on the objective and black-box constraints. These points are then added to the - acquisition function by calling Acquisition.set_data() (and hence, the underlying models). + Run Bayesian optimization for a number of iterations. + + Before the loop is initiated, first all points retrieved by :meth:`~.optim.Optimizer.get_initial` are evaluated + on the objective and black-box constraints. These points are then added to the acquisition function + by calling :meth:`~.acquisition.Acquisition.set_data` (and hence, the underlying models). Each iteration a new data point is selected for evaluation by optimizing an acquisition function. This point updates the models. + :param objectivefx: (list of) expensive black-box objective and constraint functions. For evaluation, the - responses of all the expensive functions are aggregated column wise. Unlike the typical optimizer interface, - these functions should not return gradients. + responses of all the expensive functions are aggregated column wise. + Unlike the typical :class:`~.optim.Optimizer` interface, these functions should not return gradients. :param n_iter: number of iterations to run :return: OptimizeResult object """ @@ -143,8 +155,9 @@ def optimize(self, objectivefx, n_iter=20): def _optimize(self, fx, n_iter): """ Internal optimization function. Receives an ObjectiveWrapper as input. As exclude_gradient is set to true, - the placeholder created by _evaluate_objectives will not be returned. - :param fx: ObjectiveWrapper object wrapping expensive black-box objective and constraint functions + the placeholder created by :meth:`_evaluate_objectives` will not be returned. + + :param fx: :class:`.objective.ObjectiveWrapper` object wrapping expensive black-box objective and constraint functions :param n_iter: number of iterations to run :return: OptimizeResult object """ @@ -172,15 +185,16 @@ def inverse_acquisition(x): @contextmanager def failsafe(self): """ - Context to provide a safe way for optimization. If a RuntimeError is generated, the data of the acquisition - object is saved to the disc in the current directory. This allows the data to be re-used (which makes sense - for expensive data). + Context to provide a safe way for optimization. + + If a RuntimeError is generated, the data of the acquisition object is saved to disk + in the current directory. This allows the data to be re-used (which makes sense for expensive data). - The data can also be used to try to fit a GPflow model first (set sensible initial + The data can be used to experiment with fitting a GPflow model first (analyse the data, set sensible initial hyperparameter values and hyperpriors) before retrying Bayesian Optimization again. """ try: yield except Exception as e: np.savez('failed_bopt_{0}'.format(id(e)), X=self.acquisition.data[0], Y=self.acquisition.data[1]) - raise \ No newline at end of file + raise diff --git a/GPflowOpt/design.py b/GPflowOpt/design.py index 23986f6..af7b8a0 100644 --- a/GPflowOpt/design.py +++ b/GPflowOpt/design.py @@ -39,7 +39,7 @@ def __init__(self, size, domain): def generative_domain(self): """ :return: Domain object representing the domain associated with the points generated in create_design().
- Defaults to [0,1]^D, can be overwritten by subclasses + Defaults to [0,1]^D, can be overwritten by subclasses """ return np.sum([ContinuousParameter('d{0}'.format(i), 0, 1) for i in np.arange(self.domain.size)]) @@ -48,7 +48,8 @@ def generate(self): Creates the design in the domain specified during construction. It is guaranteed that all data points satisfy this domain - :return: 2D ndarray, N x D + + :return: data matrix, size N x D """ Xs = self.create_design() assert (Xs in self.generative_domain) @@ -64,7 +65,8 @@ def create_design(self): Returns a design generated in the `generative` domain. This method should be implemented in the subclasses. - :return: 2D ndarray, N x D + + :return: data matrix, N x D """ raise NotImplementedError @@ -87,8 +89,8 @@ class FactorialDesign(Design): """ A k-level grid-based design. - Design with the optimal minimal distance between points, however it risks collapsing points when - removing parameters. Also its size is not arbitrary but a power of the domain dimensionality. + Design with the optimal minimal distance between points (a simple grid), however it risks collapsing points when + removing parameters. Its size is a power of the domain dimensionality. """ def __init__(self, levels, domain): @@ -131,6 +133,7 @@ class LatinHyperCube(Design): Somewhere beyond 15D this algorithm tends to slow down a lot and become very memory demanding. Key reference is :: + @article{Viana:2010, title={An algorithm for fast optimal Latin hypercube design of experiments}, author={Viana, Felipe AC and Venter, Gerhard and Balabanov, Vladimir}, @@ -150,8 +153,8 @@ def __init__(self, size, domain, max_seed_size=None): :param size: requested size N for the LHD :param domain: domain to generate the LHD for, must be continuous :param max_seed_size: the maximum size 1 <= S <= D for the seed, . If unspecified, equals the dimensionality D - of the domain. During generation, S different designs are generated. Seeds with sizes 1,2,...S are used. - Each seed itself is a small LHD. + of the domain. During generation, S different designs are generated. Seeds with sizes 1,2,...S are used. + Each seed itself is a small LHD. """ super(LatinHyperCube, self).__init__(size, domain) self._max_seed_size = np.round(max_seed_size or domain.size) @@ -166,8 +169,12 @@ def generative_domain(self): def create_design(self): """ - Generate several LHDs with increasing seed. Maximum S = min(dimensionality,max_seed_size) - :return: From S candidate designs, the one with the best intersite distance is returned. 2D ndarray, N x D. + Generate several LHDs with increasing seed. + + Maximum S = min(dimensionality,max_seed_size). + From S candidate designs, the one with the best intersite distance is returned + + :return: data matrix, size N x D. """ candidates = [] scores = [] @@ -190,10 +197,12 @@ def create_design(self): def _tplhd_design(self, seed): """ - Creates an LHD with the Translational propagation algorithm with specified seed and design size specified during - construction (N). - :param seed: 2D ndarray, the seed to use. S x D - :return: LHD, 2D ndarray. N x D + Creates an LHD with the Translational propagation algorithm. + + Uses the specified seed and design size N specified during construction. 
+ + :param seed: seed design, size S x D + :return: data matrix, size N x D """ ns, nv = seed.shape @@ -217,10 +226,11 @@ def _tplhd_design(self, seed): def _rescale_seed(seed, npStar, ndStar): """ Rescales the seed design - :param seed: 2D ndarray, S x D + + :param seed: seed design, size S x D :param npStar: size of the LHD to be generated. N* >= N :param ndStar: number of translation steps for the seed in each dimension - :return: rescaled seed, 2D ndarray, S x D + :return: rescaled seeds, size S x D """ ns, nv = seed.shape if ns == 1: @@ -237,10 +247,11 @@ def _rescale_seed(seed, npStar, ndStar): def _translate_propagate(seed, npStar, ndStar): """ Translates and propagates the seed design to a LHD of size npStar (which might exceed the requested size N) - :param seed: seed design, 2D ndarray S x D + + :param seed: seed design, size S x D :param npStar: size of the LHD to be generated (N*). :param ndStar: number of translation steps for the seed in each dimension - :return: LHD, 2D ndarray N* x D (still to be shrinked). + :return: LHD data matrix, size N* x D (still to be shrinked). """ nv = seed.shape[1] X = seed @@ -265,9 +276,10 @@ def _shrink(X, npoints): """ When designs are generated that are larger than the requested number of points (N* > N), resize them. If the size was correct all along, the LHD is returned unchanged. - :param X: Generated LHD, N* x D, with N* >= N + + :param X: Generated LHD, size N* x D, with N* >= N :param npoints: What size to resize to (N) - :return: LHD, 2D ndarray N x D + :return: LHD data matrix, size N x D """ npStar, nv = X.shape diff --git a/GPflowOpt/domain.py b/GPflowOpt/domain.py index c9e5825..f95275c 100644 --- a/GPflowOpt/domain.py +++ b/GPflowOpt/domain.py @@ -128,6 +128,7 @@ def _repr_html_(self): def _html_table_rows(self): return ''.join(map(lambda l: l._html_table_rows(), self._parameters)) + class Parameter(Domain): """ Abstract class representing a parameter (which corresponds to a one-dimensional domain) @@ -198,7 +199,9 @@ def _html_table_rows(self): class UnitCube(Domain): + """ + The unit domain [0, 1]^d + """ def __init__(self, n_inputs): params = [ContinuousParameter('u{0}'.format(i), 0, 1) for i in np.arange(n_inputs)] super(UnitCube, self).__init__(params) - diff --git a/GPflowOpt/objective.py b/GPflowOpt/objective.py index ce192ef..0884d84 100644 --- a/GPflowOpt/objective.py +++ b/GPflowOpt/objective.py @@ -18,14 +18,15 @@ def batch_apply(fun): """ - Decorator which applies a function along the first dimension of a given ndarray as argument (the batch dimension) - the most common use case is to convert a function designed to operate on a single input vector, and + Decorator which applies a function along the first dimension of a given data matrix (the batch dimension). + + The most common use case is to convert a function designed to operate on a single input vector, and to compute its response (and possibly gradient) for each row of a matrix. - :param fun: function accepting an input vector of dimensionality d and returns a vector of dimensionality p (the - output dimensionality) and (optionally) a gradient of size d x p (or d if p == 1) - :return: a function wrapper which calls fun on each row of a given n x d matrix. Here n represents the batch - dimension. 
the wrapper returns n x p and optionally a n x d x p matrix (or n x d if p == 1) + :param fun: function accepting an input vector of size D and returns a vector of size R (number of + outputs) and (optionally) a gradient of size D x R (or size D if R == 1) + :return: a function wrapper which calls fun on each row of a given N* x D matrix. Here N* represents the batch + dimension. The wrapper returns N* x R and optionally a matrix of size N* x D x R (or size N* x D if R == 1) """ @wraps(fun) def batch_wrapper(X): @@ -35,9 +36,9 @@ def batch_wrapper(X): if len(sep) == 1: return f - # for each point, the gradient is either (d,) or (d, p) shaped. - g_stacked = np.stack((r for r in sep[1]), axis=0) # n x d or n x d x p - # Get rid of last dim = 1 in case p = 1 + # for each point, the gradient is either (D,) or (D, R) shaped. + g_stacked = np.stack((r for r in sep[1]), axis=0) # N x D or N x D x R + # Get rid of last dim = 1 in case R = 1 g = np.squeeze(g_stacked, axis=2) if len(g_stacked.shape) == 3 and g_stacked.shape[2] == 1 else g_stacked return f, g @@ -46,12 +47,13 @@ def batch_wrapper(X): def to_args(fun): """ - Decorator for calling an objective function which has each feature as seperate input parameter. The 2d input ndarray - is split column wise and passed as arguments. Can be combined with batch apply. + Decorator for calling an objective function which has each feature as separate input parameter. + + The data matrix is split column wise and passed as arguments. Can be combined with batch apply. - :param fun: function accepting d n-dimensional vectors (each representing a feature and returns a a matrix of - dimensionality n x p and optionally a gradient of size n x d x p (or n x d if p == 1) - :return: a function wrapper which splits a given input ndarray into its columns to call fun. + :param fun: function accepting D N*-dimensional vectors (each representing a feature) and returns a matrix of + size N* x R and optionally a gradient of size N* x D x R (or size N* x D if R == 1) + :return: a function wrapper which splits a given data matrix into its columns to call fun. """ @wraps(fun) def args_wrapper(X): @@ -63,23 +65,25 @@ def args_wrapper(X): class to_kwargs(object): """ - Decorator for calling an objective function which has each feature as seperate keyword argument. - The 2d input ndarray is split column wise and passed as keyword arguments. Can be combined with batch apply. + Decorator for calling an objective function which has each feature as separate keyword argument. + + The data matrix is split column wise and passed as keyword arguments. Can be combined with batch apply. This decorator is particularly useful for fixing parameters of the optimization domain to fixed values. This can be achieved by assigning default values to the keyword arguments. By adding/removing a parameter from the optimization domain, the parameter is included or excluded. - :param domain: optimization domain, labels of the parameters are as keys to calling the objective function. + :param domain: optimization domain, + the labels of the parameters are used as keyword arguments when calling the objective function.
""" def __init__(self, domain): self.labels = [p.label for p in domain] def __call__(self, fun): """ - :param fun: function accepting d n-dimensional vectors as keyword arguments (each representing a feature, - and returns a a matrix of dimensionality n x p and optionally a gradient of size n x d x p (or n x d if p == 1) - :return: a function wrapper which splits a given input ndarray into its columns to call fun. + :param fun: function accepting D N*-dimensional vectors as keyword arguments (each representing a feature, + and returns a a matrix of size N* x R and optionally a gradient of size N* x D x R (or N* x D if R == 1) + :return: a function wrapper which splits a given data matrix into its columns to call fun. """ @wraps(fun) def kwargs_wrapper(X): @@ -90,6 +94,11 @@ def kwargs_wrapper(X): class ObjectiveWrapper(model.ObjectiveWrapper): + """ + A wrapper for objective functions. + + Filters out gradient information if necessary and keeps a count of the number of function evaluations. + """ def __init__(self, objective, exclude_gradient): super(ObjectiveWrapper, self).__init__(objective) self._no_gradient = exclude_gradient diff --git a/GPflowOpt/optim.py b/GPflowOpt/optim.py index f638428..110d8e1 100644 --- a/GPflowOpt/optim.py +++ b/GPflowOpt/optim.py @@ -39,10 +39,22 @@ def __init__(self, domain, exclude_gradient=False): @property def domain(self): + """ + The current domain the optimizer operates on. + + :return: :class:'~.domain.Domain` object + """ return self._domain @domain.setter def domain(self, dom): + """ + Sets a new domain for the optimizer. + + Resets the initial points to the middle of the domain. + + :param dom: new :class:'~.domain.Domain` + """ self._domain = dom self.set_initial(dom.value) @@ -56,7 +68,7 @@ def optimize(self, objectivefx, **kwargs): The actual optimization routine is implemented in _optimize, to be implemented in subclasses. :param objectivefx: callable, taking one argument: a 2D numpy array. The number of columns correspond to the - dimensionality of the input domain. + dimensionality of the input domain. :return: OptimizeResult reporting the results. """ objective = ObjectiveWrapper(objectivefx, **self._wrapper_args) @@ -73,13 +85,18 @@ def optimize(self, objectivefx, **kwargs): def get_initial(self): """ Return the initial set of points. + + :return: initial set of points, size N x D """ return self._initial def set_initial(self, initial): """ - Set the initial set of points. The dimensionality should match the domain dimensionality, and all points should - be within the domain + Set the initial set of points. + + The dimensionality should match the domain dimensionality, and all points should + be within the domain. + :param initial: initial points, should all be within the domain of the optimizer. """ initial = np.atleast_2d(initial) @@ -121,9 +138,9 @@ class CandidateOptimizer(Optimizer): def __init__(self, domain, candidates, batch=False): """ - :param domain: Optimization domain. + :param domain: Optimization :class:`.domain.Domain`. :param candidates: candidate points, should be within the optimization domain. - :param batch: bool, evaluate the objective function on all points at once or one by one? 
+ :param batch: bool, if true evaluates the objective function on all points at once """ super(CandidateOptimizer, self).__init__(domain, exclude_gradient=True) assert(candidates in domain) diff --git a/GPflowOpt/scaling.py b/GPflowOpt/scaling.py index 4bb661c..baa4f08 100644 --- a/GPflowOpt/scaling.py +++ b/GPflowOpt/scaling.py @@ -39,23 +39,23 @@ class DataScaler(GPModel): is set, the output transform is first calculated, then the data is scaled. - By default, :class:`.Acquisition` objects will always wrap each model received. However, the input and output transforms + By default, :class:`~.acquisition.Acquisition` objects will always wrap each model received. However, the input and output transforms will be the identity transforms, and output normalization is switched off. It is up to the user (or - specialized classes such as the BayesianOptimizer to correctly configure the datascalers involved. + specialized classes such as the BayesianOptimizer) to correctly configure the datascalers involved. By carrying out the scaling at such a deep level in the framework, it is possible to keep the scaling hidden throughout the rest of GPflowOpt. This means that, during implementation of acquisition functions it is safe to assume the data is not scaled, and is within the configured optimization domain. There is only one exception: the hyperparameters are determined on the scaled data, and are NOT automatically unscaled by this class because the datascaler does not know what model is wrapped and what kernels are used. Should hyperparameters of the model be - required, it is the responsability of the implementation to rescale the hyperparameters. Additionally, applying + required, it is the responsibility of the implementation to rescale the hyperparameters. Additionally, applying hyperpriors should anticipate for the scaled data. """ def __init__(self, model, domain=None, normalize_Y=False): """ :param model: model to be wrapped :param domain: (default: None) if supplied, the input transform is configured from the supplied domain to - :class:`.UnitCube`. If None, the input transform defaults to the identity transform. + :class:`.UnitCube`. If None, the input transform defaults to the identity transform. :param normalize_Y: (default: False) enable automatic scaling of output values to zero mean and unit variance. """ @@ -107,6 +107,7 @@ def __str__(self, prepend=''): def input_transform(self): """ Get the current input transform + :return: :class:`.DataTransform` input transform object """ return self._input_transform @@ -115,6 +116,7 @@ def input_transform(self): def input_transform(self, t): """ Configure a new input transform. Data in the model is automatically updated with the new transform. + :param t: :class:`.DataTransform` object: the new input transform. """ assert(isinstance(t, DataTransform)) @@ -126,6 +128,7 @@ def input_transform(self, t): def output_transform(self): """ Get the current output transform + :return: :class:`.DataTransform` output transform object """ return self._output_transform @@ -134,6 +137,7 @@ def output_transform(self): def output_transform(self, t): """ Configure a new output transform. Data in the model is automatically updated with the new transform. + :param t: :class:`.DataTransform` object: the new output transform. """ assert (isinstance(t, DataTransform)) @@ -154,6 +158,7 @@ def normalize_output(self, flag): Enable/disable automated output scaling. If switched off, the output transform becomes the identity transform. 
If enabled, data will be automatically scaled to zero mean and unit variance. When the output normalization is switched on or off, the data in the model is automatically adapted. + :param flag: boolean, turn output scaling on or off """ @@ -170,6 +175,7 @@ def normalize_output(self, flag): def X(self): """ Returns the input data of the model, unscaled. + :return: :class:`.DataHolder`: unscaled input data """ return DataHolder(self.input_transform.backward(self.wrapped.X.value)) @@ -178,6 +184,7 @@ def X(self): def Y(self): """ Returns the output data of the wrapped model, unscaled. + :return: :class:`.DataHolder`: unscaled output data """ return DataHolder(self.output_transform.backward(self.wrapped.Y.value)) diff --git a/GPflowOpt/transforms.py b/GPflowOpt/transforms.py index 42c88f5..4d5b82d 100644 --- a/GPflowOpt/transforms.py +++ b/GPflowOpt/transforms.py @@ -38,6 +38,7 @@ def forward(self, X): def build_forward(self, X): """ Tensorflow graph for the transformation of U -> V + :param X: N x P tensor :return: N x Q tensor """ @@ -45,9 +46,10 @@ def build_forward(self, X): def backward(self, Y): """ - Performs the transformation of V -> U. By default, calls the :func:`.forward` transform on the inverted + Performs the transformation of V -> U. By default, calls the :meth:`.forward` transform on the inverted transform object which requires implementation of __invert__. The method can be overwritten in subclasses if a - more efficient (direct) transformation is possible. + more efficient (direct) transformation is possible. + :param Y: N x Q matrix :return: N x P matrix """ @@ -75,9 +77,11 @@ class LinearTransform(DataTransform): def __init__(self, A, b): """ :param A: scaling matrix. Either a P-dimensional vector, or a P x P transformation matrix. For the latter, - the inverse and backward methods are not guaranteed to work as A must be invertible. It is also possible to - specify a matrix with size P x Q with Q != P to achieve a lower dimensional representation of X. In this case, - A is not invertible, hence inverse and backward are not supported. + the inverse and backward methods are not guaranteed to work as A must be invertible. + + It is also possible to specify a matrix with size P x Q with Q != P to achieve + a lower dimensional representation of X. + In this case, A is not invertible, hence inverse and backward transforms are not supported. :param b: A P-dimensional offset vector. """ super(LinearTransform, self).__init__() @@ -117,8 +121,9 @@ def build_backward_variance(self, Yvar): """ Additional method for scaling variance backward (used in :class:`.Normalizer`). Can process both the diagonal variances returned by predict_f, as well as full covariance matrices. - :param Yvar: N x N x P or N x P - :return: Yvar scaled, same rank and dimensionality as input + + :param Yvar: size N x N x P or size N x P + :return: Yvar scaled, same rank and size as input """ rank = tf.rank(Yvar) # Because TensorFlow evaluates both fn1 and fn2, the transpose can't be in the same line. If a full cov @@ -136,8 +141,9 @@ def build_backward_variance(self, Yvar): def assign(self, other): """ - Assign the parameters of another to this transform. Can be useful to avoid graph + Assign the parameters of another :class:`LinearTransform`. Can be useful to avoid graph re-compilation. 
+ :param other: :class:`.LinearTransform` object """ assert other is not None diff --git a/doc/source/apiAndArchitecture.rst b/doc/source/apiAndArchitecture.rst index 65e5f70..4eeca70 100644 --- a/doc/source/apiAndArchitecture.rst +++ b/doc/source/apiAndArchitecture.rst @@ -1,10 +1,14 @@ +.. _api: + API and architecture ================================== .. toctree:: :maxdepth: 1 - acquisition notebooks/structure + bayesianoptimizer + acquisition + designs + transforms interfaces - transforms \ No newline at end of file diff --git a/doc/source/bayesianoptimizer.rst b/doc/source/bayesianoptimizer.rst new file mode 100644 index 0000000..0467167 --- /dev/null +++ b/doc/source/bayesianoptimizer.rst @@ -0,0 +1,7 @@ +Bayesian Optimizer +================== + +.. automodule:: GPflowOpt.bo +.. autoclass:: GPflowOpt.BayesianOptimizer + :members: + :special-members: diff --git a/doc/source/conf.py b/doc/source/conf.py index 4d24492..a066fdd 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -18,11 +18,6 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # import os -import sys -#sys.path.insert(0, os.path.abspath('..')) -#sys.path.insert(0, os.path.abspath('/home/javdrher/GPflow')) -#sys.path.insert(0, '/home/javdrher/PycharmProjects/GPflowOpt') - # on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org on_rtd = os.environ.get('READTHEDOCS', None) == 'True' @@ -76,7 +71,7 @@ # General information about the project. project = 'GPflowOpt' copyright = '2017, Joachim van der Herten' -author = 'Joachim van der Herten' +author = 'Joachim van der Herten, Ivo Couckuyt' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -122,7 +117,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = [] # -- Options for HTMLHelp output ------------------------------------------ diff --git a/doc/source/designs.rst b/doc/source/designs.rst new file mode 100644 index 0000000..0ea4871 --- /dev/null +++ b/doc/source/designs.rst @@ -0,0 +1,22 @@ +Initial Designs +=============== + +.. automodule:: GPflowOpt.design + +Latin Hypercube design +---------------------- +.. autoclass:: GPflowOpt.design.LatinHyperCube + :members: + :special-members: + +Factorial design +---------------- +.. autoclass:: GPflowOpt.design.FactorialDesign + :members: + :special-members: + +Random design +------------- +.. autoclass:: GPflowOpt.design.RandomDesign + :members: + :special-members: diff --git a/doc/source/interfaces.rst b/doc/source/interfaces.rst index 83da7c5..6756e40 100644 --- a/doc/source/interfaces.rst +++ b/doc/source/interfaces.rst @@ -1,5 +1,5 @@ -GPflowOpt Interfaces -======================== +Interfaces +========== Domain ------- diff --git a/doc/source/intro.rst b/doc/source/intro.rst index 55344b9..0eb6fc7 100644 --- a/doc/source/intro.rst +++ b/doc/source/intro.rst @@ -28,6 +28,14 @@ You can run the tests with ``python setup.py test``. To build the documentation, first install extra dependencies with ``pip install .[docs]``, then proceed with ``python setup.py build_sphinx``. 
+Getting started +--------------- + +A simple example of Bayesian optimization to get up and running is provided by the +:ref:`first steps into Bayesian optimization ` notebook + +For more advanced use cases have a look at the other :ref:`tutorial ` notebooks and the :ref:`api`. + Acknowledgements ----------------- Joachim van der Herten and Ivo Couckuyt are Ghent University - imec postdoctoral fellows. Ivo Couckuyt is supported diff --git a/doc/source/notebooks/constrained_bo.ipynb b/doc/source/notebooks/constrained_bo.ipynb index 66e0c65..51f7b63 100644 --- a/doc/source/notebooks/constrained_bo.ipynb +++ b/doc/source/notebooks/constrained_bo.ipynb @@ -12,6 +12,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Introduction\n", + "\n", "This notebook demonstrates the optimization of an analytical function using the well known Expected Improvement (EI) function. The problem is constrained by a black-box constraint function. The feasible regions are learnt jointly with the optimal regions by considering a second acquisition function known as the Probability of Feasibility (PoF), following the approach of Gardner et al. (2014)" ] }, @@ -38,6 +40,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Constrained problem\n", + "\n", "First we set up an objective function (the townsend function) and a constraint function. We further assume both functions are black-box. We also define the optimization domain (2 continuous parameters)." ] }, @@ -95,6 +99,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Modeling and joint acquisition function\n", + "\n", "We proceed by assigning the objective and constraint function a GP prior. Both functions are evaluated on a space-filling set of points (here, a Latin Hypercube design). Two GPR models are created.\n", "The EI is based on the model of the objective function (townsend), whereas PoF is based on the model of the constraint function. We then define the joint criterioin as the product of the EI and PoF." ] @@ -130,6 +136,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Initial belief\n", + "\n", "We can now inspect our belief about the optimization problem by plotting the models, the EI, PoF and joint mappings. Both models clearly are not very accurate yet. More specifically, the constraint model does not correctly capture the feasibility yet." ] }, @@ -203,6 +211,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Running Bayesian Optimizer\n", + "\n", "Running the Bayesian optimization is the next step. For this, we must set up an appropriate strategy to optimize the joint acquisition function. Sometimes this can be a bit challenging as often large non-varying areas may occur. A typical strategy is to apply a Monte Carlo optimization step first, then optimize the point with the best value (several variations exist). This approach is followed here. We then run the Bayesian Optimization and allow it to select up to 50 additional decisions. \n", "\n", "The joint acquisition function assures the feasibility (w.r.t the constraint) is taken into account while selecting decisions for optimality." @@ -245,6 +255,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Results\n", + "\n", "If we now plot the belief, we clearly see the constraint model has improved significantly. More specifically, its PoF mapping is an accurate representation of the true constraint function. By multiplying the EI by the PoF, the search is restricted to the feasible regions." 
] }, @@ -376,7 +388,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.1" } }, "nbformat": 4, diff --git a/doc/source/notebooks/firststeps.ipynb b/doc/source/notebooks/firststeps.ipynb new file mode 100644 index 0000000..3a6226c --- /dev/null +++ b/doc/source/notebooks/firststeps.ipynb @@ -0,0 +1,147 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# First steps into Bayesian optimization\n", + "*Ivo Couckuyt*, *Joachim van der Herten*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "Bayesian optimization is particularly useful for expensive optimization problems. This includes optimization problems where the objective (and constraints) are time-consuming to evaluate: measurements, engineering simulations, cross-hyperparameter optimization of deep learning models, etc. Another area where Bayesian optimization may provide a benefit is in the presence of (a lot of) noise.\n", + "\n", + "If your problem does not satisfy these requirements other optimization algorithms might be better suited.\n", + "\n", + "To setup a basic Bayesian optimization you have to:\n", + "\n", + "- define your objective and specify the optimization domain\n", + "- setup a GPflow model and choose an acquisition function\n", + "- create a BayesianOptimizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Objective function" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Name | Type | Values
x1 | Continuous | [-2. 2.]
x2 | Continuous | [-1. 2.]
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "from GPflowOpt.domain import ContinuousParameter\n", + "\n", + "\n", + "def fx(X):\n", + " X = np.atleast_2d(X)\n", + " # Return objective & gradient\n", + " return np.sum(np.square(X), axis=1)[:, None]\n", + "\n", + "domain = ContinuousParameter('x1', -2, 2) + ContinuousParameter('x2', -1, 2)\n", + "domain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bayesian optimizer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: optimization restart 5/5 failed\n", + " fun: array([ 1.69219493e-05])\n", + " message: 'OK'\n", + " nfev: 15\n", + " success: True\n", + " x: array([[ -9.60370839e-05, 4.11250850e-03]])\n" + ] + } + ], + "source": [ + "import GPflow\n", + "from GPflowOpt.bo import BayesianOptimizer\n", + "from GPflowOpt.design import LatinHyperCube\n", + "from GPflowOpt.acquisition import ExpectedImprovement\n", + "from GPflowOpt.optim import SciPyOptimizer\n", + "\n", + "# Use standard Gaussian process Regression\n", + "lhd = LatinHyperCube(21, domain)\n", + "X = lhd.generate()\n", + "Y = fx(X)\n", + "model = GPflow.gpr.GPR(X, Y, GPflow.kernels.Matern52(2, ARD=True))\n", + "\n", + "# Now create the Bayesian Optimizer\n", + "alpha = ExpectedImprovement(model)\n", + "optimizer = BayesianOptimizer(domain, alpha)\n", + "\n", + "# Run the Bayesian optimization\n", + "with optimizer.silent():\n", + " r = optimizer.optimize(fx, n_iter=15)\n", + "print(r)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's all! Your objective function has now been optimized over 15 iterations." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/doc/source/notebooks/hyperopt.ipynb b/doc/source/notebooks/hyperopt.ipynb index ca9fb43..8d3e46c 100644 --- a/doc/source/notebooks/hyperopt.ipynb +++ b/doc/source/notebooks/hyperopt.ipynb @@ -156,7 +156,7 @@ "source": [ "In total, a lot of hyperparameters must be optimized. Furthermore, the optimization surface of the spectral mixture is highly multimodal. Starting from the default hyperparameter values the optimized GP is able to pick up the linear trend, and the RBF kernels perform local interpolation. However, the kernel is not able to extrapolate away from the data. In sum, with this starting point, the likelihood optimization ends in a local minimum.\n", "\n", - "## GPflowOpt\n", + "## Hyperparameter optimization\n", "\n", "This issue is a known problem of the spectram mixture kernel, and several recommendations exist on how to improve the starting point. Here, we will use GPflowOpt to optimize the initial values for the lengthscales of the RBF and the Cosine kernel (i.e., the frequencies of the latter kernel). 
The other hyperparameters (rbf and cosine variances, likelihood variances and the linear and bias terms) are kept at their defaults and will be optimized by the standard likelihood optimization.\n", "\n", @@ -239,7 +239,6 @@ "from GPflowOpt import optim, BayesianOptimizer\n", "design = LatinHyperCube(6, domain)\n", "X = design.generate()\n", - "#print('generated')\n", "\n", "Y = objectivefx(X)\n", "m = GPR(X, Y, kern=Matern52(domain.size, ARD=False))\n", diff --git a/doc/source/notebooks/new_acquisition.ipynb b/doc/source/notebooks/new_acquisition.ipynb index d2bbf79..68612cf 100644 --- a/doc/source/notebooks/new_acquisition.ipynb +++ b/doc/source/notebooks/new_acquisition.ipynb @@ -12,6 +12,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Introduction\n", + "\n", "GPflowOpt implements supports some acquisition functions for common scenarios, such as EI and PoF. However, it is straightforward to implement your own strategy. For most strategies, it is sufficient to implement the `Acquisition` interface. In case a more sophisticated model is needed, this can easily be achieved with GPflow." ] }, @@ -56,6 +58,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Augmented expected improvement\n", + "\n", "As an example on how to implement a custom acquisition function, we illustrate the Augmented EI (Huang et al. 2006), a modification for Expected Improvement for optimization of noisy functions. It is defined as\n", "$$\n", "\\alpha_{\\text{aEI}}(\\mathbf x_{\\star}) = \\alpha_{\\text{EI}}(\\mathbf x_{\\star}) \\left( 1 - \\frac{\\sigma_n}{\\sqrt{\\text{Var}\\left[ f_{\\star}\\,|\\, \\mathbf x, \\mathbf y, \\mathbf x_{\\star} \\right] + \\sigma_n^2}}\\right)\n", @@ -89,6 +93,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Results\n", + "\n", "This small experiment on the six hump camelback illustrates impact of the penalty term." ] }, @@ -166,7 +172,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.1" } }, "nbformat": 4, diff --git a/doc/source/notebooks/structure.ipynb b/doc/source/notebooks/structure.ipynb index 690bcb3..d88e589 100644 --- a/doc/source/notebooks/structure.ipynb +++ b/doc/source/notebooks/structure.ipynb @@ -293,7 +293,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 3.0 }, "file_extension": ".py", "mimetype": "text/x-python", @@ -304,5 +304,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/doc/source/transforms.rst b/doc/source/transforms.rst index eb98c90..929c4c0 100644 --- a/doc/source/transforms.rst +++ b/doc/source/transforms.rst @@ -1,5 +1,5 @@ -GPflowOpt Data Transformations -=============================== +Data Transformations +==================== Transforms ---------- diff --git a/doc/source/tutorialsAndExamples.rst b/doc/source/tutorialsAndExamples.rst index 69aca7b..6eb3d41 100644 --- a/doc/source/tutorialsAndExamples.rst +++ b/doc/source/tutorialsAndExamples.rst @@ -1,9 +1,12 @@ +.. _tutorials: + Tutorials and examples ================================== .. toctree:: :maxdepth: 2 + notebooks/firststeps notebooks/constrained_bo notebooks/new_acquisition notebooks/hyperopt