%% chapter 7, LCS03, convergence analysis
%\input{header}
%
%% \newcommand{\midarrow}[1]{\stackrel{#1}{\to}}
%
%% \newcommand{\Real}{\mathbb R}
%% \newtheorem{assumption}{Assumption}
%\begin{document}
%
%
%% First page number
%\setcounter{page}{1}
%% PREVIOUS section number
%\setcounter{section}{5}
%% PREVIOUS subsection number
%\setcounter{subsection}{0}
%
%% ***********************************************************************
%\textsc{Reinforcement Learning and Dynamic Programming \hfill Spring 2014$\;$ \\
%Lecture Notes \hfill Shie Mannor and Nahum Shimkin}
%\vspace{-18pt}
%\par
%\hrulefill
%% ***********************************************************************
%
%% {\large
%
%\section{Basic Convergence Results for RL Algorithms}
We establish here some asymptotic convergence results for the basic
RL algorithms, by showing that they reduce to Stochastic Approximation
schemes. We focus on the discounted-cost problem, which is the simplest case.
Analogous results exist for shortest-path problems under the properness assumption
(every policy reaches the terminal state in finite expected time).
We do not directly consider here the issue of {\em exploration},
which is essential for convergence to the optimal policy. Thus, where
required, we will simply {\em assume} that all actions are sampled
often enough.
\section{Q-learning}
Recall the $Q$-learning algorithm, in the following generalized form:
$$
Q_{n+1} (s,a) = Q_n(s,a) + \alpha_n(s,a)
[r (s,a,s'_{s,a}) + \gamma \max_{{a'}} Q_n (s'_{s,a}, {a'}) - Q_n(s,a)]
$$
where each next state $s'_{s,a}$ is drawn at random according to
$p(s'|s,a)$.
We allow here $\alpha_n(s,a) = 0$, so that any number of $(s,a)$ pairs can be
updated at each stage.
This iteration can be viewed as an (asynchronous)
Stochastic Approximation algorithm, with $Q\equiv \theta$.
This leads to the following result.
\begin{theorem}[Convergence of $Q$-learning.]
Let $\gamma < 1$, and let $Q^*$ be the optimal $\gamma$-discounted
$Q$ function. Assume
$$
\sum_{n=0}^\infty \alpha_n (s,a) = \infty, \quad \sum_{n=0}^\infty
\alpha_n(s,a)^2 < \infty \quad
\text{(w.p. 1)} \quad \forall s, a\,.
$$
Then
$$
\lim_{n\to\infty} Q_n(s,a) = Q^* (s,a) \quad \text{(w.p. 1)} \quad \forall s,a \,.
$$
\end{theorem}
\begin{proof}
Define the mapping $H$ over the set of $Q$-functions as follows:
\begin{align*}
(H Q) (s,a) & = \sum_{s'} P(s'|s,a) [r(s, a,s') + \gamma \max_{a'} Q(s',{a'})]\\
& = E[r(s,a,s_{n+1}) + \gamma \max_{a'} Q(s_{n+1},{a'})|s_n=s,a_n=a]\,.
\end{align*}
The above Q-learning algorithm can thus be written in the standard SA form,
with the noise vector $\om_n$ given by:
$$
\om_n(s,a) = r(s,a,s'_{s,a}) + \gamma \max_{a'} Q_n (s'_{s,a}, {a'}) -
(HQ) (s,a)
\,.
$$
We proceed to verify the assumptions in Theorem \ref{thm:SA_2}:
\begin{itemize}
\item[(a)]
Step-size requirements hold here by assumption.
\item[(b)]
Noise Assumption N1:
The definition of $\om_n$ immediately implies that
$ E (\om_n(s,a) | \cF_n) = 0$.
It is further easily seen that, for some finite constants $A,B$,
$$
E (\om_n(s,a)^2 | \cF_n) \le A + B
\|Q_n\|_{\infty}^2\,.
$$
\item[(c)]
Contraction:
As with the discounted DP operator, it may be verified that $H$ is a
$\gamma$-contraction w.r.t.\ the max-norm (a short verification is given
after this list).
\end{itemize}
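Indeed, for any two $Q$-functions $Q_1,Q_2$ and every pair $(s,a)$,
\begin{align*}
|(HQ_1)(s,a)-(HQ_2)(s,a)|
& = \gamma \Big|\sum_{s'} P(s'|s,a)\big[\max_{a'} Q_1(s',a') - \max_{a'} Q_2(s',a')\big]\Big| \\
& \le \gamma \sum_{s'} P(s'|s,a) \max_{a'} \big|Q_1(s',a') - Q_2(s',a')\big|
\; \le \; \gamma \|Q_1-Q_2\|_\infty \,,
\end{align*}
so that $\|HQ_1-HQ_2\|_\infty \le \gamma \|Q_1-Q_2\|_\infty$.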
The required convergence result therefore follows by Theorem \ref{thm:SA_2}.
\end{proof}
\paragraph{Remarks on basic (on-policy) Q-learning:}
\begin{itemize}
\item
In the basic version of the algorithm, we follow a state-action sequence
$(s_n, a_n; n=0,1,\cdots)$ which is generated by some arbitrary policy,
and at time $n$ update $Q(s, a)$ only for $(s,a)=(s_n,a_n)$.
This corresponds to the choice of gains:
$$
\alpha_n(s,a) > 0 \quad \text{iff} \quad
(s,a) = (s_n, a_n)
\,.
$$
\item
For $(s,a)= (s_n, a_n)$, a typical choice for $\alpha_n$ is
$$
\alpha_n(s,a) = \hat\alpha(N_n (s,a))
$$
where $N_n(s,a)$ is the number of previous visits to the pair $(s,a)$, and
$\hat\alpha(k)$ satisfies the standard step-size assumptions.
\item
For the step-size requirements in the theorem to hold in this case, each
$(s,a)$ pair must be visited ``relatively often''.
This should be ensured by appropriate exploration policies!
(A schematic code sketch of this basic variant follows the list.)
\end{itemize}
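To make the basic variant concrete, here is a minimal Python sketch of tabular
$Q$-learning with visit-count gains $\hat\alpha(k)=1/k$; the small random MDP and
the $\epsilon$-greedy exploration rule are illustrative assumptions, not part of
the notes.
\begin{verbatim}
import numpy as np

# Hypothetical small MDP, used only to illustrate the update rule.
rng = np.random.default_rng(0)
nS, nA, gamma, eps = 4, 2, 0.9, 0.1
P = rng.dirichlet(np.ones(nS), size=(nS, nA))   # P[s, a, :] = p(.|s, a)
R = rng.uniform(0.0, 1.0, size=(nS, nA))        # r(s, a); reward taken independent of s'

Q = np.zeros((nS, nA))
N = np.zeros((nS, nA))                          # visit counts N_n(s, a)
s = 0
for n in range(200000):
    # epsilon-greedy exploration, so that every (s, a) is visited "relatively often"
    a = rng.integers(nA) if rng.random() < eps else int(Q[s].argmax())
    s_next = rng.choice(nS, p=P[s, a])
    N[s, a] += 1
    alpha = 1.0 / N[s, a]                       # alpha_n(s, a) = alpha_hat(N_n(s, a))
    # Q-learning update for the visited pair only; alpha_n = 0 for all other pairs
    Q[s, a] += alpha * (R[s, a] + gamma * Q[s_next].max() - Q[s, a])
    s = s_next
\end{verbatim}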
\paragraph{Undiscounted case:}
Under appropriate ``Stochastic Shortest Path''
assumptions, it can be shown that $H$ is a pseudo-contraction w.r.t.\ some
weighted max-norm.
Convergence follows as above.
\section{Convergence of TD($\la$)}
TD(0) can be analyzed exactly as Q-learning.
TD($\la$) is slightly more involved.
Recall the ``on-line'' version of TD($\la$):
$$
V_{n+1}(s)=V_n(s)+\alpha_n e_n(s)d_n\,,\quad s\in S
$$
where
\begin{align*}
\alpha_n & = \mbox{\rm gain} \\
e_n(s) & = \mbox{\rm eligibility trace coefficient}\\
d_n & = r_n + \gamma V_n(s_{n+1})-V_n(s_n)\\
\gamma& = \mbox{\rm discount factor}
\end{align*}
\paragraph{Requirements on the Eligibility Trace:}
Several variants of the algorithm are obtained by different choices
of $e_n(s)$, such as:
\begin{itemize}
\item[(a)] First-visit TD($\la$):
$$
e_n(s)=(\gamma\la)^{n-m_1(s)} 1\{n\ge m_1(s)\} \,,
$$
where $m_1(s)$ is the time of the first visit to state $s$ (during the present run).
\item[(b)] Every-visit TD($\la$):
$$
e_n(s)=\sum_{j:m_j(s)\le n} (\gamma\la)^{n-m_j(s)} \,,
$$
where $m_j(s)$ is the time of the $j$-th visit to state $s$ (this is the variant used in the code sketch below).
\item[(c)] First-visit with stopping:
$$
e_n(s)=(\gamma\la)^{n-m_1(s)} 1\{m_1(s)\le n \le \tau\}
$$
where $\tau$ is some stopping time -- e.g., end of simulation run, or arrival
to a state whose value $V(s)$ is known with high precision. $e_n(s)$ is
restarted after $\tau$.
\end{itemize}
A {\em general set of requirements} on the eligibility coefficients $e_n(s)$, which
includes the above cases, is given as follows:
\begin{itemize}
\item[(a)]
$e_0(s)= 0$, $e_n(s)\ge 0$.
\item[(b)]
$e_n(s)\le \gamma e_{n-1}(s)$ if $s_n\not= s$, \\
$1\le e_n(s)\le 1+ \gamma e_{n-1}(s)$ if $s_n= s$.
\item[(c)]
$e_n(s)$ is measurable on the past.
\end{itemize}
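As a concrete illustration, here is a minimal Python sketch of the on-line
update above, using the every-visit (accumulating) trace recursion of variant (b);
the transition sampler \texttt{sample\_transition} is a hypothetical stand-in for
simulating one step of the chain induced by the fixed policy $\pi$.
\begin{verbatim}
import numpy as np

def td_lambda(sample_transition, nS, gamma, lam, n_steps, s0=0):
    """On-line TD(lambda) with every-visit (accumulating) eligibility traces.

    sample_transition(s) -> (r, s_next): one step of the Markov chain induced
    by the fixed stationary policy pi (hypothetical interface).
    """
    V = np.zeros(nS)
    e = np.zeros(nS)                      # eligibility traces, e_0(s) = 0
    s = s0
    for n in range(n_steps):
        r, s_next = sample_transition(s)
        d = r + gamma * V[s_next] - V[s]  # temporal difference d_n
        e *= gamma * lam                  # decay all traces by gamma * lambda
        e[s] += 1.0                       # every-visit: accumulate at s_n = s
        alpha = 1.0 / (n + 1)             # a simple gain sequence for illustration
        V += alpha * e * d                # V_{n+1}(s) = V_n(s) + alpha_n e_n(s) d_n
        s = s_next
    return V
\end{verbatim}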
\paragraph{Convergence:}
We now argue as follows.
\begin{itemize}
\item
It may be seen that TD($\la$) is in the form of the Stochastic
Approximation algorithm, with $\theta_n\equiv V_n$, and
$$
h(\theta)\equiv h(V) = (h(V)(s),\,s\in S)\,,
$$
\begin{align*}
h(V)(s) & = E^\pi [d_n \mid V_n=V,\, s_n=s]\\
& = \sum_a \pi(a|s)\Big[r(s,a)+\gamma\sum_{s'} p(s'|s,a)V(s')\Big]-V(s) \\
& =: (HV)(s) -V(s) \,.
\end{align*}
Here $\pi$ is the {\em fixed stationary} policy that is used.
\item
For $0<\gamma<1$, $H$ is a $\gamma$-contraction w.r.t.\ the max-norm, by the same argument as for $Q$-learning.
\item
For convergence we now need to verify that the effective gains
$\alpha_n e_n(s)$ satisfy the ``usual assumptions''. This may be
verified by requiring that each state is visited ``relatively often''.
\end{itemize}
For $\gamma=1$, a similar argument may be made for SSP (Stochastic Shortest Path)
problems.
%
% \subsection{An ODE Perspective}
%
% It is instructive to examine the convergence of these algorithms via the
% ODE approach.
%
% Recall that Q-learning is in the form of the SA algorithm,
% with $h(Q)=H(Q)-Q$, and $H$ is a max-norm contraction.
%
% The corresponding ODE is
% $$
% \dot Q = H(Q)-Q
% $$
% It can be shown that if $H$ is a max-norm contraction, then the fixed
% point $Q^*$ of $H(Q)=Q$ is globally asymptotically stable.
%
% To establish convergence it only remains to show that $\{\hat{Q}_n\}$
% is a bounded sequence (w.p.\ 1). This is usually done via separate
% analysis. However, a recent result
% shows that boundedness too can be deduced via the ODE approach
% (Borkar and Meyn, 2000):
%
% Given $h(Q)\equiv h(\theta)$, define
% $$
% h_{\rho}(\theta)\dfn h(\rho\theta)/\rho\,,
% $$
% and assume that $h_\infty(\theta)\dfn\lim_{\rho\to\infty}h_{\rho}(\theta)$
% exists.
%
% This limit would exist when $h$ has some ``linear nature". For example,
% for Q-learning we have that $h_{\infty}(Q)\; =$ same as $h(Q)$, but
% with 0 rewards, $r(s,a)\equiv 0$.
%
% {\bf Theorem:} In the basic SA algorithm, if $h_{\infty}$ is well defined, and
% the normalized ODE
% $$
% \dot \theta = h_{\infty}(\theta)
% $$
% has $\theta^*=0$ as an asymptotically stable equilibrium point, then
% $\sup_n |\theta_n| < \infty$ (w.p. 1).
%
% A similar conclusion holds for the ``distributed" versions under appropriate
% conditions.
\section{Actor-Critic Algorithms}
Convergence of actor-critic type algorithms is harder to analyze. We describe here some results from
Konda and Borkar (2000).
Recall that the idea is to use a ``fast'' estimation loop to obtain $\hat V(s)$,
and a slower loop to update
the policy $\hat \pi$ given $\hat V$.
Let $V_n(s)$ and $\pi_n=(\pi_n(a|s))$ be the estimated value and policy at step
$n$.
\paragraph{Algorithm 1}
\begin{itemize}
\item[a.]
Value-function estimation (generalized TD(0)):
$$
V_{n+1}(s)=V_n(s)+\beta_n(s)[r(s,a_n(s))+\gamma V_n(s_{n+1}(s))-V_n(s)]\,,\quad s\in Y_n
$$
where
\begin{align*}
& Y_n \mbox{ \rm -- set of states updated at step $n$} \\
& \beta_n(s) \mbox{ \rm -- gains }\\
& a_n(s) \mbox{ \rm -- action at state $s$, sampled according to $\pi_n(\cdot|s)$}\\
& s_{n+1}(s) \mbox{ \rm -- next state, chosen with distribution
$p(s') = \sum_a p(s'|s,a)\pi_n(a|s)$.}
\end{align*}
\item[b.] Policy update:
$$
\pi_{n+1}(a|s)=\pi_{n}(a|s) + \alpha_n(s,a) \big[ \hat{Q}_n(s,a) - \hat{Q}_n(s,a_0) \big]\,,
\quad (s,a) \in Z_n
$$
where
\begin{align*}
& Z_n \mbox{ \rm -- set of state-action pairs updated at step $n$} \\
& \alpha_n(s,a) \mbox{ \rm -- gains} \\
& \hat{Q}_n(s,a):=r(s,a)+\gamma V_n(s_{n+1}(s,a))\\
& s_{n+1}(s,a) \mbox{ \rm -- next state, chosen according to $p(s'|s,a)$} \\
& a_0 \mbox { \rm -- a fixed reference action (for each state).}
\end{align*}
\item[b'.] Policy normalization:
For each $s$, project the vector $(\pi_{n+1}(a|s),\; a\not= a_0)$ onto the
following set of sub-probability vectors:
$$
\{\pi: \pi(a)\ge 0,\;\; \sum_{a\not = a_0}\pi(a)\le 1\}
$$
and then let $\pi_{n+1}(a_0|s)=1-\sum_{ a\not = a_0}\pi_{n+1}(a|s) $.
% The above scheme can be considered a ``variable structure" automaton". Note that
% the range of $Q$ is not known in advance.
\item[c.] Rate requirements:
We first require that all updates are executed relatively often, namely that for
some $\Delta>0$,
$$
\liminf_{n\to\infty}\frac{n_1(s)}{n} \ge \Delta\,,\quad
\liminf_{n\to\infty}\frac{n_2(s,a)}{n} \ge \Delta\,,
$$
where
\begin{align*}
n_1(s) &= \sum_{k=1}^n 1\{s\in Y_k\} \\
n_2(s,a) &= \sum_{k=1}^n 1\{(s,a) \in Z_k\} \,.
\end{align*}
The gains are determined by some sequences $\alpha(m)$ and $\beta(m)$, as
$$
\alpha_n(s,a)=\alpha(n_2(s,a))\,,\quad \beta_n(s)=\beta(n_1(s)) \,.
$$
The sequences $\alpha(m)$, $\beta(m)$ should satisfy:
(1) The standard summability assumptions.
(2) Policy updates are ``slower'': $\lim_{m\to\infty}\frac{\alpha(m)}{\beta(m)}=0$.
(3) Some additional technical assumptions ...
All these requirements are satisfied, e.g., by
$\alpha(m)=\frac{1}{m\log m}$, $\beta(m)=\frac{1}{m}$.
\end{itemize}
Under these assumptions, Algorithm 1 converges to the optimal value and
policy.
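To illustrate the two-time-scale structure, here is a schematic Python sketch of
Algorithm 1 with synchronous updates ($Y_n=S$, $Z_n=S\times A$, so that
$n_1(s)=n_2(s,a)=n$) and the gains $\alpha(m)=\frac{1}{m\log m}$,
$\beta(m)=\frac{1}{m}$ mentioned above; the small random MDP and the simple
clip-and-rescale used in step (b') are illustrative assumptions, not part of the
notes.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
nS, nA, gamma = 5, 3, 0.9
P = rng.dirichlet(np.ones(nS), size=(nS, nA))    # P[s, a, :] = p(.|s, a)
R = rng.uniform(0.0, 1.0, size=(nS, nA))         # r(s, a)

V = np.zeros(nS)                                 # critic: value estimates V_n
pi = np.full((nS, nA), 1.0 / nA)                 # actor: policy pi_n(a|s)
a0 = 0                                           # fixed reference action (index 0)

def alpha(m):                                    # "slow" actor gains ~ 1/(m log m)
    return 1.0 / (m * np.log(m + 2.0))           # +2 avoids log(1) = 0

def beta(m):                                     # "fast" critic gains 1/m
    return 1.0 / m

for n in range(1, 50001):
    # (a) critic: generalized TD(0), updating every state (Y_n = S)
    for s in range(nS):
        a = rng.choice(nA, p=pi[s])
        s1 = rng.choice(nS, p=P[s, a])
        V[s] += beta(n) * (R[s, a] + gamma * V[s1] - V[s])
    # (b) actor: one-sample Q-hat estimates, all pairs updated (Z_n = S x A)
    for s in range(nS):
        q = np.array([R[s, a] + gamma * V[rng.choice(nS, p=P[s, a])]
                      for a in range(nA)])       # Q-hat_n(s, a)
        pi[s, 1:] += alpha(n) * (q[1:] - q[a0])  # compare against reference a0
    # (b') project (pi(a|s), a != a0) onto the sub-probability vectors
    pi[:, 1:] = np.clip(pi[:, 1:], 0.0, None)
    mass = pi[:, 1:].sum(axis=1)
    over = mass > 1.0
    pi[over, 1:] /= mass[over, None]
    pi[:, a0] = 1.0 - pi[:, 1:].sum(axis=1)
\end{verbatim}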
\paragraph{Algorithm 2:}
Same as Algorithm 1, except for the policy update (b):
$$
\pi_{n+1}(a|s)=\pi_n(a|s)+\alpha_n(s,a)\big[\{\hat Q_n(s,a)-V_n(s)\}\pi_n(a|s)+\xi_n(s,a)
\big] \,.
$$
Here $\xi_n(s,a)$ are sequences of ``small'' noise terms; these are needed to prevent
the algorithm from getting stuck in the wrong ``corners''.
\paragraph{Algorithm 3:}
Same as Algorithm 1, except for (b):
$$
w_{n+1}(s,a)=w_n(s,a)+\alpha_n(s,a)[\hat Q_n(s,a)-V_n(s)]
$$
and
$$
\pi_n(a|s):=\frac{\exp(w_n(s,a))}{\sum_{a'}\exp(w_n(s,a'))} \,.
$$
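As a sketch, the actor step (b) in the Algorithm 1 code above would then be
replaced by the following softmax update (reusing the hypothetical quantities
\texttt{nS}, \texttt{nA}, \texttt{R}, \texttt{P}, \texttt{gamma}, \texttt{rng},
\texttt{V}, \texttt{pi} and the gain \texttt{alpha} defined there):
\begin{verbatim}
w = np.zeros((nS, nA))                           # softmax weights w_n(s, a)

def algorithm3_actor_step(w, pi, V, n):
    """One actor update of Algorithm 3: adjust the weights, then recompute pi."""
    for s in range(nS):
        q = np.array([R[s, a] + gamma * V[rng.choice(nS, p=P[s, a])]
                      for a in range(nA)])       # one-sample Q-hat_n(s, a)
        w[s] += alpha(n) * (q - V[s])            # w_{n+1}(s, a) update from step (b)
        z = np.exp(w[s] - w[s].max())            # numerically stable softmax
        pi[s] = z / z.sum()                      # pi_n(a|s) = softmax(w_n(s, .))
\end{verbatim}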
% Remark: Compare these algorithms to the Static Reinforcement Learning chapter.
In all these variants, convergence is proved using a ``two-time-scale''
Stochastic Approximation framework; the analysis is based on the ODE
method, which couples a ``fast''
ODE (for $V$) and a ``slow'' ODE (for $\pi$).
%
%% }
%% large ended
%
%\end{document}
%~