# Applying LoRAs to Translation Equivariant Self-Attention in Transformers

Suppose we are given an input matrix $X \in \mathbb{R}^{d_{in} \times d}$, with columns $X_1, \dots, X_d$ representing the embedding vectors of $d$ tokens. We can formulate the entries of the self-attention matrix in a transformer with relative positional encoding as 

\begin{equation}
A_{i, j} = (W_Q X_i)^T \, W_K (X_j + P_{x(j)-x(i)})
\end{equation}

We leave out the scaling factor $1/\sqrt{d_k}$ for simplicity, but it can easily be included without disrupting any of our arguments or proofs. However, it will be more beneficial for our purposes to formulate self-attention in a function-theoretic way. In particular, we can view the input matrix $X$ as a vector-valued function $f:S \to \mathbb{R}^{d_{in}}$, that is $f \in L_{\mathbb{R}^{d_{in}}}(S)$, for the index set $S = \{1, 2, \dots, d\}$. We then view the query and key matrices as maps $\varphi_{qry}: L_{\mathbb{R}^{d_{in}}}(S) \to L_{\mathbb{R}^{d_k}}(S)$ and $\varphi_{key}: L_{\mathbb{R}^{d_{in}}}(S) \to L_{\mathbb{R}^{d_k}}(S)$. There is also a value function $\varphi_{val}: L_{\mathbb{R}^{d_{in}}}(S) \to L_{\mathbb{R}^{d_v}}(S)$. These maps act pointwise, so we write, for example, $\varphi_{qry}(f(i))$ for the query vector of token $i$. With these in hand, we can express the attention map with positional encoding as
\begin{equation}
A_{i, j} = \alpha[f](i, j) = \langle \varphi_{qry}(f(i)), \varphi_{key}(f(j) + \rho(i, j))\rangle
\end{equation}

Here, we have written $\rho(i, j)$ for the positional encoding. The map $\alpha[f]:S \times S \to \mathbb{R}$ maps pairs of elements $i, j \in S$ to the attention score of $j$ relative to $i$. Writing $\sigma_j$ for the softmax taken over the index $j$, we can then write the attention mechanism as 

\begin{equation}
\zeta[f](i) = \sum_{j \in S} \sigma_j\left( \alpha[f](i, j) \right)\varphi_{val}(f(j))
\end{equation}

Next, we would like to include a LoRA for the query, key, and value maps. We will formulate this as 

\begin{equation}
\Delta\varphi_{qry}(f(i)) = (\varphi^{B}_{qry} \circ \varphi^{A}_{qry})(f(i)) = \varphi^B_{qry}(\varphi^A_{qry}(f(i)))
\end{equation}

\begin{equation}
\Delta\varphi_{key}(f(i)) = (\varphi^{B}_{key} \circ \varphi^{A}_{key})(f(i)) = \varphi^B_{key}(\varphi^A_{key}(f(i)))
\end{equation}

\begin{equation}
\Delta\varphi_{val}(f(i)) = (\varphi^{B}_{val} \circ \varphi^{A}_{val})(f(i)) = \varphi^B_{val}(\varphi^A_{val}(f(i)))
\end{equation}

Here, we have 

\begin{align}
\varphi^A_{qry}: L_{\mathbb{R}^{d_{in}}}(S) \to L_{\mathbb{R}^{r(A)}}(S) &\quad \quad \varphi^B_{qry}: L_{\mathbb{R}^{r(A)}}(S) \to L_{\mathbb{R}^{d_k}}(S) \\
\varphi^A_{key}: L_{\mathbb{R}^{d_{in}}}(S) \to L_{\mathbb{R}^{r(A)}}(S) &\quad \quad \varphi^B_{key}: L_{\mathbb{R}^{r(A)}}(S) \to L_{\mathbb{R}^{d_k}}(S) \\
\varphi^A_{val}: L_{\mathbb{R}^{d_{in}}}(S) \to L_{\mathbb{R}^{r(A)}}(S) &\quad \quad \varphi^B_{val}: L_{\mathbb{R}^{r(A)}}(S) \to L_{\mathbb{R}^{d_v}}(S) 
\end{align}

Next, including this in the attention mechanism, we get

\begin{equation}
\alpha^{LoRA}[f](i, j) = \langle \varphi_{qry}(f(i))+ \Delta\varphi_{qry}(f(i)), \varphi_{key}(f(j) + \rho(i, j)) + \Delta\varphi_{key}(f(j) + \rho(i, j))\rangle
\end{equation}

and then 

\begin{equation}
\zeta^{LoRA}[f](i) = \sum_{j \in S} \sigma_j\left( \alpha^{LoRA}[f](i, j) \right)(\varphi_{val}(f(j))+ \Delta\varphi_{val}(f(j)))
\end{equation}

Now, in order to have translation equivariance of a LoRA multihead self-attention with relative positional encoding, we need the following equation to hold, 

\begin{equation}
m^r_{LoRA}[L_y[f], \rho](i) = L_y[m^r_{LoRA}[f, \rho]](i)
\end{equation}

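where $L_y[f](i) = f(x^{-1}(x(i)-y))$ is the translation of $f$ by $y$, $x(i)$ denotes the position of token $i$, and $x^{-1}$ maps a position back to its token index. Below, $N(i)$ denotes the set of tokens attended to by token $i$, and $\bigcup_{h \in [H]}$ denotes the concatenation over the $H$ attention heads. The LoRA multihead self-attention with relative positional encodings on $L_y[f]$ is given by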


\begin{align}
m^r_{LoRA}[L_y[f], \rho](i) &= \varphi_{out}\Bigg( \bigcup_{h \in [H]} \sum_{j \in N(i)} \sigma_j \Bigg( \Big\langle \varphi_{qry}^{(h)}(L_y[f](i)) + \Delta\varphi_{qry}^{(h)}(L_y[f](i)), \\
&\quad \varphi_{key}^{(h)}(L_y[f](j)+\rho(i, j)) + \Delta\varphi_{key}^{(h)}(L_y[f](j)+\rho(i, j)) \Big\rangle\Bigg) \Bigg(\varphi_{val}^{(h)}(L_y[f](j)) + \Delta\varphi_{val}^{(h)}(L_y[f](j)) \Bigg)\Bigg)\\
&= \varphi_{out}\Bigg( \bigcup_{h \in [H]} \sum_{j \in N(i)} \sigma_j \Bigg( \Big\langle \varphi_{qry}^{(h)}(f(x^{-1}(x(i)-y))) + \Delta\varphi_{qry}^{(h)}(f(x^{-1}(x(i)-y))), \\
&\quad \varphi_{key}^{(h)}(f(x^{-1}(x(j)-y))+\rho(i, j)) + \Delta\varphi_{key}^{(h)}(f(x^{-1}(x(j)-y))+\rho(i, j)) \Big\rangle\Bigg) \Bigg(\varphi_{val}^{(h)}(f(x^{-1}(x(j)-y))) + \Delta\varphi_{val}^{(h)}(f(x^{-1}(x(j)-y))) \Bigg)\Bigg) \\
&= \varphi_{out}\Bigg( \bigcup_{h \in [H]} \sum_{x^{-1}(x(\overline{j})+y) \in N(x^{-1}(x(\overline{i})+y))} \sigma_{x^{-1}(x(\overline{j})+y)} \Bigg( \Big\langle \varphi_{qry}^{(h)}(f(\overline{i})) + \Delta\varphi_{qry}^{(h)}(f(\overline{i})), \\
&\quad \varphi_{key}^{(h)}(f(\overline{j})+\rho(x^{-1}(x(\overline{i})+y), x^{-1}(x(\overline{j})+y))) + \Delta\varphi_{key}^{(h)}(f(\overline{j})+\rho(x^{-1}(x(\overline{i})+y), x^{-1}(x(\overline{j})+y))) \Big\rangle\Bigg) \Bigg(\varphi_{val}^{(h)}(f(\overline{j})) + \Delta\varphi_{val}^{(h)}(f(\overline{j})) \Bigg)\Bigg)
\end{align}

Here we have used the substitution $\overline{i} = x^{-1}(x(i)-y) \implies i = x^{-1}(x(\overline{i})+y)$ and $\overline{j} = x^{-1}(x(j)-y) \implies j = x^{-1}(x(\overline{j})+y)$. We can further reduce the equations using the definition of the relative positional encoding, $\rho(i, j) = \rho^P(x(j) - x(i))$:

\begin{align}
&= \varphi_{out}\Bigg( \bigcup_{h \in [H]} \sum_{x^{-1}(x(\overline{j})+y) \in N(x^{-1}(x(\overline{i})+y))} \sigma_{x^{-1}(x(\overline{j})+y)} \Bigg( \Big\langle \varphi_{qry}^{(h)}(f(\overline{i})) + \Delta\varphi_{qry}^{(h)}(f(\overline{i})), \\
&\quad \varphi_{key}^{(h)}(f(\overline{j})+\rho^P(x(\overline{j}) + y - (x(\overline{i})+y))) + \Delta\varphi_{key}^{(h)}(f(\overline{j})+\rho^P(x(\overline{j}) + y - (x(\overline{i})+y))) \Big\rangle\Bigg) \Bigg(\varphi_{val}^{(h)}(f(\overline{j})) + \Delta\varphi_{val}^{(h)}(f(\overline{j})) \Bigg)\Bigg)\\
&= \varphi_{out}\Bigg( \bigcup_{h \in [H]} \sum_{x^{-1}(x(\overline{j})+y) \in N(x^{-1}(x(\overline{i})+y))} \sigma_{x^{-1}(x(\overline{j})+y)} \Bigg( \Big\langle \varphi_{qry}^{(h)}(f(\overline{i})) + \Delta\varphi_{qry}^{(h)}(f(\overline{i})), \\
&\quad \varphi_{key}^{(h)}(f(\overline{j})+\rho^P(x(\overline{j}) - x(\overline{i}))) + \Delta\varphi_{key}^{(h)}(f(\overline{j})+\rho^P(x(\overline{j}) - x(\overline{i}))) \Big\rangle\Bigg) \Bigg(\varphi_{val}^{(h)}(f(\overline{j})) + \Delta\varphi_{val}^{(h)}(f(\overline{j})) \Bigg)\Bigg)\\
&= \varphi_{out}\Bigg( \bigcup_{h \in [H]} \sum_{x^{-1}(x(\overline{j})+y) \in N(x^{-1}(x(\overline{i})+y))} \sigma_{x^{-1}(x(\overline{j})+y)} \Bigg( \Big\langle \varphi_{qry}^{(h)}(f(\overline{i})) + \Delta\varphi_{qry}^{(h)}(f(\overline{i})), \\
&\quad \varphi_{key}^{(h)}(f(\overline{j})+\rho(\overline{i}, \overline{j})) + \Delta\varphi_{key}^{(h)}(f(\overline{j})+\rho(\overline{i}, \overline{j})) \Big\rangle\Bigg) \Bigg(\varphi_{val}^{(h)}(f(\overline{j})) + \Delta\varphi_{val}^{(h)}(f(\overline{j})) \Bigg)\Bigg)
\end{align}

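For any translation $y$ of the positions (that is, $y$ lives in the same space as the positions $x(i)$ and $x(j)$, not in the feature space $\mathbb{R}^{d_{in}}$ of $f(i)$ and $f(j)$), the set being summed over is merely re-indexed, so the summation remains the same and we have: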

\begin{equation}
\sum_{x^{-1}(x(\overline{j})+y) \in N(x^{-1}(x(\overline{i})+y))}[\bullet] = \sum_{x^{-1}(x(\overline{j})) \in N(x^{-1}(x(\overline{i})))}[\bullet] = \sum_{\overline{j} \in N(\overline{i})}[\bullet]
\end{equation}

As the final reduction of the expression shows, $m^r_{LoRA}[L_y[f], \rho](i) = L_y[m^r_{LoRA}[f, \rho]](i)$. We can thus conclude that relative positional encodings, coupled with LoRAs for the query, key, and value weight matrices, give a translation equivariant multihead self-attention mechanism. In particular, the addition of LoRAs to a translation equivariant model with relative positional encodings does not disrupt the translation equivariance.


Since the summation is unchanged by the translation, we can further reduce the expression for $m^r_{LoRA}[L_y[f], \rho](i)$ as

\begin{align}
m^r_{LoRA}[L_y[f], \rho](i) &= \varphi_{out}\Bigg( \bigcup_{h \in [H]} \sum_{\overline{j} \in N(\overline{i})} \sigma_{\overline{j}} \Bigg( \Big\langle \varphi_{qry}^{(h)}(f(\overline{i})) + \Delta\varphi_{qry}^{(h)}(f(\overline{i})), \\
&\quad \varphi_{key}^{(h)}(f(\overline{j})+\rho(\overline{i}, \overline{j})) + \Delta\varphi_{key}^{(h)}(f(\overline{j})+\rho(\overline{i}, \overline{j})) \Big\rangle\Bigg) \Bigg(\varphi_{val}^{(h)}(f(\overline{j})) + \Delta\varphi_{val}^{(h)}(f(\overline{j})) \Bigg)\Bigg)\\
&= m^r_{LoRA}[f, \rho](\overline{i})\\
&= m^r_{LoRA}[f, \rho](x^{-1}(x(i)-y))\\
&= L_y[m^r_{LoRA}[f, \rho]](i)
\end{align}