diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml
index 2cdaad8..470f52d 100644
--- a/.github/workflows/static.yml
+++ b/.github/workflows/static.yml
@@ -4,7 +4,7 @@ name: Deploy static content to Pages
on:
# Runs on pushes targeting the default branch
push:
- branches: ["master"]
+ branches: ["master", "beamer-tutorial-2025"]
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
@@ -22,58 +22,53 @@ concurrency:
cancel-in-progress: false
jobs:
- build_pdf: # New job for PDF compilation
+ build_pdf: # Job for LaTeX and Beamer PDF compilation
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- - name: Compile LaTeX to PDF
+ - name: Compile LaTeX (main.tex)
uses: xu-cheng/latex-action@v3
with:
root_file: main.tex
-
- - name: Upload PDF artifact
+ extra_packages: >-
+ texlive-fonts-recommended
+ texlive-latex-recommended
+ texlive-latex-extra
+ texlive-science
+ texlive-xetex
+ latexmk
+
+ - name: Compile Beamer tutorial (beamer-tutorial.tex)
+ uses: xu-cheng/latex-action@v3
+ with:
+ root_file: beamer-tutorial.tex
+ extra_packages: >-
+ texlive-fonts-recommended
+ texlive-latex-recommended
+ texlive-latex-extra
+ texlive-science
+ texlive-xetex
+ latexmk
+
+ - name: Upload main PDF artifact
uses: actions/upload-artifact@v4
with:
name: latex-pdf-output
- path: main.pdf # Assuming main.pdf is in the root after compilation
-
- build_html: # New job for HTML compilation
- runs-on: ubuntu-latest
- # This entire job will run inside the specified container
- container:
- image: texlive/texlive:latest # This container has mk4ht
-
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
- # Note: When using a container, 'actions/checkout' checks out into /github/workspace
- # The default working directory for subsequent 'run' commands will be this path.
-
- - name: Compile LaTeX to HTML using tex4ht
- # 'mk4ht' is now guaranteed to be available in this container environment
- run: |
- echo "Starting HTML compilation with mk4ht..."
- mk4ht htlatex main.tex "config.cfg,xhtml" "-p"
-
- # Create output directory and move files
- mkdir -p html_output
- mv main.html html_output/
- # Add commands to move other generated files like .css, image folders etc.
- if [ -f main.css ]; then mv main.css html_output/; fi
- if [ -d main-images ]; then mv main-images html_output/; fi
- echo "HTML compilation complete and files moved to html_output/."
+ path: main.pdf
- - name: Upload HTML artifact
+ - name: Upload beamer PDF artifact
uses: actions/upload-artifact@v4
with:
- name: latex-html-output
- path: html_output/
+ name: beamer-pdf-output
+ path: beamer-tutorial.pdf
# Single deploy job since we're just deploying
deploy:
- needs: [build_pdf, build_html]
+ # Only deploy from the protected default branch
+ if: github.ref == 'refs/heads/master'
+ needs: [build_pdf]
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
@@ -86,16 +81,15 @@ jobs:
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
- path: artifacts/ # Downloads both latex-pdf-output and latex-html-output
+ path: artifacts/ # Downloads both latex-pdf-output and beamer-pdf-output
- name: Prepare public directory for Pages
run: |
mkdir public
- # Move PDF to public/
+ # Move PDFs to public/
mv artifacts/latex-pdf-output/main.pdf public/
- # Move HTML files to public/html/
- mkdir -p public/html
- mv artifacts/latex-html-output/* public/html/
+ mv artifacts/beamer-pdf-output/beamer-tutorial.pdf public/
+ ls -l public
- name: Setup Pages
uses: actions/configure-pages@v5
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bf78bc8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,45 @@
+# LaTeX intermediate files
+*.aux
+*.fdb_latexmk
+*.fls
+*.log
+*.nav
+*.out
+*.snm
+*.synctex.gz
+*.toc
+*.vrb
+
+# LaTeX build files
+*.bbl
+*.blg
+*.idx
+*.ilg
+*.ind
+*.lof
+*.lot
+*.glo
+*.gls
+*.glg
+*.acn
+*.acr
+*.alg
+*.bcf
+*.run.xml
+*.figlist
+*.makefile
+*.fls
+*.fdb_latexmk
+
+# LaTeX backup files
+*.bak
+*~
+
+# LaTeX editor files
+*.swp
+*.swo
+*~
+
+# PDF files (comment out the following line if you want to track compiled PDFs)
+*.pdf
+
diff --git a/AALP.tex b/AALP.tex
new file mode 100644
index 0000000..5fc6c76
--- /dev/null
+++ b/AALP.tex
@@ -0,0 +1,55 @@
+
+\section{Advanced ALP}
+
+This Section treats some more advanced ALP topics by which programmers can exercise tighter control over performance or semantics.
+
+\subsection{Performance Optimisation through Descriptors}
+
+We have previously seen that the semantics of primitives may be subtly changed by the use of descriptors: e.g., adding \texttt{grb::descriptors::transpose\_matrix} to \texttt{grb::mxv} has the primitive interpret the given matrix $A$ as its transpose ($A^T$) instead. Other descriptors, however, may also modify the performance semantics of a primitive. One example is the \texttt{grb::descriptors::dense} descriptor, which has two main effects when supplied to a primitive:
+\begin{enumerate}
+ \item all vector arguments to the primitive must be dense on primitive entry; and
+ \item any code paths that check sparsity or deal with sparsity are disabled.
+\end{enumerate}
+The latter effect directly affects performance, which is particularly evident for the \texttt{grb::nonblocking} backend. A further consequence of the latter is that the produced binary code is also smaller in size.\vspace{.5\baselineskip}
+
+\noindent \textbf{Exercise 12}: inspect the implementation of the PCG method in ALP. Run experiments using the \texttt{nonblocking} backend, comparing the performance of repeated linear solves with and without the dense descriptor. Also inspect the size of the binary. \textbf{Hint}: try \verb|make -j$(nproc) build_tests_category_performance| and see if an executable is produced in \texttt{tests/performance} that helps you complete this exercise faster.
+
+\subsection{Explicit SPMD}
+
+When compiling any ALP program with a distributed-memory backend such as \texttt{bsp1d} or \texttt{hybrid}, ALP automatically parallelises across multiple user processes. Most of the time this suffices, however, in some rare cases, the ALP programmer requires exercising explicit control over distributed-memory parallelism. Facilities for these exist across three components, in order of increasing control: \texttt{grb::spmd}, \texttt{grb::collectives}, %\texttt{grb::rdma},
+and \emph{explicit backend dispatching}.
+
+\subsubsection*{Basic SPMD}
+
+When selecting a distributed-memory backend, ALP automatically generates SPMD code without the user having to intervene. The \texttt{grb::spmd} class exposes these normally-hidden SPMD constructs to the programmer: 1) \texttt{grb::spmd<>::nprocs()} returns the number of user processes in the current ALP program, while 2) \texttt{grb::spmd<>::pid()} returns the unique ID of the current user process.
+
+\noindent \textbf{Exercise 13}: try to compile and run the earlier hello-world example using the \texttt{bsp1d} backend. How many hello world messages are printed? \textbf{Hint}: use \texttt{-np 2} to \texttt{grbrun} to spawn two user processes when executing the program. Now modify the program so that no matter how many user processes are spawned, only one message is printed to the screen (\texttt{stdout}).
+
+\subsubsection*{Collectives}
+
+The most common way to orchestrate data movement between user processes are the so-called \emph{collective communications}. Examples include:
+\begin{enumerate}
+ \item \emph{broadcast}, a communication pattern where one of the user processes is designated the \emph{root} of the communication, and has one payload message that should materialise on all other user processes.
+ \item \emph{allreduce}, a communication pattern where all user processes have a value that should be \emph{reduced} into a single aggregate value, which furthermore must be available at each user process.
+\end{enumerate}
+
+ALP also exposes collectives, and in the case of (all)reduce does so in an algebraic manner-- that is, the signature of an allreduce expects an explicit monoid that indicates how aggregation is supposed to occur:
+
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false, morekeywords=constexpr, morekeywords=size_t ]
+size_t to_be_reduced = grb::spmd<>::pid();
+grb::monoid::max< int > max;
+grb::RC rc = grb::collectives<>::allreduce( to_be_reduced, max );
+if( rc == grb::SUCCESS ) { assert( to_be_reduced + 1 == grb::spmd<>::nprocs() ); }
+if( grb::spmd<>::pid() == 0 ) { std::cout << "There are " << to_be_reduced << " processes\n"; }
+\end{lstlisting}
+
+\noindent \textbf{Exercise 14}: change the initial assignment of \texttt{to\_be\_reduced} to $1$ (at each process). Modify the above example to still compute the number of processes via an allreduce collective. \textbf{Hint}: if aggregation by the max-monoid is not suitable after changing the initialised value, which aggregator would be?
+
+\subsubsection*{Explicit dispatching}
+
+ALP containers are templated in the backend they are compiled for-- this is specified as the second template argument, which is default-initialised to the backend given to the \texttt{grbcxx} wrapper. This means the backend is part of the type of ALP containers, which, in turn, enables the compiler to generate, for each ALP primitive, the code that corresponds to the requested backend. It is, however, possible to manually override this backend template argument, which is useful in conjunction with SPMD in that the combination allows the programmer to define operations that should execute within a user process only, as opposed to defining operations that should be performed \emph{across} user processes.
+
+For example, within a program compiled with the \texttt{bsp1d} or \texttt{hybrid} backends, a user may define a process-local vector as follows: \texttt{grb::Vector< double, grb::nonblocking > local\_x( local\_n )}, where \texttt{local\_n} is some local size indicator that normally is proportional to $n/P$, with $n$ a global size and $P$ the total number of user processes. Using \texttt{grb::spmd}, the programmer may specify that each user process performs different computations on their local vectors. This results in process-local computations that are totally independent of other processes, which later on may be aggregated into some meaningful global state through, for example, collectives.\vspace{.5\baselineskip}
+
+\noindent\textbf{Exercise 15}: use the mechanism here described to write a program that, when executed using $P$ processes, solves $P$ different linear systems $Ax=b_k$ where $A$ is the same at every process while the $b_k$, $0\leq k
= 3.13) and GNU Make, plus development headers for libNUMA and POSIX threads.
+\item Install prerequisites: Ensure you have a C++11 compatible compiler (e.g. \texttt{g++} 4.8.2 or later) with OpenMP support, CMake (>= 3.13) and GNU Make, plus development headers for libNUMA and POSIX threads.
For example, on Debian/Ubuntu:
\begin{verbatim}
sudo apt-get install build-essential libnuma-dev libpthread-stubs0-dev cmake
@@ -67,7 +67,7 @@ \subsection{Installation on Linux}
(ALP's documentation provides details on which libraries to link for each backend [3].) Using grbcxx is recommended for simplicity, but it's good to know what happens under the hood. Now that our environment is set up, let's look at the CG solver API.
-\section{Overview of ALP's Non-Blocking Sparse CG API}\label{sec:api}
+\subsection{Overview of ALP's Non-Blocking Sparse CG API}\label{sec:api}
The ALP/Solver transition path provides a C-style interface for initializing and running a Conjugate Gradient solver. All functions are exposed via a header (e.g. solver.h in ALP's include directory) and use simple types like pointers and handles. The main functions in this API are:
@@ -83,14 +83,14 @@ \section{Overview of ALP's Non-Blocking Sparse CG API}\label{sec:api}
This API is non-blocking in the sense that internally ALP may overlap operations (like sparse matrix-vector multiplications and vector updates) and use asynchronous execution for performance. However, the above functions themselves appear synchronous. For example, sparse\_cg\_solve will only return after the solve is complete (there’s no separate “wait” call exposed in this C interface). The benefit of ALP’s approach is that you, the developer, don’t need to manage threads or message passing at all. ALP’s GraphBLAS engine handles parallelism behind the scenes. You just call these routines as you would any standard library. Now, let’s put these functions into practice with a concrete example.
-
-\section{Example: Solving a Linear System with ALP’s CG Solver}
-Suppose we want to solve a small system $Ax = b$ to familiarize ourselves with the CG interface. We will use the following $3 \times 3$ symmetric positive-definite matrix $A$: $$ A = \begin{pmatrix} 4 & 1 & 0\\
- 1 & 3 & -1\\
- 0 & -1 & 2 \end{pmatrix}, $$
- and we choose a right-hand side vector $b$ such that the true solution is easy to verify. If we take the solution to be $x = (1,\;2,\;3)$, then $b = A x$ can be calculated as: $$ b = \begin{pmatrix}6 \ 4 \ 4 \end{pmatrix}, $$ since $4\cdot1 + 1\cdot2 + 0\cdot3 = 6$, $1\cdot1 + 3\cdot2 + (-1)\cdot3 = 4$, and $0\cdot1 + (-1)\cdot2 + 2\cdot3 = 4$. Our goal is to see if the CG solver recovers $x = (1,2,3)$ from $A$ and $b$.
-
+\subsection{Example: Solving a Linear System with ALP’s CG Solver}
+
+Suppose we want to solve a small system $Ax = b$ to familiarize ourselves with the CG interface. We will use the following $3 \times 3$ symmetric positive-definite matrix $A$: $$ A = \begin{pmatrix} 4 & 1 & 0\\
+ 1 & 3 & -1\\
+ 0 & -1 & 2 \end{pmatrix}, $$
+ and we choose a right-hand side vector $b$ such that the true solution is easy to verify. If we take the solution to be $x = (1,\;2,\;3)$, then $b = A x$ can be calculated as: $$ b = \begin{pmatrix}6 \\ 4 \\ 4 \end{pmatrix}, $$ since $4\cdot1 + 1\cdot2 + 0\cdot3 = 6$, $1\cdot1 + 3\cdot2 + (-1)\cdot3 = 4$, and $0\cdot1 + (-1)\cdot2 + 2\cdot3 = 4$. Our goal is to see if the CG solver recovers $x = (1,2,3)$ from $A$ and $b$.
+
We will hard-code $A$ in CRS format (also called CSR: Compressed Sparse Row) for the solver. In CRS, the matrix is stored by rows, using parallel arrays for values and column indices, plus an offset index for where each row starts. For matrix $A$ above:
@@ -116,34 +116,34 @@ \section{Example: Solving a Linear System with ALP’s CG Solver}
int main(){
// Define the 3x3 test matrix in CRS format
const size_t n = 3;
-
+
double A_vals[] = {
4.0, 1.0, // row 0 values
1.0, 3.0, -1.0, // row 1 values
-1.0, 2.0 // row 2 values
};
-
+
int A_cols[] = {
0, 1, // row 0 column indices
0, 1, 2, // row 1 column indices
1, 2 // row 2 column indices
};
-
+
int A_offs[] = { 0, 2, 5, 7 }; // row start offsets: 0,2,5 and total nnz=7
-
+
// Right-hand side b and solution vector x
double b[] = { 6.0, 4.0, 4.0 }; // b = A * [1,2,3]^T
double x[] = { 0.0, 0.0, 0.0 }; // initial guess x=0 (will hold the solution)
-
+
// Solver handle
sparse_cg_handle_t handle;
-
+
int err = sparse_cg_init_dii(&handle, n, A_vals, A_cols, A_offs);
if (err != 0) {
fprintf(stderr, "CG init failed with error %d\n", err);
return EXIT_FAILURE;
}
-
+
// (Optional) set a preconditioner here if needed, e.g. Jacobi or others.
// We skip this, so no preconditioner (effectively M = Identity).
err = sparse_cg_solve_dii(handle, x, b);
@@ -153,13 +153,13 @@ \section{Example: Solving a Linear System with ALP’s CG Solver}
sparse_cg_destroy_dii(handle);
return EXIT_FAILURE;
}
-
+
// Print the solution vector x
printf("Solution x = [%.2f, %.2f, %.2f]\n", x[0], x[1], x[2]);
-
+
// Clean up
sparse_cg_destroy_dii(handle);
-
+
return 0;
}
@@ -171,27 +171,27 @@ \section{Example: Solving a Linear System with ALP’s CG Solver}
\begin{itemize}
\item We included (the exact include path might be alp/solver.h or similar depending on ALP’s install, but typically it resides in the GraphBLAS include directory of ALP). This header defines the sparse\_cg\_* functions and the \textbf{sparse\_cg\_handle\_t} type.
-
+
\item We set up the matrix data in CRS format. For clarity, the values and indices are grouped by row in the code. The offsets array \{0,2,5,7\} indicates: row0 uses vals[0..1], row1 uses vals[2..4] , row2 uses vals[5..6]. The matrix dimension n is 3.
-
+
\item We prepare the vectors b and x. b is initialized to \{6,4,4\} as computed above. We initialize x to all zeros (as a starting guess). In a real scenario, you could start from a different guess, but zero is a common default.
-
+
\item We create a \textbf{sparse\_cg\_handle\_t} and call {sparse\_cg\_init}. This hands the matrix to ALP’s solver. Under the hood, ALP will likely copy or reference this data and possibly analyze $A$ for the CG algorithm. We check the return code err, if non-zero, we print an error and exit. (For example, an error might occur if n or the offsets are inconsistent. In our case, it should succeed with err == 0.)
-
+
\item We do not call \textbf{sparse\_cg\_set\_preconditioner} in this example, which means the CG will run unpreconditioned. If we wanted to, we could implement a simple preconditioner. For instance, a Jacobi preconditioner would use the diagonal of $A$: we’d create an array with $\text{diag}(A) = [4,3,2]$ and a function to divide the residual by this diagonal. We would then call \textbf{sparse\_cg\_set\_preconditioner(handle, my\_prec\_func, diag\_data)}. For brevity, we skip this. ALP will just use the identity preconditioner by default (no acceleration).
-
+
\item Next, we call \textbf{sparse\_cg\_solve(handle, x, b)}. ALP will iterate internally to solve $Ax=b$. When this function returns, x should contain the solution. We again check err. A non-zero code could indicate the solver failed to converge (though typically it would still return 0 and one would check convergence via a status or residual, ALP’s API may evolve to provide more info). In our small case, it should converge in at most 3 iterations since $A$ is $3\times3$.
-
+
\item We print the resulting x. We expect to see something close to [1.00, 2.00, 3.00]. Because our matrix and $b$ were consistent with an exact solution of $(1,2,3)$, the CG method should find that exactly (within floating-point rounding). You can compare this output with the known true solution to verify the solver worked correctly.
-
+
\item Finally, we call \textbf{sparse\_cg\_destroy(handle)} to free ALP’s resources for the solver. This is important especially for larger problems to avoid memory leaks. After this, we return from main.
-
+
\end{itemize}
\section*{Building and Running the Example}
-To compile the above code with ALP, we will use the direct linking option as discussed.
+To compile the above code with ALP, we will use the direct linking option as discussed.
\begin{lstlisting}[language=bash, basicstyle=\ttfamily\small, showstringspaces=false]
g++ example.cpp -o cg_demo \
-I $ALP_INSTALL_DIR/include \
@@ -231,7 +231,7 @@ \section*{Building and Running the Example}
supported interfaces. Feel free to experiment with different matrices, add a custom preconditioner function
to see its effect, or try other solvers if ALP introduces them in future releases. Happy coding!
-
+
\bibliographystyle{plain}
diff --git a/ALP_Tutorial.tex b/ALP_Tutorial.tex
index 6b1c4ab..b57d71b 100644
--- a/ALP_Tutorial.tex
+++ b/ALP_Tutorial.tex
@@ -4,7 +4,7 @@ \section{Installation on Linux}\label{sec:installation}
This section explains how to install ALP on a Linux system and compile a simple example. To get started:
\begin{enumerate}
-\item \textbf{Install prerequisites}. Ensure you have a C++11 compatible compiler (e.g. \texttt{g++} 4.8.2 or later) with OpenMP support, CMake (>= 3.13) and GNU Make, plus development headers for libNUMA and POSIX threads.
+\item \textbf{Install prerequisites}. Ensure you have a C++11 compatible compiler (e.g. \texttt{g++} 4.8.2 or later) with OpenMP support, CMake (>= 3.13) and GNU Make, plus development headers for libNUMA and POSIX threads.
For example, on Debian/Ubuntu:
\begin{verbatim}
sudo apt-get install build-essential libnuma-dev libpthread-stubs0-dev cmake;
@@ -99,7 +99,7 @@ \section{ALP/GraphBLAS}\label{sec:alp_concepts}
Info: grb::init (reference) called.
Hello from ./a.out
Info: grb::finalize (reference) called.
-$
+$
\end{lstlisting}
\noindent \textbf{Exercise 1.} Double-check that you have the expected output from this example, as we will use its framework in the following exercises.
@@ -261,24 +261,25 @@ \subsection{Copying, Masking, and Standard Matrices}
\subsection{Numerical Linear Algebra}
-GraphBLAS, as the name implies, provides canonical BLAS-like functionalities on sparse matrices, sparse vectors, dense vectors and scalars.
-These include \texttt{grb::foldl}, \texttt{grb::foldr}, \texttt{grb::dot}, \texttt{grb::eWiseAdd},
-\texttt{grb::eWiseMul}, \texttt{grb::eWiseLambda}, \texttt{grb::eWiseApply}, \texttt{grb::mxv},
-\texttt{grb::vxm}, and \texttt{grb::mxm}.
+GraphBLAS, as the name implies, provides canonical BLAS-like functionalities on sparse matrices, sparse vectors, dense vectors and scalars.
+These include \texttt{grb::foldl}, \texttt{grb::foldr}, \texttt{grb::dot}, \texttt{grb::eWiseAdd},
+\texttt{grb::eWiseMul}, \texttt{grb::eWiseLambda}, \texttt{grb::eWiseApply}, \texttt{grb::mxv},
+\texttt{grb::vxm}, and \texttt{grb::mxm}.
The loose analogy with BLAS is intentional, as these primitives cover the same ground as the Level 1, Level 2, and Level 3 BLAS operations.
The former three scalar/vector primitives are dubbed \emph{level 1}, the following two matrix--vector primitives \emph{level 2}, and the latter matrix--matrix primitive \emph{level 3}. Their functionalities are summarised as follows:\newline
- \textbf{grb::foldl, grb::foldr} – These primitives inplace operate on the input/output data structure by applying binary operator from the left or right, respectively.
+ \textbf{grb::foldl, grb::foldr} – These primitives operate in-place on the input/output data structure by applying a binary operator from the left or right, respectively.
They require two data structures and two operators, the accumulation and binary operators oerators, typically taken from a semiring.
- For example, \texttt{grb::foldl( alpha, u, binary\_op, accum\_op )} computes $\alpha$\textit{=}$\alpha$ \textit{accum\_op} $(u_0$ \textit{binary\_op} $(u_1$ \textit{binary\_op} $(\ldots (u_{n-2}$ \textit{binary\_op} $u_{n-1})\ldots)))$, where $u$ is a vector of length $n$.
+ For example, \texttt{grb::foldl( alpha, u, binary\_op, accum\_op )} computes $\alpha$\textit{=}$\alpha$ \textit{accum\_op} $(u_0$ \textit{binary\_op} $(u_1$ \textit{binary\_op} $(\ldots (u_{n-2}$ \textit{binary\_op} $u_{n-1})\ldots)))$, where $u$ is a vector of length $n$.\newline
- \textbf{grb::dot( grb::Vector u, grb::Vector v )} – Compute the dot product of two vectors, $\alpha$\textit{+=}$u^Tv$ or $\alpha$\textit{+=}$\sum_i (u_i \times v_i)$, in essence combining element-wise multiplication with a reduction. The output $\alpha$ is a scalar, usually a primitive type such as \texttt{double}. Unlike the out-of-place \texttt{grb::set}, the \texttt{grb::dot} updates the output scalar in-place.\newline
+ \textbf{grb::dot}%( grb::Vector u, grb::Vector v )}
+ – Compute the dot product of two vectors, $\alpha$\textit{+=}$u^Tv$ or $\alpha$\textit{+=}$\sum_i (u_i \times v_i)$, in essence combining element-wise multiplication with a reduction. The output $\alpha$ is a scalar, usually a primitive type such as \texttt{double}. Unlike the out-of-place \texttt{grb::set}, the \texttt{grb::dot} updates the output scalar in-place.\newline
- \textbf{grb::eWiseMul, grb::eWiseAdd} - These primitives combine two containers element-by-element, the former using element-wise multiplication, and the latter using element-wise addition. Different from \texttt{grb::set},
- these primitives are in-place, meaning the result of the element-wise operations are added to any elements already in the output container; i.e., \texttt{grb::eWiseMul} computes $z$\textit{+=}$x\odot y$, where $\odot$ denotes element-wise multiplication.
+ \textbf{grb::eWiseMul, grb::eWiseAdd} - These primitives combine two containers element-by-element, the former using element-wise multiplication, and the latter using element-wise addition. Different from \texttt{grb::set},
+ these primitives are in-place, meaning the result of the element-wise operations are added to any elements already in the output container; i.e., \texttt{grb::eWiseMul} computes $z$\textit{+=}$x\odot y$, where $\odot$ denotes element-wise multiplication.
In case of sparse vectors and an initially-empty output container, the primitives separate themselves in terms of the structure of the output vector, which is composed either of an intersection or union of the input structures.
- Since \textbf{grb} primitives are unvaware of conventional multiplication and addition operations, they are provedied
+ Since \textbf{grb} primitives are unaware of conventional multiplication and addition operations, they are provided
as correspond monoids from the semiring argument, the multiplicative and additive monoids.
\begin{itemize}
\item \textbf{intersection (eWiseMul):} The primitive will compute only an element-wise multiplication for those positions where \emph{both} input containers have entries. This is since any missing entries are assumed to have value zero, and are therefore ignored under multiplication.
@@ -286,51 +287,49 @@ \subsection{Numerical Linear Algebra}
Note: The \texttt{grb} primitives do not assume conventional addition or multiplication. Instead, these operations are defined by the semiring argument, which specifies the additive and multiplicative monoids to use for the computation.
\end{itemize}
- \textbf{grb::eWiseLambda} - This primitive applies a user-defined function to each element of an input container, storing the result in an output container.
- Similar to \texttt{grb::set}, this primitive is out-of-place; i.e., the output container is potentially overwritten with the results of applying the function to each element of the input container.
+ \textbf{grb::eWiseLambda} - This primitive applies a user-defined function to each element of an input container, storing the result in an output container.
+ Similar to \texttt{grb::set}, this primitive is out-of-place; i.e., the output container is potentially overwritten with the results of applying the function to each element of the input container.
The user-defined function must be a callable object (e.g., a lambda function or a functor) that comes in two versions, one for vector and one for matrix arguments.
For vector arguments, the function must take a single argument of the vector's value type and returns no value.
For matrix arguments, the function must take three arguments: the row index, the column index, and the value type of the matrix; it also returns no value.
The function is applied to each element of the input container and is expected to modify the output container.
- Note that in the nonblocking backend all containers modified by the lambda must be listed in the argument list, while only the first one generates the iterations over which the lambda is applied.
+ Note that in the nonblocking backend all containers modified by the lambda must be listed in the argument list, while only the first one generates the iterations over which the lambda is applied.\newline
- \textbf{grb::eWiseApply} - This primitive applies a user-defined binary function to corresponding elements of two input containers, storing the result in an output container.
+ \textbf{grb::eWiseApply} - This primitive applies a user-defined binary function to corresponding elements of two input containers, storing the result in an output container.
It aims to handle all operators that do not fit into the semiring abstraction, such as minimum, maximum, logical AND, logical OR, etc.\newline
- \textbf{grb::mxv}, \textbf{grb::vxm} - Performs right- and left-handed matrix--vector multiplication; i.e., $u$\textit{+=}$Av$ and $u$\textit{+=}$vA$, respectively. More precisely, e.g., \texttt{grb::mxv} computes the standard linear algebraic operation $u_i = u_i + \sum_j A_{ij} v_j$.
- Like almost all GraphBLAS primitives, the \texttt{grb::mxv} is an in-place operation.
+ \textbf{grb::mxv}, \textbf{grb::vxm} - Performs right- and left-handed matrix--vector multiplication; i.e., $u$\textit{+=}$Av$ and $u$\textit{+=}$vA$, respectively. More precisely, e.g., \texttt{grb::mxv} computes the standard linear algebraic operation $u_i = u_i + \sum_j A_{ij} v_j$.
+ Like almost all GraphBLAS primitives, the \texttt{grb::mxv} is an in-place operation.
If the intent is to compute $u=Av$ while $u$ is not empty, there are two solutions: 1) $u$ may cleared first (\texttt{grb::clear(u)}), or 2) $u$ may have all its values set to zero first (\texttt{grb::set(u, 0)}).\newline
\textbf{grb::mxm} - Performs matrix--matrix multiplication; i.e., $C$\textit{+=}$AB$, or $C_{ij}=C_{ij}\textit{+=}\sum_{k}A_{ik}B_{kj}$. If, for a given $i,j$ the $i$th row of $A$ is empty or the $j$th column of $B$ is empty, no output will be appended to $C$-- that is, if $C_{ij}\notin C$, then after executing the primitive no such entry will have been added to $C$, meaning that the sparsity of $C$ is retained and the only fill-in to $C$ is due to non-zero contributions of $AB$. If in-place behaviour is not desired, $C$ must be cleared prior to calling this primitive\footnote{note that while setting $C$ to a dense matrix of all zeroes semantically also results in an out-of-place matrix--matrix multiplication, typically GraphBLAS applications should shy away from forming dense matrices due to their quadratic storage requirements}.\newline
- \textbf{grb::set} - This primitive initializes values of a container (vector or matrix) to a specified scalar value or values from another container.
- It does not allocate nor free any dynamic memory, but simply sets values in the existing container. \\ newline \\ newline
+ \textbf{grb::set} - This primitive initializes values of a container (vector or matrix) to a specified scalar value or values from another container.
+ It does not allocate nor free any dynamic memory, but simply sets values in the existing container.\newline
The interface is more flexible that what is described above, since combinations of vectors and scalars and sometimes matrices
- are supported in the single argument lists of the primitives.
+ are supported in the single argument lists of the primitives.
For example \texttt{grb::foldr} supports both vector-scalar and scalar-vector combinations.
- With all the above operations, the same-type containers must have matching dimensions in the linear algebraic sense -- e.g.,
- for $u=Av$, $u$ must have size equal to the number of rows in $A$ while $v$ must have size equal to the number of columns.
- If the sizes do not match, the related primitives will return \texttt{grb::MISMATCH}.
- Similarly, if for an output container the result cannot be stored due to insufficient capacity,
- \texttt{grb::ILLEGAL} will be returned. As with \texttt{grb::set},
- furthermore, all above primitives may optionally take a mask as well as take a phase (resize or execute)
+ With all the above operations, the same-type containers must have matching dimensions in the linear algebraic sense -- e.g.,
+ for $u=Av$, $u$ must have size equal to the number of rows in $A$ while $v$ must have size equal to the number of columns.
+ If the sizes do not match, the related primitives will return \texttt{grb::MISMATCH}.
+ Similarly, if for an output container the result cannot be stored due to insufficient capacity,
+ \texttt{grb::ILLEGAL} will be returned. As with \texttt{grb::set},
+ furthermore, all above primitives may optionally take a mask as well as take a phase (resize or execute)
as its last argument.
- There is one final caveat. In order to support graph algorithms, GraphBLAS provides on \emph{generalised}
- linear algebraic primitives -- not just numerical linear algebraic ones.
- This is explained further in the next subsection of this tutorial.
- To perform standard numerical linear algebra, the standard semirings are predefined in ALP/GraphBLAS.
- For instance, \texttt{grb::semirings::plusTimes< T >}, where $T$ is the domain over which we wish to perform numerical
- linear algebra (usually \texttt{double} or \texttt{std::complex< double >}).
+ There is one final caveat. In order to support graph algorithms, GraphBLAS provides \emph{generalised}
+ linear algebraic primitives -- not just numerical linear algebraic ones.
+ This is explained further in the next subsection of this tutorial.
+ To perform standard numerical linear algebra, the standard semirings are predefined in ALP/GraphBLAS.
+ For instance, \texttt{grb::semirings::plusTimes< T >}, where $T$ is the domain over which we wish to perform numerical
+ linear algebra (usually \texttt{double} or \texttt{std::complex< double >}).
For example, to perform a sparse matrix--vector multiplication:
\begin{lstlisting}
auto plusTimes = grb::semirings::plusTimes< double >();
grb::RC rc = grb::clear( y );
rc = rc ? rc : grb::mxv(y, A, x, plusTimes);
\end{lstlisting}
- Note: mxv could be further generalised and can infer the semiring from the types of the input containers. This would however limit the control exposed to the user.
-
\noindent \textbf{Exercise 8.} Copy the older \texttt{alp\_hw.cpp} to start with this exercise. Modify it to perform the following steps:
\begin{enumerate}
@@ -343,7 +342,7 @@ \subsection{Numerical Linear Algebra}
\end{enumerate}
One example $A, x$ could be:
\[
-A =
+A =
\begin{bmatrix}
0 & 1 & 2 \\
0 & 3 & 4 \\
@@ -372,72 +371,123 @@ \subsection{Numerical Linear Algebra}
\subsection{Semirings and Algebraic Operations}
-A key feature of GraphBLAS (and ALP) is that operations are defined over semirings rather than just the conventional arithmetic operations. A semiring consists of a pair of operations (an “addition” and a “multiplication”) along with their identity elements, which generalize the standard arithmetic (+ and $\times$). GraphBLAS allows using different semirings to, for example, perform computations like shortest paths or logical operations by substituting the plus or times operations with min, max, logical OR/AND, etc. In GraphBLAS, matrix multiplication is defined in terms of a semiring: the “add” operation is used to accumulate results, and the “multiply” operation is used when combining elements.
-ALP lets you define and use custom \textbf{semirings} by specifying:
+A key feature of GraphBLAS and ALP is that operations are defined over generic algebraic structures rather than over the conventional arithmetic operations only. An example of an algebraic structure is a \emph{semiring}. Loosely speaking, a semiring formalises what we understand as standard linear algebra. Intuitively, a semiring consists of a pair of operations, an ``addition'' and a ``multiplication'', along with their identity elements. The additive and multiplicative operations may differ from standard arithmetic ($+$ and $\times$). The multiplicative operation together with its identity -- which we call $\mathbf{1}$ -- forms a monoid. The additive operation together with its identity -- $\mathbf{0}$ -- forms a commutative monoid, meaning that the order of addition shall not change the result ($a+b=b+a$). Formally speaking, there are more requirements to monoids and semirings that ALP is aware of and exploits -- however, for the purposes of this tutorial, the preceding intuitive description suffices.
+GraphBLAS allows using different semirings to, for example, perform computations like shortest paths or logical operations by substituting the plus or times operations with min, max, logical OR/AND, and so on. This is usually done structurally, by replacing the plus-times semiring with another, such as a min-plus semiring to compute shortest paths. This is why, as you saw in the previous subsection, GraphBLAS primitives explicitly take algebraic structures as a mandatory argument. ALP additionally allows expressing algebraic structures as C++ types, and introduces an algebraic type trait system:
\begin{itemize}
- \item \textbf{A binary monoid:} an associative, commutative ``addition'' operation with an identity element. Examples:
+ \item \textbf{commutative monoids:} an associative and commutative operation with an identity element. Examples:
\begin{itemize}
- \item \texttt{(+}, 0\texttt{)} — the usual addition over numbers
- \item \texttt{(min}, $+\infty$\texttt{)} — useful for computing minima
- \end{itemize}
-
- \item \textbf{A binary multiplicative operator:} a second operation (not necessarily arithmetic multiplication), with its own identity element. Examples:
- \begin{itemize}
- \item \texttt{(*}, 1\texttt{)} — standard multiplication
- \item \texttt{(AND}, \texttt{true}\texttt{)} — logical semiring for Boolean algebra
+ \item \texttt{(+, 0)} — the usual addition over numbers:
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::Monoid< grb::operators::add< T >, grb::identities::zero >
+\end{lstlisting}
+or, more simply,
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::monoids::plus< T >
+\end{lstlisting}
+ \item \texttt{(min, $\infty$)} — useful for computing minima:
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::Monoid< grb::operators::min< T >, grb::identities::infinity > // or simply
+grb::monoids::min< T >
+\end{lstlisting}
+ \item \texttt{($\land$, true)} — logical semiring for Boolean algebra:
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::Monoid< grb::operators::logical_and< T >, grb::identities::logical_true > // or
+grb::monoids::land< T >
+\end{lstlisting}
\end{itemize}
-\end{itemize}
-
-A semiring is a mathematical structure consisting of a set equipped with two binary operations satisfying certain axioms.
-Many common semirings are provided or can be constructed.
-For instance, the plus-times semiring uses standard addition as the accumulation (monoid)
-and multiplication as the combination operator – this yields ordinary linear algebra over real numbers.
-One can also define a \texttt{min-plus} semiring (useful for shortest path algorithms, where "addition"
-is min and "multiplication" is numeric addition). ALP’s design allows an “almost unlimited variety of operators
-and types” in semirings.
-
-In code, ALP provides templates to construct these. For example, one can define:
+For all the above examples, the following \emph{type traits} are \texttt{true}:
\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
-using Add = grb::operators::add;
-using AddMonoid = grb::Monoid;
-using Mul = grb::operators::mul;
-using PlusTimes = grb::Semiring;
-PlusTimes plusTimesSemiring;
+grb::is_monoid< grb::monoids::plus< T > >::value
+grb::is_commutative< grb::monoids::min< T > >::value
+// and so on
\end{lstlisting}
-Here we built the plus-times semiring for \texttt{double}: we use the provided addition operator and its identity (zero) to make a monoid, then combine it with the multiply operator to form a semiring. ALP comes with a library of predefined operator functors (in \texttt{grb::operators}) and identities (in \texttt{grb::identities}) for common types. You can also define custom functor structs if needed. In many cases, using the standard \texttt{plusTimesSemiring} (or simply passing operators/monoids directly to functions) is sufficient for basic algorithms.
-\subsection{Primitive Operations (mxv, eWiseMul, dot, etc.)}
-
-Using the above containers and semirings, ALP provides a set of primitive functions in the \texttt{grb} namespace to manipulate the data. These functions are free functions (not class methods) and typically take the output container as the first parameter (by reference), followed by input containers and an operator or semiring specification. The most important primitives include:
-
- \textbf{grb::set} – Assigns all elements of a container to a given value. For example, \texttt{grb::set(x, 1.0)} will set every entry of vector \texttt{x} to $1.0$ (making all indices present with value 1.0). This is useful for initialization (if called on an empty vector, it will insert all indices with that value). There is also \texttt{grb::setElement(container, value, index[, index2])} to set a single element: for a vector, you provide an index; for a matrix, a row and column. For example, \texttt{grb::setElement(y, 3.0, n/2)} sets $y_{n/2} = 3.0$.
-\newline
-
- \textbf{grb::mxv} – Perform matrix-vector multiplication on a semiring. The call \texttt{grb::mxv(u, A, v, semiring)} computes $u = A \otimes v$ (where $\otimes$ denotes matrix-vector multiply under the given semiring). For the plus-times semiring, this corresponds to the usual linear algebra operation $u_i = \sum_j A_{ij} \times v_j$ (summing with + and multiplying with $\times$). The output vector \texttt{u} must be pre-allocated to the correct size (number of rows of $A$). By default, ALP’s \texttt{mxv} adds into the output vector (as if doing $u += A \times v$). If you want to overwrite \texttt{u} instead of accumulate, you may need to explicitly set \texttt{u} to the identity element (e.g. zero) beforehand or use descriptors (advanced options) – but for most use cases, initializing $u$ to 0 and then calling mxv is sufficient to compute $u = A x$. For example, \texttt{grb::mxv(y, A, x, plusTimesSemiring)} will compute $y_i = \sum_j A_{ij} x_j$ using standard arithmetic (assuming \texttt{y} was zeroed initially).
-\newline
+% \item \textbf{general monoids:} an operation with an identity element. Example:
+% \begin{itemize}
+% \item \texttt{($\times$, 1)} — standard multiplication:
+%\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+%grb::Monoid< grb::operators::mul< T >, grb::identities::one > // or
+%grb::monoids::times< T >
+%\end{lstlisting}
+ \item \textbf{general binary operators}: binary operators without any additional structure. Examples:
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::operators::mul< double > // f(x, y) = x * y
+grb::operators::subtract< int > // f(i, j) = i - j
+grb::operators::zip< char, int > // f(c, i) = {c, i}, an std::pair< char, int >
+\end{lstlisting}
+These too define algebraic type traits:
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::is_operator< OP >::value // true for all above operators
+grb::is_monoid< OP >::value // false for all above operators
+grb::is_associative< grb::operators::mul< T > >::value // true
+grb::is_commutative< grb::operators::subtract< T > >::value // false
+// ...
+\end{lstlisting}
+ \item \textbf{semirings}: simplified, a commutative ``additive'' monoid combined with a ``multiplicative'' monoid. Examples:
+ \begin{itemize}
+ \item the plus-times semiring: standard linear algebra
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::Semiring<
+ grb::operators::add< T >, grb::operators::mul< T >,
+ grb::identities::zero, grb::identities::one
+> // or
+grb::semirings::plusTimes< T >
+\end{lstlisting}
+ \item the Boolean semiring: perhaps the second most common semiring
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::Semiring<
+ grb::operators::logical_or< bool >, grb::operators::logical_and< bool >,
+ grb::identities::logical_false, grb::identities::logical_true
+> // or
+grb::semirings::boolean
+\end{lstlisting}
+ \item the min-plus semiring: useful for, e.g., shortest paths
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::Semiring<
+ grb::operators::min< T >, grb::operators::add< T >,
+ grb::identities::infinity, grb::identities::zero
+> // or
+grb::semirings::minPlus< T >
+\end{lstlisting}
+ \end{itemize}
+ For all of the above semirings, we have the following:
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::is_semiring< S >::value // true
+grb::is_monoid< S >::value // false
+grb::is_operator< S >::value // false
+\end{lstlisting}
+We may furthermore extract the additive and multiplicative monoids and operators from semirings:
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false ]
+grb::semirings::boolean mySemiring;
+auto myAddM = mySemiring.getAdditiveMonoid();
+auto myMulM = mySemiring.getMultiplicativeMonoid();
+auto myAddOp = mySemiring.getAdditiveOperator();
+auto myMulOp = myMulM.getOperator();
+\end{lstlisting}
+Finally, we may also extract and instantiate identities from semirings:
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false, morekeywords=constexpr ]
+grb::semirings::plusTimes< int > mySemiring;
+constexpr int myZero = mySemiring.template getZero< int >();
+double castFromZero = mySemiring.template getZero< double >();
+constexpr int myOne = mySemiring.getMultiplicativeMonoid().template getIdentity< int >();
+\end{lstlisting}
+\end{itemize}
- \textbf{grb::dot} – Compute the dot product of two vectors. This is essentially a special case of a matrix-vector multiply or a reduce operation. ALP provides \texttt{grb::dot(result, u, v, semiring)} to compute a scalar result = $u^T \otimes v$ under a given semiring. For the standard plus-times semiring, \texttt{grb::dot(alpha, u, v, plusTimesSemiring)} will calculate $\alpha = \sum_i (u_i \times v_i)$ (i.e. the dot product of $u$ and $v$). If you use a different monoid or operator, you can compute other pairwise reductions (for example, using a \texttt{min} monoid with logical multiplication could compute something like an “AND over all i” if that were needed). In most cases, you'll use dot with the default arithmetic semiring for inner products. The output \texttt{alpha} is a scalar (primitive type) passed by reference, which will be set to the resulting value.
-\newline
+The design of ALP allows the definition of custom operators and identities, and therefore allows a virtually unlimited variety of algebraic structures. The most useful operators, monoids, and semirings are predefined in the namespaces \texttt{grb::operators}, \texttt{grb::monoids}, and \texttt{grb::semirings}, respectively. Let us now employ these algebraic structures to showcase how they amplify the expressiveness of standard linear algebraic primitives.\vspace{.5\baselineskip}
- \textbf{grb::apply} – Apply a binary operator out-of-place between two scalars and store result in a third scalar.
- The function \texttt{grb::apply}\,(z, x, s, op) applies the binary functor \texttt{op} to scalar \texttt{x}
- and scalar \texttt{s} and writes the result into scalar \texttt{z}.
- For example, \texttt{grb::apply}\,(z, x, s, grb::operators::mul()) computes $z = x \times s$.
- Note: some backends like nonblocking or hyperdag may rely on \texttt{grb::apply} to infer data dependencies,
- therefore pure C++ should be avoided and implementations should ensure all dependencies are explicit.
-\newline
- \textbf{grb::eWiseApply} – Apply a binary operator element-wise between two containers (out-of-place vector/matrix update).
- The function \texttt{grb::eWiseApply}\,(z, x, y, op) applies the binary functor \texttt{op} to corresponding elements of \texttt{x} and \texttt{y} and writes the results into \texttt{z}; both inputs and the output are containers of matching dimensions.
- For example, \texttt{grb::eWiseApply}\,(z, x, y, grb::operators::mul()) computes $z_i = x_i \times y_i$ for indices where the operation is defined.
- In summary, both \texttt{grb::apply}\,() and \texttt{grb::eWiseApply}\,() require a binary operator and perform out-of-place elementwise updates (one with a scalar, the other with a second container).
+\noindent \textbf{Exercise 9} (warm-up): given a \texttt{grb::Vector< double > x}, write an ALP/GraphBLAS function that computes the squared 2-norm of $x$ (i.e., compute $\sum_i x_i^2$) using \texttt{grb::dot}. \textbf{Hint:} consider starting from the code of exercise 8. \textbf{Question}: which semiring applies here?\vspace{.5\baselineskip}
-\paragraph{API usage notes:} All the above operations require that output parameters be passed by reference, since they are modified in place (e.g., \texttt{y} in \texttt{grb::mxv(y, A, x, ...)} is updated with the result). Input objects are typically passed by const-reference. You should ensure that the output container is allocated with the correct size beforehand – ALP will not automatically resize vectors or matrices on operation calls if dimensions mismatch. If dimensions do not agree (e.g., you try to multiply an $m\times n$ matrix with a vector of length not $n$),
-the function will return an error code to indicate the misuse. In fact, most ALP primitives return a status code of type \texttt{grb::RC} (with \texttt{grb::SUCCESS} indicating success). For clarity, our code examples will omit explicit error handling, but in a real program you may check the returned code of each operation.
+\noindent \textbf{Exercise 10}: ALP and GraphBLAS allow for the use of \emph{improper} semirings -- semirings that are mathematically not true semirings but are still useful in practice. In ALP, improper semirings are made up of 1) a commutative ``additive'' monoid and 2) \emph{any} binary ``multiplicative'' operator. All primitives that take a semiring may also take a pair of such an additive monoid and multiplicative operator -- for example, \texttt{grb::dot( alpha, x, y, plusTimes )} is semantically equivalent to
+\begin{lstlisting} [language=C++, basicstyle=\ttfamily\small, showstringspaces=false, morekeywords=constexpr ]
+grb::dot( alpha, x, y, plusTimes.getAdditiveMonoid(), plusTimes.getMultiplicativeOperator() );
+\end{lstlisting}
+Take note of \texttt{grb::operators::abs\_diff< double >}. What does this binary operator compute? Use this operator and the notion of improper semirings to compute the $1$-norm difference between two vectors $x$ and $y$ using a single call to \texttt{grb::dot}. \textbf{Hint:} start off from the code from the previous exercise.\vspace{.5\baselineskip}
-In the next section, we will put these concepts together in a concrete example.
+\noindent \textbf{Exercise 11}: consider a square matrix \texttt{grb::Matrix< double > G}, the matrix representation of an edge-weighted graph $G$. Consider \texttt{grb::Vector< double > s} (source), a vector of matching dimension to $G$ with a single value zero ($0$) at an index of your choosing. \textbf{Question}: normally, under a plusTimes semiring, $Gs$ would return a zero vector. However, what interpretation would $Gs$ have under the minPlus semiring? Use this interpretation to compute the shortest path to any other vertex reachable from your chosen source. Then, extend the approach to compute the shortest path two hops away from your chosen source.\vspace{.25\baselineskip}
+\textbf{Bonus question}: what interpretation could $G^ks$ have under the maxTimes semiring?
\section{Solution to Exercise 8}\label{sec:simple_example}
@@ -479,7 +529,7 @@ \section{Solution to Exercise 8}\label{sec:simple_example}
int main( int argc, char **argv ) {
(void)argc;
(void)argv;
- std::printf("example (ALP/GraphBLAS) corrected API usage\n\n");
+ std::printf("example (ALP/GraphBLAS) API usage\n\n");
//------------------------------
// 1) Create a 3x3 sparse matrix A
@@ -548,7 +598,7 @@ \section{Solution to Exercise 8}\label{sec:simple_example}
return (int)rc;
}
}
-
+
//------------------------------
// 7) Compute dot_val = xᵀ·x (dot‐product under plus‐times semiring)
//------------------------------
@@ -613,13 +663,11 @@ \section{Solution to Exercise 8}\label{sec:simple_example}
$
\end{lstlisting}
-
-
-\section{Makefile and CMake Instructions}\label{sec:build_instructions}
+\subsection{Makefile and CMake Instructions}\label{sec:build_instructions}
Finally, we provide guidance on compiling and running the above example in your own development environment. If you followed the installation steps and used \texttt{grbcxx}, compilation is straightforward. Here we outline two approaches: using the ALP wrapper scripts, and integrating ALP manually via a build system.
-\subsection*{Using the ALP compiler wrapper}
+\subsubsection*{Using the ALP compiler wrapper}
The simplest way to compile your ALP-based program is to use the provided wrapper. After sourcing the ALP environment (setenv script), the commands \texttt{grbcxx} and \texttt{grbrun} are available in your PATH. You can compile the example above by saving it (e.g. as \texttt{example.cpp}) and running:
\begin{lstlisting}[language=bash]
@@ -638,7 +686,7 @@ \subsection*{Using the ALP compiler wrapper}
(You can also run \texttt{./example} directly for a non-distributed run; \texttt{grbrun} is mainly needed for orchestrating distributed runs or setting up the execution environment.)
-\subsection*{Using a custom build (Make/CMake)}
+\subsubsection*{Using a custom build (Make/CMake)}
If you prefer to compile without the wrapper (for integration into an existing project or custom build system), you need to instruct your compiler to include ALP's headers and link against the ALP library and dependencies. The ALP installation (at the chosen \texttt{--prefix}) provides an include directory and a library directory.
@@ -673,3 +721,4 @@ \subsection*{Using a custom build (Make/CMake)}
\bigskip
This tutorial has introduced the fundamentals of using ALP/GraphBLAS in C++ on Linux, from installation to running a basic example. With ALP set up, you can explore more complex graph algorithms and linear algebra operations, confident that the library will handle parallelization and optimization under the hood. Happy coding!
+
diff --git a/Images/ARM920_hw_models.png b/Images/ARM920_hw_models.png
new file mode 100644
index 0000000..03ee4cd
Binary files /dev/null and b/Images/ARM920_hw_models.png differ
diff --git a/Images/ARM920_system.png b/Images/ARM920_system.png
new file mode 100644
index 0000000..971e7c9
Binary files /dev/null and b/Images/ARM920_system.png differ
diff --git a/Images/Multi_BSP_submodel_roofline_arm920.png b/Images/Multi_BSP_submodel_roofline_arm920.png
new file mode 100644
index 0000000..76de81b
Binary files /dev/null and b/Images/Multi_BSP_submodel_roofline_arm920.png differ
diff --git a/Images/Multi_BSP_submodel_roofline_cascade.png b/Images/Multi_BSP_submodel_roofline_cascade.png
new file mode 100644
index 0000000..42e7cb5
Binary files /dev/null and b/Images/Multi_BSP_submodel_roofline_cascade.png differ
diff --git a/Images/Solver_iteration_close_real_performance_ARM920.png b/Images/Solver_iteration_close_real_performance_ARM920.png
new file mode 100644
index 0000000..aa4c50c
Binary files /dev/null and b/Images/Solver_iteration_close_real_performance_ARM920.png differ
diff --git a/Images/Solver_iteration_close_synthetic_performance_ARM920.png b/Images/Solver_iteration_close_synthetic_performance_ARM920.png
new file mode 100644
index 0000000..a81dbc8
Binary files /dev/null and b/Images/Solver_iteration_close_synthetic_performance_ARM920.png differ
diff --git a/Images/Solver_iteration_performance_1t_real.png b/Images/Solver_iteration_performance_1t_real.png
new file mode 100644
index 0000000..e96d255
Binary files /dev/null and b/Images/Solver_iteration_performance_1t_real.png differ
diff --git a/Images/Solver_iteration_performance_1t_synthetic.png b/Images/Solver_iteration_performance_1t_synthetic.png
new file mode 100644
index 0000000..6b670e4
Binary files /dev/null and b/Images/Solver_iteration_performance_1t_synthetic.png differ
diff --git a/Images/Solver_iteration_performance_96t_real.png b/Images/Solver_iteration_performance_96t_real.png
new file mode 100644
index 0000000..571a5d5
Binary files /dev/null and b/Images/Solver_iteration_performance_96t_real.png differ
diff --git a/Images/Solver_iteration_performance_96t_synthetic.png b/Images/Solver_iteration_performance_96t_synthetic.png
new file mode 100644
index 0000000..08cfa2e
Binary files /dev/null and b/Images/Solver_iteration_performance_96t_synthetic.png differ
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9e32681
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,125 @@
+# Build main.pdf from main.tex (use existing main.tex which loads utf8 & listings)
+# Dependencies:
+# - TeX Live (pdflatex) and common LaTeX packages: amsmath, hyperref, listings, xcolor
+# - Beamer class for slides
+# - latexmk (recommended)
+#
+# Debian/Ubuntu (recommended minimal set):
+# sudo apt-get update && sudo apt-get install -y \
+# latexmk texlive-latex-base texlive-latex-recommended latex-beamer \
+# texlive-latex-extra texlive-fonts-recommended
+#
+# Fedora/RHEL (approximate equivalents):
+# sudo dnf install -y latexmk texlive-scheme-medium texlive-beamer texlive-collection-latexrecommended
+
+MAIN := main
+TEXSRC := $(MAIN).tex
+PDF := $(MAIN).pdf
+
+LATEXMK := $(shell command -v latexmk 2>/dev/null)
+TEXENGINE ?= pdflatex
+
+ifeq ($(LATEXMK),)
+ PDF_CMD := $(TEXENGINE) -interaction=nonstopmode -halt-on-error $(TEXSRC) && \
+ $(TEXENGINE) -interaction=nonstopmode -halt-on-error $(TEXSRC)
+ WATCH_CMD := @echo "latexmk not found; 'make watch' unavailable."
+else
+ PDF_CMD := latexmk -pdf -interaction=nonstopmode -halt-on-error -file-line-error $(TEXSRC)
+ WATCH_CMD := latexmk -pdf -pvc -interaction=nonstopmode -halt-on-error -file-line-error $(TEXSRC)
+endif
+
+SHELL := /bin/bash
+
+# Clean aux for all TeX sources in this directory
+TEX_SOURCES := $(wildcard *.tex)
+TEX_BASENAMES := $(basename $(TEX_SOURCES))
+AUX_EXTS := aux log out toc lof lot fls fdb_latexmk synctex.gz \
+ bbl blg bcf run.xml xdv dvi ps nav snm vrb brf idx ilg ind ist \
+ acn acr alg glg glo gls glsdefs nlo nls lox thm auxlock \
+ 4ct 4tc lg tmp maf mtc mtc0 maf0
+
+.PHONY: all pdf watch clean distclean veryclean open beamer beamer-clean latexmk-clean help deps check-deps
+
+all: pdf
+
+pdf: $(PDF)
+
+$(PDF): $(TEXSRC)
+ @echo "==> Building $(PDF)"
+ @$(PDF_CMD)
+
+# --- info / helpers ---------------------------------------------------------
+help:
+ @echo "Targets: make | make pdf | make beamer | make clean | make distclean"
+ @echo "Helpers: make deps (install hints), make check-deps (quick sanity check)"
+
+deps:
+ @echo "Debian/Ubuntu:"
+ @echo " sudo apt-get update && sudo apt-get install -y \\"
+ @echo " latexmk texlive-latex-base texlive-latex-recommended latex-beamer \\"
+ @echo " texlive-latex-extra texlive-fonts-recommended"
+ @echo
+ @echo "Fedora/RHEL:"
+ @echo " sudo dnf install -y latexmk texlive-scheme-medium texlive-beamer texlive-collection-latexrecommended"
+
+check-deps:
+ @ok=1; \
+ if ! command -v pdflatex >/dev/null 2>&1; then echo "Missing: pdflatex (install TeX Live)"; ok=0; fi; \
+ if ! command -v latexmk >/dev/null 2>&1; then echo "Note: latexmk not found (optional but recommended)"; fi; \
+ if ! kpsewhich beamer.cls >/dev/null 2>&1; then echo "Missing: Beamer class (install latex-beamer or texlive-latex-recommended)"; ok=0; fi; \
+ if [ $$ok -eq 1 ]; then echo "Dependency check: OK"; else echo "See 'make deps' for install hints."; fi
+
+# --- beamer target ----------------------------------------------------------
+BEAMER_MAIN := beamer-tutorial
+BEAMER_SRC := $(BEAMER_MAIN).tex
+BEAMER_PDF := $(BEAMER_MAIN).pdf
+
+beamer: $(BEAMER_PDF)
+
+$(BEAMER_PDF): $(BEAMER_SRC)
+ @echo "==> Building $(BEAMER_PDF)"
+ @if command -v latexmk >/dev/null 2>&1; then \
+ latexmk -pdf -interaction=nonstopmode -halt-on-error -file-line-error $(BEAMER_SRC); \
+ else \
+ $(TEXENGINE) -interaction=nonstopmode -halt-on-error $(BEAMER_SRC) && \
+ $(TEXENGINE) -interaction=nonstopmode -halt-on-error $(BEAMER_SRC); \
+ fi
+
+watch:
+ $(WATCH_CMD)
+
+open: $(PDF)
+ xdg-open $(PDF) >/dev/null 2>&1 || true
+
+clean:
+ @echo "==> Cleaning TeX auxiliary files for: $(TEX_BASENAMES)"
+ @for base in $(TEX_BASENAMES); do \
+ for ext in $(AUX_EXTS); do rm -f "$${base}.$${ext}"; done; \
+ done
+ @-rm -rf _minted-* .latex-cache latex.out
+
+latexmk-clean:
+ @echo "==> latexmk -C (all .tex, if available)"
+ @if command -v latexmk >/dev/null 2>&1; then \
+ for tex in $(TEX_SOURCES); do latexmk -C "$$tex"; done; \
+ else \
+ echo "latexmk not found; skipping latexmk -C"; \
+ fi
+
+distclean: clean
+ @echo "==> Removing generated PDFs"
+ @-rm -f $(PDF) $(BEAMER_PDF)
+
+veryclean: distclean
+ @echo "==> Removing editor backup/temporary files"
+ @-rm -f *~ \#*# .*~ .\#* *.bak *.bck *.tmp *.swp *.swo *.swx
+
+beamer-clean:
+ @echo "==> Cleaning beamer aux files"
+ @-for base in $(TEX_BASENAMES); do \
+ if [[ "$$base" == beamer-* ]]; then \
+ for ext in aux log nav out snm toc vrb fls fdb_latexmk synctex.gz; do \
+ rm -f "$${base}.$${ext}"; \
+ done; \
+ fi; \
+ done
diff --git a/README.md b/README.md
index fab3195..33c209b 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
# ALP-Tutorial
-The [pdf](https://algebraic-programming.github.io/ALP-Tutorial/main.pdf) vresion is available online.
+The [pdf](https://algebraic-programming.github.io/ALP-Tutorial/main.pdf) version is available online.
## License
diff --git a/beamer-tutorial.tex b/beamer-tutorial.tex
new file mode 100644
index 0000000..01643b5
--- /dev/null
+++ b/beamer-tutorial.tex
@@ -0,0 +1,2168 @@
+\documentclass[aspectratio=169]{beamer}
+\usetheme{Madrid}
+\usecolortheme{seahorse}
+\setbeamertemplate{navigation symbols}{}
+\setbeamertemplate{footline}[frame number]
+
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage{lmodern}
+\usepackage{graphicx}
+\usepackage{amsmath,amssymb,mathtools}
+\usepackage{hyperref}
+\usepackage{xcolor}
+\usepackage{listings}
+% Simple diagram support
+\usepackage{tikz}
+\usetikzlibrary{arrows.meta,positioning}
+
+\newcommand{\PA}[1]{{\textcolor{cyan}{[{\bfseries PA:} #1]}}}
+\newcommand{\DeJ}[1]{{\textcolor{magenta}{[{\bfseries DJ:} #1]}}} % DJ was taken!
+\newcommand{\AJ}[1]{{\textcolor{green}{[{\bfseries AJ:} #1]}}}
+\newcommand{\GG}[1]{{\textcolor{blue}{[{\bfseries GG:} #1]}}}
+
+% Map common Unicode symbols to LaTeX math (outside listings)
+\usepackage{newunicodechar}
+\newunicodechar{·}{\ensuremath{\cdot}}
+\newunicodechar{⊙}{\ensuremath{\odot}}
+\newunicodechar{ᵀ}{\ensuremath{^{\top}}}
+\newunicodechar{×}{\ensuremath{\times}}
+\newunicodechar{–}{-}
+\newunicodechar{—}{-}
+
+% Listing styles (Unicode-safe for later use)
+\definecolor{terminalback}{rgb}{0.9,0.9,0.9}
+\definecolor{terminaltext}{rgb}{0.1,0.1,0.1}
+\lstdefinestyle{terminal}{
+ backgroundcolor=\color{gray!20},
+ basicstyle=\ttfamily\small\color{terminaltext},
+ frame=single, rulecolor=\color{white}, breaklines=true,
+ captionpos=b, showstringspaces=false,
+ upquote=true, columns=fullflexible,
+ commentstyle=\color{terminaltext},
+ keywordstyle=\color{terminaltext},
+ stringstyle=\color{terminaltext},
+ identifierstyle=\color{terminaltext},
+ literate={·}{{$\cdot$}}1 {⊙}{{$\odot$}}1 {ᵀ}{{$^\top$}}1 {×}{{$\times$}}1 {‐}{{-}}1 {–}{{-}}1 {—}{{-}}1
+}
+\lstdefinestyle{code}{
+ backgroundcolor=\color[rgb]{0.95,0.95,0.95},
+ basicstyle=\ttfamily\small, frame=single, breaklines=true, showstringspaces=false,
+ rulecolor=\color{black}, numbers=none, keepspaces=true, captionpos=b, tabsize=2,
+ language=C++,
+ literate={·}{{$\cdot$}}1 {⊙}{{$\odot$}}1 {ᵀ}{{$^\top$}}1 {×}{{$\times$}}1 {‐}{{-}}1 {"}{{"`}}1 {"}{{'"}}1
+}
+
+
+\lstdefinestyle{cpp-rich}{
+ language=C++,
+ basicstyle=\ttfamily\scriptsize,
+ keywordstyle=\color{blue}\bfseries,
+ identifierstyle=\color{black},
+ commentstyle=\color{gray}\itshape,
+ stringstyle=\color{green!50!black},
+ morekeywords={nullptr,override,final,constexpr},
+ showstringspaces=false,
+ upquote=true,
+ frame=single,
+ rulecolor=\color{black},
+ breaklines=true,
+ columns=fullflexible,
+ keepspaces=true,
+ tabsize=2,
+ backgroundcolor=\color[rgb]{0.95,0.95,0.95},
+ literate={·}{{$\cdot$}}1 {⊙}{{$\odot$}}1 {ᵀ}{{$^\top$}}1 {×}{{$\times$}}1 {‐}{{-}}1 {–}{{-}}1 {—}{{-}}1
+}
+% make it the default
+\lstset{style=cpp-rich}
+% alias macro (use in optional args: \begin{lstlisting}[\cpplistingstyle])
+\newcommand{\cpplistingstyle}{style=cpp-rich}
+
+\title{Algebraic Programming (ALP) Tutorial}
+%\subtitle{From HPC to GraphBLAS and Transition Paths}
+\author{ALP HIPO Team}
+\date{\today}
+
+% Show roadmap before each section/subsection with upcoming content highlighted
+\AtBeginSection{
+ \begin{frame}{Roadmap}
+ \tableofcontents[currentsection,sectionstyle=show/shaded,subsectionstyle=show/show/hide]
+ \end{frame}
+}
+\AtBeginSubsection{
+ \begin{frame}{Roadmap}
+ \tableofcontents[currentsection,currentsubsection,sectionstyle=show/shaded,subsectionstyle=show/shaded/hide]
+ \end{frame}
+}
+
+\begin{document}
+\frame{\titlepage}
+
+% =========================
+% Monday 10 Nov, Morning
+% =========================
+\section{Monday 10 Nov, Morning}
+
+% One initial agenda slide for Monday; will be auto-repeated via Roadmap frames
+\begin{frame}{Today's Plan}
+Morning:
+\begin{enumerate}
+ \item Introduction to ALP, installation, and short demos
+ \item Hands-on (I): installation, containers, I/O, copying, masking, and standard matrices
+ \item Introduction to core primitives
+ \item Hands-on (II): numerical linear algebra
+ \item Interoperability with existing code: transition paths \& Python
+\end{enumerate}\vspace{\baselineskip}
+
+Afternoon:
+\begin{enumerate}\setcounter{enumi}{6}
+ \item Ising Machine, and
+ \item other solvers
+\end{enumerate}
+\end{frame}
+
+\subsection{1) Introduction to ALP, installation, and short demos}
+% Owner: DJ
+
+% Intro (concise outline)
+\begin{frame}{Introduction to GraphBLAS and ALP}
+\begin{itemize}
+ \item What you'll learn: algebraic programming for graphs and sparse data
+ \item Who it's for: HPC developers and solver authors
+ \item How we'll use ALP: C++ headers, precompiled libraries, Python API
+\end{itemize}
+% Owner: DJ
+% TODO: add a one-slide agenda summary graphic
+\end{frame}
+
+% Slide 1
+\begin{frame}{Goals, Audience, and Scope}
+\begin{itemize}
+ \item Goals
+ \begin{itemize}
+ \item Express algorithms as algebra over vectors/matrices
+ \item Introduce GraphBLAS: containers, semirings, primitives, masks
+ \item Demonstrate ALP backends for performance portability
+ \end{itemize}
+ \item Audience
+ \begin{itemize}
+ \item Developers of HPC applications, graph analytics, and solvers
+ \item No prior GraphBLAS/ALP knowledge assumed
+ \end{itemize}
+ \item Scope: multiple usage modes
+ \begin{itemize}
+ \item C++ header library for new algorithms/solvers (template API)
+ \item Precompiled libraries: use provided algorithms as black boxes
+ \item Python API for precompiled algorithms
+ \end{itemize}
+\end{itemize}
+% Owner: DJ
+% Source: ALP_Tutorial.tex (intro), alp_graphblas_tutorial.txt (overview)
+\end{frame}
+
+% Slide 2
+\begin{frame}{ALP in One Slide: Programming Model and Backends}
+\begin{itemize}
+ \item Programming model (humble by design)
+ \begin{itemize}
+ \item Heavily templated C++ header library (no runtime codegen)
+ \item Hardware-unaware, hardware-independent user code
+ \item Algebraic API: write what to compute, not how
+ \end{itemize}
+ \item Backends handle hardware specifics
+ \begin{itemize}
+ \item reference (single-threaded), \texttt{reference\_omp} (OpenMP)
+ \item nonblocking (multi-threaded with auto-fusion)
+ \item bsp1d (distributed/LPF), dense-dispatch (BLAS-backed, WIP), tensor (experimental, WIP)
+ \item Same source runs across backends with consistent semantics
+ \end{itemize}
+ \item Flexible infrastructure
+ \begin{itemize}
+ \item Rich type system and traits enable different and mixed data-type operations
+ \item Compile-time algebraic properties (associative, commutative, idempotent, annihilators)
+ are known to the backends and propagate from top to bottom of the algorithm
+ \end{itemize}
+\end{itemize}
+
+% Owner: DJ
+\end{frame}
+
+% Slide 2b (graphic)
+\begin{frame}{ALP in One Slide: Programming Model and Backends (Graphic)}
+\centering
+\scriptsize
+\begin{tikzpicture}[
+ node distance=8mm and 6mm,
+ mainbox/.style={rounded corners, draw, very thick, align=center, inner sep=2pt, minimum height=0.75cm, text width=.56\linewidth},
+ sidebox/.style={rounded corners, draw, very thick, align=left, inner sep=2pt, text width=.18\linewidth},
+ arrow/.style={-{Latex[length=1.6mm,width=1.6mm]}, very thick, shorten >=6pt, shorten <=6pt}
+]
+ % Main vertical flow (compact stack)
+ \node[mainbox, fill=blue!10] (user) {User Algorithm (C++ via ALP/GraphBLAS API, or Python bindings)};
+ \node[mainbox, fill=cyan!10, below=8mm of user] (api) {GraphBLAS Algebraic API: Vectors/Matrix containers, Operators/Monoids/Semirings, Masks/Descriptors};
+ % Compile-time traits note placed between API and Backends
+ \node[mainbox, fill=yellow!15, below=8mm of api] (traits) {Compile-time traits \& algebraic properties propagate down (types, identities, associativity, commutativity, idempotence)};
+ \node[mainbox, fill=green!12, below=8mm of traits] (backends) {Backend selection (same source, consistent semantics):\ \texttt{reference}, \texttt{reference\_omp}, nonblocking, bsp1d, dense-dispatch (WIP), tensor (exp.)};
+ \node[mainbox, fill=orange!15, below=8mm of backends] (exec) {Optimized execution: fusion, mask-aware scheduling, blocking/tiling, kernel dispatch (mxv/vxm, mxm, eWise*, reductions), runtime/platform (CPU/OpenMP, LPF, BLAS path)};
+
+ \draw[arrow] (user) -- (api);
+ \draw[arrow] (backends) -- (exec);
+
+ % Side inputs/metadata
+ \node[sidebox, dashed, draw=gray, fill=white, right=2mm of api] (desc) {Descriptors \& Masks\\ replace, accumulate, transpose};
+ \draw[arrow, dashed] (desc.west) -- (api.east);
+
+ \node[sidebox, dashed, draw=gray, fill=white, left=2mm of api] (io) {Parsers \& Builders\\ MatrixMarket, buildMatrixUnique};
+ \draw[arrow, dashed] (io.east) -- (api.west);
+
+ \draw[arrow] (api.south) -- (traits.north);
+ \draw[arrow] (traits.south) -- (backends.north);
+\end{tikzpicture}
+\end{frame}
+
+% Slide 3
+\begin{frame}{What is GraphBLAS: Concepts and Motivation}
+\begin{itemize}
+ \item Motivation
+ \begin{itemize}
+ \item Graphs map to sparse matrices; algorithms to linear algebra
+ \item Sparse/graph workloads are bandwidth-bound and irregular
+ \end{itemize}
+ \item Core concepts
+ \begin{itemize}
+ \item Semirings $\langle D,\oplus,\otimes\rangle$ (plus–times, min–plus, Boolean)
+ \item Primitives: mxv/vxm, mxm, eWiseAdd/Mul/Apply, dot, masks, descriptors
+ \item Containers: \texttt{grb::Vector}, \texttt{grb::Matrix}
+ \end{itemize}
+ \item Why beyond traditional dense LA
+ \begin{itemize}
+    \item Algebraic flexibility enables BFS, SSSP, etc., via non-standard ``addition'' and ``multiplication''
+ \item Masking and sparsity-aware execution preserve efficiency
+ \end{itemize}
+\end{itemize}
+% Owner: DJ
+% Source: alp_graphblas_tutorial.txt (GraphBLAS), ALP_Tutorial.tex (ALP/GraphBLAS)
+\end{frame}
+
+% Slide 4
+\begin{frame}{Automation and Optimization Space}
+\begin{itemize}
+ \item What automation to expect (from ALP backends)
+ \begin{itemize}
+ \item Auto-parallel execution (threads/processes) across backends
+ \item Cache-aware blocking/tiling; minimized data movement
+ \item Mask-aware computation and potential kernel fusion (backend-dependent)
+ \end{itemize}
+ \item Algorithm-level optimization space (ALP vs. BLAS/LAPACK)
+ \begin{itemize}
+ \item ALP: end-to-end algebraic programs expose global opportunities
+ (fusion across primitives, mask-driven sparsity cuts, reordering, schedule selection)
+ \item LAPACK: dense factorizations focus on tiling and blocking inside fixed algorithms
+ \item BLAS-only: kernel-local tuning; limited visibility for cross-kernel optimizations
+ \end{itemize}
+ \item Portability outcome
+ \begin{itemize}
+ \item Express once at the algebraic level; get optimized code on shared, distributed, or hybrid memory systems
+ \item Mixed-type and custom-operator workflows supported via templates
+ \end{itemize}
+\end{itemize}
+% Owner: DJ
+% TODO: add a tiny mxv code snippet + "same code, different backend" example timings
+% Source: ALP_Tutorial.tex (semantics, portability), alp_graphblas_tutorial.txt (landscape)
+\end{frame}
+
+% (Duplicate intro frames removed below to avoid repetition)
+
+
+\begin{frame}[fragile]{1. ALP Installation}
+\framesubtitle{Prerequisites (skip if already installed)}
+\begin{itemize}
+ \item Requirements:
+ \begin{itemize}
+ \item C++11 compiler, OpenMP, CMake (>=3.13), libNUMA, pthreads
+ \end{itemize}
+ \item Debian/Ubuntu:
+\begin{lstlisting}[style=terminal, language=bash]
+sudo apt-get install build-essential libnuma-dev libpthread-stubs0-dev cmake
+\end{lstlisting}
+ \item Red Hat/CentOS:
+\begin{lstlisting}[style=terminal, language=bash]
+dnf group install "Development Tools"
+dnf install numactl-devel cmake
+\end{lstlisting}
+ \item Clone \textbf{ALP-Tutorial} from the official GitHub repository:
+\begin{lstlisting}[style=terminal, language=bash]
+git clone https://github.com/Algebraic-Programming/ALP-Tutorial.git
+\end{lstlisting}
+\end{itemize}
+% Owner: PA
+% Source (ALP_Tutorial.tex - Installation on Linux, step 1)
+\end{frame}
+
+\begin{frame}[fragile]{1. ALP Installation}
+\framesubtitle{Obtain and Build ALP}
+\begin{enumerate}
+ \item Clone \textbf{ALP} from the official GitHub repository:
+\begin{lstlisting}[style=terminal, language=bash]
+git clone https://github.com/Algebraic-Programming/ALP.git
+\end{lstlisting}
+ \item Build and install \textbf{ALP} with default configuration settings:
+\begin{lstlisting}[style=terminal, language=bash]
+cd ALP && mkdir build && cd build
+../bootstrap.sh --prefix=../install
+make -j
+make install
+\end{lstlisting}
+ \item \textbf{Activate ALP environment}:
+\begin{lstlisting}[style=terminal, language=bash]
+source ../install/bin/setenv
+\end{lstlisting}
+\end{enumerate}
+% Owner: PA
+% Source (ALP_Tutorial.tex - Installation on Linux, steps 2-3)
+\end{frame}
+
+\begin{frame}[fragile]{1. ALP Installation}
+\framesubtitle{Setup Environment and Test}
+\begin{enumerate}
+ \item Return to the \textbf{ALP-Tutorial} directory:
+\begin{lstlisting}[style=terminal, language=bash]
+cd ../ALP-Tutorial/scripts
+\end{lstlisting}
+ \item \textbf{Compile example}:
+\begin{lstlisting}[style=terminal, language=bash]
+grbcxx sp.cpp -o sp_example
+\end{lstlisting}
+ \item \textbf{Run}:
+\begin{lstlisting}[style=terminal, language=bash]
+grbrun ./sp_example
+\end{lstlisting}
+\end{enumerate}
+% Owner: PA
+% Source (ALP_Tutorial.tex - Installation on Linux, steps 4-6)
+\end{frame}
+
+\begin{frame}[fragile]{2. Hands-on: Setting up scripts}
+ \framesubtitle{ALP/GraphBLAS Overview}
+ \begin{itemize}
+ \item You can find code skeletons for the tutorial in: \texttt{path/to/alp-tutorial/scripts}
+% \item You should copy the scripts to your home directory:
+% \begin{lstlisting}[style=terminal, language=bash]
+% cp -r /some/path/to/alp/tutorial/scripts ~/alp_tutorial_scripts
+% cd ~/alp_tutorial_scripts && ls -l
+%\end{lstlisting}
+ \item You can use any editor of your preference to edit these scripts (e.g. nano, vim, gedit)
+ \item Run these commands to test your setup:
+\begin{lstlisting}[style=terminal, language=bash]
+grbcxx alp_hw.cpp
+grbrun ./a.out
+\end{lstlisting}
+\item \textbf{Expected output}:
+\begin{lstlisting}[style=terminal, language=bash]
+Info: grb::init (reference) called.
+Hello from ./a.out
+Info: grb::finalize (reference) called.
+\end{lstlisting}
+ \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]{2. Hands-on: What did we just run?}
+ \framesubtitle{Hello World in ALP/GraphBLAS}
+ \vspace{-0.6em}
+ \begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <graphblas.hpp>
+
+constexpr size_t max_fn_size = 255;
+typedef char Filename[ max_fn_size ];
+
+void hello_world( const Filename &in, int &out ) {
+ std::cout << "Hello from " << in << std::endl;
+ out = 0;
+}
+
+int main( int argc, char ** argv ) {
+ Filename fn;
+ std::strncpy( fn, argv[0], max_fn_size );
+ int error_code = 100;
+
+ grb::Launcher< grb::AUTOMATIC > launcher;
+ assert( launcher.exec( &hello_world, fn, error_code, true ) == grb::SUCCESS );
+ return error_code;
+}
+ \end{lstlisting}
+ % Owner: PA
+ % Source (ALP_Tutorial.tex - ALP/GraphBLAS, Hello World)
+\end{frame}
+
+\begin{frame}{2. Hands-on: What did we just run?}
+\framesubtitle{ALP/GraphBLAS Overview}
+\begin{itemize}
+ \item Pure C++ (developed similarly to the GraphBLAS C specification)
+ \item Exposes a GraphBLAS interface with 3 categories (part of the \texttt{grb} namespace):
+ \begin{itemize}
+ \item Algebraic containers (vectors, matrices, etc.)
+ \item Algebraic structures (binary operators, semirings, etc.)
+ \item Algebraic operations (take containers and structures as arguments)
+ \end{itemize}
+ \item \textbf{\texttt{grb::Launcher}}:
+ \begin{itemize}
+ \item Wraps calls to ALP programs
+ \item Adapts to run-time conditions (e.g., distributed execution)
+ \item \textbf{Question:} Why is the last argument to \texttt{launcher.exec} \texttt{true}?
+ \begin{itemize}
+ \item Consider the programmer reference documentation for the \texttt{grb::Launcher}...
+ \end{itemize}
+ \end{itemize}
+ \item All ALP programs: input (\href{https://www.geeksforgeeks.org/cpp/pod-type-in-cpp/}{\textcolor{blue}{POD}}) $\rightarrow$ output (POD)
+ \begin{itemize}
+ \item \textbf{\texttt{hello\_world} example}:
+ \begin{itemize}
+ \item \textbf{Question:} Why is \texttt{argv[0]} not directly passed as input to \texttt{hello\_world}?
+ \item Returns zero error\_code as output (POD type)
+ \end{itemize}
+ \end{itemize}
+\end{itemize}
+\vfill
+\colorbox{gray!20}{
+\begin{minipage}{0.95\textwidth}
+\small
+\textbf{For more info:} ALP Documentation: \url{http://albert-jan.yzelman.net/alp/user/}
+\end{minipage}}
+% Owner: PA
+% Source (ALP_Tutorial.tex - ALP/GraphBLAS, Hello World explanation)
+\end{frame}
+
+\subsection{2) Hands-on: installation, containers, I/O, copying, masking, standard matrices (2–3.3)}
+
+\begin{frame}[fragile]{2.1. Hands-on: Containers}
+\framesubtitle{2.1. ALP/GraphBLAS Containers}
+\begin{itemize}
+ \item \textbf{Primary containers:} \texttt{grb::Vector} and \texttt{grb::Matrix}
+ \begin{itemize}
+ \item Templated on value type \texttt{T} (any POD type: \texttt{double}, \texttt{std::complex}, etc.)
+ \item Support for sparse: Store nonzeros efficiently internally (with CSR/CSC formats)
+ \end{itemize}
+ \item \textbf{Examples:}
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+grb::Vector< double > x(100000), y(150000);
+grb::Matrix< void > A(150000, 100000);
+\end{lstlisting}
+\begin{itemize}
+ \item \textbf{Note:} \texttt{Matrix} stores pattern only (Boolean matrices/unweighted graphs)
+\end{itemize}
+\item \textbf{Container Properties:}
+ \begin{itemize}
+ \item \texttt{grb::size(vector)}, \texttt{grb::nrows(matrix)}, \texttt{grb::ncols(matrix)}: dimensions
+    \item \texttt{grb::nnz(container)}: number of stored elements ($\ll$ nrows $\times$ ncols for sparse matrices)
+ \item \texttt{grb::capacity(container)}: maximum capacity (default: max dimension)
+ \end{itemize}
+ \item \textbf{Basics:} New containers are \textbf{empty}; size is \textbf{fixed}, capacity \textbf{can be increased}
+\end{itemize}
+% Owner: PA
+% Source (ALP_Tutorial.tex - ALP/GraphBLAS Containers)
+\end{frame}
+
+\begin{frame}[fragile]{2.1. Hands-on: Containers}
+\framesubtitle{Exercise 2}
+\vspace{-0.5em}
+\textbf{Exercise 2.} Allocate the following vectors and matrices:
+\begin{itemize}
+ \item \texttt{grb::Vector} \texttt{x}: length 100, capacity 100
+ \item \texttt{grb::Vector} \texttt{y}: length 1000, capacity 1000
+ \item \texttt{grb::Matrix} \texttt{A}: size $(100 \times 1000)$, capacity 1000
+ \item \texttt{grb::Matrix} \texttt{B}: size $(100 \times 1000)$, capacity 5000
+ \item Start from \texttt{alp\_hw.cpp} (in the \texttt{hello\_world} function)
+\begin{lstlisting}[style=terminal, language=bash, basicstyle=\ttfamily\scriptsize\color{terminaltext}]
+cp alp_hw.cpp alp_containers_ex2.cpp
+\end{lstlisting}
+ \item \textbf{Hint:} \href{http://albert-jan.yzelman.net/alp/user/group__IO.html#ga0857eef4e6995027be48e7d6b03ea4d3}{\textcolor{blue}{search the documentation}} for \texttt{grb::resize} to override the default capacities
+\end{itemize}
+\textbf{Expected output}:
+\begin{lstlisting}[style=terminal, language=bash, basicstyle=\ttfamily\scriptsize\color{terminaltext}]
+Info: grb::init (reference) called.
+Capacity of x: 100
+Capacity of y: 1000
+Capacity of A: 1000
+Capacity of B: 5000
+Info: grb::finalize (reference) called.
+\end{lstlisting}
+\textbf{Question.} Is overriding the default capacity necessary for all of \texttt{x, y, A} and \texttt{B}?
+% Owner: PA
+% Source (ALP_Tutorial.tex - ALP/GraphBLAS Containers, Exercise 2)
+\end{frame}
+
+\begin{frame}[fragile]{2.2. Hands-on: Basic Container I/O}
+\framesubtitle{Container primitives and Exercise 3}
+\begin{columns}[T]
+\begin{column}{0.45\textwidth}
+\textbf{Container manipulation primitives:}
+\begin{itemize}
+ \item \texttt{grb::clear(container)}:\\ removes all elements
+  \item \texttt{grb::set(vector,scalar)}:\\ sets all elements to \textit{scalar} ($\rightarrow$ dense)
+ \item \texttt{grb::setElement(vector,scalar
+ \\,index)}: sets element at \textit{index} to \textit{scalar}
+\end{itemize}
+\end{column}
+
+\begin{column}{0.5\textwidth}
+ \textbf{Exercise 3.} Allocate:
+ \begin{itemize}
+ \item \texttt{grb::Vector} \texttt{x}, \texttt{y}:\\ length 497, capacities 497 and 1
+ \item \texttt{grb::Matrix} \texttt{A}:\\ size $497 \times 497$, capacity 1727
+ \item Initialize \texttt{y} with \texttt{true} at index 200
+ \item Initialize \texttt{x} with \texttt{false} everywhere
+ \item Print nnz for \texttt{x} and \texttt{y}
+ \end{itemize}
+\end{column}
+\end{columns}
+
+\vspace{1em}
+Start from \texttt{alp\_containers\_ex2.cpp}
+\begin{lstlisting}[style=terminal, language=bash, basicstyle=\ttfamily\scriptsize\color{terminaltext}]
+cp alp_containers_ex2.cpp alp_containers_ex3.cpp
+\end{lstlisting}
+% \textbf{Expected output:}
+% \begin{lstlisting}[style=terminal, language=bash, basicstyle=\ttfamily\scriptsize\color{terminaltext}]
+% nonzeroes in x: 497
+% nonzeroes in y: 1
+% \end{lstlisting}
+\textbf{Bonus question:} Print the capacity of \texttt{y}. Should the value returned be unexpected, considering the specification in the user documentation? Is this a bug in ALP?
+% Owner: PA
+% Source (ALP_Tutorial.tex - Basic Container I/O, Exercise 3)
+\end{frame}
+
+\begin{frame}[fragile]{2.2. Hands-on: Basic Container I/O}
+  \framesubtitle{Exercise 4: Iterators}
+ALP/GraphBLAS exposes \textbf{C++ STL-compatible iterators:}
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+for( const auto &pair : y ) {
+std::cout << "y[" << pair.first << "]=" << pair.second << "\n";
+}
+\end{lstlisting}
+
+\textbf{Exercise 4.} Use output iterators to double-check that \texttt{x} has $497$ values and that all those values equal \texttt{false}:
+\begin{itemize}
+ \item Use STL-compatible iterators to iterate over \texttt{x}
+ \item Count the number of entries and verify each value is \texttt{false}
+\end{itemize}
+Start from \texttt{alp\_containers\_ex3.cpp}
+\begin{lstlisting}[style=terminal, language=bash, basicstyle=\ttfamily\scriptsize\color{terminaltext}]
+cp alp_containers_ex3.cpp alp_containers_ex4.cpp
+\end{lstlisting}
+ % Owner: PA
+ % Source (ALP_Tutorial.tex - Basic Container I/O, Exercise 4)
+\end{frame}
+
+\begin{frame}[fragile]{2.2. Hands-on: Basic Container I/O}
+ \framesubtitle{Container file I/O}
+  ALP supports reading sparse matrices from common file formats (e.g.\ MatrixMarket \texttt{.mtx})
+ \begin{itemize}
+ \item \textbf{MatrixMarket file parser:}
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+#include <graphblas/utils/parser.hpp>
+std::string in( "matrix_file.mtx" );
+grb::utils::MatrixFileReader< double > parser( in, true );
+const auto iterator = parser.begin();
+std::cout << "First parsed entry: ( " << iterator.i() << ", " << iterator.j() << " ) = " << iterator.v() << "\n";
+\end{lstlisting}
+ \begin{itemize}
+ \item \textbf{Constructor}: \texttt{filename, consecutive\_vertices (use = \texttt{true} for .mtx)}
+ \item \textbf{Sparse iterators}: \texttt{iterator.i()}, \texttt{iterator.j()}, \texttt{iterator.v()}
+ \end{itemize}
+ \item \textbf{Building/loading matrices from files:}
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+grb::RC rc = grb::buildMatrixUnique( A, parser.begin(grb::SEQUENTIAL),
+ parser.end(grb::SEQUENTIAL),grb::SEQUENTIAL);
+\end{lstlisting}
+ \item \textbf{Iterator types:} \texttt{SEQUENTIAL} (all elements) vs \texttt{PARALLEL} (subset per process)
+  \item \textbf{Return codes:} \texttt{grb::RC} (error codes) $\rightarrow$ \texttt{grb::SUCCESS} on success
+ \end{itemize}
+ % Owner: PA
+ % Source (ALP_Tutorial.tex - Basic Container I/O)
+\end{frame}
+
+\begin{frame}[fragile]{2.2. Hands-on: Basic Container I/O}
+\framesubtitle{Exercise 5: File I/O}
+\textbf{Exercise 5.} Use the \texttt{MatrixFileReader} and its iterators to build \texttt{A} from \texttt{west0497.mtx}:
+\begin{itemize}
+ \item Use \texttt{MatrixFileReader} and \texttt{buildMatrixUnique}
+ \item Print the number of nonzeroes in \texttt{A} after \texttt{buildMatrixUnique}
+ \item Modify the \texttt{main} function to take as the first program argument a path to an .mtx file
+ \item Pass that path to the ALP/GraphBLAS program
+\end{itemize}
+\textbf{Download the west0497 matrix from the SuiteSparse matrix collection:}
+\begin{lstlisting}[style=terminal, language=bash, basicstyle=\ttfamily\scriptsize\color{terminaltext}]
+wget "https://suitesparse-collection-website.herokuapp.com/MM/HB/west0497.tar.gz" && tar -xzvf west0497.tar.gz
+\end{lstlisting}
+\textbf{Run the application with the path \texttt{./west0497/west0497.mtx}. Expected output:}
+\begin{lstlisting}[style=terminal, language=bash, basicstyle=\ttfamily\scriptsize\color{terminaltext}]
+./a.out ./west0497/west0497.mtx
+First parsed entry: ( 495, 496 ) = 0.897354
+nonzeroes in A: 1727
+\end{lstlisting}
+\textbf{Bonus question:} Why is there no \texttt{grb::set(matrix,scalar)} primitive?
+% Owner: PA
+% Source (ALP_Tutorial.tex - Basic Container I/O, Exercise 5)
+\end{frame}
+
+\begin{frame}[fragile]{3.3. Hands-on: Copying, Masking, and Standard Matrices}
+\framesubtitle{Copying with Resize/Execute Phases}
+\texttt{grb::set} supports copying containers with automatic capacity management:
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+grb::RC rc = grb::set( x, y ); // Works normally
+\end{lstlisting}
+But... if capacity $B$(497) $<$ $A$ nnz (1727), \texttt{grb::set(B, A)} returns \textbf{grb::ILLEGAL}
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+grb::Matrix< double > B( 497, 497 );
+rc = rc ? rc : grb::set( B, A ); // grb::ILLEGAL, B capacity violation
+\end{lstlisting}
+\textbf{Solution:} use \texttt{RESIZE} phase first, then \texttt{EXECUTE}
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+rc = rc ? rc : grb::set( B, A, grb::RESIZE );
+rc = rc ? rc : grb::set( B, A, grb::EXECUTE );
+\end{lstlisting}
+\begin{itemize}
+ \item \textbf{Resize phase}: ALP figures out required capacity and resizes if necessary
+ \item \textbf{Execute phase}: Performs the copy (default if omitted)
+\end{itemize}
+\textbf{Question.} What does the code pattern \texttt{rc = rc ? rc : <grb call>;} achieve?
+
+\textbf{Question.} $A$ contains $1727$ double-precision elements. Are these $1727$ nonzeroes?
+% Owner: PA
+% Source (ALP_Tutorial.tex - Copying, Masking, and Standard Matrices)
+\end{frame}
+
+\begin{frame}[fragile]{3.3. Hands-on: Copying, Masking, and Standard Matrices}
+\framesubtitle{Masking}
+\texttt{grb::set} can also accept a \textit{mask} argument that determines which outputs are computed:
+\begin{itemize}
+ \item The mask determines which positions get output entries
+ \item All GraphBLAS primitives with output containers can take mask arguments
+\end{itemize}
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+grb::RC rc = grb::set( x, y, false ); // Second argument is a mask
+\end{lstlisting}
+\textbf{Example}: If $y$ has only $y[200]=\texttt{true}$, then \texttt{grb::set(x, y, false)} results in $x$ having only one entry: $x[200]=\texttt{false}$. Why?
+\begin{itemize}
+ \item Mask evaluates \texttt{false} for positions where $y$ has no element (no output generated)
+ \item At position $200$, mask $y$ contains \texttt{true}, so output entry is generated
+\end{itemize}
+\textbf{Question.} What would \texttt{grb::set( y, x, true )} return for $y$, assuming it is computed immediately after the preceding code snippet?
+% \begin{itemize}
+% \item The mask evaluates \texttt{true} for positions where $x$ has an element
+% \item The output is the same as the input, since the mask is \texttt{true} for all positions
+% \end{itemize}
+% Owner: PA
+% Source (ALP_Tutorial.tex - Copying, Masking, and Standard Matrices)
+\end{frame}
+
+\begin{frame}[fragile]{3.3. Hands-on: Copying, Masking, and Standard Matrices}
+\framesubtitle{Standard Matrix Factories}
+Iterator-based ingestion allows construction of vectors and matrices with regular structures. ALP/GraphBLAS provides standard matrix factories:
+\begin{lstlisting}[style=cpp-rich, language=C++, basicstyle=\ttfamily\scriptsize]
+#include <graphblas/algorithms/matrix_factory.hpp>
+const grb::Matrix< double > identity =
+ grb::algorithms::matrices< double >::identity( n );
+\end{lstlisting}
+\begin{itemize}
+ \item \textbf{Factory patterns}: \texttt{identity}, \texttt{eye}, \texttt{diag}
+ \begin{itemize}
+ \item with optional offset $k$ for superdiagonal/subdiagonal
+ \end{itemize}
+ \item \textbf{Dense patterns}: \texttt{dense}, \texttt{full}, \texttt{zeros}, \texttt{ones}
+ \begin{itemize}
+ \item \textbf{Discouraged} - ALP/GraphBLAS not optimized for dense matrices
+ \end{itemize}
+\item Finally, matrices can be derived from existing ones via \texttt{grb::set(matrix,mask,value)}
+\item \href{http://albert-jan.yzelman.net/alp/v0.8-preview/classgrb_1_1algorithms_1_1matrices.html#a1336accbaf6a61ebd890bef9da4116fc}
+{\textcolor{blue}{See documentation}} for all supported patterns.
+\end{itemize}
+% Owner: PA
+% Source (ALP_Tutorial.tex - Copying, Masking, and Standard Matrices)
+\end{frame}
+
+\begin{frame}[fragile]{3.3. Hands-on: Copying, Masking, and Standard Matrices}
+\framesubtitle{Exercises 6 and 7: Masking and matrix manipulation}
+\textbf{Exercise 6.} Determine whether $A$ holds explicit zeroes (entries with numerical value zero):
+\begin{itemize}
+ \item Start from \texttt{alp\_containers\_ex5.cpp}
+ \item Use masking and copying to detect explicit zeroes on A.
+ \item \textbf{Hint:} Consider changing $A$ element type to \texttt{double}.
+\end{itemize}
+\textbf{Exercise 7.} Count explicit zeroes on diagonal, superdiagonal, and subdiagonal of $A$:
+\begin{itemize}
+ \item Use the condition(s): $|\{A_{ij}\in A\ |\ A_{ij}=0, |i-j|\leq1, 0\leq i,j<497\}|$
+ \item \textbf{Note:} \textit{Explicit zeroes} = entries \textit{present in the matrix} but with \textit{numerical value} zero.
+ \item \textbf{Hint:} Matrix factory patterns could be helpful here...
+\end{itemize}
+\textbf{Bonus question.} How much memory beyond storing the $n\times n$ identity matrix will \texttt{matrices::identity(n)} consume? \textbf{Hint:} consider that iterators passed to \texttt{buildMatrixUnique} iterate over regular index sequences.
+% Owner: PA
+% Source (ALP_Tutorial.tex - Copying, Masking, and Standard Matrices)
+\end{frame}
+
+\subsection{3) Introduction to core primitives}
+
+\begin{frame}[fragile]
+\frametitle{Primitives overview}
+Element-wise addition, $z=x+y$:
+$$
+ \left(\begin{tabular}{c}0\\0\\1\\0\\0\end{tabular}\right) +
+ \left(\begin{tabular}{c}1\\0\\0\\0\\1\end{tabular}\right) =
+ \left(\begin{tabular}{c}1\\0\\1\\0\\1\end{tabular}\right).
+$$
+ALP/GraphBLAS:
+\begin{itemize}
+ \item {\color{gray}\texttt{grb::Vector< double > x( n ), y( n ), z( n );}}
+ \begin{itemize}
+ \item {\color{gray}(as discussed before)}
+ \end{itemize}
+ \item {\color{gray}...}
+ \item {\color{gray}\texttt{grb::semirings::plusTimes< double > plusTimes;}}
+ \begin{itemize}
+ \item {\color{gray}(will be detailed tomorrow)}
+ \end{itemize}
+ \item \texttt{grb::clear( z ); // makes sure z is empty}
+ \item \texttt{grb::eWiseAdd( z, x, y, plusTimes ); // z = x + y}
+\end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+\frametitle{Primitives overview}
+Element-wise multiplication, $z=x\odot y$:
+$$
+ \left(\begin{tabular}{c}0\\0\\1\\0\\0\end{tabular}\right) \odot
+ \left(\begin{tabular}{c}1\\0\\0\\0\\1\end{tabular}\right) =
+ \left(\begin{tabular}{c}0\\0\\0\\0\\0\end{tabular}\right).
+$$
+ALP/GraphBLAS:
+\begin{itemize}
+ \item {\color{gray}...}
+ \item \texttt{grb::RC rc = grb::clear( z ); // makes sure z is empty}
+ \item \texttt{rc = rc ? rc : grb::eWiseMul( z, x, y, plusTimes ); // z = x .* y}
+\end{itemize}
+Every primitive emits a \textbf{return code} (RC). Two fundamental ones are:
+\begin{itemize}
+ \item \texttt{grb::SUCCESS}: primitive executed OK;
+ \item \texttt{grb::PANIC}: something bad happened.
+\end{itemize}
+Always check return codes! (The above snippet contains a short-cut)
+\end{frame}
+
+\begin{frame}[fragile]
+\frametitle{Primitives overview}
+Dot product, $\alpha=(x,y)$:
+$$
+ \alpha = \sum_i x_iy_i.
+$$
+ALP/GraphBLAS:
+\begin{itemize}
+ \item {\color{gray}...}
+ \item \texttt{double alpha = 0.0;}
+ \item \texttt{grb::RC rc = grb::dot( alpha, x, y, plusTimes );}
+\end{itemize}\vspace{\baselineskip}\pause
+What if I again execute a dot product:
+\begin{itemize}
+ \item \texttt{rc = rc ? rc : grb::dot( alpha, x, y, plusTimes );}
+\end{itemize}
+Does $\alpha$ now equal $\sum_i x_iy_i$, or ${\color{red}2}\sum_i x_iy_i$?
+\vspace{\baselineskip}\pause
+
+Answer: $2\sum_i x_iy_i$ -- dot, eWiseAdd, eWiseMul, ... \textbf{are in-place}
+\end{frame}
+
+\begin{frame}[fragile]
+\frametitle{Primitives overview}
+Most ALP primitives are in-place: helps achieve \textbf{high performance}
+$$
+x \odot y \to x
+$$
+ALP/GraphBLAS:
+\begin{itemize}
+ \item {\color{gray}...}
+ \item \texttt{auto times = plusTimes.getMultiplicativeOperator();}
+ \item \texttt{grb::RC rc = grb::foldl( x, y, times ); // x <- x .* y}
+\end{itemize}\vspace{\baselineskip}\pause
+
+We could also have folded in the other direction:
+\begin{itemize}
+ \item {\color{gray}...}
+ \item \texttt{grb::RC rc = grb::foldr( x, y, times ); // y <- x .* y}
+\end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+\frametitle{Primitives overview}
+Similar to \texttt{grb::dot}, we may also fold into scalars:
+$$
+\beta = \sum_i x_i
+$$
+ALP/GraphBLAS:
+\begin{itemize}
+ \item {\color{gray}...}
+ \item \texttt{auto plus = plusTimes.getAdditiveMonoid();}
+ \item \texttt{double beta = 0.0;}
+ \item \texttt{grb::RC rc = grb::foldl( beta, x, plus );}
+\end{itemize}
+or
+\begin{itemize}
+ \item \texttt{grb::RC rc = grb::foldr( x, beta, plus );}
+\end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+\frametitle{Primitives overview}
+Higher-level primitives:
+$$
+\gamma=(xA)^TAy, C=AB
+$$
+ALP/GraphBLAS:
+\begin{itemize}
+ \item {\color{gray}...}
+	\item \texttt{double gamma = 0.0;}
+ \item \texttt{grb::RC rc = grb::mxv( u, A, y, plusTimes ); // u = Ay}
+ \item \texttt{rc = rc ? rc : grb::vxm<}\\
+ \texttt{ grb::descriptors::transpose\_matrix}\\
+ \texttt{>( v, x, A, plusTimes ); // v = xA'}
+ \item \texttt{rc = rc ? rc : dot( gamma, v, u, plusTimes );}
+ \item \texttt{rc = rc ? rc : mxm( C, A, B, plusTimes );}
+\end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+\frametitle{Primitives overview}
+In BLAS-terminology, we overviewed three levels of primitives:
+\begin{itemize}
+ \item Level-1: foldl/foldr, dot, eWiseAdd/Mul
+ \item Level-2: mxv, vxm
+ \item Level-3: mxm
+\end{itemize}\vspace{\baselineskip}
+
+We also overviewed:
+\begin{itemize}
+ \item error handling: \texttt{grb::MISMATCH}, \texttt{grb::ILLEGAL};
+ \item GraphBLAS descriptors that alter semantics (transpose); and
+ \item ALP's default in-place behaviour of operations.
+\end{itemize}
+% Owner: AJ
+% Source (ALP_Tutorial.tex - Numerical Linear Algebra):
+% Primitives: \texttt{grb::foldl}, \texttt{grb::dot}, \texttt{grb::eWiseAdd}, \texttt{grb::eWiseMul},
+% \texttt{grb::mxv}, \texttt{grb::vxm}, \texttt{grb::mxm}.
+% \begin{lstlisting}
+% auto plusTimes = grb::semirings::plusTimes();
+% grb::mxv(y, A, x, plusTimes);
+% \end{lstlisting}
+\end{frame}
+
+\subsection{4) Hands-on: numerical linear algebra (3.4)}
+
+\begin{frame}[fragile]{Exercise 8}
+\begin{itemize}
+ \item Build small $A$ and $x$
+ \item Compute $y = A x$, $z = x \odot y$, $\delta = x^{\top} x$
+ \item Print results
+\end{itemize}
+% Owner: DJ
+% Source (ALP_Tutorial.tex - Exercise 8):
+% One example:
+% A = [ [1,0,2], [0,3,4], [5,6,0] ], x = [1,2,3]^T
+% Expected:
+% \begin{lstlisting}[language=bash]
+% x = [ 1, 2, 3 ]
+% y = A·x = [ 7, 18, 17 ]
+% z = x ⊙ y = [ 7, 36, 51 ]
+% dot(x,x) = 14
+% \end{lstlisting}
+\end{frame}
+
+% Exercise 8 details from ALP_Tutorial.tex
+\begin{frame}[fragile]{Exercise 8: Problem statement}
+Given the matrix $A$ and vector $x$ below, compute
+\[ y = A\cdot x, \quad z = x \odot y, \quad d = x^{\top} x. \]
+\vspace{0.5em}
+\[
+ A = \begin{bmatrix}
+	1 & 0 & 2 \\
+ 0 & 3 & 4 \\
+ 5 & 6 & 0
+ \end{bmatrix}, \qquad
+ x = \begin{bmatrix} 1 \\ 2 \\ 3 \end{bmatrix}.
+\]
+\vspace{0.5em}
+ \textbf{Expected output:}
+\begin{lstlisting}[]
+x = [ 1, 2, 3 ]
+y = A·x = [ 7, 18, 17 ]
+z = x ⊙ y = [ 7, 36, 51 ]
+dot(x,x) = 14
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]{Exercise 8: Starter code (C++)}
+
+ \textbf{Your tasks on this slide}
+\begin{itemize}
+	\item Open \texttt{alp\_exercise8\_starter.cpp} from the tutorial materials and complete the following tasks:
+ \item Build matrix $A$ from (I,J,values): reserve capacity and call \texttt{buildMatrixUnique}.
+ \item Initialize vector $x = [1,2,3]^\top$ using \texttt{setElement} (clear first with \texttt{set}).
+ \item Compute $y = A\cdot x$ via \texttt{mxv} using the \texttt{plusTimes} semiring.
+ \item Compute $z = x \odot y$ with \texttt{eWiseMul} under the same semiring.
+ \item Compute \texttt{dot\_val} = $x^{\top} x$ with \texttt{dot} (\texttt{plusTimes}). Print all results.
+\end{itemize}
+
+\vspace{0.5em}
+\footnotesize Code on next slide →
+\end{frame}
+
+\begin{frame}[fragile]{Exercise 8: Starter code (C++) — continued}
+
+\begin{lstlisting}[ language=C++, basicstyle=\ttfamily\scriptsize,
+ caption={Exercise 8 starter: complete the TODOs},
+ label=lst:exercise8-starter, showstringspaces=false, aboveskip=2pt, belowskip=2pt]
+#include <graphblas.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <utility> // for std::pair
+#include <vector>
+#include <iostream>
+using namespace grb;
+// Indices and values for our sparse 3x3 matrix A:
+// A = [ 1 0 2 ]
+// [ 0 3 4 ]
+// [ 5 6 0 ]
+// We store the nonzero entries via buildMatrixUnique.
+static const size_t Iidx[6] = { 0, 0, 1, 1, 2, 2 }; // row indices
+static const size_t Jidx[6] = { 0, 2, 1, 2, 0, 1 }; // column indices
+static const double Avalues[6] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 };
+
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]{Exercise 8: Starter code (C++) — continued}
+\begin{lstlisting}[language=C++, basicstyle=\ttfamily\scriptsize, showstringspaces=false, aboveskip=2pt, belowskip=2pt]
+int main( int argc, char **argv ) {
+ (void)argc;
+ (void)argv;
+ std::printf("example (ALP/GraphBLAS) API usage\n\n");
+
+ // 1) Create a 3x3 sparse matrix A
+ std::printf("Step 1: Constructing a 3x3 sparse matrix A.\n");
+  Matrix< double > A(3, 3);
+ // TODO 1: Reserve memory for 6 non-zero entries and build A from (Iidx,Jidx,Avalues),
+ // use resize and buildMatrixUnique
+
+ // 2) Create a 3-element vector x and initialize x = [1, 2, 3]^T
+ // TODO 2: Initialize x = [1, 2, 3]^T
+ // first clear with set, then setElement for indices 0..2
+
+ // 3) Create two result vectors y and z (dimension 3) and set to zero
+ // TODO 3: Create y and z with proper type
+
+ // 4) Use the built-in "plusTimes" semiring alias
+  // (add = plus, multiply = times, id-add = 0.0, id-mul = 1.0)
+ auto plusTimes = grb::semirings::plusTimes();
+
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]{Exercise 8: Starter code (C++) — continued}
+\begin{lstlisting}[language=C++, basicstyle=\ttfamily\scriptsize, showstringspaces=false, aboveskip=2pt, belowskip=2pt]
+
+  // 5) Compute y = A·x (matrix-vector multiply under plus-times semiring)
+  // TODO 4: y = A·x (matrix-vector multiply under plusTimes) using mxv()
+
+  // 6) Compute z = x ⊙ y (element-wise multiply) via eWiseMul with semiring
+  // TODO 5: z = x ⊙ y (element-wise multiply) using eWiseMul()
+
+  // 7) Compute dot_val = xᵀ·x (dot-product under plus-times semiring)
+  // TODO 6: dot_val = x^T x (dot-product under plusTimes) using dot()
+
+ // 8) Print x, y, z, and dot_val
+
+ return EXIT_SUCCESS;
+}
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]{Exercise 8: Solutions}
+Answers:
+\begin{itemize}
+ \item $x=(1,2,3)$
+	\item $y=(7,18,17)$
+	\item $z=(7,36,51)$
+ \item complexity is $\Omega(n)$
+ \item random-access buildVector complexity is $\mathcal{O}(n/T+T)$
+\end{itemize}\vspace{\baselineskip}
+
+Bonus, the following descriptor short-hands building $x$:
+\begin{itemize}
+ \item \texttt{grb::set< grb::descriptors::use\_index >( x, 0 );}
+\end{itemize}
+\end{frame}
+
+\subsection{5) Interoperability with existing code: Transition Paths \& Python}
+
+\begin{frame}{Transition paths and Python mxv}
+\begin{itemize}
+ \item ALP transition paths overview
+ \item Re-linking existing codes
+ \item Python examples
+\end{itemize}
+% Owner: DJ & AJ
+% Source (ALP_Transition_Path_Tutorial.tex - Intro):
+
+\end{frame}
+
+\begin{frame}[fragile]{Python: mxv with pyalp}
+\framesubtitle{Build a sparse matrix and call mxv across backends}
+
+\href{https://pypi.org/project/alp-graphblas/}{PyPI project: alp-graphblas}
+
+Install from public PyPI repository:
+\begin{lstlisting}[style=terminal,language=bash]
+pip install "alp-graphblas"
+\end{lstlisting}
+
+For offline installation, download the wheel file from PyPI and install from the file.
+For example, the script
+\textbf{solutions/ex9/run.sh}
+creates a temporary conda environment, installs pyalp, runs the example scripts in an isolated environment with the proper Python version, and cleans up afterwards.
+
+\begin{lstlisting}[language=Python, basicstyle=\ttfamily\tiny, frame=single, showstringspaces=false]
+import numpy as np
+N, M = 5, 5
+idata = np.array([0,1,2,3,3,4, 2,3,3,4, 1,4, 1,4,4], dtype=np.int32)
+jdata = np.array([0,1,2,3,2,2, 1,4,1,1, 0,3, 0,3,4], dtype=np.int32)
+vdata = np.array([1,1,1,1,0.5,2, 1,4,4.4,1, 0,3.5, 0,3,1], dtype=np.float64)
+x_np = np.array([1.0, 1.0, 0.0, 0.3, -1.0], dtype=np.float64)
+A_dense = np.zeros((N, M), dtype=np.float64); A_dense[idata, jdata] += vdata
+for backendname in ['pyalp_ref','pyalp_omp']:
+ import pyalp;
+ pyalp = pyalp.get_backend(backendname)
+ A = pyalp.Matrix(N, M, idata, jdata, vdata);
+ x = pyalp.Vector(M, x_np)
+ y = pyalp.Vector(N, np.zeros(N));
+ pyalp.mxv(y, A, x);
+ y_np = y.to_numpy()
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}{Python mxv: what each part does}
+\begin{itemize}
+ \item Import and backend selection
+ \begin{itemize}
+ \item \texttt{import pyalp; pyalp = pyalp.get\_backend(\emph{name})} picks a runtime backend (e.g., \texttt{pyalp\_ref}, \texttt{pyalp\_omp}).
+ \end{itemize}
+ \item Build containers from NumPy
+ \begin{itemize}
+ \item \texttt{Matrix(N,M,idata,jdata,vdata)} constructs a sparse A from COO arrays (0-based; duplicates coalesce by summation).
+ \item \texttt{Vector(M, x\_np)} wraps a dense NumPy array as an ALP vector.
+ \end{itemize}
+ \item Call mxv (two forms)
+ \begin{itemize}
+ \item Functional: \texttt{y = mxv(A,x)} returns a new vector.
+ \item Out-parameter: \texttt{mxv(y,A,x)} writes into preallocated \texttt{y}.
+ \end{itemize}
+ \item Convert and validate
+ \begin{itemize}
+ \item \texttt{y.to\_numpy()} copies the result to a NumPy array.
+ \item Build a dense reference with \texttt{A\_dense[idata, jdata] += vdata} and check \texttt{np.allclose}.
+ \end{itemize}
+ \item Tips
+ \begin{itemize}
+ \item Ensure the chosen backend module is installed and importable.
+ \item For larger problems, prefer CSR/CSC loaders (e.g., MatrixMarket parser) to avoid Python-side COO expansion overhead.
+ \end{itemize}
+\end{itemize}
+\end{frame}
+
+% =========================
+% Monday 10 Nov, Afternoon
+% =========================
+\section{Monday 10 Nov, Afternoon}
+
+\subsection{7) Ising Machine, new solvers}
+\begin{frame}[fragile]{ALP algorithms: overview}
+\begin{itemize}
+ \item Project repo: \url{https://github.com/Algebraic-Programming/ALP}
+ \item Algorithms live under: \verb|include/graphblas/algorithms/|
+ \item We ship several common algorithms/solvers that can be used as black boxes
+ \item Usage modes:
+ \begin{itemize}
+ \item From C++: include the header and call via your program (optionally through a runner/launcher)
+ \item As precompiled libraries: link and call without touching internals
+ \end{itemize}
+\end{itemize}
+% Owner: DJ
+\end{frame}
+
+\begin{frame}[fragile]{Algorithms in the repository (selected)}
+\framesubtitle{Headers under \texttt{include/graphblas/algorithms/}}
+\begin{lstlisting}[style=terminal,language=bash]
+bicgstab.hpp kmeans.hpp pregel_connected_components.hpp
+conjugate_gradient.hpp knn.hpp pregel_pagerank.hpp
+cosine_similarity.hpp label.hpp simple_pagerank.hpp
+gmres.hpp matrix_factory.hpp sparse_nn_single_inference.hpp
+hpcg/ mpv.hpp spy.hpp
+kcore_decomposition.hpp norm.hpp
+\end{lstlisting}
+\vspace{-0.5em}
+\begin{itemize}
+ \item Coverage spans linear solvers (CG/GMRES/BiCGSTAB), graph analytics (PageRank, CC, k-core), ML (kNN, k-means), utilities (matrix factory, spy), etc.
+ \item Treat them as black boxes or customize via template parameters and descriptors.
+\end{itemize}
+% Owner: DJ
+\end{frame}
+
+\begin{frame}{New solvers: QUBO / Ising}
+\begin{itemize}
+ \item Added QUBO optimization solvers (work-in-progress):
+ \begin{itemize}
+ \item Ising Machine — Simulated Bifurcation
+ \item Replica Exchange (Ising / QUBO)
+ \end{itemize}
+ \item Status: currently available on separate branches
+ \item How you can help: inspect branches, add comments, propose improvements (PRs/issues welcome)
+\end{itemize}
+% Owner: DJ
+\end{frame}
+
+\begin{frame}{Roadmap for solvers}
+\begin{itemize}
+ \item This year: complete Ising solvers (Simulated Bifurcation, Replica Exchange)
+ \item Next year: support interior-point optimization methods
+\end{itemize}
+% Owner: DJ
+\end{frame}
+
+% =========================
+% Tuesday 11 Nov
+% =========================
+\section{Tuesday 11 Nov, Morning}
+
+\subsection{4) Hands-on: numerical linear algebra (3.4, ex 8)}
+
+\begin{frame}{Complete Exercise 8}
+\begin{itemize}
+ \item Review results and pitfalls
+ \item Alternative builds and descriptors
+\end{itemize}
+% Owner: DJ
+\end{frame}
+
+\subsection{8) Algebraic structures}
+
+\begin{frame}[fragile]{Intro to algebraic structures}
+We learned:
+\begin{itemize}
+ \item Defining and using semirings
+ \item Built-ins: plusTimes, minPlus, boolean
+ \item Exercise: shortest path (min-plus)
+\end{itemize}
+% Owner: AJ
+% TODO: add exercise "shortest path"
+% Source (ALP_Tutorial.tex - Semirings and Algebraic Operations):
+% A semiring consists of a pair of operations ...
+% \begin{lstlisting}[language=C++]
+% using Add = grb::operators::add;
+% using AddMonoid = grb::Monoid;
+% using Mul = grb::operators::mul;
+% using PlusTimes = grb::Semiring;
+% \end{lstlisting}
+\end{frame}
+
+\subsection{9) Hands-on: norms and shortest paths through semirings (3.5)}
+
+\begin{frame}{Hands-on}
+Refer to the tutorial: section 3.5, exercises 9 \& 10
+\begin{itemize}
+ \item stretch goal: exercise 11
+\end{itemize}
+\end{frame}
+
+\subsection{10) Solvers, transition path, Python API}
+
+\begin{frame}{Solvers and transition path}
+\begin{itemize}
+ \item Sparse CG solver API
+ \item Preconditioned CG
+ \item Python API notes
+\end{itemize}
+% Owner: DJ
+% Source (ALP_Transition_Path_Tutorial.tex - API):
+% sparse_cg_init / set_preconditioner / solve / destroy — CRS inputs; non-blocking engine, synchronous API.
+\end{frame}
+
+% 10.1 Transition path recap
+\begin{frame}{Transition path recap}
+\framesubtitle{Legacy code interface vs. algebraic reformulation}
+\begin{itemize}
+ \item \textbf{Idea:} A "transition path" lets existing (legacy) sparse / linear algebra code call ALP solvers through a thin C / C++ layer.
+ \item \textbf{No full rewrite:} You do \emph{not} need to express your algorithm in terms of ALP containers, semirings, masks, etc. – you keep CRS/CSR/COO data structures you already have.
+ \item \textbf{Cost:} Because the algorithm is not re-expressed algebraically, ALP cannot perform whole-program optimisations (kernel fusion, mask-aware reordering, mixed-type propagation) beyond the solver internals.
+ \item \textbf{Benefit:} Fast adoption: drop in a solver without changing surrounding application logic.
+ \item \textbf{Typical boundary:} Provide raw value / index arrays + dimensions + parameters; receive solution vector / status code.
+ \item \textbf{Takeaway:} Great for quick wins and incremental migration; limited for deep cross-kernel optimisation.
+\end{itemize}
+\end{frame}
+
+% 10.2 Black-box exposure & optimisation boundaries
+\begin{frame}{Transition path: Black-box algorithms exposure}
+\framesubtitle{What is (and is not) optimised}
+\begin{itemize}
+ \item \textbf{Exposed set:} Iterative solvers (CG, BiCGSTAB, GMRES, k-means, PageRank, connected components, etc.) via stable headers.
+ \item \textbf{Internals:} Inside each solver ALP leverages traits (associativity, identities, sparsity, mask semantics) for backend-specific scheduling and fusion.
+ \item \textbf{Boundary limit:} Data marshaling between your legacy structures and ALP containers is minimal but still a barrier; external loops remain opaque so global multi-primitive fusion cannot cross the library boundary.
+ \item \textbf{Performance model:} You gain highly-tuned inner iterations; you \emph{do not} gain automatic restructuring of surrounding pre/post processing steps.
+ \item \textbf{When to move further:} If pre/post steps dominate runtime or you want algebraic transformations (e.g. descriptor-based masking), consider refactoring those steps into ALP primitives.
+\end{itemize}
+\end{frame}
+
+% 10.3 C++ Conjugate Gradient & Preconditioned CG interface
+\begin{frame}[fragile]{Transition path: Conjugate Gradient (CG) C++ interface}
+\framesubtitle{Minimal handle lifecycle (legacy CRS input)}
+\small
+\begin{itemize}
+ \item \textbf{Inputs (CRS):} \verb|A_vals[nnz]|, \verb|A_cols[nnz]| (column indices), \verb|A_offs[n+1]| (row offsets), dimension \verb|n|.
+ \item \textbf{Vectors:} Right-hand side \verb|b[n]|, initial guess \verb|x[n]| (in/out).
+ \item \textbf{Parameters:} Max iterations, tolerance, (optional) convergence callback.
+ \item \textbf{Lifecycle:} init \(\rightarrow\) (optional preconditioner attach) \(\rightarrow\) solve loop \(\rightarrow\) destroy.
+ \item \textbf{Code:} See Listing~\ref{lst:cg-cpp-interface} for a minimal example.
+ \item \textbf{External reference:} solver headers live under\newline \url{https://github.com/Algebraic-Programming/ALP/tree/develop/src/transition} (see \texttt{solvers.cpp}).
+\end{itemize}
+
+\end{frame}
+\begin{frame}[fragile]{Transition path: Conjugate Gradient (CG) C++ interface - continued}
+
+\begin{lstlisting}[language=C++,basicstyle=\ttfamily\scriptsize, caption={CG handle lifecycle (CRS inputs)}, label={lst:cg-cpp-interface}]
+#include
+
+// Raw CRS arrays (double precision)
+extern const double *A_vals; // length nnz
+extern const size_t *A_cols; // length nnz
+extern const size_t *A_offs; // length n+1
+size_t n; // dimension
+
+// Solution and RHS
+std::vector< double > x(n, 0.0); // initial guess
+std::vector< double > b = load_rhs(n);
+
+grb_cg_handle handle = nullptr; // opaque handle type
+int rc = sparse_cg_init_dii(&handle, n, A_vals, A_cols, A_offs);
+if(rc) { /* error handling */ }
+
+// Optional: attach simple Jacobi preconditioner (diagonal inverse)
+// sparse_cg_set_precond_d(handle, diag_inv_ptr);
+
+rc = sparse_cg_solve_dii(handle, x.data(), b.data(), /*maxIters=*/1000, /*tol=*/1e-8);
+
+
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]{Transition path: Conjugate Gradient (CG) C++ interface - continued}
+\begin{lstlisting}[language=C++,basicstyle=\ttfamily\scriptsize]
+
+
+
+// x now contains solution
+int its = sparse_cg_get_iters(handle);
+double resnrm = sparse_cg_get_residual(handle);
+
+sparse_cg_destroy_dii(handle);
+\end{lstlisting}
+\begin{itemize}
+ \item Suffix \verb|_dii| indicates a concrete template instantiation (double + integer index types).
+ \item Preconditioner attach call depends on provided header; typical: pass pointer to values or functor (if fully templated in header-only mode).
+\end{itemize}
+\end{frame}
+
+% 10.4 Preconditioned CG specifics
+\begin{frame}[fragile]{Transition path: Preconditioned CG specifics}
+\framesubtitle{Supplying a preconditioner}
+\small
+\begin{itemize}
+ \item \textbf{Jacobi / diagonal:} Provide array of inverse diagonal entries (or a functor that applies \(M^{-1}v\)).
+ \item \textbf{ILU(0)/IC(0):} Build outside ALP; pass CRS of \(M\) if interface supports sparse application routine.
+ %\item \textbf{AMGCL:} Use existing AMGCL interface; pass CRS of \(M\) and wrap application.
+ % not sure if AMGCL works with GC yet
+\end{itemize}
+\vspace{-0.5em}
+\begin{lstlisting}[language=C++,basicstyle=\ttfamily\scriptsize]
+struct DiagInv {
+ const double *dinv;
+ void apply(const double *in, double *out) const {
+ // element-wise scale
+    for(size_t i=0;i<n;++i) out[i] = dinv[i]*in[i];
+  }
+};
+
+auto pcg = make_pcg< double >(A_vals,A_cols,A_offs,n);
+pcg.set_preconditioner(DiagInv{diag_inv_ptr});
+pcg.solve(x.data(), b.data(), 1e-8, 1000);
+\end{lstlisting}
+\begin{itemize}
+ \item \textbf{typed interface vs templates:} API exposes fixed signatures (e.g. \verb|_dii|); template interface offers richer composition (callbacks, custom operators) when building from source.
+ \item \textbf{Fallback:} If no preconditioner is set, solver defaults to plain CG.
+\end{itemize}
+\end{frame}
+
+%% 11 Hands-on: Section 5; CG example; Python example
+\subsection{11) Hands-on: Section 5; CG example; Python example; explicit SIMD}
+\begin{frame}[fragile]{Python: Conjugate Gradient usage}
+\framesubtitle{Calling a solver from a wheel}
+\begin{lstlisting}[language=Python]
+ import numpy as np
+ import pyalp # local build or PyPI wheel (alp-graphblas)
+ pyalp = pyalp.get_backend('pyalp_ref') # or 'pyalp_omp'
+ # Generate a small sparse linear system using numpy arrays
+ N, M = 5, 5
+ idata = np.array([0, 1, 2, 3, 3, 4, 2, 3, 3, 4, 1, 4, 1, 4, 4], dtype=np.int32)
+ jdata = np.array([0, 1, 2, 3, 2, 2, 1, 4, 1, 1, 0, 3, 0, 3, 4], dtype=np.int32)
+ vdata = np.array([1, 1, 1, 1, 0.5, 2, 1, 4, 4.4, 1, 0, 3.5, 0, 3, 1], dtype=np.float64)
+ b = np.array([1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float64)
+ x = np.array([1.0, 1.0, 0.0, 0.3, -1.0], dtype=np.float64)
+ r = np.zeros(5, dtype=np.float64)
+ u = np.zeros(5, dtype=np.float64)
+ tmp = np.zeros(5, dtype=np.float64)
+ # Create the pyalp Matrix and Vector objects
+ alpmatrixA = pyalp.Matrix(5, 5, idata, jdata, vdata)
+ alpvectorx = pyalp.Vector(5, x)
+ alpvectorb = pyalp.Vector(5, b)
+ alpvectorr = pyalp.Vector(5, r)
+ alpvectoru = pyalp.Vector(5, u)
+ alpvectortmp = pyalp.Vector(5, tmp)
+
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]{Python: Conjugate Gradient usage — continued}
+\begin{lstlisting}[language=Python, basicstyle=\ttfamily\tiny, frame=single, showstringspaces=false]
+
+ maxiterations = 2000
+ verbose = 1
+ # Solve the linear system using the conjugate gradient method in the backend
+ iterations, residual = pyalp.conjugate_gradient(
+ alpmatrixA, alpvectorx,alpvectorb,alpvectorr,alpvectoru,
+ alpvectortmp,maxiterations,verbose,
+ )
+ print('iterations =', iterations)
+ print('residual =', residual)
+
+ # Convert the result vector to a numpy array and print it
+ x_result = alpvectorx.to_numpy()
+ print('x_result =', x_result)
+
+\end{lstlisting}
+\normalsize
+\begin{itemize}
+ \item \textbf{Exposed variants:} Only fully instantiated numeric signatures (e.g. double + 32/64-bit indices) appear in the Python module; generic templates are not dynamically instantiated.
+ \item \textbf{Local build advantages:} Native optimisations enabled. Additional backends and template instantiations available compared to PyPI wheels.
+\end{itemize}
+\end{frame}
+
+
+
+\begin{frame}[fragile]{Python: Conjugate Gradient usage — continued}
+\begin{itemize}
+
+ \item \textbf{Current coverage (indicative):}
+ \begin{itemize}
+ \item PyPI wheels: Linux x86\_64 (manylinux) and macOS 15 ARM64; CPython 3.9--3.12; common solvers (CG, GMRES, PageRank).
+ \item Local source build: may enable additional backends and template instantiations.
+ \end{itemize}
+ \item \textbf{Optimisation boundary:} Same as C++ transition path – solver internals optimised; external NumPy assembly steps not transformed.
+ \item \textbf{Tip:} For repeated solves with changing RHS, reuse the handle to avoid re-analysis of \(A\).
+\end{itemize}
+\end{frame}
+
+% 11 Python build & performance notes
+\begin{frame}[fragile]{Python build \& performance notes}
+\begin{itemize}
+ \item \textbf{Local wheel build:} Use \verb|pip install .| in source checkout to expose extra backends and template instantiations needed for your precision/index mix.
+ \item \textbf{PyPI wheel constraints:} Size limits and manylinux policy restrict included backends; heavy experimental code often excluded.
+ \item \textbf{Threading:} \verb|pyalp_omp| maps to OpenMP settings (control via \verb|OMP_NUM_THREADS|); reference backend single-threaded.
+ \item \textbf{Memory:} Provide contiguous NumPy arrays; zero-copy wrappers reduce overhead when dtypes match expected C++ signatures.
+ \item \textbf{Scaling path:} Migrate hot pre/post operations (vector assembly, residual checks) into ALP primitives for additional gains once baseline CG is stable.
+\end{itemize}
+\end{frame}
+
+% 11 PyPI distribution snapshot
+\begin{frame}[fragile]{PyPI: \texttt{alp-graphblas} 0.8.41 distribution files}
+\framesubtitle{Download files overview}
+\small
+	\textit{Link:} \url{https://pypi.org/project/alp-graphblas/0.8.41/#files}
+\begin{itemize}
+ \item \textbf{Release date:} 4 Nov 2025
+ \item \textbf{Source distribution:} \emph{none for this release}
+ \item \textbf{Binary wheels (CPython 3.9--3.12):} Linux x86\_64 (manylinux\_2\_17) and macOS 15 ARM64
+\end{itemize}
+\vspace{-0.5em}
+\begin{lstlisting}[basicstyle=\ttfamily\scriptsize]
+File set (size):
+cp39 manylinux_x86_64 (1.7 MB) macOS15_arm64 (492.9 kB)
+cp310 manylinux_x86_64 (1.7 MB) macOS15_arm64 (492.9 kB)
+cp311 manylinux_x86_64 (1.7 MB) macOS15_arm64 (492.9 kB)
+cp312 manylinux_x86_64 (1.7 MB) macOS15_arm64 (492.9 kB)
+\end{lstlisting}
+\vspace{-0.5em}
+\begin{itemize}
+ \item \textbf{Install example:} \verb|pip install alp-graphblas==0.8.41|
+ \item \textbf{Platform gaps:} No Windows wheels yet; build from source for other OS/architectures.
+ \item \textbf{Why sizes differ:} Linux wheels include compiled OpenMP objects; macOS ARM64 build smaller.
+ \item \textbf{Tip:} Need extra backends or template instantiations? Clone repo and run \verb|pip wheel .| locally.
+\end{itemize}
+\normalsize
+\end{frame}
+
+% =========================
+% Exercise 9 (Python CG backends)
+% =========================
+\begin{frame}[fragile]{Exercise 9: Python Conjugate Gradient}
+\framesubtitle{Hands-on: solve with two backends and compare}
+ \textbf{Goal:} Use the Python wheel (or local build) to run Conjugate Gradient on a small sparse system with both \texttt{pyalp\_ref} and \texttt{pyalp\_omp}.
+\vspace{0.35em}
+ \textbf{Tasks}\small
+\begin{itemize}
+ \item Install: \verb|pip install alp-graphblas| (or build locally for extra backends)
+ \item Construct a small random symmetric positive definite (SPD) matrix \(A\) and vectors \(b, x_0\)
+ \item Run CG using backend names: \texttt{pyalp\_ref}, \texttt{pyalp\_omp}
+ \item Record: iterations, final residual, runtime (use Python's \verb|time.perf_counter()|)
+ \item (Stretch) Vary tolerance (1e-6, 1e-10) and note iteration changes
+\end{itemize}
+\vspace{0.25em}
+ \textbf{Deliverable:} A tiny table: backend vs iterations vs residual vs time.
+\end{frame}
+
+\begin{frame}[fragile]{Exercise 9: Starter code (Python)}
+\begin{itemize}
+\item {Fill in TODOs (timing, SPD generation, loop)}
+\item {Test execution after each TODO to ensure correctness}
+\item {Test execution time}
+\end{itemize}
+\begin{lstlisting}[language=Python, basicstyle=\ttfamily\scriptsize, frame=single, showstringspaces=false, caption={Exercise 9 starter}, label={lst:exercise9-starter}]
+import numpy as np, time
+import pyalp
+
+# TODO 1: pick backend list
+backends = ["pyalp_ref", "pyalp_omp"]
+
+# Build a small random symmetric positive definite matrix A (size n)
+n = 64
+np.random.seed(42)
+R = np.random.randn(n, n)
+A_dense = R.T @ R + 0.1 * np.eye(n) # SPD tweak (add diagonal)
+
+# Convert dense SPD to COO triplets (could also threshold to make sparse)
+I, J = np.nonzero(A_dense)
+V = A_dense[I, J].astype(np.float64)
+
+# Right-hand side and initial guess
+b_np = np.ones(n, dtype=np.float64)
+x0_np = np.zeros(n, dtype=np.float64)
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}[fragile]{Exercise 9: Starter code (Python) — continued}
+\begin{lstlisting}[language=Python, basicstyle=\ttfamily\scriptsize, frame=single, showstringspaces=false]
+
+results = []
+for name in backends:
+ backend = pyalp.get_backend(name)
+ # Containers
+ A = backend.Matrix(n, n, I.astype(np.int32), J.astype(np.int32), V)
+ x = backend.Vector(n, x0_np.copy())
+ b = backend.Vector(n, b_np)
+ r = backend.Vector(n, np.zeros(n))
+ u = backend.Vector(n, np.zeros(n))
+ tmp = backend.Vector(n, np.zeros(n))
+ maxiters, tol, verbose = 2000, 1e-8, 0
+ t0 = time.perf_counter()
+ its, res = backend.conjugate_gradient(A, x, b, r, u, tmp, maxiters, verbose)
+ dt = time.perf_counter() - t0
+ results.append((name, its, res, dt))
+
+# TODO 2: Pretty-print comparison table
+for name, its, res, dt in results:
+ print(f"{name:10s} | iterations={its:4d} | residual={res:.2e} | time={dt*1e3:.1f} ms")
+
+# TODO 3 (stretch): loop over tolerances [1e-6, 1e-10] and observe iterations
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}{Advanced ALP: explicit SPMD control}
+ALP can compile code to run programs fully hybrid-parallel:
+\begin{itemize}
+ \item hybrid shared- and distributed-memory execution, $2\times4$:
+ \begin{itemize}
+ \item \texttt{grbcxx -b hybrid myProgram.cpp}
+ \item \texttt{export OMP\_NUM\_THREADS=4}
+ \item \texttt{grbrun -b hybrid -np 2 ./a.out}
+ \end{itemize}
+ \item \textbf{fully transparent}: no parallelisation concepts in ALP
+\end{itemize}\vspace{\baselineskip}
+
+The hybrid backend is enabled through compilation with LPF:
+\begin{itemize}
+	\item \texttt{../bootstrap.sh --prefix=/path/to/install {\color{red}--with-lpf=/tmp/lpfinstall}}
+\end{itemize}
+% Owner: AJ
+\end{frame}
+
+\begin{frame}[fragile]
+\frametitle{Advanced ALP: explicit SPMD control}
+Nevertheless, sometimes explicit control is useful:
+\begin{itemize}
+ \item \texttt{grb::spmd<>::nprocs();} -- returns no.\ of processes
+ \item \texttt{grb::spmd<>::pid();} -- returns unique process ID
+\end{itemize}\vspace{\baselineskip}
+
+\textbf{Bonus exercise}: write a program that prints the following
+\begin{itemize}
+	\item \texttt{Hello world from process }$i$\texttt{ / }$p$
+\end{itemize}
+where $i$ is the process ID and $p$ the total number of processes. There should be one such line for each process.\vspace{\baselineskip}
+
+\textbf{Hint: }first rebuild your ALP installation with LPF support (bootstrap with \verb|--with-lpf=/tmp/lpfinstall|). Don't forget to use the launcher!
+\end{frame}
+
+\begin{frame}{Advanced ALP: explicit SPMD control}
+By using SPMD, different processes may compute on different data. Results between processes must sometimes be exchanged to progress.\vspace{\baselineskip}
+
+\textbf{Collectives} are a standard mechanism for inter-process communication, which ALP also provides:
+\begin{itemize}
+ \item \texttt{alpha = 1.0;}
+ \item \texttt{grb::RC rc = grb::collectives<>::allreduce( alpha, plus );}
+\end{itemize}
+The above computes the sum of the value of alpha across each process in the SPMD section.\vspace{\baselineskip}
+
+\textbf{Question: }what is the value of alpha after a successful call of the above?\pause\vspace{.5\baselineskip}
+
+\textbf{Answer: }$P=$\ \texttt{grb::spmd<>::nprocs();}\vspace{\baselineskip}\pause
+
+Near future: \textbf{RDMA support}.
+\end{frame}
+
+\begin{frame}{Advanced ALP: explicit SPMD control}
+If you want to explicitly run a different ALP computation on different processes:
+\begin{itemize}
+ \item select the nonblocking backend for process-local computations
+ \begin{itemize}
+ \item \texttt{grb::Vector< double, grb::nonblocking > local\_x( n );}
+ \item \texttt{grb::Matrix< void, grb::nonblocking > local\_A( m, n );}
+ \item ...
+ \item \texttt{grbcxx {\color{red}-b hybrid} myProgram.cpp}
+ \end{itemize}\pause
+ \item ALP will compile-time dispatch to the right backend
+ \begin{itemize}
+ \item \texttt{grb::Vector< double > global\_x;}
+ \item \texttt{grb::set( global\_x, 3.14 ); // will use all processes}
+ \item \texttt{grb::set( local\_x, grb::spmd<>::pid() ); // uses SPMD}
+ \end{itemize}
+\end{itemize}\vspace{\baselineskip}\pause
+
+\textbf{Important: }these functionalities are here if you absolutely need them -- if, on the other hand, ALP can auto-parallelise your computation of interest, then {\color{red} none of these constructs are needed}. SPMD programming is harder than sequential programming!
+\end{frame}
+
+% =========================
+% Tuesday 11 Nov, Afternoon
+% =========================
+\section{Tuesday 11 Nov, Afternoon}
+
+% =========================
+% HIPO:
+% =========================
+\subsection{12) HIPO: Introduction and Motivation}
+
+\begin{frame}{HIPO: Huawei's Hardware Integrated Platform for Optimisation}
+\framesubtitle{Problem formulation: The Challenge}
+\begin{itemize}
+ \item Modern HPC architectures $\rightarrow$ diverse hardware platforms
+ \begin{itemize}
+ \item CPUs, GPUs, accelerators, hybrid systems, UB
+ \item Different memory hierarchies, compute capabilities, bandwidths
+ \end{itemize}
+ \item Diverse problem domains and solvers
+ \begin{itemize}
+ \item Graph algorithms, linear algebra, optimization, simulations
+ \item Each with different computational/algorithmic characteristics
+ \end{itemize}
+ \item \textbf{Key Question:} How to optimally pair algorithms with hardware?
+ \begin{itemize}
+ \item Model hardware capabilities
+ \item Model solver/algorithm requirements
+ \item Find optimal pairings in theory and practice
+ \end{itemize}
+\end{itemize}
+% Owner: PA
+\end{frame}
+
+\begin{frame}{HIPO: Introduction and Motivation}
+\framesubtitle{Motivation: What We Want to Achieve}
+\begin{itemize}
+ \item \textbf{Core Goal:} Systematic HW-SW co-design and optimization
+ \begin{itemize}
+ \item Model hardware: capabilities, constraints, performance characteristics
+ \item Model software/solvers: computational/communication/access patterns
+ \item Bridging mechanism: How to bridge the gap(s) between these two?
+ \end{itemize}
+ \item \textbf{Applications:}
+ \begin{itemize}
+ \item \textbf{Performance modeling:} Predict performance before execution
+ \item \textbf{Autotuning:} Automatically find optimal configurations and parameters
+ \item \textbf{Algorithm co-design:} Adapt/choose algorithms to fit the hardware
+ \item \textbf{HW-SW co-design:} Choose/evaluate/design hardware for specific algorithms
+ \end{itemize}
+ \item \textbf{Outcome:} A platform for automatic algorithm optimization and hardware utilization.
+\end{itemize}
+% Owner: PA
+\end{frame}
+
+\begin{frame}{HIPO: Introduction and Motivation}
+\framesubtitle{Why ALP/GraphBLAS?}
+\begin{itemize}
+ \item \textbf{ALP/GraphBLAS characteristics:}
+ \begin{itemize}
+ \item Hardware-unaware, hardware-independent user code
+ \item Backend handles hardware-specific optimizations internally
+ \item Easy-to-use software solution that optimizes execution automatically
+ \end{itemize}
+ \item \textbf{Why it's ideal for HIPO:}
+ \begin{itemize}
+ \item Clear separation: algorithm (what) vs.\ backend (how)
+ \item Multiple backends: same code, different hardware targets
+ \item Internal optimization opportunities: fusion, scheduling, kernel dispatch, tiling etc
+ \end{itemize}
+ \item \textbf{HIPO can enhance ALP:}
+ \begin{itemize}
+ \item Model-driven backend selection and optimization
+ \item Autotuning for backend parameters
+ \item ``Free'' performance improvement for end users
+ \end{itemize}
+\end{itemize}
+% Owner: PA
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{Current models and their limitations}
+ \textbf{Target: Universal model} $\rightarrow$ Modeling any current/future hw architecture/solver.\\
+ \begin{itemize}
+ \item \textbf{Trade-off:} Accuracy $\leftrightarrow$ applicability.
+ \end{itemize}
+ \vfill
+ \textbf{Existing models:}\\
+\begin{itemize}
+ \item \textbf{Roofline}: Simple model, latency-unaware and hardware-centric
+ \begin{itemize}
+ \item Lacks an algorithm-side structure to drive co-design decisions
+ \end{itemize}
+ \item \textbf{BSP}: Single-level communication/latency aware model
+ \begin{itemize}
+ \item Not suitable for on-node memory/cache/NUMA modeling
+ \end{itemize}
+ \item \textbf{NUMA-BSP}: Adds NUMA-awareness, stays relatively simple
+ \begin{itemize}
+ \item Still lacks the option to express deeper hierarchies (private/shared memories)
+ \end{itemize}
+ \item \textbf{Multi-BSP}: closest fit $\rightarrow$ hierarchical(HW) and recursive(ALGO)
+ \begin{itemize}
+ \item Still requires algorithms written explicitly in recursive supersteps.
+ \item Cannot expose irregular ('bad') vs.\ consecutive ('good') access patterns
+ \begin{itemize}
+ \item Common in sparse algorithms/kernels, important for performance.
+ \end{itemize}
+ \end{itemize}
+\end{itemize}
+\textbf{Our goal:} Multi-BSP-like + irregular access patterns + relaxed algorithmic representation
+% Owner: PA
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{k-Multi-BSP: Hardware parameters}
+\begin{columns}[T,totalwidth=\textwidth]
+\column{0.5\textwidth}
+Levels $l = 0..d$ with throughput vector
+\begin{itemize}
+ \item $r_v(0) = (r_{\text{scalar}}, r_{\text{vec\_SIMD}})$
+ \item Inverse bandwidth $g(l)$ ($\tfrac{\text{s}}{\text{byte access}}$)
+ \item Latency $ls(l)$
+ \item Available memory $m(l)$
+ \item Sub-components count $p(l)$
+ \item Stream capacity $kmax(l)$
+ \begin{itemize}
+ \item For maintaining peak throughput
+ \end{itemize}
+\end{itemize}
+\column{0.5\textwidth}
+\centering
+\begin{tikzpicture}[font=\small,
+ level distance=1cm,
+ sibling distance=0.5cm,
+ edge from parent/.style={draw, thick, -{Latex[length=2.3mm]}}]
+ \tikzstyle{level 1}=[sibling distance=2 cm]
+ \tikzstyle{level 2}=[sibling distance=0.5 cm]
+ \tikzstyle{nodebox}=[draw=gray!60, rounded corners=2pt, minimum width=0.38cm, minimum height=0.3cm, fill=gray!10]
+
+ \node[nodebox, fill=green!10] (lvl2root) {}
+ child { node[nodebox, fill=blue!10] (lvl1left) {}
+ child { node[nodebox, fill=gray!15, very thick] (rootL) {} }
+ child { node[nodebox, fill=gray!15, very thick] {} }
+ child { node[nodebox, fill=gray!15, very thick] {} }
+ child { node[nodebox, fill=gray!15, very thick] {} }
+ }
+ child { node[nodebox, fill=blue!10] (lvl1right) {}
+ child { node[nodebox, fill=gray!15, very thick] (rootR) {} }
+ child { node[nodebox, fill=gray!15, very thick] {} }
+ child { node[nodebox, fill=gray!15, very thick] {} }
+ child { node[nodebox, fill=gray!15, very thick] {} }
+ };
+
+ \node[right=2.3cm of lvl2root, font=\scriptsize, text=black!70, align=left] {level 2 ($p(2)=2$)\\ $g(2),\ ls(2),\ m(2)$};
+ \node[right=1.3cm of lvl1right, font=\scriptsize, text=black!70, align=left] {level 1 ($p(1)=4$)\\ $g(1),\ ls(1),\ m(1)$};
+ \node[right=2cm of rootR, font=\scriptsize, text=black!70, align=left] {level 0 ($p(0)=1$)\\ $g(0) = ls(0) = m(0) = 0$};
+
+ \node[below=0.5cm of rootR, font=\scriptsize, text=black!70] {Example 3-level hierarchy};
+\end{tikzpicture}
+\end{columns}
+% Owner: PA
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{k-Multi-BSP: Algorithm parameters and cost}
+\begin{itemize}
+\item Supersteps $i = 0..n$ with participating sub-components $s = 0..p(l)$
+\item Work vectors at the leaf level: $w_v(i,s,l=0) = (w_{\text{scalar}}, w_{\text{vec\_SIMD}})$
+\item Stream descriptors $ks(i,s,l)$ accessing (0..$kmax(l)$) different memory regions
+\item Communication volume $t(i,s,l,k)$, $v(i,s,l,k)$ for bytes sent/received
+\end{itemize}
+\vspace{0.5em}
+\begin{columns}[t]
+\column{0.5\textwidth}
+\textbf{Computation cost (per-superstep)}
+\[
+c_{\text{comp}} = \max\left( \frac{w_{\text{scalar}}}{r_{\text{scalar}}}, \frac{w_{\text{vec\_SIMD}}}{r_{\text{vec\_SIMD}}} \right)
+\]
+\column{0.5\textwidth}
+\textbf{Communication cost (per-level)}
+\[
+c_{\text{comm}}(i,l) = \max_{s,k} h(i,s,l,k) \cdot g(l) + ls(l)
+\]
+\end{columns}
+\vspace{0.5em}
+\textbf{Total cost aggregation}
+\[
+T = \sum_{l=0}^{d} f\Big( \sum_{i=0}^{n} c_{\text{comm}}(i,l),\, c_{\text{comp}} \Big), \quad f \in \{\max, +\}
+\]
+\textbf{Simplified memory-bound variant}: Assume uniform per process (no $s$), drop $r_v$, $w_v$.
+\begin{itemize}
+ \item $T = \sum_{i,l,k} c_{\text{comm}}(i,l)$ $\rightarrow$ $6d+1$ HW parameters, $1 + n d (ks+1)$ algo parameters.
+\end{itemize}
+% Owner: PA
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{Example testbed: ARM920 (hwloc snapshot)}
+\centering
+\includegraphics[width=0.95\linewidth,height=0.7\textheight,keepaspectratio]{Images/ARM920_system.png}
+\vspace{0.5em}
+\begin{itemize}
+ \item 2 Sockets with two NUMA domains each; 24 MB shared L3 per NUMA.
+ \item 96 PUs mapped across NUMA domains each with L1/L2 private cache.
+ \item \textbf{Our Approach:} Convert this to k-Multi-BSP model(s).
+ \begin{itemize}
+ \item Convert hierarchy to levels $l$ with $p(l)$, $m(l)$.
+ \item Perform microbenchmarks to extract $g(l)$, $ls(l)$.
+ \end{itemize}
+\end{itemize}
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{Automated calibration for $g(l)$ and $ls(l)$}
+\begin{itemize}
+ \item We run a custom microbenchmark suite across cache, NUMA and memory tiers.
+ \item For various sizes per \textit{level area:}
+ \begin{itemize}
+ \item 'Pointer-chasing' + 1-byte 'random accesses' for $ls(l)$,
+ \item Classic kernels (daxpy, copy, stream, etc.) on \texttt{CACHELINE\_SIZE} consecutive data for $g(l)$.
+ \end{itemize}
+ \item Benchmarks repeat for different thread counts and placements to create multiple models.
+ \item Add a final \textit{GLOBAL\_SYNC} layer for kernel parallel launch/sync overheads.
+ \item k-Multi-BSP models $\rightarrow$ Headers with $g(l)$, $ls(l)$, $m(l)$ values for use in GraphBLAS.
+\end{itemize}
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{Example models for ARM920}
+\centering
+\includegraphics[width=0.85\linewidth,height=0.7\textheight,keepaspectratio]{Images/ARM920_hw_models.png}
+\vspace{0.5em}
+\begin{itemize}
+ \item Hardware parameters extracted for ARM920: $g(l)$, $ls(l)$, $m(l)$ per tier.
+ \item Here: Two bandwidth/$l$ plots for different thread numbers and consecutive byte accesses.
+ \begin{itemize}
+ \item Aggregated to $g(l)$, $ls(l)$ values for k-Multi-BSP model.
+ \end{itemize}
+\end{itemize}
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{Model validation: 1 thread (synthetic vs. real datasets)}
+\centering
+\begin{minipage}{0.48\linewidth}
+ \includegraphics[width=\linewidth,height=0.65\textheight,keepaspectratio]{Images/Solver_iteration_performance_1t_synthetic.png}
+\end{minipage}
+\hfill
+\begin{minipage}{0.48\linewidth}
+ \includegraphics[width=\linewidth,height=0.65\textheight,keepaspectratio]{Images/Solver_iteration_performance_1t_real.png}
+\end{minipage}
+\vspace{0.5em}
+\begin{itemize}
+ \item k-Multi-BSP model (red) closely tracks ground truth across both synthetic (left) and real (right) workloads.
+ \item Single-thread baseline validates model accuracy before scaling.
+ \item Latency-aware hierarchical roofline follows similar pattern for 1 thread.
+\end{itemize}
+\end{frame}
+
+\begin{frame}{Solver Footprints}
+\framesubtitle{Model validation: 96 threads (synthetic vs. real datasets)}
+\centering
+\begin{minipage}{0.48\linewidth}
+ \includegraphics[width=\linewidth,height=0.65\textheight,keepaspectratio]{Images/Solver_iteration_performance_96t_synthetic.png}
+\end{minipage}
+\hfill
+\begin{minipage}{0.48\linewidth}
+ \includegraphics[width=\linewidth,height=0.65\textheight,keepaspectratio]{Images/Solver_iteration_performance_96t_real.png}
+\end{minipage}
+\vspace{0.5em}
+\begin{itemize}
+ \item Full system validation: k-Multi-BSP model accurately predicts performance patterns.
+ \item Latency-aware hierarchical roofline fails to capture parallel overheads
+\end{itemize}
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{k-Multi-BSP example for hierarchical rooflines}
+\centering
+\begin{minipage}{0.48\linewidth}
+ \includegraphics[width=\linewidth,height=0.65\textheight,keepaspectratio]{Images/Multi_BSP_submodel_roofline_cascade.png}
+\end{minipage}
+\hfill
+\begin{minipage}{0.48\linewidth}
+ \includegraphics[width=\linewidth,height=0.65\textheight,keepaspectratio]{Images/Multi_BSP_submodel_roofline_arm920.png}
+\end{minipage}
+\vspace{0.5em}
+\begin{itemize}
+ \item k-Multi-BSP g(l) example for different architectures: Cascade (left) vs. ARM920 (right).
+ \item Could enable problem-specific system selection: based on the solver's OI + footprint (here).
+ \item Supports theoretical ``what-if'' analyses during HW/SW co-design.
+\end{itemize}
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{Sneak-peek to model-based autotuning: Thread selection — synthetic workload}
+\centering
+\includegraphics[width=0.9\linewidth,height=0.7\textheight,keepaspectratio]{Images/Solver_iteration_close_synthetic_performance_ARM920.png}
+\vspace{0.5em}
+\begin{itemize}
+ \item \textbf{Question:} Can the current thread models (right) predict execution behavior (left)?
+ \item \textbf{YES!} Model is fairly good at predicting thread counts for different problem sizes.
+\end{itemize}
+\end{frame}
+
+\begin{frame}{HIPO Modeling}
+\framesubtitle{Sneak-peek to model-based autotuning: Thread selection — real workload}
+\centering
+\includegraphics[width=0.9\linewidth,height=0.7\textheight,keepaspectratio]{Images/Solver_iteration_close_real_performance_ARM920.png}
+\vspace{0.5em}
+\begin{itemize}
+ \item While the real dataset exhibits more complex patterns, the model still follows them well.
+\end{itemize}
+\end{frame}
+
+\subsection{13) SPMD execution, Replica exchange}
+
+% \begin{frame}{SPMD execution, Replica exchange}
+% \begin{itemize}
+ % \item Concepts and API sketch
+ % \item Example workflow
+% \end{itemize}
+% % Owner: GG
+% \end{frame}
+
+\begin{frame}{The SPMD paradigm}
+ \begin{itemize}
+ \item What is SPMD?
+ \begin{itemize}
+ \item SPMD means \textit{Single Program Multiple Data} % ;
+ \item \textbf{Multiprocessing}: Multiple processes run independently and communicate
+ \item \textbf{MPI} is an example of SPMD in the real world.
+ % You may know MPI.
+ \end{itemize}
+ \item Why SPMD and not OpenMP? \\
+ Asynchronous execution, distributed memory systems, clusters.
+ \end{itemize}
+ \vspace{0.5cm}
+ \textbf{Note:} ALP/GraphBLAS currently has a single SPMD backend: \textbf{BSP1D}.
+% Owner: GG
+\end{frame}
+
+\begin{frame}[fragile]{SPMD Hello World with ALP/GraphBLAS}
+
+We compile and run the example as follows:
+\begin{lstlisting}[style=terminal, language=bash]
+$ grbcxx -b bsp1d spmd.cpp -o spmd_example
+$ grbrun -b bsp1d -n 2 ./spmd_example
+\end{lstlisting}
+So the output will be something like this:
+ \begin{lstlisting}[style=terminal,language=bash]
+Starting...
+Info: grb::init (BSP1D) called using 2 user processes.
+Info: grb::init (reference) called.
+Info: grb::init (reference) called.
+Hello from process 0
+Info: grb::finalize (bsp1d) called.
+ process 0 is finalisingHello from process 1
+
+ process 1 is finalising
+Info: grb::finalize (reference) called.
+Info: grb::finalize (reference) called.
+Finishing: data_out is 69
+\end{lstlisting}
+% Owner: GG
+\end{frame}
+
+\begin{frame}[fragile]{SPMD Hello World with ALP/GraphBLAS}
+ Now let's look at \texttt{spmd.cpp}:
+\begin{lstlisting}[style=cpp-rich, language=C++]
+#include <iostream>
+#include <graphblas.hpp>
+
+void grb_program( const size_t &data_in , size_t &data_out ){
+ const size_t s = grb::spmd<>::pid();
+ std :: cerr << "Hello from process " << s << std :: endl; // printed by each process
+ data_out = 69;
+}
+
+int main ( int argc , char ** argv ) {
+ size_t data_in = 42 , data_out;
+
+ std::cerr << "Starting... " << std::endl ; // printed only once
+ grb::Launcher< grb::AUTOMATIC > launcher;
+ launcher.exec( &grb_program, data_in, data_out, true );
+ std::cerr << "Finishing: data_out is " << data_out << std::endl ; // printed once
+}
+\end{lstlisting}
+
+% Owner: GG
+\end{frame}
+
+\begin{frame}[fragile]{What ALP/GraphBLAS offers: SPMD APIs}
+ \begin{itemize}
+ \item \textbf{SPMD namespace utilities} \\
+\begin{lstlisting}[style=cpp-rich, language=C++]
+size_t s = spmd<>::pid(); // get my process number
+size_t np = spmd<>::nprocs(); // get the total number of processes
+\end{lstlisting}
+ \vfill
+\item \textbf{Collective functions} \\
+ Optimized routines to do common tasks that need the \emph{collective} effort of all the processes.
+\begin{lstlisting}[style=cpp-rich, language=C++]
+// sum over all the values of num in the different processes. The result is available only in process 1
+grb::collectives<>::reduce( num, 1, grb::operators::add );
+// same as above but the result is available to every process
+grb::collectives<>::allreduce( num, grb::operators::add );
+// set x to the value it has in process 7
+grb::collectives<>::broadcast( x, 7 );
+\end{lstlisting}
+
+And more to come...
+ \end{itemize}
+% Owner: GG
+\end{frame}
+
+\begin{frame}{Simulated Annealing - Replica Exchange}
+ Typical problems approached with Replica Exchange are:
+ \begin{itemize}
+ \item \textbf{Quadratic Unconstrained Binary Optimization (QUBO)}:
+ \[ \text{minimize } E(x) = x^TQx \qquad x\in\{0,1\}^d,\quad Q\in\mathbb{R}^{d\times d} \]
+ \item \textbf{Ising models' Hamiltonian function}:
+ \[ \text{minimize } E(x) = x^T(Jx+h) \qquad x\in\{-1,1\}^d,\quad J\in\mathbb{R}^{d \times d}, h\in\mathbb{R}^{d} \]
+ \item But Simulated Annealing has been used to approximate solutions to \textit{Traveling Salesman Problem} and other hard problems!
+ % The sky is the limit!
+ \end{itemize}
+% Owner: GG
+\end{frame}
+
+\begin{frame}{Simulated Annealing - Replica Exchange}
+ The algorithm proceeds in two alternating steps, that are repeated multiple times:
+ \begin{enumerate}
+ \item \textbf{Simulated Annealing}:
+
+ make small \textbf{random changes} to the state and always keep those that improve the solution,
+ but
+ \textbf{also accept bad updates} with some probability, depending on the temperature: \\
+ States at high temperature likely accept changes.
+
+ \vspace{0.5cm}
+
+ \item \textbf{Replica Exchange}:
+
+ exchange (partial) solutions at different temperatures so that we have:
+ \begin{itemize}
+ \item \textbf{Good states at low temperature}, to be finely optimized locally
+ \item \textbf{Bad states at high temperature}, and thus explore the whole state space
+ \end{itemize}
+ This step is optional, but can improve the results.
+ \end{enumerate}
+ \small This algorithm is also known as \textit{Parallel Tempering}.
+% Owner: GG
+\end{frame}
+
+
+\begin{frame}[fragile]{Simulated Annealing - Replica Exchange ALP/GraphBLAS API}
+ What makes this implementation \textbf{fast and scalable}?
+ \begin{itemize}
+ \item GraphBLAS all the way down.
+ \item \textbf{Distributed Replica Exchange}: SA independently on each replica, then mix and combine the results \\
+ \small As the work on each replica is the same, that is an embarrassingly parallel job followed by careful communication.
+ \end{itemize}
+% Owner: GG
+\end{frame}
+
+\begin{frame}[fragile]{Simulated Annealing - Replica Exchange ALP/GraphBLAS API}
+ \begin{itemize}
+ \item You can call directly the optimization function:
+\begin{lstlisting}[style=cpp-rich, language=C++]
+grb::RC simulated_annealing_RE(
+ const SweepFuncType &sweep,
+ SweepDataType& sweep_data,
+ std::vector< grb::Vector< StateType, backend > > &states,
+ grb::Vector< EnergyType, backend > &energies,
+ grb::Vector< TempType, backend > &betas,
+ std::vector< grb::Vector< StateType, backend > > &temp_states,
+ grb::Vector< EnergyType, backend > &temp_energies,
+ const size_t &n_sweeps,
+ const bool &use_pt = false
+ )
+\end{lstlisting}
+ \small \textbf{But you need to define} the \texttt{sweep} function that changes the state and returns the relative change in the energy \\
+ This is flexible, but possibly tedious
+
+ \end{itemize}
+% Owner: GG
+\end{frame}
+
+\begin{frame}[fragile]{Simulated Annealing - Replica Exchange ALP/GraphBLAS API}
+ \begin{itemize}
+\item For common problems there are also simpler built-in QUBO/Ising optimizers:
+\begin{lstlisting}[style=cpp-rich, language=C++]
+grb::RC simulated_annealing_RE_QUBO(
+ const grb::Matrix< QType, backend, RSI, CSI, NZI > &Q,
+ std::vector< grb::Vector< StateType, backend > > &states,
+ grb::Vector< EnergyType, backend > &energies,
+ grb::Vector< TempType, backend > &betas,
+ const size_t &n_sweeps = 1,
+ const bool &use_pt = false
+ )
+\end{lstlisting}
+
+\begin{lstlisting}[style=cpp-rich, language=C++]
+grb::RC simulated_annealing_RE_Ising(
+ const grb::Matrix< QType, backend, RSI, CSI, NZI > &couplings,
+ const grb::Vector< QType, backend > &local_fields,
+ std::vector< grb::Vector< StateType, backend > > &states,
+ grb::Vector< EnergyType, backend > &energies,
+ grb::Vector< TempType, backend > &betas,
+ const size_t &n_sweeps = 1,
+ const bool &use_pt = false
+ )
+\end{lstlisting}
+\item But these give away some control (and thus may leave performance on the table)...
+ \end{itemize}
+% Owner: GG
+\end{frame}
+
+\begin{frame}[fragile]{Simulated Annealing-Replica Exchange Example workflow}
+ \begin{itemize}
+ \item Example usage to (approximately) optimize an Ising problem:
+\begin{lstlisting}[style=cpp-rich, language=C++]
+const size_t n = 420; // size of the problem
+grb::Matrix< double > J; // in this example we omit filling J with values
+grb::Vector< double > h ( n ), energies ( n ), betas ( n );
+std::vector< grb::Vector< float > > states;
+const size_t n_sweeps = 69; // number of repetitions
+const bool use_pt = true;
+
+grb::set( h, 1.0 );
+grb::set( betas, 3.0 );
+for(size_t i = 0 ; i < n_replicas ; ++i ){
+ grb::Vector< double > v (n); // create a state v, you should (randomly) initialize them
+ states.emplace_back( v );
+ grb::setElement( energies, energy(states[i]), i ); // don't forget to set initial energies!
+}
+grb::RC rc = grb::algorithms::simulated_annealing_RE_Ising(
+ J, h, states, energies, betas, n_sweeps, use_pt
+);
+// Now states and energies are optimized!
+auto best_state = states[0];
+auto best_energy = energies[0];
+\end{lstlisting}
+ \end{itemize}
+% Owner: GG
+\end{frame}
+
+\begin{frame}[fragile]{Simulated Annealing-Replica Exchange Example workflow}
+ \begin{itemize}
+ \item If you have a custom problem you can define your sweep function as follows
+\begin{lstlisting}[style=cpp-rich, language=C++]
+auto sweep_fun = [](
+ grb::Vector< uint8_t > &state,
+ float &beta,
+ std::tuple<
+ const int&,
+ grb::Vector< uint8_t >&, // or whatever else you need
+ std::minstd_rand& // you can also pass a random number generator
+ > &data){
+ uint8_t delta_E = 0;
+ const auto n = std::get<0>(data);
+ // Do what you need to do...
+ return delta_E; // return the change in energy
+};
+// note that the type of sweep_data must be equal to the tuple above
+auto sweep_data = std::tie( n, v, rng );
+
+grb::RC rc = grb::algorithms::simulated_annealing_RE(
+ sweep_fun, sweep_data, states, energies, betas, temp_states, temp_energies, n_sweeps, use_pt
+);
+\end{lstlisting}
+ \end{itemize}
+% Owner: GG
+\end{frame}
+
+% =========================
+% Wednesday 12 Nov
+% =========================
+\section{Wednesday 12 Nov}
+
+\subsection{14) WIP overview: dense backend, tensor, stencil, EM simulations}
+\begin{frame}{WIP overview}
+\begin{itemize}
+ \item Dense backend status
+ \item Tensor/stencil/EM simulation directions
+\end{itemize}
+% Owner: DJ
+\end{frame}
+
+\subsection{15) Deep technical? Future work (autodiff)}
+\begin{frame}{Future work (autodiff)}
+\begin{itemize}
+ \item Autodiff integration ideas
+ \item Open problems and roadmap
+\end{itemize}
+% Owner: DJ
+\end{frame}
+
+\end{document}
+
diff --git a/exercises/ex8/exercise8_starter.cpp b/exercises/ex8/exercise8_starter.cpp
new file mode 100644
index 0000000..c59e766
--- /dev/null
+++ b/exercises/ex8/exercise8_starter.cpp
@@ -0,0 +1,66 @@
+/*
+ * minimal ALP (GraphBLAS) example.
+ *
+ * To compile (using the reference OpenMP backend):
+ * grbcxx -b reference_omp example.cpp -o example
+ *
+ * To run:
+ * grbrun ./example
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <utility> // for std::pair
+#include <vector>
+
+#include <graphblas.hpp>
+
+using namespace grb;
+
+// Indices and values for our sparse 3x3 matrix A:
+//
+// A = [ 1 0 2 ]
+// [ 0 3 4 ]
+// [ 5 6 0 ]
+//
+// We store the nonzero entries via buildMatrixUnique.
+static const size_t Iidx[6] = { 0, 0, 1, 1, 2, 2 }; // row indices
+static const size_t Jidx[6] = { 0, 2, 1, 2, 0, 1 }; // column indices
+static const double Avalues[6] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 };
+
+int main( int argc, char **argv ) {
+ (void)argc;
+ (void)argv;
+ std::printf("example (ALP/GraphBLAS) corrected API usage\n\n");
+
+ // 1) Create a 3x3 sparse matrix A
+ std::printf("Step 1: Constructing a 3x3 sparse matrix A.\n");
+ Matrix< double > A(3, 3);
+ // TODO 1: Reserve memory for 6 non-zero entries and build A from (Iidx,Jidx,Avalues),
+ // use resize and buildMatrixUnique
+
+ // 2) Create a 3-element vector x and initialize x = [1, 2, 3]^T
+ // TODO 2: Initialize x = [1, 2, 3]^T
+ // first clear with set, then setElement for indices 0..2
+
+ // 3) Create two result vectors y and z (dimension 3) and set to zero
+ // TODO 3: Create y and z with proper type
+
+ // 4) Use the built-in “plusTimes” semiring alias
+ // (add = plus, multiply = times, id‐add = 0.0, id-mul = 1.0)
+ auto plusTimes = grb::semirings::plusTimes();
+
+ // 5) Compute y = A·x (matrix‐vector multiply under plus‐times semiring)
+ // TODO 4: y = A·x (matrix-vector multiply under plusTimes) using mxv()
+
+ // 6) Compute z = x ⊙ y (element‐wise multiply) via eWiseMul with semiring
+ // TODO 5: z = x ⊙ y (element-wise multiply) using eWiseMul()
+
+ // 7) Compute dot_val = xᵀ·x (dot‐product under plus‐times semiring)
+ // TODO 6: dot_val = x^T x (dot-product under plusTimes) using dot()
+
+ // 8) Print x, y, z, and dot_val
+
+ return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/exercises/ex9/exercise9_starter.py b/exercises/ex9/exercise9_starter.py
new file mode 100644
index 0000000..7aebad0
--- /dev/null
+++ b/exercises/ex9/exercise9_starter.py
@@ -0,0 +1,90 @@
+"""
+Exercise 9 (starter): Python Conjugate Gradient with ALP/GraphBLAS backends.
+
+Goal:
+ - Build a small SPD system (A, b, x0) in NumPy
+ - Wrap into pyalp containers
+ - Run conjugate_gradient with two backends (pyalp_ref, pyalp_omp)
+ - Print iterations and residual (timing/tolerance sweep as stretch goals)
+
+Usage:
+ python exercises/ex9/exercise9_starter.py
+
+Requirements:
+ pip install alp-graphblas
+"""
+
+from __future__ import annotations
+import time
+import numpy as np
+
+try:
+ import pyalp # provided by alp-graphblas
+except Exception as e:
+ raise SystemExit(
+ "pyalp not found. Install with: pip install alp-graphblas\n"
+ f"Original import error: {e}"
+ )
+
+
+def make_spd(n: int, seed: int = 42, density: float = 1.0) -> np.ndarray:
+ """Construct a small SPD matrix. If density<1, sparsify by thresholding.
+ Returns a dense ndarray; we'll convert to COO triplets.
+ """
+ rng = np.random.default_rng(seed)
+ R = rng.standard_normal((n, n))
+ A = R.T @ R + 0.1 * np.eye(n)
+ if density < 1.0:
+ # Zero out small values to create sparsity (simple heuristic)
+ thresh = np.quantile(np.abs(A), 1.0 - density)
+ A[np.abs(A) < thresh] = 0.0
+ return A
+
+
+def to_coo(A: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+ I, J = np.nonzero(A)
+ V = A[I, J].astype(np.float64)
+ return I.astype(np.int32), J.astype(np.int32), V
+
+
+def run_once(backend_name: str, Acoo, b_np, x0_np, maxiters=2000, verbose=0):
+ backend = pyalp.get_backend(backend_name)
+ I, J, V = Acoo
+ n = b_np.shape[0]
+
+ # Create pyalp containers
+ A = backend.Matrix(n, n, I, J, V)
+ x = backend.Vector(n, x0_np.copy())
+ b = backend.Vector(n, b_np)
+ r = backend.Vector(n, np.zeros(n))
+ u = backend.Vector(n, np.zeros(n))
+ tmp = backend.Vector(n, np.zeros(n))
+
+ # TODO: add timing with time.perf_counter()
+ its, res = backend.conjugate_gradient(A, x, b, r, u, tmp, maxiters, verbose)
+ return its, float(res), x.to_numpy()
+
+
+def main():
+ n = 64
+ A = make_spd(n, density=1.0) # TODO: try density=0.2 to sparsify
+ I, J, V = to_coo(A)
+
+ b_np = np.ones(n, dtype=np.float64)
+ x0_np = np.zeros(n, dtype=np.float64)
+
+ # TODO: try both backends
+ backends = ["pyalp_ref", "pyalp_omp"] # adjust if only one is available
+
+ for name in backends:
+ try:
+ its, res, x_np = run_once(name, (I, J, V), b_np, x0_np)
+ print(f"{name:10s} | iterations={its:4d} | residual={res:.3e}")
+ except Exception as e:
+ print(f"Backend {name}: failed -> {e}")
+
+ # TODO (stretch): sweep tolerances like [1e-6, 1e-10] and compare iterations
+
+
+if __name__ == "__main__":
+ main()
diff --git a/exercises/ex9/run.sh b/exercises/ex9/run.sh
new file mode 100644
index 0000000..175c364
--- /dev/null
+++ b/exercises/ex9/run.sh
@@ -0,0 +1,44 @@
+# -------------------------------
+# CONFIGURATION
+# -------------------------------
+BASE_ENV="py311"
+WHEEL_DIR="$HOME/download"
+SCRIPT_PATH="$HOME/download/GitHub/ALP-Tutorial/exercises/ex9/exercise9_starter.py"
+PKG_NAME="alp_graphblas"
+
+# -------------------------------
+# CREATE UNIQUE TEMP ENV NAME
+# -------------------------------
+RAND_SUFFIX=$(tr -dc 'a-z0-9' < /dev/urandom | head -c 6)
+TMP_ENV="tmp_env_${RAND_SUFFIX}"
+
+echo ">>> Creating temporary environment: $TMP_ENV"
+
+# -------------------------------
+# CLONE BASE ENV
+# -------------------------------
+conda create --name "$TMP_ENV" --clone "$BASE_ENV" -y
+
+# -------------------------------
+# ACTIVATE TEMP ENV AND INSTALL WHEEL
+# -------------------------------
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate "$TMP_ENV"
+
+echo ">>> Installing wheel $PKG_NAME from $WHEEL_DIR"
+pip install --no-index --find-links "$WHEEL_DIR" "$PKG_NAME"
+
+# -------------------------------
+# RUN SCRIPT
+# -------------------------------
+echo ">>> Running script: $SCRIPT_PATH"
+python3 "$SCRIPT_PATH"
+
+# -------------------------------
+# CLEAN UP
+# -------------------------------
+echo ">>> Deactivating and deleting temporary environment: $TMP_ENV"
+conda deactivate
+conda remove -y --name "$TMP_ENV" --all
+
+echo ">>> Done."
diff --git a/main.tex b/main.tex
index 425d220..975b9b0 100644
--- a/main.tex
+++ b/main.tex
@@ -82,6 +82,8 @@
\input{ALP_Transition_Path_Tutorial.tex}
+\input{AALP}
+
\section*{Acknowledgements}
This document was compiled from \LaTeX{} source code using GitHub Actions and deployed to GitHub Pages.
@@ -91,3 +93,4 @@ \section*{Acknowledgements}
\small This document was generated on \today.
\end{document}
+
diff --git a/scripts/alp_hw.cpp b/scripts/alp_hw.cpp
new file mode 100644
index 0000000..776c196
--- /dev/null
+++ b/scripts/alp_hw.cpp
@@ -0,0 +1,27 @@
+
+#include <cassert>
+#include <cstring>
+#include <iostream>
+#include <graphblas.hpp>
+
+constexpr size_t max_fn_size = 255;
+typedef char Filename[ max_fn_size ];
+
+void hello_world( const Filename &in, int &out ) {
+ std::cout << "Hello from " << in << std::endl;
+ out = 0;
+}
+
+int main( int argc, char ** argv ) {
+ // get input
+ Filename fn;
+ (void) std::strncpy( fn, argv[ 0 ], max_fn_size );
+ // set up output field
+ int error_code = 100;
+ // launch hello world program
+ grb::Launcher< grb::AUTOMATIC > launcher;
+ assert( launcher.exec( &hello_world, fn, error_code, true ) == grb::SUCCESS );
+ // return with the hello_world error code
+ return error_code;
+}
+
diff --git a/scripts/sp.cpp b/scripts/sp.cpp
new file mode 100644
index 0000000..47db3e2
--- /dev/null
+++ b/scripts/sp.cpp
@@ -0,0 +1,257 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * A practical graph corresponding to a 5 by 5 matrix with 10 nonzeroes.
+ *
+ * -# Flight prices correspond to the cheapest round trip price with departure
+ * on 1/10/2016 and return on 8/10/2016 when booked on 10/8/2016 according to
+ * Google Flights.
+ * -# Distances are as determined by Google Maps.
+ * -# All edges are directed.
+ *
+ * @author: A. N. Yzelman
+ * @date: 11th August, 2016.
+ */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include <graphblas.hpp>
+
+using namespace grb;
+
+//! [Example Data]
+static const char * const vertex_ids[ 5 ] = { "Shenzhen", "Hong Kong", "Santa Clara", "London", "Paris" };
+
+static const double distances[ 10 ] = { 8.628, 8.964, 11.148, .334, 9.606, 9.610, .017, .334, .017, .334 };
+static const int price[ 10 ] = { 723, 956, 600, 85, 468, 457, 333, 85, 50, 150 };
+static const double timeliness[ 10 ] = { 0.9, 0.7, 0.99, 0.9, 0.9, 0.7, 0.99, 0.7, .99, 0.99 };
+static const std::string mode[ 10 ] = { "air", "air", "air", "air", "air", "air", "air", "air", "land", "land" };
+
+static const size_t I[ 10 ] = { 3, 4, 2, 3, 3, 4, 1, 4, 1, 4 };
+static const size_t J[ 10 ] = { 2, 2, 1, 4, 1, 1, 0, 3, 0, 3 };
+//! [Example Data]
+
+//! [Example function taking arbitrary semirings]
+template< typename Ring >
+grb::Vector< typename Ring::D4 >
+shortest_path( const grb::Matrix< typename Ring::D2 > & A, const grb::Vector< typename Ring::D1 > & initial_state, const size_t hops = 1, const Ring & ring = Ring() ) {
+ const size_t size = grb::size( initial_state );
+ grb::Vector< typename Ring::D4 > ret( size );
+ grb::Vector< typename Ring::D4 > new_state( size );
+ grb::set( ret, initial_state );
+ vxm( ret, initial_state, A, ring );
+ for( size_t i = 1; i < hops; ++i ) {
+ grb::set( new_state, ret );
+ vxm( ret, new_state, A, ring );
+ }
+ return ret;
+}
+//! [Example function taking arbitrary semirings]
+
+int main( int argc, char ** argv ) {
+ (void)argc;
+ (void)printf( "Illustration executable: %s\n\n", argv[ 0 ] );
+
+ (void)printf( "This is not a functional or performance test, but rather an illustration of some of the GraphBLAS usefulness.\n\n" );
+
+ (void)printf( "Create distance graph as a 5 x 5 matrix with 10 nonzeroes:\n"
+ "-->grb::Matrix< double > dist( 5, 5 );\n" );
+ //! [Example matrix allocation]
+ grb::Matrix< double > dist( 5, 5 );
+ resize( dist, 10 );
+ //! [Example matrix allocation]
+
+ (void)printf( "Load distance graph:\n"
+ "-->dist.buildMatrixUnique( dist, &(I[0]), &(J[0]), distances, 10 "
+ ");\n" );
+ //! [Example matrix assignment]
+ buildMatrixUnique( dist, &( I[ 0 ] ), &( J[ 0 ] ), distances, 10, SEQUENTIAL );
+ //! [Example matrix assignment]
+
+ (void)printf( "Create new vectors x and y:\n"
+ "-->grb::Vector< int > x( 5 );\n"
+ "-->grb::Vector< int > y( 5 );\n" );
+ //! [Example vector allocation]
+ grb::Vector< double > x( 5 );
+ grb::Vector< double > y( 5 );
+ //! [Example vector allocation]
+
+ (void)printf( "The five vertices stand for the following cities:\n" );
+ for( size_t i = 0; i < 5; ++i ) {
+ (void)printf( "--> city %zd: %s\n", i, vertex_ids[ i ] );
+ }
+ (void)printf( "Let us calculate which cities are reachable from %s by taking one "
+ "air or land route:\n-->"
+ "x.set( INFINITY );\n-->"
+ "x.setElement( 0, 4 );\n-->"
+ "y.set( x );\n-->"
+ "typedef grb::Semiring< grb::operators::min< double >, "
+ "grb::operators::add< double >, grb::identities::infinity, "
+ "grb::identities::zero > shortest_path_double;\n-->"
+ "vxm( y, x, dist, shortest_path_double );\n",
+ vertex_ids[ 4 ] );
+ //! [Example vector assignment]
+ grb::set( x, INFINITY );
+ grb::setElement( x, 0.0, 4 );
+ grb::set( y, x );
+ //! [Example vector assignment]
+ //! [Example semiring definition]
+ grb::Semiring< grb::operators::min< double >, grb::operators::add< double >, grb::identities::infinity, grb::identities::zero > shortest_path_double;
+ //! [Example semiring definition]
+ //! [Example semiring use: sparse vector times matrix multiplication]
+ grb::vxm( y, x, dist, shortest_path_double );
+ //! [Example semiring use: sparse vector times matrix multiplication]
+ (void)printf( "We can reach the following cities within one trip:\n" );
+ for( const std::pair< size_t, double > & pair : y ) {
+ const double val = pair.second;
+ if( val < INFINITY ) {
+ (void)printf( "--> %s at distance %lf thousand kilometres.\n", vertex_ids[ pair.first ], val );
+ }
+ }
+
+ (void)printf( "Let us calculate which cities we can reach after one more trip. "
+ "To do this, we first copy y into x, thus effectively computing "
+ "y=A(Ax).\n"
+ "-->grb::set( x, y );\n"
+ "-->grb::vxm( y, x, dist, shortest_path_double );\n" );
+ grb::set( x, y );
+ grb::operators::add< double, double, double > add_operator;
+ grb::vxm( y, x, dist, shortest_path_double );
+ (void)printf( "We can reach the following cities within two trips:\n" );
+ for( const std::pair< size_t, double > & pair : y ) {
+ const double val = pair.second;
+ if( val < INFINITY ) {
+ (void)printf( "--> %s at distance %lf\n", vertex_ids[ pair.first ], val );
+ }
+ }
+
+ (void)printf( "We put the above in a templated function so we can call the same "
+ "shortest-paths calculation on different input and using different "
+ "semirings:\n"
+ "-->template< typename ring >\n"
+ "-->grb::Vector< typename ring::D4 > shortest_path( const "
+ "grb::Matrix< typename ring::D2 > &A, const grb::Vector< typename "
+ "ring::D1 > &initial_state, const size_t n, const size_t hops = 1 "
+ ") {\n"
+ "--> grb::Vector< typename ring::D4 > ret( n );\n"
+ "--> grb::set( ret, initial_state );\n"
+ "--> grb::vxm( ret, initial_state, A, ring );\n"
+ "--> for( size_t i = 1; i < hops; ++i ) {\n"
+ "--> grb::Vector< typename ring::D4 > new_state( n );\n"
+ "--> grb::set( new_state, ret );\n"
+ "--> grb::vxm( ret, new_state, A, ring );\n"
+ "--> }\n"
+ "--> return ret;\n"
+ "-->}\n" );
+
+ (void)printf( "Now let us calculate the price of flying instead of the distance. "
+ "The price is in Euros so now we use integers instead of doubles, "
+ "resulting in different domains the semiring which otherwise "
+ "remains identical:\n"
+ "-->typedef grb::Semiring< grb::operators::min< int >, "
+ "grb::operators::add< int >, grb::identities::infinity, "
+ "grb::identities::zero > shortest_path_ints;\n" );
+
+ typedef grb::Semiring< grb::operators::min< int >, grb::operators::add< int >, grb::identities::infinity, grb::identities::zero > shortest_path_ints;
+ (void)printf( "We continue in one go:"
+ "-->grb::Matrix< int > prices( 5, 5 );\n"
+ "-->grb::Vector< int > initial_trip_price( 5 );\n"
+ "-->buildMatrixUnique( prices, &(I[0]), &(J[0]), price, 10 );\n"
+ "-->initial_trip_price.set( 9999 ); //all prices initially unknown. Integers have no infinite, however, so just pick a big number that doesn't overflow)\n"
+ "-->initial_trip_price.setElement( 0, 4 ); //except that of our start position, which is free\n"
+ "-->grb::Vector< int > trip_prices = shortest_path< shortest_path_ints >( prices, initial_trip_price, 2 );\n" );
+ grb::Matrix< int > prices( 5, 5 );
+ resize( prices, 10 );
+ grb::Vector< int > initial_trip_price( 5 );
+ buildMatrixUnique( prices, &( I[ 0 ] ), &( J[ 0 ] ), price, 10, SEQUENTIAL );
+ grb::set( initial_trip_price, 9999 ); // all prices initially unknown. Integers have no infinite, however, so just pick a big number (that doesn't overflow)
+ grb::setElement( initial_trip_price, 0, 4 ); // except that of our start position, which is free
+ //! [Example function call while passing a semiring]
+ grb::Vector< int > trip_prices = shortest_path< shortest_path_ints >( prices, initial_trip_price, 2 );
+ //! [Example function call while passing a semiring]
+
+ (void)printf( "We can go from Paris to the following cities, within two separate trips:\n" );
+ for( const std::pair< size_t, int > & pair : trip_prices ) {
+ const size_t i = pair.first;
+ const int val = pair.second;
+ if( val < INFINITY ) {
+ (void)printf( "--> %s at cost %d\n", vertex_ids[ i ], val );
+ }
+ }
+
+ (void)printf( "We might also be interested in the probability we will arrive on "
+ "time. Instead of distances or prices, we now assign probabilities "
+ "to the edges; e.g., flights from Santa Clara to Hong Kong have a "
+ "`timeliness' of 0.99, meaning that with 99 percent certainty, the "
+ "flight will be on time.\n" );
+ (void)printf( "For the sake of example, we count flights going out from Paris as "
+ "having only a 70 percent probability of being on time due to "
+ "strikes, while flights going out of Heathrow London are slightly "
+ "more often late, at 90 percent. Trains between London and Paris "
+ "run at .99 timeliness.\n" );
+ (void)printf( "We can now compute the best combination of trip legs in terms of "
+ "timeliness when using the following semiring:\n" );
+ (void)printf( "-->typedef grb::Semiring< grb::operators::max< double >, "
+ "grb::operators::mul< double >, grb::identities::negative_infinity, "
+ "grb::identities::one > mul_max_double;\n" );
+ typedef grb::Semiring< grb::operators::max< double >, grb::operators::mul< double >, grb::identities::negative_infinity, grb::identities::one > mul_max_double;
+ (void)printf( "Let us use this semi-ring:\n"
+ "-->grb::Matrix< double > T( 5, 5 );\n"
+ "-->buildMatrixUnique( T, &(I[0]), &(J[0]), timeliness, 10 );\n"
+ "-->grb::Vector< double > initial_timeliness( 5 );\n"
+ "-->initial_timeliness.set( 0.0 );\n"
+ "-->initial_timeliness.setElement( 1.0, 4 );\n"
+ "-->grb::Vector< int > trip_timeliness = shortest_path< "
+ "mul_max_double >( T, initial_timeliness, 2 );\n" );
+ //! [Example shortest-paths with semiring adapted to find the most reliable route instead]
+ grb::Matrix< double > T( 5, 5 );
+ resize( T, 10 );
+ buildMatrixUnique( T, &( I[ 0 ] ), &( J[ 0 ] ), timeliness, 10, SEQUENTIAL );
+ grb::Vector< double > initial_timeliness( 5 );
+ grb::set( initial_timeliness, 0.0 );
+ grb::setElement( initial_timeliness, 1.0, 4 );
+ const grb::Vector< double > trip_timeliness2 = shortest_path< mul_max_double >( T, initial_timeliness, 2 );
+
+ (void)printf( "If we take a maximum of two separate trips, we can go from Paris "
+ "to the following cities timeliness as follows:\n" );
+ for( const std::pair< size_t, double > & pair : trip_timeliness2 ) {
+ const size_t i = pair.first;
+ const double val = pair.second;
+ if( val > 0 ) {
+ (void)printf( "--> %s with %lf percent probability of arriving on time\n", vertex_ids[ i ], val * 100.0 );
+ }
+ }
+ //! [Example shortest-paths with semiring adapted to find the most reliable route instead]
+
+ (void)printf( "If we allow a maximum of three separate trips, however, the "
+ "probability of us arriving in Shenzhen increases "
+ "dramatically:\n" );
+ const grb::Vector< double > trip_timeliness3 = shortest_path< mul_max_double >( T, initial_timeliness, 3 );
+ for( const std::pair< size_t, double > & pair : trip_timeliness3 ) {
+ const size_t i = pair.first;
+ const double val = pair.second;
+ if( val > 0 ) {
+ (void)printf( "--> %s with %lf percent probability of arriving on time\n", vertex_ids[ i ], val * 100.0 );
+ }
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/scripts/spmd.cpp b/scripts/spmd.cpp
new file mode 100644
index 0000000..a5d78fe
--- /dev/null
+++ b/scripts/spmd.cpp
@@ -0,0 +1,17 @@
+#include <iostream>
+#include <graphblas.hpp>
+
+void grb_program( const size_t &data_in , size_t &data_out ){
+ const size_t s = grb::spmd<>::pid();
+ std :: cerr << "Hello from process " << s << std :: endl; // printed by each process
+ data_out = 69;
+}
+
+int main ( int argc , char ** argv ) {
+ size_t data_in = 42 , data_out;
+
+ std::cerr << "Starting... " << std::endl ; // printed only once
+ grb::Launcher< grb::AUTOMATIC > launcher;
+ launcher.exec( &grb_program, data_in, data_out, true );
+ std::cerr << "Finishing: data_out is " << data_out << std::endl ; // printed once
+}
diff --git a/solutions/Makefile b/solutions/Makefile
new file mode 100644
index 0000000..2d0a0e3
--- /dev/null
+++ b/solutions/Makefile
@@ -0,0 +1,24 @@
+
+## Minimal Makefile for Exercise 8
+## export ALP_INSTALL_DIR to point to the proper location.
+
+ALP_INSTALL_DIR ?= $(HOME)/Repos/graphblas/install
+
+CXX := g++
+SRC := ex8/exercise8.cpp
+BIN := exercise8.x
+
+
+.PHONY: all run clean
+
+all: $(BIN)
+
+$(BIN): $(SRC)
+ $(CXX) -std=c++14 $< -o $@ -I$(ALP_INSTALL_DIR)/include -D_GRB_WITH_REFERENCE -D_GRB_WITH_OMP -L$(ALP_INSTALL_DIR)/lib/sequential -lgraphblas -lnuma -lpthread -lm -fopenmp
+
+run: $(BIN)
+ LD_LIBRARY_PATH=$$LD_LIBRARY_PATH:$(ALP_INSTALL_DIR)/lib/sequential ./$(BIN)
+
+clean:
+ rm -f $(BIN)
+
diff --git a/solutions/ex8/exercise8.cpp b/solutions/ex8/exercise8.cpp
new file mode 100644
index 0000000..430e215
--- /dev/null
+++ b/solutions/ex8/exercise8.cpp
@@ -0,0 +1,140 @@
+
+/*
+ * minimal ALP (GraphBLAS) example.
+ *
+ * To compile (using the reference OpenMP backend):
+ * grbcxx -b reference_omp example.cpp -o example
+ *
+ * To run:
+ * grbrun ./example
+ */
+
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+#include <utility> // for std::pair
+#include <iostream>
+
+#include <graphblas.hpp>
+
+using namespace grb;
+
+// Indices and values for our sparse 3x3 matrix A:
+//
+// A = [ 1 0 2 ]
+// [ 0 3 4 ]
+// [ 5 6 0 ]
+//
+// We store the nonzero entries via buildMatrixUnique.
+static const size_t Iidx[6] = { 0, 0, 1, 1, 2, 2 }; // row indices
+static const size_t Jidx[6] = { 0, 2, 1, 2, 0, 1 }; // column indices
+static const double Avalues[6] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 };
+
+int main( int argc, char **argv ) {
+ (void)argc;
+ (void)argv;
+ std::printf("example (ALP/GraphBLAS) corrected API usage\n\n");
+
+ //------------------------------
+ // 1) Create a 3x3 sparse matrix A
+ //------------------------------
+ std::printf("Step 1: Constructing a 3x3 sparse matrix A.\n");
+ Matrix< double > A(3, 3);
+ // Reserve space for 6 nonzeros
+ resize(A, 6);
+ // Populate A from (Iidx,Jidx,Avalues)
+ buildMatrixUnique( A, &(Iidx[0]), &(Jidx[0]), Avalues, /* nvals = */ 6, SEQUENTIAL );
+
+ //------------------------------
+ // 2) Create a 3-element vector x and initialize x = [1, 2, 3]^T
+ //------------------------------
+ std::printf("Step 2: Creating vector x = [1, 2, 3]^T.\n");
+ Vector< double > x(3);
+ set(x, 0.0); // zero-out
+ setElement(x, 1.0, 0); // x(0) = 1.0
+ setElement(x, 2.0, 1); // x(1) = 2.0
+ setElement(x, 3.0, 2); // x(2) = 3.0
+ // note: setElement() supports vectors only
+ // set() support for matrix is work in progress
+
+ //------------------------------
+ // 3) Create two result vectors y and z (dimension 3)
+ //------------------------------
+ Vector< double > y(3), z(3);
+ set(y, 0.0);
+ set(z, 0.0);
+
+ //------------------------------
+ // 4) Use the built-in “plusTimes” semiring alias
+ // (add = plus, multiply = times, id‐add = 0.0, id-mul = 1.0)
+ //------------------------------
+ auto plusTimes = grb::semirings::plusTimes< double >();
+
+ //------------------------------
+ // 5) Compute y = A·x (matrix‐vector multiply under plus‐times)
+ //------------------------------
+ std::printf("Step 3: Computing y = A·x under plus‐times semiring.\n");
+ {
+ RC rc = mxv(y, A, x, plusTimes);
+ if(rc != SUCCESS) {
+ std::cerr << "Error: mxv(y,A,x) failed with code " << toString(rc) << "\n";
+ return (int)rc;
+ }
+ }
+
+ //------------------------------
+ // 6) Compute z = x ⊙ y (element‐wise multiply) via eWiseMul with semiring
+ //------------------------------
+ std::printf("Step 4: Computing z = x ⊙ y (element‐wise multiply).\n");
+ {
+ RC rc = eWiseMul(
+ z, x, y, plusTimes
+ );
+ if(rc != SUCCESS) {
+ std::cerr << "Error: eWiseMul(z,x,y,plusTimes) failed with code " << toString(rc) << "\n";
+ return (int)rc;
+ }
+ }
+
+ //------------------------------
+ // 7) Compute dot_val = xᵀ·x (dot‐product under plus‐times semiring)
+ //------------------------------
+ std::printf("Step 5: Computing dot_val = xᵀ·x under plus‐times semiring.\n");
+ double dot_val = 0.0;
+ {
+ RC rc = dot(dot_val, x, x, plusTimes);
+ if(rc != SUCCESS) {
+ std::cerr << "Error: dot(x,x) failed with code " << toString(rc) << "\n";
+ return (int)rc;
+ }
+ }
+
+ //------------------------------
+ // 8) Print x, y, z, and dot_val
+ // We reconstruct each full 3 - vector by filling an std::array<3,double>
+ //------------------------------
+ auto printVector = [&](const Vector< double > &v, const std::string &name) {
+ // Initialize all entries to zero
+ std::array< double, 3 > arr = { 0.0, 0.0, 0.0 };
+ // Overwrite stored (nonzero) entries
+ for(const auto &pair : v) {
+ // pair.first = index, pair.second = value
+ arr[pair.first] = pair.second;
+ }
+ // Print
+ std::printf("%s = [ ", name.c_str());
+ for(size_t i = 0; i < 3; ++i) {
+ std::printf("%g", arr[i]);
+ if(i + 1 < 3) std::printf(", ");
+ }
+ std::printf(" ]\n");
+ };
+
+ std::printf("\n-- Results --\n");
+ printVector(x, "x");
+ printVector(y, "y = A·x");
+ printVector(z, "z = x ⊙ y");
+ std::printf("dot(x,x) = %g\n\n", dot_val);
+
+ return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/solutions/ex9/exercise9.py b/solutions/ex9/exercise9.py
new file mode 100644
index 0000000..2574ff6
--- /dev/null
+++ b/solutions/ex9/exercise9.py
@@ -0,0 +1,104 @@
+"""
+Exercise 9 (solution): Python Conjugate Gradient with ALP/GraphBLAS backends.
+
+Features:
+ - SPD system generation (size n)
+ - Backend loop (pyalp_ref, pyalp_omp)
+ - Timing with time.perf_counter()
+ - Optional tolerance sweep (re-run using different tolerances if supported)
+
+Usage:
+ python solutions/ex9/exercise9.py [--n 64] [--density 1.0] [--tol 1e-8 1e-10]
+
+Requirements:
+ pip install alp-graphblas
+ or from downloaded wheel/package
+ pip install --no-index --find-links "$WHEEL_DIR" alp_graphblas
+"""
+
+from __future__ import annotations
+import argparse
+import time
+import numpy as np
+
+try:
+ import pyalp # provided by alp-graphblas
+except Exception as e:
+ raise SystemExit(
+ "pyalp not found. Install with: pip install alp-graphblas\n"
+ f"Original import error: {e}"
+ )
+
+
+def make_spd(n: int, seed: int = 42, density: float = 1.0) -> np.ndarray:
+ """Construct a small SPD matrix. If density<1, sparsify by thresholding."""
+ rng = np.random.default_rng(seed)
+ R = rng.standard_normal((n, n))
+ A = R.T @ R + 0.1 * np.eye(n)
+ if density < 1.0:
+ thresh = np.quantile(np.abs(A), 1.0 - density)
+ A[np.abs(A) < thresh] = 0.0
+ return A
+
+
+def to_coo(A: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+ I, J = np.nonzero(A)
+ V = A[I, J].astype(np.float64)
+ return I.astype(np.int32), J.astype(np.int32), V
+
+
+def conjugate_gradient_once(backend_name: str, Acoo, b_np, x0_np,
+ maxiters: int, verbose: int) -> tuple[int, float, float]:
+ backend = pyalp.get_backend(backend_name)
+ I, J, V = Acoo
+ n = b_np.shape[0]
+ A = backend.Matrix(n, n, I, J, V)
+ x = backend.Vector(n, x0_np.copy())
+ b = backend.Vector(n, b_np)
+ r = backend.Vector(n, np.zeros(n))
+ u = backend.Vector(n, np.zeros(n))
+ tmp = backend.Vector(n, np.zeros(n))
+
+ t0 = time.perf_counter()
+ its, res = backend.conjugate_gradient(A, x, b, r, u, tmp, maxiters, verbose)
+ dt = time.perf_counter() - t0
+ return its, float(res), dt
+
+
+def main():
+ ap = argparse.ArgumentParser()
+ ap.add_argument("--n", type=int, default=64, help="problem size (n x n)")
+ ap.add_argument("--density", type=float, default=1.0, help="target density (0..1)")
+ ap.add_argument("--tol", type=float, nargs="*", default=[1e-8], help="tolerances to explore")
+ ap.add_argument("--backends", type=str, nargs="*", default=["pyalp_ref", "pyalp_omp"],
+ help="backend names to try")
+ ap.add_argument("--maxiters", type=int, default=2000)
+ ap.add_argument("--verbose", type=int, default=0)
+ args = ap.parse_args()
+
+ n = args.n
+ A = make_spd(n, density=args.density)
+ I, J, V = to_coo(A)
+
+ b_np = np.ones(n, dtype=np.float64)
+ x0_np = np.zeros(n, dtype=np.float64)
+
+ print(f"Problem: n={n}, nnz={V.size}, density~{args.density}")
+ print("backend tol its residual time [ms]")
+ print("-----------------------------------------------------")
+
+ for tol in args.tol:
+ # Some backends bind tol at compile-time; if runtime tol not exposed,
+ # you can still compare iterations/time on the default tol.
+ for name in args.backends:
+ try:
+ its, res, dt = conjugate_gradient_once(
+ name, (I, J, V), b_np, x0_np, args.maxiters, args.verbose
+ )
+ print(f"{name:10s} {tol:8.1e} {its:5d} {res:10.3e} {dt*1e3:10.1f}")
+ except Exception as e:
+ print(f"{name:10s} {tol:8.1e} fail -- -- ({e})")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/solutions/ex9/run.sh b/solutions/ex9/run.sh
new file mode 100644
index 0000000..ea321ee
--- /dev/null
+++ b/solutions/ex9/run.sh
@@ -0,0 +1,44 @@
+# -------------------------------
+# CONFIGURATION
+# -------------------------------
+BASE_ENV="py311"
+WHEEL_DIR="$HOME/download"
+SCRIPT_PATH="$HOME/download/GitHub/ALP-Tutorial/solutions/ex9/exercise9.py"
+PKG_NAME="alp_graphblas"
+
+# -------------------------------
+# CREATE UNIQUE TEMP ENV NAME
+# -------------------------------
+RAND_SUFFIX=$(tr -dc 'a-z0-9' < /dev/urandom | head -c 6)
+TMP_ENV="tmp_env_${RAND_SUFFIX}"
+
+echo ">>> Creating temporary environment: $TMP_ENV"
+
+# -------------------------------
+# CLONE BASE ENV
+# -------------------------------
+conda create --name "$TMP_ENV" --clone "$BASE_ENV" -y
+
+# -------------------------------
+# ACTIVATE TEMP ENV AND INSTALL WHEEL
+# -------------------------------
+source "$(conda info --base)/etc/profile.d/conda.sh"
+conda activate "$TMP_ENV"
+
+echo ">>> Installing wheel $PKG_NAME from $WHEEL_DIR"
+pip install --no-index --find-links "$WHEEL_DIR" "$PKG_NAME"
+
+# -------------------------------
+# RUN SCRIPT
+# -------------------------------
+echo ">>> Running script: $SCRIPT_PATH"
+python3 "$SCRIPT_PATH"
+
+# -------------------------------
+# CLEAN UP
+# -------------------------------
+echo ">>> Deactivating and deleting temporary environment: $TMP_ENV"
+conda deactivate
+conda remove -y --name "$TMP_ENV" --all
+
+echo ">>> Done."