From 33527d566ed9999d03afb4f3e467f2210910ec46 Mon Sep 17 00:00:00 2001 From: Remco de Boer <29308176+redeboer@users.noreply.github.com> Date: Fri, 22 Mar 2024 21:18:52 +0100 Subject: [PATCH] DOC: add page about symbolic amplitude models (#238) Co-authored-by: Miriam Fritsch <67281455+miriamfritsch@users.noreply.github.com> --- .cspell.json | 11 + docs/conf.py | 8 + docs/index.md | 23 +- docs/report/024.ipynb | 2 +- docs/symbolics.ipynb | 875 ++++++++++++++++++++++++++++++++++++++++++ environment.yml | 2 +- 6 files changed, 908 insertions(+), 13 deletions(-) create mode 100644 docs/symbolics.ipynb diff --git a/.cspell.json b/.cspell.json index 1dc6a40f..258f6c85 100644 --- a/.cspell.json +++ b/.cspell.json @@ -61,8 +61,10 @@ "autograd", "blatt", "breit", + "chromodynamics", "compwa", "conda", + "Curvenote", "Dalitz", "deadsnakes", "defaultdict", @@ -84,12 +86,14 @@ "lambdifying", "LHCb", "lineshape", + "Mathematica", "MathML", "matplotlib", "Mikhasenko", "miniconda", "mkdir", "mypy", + "Numba", "numpy", "parametrizations", "pathlib", @@ -99,6 +103,7 @@ "pytest", "PYTHONHASHSEED", "qrules", + "Reana", "roadmap", "Scikit", "scipy", @@ -108,6 +113,7 @@ "tensorwaves", "textwrap", "toolkits", + "TPUs", "traceback", "unbinned", "unitarity", @@ -163,6 +169,7 @@ "coolwarm", "csqrt", "cstride", + "cxxcode", "dalitzplot", "darkred", "dasharray", @@ -182,6 +189,7 @@ "expertsystem", "facecolor", "facecolors", + "fcode", "figsize", "filterwarnings", "fontcolor", @@ -258,6 +266,7 @@ "ndarray", "nonlocal", "nonumber", + "nopython", "noqa", "noreply", "nrows", @@ -293,6 +302,7 @@ "relim", "repr", "richman", + "royalblue", "rpartition", "rstride", "rstrip", @@ -332,6 +342,7 @@ "tickvals", "timeit", "toctree", + "toprettyxml", "tqdm", "treewise", "unevaluatable", diff --git a/docs/conf.py b/docs/conf.py index 20098f04..e962eff9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -113,6 +113,7 @@ def get_nb_exclusion_patterns() -> list[str]: "sphinx.ext.intersphinx", "sphinx.ext.mathjax", "sphinx.ext.napoleon", + "sphinx.ext.todo", "sphinx_api_relink", "sphinx_codeautolink", "sphinx_comments", @@ -126,6 +127,9 @@ def get_nb_exclusion_patterns() -> list[str]: "sphinxcontrib.bibtex", ] graphviz_output_format = "svg" +html_css_files = [ + "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.1.1/css/all.min.css", +] html_favicon = "_static/favicon.ico" html_js_files = [ "https://cdn.datatables.net/1.13.6/js/jquery.dataTables.min.js", @@ -186,6 +190,7 @@ def get_nb_exclusion_patterns() -> list[str]: f"https://mpl-interactions.readthedocs.io/en/{pin('mpl-interactions')}", None, ), + "numba": ("https://numba.pydata.org/numba-doc/latest", None), "numpy": (f"https://numpy.org/doc/{pin_minor('numpy')}", None), "plotly": ("https://plotly.com/python-api-reference/", None), "pwa": ("https://pwa.readthedocs.io", None), @@ -195,6 +200,7 @@ def get_nb_exclusion_patterns() -> list[str]: "scipy": ("https://docs.scipy.org/doc/scipy-1.7.0", None), "sympy": ("https://docs.sympy.org/latest", None), "tensorwaves": ("https://tensorwaves.readthedocs.io/stable", None), + "torch": ("https://pytorch.org/docs/stable", None), "zfit": ("https://zfit.readthedocs.io/en/latest", None), } linkcheck_anchors = False @@ -211,6 +217,7 @@ def get_nb_exclusion_patterns() -> list[str]: "https://mybinder.org", # often instable "https://open.vscode.dev", "https://rosettacode.org", + "https://stackoverflow.com", "https://via.placeholder.com", # irregular timeout "https://www.andiamo.co.uk/resources/iso-language-codes", # 443, but works 
"https://www.bookfinder.com", @@ -264,3 +271,4 @@ def get_nb_exclusion_patterns() -> list[str]: "repository_url": html_theme_options["repository_url"], "repository_branch": html_theme_options["repository_branch"], } +todo_include_todos = True diff --git a/docs/index.md b/docs/index.md index df269337..2a8c273c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,21 +8,21 @@ {{ '[![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ComPWA/compwa.github.io/blob/{})'.format(branch) }} {{ '[![Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ComPWA/compwa.github.io/{}?filepath=docs/usage)'.format(branch) }} -The ["Common Partial Wave Analysis"](https://github.com/ComPWA) organization (ComPWA) -aims to make amplitude analysis accessible through transparent and interactive -documentation, modern software development tools, and collaboration-independent -frameworks. **Contact details** can be found [here](https://github.com/ComPWA). +The ["Common Partial Wave Analysis"](https://github.com/ComPWA) organization (ComPWA) aims to make amplitude analysis accessible through transparent and interactive documentation, modern software development tools, and collaboration-independent frameworks. One major novelty is that we [formulate amplitude models symbolically](./symbolics.ipynb) with a Computer Algebra System, which results in a **self-documenting workflow** with high-performance, **backend-agnostic computations**. + +Contact details can be found [here](https://github.com/ComPWA) on our GitHub organization page. + +:::{card} {material-outlined}`calculate;1.5em` Symbolic amplitude models +:link: symbolics +:link-type: doc +Read more about computations with symbolic amplitude models here. +::: ## Main projects -ComPWA maintains **three main Python packages** with which you can do a full partial -wave analysis. The packages are designed as **libraries**, so that they can be used -separately by other projects. +ComPWA maintains **three main Python packages** with which you can do a full partial wave analysis. The packages are designed as **libraries**, so that they can be used separately by other projects. -Each of these libraries come with **interactive and interlinked documentation** that is -intended to bring theory and code closer together. The PWA Pages takes that one step -further: it is an independent and easy-to-maintain documentation project that can serve -as a central place to gather links to PWA theory and software. +Each of these libraries come with **interactive and interlinked documentation** that is intended to bring theory and code closer together. The PWA Pages takes that one step further: it is an independent and easy-to-maintain documentation project that can serve as a central place to gather links to PWA theory and software. ::::{grid} 1 2 2 2 @@ -253,6 +253,7 @@ more about our ideals and ongoing projects on the {doc}`main` page. caption: Resources hidden: --- +symbolics develop adr reports diff --git a/docs/report/024.ipynb b/docs/report/024.ipynb index 188e7f4b..3174820e 100644 --- a/docs/report/024.ipynb +++ b/docs/report/024.ipynb @@ -48,7 +48,7 @@ ":::{card} Symbolic expressions and model serialization\n", "TR-024\n", "^^^\n", - "\n", + "\n", "Investigation into dumping SymPy expressions to human-readable format for model preservation. The notebook was motivated by the [COMAP-V workshop on analysis preservation](https://indico.cern.ch/event/1348003/). 
See also SymPy [printing](https://docs.sympy.org/latest/modules/printing.html), [parsing](https://docs.sympy.org/latest/modules/parsing.html), and [expression manipulation](https://docs.sympy.org/latest/tutorials/intro-tutorial/manipulation.html).\n", "+++\n", "🚧 [polarimetry#319](https://github.com/ComPWA/polarimetry/pull/319)\n", diff --git a/docs/symbolics.ipynb b/docs/symbolics.ipynb new file mode 100644 index 00000000..1b38b1bd --- /dev/null +++ b/docs/symbolics.ipynb @@ -0,0 +1,875 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{autolink-concat}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Symbolic amplitude models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "%pip install -q black==24.2.0 sympy==1.12" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Amplitude analysis is a method that is used intensively in particle and hadron physics experiments. It allows us to describe the intensity distributions obtained from the experiments with the use of amplitude models. These allow us to extract parameters about intermediate states appearing in the scattering processes, which are governed by the electroweak force and the strong force.\n", + "\n", + "The complicated nature of the strong force, described by Quantum Chromodynamics, makes it difficult to derive intensity models from first principles. Instead, we have to rely on approximations given specific assumptions for the scattering process that we study. Each amplitude model that we formulate is almost always merely an approximation of the true scattering process. As a consequence, we always have to reassess our analysis results and try alternative models. In addition, amplitude models can be extremely complicated, with large, complex-valued parametrizations and dozens of input parameters. We therefore want to evaluate these models with as much information as possible.
That means large input data samples and 'fits' using the full likelihood function, which provides us with a multidimensional description of the data by using event-based, unbinned fit methods.\n", + "\n", + "Given these challenges, we can identify **three major requirements that amplitude analysis software should satisfy**:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ":::{card} {material-regular}`speed` Performance\n", + ":link: performance\n", + ":link-type: ref\n", + "We want to evaluate likelihood functions as fast as possible over large data samples, so that we can optimize our model parameters by testing several hypotheses in due time.\n", + ":::\n", + "\n", + ":::{card} {material-regular}`draw` Flexibility\n", + ":link: flexibility\n", + ":link-type: ref\n", + "We want to quickly formulate a wide range of amplitude models, given the latest theoretical and experimental insights.\n", + ":::\n", + "\n", + ":::{card} {material-regular}`school` Transparency\n", + ":link: transparency\n", + ":link-type: ref\n", + "It should be easy to inspect the implemented amplitude models, ideally by using mathematical formulas, so that the analysis can easily be reproduced or compared to results from other experiments, tools, or theoretical models.\n", + ":::\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(performance)=\n", + "## {material-regular}`speed` Performance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(array-oriented)=\n", + "### Array-oriented programming" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Even though Python is a popular programming language for data science, it is too slow for performing computations over large data samples. Computations in Python programs are therefore almost always outsourced to third-party Python libraries that are written in C++ or other compiled languages. This leads to an **array-oriented programming** style. Variables represent multidimensional arrays and the computational backend performs the element-wise operations behind the scenes. This has the additional benefit that the higher-level Python code becomes more readable.\n", + "\n", + "In the following example, we have two data samples $a$ and $b$, each containing a million data points, and we want to compute $c_i=a_i+b_i^2$ for each of these data points $i$. For simplicity, we set both $a$ and $b$ to be `[0, 1, 2, ..., 999_999]`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_lst = list(range(1_000_000))\n", + "b_lst = list(range(1_000_000))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "#### Pure Python loop" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Naively, one could compute $c$ for each data point by creating a list and filling it with $c_i = a_i+b_i^2$."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "%%timeit\n", + "c_lst = []\n", + "for a_i, b_i in zip(a_lst, b_lst):\n", + " c_lst.append(a_i + b_i**2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "`for` loops like these are a natural choice when coming from compiled languages like C++, but are considerably slower in Python." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "#### Equivalent computation with arrays" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "[NumPy](https://numpy.org) is one of the most popular array-oriented libraries for Python. The data points for $a$ and $b$ are now represented by array objects..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "a = np.array(a_lst)\n", + "b = np.array(b_lst)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "...and the _array-oriented_ computation of $c = a+b^2$ becomes much **faster** and **more readable**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "%%timeit\n", + "c = a + b**2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Accelerated computational libraries" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The 2010s saw the release of a number of Python packages with highly optimized numerical backends that gained popularity within the data science and Machine Learning communities. Examples of these are [Numba](https://numba.pydata.org) (2012), [TensorFlow](https://tensorflow.org) (2015), [PyTorch](https://pytorch.org) (2016), and [JAX](https://jax.rtfd.io) (2018). Just like NumPy, the core of these packages is written in highly performant languages like C++, but they apply several smart techniques to make the computations even faster. The main techniques that these backends apply are:\n", + "\n", + "- **Just-In-Time Compilation** (JIT): Python code is compiled only when it is run. JIT not only offers performance in a dynamic workflow, but also allows the compiler to optimize the code at runtime based on the actual data input.\n", + "- **Hardware acceleration**: JIT compilation is performed through an intermediate, device-agnostic layer of code (particularly [XLA](https://openxla.org/xla)), which allows the user to run their code not only on regular CPUs, but also on different types of hardware accelerators, like GPUs and TPUs.\n", + "- **Parallelization**: array-oriented computations can automatically be parallelized over multiple CPU cores (multithreading) or multiple CPU, GPU or TPU devices (multiprocessing).\n", + "- **Automatic Differentiation**: Many of these libraries can automatically compute derivatives, which is useful for gradient-based optimization algorithms.
While this functionality was designed with Machine Learning models in mind, it can be used to compute exact gradients over mathematical models.\n", + "\n", + "These techniques are usually directly available with minor changes to existing [array-oriented code](#array-oriented). In most cases, it is just a matter of decorating the array-oriented function with a JIT-compile decorator and, where needed, replacing the calls to vectorized functions (such as summing up a column in a two-dimensional array) with their accelerated equivalents." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::::{tab-set}\n", + "\n", + ":::{tab-item} Original\n", + "```python\n", + "import numpy as np\n", + "\n", + "def my_func(a, b):\n", + " return np.sum(a + b**2, axis=0)\n", + "```\n", + ":::\n", + "\n", + ":::{tab-item} Numba\n", + "```python\n", + "import numba as nb\n", + "import numpy as np\n", + "\n", + "@nb.jit(nopython=True)\n", + "def my_func(a, b):\n", + " return np.sum(a + b**2, axis=0)\n", + "```\n", + ":::\n", + "\n", + ":::{tab-item} JAX\n", + "```python\n", + "import jax\n", + "import jax.numpy as jnp\n", + "\n", + "@jax.jit\n", + "def my_func(a, b):\n", + " return jnp.sum(a + b**2, axis=0)\n", + "```\n", + ":::\n", + "\n", + ":::{tab-item} TensorFlow\n", + "```python\n", + "import tensorflow as tf\n", + "import tensorflow.experimental.numpy as tnp\n", + "\n", + "@tf.function(jit_compile=True)\n", + "def my_func(a, b):\n", + " return tnp.sum(a + b**2, axis=0)\n", + "```\n", + ":::\n", + "\n", + ":::{tab-item} PyTorch\n", + "```python\n", + "import torch\n", + "\n", + "@torch.jit.script\n", + "def my_func(a, b):\n", + " return torch.sum(a + b**2, dim=0)\n", + "```\n", + ":::\n", + "\n", + "::::" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As can be seen, the implementation of the array-oriented NumPy function remains largely unaffected by the switch to these accelerated computational libraries. The resulting function objects are automatically JIT-compiled and parallelized for the selected device, enabling fast numerical computations over large data samples.\n", + "\n", + ":::{topic} {material-regular}`speed` Performance ✔️\n", + "Array-oriented programming allows for concise, recognizable implementations of mathematical models. Accelerated libraries like JAX and Numba can transform these implementations so that high-performance numerical computing can be achieved with trivial changes to the code.\n", + ":::\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "(flexibility)=\n", + "## {material-regular}`draw` Flexibility" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Python offers us the flexibility to write concise and understandable code that can be run interactively through a terminal or with Jupyter Notebooks. As we saw, [array-oriented computational backends](#array-oriented) make this code suitable for high-performance, parallelized computations over large data samples. The fact that array-oriented code looks so similar for different accelerated computational libraries raises the question of whether we can find a way to **directly convert the mathematical expressions that we as physicists are familiar with into these fast numerical functions**. It turns out that we can do this using a Computer Algebra System.\n",
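+ "\n", + "As a small preview of what the remaining sections work out step by step, here is a minimal sketch of that idea, using nothing but SymPy and NumPy (an illustration only, not specific ComPWA functionality):\n", + "\n", + "```python\n", + "import numpy as np\n", + "import sympy as sp\n", + "\n", + "x = sp.Symbol(\"x\")\n", + "expression = sp.sin(x) / x  # symbolic mathematical expression\n", + "func = sp.lambdify(x, expression, modules=\"numpy\")  # generated array-oriented function\n", + "func(np.linspace(-10, 10, num=1_000))  # fast element-wise evaluation\n", + "```"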
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Computer Algebra System" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Programs like [Mathematica](https://www.wolfram.com/mathematica), [Maple](https://www.maplesoft.com/products/Maple) and [Matlab](https://www.mathworks.com/products/matlab.html) are popular among mathematicians, physicists, and engineers, as they allow one to simplify expressions, solve equations and integrals, investigate their behavior with plots, et cetera. At their core, these programs are [Computer Algebra Systems](https://en.wikipedia.org/wiki/List_of_computer_algebra_systems) (CAS) that represent mathematical expressions as graphs or trees and transform and modify them through algorithms that implement algebraic operations.\n", + "\n", + "The most commonly used CAS in Python is [SymPy](https://docs.sympy.org), and it has a major advantage over commercial CAS programs in that it is [open source and can be used as a library](https://docs.sympy.org/latest/tutorials/intro-tutorial/intro.html#why-sympy). This allows us to integrate it into our own applications for amplitude analysis or build up simple mathematical expressions in Jupyter notebooks, so that we can inspect them in $\\LaTeX$ form. For example, a simple Breit-Wigner function is written as:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sympy as sp\n", + "\n", + "s, m0, Γ0, g = sp.symbols(\"s m0 Gamma0 g\")\n", + "expression = g * m0 * Γ0 / (m0**2 - s - sp.I * Γ0 * m0)\n", + "expression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ":::{note}\n", + "SymPy orders symbolic terms [its own way](https://stackoverflow.com/a/36344627) if they are commutative, independently of the Python code given by the user.\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Expression trees" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Internally, SymPy expressions are built up by applying mathematical operations to algebraic objects, such as symbols and \n", + "numbers. In this example, we see how the Breit-Wigner function is built up from four symbols, a complex number, and a few integers. The resulting expression can be visualized as an **expression tree** of fundamental mathematical operations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "import graphviz\n", + "\n", + "style = [\n", + " (sp.Atom, {\"color\": \"grey\", \"fontcolor\": \"grey\"}),\n", + " (sp.Symbol, {\"color\": \"royalblue\", \"fontcolor\": \"royalblue\"}),\n", + "]\n", + "src = sp.dotprint(expression, styles=style)\n", + "graphviz.Source(src)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Algebraic substitutions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An example of an algebraic computation is the substitution of some of the symbols. Here, we substitute the symbols $m_0$, $\\Gamma_0$, and $g$ with fixed values (like model parameters)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "substituted_expr = expression.subs({m0: 0.980, Γ0: 0.06, g: 1})\n", + "substituted_expr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the substitutions, the expression tree shrinks. Subtrees that contained only real-valued numbers or one of the three substituted symbols are collapsed into a single number node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "jupyter": { + "source_hidden": true + }, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "src = sp.dotprint(substituted_expr.n(3), styles=style)\n", + "graphviz.Source(src)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Code generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Expression trees are not only useful for applying algebraic operations to their nodes. They can also be used as a **template for generating code**. In fact, the $\\LaTeX$ formula is generated using SymPy's $\\LaTeX$ printer:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from IPython.display import Markdown\n", + "\n", + "src = sp.latex(expression)\n", + "Markdown(f\"```latex\\n{src}\\n```\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "SymPy provides a [large number of code printers](https://docs.sympy.org/latest/modules/codegen.html) for different languages and human-readable serialization standards. A few examples are shown below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "scroll-output", + "scroll-input", + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from IPython.display import Markdown\n", + "from sympy.printing.mathml import MathMLPresentationPrinter\n", + "\n", + "\n", + "def to_mathml(expr: sp.Expr) -> str:\n", + " printer = MathMLPresentationPrinter()\n", + " xml = printer._print(expr)\n", + " return xml.toprettyxml().replace(\"\\t\", \" \")\n", + "\n", + "\n", + "Markdown(\n", + " f\"\"\"\n", + "```python\n", + "# Python\n", + "{sp.pycode(expression)}\n", + "```\n", + "```cpp\n", + "// C++\n", + "{sp.cxxcode(expression, standard=\"c++17\")}\n", + "```\n", + "```fortran\n", + "! Fortran\n", + "{sp.fcode(expression).strip()}\n", + "```\n", + "```julia\n", + "# Julia\n", + "{sp.julia_code(expression)}\n", + "```\n", + "```rust\n", + "// Rust\n", + "{sp.rust_code(expression)} \n", + "```\n", + "```xml\n", + "\n", + "{to_mathml(expression)}\n", + "```\n", + "\"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since SymPy is a Python library, the code generation process can be [completely customized](https://docs.sympy.org/latest/modules/printing.html). 
This allows us to generate code for languages for which no printer is implemented yet, or to modify the behavior of existing code printers, which can be used to **generate [array-oriented Python code](#accelerated-computational-libraries)** for several computational libraries. For the Breit-Wigner example, the generated NumPy function looks like the following." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "full_func = sp.lambdify(args=(s, m0, Γ0, g), expr=expression, modules=\"numpy\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "jupyter": { + "source_hidden": true + }, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "import inspect\n", + "\n", + "import black\n", + "\n", + "src = inspect.getsource(full_func)\n", + "src = black.format_str(src, mode=black.FileMode())\n", + "Markdown(f\"```python\\n{src}\\n```\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "substituted_func = sp.lambdify(args=s, expr=substituted_expr, modules=\"numpy\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "jupyter": { + "source_hidden": true + }, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "src = inspect.getsource(substituted_func)\n", + "src = black.format_str(src, mode=black.FileMode())\n", + "Markdown(f\"```python\\n{src}\\n```\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + ":::{sidebar}\n", + "![SymPy code generation](https://github.com/ComPWA/compwa.github.io/assets/29308176/a1a19f74-b2dd-484f-804f-02da523ed4b7)\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The example as described here is small for illustrative purposes. It turns out that code generation works just as well for **expressions with a much larger number of mathematical operations**, even on the order of hundreds of thousands. This is exactly what is needed for fitting amplitude models to data.\n", + "\n", + "We now have a flexible and transparent way of formulating amplitude models that can be easily modified with algebraic operations. The models can immediately be inspected as mathematical expressions and can be used as a template for generating array-oriented numerical functions for efficient computations over large data samples. In addition, any algebraic operations that simplify the expression tree directly map onto the generated array-oriented code, which can result in better numerical performance." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + ":::{topic} {material-regular}`draw` Flexibility ✔️\n", + "A Computer Algebra System provides a simple way to **separate physics from number crunching**. Amplitude models only have to be formulated symbolically, while computations are outsourced to array-oriented, numerical libraries through automated code generation.
This provides us with a **[Single Source of Truth](https://en.wikipedia.org/wiki/Single_source_of_truth)** for implemented physics models.\n", + ":::\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "(transparency)=\n", + "## {material-regular}`school` Transparency" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have seen how a Computer Algebra System that generates array-oriented code allows us to formulate [performant](#performance) and [flexible](#flexibility) amplitude models. Physicists can now focus on implementing theory in a central place, while the computations are outsourced to optimized libraries. In themselves, these are ingredients that make it much easier to write analysis code. However, the setup offers major indirect benefits to the wider amplitude analysis community as well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Self-documenting workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The combination of a CAS, performant code generation, and a dynamically typed language like Python is ideal when working with Jupyter notebooks. Amplitude models can be built up interactively with the option to directly inspect their implementation as mathematical formulas or plot their behavior with visualization libraries. Fast numerical performance is near at hand through automatic code generation, which bridges the gap between formulating amplitude models and performing fits.\n", + "\n", + "This ties in perfectly with recent trends in data science and modern publishing tools. In recent years, there have been several initiatives that render Jupyter notebooks as publication-ready documents (websites, PDF files, etc.). Community-wide examples are the [Executable Book Project](https://executablebooks.org), [Curvenote](https://curvenote.com), and [Quarto](https://quarto.org), while CERN has launched platforms like [SWAN](https://swan.docs.cern.ch) for running notebooks with direct access to CERN computing resources and [Reana](https://www.reanahub.io) for making analyses more reproducible and scalable.\n", + "\n", + "Given these trends, writing an amplitude analysis with symbolics is not only intuitive for the physicists who write it, but also results in a **self-documenting workflow** that naturally evolves towards publication-ready materials. An example is the [polarimetry analysis by LHCb](https://doi.org/10.1007/JHEP07(2023)228), where the entire implemented amplitude model is [directly rendered from the codebase](https://lc2pkpi-polarimetry.docs.cern.ch/amplitude-model.html#amplitude) as mathematical formulas, and analysis results from high-performance computations are directly available through code generation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Model preservation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Amplitude analyses are notoriously hard to reproduce. There are often several ways to formulate the same model, the model complexity makes the source code of the analysis model hard to understand, and starting conditions in a fit can lead to completely different fit results.
Formulating the amplitude models symbolically addresses exactly these difficulties.\n", + "\n", + "First of all, the [self-documenting workflow](#self-documenting-workflow) with symbolic expressions removes the need for readers to dive through the underlying code, as the formulas are directly visible to the reader. Mathematics is the language we all speak. This allows others to easily reimplement the models in their own framework of choice, now or in the future, for example in new programming languages.\n", + "\n", + "Second, a symbolic amplitude model can be serialized to a human-readable format with [code generation](#code-generation) (a small sketch follows at the end of this page). Just as with the generation of numerical functions, the model's [expression tree](#expression-trees) can be used to generate nodes in a serialization format like YAML or JSON. Other analysis frameworks can then import the model for cross-checks or for adapting the analysis to other experiments.\n", + "\n", + "On a technical note, the Python ecosystem in combination with Jupyter Notebooks and Sphinx makes it possible for any reader to directly rerun the analysis in the browser or in some local environment. [Pinned dependencies](https://github.com/ComPWA/update-pip-constraints) ensure that the analysis produces the same results." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Knowledge exchange" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Exciting times are coming for amplitude analysis studies. On the one hand, collider experiments are producing increasingly large data samples that provide the high statistics suitable for amplitude analysis, but that also challenge the tools on the market. On the other hand, computational power has become so widely available that it is now possible to perform fits with more complicated models over large data samples. While this provides many opportunities, it also poses challenges to the community.\n", + "\n", + "There are many physicists who have little background in amplitude analysis, but need to become familiar with the theory and the techniques. For now, however, the literature is sparse, highly technical, and often specific to the experiment for which it was written. This makes the learning curve for newcomers extremely steep, so that it takes a lot of time before one can perform an actual amplitude analysis.\n", + "\n", + "As amplitude models become more complex, it becomes crucial to get input from other experiments. This requires a proper matching of amplitude models, and it is therefore essential that the models become more reproducible, extendable, and portable. All of this makes it paramount for the amplitude analysis community to provide easy access to the basic principles of the theoretical model.\n", + "\n", + "Symbolics can play a valuable role here as well, as it becomes much easier to share and maintain knowledge gained about amplitude models and amplitude analysis theory. Symbolic amplitude models directly show the implemented mathematics, and their numerical functions can directly be used for interactive visualizations. In addition, the [self-documenting workflow](#self-documenting-workflow) makes it more inviting to contribute to community documentation as it narrows the gap between theory and code.\n",
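+ "\n", + "To close, here is a minimal sketch of the serialization round trip mentioned in the [model preservation](#model-preservation) section above, using plain SymPy (an illustration of the principle, not a ComPWA API): the Breit-Wigner expression from this page is dumped to a human-readable string and reconstructed from it.\n", + "\n", + "```python\n", + "import sympy as sp\n", + "\n", + "s, m0, Γ0, g = sp.symbols(\"s m0 Gamma0 g\")\n", + "expression = g * m0 * Γ0 / (m0**2 - s - sp.I * Γ0 * m0)\n", + "\n", + "# Dump the expression tree to a human-readable string...\n", + "serialized = sp.srepr(expression)\n", + "# ...which could be stored in e.g. a YAML or JSON file and parsed back later\n", + "recovered = sp.sympify(serialized)\n", + "assert recovered == expression\n", + "```"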
+ ] + } + ], + "metadata": { + "colab": { + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/environment.yml b/environment.yml index aa7d5a26..9a6d456b 100644 --- a/environment.yml +++ b/environment.yml @@ -10,5 +10,5 @@ dependencies: - pip: - -c .constraints/py3.10.txt -e .[dev] variables: - PRETTIER_LEGACY_CLI: "1" + PRETTIER_LEGACY_CLI: 1 PYTHONHASHSEED: 0