From 5191901ac8946065c5bf16cc887cc22b0fda2851 Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 14:53:56 +0100
Subject: [PATCH 01/10] fix: update build-backend to use legacy mode and
 re-enable branch formatting in versioning

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a1cc84d..76794bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "torch",
     "numpy",
 ]
-build-backend = "setuptools.build_meta"
+build-backend = "setuptools.build_meta:__legacy__"
 
 [tool.setuptools-git-versioning]
 enabled = true
@@ -16,7 +16,7 @@ count_commits_from_version_file = true    # <--- enable commits tracking
 dev_template = "{tag}.{branch}{ccount}"   # suffix for versions will be .dev
 dirty_template = "{tag}.{branch}{ccount}" # same thing here
 # Temporarily disable branch formatting due to issues with regex in _version.py
-# branch_formatter = "torchlpc._version:format_branch_name"
+branch_formatter = "torchlpc._version:format_branch_name"
 
 [tool.setuptools.package-data]
 # include VERSION file to a package

From 1bf2be6eab020517c030a9f06b7af637a31c3a92 Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 14:55:04 +0100
Subject: [PATCH 02/10] fix: simplify event triggers in version workflow

---
 .github/workflows/version.yml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/workflows/version.yml b/.github/workflows/version.yml
index 6bd24fa..428e443 100644
--- a/.github/workflows/version.yml
+++ b/.github/workflows/version.yml
@@ -1,10 +1,6 @@
 name: Display version
 
-on:
-  push:
-    branches: [ "dev", "main", "alpha", "beta" ]
-  pull_request:
-    branches: [ "dev", "main", "alpha", "beta" ]
+on: [push, pull_request]
 
 permissions:
     contents: read

From 14a49072d349954352c6b7343c089f0d740e6841 Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 15:17:34 +0100
Subject: [PATCH 03/10] fix: increase verbosity of version display in workflow

---
 .github/workflows/version.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/version.yml b/.github/workflows/version.yml
index 428e443..329287a 100644
--- a/.github/workflows/version.yml
+++ b/.github/workflows/version.yml
@@ -26,4 +26,4 @@ jobs:
         pip install torch --index-url https://download.pytorch.org/whl/cpu
     - name: Display version
       run: |
-          setuptools-git-versioning -v >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
+          setuptools-git-versioning -vv >> $GITHUB_STEP_SUMMARY
\ No newline at end of file

From 46494423b3a9c92ea9ed6678f3129fad62ecfb0b Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 15:21:10 +0100
Subject: [PATCH 04/10] fix: add numpy to dependencies in version workflow

---
 .github/workflows/version.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/version.yml b/.github/workflows/version.yml
index 329287a..1be59c9 100644
--- a/.github/workflows/version.yml
+++ b/.github/workflows/version.yml
@@ -22,7 +22,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install build "setuptools-git-versioning>=2,<3"
+        pip install build "setuptools-git-versioning>=2,<3" numpy
         pip install torch --index-url https://download.pytorch.org/whl/cpu
     - name: Display version
       run: |

From 9d47f307ae5c7dce8c66ce18c0de48880c05643a Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 15:24:17 +0100
Subject: [PATCH 05/10] fix: add numba to dependencies in version workflow and
 pyproject.toml

---
 .github/workflows/version.yml | 2 +-
 pyproject.toml                | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/version.yml b/.github/workflows/version.yml
index 1be59c9..c334c91 100644
--- a/.github/workflows/version.yml
+++ b/.github/workflows/version.yml
@@ -22,7 +22,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install build "setuptools-git-versioning>=2,<3" numpy
+        pip install build "setuptools-git-versioning>=2,<3" numpy numba
         pip install torch --index-url https://download.pytorch.org/whl/cpu
     - name: Display version
       run: |
diff --git a/pyproject.toml b/pyproject.toml
index 76794bb..41f5fde 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,6 +4,7 @@ requires = [
     "setuptools-git-versioning>=2.0,<3",
     "wheel",
     "torch",
+    "numba",
     "numpy",
 ]
 build-backend = "setuptools.build_meta:__legacy__"
@@ -12,10 +13,9 @@ build-backend = "setuptools.build_meta:__legacy__"
 enabled = true
 # change the file path
 version_file = "torchlpc/VERSION.txt"
-count_commits_from_version_file = true    # <--- enable commits tracking
-dev_template = "{tag}.{branch}{ccount}"   # suffix for versions will be .dev
-dirty_template = "{tag}.{branch}{ccount}" # same thing here
-# Temporarily disable branch formatting due to issues with regex in _version.py
+count_commits_from_version_file = true                    # <--- enable commits tracking
+dev_template = "{tag}.{branch}{ccount}"                   # suffix for versions will be .dev
+dirty_template = "{tag}.{branch}{ccount}"                 # same thing here
 branch_formatter = "torchlpc._version:format_branch_name"
 
 [tool.setuptools.package-data]

From 63da7b830757b1a83c7410525b229edafb86585e Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 17:58:20 +0100
Subject: [PATCH 06/10] fix: update license format in pyproject.toml

---
 pyproject.toml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 41f5fde..89cd037 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,7 @@ exclude = ["tests", "tests.*"]
 [tool.setuptools]
 # this package will read some included files in runtime, avoid installing it as .zip
 zip-safe = false
+license-files = ["LICENSE"]
 
 [project]
 dynamic = ["version"]
@@ -39,8 +40,8 @@ authors = [{ name = "Chin-Yun Yu", email = "chin-yun.yu@qmul.ac.uk" }]
 maintainers = [{ name = "Chin-Yun Yu", email = "chin-yun.yu@qmul.ac.uk" }]
 description = "Fast, efficient, and differentiable time-varying LPC filtering in PyTorch."
 readme = "README.md"
-license = "MIT"
-license-files = ["LICENSE"]
+license = { text = "MIT" }
+
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Developers",

From 675c23a91331e1b4cf3e4b233b4a9c6af8aa9d79 Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 19:40:48 +0100
Subject: [PATCH 07/10] fix: replace buffered inclusive_scan in scan_cpu with
 a direct loop for improved performance

---
 torchlpc/csrc/scan_cpu.cpp | 123 ++++++++++++++++++++-----------------
 1 file changed, 65 insertions(+), 58 deletions(-)

diff --git a/torchlpc/csrc/scan_cpu.cpp b/torchlpc/csrc/scan_cpu.cpp
index 6b47cad..a1a3acd 100644
--- a/torchlpc/csrc/scan_cpu.cpp
+++ b/torchlpc/csrc/scan_cpu.cpp
@@ -6,26 +6,29 @@
 #include <utility>
 #include <vector>
 
-extern "C" {
-/* Creates a dummy empty _C module that can be imported from Python.
-   The import from Python will load the .so associated with this extension
-   built from this file, so that all the TORCH_LIBRARY calls below are run.*/
-PyObject *PyInit__C(void) {
-    static struct PyModuleDef module_def = {
-        PyModuleDef_HEAD_INIT,
-        "_C", /* name of module */
-        NULL, /* module documentation, may be NULL */
-        -1,   /* size of per-interpreter state of the module,
-                 or -1 if the module keeps state in global variables. */
-        NULL, /* methods */
-    };
-    return PyModule_Create(&module_def);
-}
+extern "C"
+{
+    /* Creates a dummy empty _C module that can be imported from Python.
+       The import from Python will load the .so associated with this extension
+       built from this file, so that all the TORCH_LIBRARY calls below are run.*/
+    PyObject *PyInit__C(void)
+    {
+        static struct PyModuleDef module_def = {
+            PyModuleDef_HEAD_INIT,
+            "_C", /* name of module */
+            NULL, /* module documentation, may be NULL */
+            -1,   /* size of per-interpreter state of the module,
+                     or -1 if the module keeps state in global variables. */
+            NULL, /* methods */
+        };
+        return PyModule_Create(&module_def);
+    }
 }
 
 template <typename scalar_t>
 void scan_cpu(const at::Tensor &input, const at::Tensor &weights,
-              const at::Tensor &initials, const at::Tensor &output) {
+              const at::Tensor &initials, const at::Tensor &output)
+{
     TORCH_CHECK(input.dim() == 2, "Input must be 2D");
     TORCH_CHECK(initials.dim() == 1, "Initials must be 1D");
     TORCH_CHECK(weights.sizes() == input.sizes(),
@@ -50,39 +53,33 @@ void scan_cpu(const at::Tensor &input, const at::Tensor &weights,
     auto T = input.size(1);
     auto total_size = input.numel();
 
-    std::pair<scalar_t, scalar_t> buffer[total_size];
-
     const scalar_t *input_ptr = input_contiguous.const_data_ptr<scalar_t>();
     const scalar_t *initials_ptr =
         initials_contiguous.const_data_ptr<scalar_t>();
     const scalar_t *weights_ptr = weights_contiguous.const_data_ptr<scalar_t>();
     scalar_t *output_ptr = output.mutable_data_ptr<scalar_t>();
 
-    std::transform(weights_ptr, weights_ptr + total_size, input_ptr, buffer,
-                   [](const scalar_t &a, const scalar_t &b) {
-                       return std::make_pair(a, b);
-                   });
-
-    at::parallel_for(0, n_batch, 1, [&](int64_t start, int64_t end) {
-        for (auto b = start; b < end; b++) {
-            std::inclusive_scan(
-                buffer + b * T, buffer + (b + 1) * T, buffer + b * T,
-                [](const std::pair<scalar_t, scalar_t> &a,
-                   const std::pair<scalar_t, scalar_t> &b) {
-                    return std::make_pair(a.first * b.first,
-                                          a.second * b.first + b.second);
-                },
-                std::make_pair((scalar_t)1.0, initials_ptr[b]));
-        }
-    });
-
-    std::transform(
-        buffer, buffer + total_size, output_ptr,
-        [](const std::pair<scalar_t, scalar_t> &a) { return a.second; });
+    at::parallel_for(0, n_batch, 1, [&](int64_t start, int64_t end)
+                     {
+        for (auto b = start; b < end; b++)
+        {
+            auto initial = initials_ptr[b];
+            auto weights_offset = weights_ptr + b * T;
+            auto input_offset = input_ptr + b * T;
+            auto output_offset = output_ptr + b * T;
+            for (int64_t t = 0; t < T; t++)
+            {
+                auto w = weights_offset[t];
+                auto x = input_offset[t];
+                initial = initial * w + x;
+                output_offset[t] = initial;
+            }
+        }; });
 }
 
 template <typename scalar_t>
-void lpc_cpu_core(const torch::Tensor &a, const torch::Tensor &padded_out) {
+void lpc_cpu_core(const torch::Tensor &a, const torch::Tensor &padded_out)
+{
     // Ensure input dimensions are correct
     TORCH_CHECK(a.dim() == 3, "a must be 3-dimensional");
     TORCH_CHECK(padded_out.dim() == 2, "out must be 2-dimensional");
@@ -106,24 +103,29 @@ void lpc_cpu_core(const torch::Tensor &a, const torch::Tensor &padded_out) {
     const scalar_t *a_ptr = a_contiguous.const_data_ptr<scalar_t>();
     scalar_t *out_ptr = padded_out.mutable_data_ptr<scalar_t>();
 
-    at::parallel_for(0, B, 1, [&](int64_t start, int64_t end) {
-        for (auto b = start; b < end; b++) {
-            auto out_offset = b * (T + order) + order;
-            auto a_offset = b * T * order;
-            for (int64_t t = 0; t < T; t++) {
-                scalar_t y = out_ptr[out_offset + t];
-                for (int64_t i = 0; i < order; i++) {
-                    y -= a_ptr[a_offset + t * order + i] *
-                         out_ptr[out_offset + t - i - 1];
-                }
-                out_ptr[out_offset + t] = y;
+// at::parallel_for(0, B, 1, [&](int64_t start, int64_t end)
+//                  {
+#pragma omp parallel for
+    for (auto b = 0; b < B; b++)
+    {
+        auto out_offset = b * (T + order) + order;
+        auto a_offset = b * T * order;
+        for (int64_t t = 0; t < T; t++)
+        {
+            scalar_t y = out_ptr[out_offset + t];
+            for (int64_t i = 0; i < order; i++)
+            {
+                y -= a_ptr[a_offset + t * order + i] *
+                     out_ptr[out_offset + t - i - 1];
             }
+            out_ptr[out_offset + t] = y;
         }
-    });
+    };
 }
 
 at::Tensor scan_cpu_wrapper(const at::Tensor &input, const at::Tensor &weights,
-                            const at::Tensor &initials) {
+                            const at::Tensor &initials)
+{
     TORCH_CHECK(input.is_floating_point() || input.is_complex(),
                 "Input must be floating point or complex");
     TORCH_CHECK(initials.scalar_type() == input.scalar_type(),
@@ -135,12 +137,14 @@ at::Tensor scan_cpu_wrapper(const at::Tensor &input, const at::Tensor &weights,
 
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
         input.scalar_type(), "scan_cpu",
-        [&] { scan_cpu<scalar_t>(input, weights, initials, output); });
+        [&]
+        { scan_cpu<scalar_t>(input, weights, initials, output); });
     return output;
 }
 
 at::Tensor lpc_cpu(const at::Tensor &x, const at::Tensor &a,
-                   const at::Tensor &zi) {
+                   const at::Tensor &zi)
+{
     TORCH_CHECK(x.is_floating_point() || x.is_complex(),
                 "Input must be floating point or complex");
     TORCH_CHECK(a.scalar_type() == x.scalar_type(),
@@ -156,16 +160,19 @@ at::Tensor lpc_cpu(const at::Tensor &x, const at::Tensor &a,
     auto out = at::cat({zi.flip(1), x}, 1).contiguous();
 
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
-        x.scalar_type(), "lpc_cpu", [&] { lpc_cpu_core<scalar_t>(a, out); });
+        x.scalar_type(), "lpc_cpu", [&]
+        { lpc_cpu_core<scalar_t>(a, out); });
     return out.slice(1, zi.size(1), out.size(1)).contiguous();
 }
 
-TORCH_LIBRARY(torchlpc, m) {
+TORCH_LIBRARY(torchlpc, m)
+{
     m.def("torchlpc::scan(Tensor a, Tensor b, Tensor c) -> Tensor");
     m.def("torchlpc::lpc(Tensor a, Tensor b, Tensor c) -> Tensor");
 }
 
-TORCH_LIBRARY_IMPL(torchlpc, CPU, m) {
+TORCH_LIBRARY_IMPL(torchlpc, CPU, m)
+{
     m.impl("scan", &scan_cpu_wrapper);
     m.impl("lpc", &lpc_cpu);
 }

From 328cafa73c9e1d885bde758c20ccc778655bf25a Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 20:37:45 +0100
Subject: [PATCH 08/10] refactor: restore at::parallel_for in lpc_cpu_core

---
 torchlpc/csrc/scan_cpu.cpp | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/torchlpc/csrc/scan_cpu.cpp b/torchlpc/csrc/scan_cpu.cpp
index a1a3acd..a789d9d 100644
--- a/torchlpc/csrc/scan_cpu.cpp
+++ b/torchlpc/csrc/scan_cpu.cpp
@@ -103,24 +103,22 @@ void lpc_cpu_core(const torch::Tensor &a, const torch::Tensor &padded_out)
     const scalar_t *a_ptr = a_contiguous.const_data_ptr<scalar_t>();
     scalar_t *out_ptr = padded_out.mutable_data_ptr<scalar_t>();
 
-// at::parallel_for(0, B, 1, [&](int64_t start, int64_t end)
-//                  {
-#pragma omp parallel for
-    for (auto b = 0; b < B; b++)
-    {
-        auto out_offset = b * (T + order) + order;
-        auto a_offset = b * T * order;
-        for (int64_t t = 0; t < T; t++)
+    at::parallel_for(0, B, 1, [&](int64_t start, int64_t end)
+                     {
+        for (auto b = start; b < end; b++)
         {
-            scalar_t y = out_ptr[out_offset + t];
-            for (int64_t i = 0; i < order; i++)
+            auto out_offset = out_ptr + b * (T + order) + order;
+            auto a_offset = a_ptr + b * T * order;
+            for (int64_t t = 0; t < T; t++)
             {
-                y -= a_ptr[a_offset + t * order + i] *
-                     out_ptr[out_offset + t - i - 1];
+                scalar_t y = out_offset[t];
+                for (int64_t i = 0; i < order; i++)
+                {
+                    y -= a_offset[t * order + i] * out_offset [t - i - 1];
+                }
+                out_offset[t] = y;
             }
-            out_ptr[out_offset + t] = y;
-        }
-    };
+        }; });
 }
 
 at::Tensor scan_cpu_wrapper(const at::Tensor &input, const at::Tensor &weights,

From 264d45f9fcebc5b0cc65cb1b9ee475f2ef2519fb Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 20:41:46 +0100
Subject: [PATCH 09/10] fix: update clang++ version in build step to llvm@18 to
 match macos-latest

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index dc923d9..9cdbe9c 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -77,7 +77,7 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Build CPP extension with clang++
       run: |
-        export CXX=$(brew --prefix llvm@15)/bin/clang++
+        export CXX=$(brew --prefix llvm@18)/bin/clang++
         export LDFLAGS="-L/usr/local/opt/libomp/lib"
         export CPPFLAGS="-I/usr/local/opt/libomp/include"
         pip install -e .[dev]

From dde214eeefc26c41638daa406d9f02c5f569e0b0 Mon Sep 17 00:00:00 2001
From: Chin-Yun Yu <chin-yun.yu@qmul.ac.uk>
Date: Tue, 16 Sep 2025 20:53:04 +0100
Subject: [PATCH 10/10] fix: update LDFLAGS and CPPFLAGS paths for libomp in
 macOS build step

---
 .github/workflows/python-package.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 9cdbe9c..71fd475 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -78,8 +78,8 @@ jobs:
     - name: Build CPP extension with clang++
       run: |
         export CXX=$(brew --prefix llvm@18)/bin/clang++
-        export LDFLAGS="-L/usr/local/opt/libomp/lib"
-        export CPPFLAGS="-I/usr/local/opt/libomp/include"
+        export LDFLAGS="-L/opt/homebrew/opt/libomp/lib"
+        export CPPFLAGS="-I/opt/homebrew/opt/libomp/include"
         pip install -e .[dev]
     - name: Test with pytest
       run: |