diff --git a/README.md b/README.md
index 916aa32..4ba9b99 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ add please create a GitHub Issue.
 
 ## Usage
 
+### Basic Usage (mono IR, mono I/O, uniform partitioning)
+
 First, create a `Config` object:
 
 ```cpp
@@ -36,20 +38,16 @@ chowdsp::convolution::IR_Uniform ir {};
 chowdsp::convolution::create_ir (&config, &ir, my_ir.data(), my_ir.size());
 ```
 
-Then we'll create a convolution "state". For this example, let's assume
-we have a monophonic IR that's be used to process a stereo input.
+Then we'll create a convolution "state".
 ```cpp
-chowdsp::convolution::Process_Uniform_State left_state {};
-chowdsp::convolution::Process_Uniform_State right_state {};
-chowdsp::convolution::create_process_state (&config, &ir, &left_state);
-chowdsp::convolution::create_process_state (&config, &ir, &right_state);
+chowdsp::convolution::Process_Uniform_State state {};
+chowdsp::convolution::create_process_state (&config, &ir, &state);
 ```
 
 Now we're ready to process some data:
 
 ```cpp
-chowdsp::convolution::process_sample (&config, &ir, &left_state, left_channel_data, left_channel_data, num_samples, fft_scratch);
-chowdsp::convolution::process_sample (&config, &ir, &right_state, right_channel_data, right_channel_data, num_samples, fft_scratch);
+chowdsp::convolution::process_samples (&config, &ir, &state, data, data, num_samples, fft_scratch);
 ```
 
 Alternatively, we could use `process_samples_with_latency()` which is
@@ -59,12 +57,63 @@ Finally, let's clean up all our memory allocation:
 
 ```cpp
 chowdsp::fft::aligned_free (fft_scratch);
-chowdsp::convolution::destroy_process_state (&left_state);
-chowdsp::convolution::destroy_process_state (&right_state);
+chowdsp::convolution::destroy_process_state (&state);
 chowdsp::convolution::destroy_ir (&ir);
 chowdsp::convolution::destroy_config (&config);
 ```
 
+### Multi-Channel Processing (mono IR)
+
+Let's say that you want to convolve a stereo audio stream with a mono IR.
+We can use `create_multichannel_process_state()` to create a processing state
+with a given number of channels.
+
+```cpp
+chowdsp::convolution::Process_Uniform_State stereo_state {};
+chowdsp::convolution::create_multichannel_process_state (&config, &ir, &stereo_state, 2);
+```
+
+To process our audio, we'll want to use `process_samples_multichannel()`
+(or `process_samples_multichannel_with_latency()`).
+
+```cpp
+float* channel_data[2] {
+    left_channel_data,
+    right_channel_data,
+};
+chowdsp::convolution::process_samples_multichannel (&config, &ir, &stereo_state, channel_data, channel_data, num_samples, 2, fft_scratch);
+```
+
+### Multi-Channel IRs
+
+Let's create a stereo, uniform-partitioned IR:
+
+```cpp
+float* ir_data[2] {
+    left_ir_data,
+    right_ir_data,
+};
+chowdsp::convolution::IR_Uniform ir {};
+chowdsp::convolution::create_multichannel_ir (&config, &ir, ir_data, ir_num_samples, 2, fft_scratch);
+```
+
+Now if we call `create_process_state()`, the state will automatically be created
+for the same number of channels as the IR.
+```cpp
+chowdsp::convolution::Process_Uniform_State state {};
+chowdsp::convolution::create_process_state (&config, &ir, &state);
+```
+
+Then (as before), we can do our multi-channel processing:
+
+```cpp
+float* channel_data[2] {
+    left_channel_data,
+    right_channel_data,
+};
+chowdsp::convolution::process_samples_multichannel (&config, &ir, &state, channel_data, channel_data, num_samples, 2, fft_scratch);
+```
+
 ### Multi-Threaded Usage
 
 What should you do if you're looking to load an impulse response
@@ -79,7 +128,7 @@ thread is still running? The basic idea is that you should:
 Note that the `Config` object is thread-safe, so you may use the
 same config on both your audio thread and background thread (e.g.
 when calling `create_ir()` or `load_ir()`). However, the `fft_scratch`
-is not thread-safe, so make sure to allocate a dedicated `fft_scratch`
+is **not** thread-safe, so make sure to allocate a dedicated `fft_scratch`
 for each thread.
 
 ## License
diff --git a/chowdsp_convolution.cpp b/chowdsp_convolution.cpp
index cfddb10..1d5bfd3 100644
--- a/chowdsp_convolution.cpp
+++ b/chowdsp_convolution.cpp
@@ -22,7 +22,7 @@ static int next_pow2 (int v) noexcept
 static int pad_floats (int N)
 {
     static constexpr int pad_len = 16;
-    const auto N_div =  (N + pad_len - 1) / pad_len;
+    const auto N_div = (N + pad_len - 1) / pad_len;
     return N_div * pad_len;
 }
 
@@ -130,8 +130,7 @@ void load_multichannel_ir (const Config* config, IR_Uniform* ir, const float* co
 
     for (int ch = 0; ch < num_channels; ++ch)
     {
-        IR_Uniform this_channel_ir
-        {
+        IR_Uniform this_channel_ir {
             .segments = get_segment (config, ir->segments, ch * ir->max_num_segments),
             .num_segments = ir->num_segments,
             .max_num_segments = ir->max_num_segments,
@@ -147,7 +146,6 @@ static size_t state_data_bytes_needed (const Config* config, const IR_Uniform* i
     size_t bytes_needed {};
 
     const auto segment_num_samples = config->fft_size;
-    state->num_channels = ir->num_channels;
     state->max_num_segments = config->block_size > 128 ? ir->max_num_segments : 3 * ir->max_num_segments;
     bytes_needed += segment_num_samples * state->max_num_segments * sizeof (float);
 
@@ -174,9 +172,10 @@ static void state_data_partition_memory (const Config* config, Process_Uniform_S
     data += config->fft_size;
 }
 
-void create_process_state (const Config* config, const IR_Uniform* ir, Process_Uniform_State* state)
+void create_multichannel_process_state (const Config* config, const IR_Uniform* ir, Process_Uniform_State* state, int num_channels)
 {
     using State_Data = Process_Uniform_State::State_Data;
+    state->num_channels = num_channels;
     const auto state_bytes_needed = state_data_bytes_needed (config, ir, state);
     auto* data = fft::aligned_malloc (state_bytes_needed + state->num_channels * sizeof (State_Data));
     state->state_data = reinterpret_cast<State_Data*> (static_cast<std::byte*> (data) + state_bytes_needed);
@@ -189,6 +188,11 @@ void create_process_state (const Config* config, const IR_Uniform* ir, Process_U
     reset_process_state (config, state);
 }
 
+void create_process_state (const Config* config, const IR_Uniform* ir, Process_Uniform_State* state)
+{
+    create_multichannel_process_state (config, ir, state, ir->num_channels);
+}
+
 void reset_process_state (const Config* config, Process_Uniform_State* state)
 {
     state->current_segment = 0;
@@ -221,8 +225,9 @@ int get_required_nuir_scratch_bytes (const IR_Non_Uniform* ir)
     assert (ir->head_config != nullptr);
     assert (ir->tail_config != nullptr);
     return static_cast<int> ((std::max (ir->head_config->fft_size,
-                     ir->tail_config->fft_size)
-           + pad_floats (ir->head_config->block_size)) * sizeof (float));
+                                        ir->tail_config->fft_size)
+                              + pad_floats (ir->head_config->block_size))
+                             * sizeof (float));
 }
 
 void create_nuir (IR_Non_Uniform* ir, const float* ir_data, int ir_num_samples, float* fft_scratch)
@@ -275,7 +280,9 @@ void create_nuir_process_state (const IR_Non_Uniform* ir, Process_Non_Uniform_St
 {
     using State_Data = Process_Uniform_State::State_Data;
 
+    state->head.num_channels = 1; // @TODO
     state->head_config = ir->head_config;
+    state->tail.num_channels = 1; // @TODO
     state->tail_config = ir->tail_config;
 
     const auto head_state_bytes_needed = state_data_bytes_needed (state->head_config, &ir->head, &state->head);
@@ -284,7 +291,7 @@ void create_nuir_process_state (const IR_Non_Uniform* ir, Process_Non_Uniform_St
     state->head.state_data = reinterpret_cast<State_Data*> (static_cast<std::byte*> (data) + head_state_bytes_needed + tail_state_bytes_needed);
     state->tail.state_data = state->head.state_data + 1;
 
-    auto* float_data = static_cast<float*>(data);
+    auto* float_data = static_cast<float*> (data);
 
     state_data_partition_memory (state->head_config, &state->head, state->head.state_data[0], float_data);
     state_data_partition_memory (state->tail_config, &state->tail, state->tail.state_data[0], float_data);
@@ -570,21 +577,21 @@ static void process_multichannel (const Config* config,
                                   float* fft_scratch,
                                   bool with_latency)
 {
-    assert (state->num_channels == ir->num_channels);
+    assert (ir->num_channels == 1 || ir->num_channels == state->num_channels);
     assert (state->num_channels == num_channels);
 
     for (int ch = 0; ch < num_channels; ++ch)
     {
-        IR_Uniform mono_ir
-        {
-            .segments = get_segment (config, ir->segments, ch * ir->max_num_segments),
+        IR_Uniform mono_ir {
+            .segments = ir->num_channels == 1
+                            ? ir->segments
+                            : get_segment (config, ir->segments, ch * ir->max_num_segments),
             .num_segments = ir->num_segments,
             .max_num_segments = ir->max_num_segments,
             .num_channels = 1,
         };
 
-        Process_Uniform_State mono_state
-        {
+        Process_Uniform_State mono_state {
             .state_data = state->state_data + ch,
             .max_num_segments = state->max_num_segments,
             .current_segment = state->current_segment,
diff --git a/chowdsp_convolution.h b/chowdsp_convolution.h
index f6cdfd0..2f0e0f9 100644
--- a/chowdsp_convolution.h
+++ b/chowdsp_convolution.h
@@ -127,9 +127,18 @@ void load_multichannel_ir (const struct Convolution_Config*, struct IR_Uniform*,
 /** De-allocates the IR's internal data. */
 void destroy_ir (struct IR_Uniform*);
 
-/** Creates a process state object for a given IR. */
+/**
+ * Creates a process state object for a given IR.
+ * The process state will be created to process the same number of channels as the IR contains.
+ */
 void create_process_state (const struct Convolution_Config*, const struct IR_Uniform*, struct Process_Uniform_State*);
 
+/**
+ * Creates a process state object for a given IR, with a specific number of channels.
+ * This is useful for convolving a monophonic IR with a multi-channel signal (the IR must be mono or have exactly `num_channels` channels).
+ */
+void create_multichannel_process_state (const struct Convolution_Config*, const struct IR_Uniform*, struct Process_Uniform_State*, int num_channels);
+
 /** Zeros the process state. */
 void reset_process_state (const struct Convolution_Config*, struct Process_Uniform_State*);
 
diff --git a/test/chowdsp_convolution_test.cpp b/test/chowdsp_convolution_test.cpp
index 4d900ed..487cc03 100644
--- a/test/chowdsp_convolution_test.cpp
+++ b/test/chowdsp_convolution_test.cpp
@@ -418,12 +418,18 @@ static bool test_convolution (int ir_length_samples, int block_size, int num_blo
     return max_error < 5.0e-4f && mse < 1.0e-9f;
 }
 
-static bool test_convolution_multi_channel (int ir_length_samples, int block_size, int num_blocks, bool latency, int num_channels)
+static bool test_convolution_multi_channel (int ir_length_samples,
+                                            int block_size,
+                                            int num_blocks,
+                                            bool latency,
+                                            int num_channels,
+                                            bool mono_ir)
 {
     std::cout << "Running test with IR length: " << ir_length_samples
               << ", block size: " << block_size
               << ", latency: " << (latency ? "ON" : "OFF")
-              << ", # channels: " << num_channels << '\n';
+              << ", # channels: " << num_channels
+              << ", mono IR: " << (mono_ir ? "ON" : "OFF") << '\n';
 
     std::mt19937 rng { 0x12345 };
     auto ir = generate (ir_length_samples, rng);
@@ -458,15 +464,26 @@ static bool test_convolution_multi_channel (int ir_length_samples, int block_siz
     auto* fft_scratch = (float*) chowdsp::fft::aligned_malloc (conv_config.fft_size * sizeof (float));
 
     chowdsp::convolution::IR_Uniform conv_ir {};
-    chowdsp::convolution::create_multichannel_ir (&conv_config,
-                                                  &conv_ir,
-                                                  multi_channel_ir.data(),
-                                                  ir_length_samples,
-                                                  num_channels,
-                                                  fft_scratch);
+    if (mono_ir)
+    {
+        chowdsp::convolution::create_ir (&conv_config,
+                                         &conv_ir,
+                                         ir.data(),
+                                         ir_length_samples,
+                                         fft_scratch);
+    }
+    else
+    {
+        chowdsp::convolution::create_multichannel_ir (&conv_config,
+                                                      &conv_ir,
+                                                      multi_channel_ir.data(),
+                                                      ir_length_samples,
+                                                      num_channels,
+                                                      fft_scratch);
+    }
 
     chowdsp::convolution::Process_Uniform_State conv_state {};
-    chowdsp::convolution::create_process_state (&conv_config, &conv_ir, &conv_state);
+    chowdsp::convolution::create_multichannel_process_state (&conv_config, &conv_ir, &conv_state, num_channels);
 
     start = std::chrono::high_resolution_clock::now();
     for (int i = 0; i < num_blocks; ++i)
@@ -560,8 +577,7 @@ static bool test_convolution_non_uniform (int ir_length_samples, int block_size,
     chowdsp::convolution::Config tail_config {};
     chowdsp::convolution::create_config (&tail_config, head_size);
 
-    chowdsp::convolution::IR_Non_Uniform conv_ir
-    {
+    chowdsp::convolution::IR_Non_Uniform conv_ir {
         .head_config = &head_config,
         .tail_config = &tail_config,
         .head_size = head_size,
@@ -630,8 +646,10 @@ int main()
         success &= test_convolution (100, 511, 4, latency);
         success &= test_convolution (100, 32, 10, latency);
 
-        success &= test_convolution_multi_channel (6000, 2048, 4, latency, 2);
-        success &= test_convolution_multi_channel (100, 32, 10, latency, 4);
+        success &= test_convolution_multi_channel (6000, 2048, 4, latency, 2, false);
+        success &= test_convolution_multi_channel (100, 32, 10, latency, 4, false);
+        success &= test_convolution_multi_channel (6000, 512, 4, latency, 2, true);
+        success &= test_convolution_multi_channel (100, 511, 10, latency, 4, true);
     }
 
     success &= test_convolution_non_uniform (6000, 2048, 4, 2048);