Skip to content

Add support for captured groups in Find & Replace #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 17, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/bin/edit/draw_editor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,12 @@ pub fn search_execute(ctx: &mut Context, state: &mut State, action: SearchAction
SearchAction::Replace => doc.buffer.borrow_mut().find_and_replace(
&state.search_needle,
state.search_options,
&state.search_replacement,
state.search_replacement.as_bytes(),
),
SearchAction::ReplaceAll => doc.buffer.borrow_mut().find_and_replace_all(
&state.search_needle,
state.search_options,
&state.search_replacement,
state.search_replacement.as_bytes(),
),
}
.is_ok();
Expand Down
146 changes: 138 additions & 8 deletions src/buffer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ use std::str;

pub use gap_buffer::GapBuffer;

use crate::arena::{ArenaString, scratch_arena};
use crate::arena::{Arena, ArenaString, scratch_arena};
use crate::cell::SemiRefCell;
use crate::clipboard::Clipboard;
use crate::document::{ReadableDocument, WriteableDocument};
Expand Down Expand Up @@ -136,6 +136,11 @@ pub struct SearchOptions {
pub use_regex: bool,
}

/// One piece of a parsed regex replacement string.
///
/// A replacement is parsed into a sequence of these pieces, which are then
/// concatenated in order to build the bytes written for each match.
enum RegexReplacement<'a> {
/// Index of a capture group whose matched text is inserted.
Group(i32),
/// Literal bytes inserted as-is; the storage lives in the arena.
Text(Vec<u8, &'a Arena>),
}

/// Caches the start and length of the active edit line for a single edit.
/// This helps us avoid having to remeasure the buffer after an edit.
struct ActiveEditLineInfo {
Expand Down Expand Up @@ -1078,13 +1083,18 @@ impl TextBuffer {
&mut self,
pattern: &str,
options: SearchOptions,
replacement: &str,
replacement: &[u8],
) -> apperr::Result<()> {
// Editors traditionally replace the previous search hit, not the next possible one.
if let (Some(search), Some(..)) = (&mut self.search, &self.selection) {
let search = search.get_mut();
if let (Some(search), Some(..)) = (&self.search, &self.selection) {
let search = unsafe { &mut *search.get() };
if search.selection_generation == self.selection_generation {
self.write(replacement.as_bytes(), self.cursor, true);
let scratch = scratch_arena(None);
let parsed_replacements =
Self::find_parse_replacement(&scratch, &mut *search, replacement);
let replacement =
self.find_fill_replacement(&mut *search, replacement, &parsed_replacements);
self.write(&replacement, self.cursor, true);
}
}

Expand All @@ -1096,18 +1106,22 @@ impl TextBuffer {
&mut self,
pattern: &str,
options: SearchOptions,
replacement: &str,
replacement: &[u8],
) -> apperr::Result<()> {
let replacement = replacement.as_bytes();
let scratch = scratch_arena(None);
let mut search = self.find_construct_search(pattern, options)?;
let mut offset = 0;
let parsed_replacements = Self::find_parse_replacement(&scratch, &mut search, replacement);

loop {
self.find_select_next(&mut search, offset, false);
if !self.has_selection() {
break;
}
self.write(replacement, self.cursor, true);

let replacement =
self.find_fill_replacement(&mut search, replacement, &parsed_replacements);
self.write(&replacement, self.cursor, true);
offset = self.cursor.offset;
}

Expand Down Expand Up @@ -1215,6 +1229,122 @@ impl TextBuffer {
};
}

/// Parses a regex replacement string into literal-text and capture-group pieces.
///
/// Returns an empty list when `search` is not a regex search. Otherwise:
/// * `\n`, `\r` and `\t` are unescaped; `\` followed by any other byte yields
///   that byte, and a trailing lone `\` yields a literal backslash.
/// * `$$` becomes a literal `$`.
/// * `$` followed by digits becomes a `RegexReplacement::Group`, unless the
///   number exceeds the pattern's group count, in which case the whole
///   `$<digits>` sequence is kept as literal text.
/// * A `$` followed by anything else (or by nothing) is kept as a literal `$`.
///
/// The returned list and the text pieces inside it are allocated in `arena`.
fn find_parse_replacement<'a>(
arena: &'a Arena,
search: &mut ActiveSearch,
replacement: &[u8],
) -> Vec<RegexReplacement<'a>, &'a Arena> {
let mut res = Vec::new_in(arena);

if !search.options.use_regex {
return res;
}

let group_count = search.regex.group_count();
// Accumulates literal bytes until they are flushed into `res` as one `Text` piece.
let mut text = Vec::new_in(arena);
// Offset in `replacement` where the not-yet-consumed input starts.
let mut text_beg = 0;

loop {
// NOTE(review): presumably `memchr2` returns `replacement.len()` when neither
// byte is found (the bounds checks below rely on that) — confirm.
let mut off = memchr2(b'$', b'\\', replacement, text_beg);

// Push the raw, unescaped text, if any.
if text_beg < off {
text.extend_from_slice(&replacement[text_beg..off]);
}

// Unescape any escaped characters.
while off < replacement.len() && replacement[off] == b'\\' {
// Skip the backslash and the escaped byte; `off - 1` is then the escaped byte.
// If the backslash was the last byte of the input, `off - 1` is out of bounds,
// `get` returns `None`, and `map_or` falls back to a literal backslash.
off += 2;
text.push(match replacement.get(off - 1).map_or(b'\\', |&c| c) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lhecker we didn't check whether replacement contained two characters, only that it contained at least one. what happens if it only contains one?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can leave a comment there to make that logic a bit easier to understand. The tl;dr is that this should then result in map_or returning b'\\', because off + 2 - 1 will be out of bounds.

b'n' => b'\n',
b'r' => b'\r',
b't' => b'\t',
ch => ch,
});
}

// Parse out a group number, if any.
// `group` stays at -1 when this iteration produced no group reference.
let mut group = -1;
if off < replacement.len() && replacement[off] == b'$' {
// `beg..end` is the span copied as literal text if no valid group is parsed.
let mut beg = off;
let mut end = off + 1;
let mut acc = 0i32;
// Starts out `true` so that a `$` without digits is treated as text.
let mut acc_bad = true;

if end < replacement.len() {
let ch = replacement[end];

if ch == b'$' {
// Translate "$$" to "$".
// Moving `beg` past the first `$` leaves only the second one in `beg..end`.
beg += 1;
end += 1;
} else if ch.is_ascii_digit() {
// Parse "$1234" into 1234i32.
// If the number is larger than the group count,
// we flag `acc_bad` which causes us to treat it as text.
acc_bad = false;
// Do-while loop: the block in the condition position runs once per digit
// and its final expression decides whether to continue.
// `acc_bad` is sticky (`|=`), so once the value exceeds the group
// count it stays flagged even if `acc` later wraps around.
while {
acc =
acc.wrapping_mul(10).wrapping_add((replacement[end] - b'0') as i32);
acc_bad |= acc > group_count;
end += 1;
end < replacement.len() && replacement[end].is_ascii_digit()
} {}
}
}

if !acc_bad {
group = acc;
} else {
text.extend_from_slice(&replacement[beg..end]);
}

off = end;
}

// Flush pending literal text first so it precedes the group in the output.
if !text.is_empty() {
res.push(RegexReplacement::Text(text));
text = Vec::new_in(arena);
}
if group >= 0 {
res.push(RegexReplacement::Group(group));
}

// Continue after everything consumed in this iteration.
text_beg = off;
if text_beg >= replacement.len() {
break;
}
}

res
}

/// Produces the bytes to write in place of the current search hit.
///
/// For non-regex searches the caller's `replacement` is returned borrowed and
/// `parsed_replacements` is ignored. For regex searches the parsed pieces are
/// concatenated into a fresh buffer: text pieces are copied verbatim, and
/// group pieces are filled with the buffer contents of the range reported by
/// `search.regex.group`. Groups for which no range is reported contribute
/// nothing.
fn find_fill_replacement<'a>(
    &self,
    search: &mut ActiveSearch,
    replacement: &'a [u8],
    parsed_replacements: &[RegexReplacement],
) -> Cow<'a, [u8]> {
    // Plain-text search: nothing to expand, hand the input straight back.
    if !search.options.use_regex {
        return Cow::Borrowed(replacement);
    }

    let mut expanded = Vec::new();

    for piece in parsed_replacements {
        match piece {
            RegexReplacement::Text(literal) => expanded.extend_from_slice(literal),
            RegexReplacement::Group(index) => {
                // Skip groups the regex cannot report a range for.
                let Some(span) = search.regex.group(*index) else {
                    continue;
                };
                self.buffer.extract_raw(span, &mut expanded, usize::MAX);
            }
        }
    }

    Cow::Owned(expanded)
}

fn measurement_config(&self) -> MeasurementConfig<'_> {
MeasurementConfig::new(&self.buffer)
.with_word_wrap_column(self.word_wrap_column)
Expand Down
41 changes: 31 additions & 10 deletions src/icu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,31 @@ impl Regex {
let mut status = icu_ffi::U_ZERO_ERROR;
unsafe { (f.uregex_reset64)(self.0, offset as i64, &mut status) };
}

/// Returns the number of capture groups reported by ICU for this regex.
///
/// Falls back to `0` when the ICU call leaves a failure status behind.
pub fn group_count(&mut self) -> i32 {
    let icu = assume_loaded();
    let mut err = icu_ffi::U_ZERO_ERROR;

    // The status is an out-parameter: ICU writes any error into `err`.
    let groups = unsafe { (icu.uregex_groupCount)(self.0, &mut err) };

    if err.is_failure() {
        return 0;
    }
    groups
}

/// Gets the text range of a captured group by index.
///
/// Returns `None` if ICU reports a failure while querying either offset.
/// Otherwise a negative start offset is clamped to `0` and the end offset is
/// clamped to be no smaller than the start, so the returned range is never
/// inverted.
pub fn group(&mut self, group: i32) -> Option<Range<usize>> {
let f = assume_loaded();

let mut status = icu_ffi::U_ZERO_ERROR;
// Both queries share one `status`. ICU functions short-circuit when handed a
// status that already holds an error, so a failure in `uregex_start64` carries
// through `uregex_end64` and the single check below covers both calls.
let start = unsafe { (f.uregex_start64)(self.0, group, &mut status) };
let end = unsafe { (f.uregex_end64)(self.0, group, &mut status) };
if status.is_failure() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lhecker can uregex_start64 fail too?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the nice property of ICU is that its functions short-circuit if you pass in an error status. So, if uregex_start64 fails, status will contain a failure, uregex_end64 will short-circuit and then we check it. Makes some of this code really neat.

None
} else {
// Clamp before casting so a negative offset cannot turn into a huge `usize`.
let start = start.max(0);
let end = end.max(start);
Some(start as usize..end as usize)
}
}
}

impl Iterator for Regex {
Expand All @@ -691,15 +716,7 @@ impl Iterator for Regex {
return None;
}

let start = unsafe { (f.uregex_start64)(self.0, 0, &mut status) };
let end = unsafe { (f.uregex_end64)(self.0, 0, &mut status) };
if status.is_failure() {
return None;
}

let start = start.max(0);
let end = end.max(start);
Some(start as usize..end as usize)
self.group(0)
}
}

Expand Down Expand Up @@ -900,6 +917,7 @@ struct LibraryFunctions {
uregex_setUText: icu_ffi::uregex_setUText,
uregex_reset64: icu_ffi::uregex_reset64,
uregex_findNext: icu_ffi::uregex_findNext,
uregex_groupCount: icu_ffi::uregex_groupCount,
uregex_start64: icu_ffi::uregex_start64,
uregex_end64: icu_ffi::uregex_end64,
}
Expand All @@ -919,7 +937,7 @@ const LIBICUUC_PROC_NAMES: [&CStr; 10] = [
];

// Found in libicui18n.so on UNIX, icuin.dll/icu.dll on Windows.
const LIBICUI18N_PROC_NAMES: [&CStr; 10] = [
const LIBICUI18N_PROC_NAMES: [&CStr; 11] = [
c"ucol_open",
c"ucol_strcollUTF8",
c"uregex_open",
Expand All @@ -928,6 +946,7 @@ const LIBICUI18N_PROC_NAMES: [&CStr; 10] = [
c"uregex_setUText",
c"uregex_reset64",
c"uregex_findNext",
c"uregex_groupCount",
c"uregex_start64",
c"uregex_end64",
];
Expand Down Expand Up @@ -1277,6 +1296,8 @@ mod icu_ffi {
unsafe extern "C" fn(regexp: *mut URegularExpression, index: i64, status: &mut UErrorCode);
/// Function-pointer signature for the `uregex_findNext` entry of `LibraryFunctions`.
///
/// Takes the regex handle and a status out-parameter and returns a `bool`
/// (presumably whether another match was found — confirm against the ICU docs).
pub type uregex_findNext =
unsafe extern "C" fn(regexp: *mut URegularExpression, status: &mut UErrorCode) -> bool;
/// Function-pointer signature for the `uregex_groupCount` entry of `LibraryFunctions`.
///
/// Takes the regex handle and a status out-parameter and returns the capture
/// group count as an `i32`; `Regex::group_count` calls through this type.
pub type uregex_groupCount =
unsafe extern "C" fn(regexp: *mut URegularExpression, status: &mut UErrorCode) -> i32;
pub type uregex_start64 = unsafe extern "C" fn(
regexp: *mut URegularExpression,
group_num: i32,
Expand Down