Skip to content

Add support for captured groups in Find & Replace #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 17, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/bin/edit/draw_editor.rs
Original file line number Diff line number Diff line change
@@ -181,12 +181,12 @@ pub fn search_execute(ctx: &mut Context, state: &mut State, action: SearchAction
SearchAction::Replace => doc.buffer.borrow_mut().find_and_replace(
&state.search_needle,
state.search_options,
&state.search_replacement,
state.search_replacement.as_bytes(),
),
SearchAction::ReplaceAll => doc.buffer.borrow_mut().find_and_replace_all(
&state.search_needle,
state.search_options,
&state.search_replacement,
state.search_replacement.as_bytes(),
),
}
.is_ok();
154 changes: 146 additions & 8 deletions src/buffer/mod.rs
Original file line number Diff line number Diff line change
@@ -36,7 +36,7 @@ use std::str;

pub use gap_buffer::GapBuffer;

use crate::arena::{ArenaString, scratch_arena};
use crate::arena::{Arena, ArenaString, scratch_arena};
use crate::cell::SemiRefCell;
use crate::clipboard::Clipboard;
use crate::document::{ReadableDocument, WriteableDocument};
@@ -136,6 +136,11 @@ pub struct SearchOptions {
pub use_regex: bool,
}

/// One piece of a parsed regex replacement string.
///
/// A replacement string is split into a sequence of these pieces, which are
/// later concatenated in order to build the text written for a search hit.
enum RegexReplacement<'a> {
/// A reference to a capture group, stored as the group number that was
/// parsed from a `$<digits>` sequence in the replacement string.
Group(i32),
/// Literal bytes to insert as-is (escape sequences already resolved).
/// The byte vector is allocated in the arena borrowed for `'a`.
Text(Vec<u8, &'a Arena>),
}

/// Caches the start and length of the active edit line for a single edit.
/// This helps us avoid having to remeasure the buffer after an edit.
struct ActiveEditLineInfo {
@@ -1078,13 +1083,18 @@ impl TextBuffer {
&mut self,
pattern: &str,
options: SearchOptions,
replacement: &str,
replacement: &[u8],
) -> apperr::Result<()> {
// Editors traditionally replace the previous search hit, not the next possible one.
if let (Some(search), Some(..)) = (&mut self.search, &self.selection) {
let search = search.get_mut();
if let (Some(search), Some(..)) = (&self.search, &self.selection) {
let search = unsafe { &mut *search.get() };
if search.selection_generation == self.selection_generation {
self.write(replacement.as_bytes(), self.cursor, true);
let scratch = scratch_arena(None);
let parsed_replacements =
Self::find_parse_replacement(&scratch, &mut *search, replacement);
let replacement =
self.find_fill_replacement(&mut *search, replacement, &parsed_replacements);
self.write(&replacement, self.cursor, true);
}
}

@@ -1096,18 +1106,22 @@ impl TextBuffer {
&mut self,
pattern: &str,
options: SearchOptions,
replacement: &str,
replacement: &[u8],
) -> apperr::Result<()> {
let replacement = replacement.as_bytes();
let scratch = scratch_arena(None);
let mut search = self.find_construct_search(pattern, options)?;
let mut offset = 0;
let parsed_replacements = Self::find_parse_replacement(&scratch, &mut search, replacement);

loop {
self.find_select_next(&mut search, offset, false);
if !self.has_selection() {
break;
}
self.write(replacement, self.cursor, true);

let replacement =
self.find_fill_replacement(&mut search, replacement, &parsed_replacements);
self.write(&replacement, self.cursor, true);
offset = self.cursor.offset;
}

@@ -1215,6 +1229,130 @@ impl TextBuffer {
};
}

/// Splits a regex replacement string into literal text and capture-group references.
///
/// Returns an empty list when `search` is not a regex search; the replacement is
/// then meant to be used verbatim by the caller.
///
/// Recognized syntax:
/// * `\n`, `\r`, `\t` become newline, carriage return and tab.
/// * `\` followed by any other byte yields that byte; a trailing `\` yields `\`.
/// * `$$` yields a literal `$`.
/// * `$<digits>` yields a group reference, unless the number exceeds the regex's
///   group count, in which case the whole sequence is kept as literal text.
/// * A `$` followed by anything else (or by nothing) is kept as a literal `$`.
///
/// All allocations for the returned pieces are made in `arena`.
fn find_parse_replacement<'a>(
    arena: &'a Arena,
    search: &mut ActiveSearch,
    replacement: &[u8],
) -> Vec<RegexReplacement<'a>, &'a Arena> {
    let mut parts = Vec::new_in(arena);

    if !search.options.use_regex {
        return parts;
    }

    let len = replacement.len();
    let max_group = search.regex.group_count();
    let mut literal = Vec::new_in(arena);
    let mut cursor = 0;

    loop {
        // Jump to the next special byte.
        // NOTE(review): presumably `memchr2` returns `replacement.len()` when
        // neither byte is found; the bounds checks below rely on that.
        let mut pos = memchr2(b'$', b'\\', replacement, cursor);

        // Everything skipped over is plain text.
        if pos > cursor {
            literal.extend_from_slice(&replacement[cursor..pos]);
        }

        // Resolve a run of backslash escapes.
        while pos < len && replacement[pos] == b'\\' {
            // A backslash at the very end has nothing to escape,
            // so it stands for itself.
            let escaped = replacement.get(pos + 1).copied().unwrap_or(b'\\');
            literal.push(match escaped {
                b'n' => b'\n',
                b'r' => b'\r',
                b't' => b'\t',
                other => other,
            });
            pos += 2;
        }

        // Resolve a `$` sequence; a negative `group` means "no group reference".
        let mut group = -1;
        if pos < len && replacement[pos] == b'$' {
            let mut literal_beg = pos;
            let mut end = pos + 1;
            let mut number = 0i32;
            let mut is_text = true;

            match replacement.get(end).copied() {
                Some(b'$') => {
                    // "$$" collapses into a single "$".
                    literal_beg += 1;
                    end += 1;
                }
                Some(digit) if digit.is_ascii_digit() => {
                    // Accumulate all following digits. Once the number exceeds
                    // the group count, the flag sticks and the entire "$1234"
                    // sequence is emitted as text instead.
                    is_text = false;
                    loop {
                        number = number
                            .wrapping_mul(10)
                            .wrapping_add((replacement[end] - b'0') as i32);
                        is_text |= number > max_group;
                        end += 1;
                        if end >= len || !replacement[end].is_ascii_digit() {
                            break;
                        }
                    }
                }
                _ => {}
            }

            if is_text {
                literal.extend_from_slice(&replacement[literal_beg..end]);
            } else {
                group = number;
            }

            pos = end;
        }

        // Flush pending text first so that it precedes the group reference.
        if !literal.is_empty() {
            parts.push(RegexReplacement::Text(literal));
            literal = Vec::new_in(arena);
        }
        if group >= 0 {
            parts.push(RegexReplacement::Group(group));
        }

        if pos >= len {
            break;
        }
        cursor = pos;
    }

    parts
}

/// Produces the bytes to write in place of the current search hit.
///
/// For non-regex searches `replacement` is returned borrowed and unchanged.
/// For regex searches the pieces in `parsed_replacements` are concatenated in
/// order: text pieces are copied verbatim, and group pieces are replaced by the
/// buffer contents of the range the regex reports for that group. Groups for
/// which the regex reports no range contribute nothing.
fn find_fill_replacement<'a>(
    &self,
    search: &mut ActiveSearch,
    replacement: &'a [u8],
    parsed_replacements: &[RegexReplacement],
) -> Cow<'a, [u8]> {
    if !search.options.use_regex {
        return Cow::Borrowed(replacement);
    }

    let mut expanded = Vec::new();

    for part in parsed_replacements {
        match part {
            RegexReplacement::Group(index) => {
                if let Some(span) = search.regex.group(*index) {
                    self.buffer.extract_raw(span, &mut expanded, usize::MAX);
                }
            }
            RegexReplacement::Text(literal) => expanded.extend_from_slice(literal),
        }
    }

    Cow::Owned(expanded)
}

fn measurement_config(&self) -> MeasurementConfig<'_> {
MeasurementConfig::new(&self.buffer)
.with_word_wrap_column(self.word_wrap_column)
41 changes: 31 additions & 10 deletions src/icu.rs
Original file line number Diff line number Diff line change
@@ -677,6 +677,31 @@ impl Regex {
let mut status = icu_ffi::U_ZERO_ERROR;
unsafe { (f.uregex_reset64)(self.0, offset as i64, &mut status) };
}

/// Returns the value ICU's `uregex_groupCount` reports for this regex,
/// or 0 if ICU signals a failure.
pub fn group_count(&mut self) -> i32 {
    let icu = assume_loaded();
    let mut status = icu_ffi::U_ZERO_ERROR;

    let groups = unsafe { (icu.uregex_groupCount)(self.0, &mut status) };
    if status.is_failure() {
        return 0;
    }

    groups
}

/// Gets the text range of a captured group by index.
pub fn group(&mut self, group: i32) -> Option<Range<usize>> {
let f = assume_loaded();

let mut status = icu_ffi::U_ZERO_ERROR;
let start = unsafe { (f.uregex_start64)(self.0, group, &mut status) };
let end = unsafe { (f.uregex_end64)(self.0, group, &mut status) };
if status.is_failure() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lhecker can uregex_start64 fail too?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the nice property of ICU is that its functions short-circuit if you pass in an error status. So, if uregex_start64 fails, status will contain a failure, uregex_end64 will short-circuit and then we check it. Makes some of this code really neat.

None
} else {
let start = start.max(0);
let end = end.max(start);
Some(start as usize..end as usize)
}
}
}

impl Iterator for Regex {
@@ -691,15 +716,7 @@ impl Iterator for Regex {
return None;
}

let start = unsafe { (f.uregex_start64)(self.0, 0, &mut status) };
let end = unsafe { (f.uregex_end64)(self.0, 0, &mut status) };
if status.is_failure() {
return None;
}

let start = start.max(0);
let end = end.max(start);
Some(start as usize..end as usize)
self.group(0)
}
}

@@ -900,6 +917,7 @@ struct LibraryFunctions {
uregex_setUText: icu_ffi::uregex_setUText,
uregex_reset64: icu_ffi::uregex_reset64,
uregex_findNext: icu_ffi::uregex_findNext,
uregex_groupCount: icu_ffi::uregex_groupCount,
uregex_start64: icu_ffi::uregex_start64,
uregex_end64: icu_ffi::uregex_end64,
}
@@ -919,7 +937,7 @@ const LIBICUUC_PROC_NAMES: [&CStr; 10] = [
];

// Found in libicui18n.so on UNIX, icuin.dll/icu.dll on Windows.
const LIBICUI18N_PROC_NAMES: [&CStr; 10] = [
const LIBICUI18N_PROC_NAMES: [&CStr; 11] = [
c"ucol_open",
c"ucol_strcollUTF8",
c"uregex_open",
@@ -928,6 +946,7 @@ const LIBICUI18N_PROC_NAMES: [&CStr; 10] = [
c"uregex_setUText",
c"uregex_reset64",
c"uregex_findNext",
c"uregex_groupCount",
c"uregex_start64",
c"uregex_end64",
];
@@ -1277,6 +1296,8 @@ mod icu_ffi {
unsafe extern "C" fn(regexp: *mut URegularExpression, index: i64, status: &mut UErrorCode);
pub type uregex_findNext =
unsafe extern "C" fn(regexp: *mut URegularExpression, status: &mut UErrorCode) -> bool;
pub type uregex_groupCount =
unsafe extern "C" fn(regexp: *mut URegularExpression, status: &mut UErrorCode) -> i32;
pub type uregex_start64 = unsafe extern "C" fn(
regexp: *mut URegularExpression,
group_num: i32,