From 125c0e6ad739585e0ce64678e5030aad9cfc114d Mon Sep 17 00:00:00 2001 From: Canop Date: Sun, 5 Nov 2023 20:32:25 +0100 Subject: [PATCH] add bytes_ prefixed macros for building bytes::Regex Fix #30 --- Cargo.toml | 2 +- examples/regexes/src/main.rs | 20 ++++ src/lib.rs | 29 ++++- src/proc_macros/mod.rs | 218 ++++++++++++++++++++++++++++++++-- src/proc_macros/regex_code.rs | 19 +-- 5 files changed, 257 insertions(+), 31 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 603b508..46716d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ rust-version = "1.56" [dependencies] once_cell = "1.17" -regex = {version = "1.9", default_features = false, features = ["std"], optional = true} +regex = {version = "1.9", default_features = false, optional = true} regex-lite = {version = "0.1", optional = true} [dependencies.lazy-regex-proc_macros] diff --git a/examples/regexes/src/main.rs b/examples/regexes/src/main.rs index f5e7e15..665d06d 100644 --- a/examples/regexes/src/main.rs +++ b/examples/regexes/src/main.rs @@ -21,6 +21,26 @@ fn example_builds() { // Try to uncomment the following line to see the compilation error // let r = regex!("(unclosed"); + + // build a bytes::Regex macro + let rb = bytes_regex!("b+"); + assert!(rb.is_match(b"abcd")); + let rb = bytes_regex!("sa+b?$"i); + assert_eq!(rb.is_match(b"Saa"), true); + + // build a bytes::Regex macro using the suffix syntax + let rb = regex!("b+"B); + assert!(rb.is_match(b"abcd")); + + // 4 equivalent ways to build a case insensitive bytes::Regex + let case_insensitive_regex = bytes_regex!("^ab+$"i); + assert!(case_insensitive_regex.is_match(b"abB")); + let case_insensitive_regex = bytes_regex!("(?i)^ab+$"); + assert!(case_insensitive_regex.is_match(b"abB")); + let case_insensitive_regex = regex!("^ab+$"iB); + assert!(case_insensitive_regex.is_match(b"abB")); + let case_insensitive_regex = regex!("(?i)^ab+$"B); + assert!(case_insensitive_regex.is_match(b"abB")); } fn example_is_match() { diff --git 
a/src/lib.rs b/src/lib.rs index ce7b25d..ff7917a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,7 +9,7 @@ With lazy-regex macros, regular expressions The [regex!] macro returns references to normal instances of [regex::Regex] or [regex::bytes::Regex] so all the usual features are available. -Other macros are specialized for testing a match, replacing with concise closures, or capturing groups as substrings in some common situations: +But most often, you won't even use the `regex!` macro but the other macros which are specialized for testing a match, replacing, or capturing groups in some common situations: * [regex_is_match!] * [regex_find!] @@ -19,6 +19,8 @@ Other macros are specialized for testing a match, replacing with concise closure All of them support the `B` flag for the `regex::bytes::Regex` variant. +All macros exist with a `bytes_` prefix for building `bytes::Regex`, so you also have [bytes_regex!], [bytes_regex_is_match!], [bytes_regex_find!], [bytes_regex_captures!], [bytes_regex_replace!], and [bytes_regex_replace_all!]. + Some structs of the regex crate are reexported to ease dependency managment. # Build Regexes @@ -61,17 +63,26 @@ assert_eq!(r.find("This is lazy_regex-2.2!").unwrap().as_str(), "lazy_regex-2.2" let r = regex!("(unclosed"); ``` -Supported regex flags: `i`, `m`, `s`, `x`, `U`. +Supported regex flags: [`i`, `m`, `s`, `x`, `U`][regex::RegexBuilder], and you may also use `B` to build a bytes regex. + +The following regexes are equivalent: +* `bytes_regex!("^ab+$"i)` +* `bytes_regex!("(?i)^ab+$")` +* `regex!("^ab+$"iB)` +* `regex!("(?i)^ab+$"B)` + +They're all case insensitive instances of `regex::bytes::Regex`. -See [regex::RegexBuilder]. # Test a match ```rust -use lazy_regex::regex_is_match; +use lazy_regex::*; let b = regex_is_match!("[ab]+", "car"); assert_eq!(b, true); +let b = bytes_regex_is_match!("[ab]+", b"car"); +assert_eq!(b, true); ``` doc: [regex_is_match!] @@ -168,12 +179,20 @@ doc: [lazy_regex!] 
pub use { lazy_regex_proc_macros::{ - lazy_regex, regex, + lazy_regex, + regex, regex_captures, regex_find, regex_is_match, regex_replace, regex_replace_all, + bytes_lazy_regex, + bytes_regex, + bytes_regex_captures, + bytes_regex_find, + bytes_regex_is_match, + bytes_regex_replace, + bytes_regex_replace_all, }, once_cell::sync::Lazy, }; diff --git a/src/proc_macros/mod.rs b/src/proc_macros/mod.rs index 20e9aac..ccaabc4 100644 --- a/src/proc_macros/mod.rs +++ b/src/proc_macros/mod.rs @@ -5,30 +5,29 @@ use { crate::{args::*, regex_code::*}, proc_macro::TokenStream, quote::quote, - std::convert::TryFrom, syn::{parse_macro_input, Expr}, }; // The following `process*` functions are convenience funcs // to reduce boilerplate in macro implementations below. -fn process<T, F>(input: TokenStream, f: F) -> TokenStream +fn process<T, F>(input: TokenStream, as_bytes: bool, f: F) -> TokenStream where T: Into<TokenStream>, F: Fn(RegexCode) -> T, { - match RegexCode::try_from(input) { + match RegexCode::from_token_stream(input, as_bytes) { Ok(r) => f(r).into(), Err(e) => e.to_compile_error().into(), } } -fn process_with_value<T, F>(input: TokenStream, f: F) -> TokenStream +fn process_with_value<T, F>(input: TokenStream, as_bytes: bool, f: F) -> TokenStream where T: Into<TokenStream>, F: Fn(RegexCode, Expr) -> T, { let parsed = parse_macro_input!(input as RexValArgs); - match RegexCode::try_from(parsed.regex_str) { + match RegexCode::from_lit_str(parsed.regex_str, as_bytes) { Ok(r) => f(r, parsed.value).into(), Err(e) => e.to_compile_error().into(), } @@ -52,7 +51,20 @@ where /// ``` #[proc_macro] pub fn regex(input: TokenStream) -> TokenStream { - process(input, |regex_code| regex_code.lazy_static()) + process(input, false, |regex_code| regex_code.lazy_static()) +} + +/// Return a lazy static `regex::bytes::Regex` checked at compilation time and +/// built at first use. 
+/// +/// Flags can be specified as suffix: +/// ``` +/// let case_insensitive_regex = bytes_regex!("^ab+$"i); +/// assert!(case_insensitive_regex.is_match(b"abB")); +/// ``` +#[proc_macro] +pub fn bytes_regex(input: TokenStream) -> TokenStream { + process(input, true, |regex_code| regex_code.lazy_static()) } /// Return an instance of `once_cell::sync::Lazy` or @@ -68,7 +80,22 @@ pub fn regex(input: TokenStream) -> TokenStream { /// As for other macros, the regex is checked at compilation time. #[proc_macro] pub fn lazy_regex(input: TokenStream) -> TokenStream { - process(input, |regex_code| regex_code.build) + process(input, false, |regex_code| regex_code.build) +} + +/// Return an instance of `once_cell::sync::Lazy<bytes::Regex>` that +/// you can use in a public static declaration. +/// +/// Example: +/// +/// ``` +/// pub static GLOBAL_REX: Lazy<bytes::Regex> = bytes_lazy_regex!("^ab+$"i); +/// ``` +/// +/// As for other macros, the regex is checked at compilation time. +#[proc_macro] +pub fn bytes_lazy_regex(input: TokenStream) -> TokenStream { + process(input, true, |regex_code| regex_code.build) } /// Test whether an expression matches a lazy static @@ -82,7 +109,27 @@ pub fn lazy_regex(input: TokenStream) -> TokenStream { /// ``` #[proc_macro] pub fn regex_is_match(input: TokenStream) -> TokenStream { - process_with_value(input, |regex_code, value| { + process_with_value(input, false, |regex_code, value| { + let statick = regex_code.statick(); + quote! {{ + #statick; + RE.is_match(#value) + }} + }) +} + +/// Test whether an expression matches a lazy static +/// bytes::Regex regular expression (the regex is checked +/// at compile time) +/// +/// Example: +/// ``` +/// let b = bytes_regex_is_match!("[ab]+", b"car"); +/// assert_eq!(b, true); +/// ``` +#[proc_macro] +pub fn bytes_regex_is_match(input: TokenStream) -> TokenStream { + process_with_value(input, true, |regex_code, value| { let statick = regex_code.statick(); quote! 
{{ #statick; @@ -103,7 +150,30 @@ pub fn regex_is_match(input: TokenStream) -> TokenStream { /// ``` #[proc_macro] pub fn regex_find(input: TokenStream) -> TokenStream { - process_with_value(input, |regex_code, value| { + process_with_value(input, false, |regex_code, value| { + let statick = regex_code.statick(); + let as_method = match regex_code.regex { + RegexInstance::Regex(..) => quote!(as_str), + RegexInstance::Bytes(..) => quote!(as_bytes), + }; + quote! {{ + #statick; + RE.find(#value).map(|mat| mat. #as_method ()) + }} + }) +} + +/// Extract the leftmost match of the regex in the +/// second argument as a `&[u8]` +/// +/// Example: +/// ``` +/// let f_word = bytes_regex_find!(r#"\bf\w+\b"#, b"The fox jumps."); +/// assert_eq!(f_word, Some("fox".as_bytes())); +/// ``` +#[proc_macro] +pub fn bytes_regex_find(input: TokenStream) -> TokenStream { + process_with_value(input, true, |regex_code, value| { let statick = regex_code.statick(); let as_method = match regex_code.regex { RegexInstance::Regex(..) => quote!(as_str), @@ -135,7 +205,7 @@ pub fn regex_find(input: TokenStream) -> TokenStream { /// ``` #[proc_macro] pub fn regex_captures(input: TokenStream) -> TokenStream { - process_with_value(input, |regex_code, value| { + process_with_value(input, false, |regex_code, value| { let statick = regex_code.statick(); let n = regex_code.captures_len(); let groups = (0..n).map(|i| { @@ -153,11 +223,48 @@ pub fn regex_captures(input: TokenStream) -> TokenStream { }) } +/// Extract captured groups as a tuple of &[u8] +/// +/// If there's no match, the macro returns `None`. +/// +/// If an optional group has no value, the tuple +/// will contain `b""` instead. 
+/// +/// Example: +/// ``` +/// let (whole, name, version) = bytes_regex_captures!( +/// r#"(\w+)-([0-9.]+)"#, // a literal regex +/// b"This is lazy_regex-2.0!", // any expression +/// ).unwrap(); +/// assert_eq!(whole, b"lazy_regex-2.0"); +/// assert_eq!(name, b"lazy_regex"); +/// assert_eq!(version, "2.0".as_bytes()); +/// ``` +#[proc_macro] +pub fn bytes_regex_captures(input: TokenStream) -> TokenStream { + process_with_value(input, true, |regex_code, value| { + let statick = regex_code.statick(); + let n = regex_code.captures_len(); + let groups = (0..n).map(|i| { + quote! { + caps.get(#i).map_or(&b""[..], |c| c.as_bytes()) + } + }); + quote! {{ + #statick; + RE.captures(#value) + .map(|caps| ( + #(#groups),* + )) + }} + }) +} + /// common implementation of regex_replace and regex_replace_all fn replacen(input: TokenStream, limit: usize) -> TokenStream { let parsed = parse_macro_input!(input as ReplaceArgs); let ReplaceArgs { regex_str, value, replacer } = parsed; - let regex_code = match RegexCode::try_from(regex_str) { + let regex_code = match RegexCode::from_lit_str(regex_str, false) { Ok(r) => r, Err(e) => { return e.to_compile_error().into(); @@ -195,6 +302,48 @@ fn replacen(input: TokenStream, limit: usize) -> TokenStream { stream.into() } +/// common implementation of bytes_regex_replace and bytes_regex_replace_all +fn bytes_replacen(input: TokenStream, limit: usize) -> TokenStream { + let parsed = parse_macro_input!(input as ReplaceArgs); + let ReplaceArgs { regex_str, value, replacer } = parsed; + let regex_code = match RegexCode::from_lit_str(regex_str, true) { + Ok(r) => r, + Err(e) => { + return e.to_compile_error().into(); + } + }; + let statick = regex_code.statick(); + let stream = match replacer { + MaybeFun::Fun(fun) => { + let n = regex_code.captures_len(); + let groups = (0..n).map(|i| { + quote! { + caps.get(#i).map_or(&b""[..], |c| c.as_bytes()) + } + }); + quote! 
{{ + #statick; + RE.replacen( + #value, + #limit, + |caps: &lazy_regex::regex::bytes::Captures<'_>| { + let mut fun = #fun; + fun( + #(#groups),* + ) + }) + }} + } + MaybeFun::Expr(expr) => { + quote! {{ + #statick; + RE.replacen(#value, #limit, #expr) + }} + } + }; + stream.into() +} + /// Replaces the leftmost match in the second argument /// using the replacer given as third argument. /// @@ -218,6 +367,30 @@ pub fn regex_replace(input: TokenStream) -> TokenStream { replacen(input, 1) } +/// Replaces the leftmost match in the second argument +/// using the replacer given as third argument. +/// +/// When the replacer is a closure, it is given one or more `&[u8]`, +/// the first one for the whole match and the following ones for +/// the groups. +/// Any optional group with no value is replaced with `b""`. +/// +/// Example: +/// ``` +/// println!("{:?}", "ck ck".as_bytes()); +/// let text = b"Fuu fuuu"; +/// let text = bytes_regex_replace!( +/// "f(u*)"i, +/// text, +/// b"ck", +/// ); +/// assert_eq!(text, "ck fuuu".as_bytes()); +/// ``` +#[proc_macro] +pub fn bytes_regex_replace(input: TokenStream) -> TokenStream { + bytes_replacen(input, 1) +} + /// Replaces all non-overlapping matches in the second argument /// using the replacer given as third argument. /// @@ -240,3 +413,26 @@ pub fn regex_replace(input: TokenStream) -> TokenStream { pub fn regex_replace_all(input: TokenStream) -> TokenStream { replacen(input, 0) } + +/// Replaces all non-overlapping matches in the second argument +/// using the replacer given as third argument. +/// +/// When the replacer is a closure, it is given one or more `&[u8]`, +/// the first one for the whole match and the following ones for +/// the groups. +/// Any optional group with no value is replaced with `b""`. 
+/// +/// Example: +/// ``` +/// let text = b"Foo fuu"; +/// let text = bytes_regex_replace_all!( +/// r#"\bf(?P<suffix>\w+)"#i, +/// text, +/// b"H", +/// ); +/// assert_eq!(text, "H H".as_bytes()); +/// ``` +#[proc_macro] +pub fn bytes_regex_replace_all(input: TokenStream) -> TokenStream { + bytes_replacen(input, 0) +} diff --git a/src/proc_macros/regex_code.rs b/src/proc_macros/regex_code.rs index 7658656..917cdff 100644 --- a/src/proc_macros/regex_code.rs +++ b/src/proc_macros/regex_code.rs @@ -2,7 +2,6 @@ use { proc_macro::TokenStream, proc_macro2::TokenStream as TokenStream2, quote::quote, - std::convert::TryFrom, syn::LitStr, }; @@ -18,17 +17,17 @@ pub(crate) enum RegexInstance { Bytes(regex::bytes::Regex), } -impl TryFrom<LitStr> for RegexCode { - type Error = syn::Error; - - fn try_from(lit_str: LitStr) -> Result<Self, Self::Error> { +impl RegexCode { + pub fn from_token_stream(token_stream: TokenStream, is_bytes: bool) -> Result<Self, syn::Error> { + Self::from_lit_str(syn::parse::<syn::LitStr>(token_stream)?, is_bytes) + } + pub fn from_lit_str(lit_str: LitStr, mut is_bytes: bool) -> Result<Self, syn::Error> { let pattern = lit_str.value(); let mut case_insensitive = false; let mut multi_line = false; let mut dot_matches_new_line = false; let mut ignore_whitespace = false; let mut swap_greed = false; - let mut is_bytes = false; for (i, ch) in lit_str.suffix().chars().enumerate() { match ch { 'i' => case_insensitive = true, @@ -78,14 +77,6 @@ impl TryFrom<LitStr> for RegexCode { } } -impl TryFrom<TokenStream> for RegexCode { - type Error = syn::Error; - - fn try_from(token_stream: TokenStream) -> Result<Self, Self::Error> { - Self::try_from(syn::parse::<syn::LitStr>(token_stream)?) - } -} - impl RegexCode { pub fn statick(&self) -> TokenStream2 { let build = &self.build;