From 125c0e6ad739585e0ce64678e5030aad9cfc114d Mon Sep 17 00:00:00 2001
From: Canop <cano.petrole@gmail.com>
Date: Sun, 5 Nov 2023 20:32:25 +0100
Subject: [PATCH] add bytes_ prefixed macros for building bytes::Regex

Fix #30
---
 Cargo.toml                    |   2 +-
 examples/regexes/src/main.rs  |  20 ++++
 src/lib.rs                    |  29 ++++-
 src/proc_macros/mod.rs        | 218 ++++++++++++++++++++++++++++++++--
 src/proc_macros/regex_code.rs |  19 +--
 5 files changed, 257 insertions(+), 31 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 603b508..46716d3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ rust-version = "1.56"
 
 [dependencies]
 once_cell = "1.17"
-regex = {version = "1.9", default_features = false, features = ["std"], optional = true}
+regex = {version = "1.9", default_features = false, optional = true}
 regex-lite = {version = "0.1", optional = true}
 
 [dependencies.lazy-regex-proc_macros]
diff --git a/examples/regexes/src/main.rs b/examples/regexes/src/main.rs
index f5e7e15..665d06d 100644
--- a/examples/regexes/src/main.rs
+++ b/examples/regexes/src/main.rs
@@ -21,6 +21,26 @@ fn example_builds() {
 
     // Try to uncomment the following line to see the compilation error
     // let r = regex!("(unclosed");
+
+    // build a bytes::Regex with the bytes_regex! macro
+    let rb = bytes_regex!("b+");
+    assert!(rb.is_match(b"abcd"));
+    let rb = bytes_regex!("sa+b?$"i);
+    assert_eq!(rb.is_match(b"Saa"), true);
+
+    // build a bytes::Regex with the regex! macro, using the `B` suffix flag
+    let rb = regex!("b+"B);
+    assert!(rb.is_match(b"abcd"));
+
+    // 4 equivalent ways to build a case insensitive bytes::Regex
+    let case_insensitive_regex = bytes_regex!("^ab+$"i);
+    assert!(case_insensitive_regex.is_match(b"abB"));
+    let case_insensitive_regex = bytes_regex!("(?i)^ab+$");
+    assert!(case_insensitive_regex.is_match(b"abB"));
+    let case_insensitive_regex = regex!("^ab+$"iB);
+    assert!(case_insensitive_regex.is_match(b"abB"));
+    let case_insensitive_regex = regex!("(?i)^ab+$"B);
+    assert!(case_insensitive_regex.is_match(b"abB"));
 }
 
 fn example_is_match() {
diff --git a/src/lib.rs b/src/lib.rs
index ce7b25d..ff7917a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,7 +9,7 @@ With lazy-regex macros, regular expressions
 
 The [regex!] macro returns references to normal instances of [regex::Regex] or [regex::bytes::Regex] so all the usual features are available.
 
-Other macros are specialized for testing a match, replacing with concise closures, or capturing groups as substrings in some common situations:
+Most often, though, you won't use the `regex!` macro directly but rather the other macros, which are specialized for testing a match, replacing, or capturing groups in some common situations:
 
 * [regex_is_match!]
 * [regex_find!]
@@ -19,6 +19,8 @@ Other macros are specialized for testing a match, replacing with concise closure
 
 All of them support the `B` flag for the `regex::bytes::Regex` variant.
 
+All macros exist with a `bytes_` prefix for building `bytes::Regex`, so you also have [bytes_regex!], [bytes_lazy_regex!], [bytes_regex_is_match!], [bytes_regex_find!], [bytes_regex_captures!], [bytes_regex_replace!], and [bytes_regex_replace_all!].
+
 Some structs of the regex crate are reexported to ease dependency managment.
 
 # Build Regexes
@@ -61,17 +63,26 @@ assert_eq!(r.find("This is lazy_regex-2.2!").unwrap().as_str(), "lazy_regex-2.2"
 let r = regex!("(unclosed");
 
 ```
-Supported regex flags: `i`, `m`, `s`, `x`, `U`.
+Supported regex flags: [`i`, `m`, `s`, `x`, `U`][regex::RegexBuilder], and you may also use `B` to build a bytes regex.
+
+The following regexes are equivalent:
+* `bytes_regex!("^ab+$"i)`
+* `bytes_regex!("(?i)^ab+$")`
+* `regex!("^ab+$"iB)`
+* `regex!("(?i)^ab+$"B)`
+
+They're all case insensitive instances of `regex::bytes::Regex`.
 
-See [regex::RegexBuilder].
 
 # Test a match
 
 ```rust
-use lazy_regex::regex_is_match;
+use lazy_regex::*;
 
 let b = regex_is_match!("[ab]+", "car");
 assert_eq!(b, true);
+let b = bytes_regex_is_match!("[ab]+", b"car");
+assert_eq!(b, true);
 ```
 
 doc: [regex_is_match!]
@@ -168,12 +179,20 @@ doc: [lazy_regex!]
 
 pub use {
     lazy_regex_proc_macros::{
-        lazy_regex, regex,
+        lazy_regex,
+        regex,
         regex_captures,
         regex_find,
         regex_is_match,
         regex_replace,
         regex_replace_all,
+        bytes_lazy_regex,
+        bytes_regex,
+        bytes_regex_captures,
+        bytes_regex_find,
+        bytes_regex_is_match,
+        bytes_regex_replace,
+        bytes_regex_replace_all,
     },
     once_cell::sync::Lazy,
 };
diff --git a/src/proc_macros/mod.rs b/src/proc_macros/mod.rs
index 20e9aac..ccaabc4 100644
--- a/src/proc_macros/mod.rs
+++ b/src/proc_macros/mod.rs
@@ -5,30 +5,29 @@ use {
     crate::{args::*, regex_code::*},
     proc_macro::TokenStream,
     quote::quote,
-    std::convert::TryFrom,
     syn::{parse_macro_input, Expr},
 };
 
 //  The following `process*` functions are convenience funcs
 //  to reduce boilerplate in macro implementations below.
-fn process<T, F>(input: TokenStream, f: F) -> TokenStream
+fn process<T, F>(input: TokenStream, as_bytes: bool, f: F) -> TokenStream
 where
     T: Into<TokenStream>,
     F: Fn(RegexCode) -> T,
 {
-    match RegexCode::try_from(input) {
+    match RegexCode::from_token_stream(input, as_bytes) {
         Ok(r) => f(r).into(),
         Err(e) => e.to_compile_error().into(),
     }
 }
 
-fn process_with_value<T, F>(input: TokenStream, f: F) -> TokenStream
+fn process_with_value<T, F>(input: TokenStream, as_bytes: bool, f: F) -> TokenStream
 where
     T: Into<TokenStream>,
     F: Fn(RegexCode, Expr) -> T,
 {
     let parsed = parse_macro_input!(input as RexValArgs);
-    match RegexCode::try_from(parsed.regex_str) {
+    match RegexCode::from_lit_str(parsed.regex_str, as_bytes) {
         Ok(r) => f(r, parsed.value).into(),
         Err(e) => e.to_compile_error().into(),
     }
@@ -52,7 +51,20 @@ where
 /// ```
 #[proc_macro]
 pub fn regex(input: TokenStream) -> TokenStream {
-    process(input, |regex_code| regex_code.lazy_static())
+    process(input, false, |regex_code| regex_code.lazy_static())
+}
+
+/// Return a lazy static `regex::bytes::Regex` checked at compilation time and
+/// built at first use.
+///
+/// Flags can be specified as suffix:
+/// ```
+/// let case_insensitive_regex = bytes_regex!("^ab+$"i);
+/// assert!(case_insensitive_regex.is_match(b"abB"));
+/// ```
+#[proc_macro]
+pub fn bytes_regex(input: TokenStream) -> TokenStream {
+    process(input, true, |regex_code| regex_code.lazy_static())
 }
 
 /// Return an instance of `once_cell::sync::Lazy<regex::Regex>` or
@@ -68,7 +80,22 @@ pub fn regex(input: TokenStream) -> TokenStream {
 /// As for other macros, the regex is checked at compilation time.
 #[proc_macro]
 pub fn lazy_regex(input: TokenStream) -> TokenStream {
-    process(input, |regex_code| regex_code.build)
+    process(input, false, |regex_code| regex_code.build)
+}
+
+/// Return an instance of `once_cell::sync::Lazy<bytes::Regex>` that
+/// you can use in a public static declaration.
+///
+/// Example:
+///
+/// ```
+/// pub static GLOBAL_REX: Lazy<bytes::Regex> = bytes_lazy_regex!("^ab+$"i);
+/// ```
+///
+/// As for other macros, the regex is checked at compilation time.
+#[proc_macro]
+pub fn bytes_lazy_regex(input: TokenStream) -> TokenStream {
+    process(input, true, |regex_code| regex_code.build)
 }
 
 /// Test whether an expression matches a lazy static
@@ -82,7 +109,27 @@ pub fn lazy_regex(input: TokenStream) -> TokenStream {
 /// ```
 #[proc_macro]
 pub fn regex_is_match(input: TokenStream) -> TokenStream {
-    process_with_value(input, |regex_code, value| {
+    process_with_value(input, false, |regex_code, value| {
+        let statick = regex_code.statick();
+        quote! {{
+            #statick;
+            RE.is_match(#value)
+        }}
+    })
+}
+
+/// Test whether an expression matches a lazy static
+/// bytes::Regex regular expression (the regex is checked
+/// at compile time)
+///
+/// Example:
+/// ```
+/// let b = bytes_regex_is_match!("[ab]+", b"car");
+/// assert_eq!(b, true);
+/// ```
+#[proc_macro]
+pub fn bytes_regex_is_match(input: TokenStream) -> TokenStream {
+    process_with_value(input, true, |regex_code, value| {
         let statick = regex_code.statick();
         quote! {{
             #statick;
@@ -103,7 +150,30 @@ pub fn regex_is_match(input: TokenStream) -> TokenStream {
 /// ```
 #[proc_macro]
 pub fn regex_find(input: TokenStream) -> TokenStream {
-    process_with_value(input, |regex_code, value| {
+    process_with_value(input, false, |regex_code, value| {
+        let statick = regex_code.statick();
+        let as_method = match regex_code.regex {
+            RegexInstance::Regex(..) => quote!(as_str),
+            RegexInstance::Bytes(..) => quote!(as_bytes),
+        };
+        quote! {{
+            #statick;
+            RE.find(#value).map(|mat| mat. #as_method ())
+        }}
+    })
+}
+
+/// Extract the leftmost match of the regex in the
+/// second argument as a `&[u8]`
+///
+/// Example:
+/// ```
+/// let f_word = bytes_regex_find!(r#"\bf\w+\b"#, b"The fox jumps.");
+/// assert_eq!(f_word, Some("fox".as_bytes()));
+/// ```
+#[proc_macro]
+pub fn bytes_regex_find(input: TokenStream) -> TokenStream {
+    process_with_value(input, true, |regex_code, value| {
         let statick = regex_code.statick();
         let as_method = match regex_code.regex {
             RegexInstance::Regex(..) => quote!(as_str),
@@ -135,7 +205,7 @@ pub fn regex_find(input: TokenStream) -> TokenStream {
 /// ```
 #[proc_macro]
 pub fn regex_captures(input: TokenStream) -> TokenStream {
-    process_with_value(input, |regex_code, value| {
+    process_with_value(input, false, |regex_code, value| {
         let statick = regex_code.statick();
         let n = regex_code.captures_len();
         let groups = (0..n).map(|i| {
@@ -153,11 +223,48 @@ pub fn regex_captures(input: TokenStream) -> TokenStream {
     })
 }
 
+/// Extract captured groups as a tuple of `&[u8]`
+///
+/// If there's no match, the macro returns `None`.
+///
+/// If an optional group has no value, the tuple
+/// will contain `b""` instead.
+///
+/// Example:
+/// ```
+/// let (whole, name, version) = bytes_regex_captures!(
+///     r#"(\w+)-([0-9.]+)"#, // a literal regex
+///     b"This is lazy_regex-2.0!", // any expression
+/// ).unwrap();
+/// assert_eq!(whole, b"lazy_regex-2.0");
+/// assert_eq!(name, b"lazy_regex");
+/// assert_eq!(version, "2.0".as_bytes());
+/// ```
+#[proc_macro]
+pub fn bytes_regex_captures(input: TokenStream) -> TokenStream {
+    process_with_value(input, true, |regex_code, value| {
+        let statick = regex_code.statick();
+        let n = regex_code.captures_len();
+        let groups = (0..n).map(|i| {
+            quote! {
+                caps.get(#i).map_or(&b""[..], |c| c.as_bytes())
+            }
+        });
+        quote! {{
+            #statick;
+            RE.captures(#value)
+                .map(|caps| (
+                    #(#groups),*
+                ))
+        }}
+    })
+}
+
 /// common implementation of regex_replace and regex_replace_all
 fn replacen(input: TokenStream, limit: usize) -> TokenStream {
     let parsed = parse_macro_input!(input as ReplaceArgs);
     let ReplaceArgs { regex_str, value, replacer } = parsed;
-    let regex_code = match RegexCode::try_from(regex_str) {
+    let regex_code = match RegexCode::from_lit_str(regex_str, false) {
         Ok(r) => r,
         Err(e) => {
             return e.to_compile_error().into();
@@ -195,6 +302,48 @@ fn replacen(input: TokenStream, limit: usize) -> TokenStream {
     stream.into()
 }
 
+/// common implementation of bytes_regex_replace and bytes_regex_replace_all
+fn bytes_replacen(input: TokenStream, limit: usize) -> TokenStream {
+    let parsed = parse_macro_input!(input as ReplaceArgs);
+    let ReplaceArgs { regex_str, value, replacer } = parsed;
+    let regex_code = match RegexCode::from_lit_str(regex_str, true) {
+        Ok(r) => r,
+        Err(e) => {
+            return e.to_compile_error().into();
+        }
+    };
+    let statick = regex_code.statick();
+    let stream = match replacer {
+        MaybeFun::Fun(fun) => {
+            let n = regex_code.captures_len();
+            let groups = (0..n).map(|i| {
+                quote! {
+                    caps.get(#i).map_or(&b""[..], |c| c.as_bytes())
+                }
+            });
+            quote! {{
+                #statick;
+                RE.replacen(
+                    #value,
+                    #limit,
+                    |caps: &lazy_regex::regex::bytes::Captures<'_>| {
+                        let mut fun = #fun;
+                        fun(
+                            #(#groups),*
+                        )
+                    })
+            }}
+        }
+        MaybeFun::Expr(expr) => {
+            quote! {{
+                #statick;
+                RE.replacen(#value, #limit, #expr)
+            }}
+        }
+    };
+    stream.into()
+}
+
 /// Replaces the leftmost match in the second argument
 /// using the replacer given as third argument.
 ///
@@ -218,6 +367,30 @@ pub fn regex_replace(input: TokenStream) -> TokenStream {
     replacen(input, 1)
 }
 
+/// Replaces the leftmost match in the second argument
+/// using the replacer given as third argument.
+///
+/// When the replacer is a closure, it is given one or more `&[u8]`,
+/// the first one for the whole match and the following ones for
+/// the groups.
+/// Any optional group with no value is replaced with `b""`.
+///
+/// Example:
+/// ```
+/// println!("{:?}", "ck ck".as_bytes());
+/// let text = b"Fuu fuuu";
+/// let text = bytes_regex_replace!(
+///     "f(u*)"i,
+///     text,
+///     b"ck",
+/// );
+/// assert_eq!(text, "ck fuuu".as_bytes());
+/// ```
+#[proc_macro]
+pub fn bytes_regex_replace(input: TokenStream) -> TokenStream {
+    bytes_replacen(input, 1)
+}
+
 /// Replaces all non-overlapping matches in the second argument
 /// using the replacer given as third argument.
 ///
@@ -240,3 +413,26 @@ pub fn regex_replace(input: TokenStream) -> TokenStream {
 pub fn regex_replace_all(input: TokenStream) -> TokenStream {
     replacen(input, 0)
 }
+
+/// Replaces all non-overlapping matches in the second argument
+/// using the replacer given as third argument.
+///
+/// When the replacer is a closure, it is given one or more `&[u8]`,
+/// the first one for the whole match and the following ones for
+/// the groups.
+/// Any optional group with no value is replaced with `b""`.
+///
+/// Example:
+/// ```
+/// let text = b"Foo fuu";
+/// let text = bytes_regex_replace_all!(
+///     r#"\bf(?P<suffix>\w+)"#i,
+///     text,
+///     b"H",
+/// );
+/// assert_eq!(text, "H H".as_bytes());
+/// ```
+#[proc_macro]
+pub fn bytes_regex_replace_all(input: TokenStream) -> TokenStream {
+    bytes_replacen(input, 0)
+}
diff --git a/src/proc_macros/regex_code.rs b/src/proc_macros/regex_code.rs
index 7658656..917cdff 100644
--- a/src/proc_macros/regex_code.rs
+++ b/src/proc_macros/regex_code.rs
@@ -2,7 +2,6 @@ use {
     proc_macro::TokenStream,
     proc_macro2::TokenStream as TokenStream2,
     quote::quote,
-    std::convert::TryFrom,
     syn::LitStr,
 };
 
@@ -18,17 +17,17 @@ pub(crate) enum RegexInstance {
     Bytes(regex::bytes::Regex),
 }
 
-impl TryFrom<LitStr> for RegexCode {
-    type Error = syn::Error;
-
-    fn try_from(lit_str: LitStr) -> Result<Self, Self::Error> {
+impl RegexCode {
+    pub fn from_token_stream(token_stream: TokenStream, is_bytes: bool) -> Result<Self, syn::Error> {
+        Self::from_lit_str(syn::parse::<syn::LitStr>(token_stream)?, is_bytes)
+    }
+    pub fn from_lit_str(lit_str: LitStr, mut is_bytes: bool) -> Result<Self, syn::Error> {
         let pattern = lit_str.value();
         let mut case_insensitive = false;
         let mut multi_line = false;
         let mut dot_matches_new_line = false;
         let mut ignore_whitespace = false;
         let mut swap_greed = false;
-        let mut is_bytes = false;
         for (i, ch) in lit_str.suffix().chars().enumerate() {
             match ch {
                 'i' => case_insensitive = true,
@@ -78,14 +77,6 @@ impl TryFrom<LitStr> for RegexCode {
     }
 }
 
-impl TryFrom<TokenStream> for RegexCode {
-    type Error = syn::Error;
-
-    fn try_from(token_stream: TokenStream) -> Result<Self, Self::Error> {
-        Self::try_from(syn::parse::<syn::LitStr>(token_stream)?)
-    }
-}
-
 impl RegexCode {
     pub fn statick(&self) -> TokenStream2 {
         let build = &self.build;