Skip to content

Commit

Permalink
ENH: Use explicit methods instead of regex pattern in arrow strings (pandas-dev#54006)
Browse files Browse the repository at this point in the history

* ENH: Use explicit methods instead of regex pattern in arrow strings

* Fixup

* Fix
  • Loading branch information
phofl authored and im-vinicius committed Jul 8, 2023
1 parent 22d21bc commit cace5b8
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 33 deletions.
29 changes: 16 additions & 13 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,28 +307,31 @@ def _str_contains(
return super()._str_contains(pat, case, flags, na, regex)

if regex:
if case is False:
fallback_performancewarning()
return super()._str_contains(pat, case, flags, na, regex)
else:
result = pc.match_substring_regex(self._pa_array, pat)
result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case)
else:
if case:
result = pc.match_substring(self._pa_array, pat)
else:
result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper())
result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
result = BooleanDtype().__from_arrow__(result)
if not isna(na):
result[isna(result)] = bool(na)
return result

def _str_startswith(self, pat: str, na=None):
# Element-wise test of whether each string starts with the literal `pat`;
# `na` is the value substituted for missing entries when it is itself non-missing.
# NOTE(review): this is a diff rendering whose +/- markers and indentation were
# lost in extraction, so removed and added lines appear back to back.
# Presumably the pre-commit (removed) implementation: escape `pat`, anchor it
# at the start with "^", and delegate to the regex-based _str_contains.
pat = f"^{re.escape(pat)}"
return self._str_contains(pat, na=na, regex=True)
# Presumably the post-commit (added) implementation: call the explicit
# starts_with kernel on `pc` (presumably pyarrow.compute) instead of a regex.
result = pc.starts_with(self._pa_array, pattern=pat)
# A non-missing `na` first replaces nulls on the kernel result ...
if not isna(na):
result = result.fill_null(na)
# Convert the kernel result through BooleanDtype (pandas' nullable boolean dtype).
result = BooleanDtype().__from_arrow__(result)
# ... and is then assigned again, coerced to bool, to any entry still missing.
if not isna(na):
result[isna(result)] = bool(na)
return result

def _str_endswith(self, pat: str, na=None):
# Element-wise test of whether each string ends with the literal `pat`;
# `na` is the value substituted for missing entries when it is itself non-missing.
# NOTE(review): this is a diff rendering whose +/- markers and indentation were
# lost in extraction, so removed and added lines appear back to back.
# Presumably the pre-commit (removed) implementation: escape `pat`, anchor it
# at the end with "$", and delegate to the regex-based _str_contains.
pat = f"{re.escape(pat)}$"
return self._str_contains(pat, na=na, regex=True)
# Presumably the post-commit (added) implementation: call the explicit
# ends_with kernel on `pc` (presumably pyarrow.compute) instead of a regex.
result = pc.ends_with(self._pa_array, pattern=pat)
# A non-missing `na` first replaces nulls on the kernel result ...
if not isna(na):
result = result.fill_null(na)
# Convert the kernel result through BooleanDtype (pandas' nullable boolean dtype).
result = BooleanDtype().__from_arrow__(result)
# ... and is then assigned again, coerced to bool, to any entry still missing.
if not isna(na):
result[isna(result)] = bool(na)
return result

def _str_replace(
self,
Expand Down
26 changes: 6 additions & 20 deletions pandas/tests/strings/test_find_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,8 @@ def test_contains(any_string_dtype):
np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object),
dtype=any_string_dtype,
)
with tm.maybe_produces_warning(
PerformanceWarning, any_string_dtype == "string[pyarrow]"
):
result = values.str.contains("FOO|mmm", case=False)

result = values.str.contains("FOO|mmm", case=False)
expected = Series(np.array([True, False, True, True]), dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -172,10 +170,7 @@ def test_contains_moar(any_string_dtype):
)
tm.assert_series_equal(result, expected)

with tm.maybe_produces_warning(
PerformanceWarning, any_string_dtype == "string[pyarrow]"
):
result = s.str.contains("a", case=False)
result = s.str.contains("a", case=False)
expected = Series(
[True, False, False, True, True, False, np.nan, True, False, True],
dtype=expected_dtype,
Expand All @@ -196,10 +191,7 @@ def test_contains_moar(any_string_dtype):
)
tm.assert_series_equal(result, expected)

with tm.maybe_produces_warning(
PerformanceWarning, any_string_dtype == "string[pyarrow]"
):
result = s.str.contains("ba", case=False)
result = s.str.contains("ba", case=False)
expected = Series(
[False, False, False, True, True, False, np.nan, True, False, False],
dtype=expected_dtype,
Expand Down Expand Up @@ -723,10 +715,7 @@ def test_match_na_kwarg(any_string_dtype):

def test_match_case_kwarg(any_string_dtype):
# Checks that str.match("ab", case=False) matches every value regardless of
# letter case, for each string dtype supplied through `any_string_dtype`.
# NOTE(review): this is a diff rendering whose +/- markers and indentation were
# lost in extraction, so removed and added lines appear back to back.
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
# Presumably the removed lines: the call was wrapped so that a
# PerformanceWarning was tolerated for the "string[pyarrow]" dtype.
with tm.maybe_produces_warning(
PerformanceWarning, any_string_dtype == "string[pyarrow]"
):
result = values.str.match("ab", case=False)
# Presumably the added line: the same call with no warning check around it.
result = values.str.match("ab", case=False)
# The object dtype is compared against a NumPy bool result; every other
# string dtype is compared against the "boolean" dtype.
expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean"
expected = Series([True, True, True, True], dtype=expected_dtype)
tm.assert_series_equal(result, expected)
Expand Down Expand Up @@ -769,10 +758,7 @@ def test_fullmatch_case_kwarg(any_string_dtype):

expected = Series([True, True, False, False], dtype=expected_dtype)

with tm.maybe_produces_warning(
PerformanceWarning, any_string_dtype == "string[pyarrow]"
):
result = ser.str.fullmatch("ab", case=False)
result = ser.str.fullmatch("ab", case=False)
tm.assert_series_equal(result, expected)

with tm.maybe_produces_warning(
Expand Down

0 comments on commit cace5b8

Please sign in to comment.