DEPR: Replacing builtin and NumPy funcs in agg/apply/transform (pandas-dev#53974)

* DEPR: Replacing builtin and NumPy funcs in agg/apply/transform

* mypy fixup
rhshadrach authored and im-vinicius committed Jul 8, 2023
1 parent cace5b8 commit 817ec46
Showing 44 changed files with 510 additions and 272 deletions.
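For orientation, every documentation change in this diff applies the same substitution: a NumPy or builtin callable passed to agg/apply/transform (or as ``aggfunc``) is replaced by its string alias. A minimal before/after sketch of the pattern, using an illustrative DataFrame rather than one taken from the docs:

import numpy as np
import pandas as pd

df = pd.DataFrame({"team": ["A", "A", "B"], "batting avg": [0.25, 0.30, 0.28]})

# Deprecated by this change: passing the NumPy callable
df.pivot_table(values="batting avg", columns="team", aggfunc=np.max)

# Preferred spelling: the equivalent string alias
df.pivot_table(values="batting avg", columns="team", aggfunc="max")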
6 changes: 3 additions & 3 deletions doc/source/getting_started/comparison/comparison_with_r.rst
@@ -246,7 +246,7 @@ In pandas we may use :meth:`~pandas.pivot_table` method to handle this:
}
)
baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max)
baseball.pivot_table(values="batting avg", columns="team", aggfunc="max")
For more details and examples see :ref:`the reshaping documentation
<reshaping.pivot>`.
@@ -359,7 +359,7 @@ In pandas the equivalent expression, using the
)
grouped = df.groupby(["month", "week"])
grouped["x"].agg([np.mean, np.std])
grouped["x"].agg(["mean", "std"])
For more details and examples see :ref:`the groupby documentation
@@ -482,7 +482,7 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`:
values="value",
index=["variable", "week"],
columns=["month"],
aggfunc=np.mean,
aggfunc="mean",
)
Similarly for ``dcast`` which uses a data.frame called ``df`` in R to
4 changes: 2 additions & 2 deletions doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -198,7 +198,7 @@ to your grouped DataFrame, indicating which functions to apply to specific colum
.. ipython:: python
tips.groupby("day").agg({"tip": np.mean, "day": np.size})
tips.groupby("day").agg({"tip": "mean", "day": "size"})
Grouping by more than one column is done by passing a list of columns to the
:meth:`~pandas.DataFrame.groupby` method.
@@ -222,7 +222,7 @@ Grouping by more than one column is done by passing a list of columns to the
.. ipython:: python
tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]})
tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})
.. _compare_with_sql.join:

6 changes: 3 additions & 3 deletions doc/source/user_guide/basics.rst
@@ -881,8 +881,8 @@ statistics methods, takes an optional ``axis`` argument:

.. ipython:: python
df.apply(np.mean)
df.apply(np.mean, axis=1)
df.apply(lambda x: np.mean(x))
df.apply(lambda x: np.mean(x), axis=1)
df.apply(lambda x: x.max() - x.min())
df.apply(np.cumsum)
df.apply(np.exp)
@@ -986,7 +986,7 @@ output:

.. ipython:: python
tsdf.agg(np.sum)
tsdf.agg(lambda x: np.sum(x))
tsdf.agg("sum")
6 changes: 3 additions & 3 deletions doc/source/user_guide/cookbook.rst
@@ -530,7 +530,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
code_groups = df.groupby("code")
agg_n_sort_order = code_groups[["data"]].transform(sum).sort_values(by="data")
agg_n_sort_order = code_groups[["data"]].transform("sum").sort_values(by="data")
sorted_df = df.loc[agg_n_sort_order.index]
@@ -549,7 +549,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
return x.iloc[1] * 1.234
return pd.NaT
mhc = {"Mean": np.mean, "Max": np.max, "Custom": MyCust}
mhc = {"Mean": "mean", "Max": "max", "Custom": MyCust}
ts.resample("5min").apply(mhc)
ts
@@ -685,7 +685,7 @@ The :ref:`Pivot <reshaping.pivot>` docs.
values=["Sales"],
index=["Province"],
columns=["City"],
aggfunc=np.sum,
aggfunc="sum",
margins=True,
)
table.stack("City")
10 changes: 5 additions & 5 deletions doc/source/user_guide/reshaping.rst
@@ -402,12 +402,12 @@ We can produce pivot tables from this data very easily:
.. ipython:: python
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])
pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc=np.sum)
pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc="sum")
pd.pivot_table(
df, values=["D", "E"],
index=["B"],
columns=["A", "C"],
aggfunc=np.sum,
aggfunc="sum",
)
The result object is a :class:`DataFrame` having potentially hierarchical indexes on the
@@ -451,7 +451,7 @@ rows and columns:
columns="C",
values=["D", "E"],
margins=True,
aggfunc=np.std
aggfunc="std"
)
table
@@ -552,7 +552,7 @@ each group defined by the first two :class:`Series`:

.. ipython:: python
pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc=np.sum)
pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc="sum")
Adding margins
~~~~~~~~~~~~~~
@@ -562,7 +562,7 @@ Finally, one can also add margins or normalize this output.
.. ipython:: python
pd.crosstab(
df["A"], df["B"], values=df["C"], aggfunc=np.sum, normalize=True, margins=True
df["A"], df["B"], values=df["C"], aggfunc="sum", normalize=True, margins=True
)
.. _reshaping.tile:
6 changes: 3 additions & 3 deletions doc/source/user_guide/timeseries.rst
@@ -1801,22 +1801,22 @@ You can pass a list or dict of functions to do aggregation with, outputting a ``

.. ipython:: python
r["A"].agg([np.sum, np.mean, np.std])
r["A"].agg(["sum", "mean", "std"])
On a resampled ``DataFrame``, you can pass a list of functions to apply to each
column, which produces an aggregated result with a hierarchical index:

.. ipython:: python
r.agg([np.sum, np.mean])
r.agg(["sum", "mean"])
By passing a dict to ``aggregate`` you can apply a different aggregation to the
columns of a ``DataFrame``:

.. ipython:: python
:okexcept:
r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
r.agg({"A": "sum", "B": lambda x: np.std(x, ddof=1)})
The function names can also be strings. In order for a string to be valid it
must be implemented on the resampled object:
2 changes: 1 addition & 1 deletion doc/source/user_guide/window.rst
@@ -140,7 +140,7 @@ of multiple aggregations applied to a window.
.. ipython:: python
df = pd.DataFrame({"A": range(5), "B": range(10, 15)})
df.expanding().agg([np.sum, np.mean, np.std])
df.expanding().agg(["sum", "mean", "std"])
.. _window.generic:
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.14.0.rst
@@ -846,7 +846,7 @@ Enhancements
df.pivot_table(values='Quantity',
index=pd.Grouper(freq='M', key='Date'),
columns=pd.Grouper(freq='M', key='PayDay'),
aggfunc=np.sum)
aggfunc="sum")
- Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`)
- Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs <basics.nsorted>` (:issue:`3960`)
8 changes: 4 additions & 4 deletions doc/source/whatsnew/v0.20.0.rst
@@ -984,7 +984,7 @@ Previous behavior:
75% 3.750000
max 4.000000
In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max])
In [3]: df.groupby('A').agg(["mean", "std", "min", "max"])
Out[3]:
B
mean std amin amax
@@ -1000,7 +1000,7 @@ New behavior:
df.groupby('A').describe()
df.groupby('A').agg([np.mean, np.std, np.min, np.max])
df.groupby('A').agg(["mean", "std", "min", "max"])
.. _whatsnew_0200.api_breaking.rolling_pairwise:

@@ -1163,7 +1163,7 @@ Previous behavior:

.. code-block:: ipython
In [2]: df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum)
In [2]: df.pivot_table('col1', index=['col3', 'col2'], aggfunc="sum")
Out[2]:
col3 col2
1 C 3
@@ -1175,7 +1175,7 @@ New behavior:

.. ipython:: python
df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum)
df.pivot_table('col1', index=['col3', 'col2'], aggfunc="sum")
.. _whatsnew_0200.api:

4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.25.0.rst
@@ -48,7 +48,7 @@ output columns when applying multiple aggregation functions to specific columns
animals.groupby("kind").agg(
min_height=pd.NamedAgg(column='height', aggfunc='min'),
max_height=pd.NamedAgg(column='height', aggfunc='max'),
average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean),
average_weight=pd.NamedAgg(column='weight', aggfunc="mean"),
)
Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs``
@@ -61,7 +61,7 @@ what the arguments to the function are, but plain tuples are accepted as well.
animals.groupby("kind").agg(
min_height=('height', 'min'),
max_height=('height', 'max'),
average_weight=('weight', np.mean),
average_weight=('weight', 'mean'),
)
Named aggregation is the recommended replacement for the deprecated "dict-of-dicts"
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.0.rst
@@ -315,11 +315,11 @@ Deprecations
- Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`)
- Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`)
- Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)
- Deprecated replacing builtin and NumPy functions in ``.agg``, ``.apply``, and ``.transform``; use the corresponding string alias (e.g. ``"sum"`` for ``sum`` or ``np.sum``) instead (:issue:`53425`)
- Deprecated strings ``T``, ``t``, ``L`` and ``l`` denoting units in :func:`to_timedelta` (:issue:`52536`)
- Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`)
- Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`)
- Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_210.performance:
22 changes: 22 additions & 0 deletions pandas/core/apply.py
@@ -170,6 +170,7 @@ def agg(self) -> DataFrame | Series | None:
if callable(func):
f = com.get_cython_func(func)
if f and not args and not kwargs:
warn_alias_replacement(obj, func, f)
return getattr(obj, f)()

# caller can react
@@ -280,6 +281,7 @@ def transform_str_or_callable(self, func) -> DataFrame | Series:
if not args and not kwargs:
f = com.get_cython_func(func)
if f:
warn_alias_replacement(obj, func, f)
return getattr(obj, f)()

# Two possible ways to use a UDF - apply or call directly
@@ -1695,3 +1697,23 @@ def validate_func_kwargs(
no_arg_message = "Must provide 'func' or named aggregation **kwargs."
raise TypeError(no_arg_message)
return columns, func


def warn_alias_replacement(
obj: AggObjType,
func: Callable,
alias: str,
) -> None:
if alias.startswith("np."):
full_alias = alias
else:
full_alias = f"{type(obj).__name__}.{alias}"
alias = f"'{alias}'"
warnings.warn(
f"The provided callable {func} is currently using "
f"{full_alias}. In a future version of pandas, "
f"the provided callable will be used directly. To keep current "
f"behavior pass {alias} instead.",
category=FutureWarning,
stacklevel=find_stack_level(),
)
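To illustrate what the new warn_alias_replacement helper surfaces to users, a minimal sketch (not part of the diff; the warning text is paraphrased from the code above):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [4.0, 5.0, 6.0]})

# np.sum is recognized by com.get_cython_func, so the fast path above fires
# and warn_alias_replacement emits a FutureWarning roughly like:
#   "The provided callable ... is currently using DataFrame.sum. In a future
#    version of pandas, the provided callable will be used directly. To keep
#    current behavior pass 'sum' instead."
df.agg(np.sum)

# The string alias takes the same fast path with no warning.
df.agg("sum")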
7 changes: 7 additions & 0 deletions pandas/core/common.py
@@ -565,6 +565,13 @@ def require_length_match(data, index: Index) -> None:
builtins.min: np.minimum.reduce,
}

# GH#53425: Only for deprecation
_builtin_table_alias = {
builtins.sum: "np.sum",
builtins.max: "np.maximum.reduce",
builtins.min: "np.minimum.reduce",
}

_cython_table = {
builtins.sum: "sum",
builtins.max: "max",
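A hedged sketch of where the new _builtin_table_alias mapping shows up, assuming the groupby paths (not shown in this hunk) consult it when a Python builtin is passed:

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})

# The builtin max is internally replaced via _builtin_table (np.maximum.reduce),
# so the deprecation warning can name "np.maximum.reduce" as the callable
# currently in use rather than the opaque builtin.
df.groupby("g")["x"].agg(max)

# The string alias sidesteps the replacement and the warning entirely.
df.groupby("g")["x"].agg("max")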
14 changes: 7 additions & 7 deletions pandas/core/frame.py
@@ -8851,7 +8851,7 @@ def pivot(
it can contain any of the other types (except list). If an array is
passed, it must be the same length as the data and will be used in
the same manner as column values.
aggfunc : function, list of functions, dict, default numpy.mean
aggfunc : function, list of functions, dict, default "mean"
If a list of functions is passed, the resulting pivot table will have
hierarchical columns whose top level are the function names
(inferred from the function objects themselves).
@@ -8926,7 +8926,7 @@ def pivot(
This first example aggregates values by taking the sum.
>>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
... columns=['C'], aggfunc=np.sum)
... columns=['C'], aggfunc="sum")
>>> table
C large small
A B
@@ -8938,7 +8938,7 @@
We can also fill missing values using the `fill_value` parameter.
>>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
... columns=['C'], aggfunc=np.sum, fill_value=0)
... columns=['C'], aggfunc="sum", fill_value=0)
>>> table
C large small
A B
@@ -8950,7 +8950,7 @@
The next example aggregates by taking the mean across multiple columns.
>>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
... aggfunc={'D': np.mean, 'E': np.mean})
... aggfunc={'D': "mean", 'E': "mean"})
>>> table
D E
A C
@@ -8963,8 +8963,8 @@
value column.
>>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
... aggfunc={'D': np.mean,
... 'E': [min, max, np.mean]})
... aggfunc={'D': "mean",
... 'E': ["min", "max", "mean"]})
>>> table
D E
mean max mean min
@@ -9565,7 +9565,7 @@ def _gotitem(
Aggregate different functions over the columns and rename the index of the resulting
DataFrame.
>>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
>>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))
A B C
x 7.0 NaN NaN
y NaN 2.0 NaN
