REF: Convert list comprehensions into lazy iterators (pandas-dev#58798)
mroeschke committed May 21, 2024
1 parent 695b170 commit 1e3bf39
Showing 11 changed files with 62 additions and 54 deletions.
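The recurring pattern in this commit: a list comprehension whose result is consumed exactly once is replaced by a generator expression (or a set comprehension where deduplication is the point), so values are produced on demand and the intermediate list is never allocated. For illustration only (not code from the diff):

data = range(1_000_000)

# Before: builds a full million-element list just to sum it.
total = sum([x * x for x in data])

# After: the generator feeds sum() one value at a time.
total = sum(x * x for x in data)
print(total)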
4 changes: 2 additions & 2 deletions pandas/core/arraylike.py
@@ -329,8 +329,8 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any)
     reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
 
     if self.ndim == 1:
-        names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
-        name = names[0] if len(set(names)) == 1 else None
+        names = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
+        name = names.pop() if len(names) == 1 else None
         reconstruct_kwargs = {"name": name}
     else:
         reconstruct_kwargs = {}
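The set comprehension deduplicates up front, so the single surviving element can be taken with pop() instead of indexing a list and separately building a set to test uniqueness. A minimal standalone sketch of the pattern, with a hypothetical Named stand-in for Series-like inputs:

from types import SimpleNamespace as Named  # hypothetical stand-in for Series

def common_name(*inputs):
    # Distinct names among inputs that carry one.
    names = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
    # Exactly one distinct name -> use it; otherwise fall back to None.
    return names.pop() if len(names) == 1 else None

print(common_name(Named(name="a"), Named(name="a")))  # a
print(common_name(Named(name="a"), Named(name="b")))  # None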
7 changes: 4 additions & 3 deletions pandas/core/common.py
@@ -335,11 +335,12 @@ def is_empty_slice(obj) -> bool:
     )
 
 
-def is_true_slices(line) -> list[bool]:
+def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]:
     """
-    Find non-trivial slices in "line": return a list of booleans with same length.
+    Find non-trivial slices in "line": yields a bool.
     """
-    return [isinstance(k, slice) and not is_null_slice(k) for k in line]
+    for k in line:
+        yield isinstance(k, slice) and not is_null_slice(k)
 
 
 # TODO: used only once in indexing; belongs elsewhere?
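As a generator function, the helper yields on demand, so a short-circuiting consumer such as any() stops work at the first hit. A self-contained sketch of the converted function, using a simplified is_null_slice stand-in:

from collections.abc import Generator, Iterable

def is_null_slice(obj) -> bool:
    # Simplified stand-in for pandas' helper: matches slice(None, None, None).
    return isinstance(obj, slice) and (obj.start, obj.stop, obj.step) == (None, None, None)

def is_true_slices(line: Iterable) -> Generator[bool, None, None]:
    for k in line:
        yield isinstance(k, slice) and not is_null_slice(k)

# any() pulls values lazily and stops at the first True.
print(any(is_true_slices([0, slice(None), slice(1, 3), "x"])))  # True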
5 changes: 3 additions & 2 deletions pandas/core/dtypes/dtypes.py
@@ -680,10 +680,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
             return None
 
         # categorical is aware of Sparse -> extract sparse subdtypes
-        dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+        subtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
         # extract the categories' dtype
         non_cat_dtypes = [
-            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
+            x.categories.dtype if isinstance(x, CategoricalDtype) else x
+            for x in subtypes
         ]
         # TODO should categorical always give an answer?
         from pandas.core.dtypes.cast import find_common_type
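Renaming the first stage to subtypes also improves readability: the second comprehension now clearly consumes the unwrapped values rather than rebinding dtypes in place, and the two stages fuse into a single pass. A toy pipeline of the same shape:

values = [" 1 ", "2", " 3 "]
stripped = (v.strip() for v in values)  # stage 1: lazy, nothing computed yet
numbers = [int(v) for v in stripped]    # stage 2: drives stage 1 in one pass
print(numbers)  # [1, 2, 3]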
16 changes: 8 additions & 8 deletions pandas/core/frame.py
@@ -6999,19 +6999,19 @@ def sort_values(
                 f" != length of by ({len(by)})"
             )
         if len(by) > 1:
-            keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
+            keys = (self._get_label_or_level_values(x, axis=axis) for x in by)
 
             # need to rewrap columns in Series to apply key function
             if key is not None:
-                # error: List comprehension has incompatible type List[Series];
-                # expected List[ndarray]
-                keys = [
-                    Series(k, name=name)  # type: ignore[misc]
-                    for (k, name) in zip(keys, by)
-                ]
+                keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)]
+            else:
+                # error: Argument 1 to "list" has incompatible type
+                # "Generator[ExtensionArray | ndarray[Any, Any], None, None]";
+                # expected "Iterable[Series]"
+                keys_data = list(keys)  # type: ignore[arg-type]
 
             indexer = lexsort_indexer(
-                keys, orders=ascending, na_position=na_position, key=key
+                keys_data, orders=ascending, na_position=na_position, key=key
             )
         elif len(by):
             # len(by) == 1
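A generator can only be iterated once, so both branches now materialize it into keys_data before lexsort_indexer consumes it. A small sketch of the single-consumption pitfall the new name guards against:

# A generator is exhausted after one pass; reuse silently yields nothing.
keys = (x * x for x in [1, 2, 3])
labeled = [(k, name) for k, name in zip(keys, "abc")]
print(labeled)     # [(1, 'a'), (4, 'b'), (9, 'c')]
print(list(keys))  # [] -- already exhausted; materialize first if reused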
6 changes: 3 additions & 3 deletions pandas/core/groupby/generic.py
@@ -387,7 +387,7 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
             raise SpecificationError("nested renamer is not supported")
 
         if any(isinstance(x, (tuple, list)) for x in arg):
-            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
+            arg = ((x, x) if not isinstance(x, (tuple, list)) else x for x in arg)
         else:
             # list of functions / function names
             columns = (com.get_callable_name(f) or f for f in arg)
@@ -2077,7 +2077,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
 
         obj = self._obj_with_exclusions
         columns = obj.columns
-        sgbs = [
+        sgbs = (
             SeriesGroupBy(
                 obj.iloc[:, i],
                 selection=colname,
@@ -2086,7 +2086,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
                 observed=self.observed,
             )
             for i, colname in enumerate(obj.columns)
-        ]
+        )
         results = [func(sgb) for sgb in sgbs]
 
         if not len(results):
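The SeriesGroupBy wrappers are now built lazily and consumed immediately by the results comprehension, so no intermediate list of wrapper objects is ever held. A toy version of the pattern:

# Heavy wrapper objects created lazily, one at a time (toy example).
class Wrapper:
    def __init__(self, value):
        self.value = value

columns = ["a", "b", "c"]
sgbs = (Wrapper(c) for c in columns)       # nothing constructed yet
results = [w.value.upper() for w in sgbs]  # each Wrapper built and dropped in turn
print(results)  # ['A', 'B', 'C']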
9 changes: 5 additions & 4 deletions pandas/core/groupby/groupby.py
@@ -11,6 +11,7 @@ class providing the base-class of operations.
 
 from collections.abc import (
     Hashable,
+    Iterable,
     Iterator,
     Mapping,
     Sequence,
@@ -758,7 +759,7 @@ def get_converter(s):
                     )
                     raise ValueError(msg) from err
 
-            converters = [get_converter(s) for s in index_sample]
+            converters = (get_converter(s) for s in index_sample)
             names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)
 
         else:
@@ -2645,7 +2646,7 @@ def _value_counts(
         }
         if isinstance(obj, Series):
             _name = obj.name
-            keys = [] if _name in in_axis_names else [obj]
+            keys: Iterable[Series] = [] if _name in in_axis_names else [obj]
         else:
             unique_cols = set(obj.columns)
             if subset is not None:
@@ -2665,12 +2666,12 @@ def _value_counts(
             else:
                 subsetted = unique_cols
 
-            keys = [
+            keys = (
                 # Can't use .values because the column label needs to be preserved
                 obj.iloc[:, idx]
                 for idx, _name in enumerate(obj.columns)
                 if _name not in in_axis_names and _name in subsetted
-            ]
+            )
 
         groupings = list(self._grouper.groupings)
         for key in keys:
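Annotating keys as Iterable[Series] lets one branch assign a list and the other a generator without a mypy error, since both are Iterables. A minimal sketch of the same typing pattern:

from collections.abc import Iterable

def pick_keys(values: list[str], lazy: bool) -> Iterable[str]:
    # Widened declared type accepts a list or a generator in either branch.
    keys: Iterable[str] = []
    if lazy:
        keys = (v for v in values if v)  # generator branch
    else:
        keys = [v for v in values if v]  # list branch
    return keys

print(list(pick_keys(["a", "", "b"], lazy=True)))   # ['a', 'b']
print(list(pick_keys(["a", "", "b"], lazy=False)))  # ['a', 'b']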
15 changes: 10 additions & 5 deletions pandas/core/indexes/api.py
@@ -212,20 +212,25 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
     if kind == "special":
         result = indexes[0]
 
-        dtis = [x for x in indexes if isinstance(x, DatetimeIndex)]
-        dti_tzs = [x for x in dtis if x.tz is not None]
-        if len(dti_tzs) not in [0, len(dtis)]:
+        num_dtis = 0
+        num_dti_tzs = 0
+        for idx in indexes:
+            if isinstance(idx, DatetimeIndex):
+                num_dtis += 1
+                if idx.tz is not None:
+                    num_dti_tzs += 1
+        if num_dti_tzs not in [0, num_dtis]:
             # TODO: this behavior is not tested (so may not be desired),
             # but is kept in order to keep behavior the same when
             # deprecating union_many
             # test_frame_from_dict_with_mixed_indexes
             raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
 
-        if len(dtis) == len(indexes):
+        if num_dtis == len(indexes):
             sort = True
             result = indexes[0]
 
-        elif len(dtis) > 1:
+        elif num_dtis > 1:
             # If we have mixed timezones, our casting behavior may depend on
             # the order of indexes, which we don't want.
             sort = False
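Here two fully materialized lists, built only to take their lengths, become a single counting pass that allocates nothing. A standalone sketch of the same rewrite:

from datetime import datetime, timezone

stamps = [datetime(2024, 1, 1), datetime(2024, 1, 2, tzinfo=timezone.utc), 3]

# Before: two throwaway lists just to measure them.
dts = [x for x in stamps if isinstance(x, datetime)]
aware = [x for x in dts if x.tzinfo is not None]
assert (len(dts), len(aware)) == (2, 1)

# After: one pass, two counters, no intermediate allocations.
num_dts = num_aware = 0
for x in stamps:
    if isinstance(x, datetime):
        num_dts += 1
        if x.tzinfo is not None:
            num_aware += 1
print(num_dts, num_aware)  # 2 1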
6 changes: 3 additions & 3 deletions pandas/core/indexes/base.py
@@ -3140,7 +3140,7 @@ def _union(self, other: Index, sort: bool | None):
 
                 # worth making this faster? a very unusual case
                 value_set = set(lvals)
-                value_list.extend([x for x in rvals if x not in value_set])
+                value_list.extend(x for x in rvals if x not in value_set)
                 # If objects are unorderable, we must have object dtype.
                 return np.array(value_list, dtype=object)
 
@@ -7620,8 +7620,8 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
     list
         A list representing the unanimous 'names' found.
     """
-    name_tups = [tuple(i.names) for i in indexes]
-    name_sets = [{*ns} for ns in zip_longest(*name_tups)]
+    name_tups = (tuple(i.names) for i in indexes)
+    name_sets = ({*ns} for ns in zip_longest(*name_tups))
     names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
     return names
 
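list.extend accepts any iterable, so dropping the inner brackets avoids building a temporary list before extending. A quick sketch:

value_list = [1, 2, 3]
value_set = set(value_list)
rvals = [2, 3, 4, 5]

# The generator feeds extend() directly; no temporary list is built.
value_list.extend(x for x in rvals if x not in value_set)
print(value_list)  # [1, 2, 3, 4, 5]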
4 changes: 2 additions & 2 deletions pandas/core/indexes/multi.py
@@ -1387,7 +1387,7 @@ def _formatter_func(self, tup):
         """
         Formats each item in tup according to its level's formatter function.
         """
-        formatter_funcs = [level._formatter_func for level in self.levels]
+        formatter_funcs = (level._formatter_func for level in self.levels)
         return tuple(func(val) for func, val in zip(formatter_funcs, tup))
 
     def _get_values_for_csv(
@@ -1537,7 +1537,7 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
         if level is None:
             level = range(self.nlevels)
         else:
-            level = [self._get_level_number(lev) for lev in level]
+            level = (self._get_level_number(lev) for lev in level)
 
         # set the name
         for lev, name in zip(level, names):
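zip pulls from the generator of per-level formatter functions on demand, so the formatted tuple is produced in a single pass. A toy version:

levels = [str.upper, str.lower, str.title]
tup = ("ab", "CD", "ef")

# A generator of formatters, consumed lazily by zip.
formatter_funcs = (f for f in levels)
print(tuple(func(val) for func, val in zip(formatter_funcs, tup)))
# ('AB', 'cd', 'Ef')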
4 changes: 2 additions & 2 deletions pandas/core/reshape/concat.py
@@ -560,7 +560,7 @@ def get_result(self):
 
             # combine as columns in a frame
             else:
-                data = dict(zip(range(len(self.objs)), self.objs))
+                data = dict(enumerate(self.objs))
 
                 # GH28330 Preserves subclassed objects through concat
                 cons = sample._constructor_expanddim
@@ -874,7 +874,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:
 
     if isinstance(new_index, MultiIndex):
         new_levels.extend(new_index.levels)
-        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
+        new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes)
     else:
         new_levels.append(new_index.unique())
         single_codes = new_index.unique().get_indexer(new_index)
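dict(enumerate(objs)) builds the same integer-keyed mapping as zip(range(len(objs)), objs) without the extra range/zip plumbing:

objs = ["s1", "s2", "s3"]
assert dict(zip(range(len(objs)), objs)) == dict(enumerate(objs))
print(dict(enumerate(objs)))  # {0: 's1', 1: 's2', 2: 's3'}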
40 changes: 20 additions & 20 deletions pandas/core/reshape/reshape.py
@@ -137,24 +137,24 @@ def __init__(
         self.removed_level = self.removed_level.take(unique_codes)
         self.removed_level_full = self.removed_level_full.take(unique_codes)
 
-        # Bug fix GH 20601
-        # If the data frame is too big, the number of unique index combination
-        # will cause int32 overflow on windows environments.
-        # We want to check and raise an warning before this happens
-        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
-        num_columns = self.removed_level.size
-
-        # GH20601: This forces an overflow if the number of cells is too high.
-        num_cells = num_rows * num_columns
-
-        # GH 26314: Previous ValueError raised was too restrictive for many users.
-        if get_option("performance_warnings") and num_cells > np.iinfo(np.int32).max:
-            warnings.warn(
-                f"The following operation may generate {num_cells} cells "
-                f"in the resulting pandas object.",
-                PerformanceWarning,
-                stacklevel=find_stack_level(),
-            )
+        if get_option("performance_warnings"):
+            # Bug fix GH 20601
+            # If the data frame is too big, the number of unique index combination
+            # will cause int32 overflow on windows environments.
+            # We want to check and raise an warning before this happens
+            num_rows = max(index_level.size for index_level in self.new_index_levels)
+            num_columns = self.removed_level.size
+
+            # GH20601: This forces an overflow if the number of cells is too high.
+            # GH 26314: Previous ValueError raised was too restrictive for many users.
+            num_cells = num_rows * num_columns
+            if num_cells > np.iinfo(np.int32).max:
+                warnings.warn(
+                    f"The following operation may generate {num_cells} cells "
+                    f"in the resulting pandas object.",
+                    PerformanceWarning,
+                    stacklevel=find_stack_level(),
+                )
 
         self._make_selectors()
 
@@ -731,10 +731,10 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index:
     if len(columns.levels) <= 2:
         return columns.levels[0]._rename(name=columns.names[0])
 
-    levs = [
+    levs = (
         [lev[c] if c >= 0 else None for c in codes]
         for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
-    ]
+    )
 
     # Remove duplicate tuples in the MultiIndex.
     tuples = zip(*levs)
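Moving the size computation under the option check means users who disable performance_warnings pay nothing for it, and the builtin max() consumes the generator directly instead of np.max materializing a list first. A rough sketch of the guard pattern, with a hypothetical toy option store in place of get_option:

import numpy as np

options = {"performance_warnings": True}  # hypothetical stand-in for get_option
level_sizes = [10_000, 50_000, 70_000]

if options["performance_warnings"]:
    # max() with a generator avoids building the list np.max would need.
    num_rows = max(size for size in level_sizes)
    num_cells = num_rows * 40_000
    if num_cells > np.iinfo(np.int32).max:
        print(f"may generate {num_cells} cells")  # stands in for warnings.warn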
