REF: Convert list comprehensions into lazy iterators (pandas-dev#58798)
mroeschke committed May 21, 2024
1 parent 695b170 commit 1e3bf39
Showing 11 changed files with 62 additions and 54 deletions.
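The recurring pattern in this commit: a list comprehension whose result is consumed exactly once is replaced by a generator expression (or a set comprehension where deduplication is the point), so values are produced on demand and the intermediate list is never allocated. For illustration only (not code from the diff):

data = range(1_000_000)

# Before: builds a full million-element list just to sum it.
total = sum([x * x for x in data])

# After: the generator feeds sum() one value at a time.
total = sum(x * x for x in data)
print(total)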
4 changes: 2 additions & 2 deletions pandas/core/arraylike.py
@@ -329,8 +329,8 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any)
     reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
 
     if self.ndim == 1:
-        names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
-        name = names[0] if len(set(names)) == 1 else None
+        names = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
+        name = names.pop() if len(names) == 1 else None
         reconstruct_kwargs = {"name": name}
     else:
         reconstruct_kwargs = {}
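The set comprehension deduplicates up front, so the single surviving element can be taken with pop() instead of indexing a list and separately building a set to test uniqueness. A minimal standalone sketch of the pattern, with a hypothetical Named stand-in for Series-like inputs:

from types import SimpleNamespace as Named  # hypothetical stand-in for Series

def common_name(*inputs):
    # Distinct names among inputs that carry one.
    names = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
    # Exactly one distinct name -> use it; otherwise fall back to None.
    return names.pop() if len(names) == 1 else None

print(common_name(Named(name="a"), Named(name="a")))  # a
print(common_name(Named(name="a"), Named(name="b")))  # None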
7 changes: 4 additions & 3 deletions pandas/core/common.py
@@ -335,11 +335,12 @@ def is_empty_slice(obj) -> bool:
     )
 
 
-def is_true_slices(line) -> list[bool]:
+def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]:
     """
-    Find non-trivial slices in "line": return a list of booleans with same length.
+    Find non-trivial slices in "line": yields a bool.
     """
-    return [isinstance(k, slice) and not is_null_slice(k) for k in line]
+    for k in line:
+        yield isinstance(k, slice) and not is_null_slice(k)
 
 
 # TODO: used only once in indexing; belongs elsewhere?
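As a generator function, the helper yields on demand, so a short-circuiting consumer such as any() stops work at the first hit. A self-contained sketch of the converted function, using a simplified is_null_slice stand-in:

from collections.abc import Generator, Iterable

def is_null_slice(obj) -> bool:
    # Simplified stand-in for pandas' helper: matches slice(None, None, None).
    return isinstance(obj, slice) and (obj.start, obj.stop, obj.step) == (None, None, None)

def is_true_slices(line: Iterable) -> Generator[bool, None, None]:
    for k in line:
        yield isinstance(k, slice) and not is_null_slice(k)

# any() pulls values lazily and stops at the first True.
print(any(is_true_slices([0, slice(None), slice(1, 3), "x"])))  # True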
5 changes: 3 additions & 2 deletions pandas/core/dtypes/dtypes.py
@@ -680,10 +680,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
             return None
 
         # categorical is aware of Sparse -> extract sparse subdtypes
-        dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+        subtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
         # extract the categories' dtype
         non_cat_dtypes = [
-            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
+            x.categories.dtype if isinstance(x, CategoricalDtype) else x
+            for x in subtypes
         ]
         # TODO should categorical always give an answer?
         from pandas.core.dtypes.cast import find_common_type
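Renaming the first stage to subtypes also improves readability: the second comprehension now clearly consumes the unwrapped values rather than rebinding dtypes in place, and the two stages fuse into a single pass. A toy pipeline of the same shape:

values = [" 1 ", "2", " 3 "]
stripped = (v.strip() for v in values)  # stage 1: lazy, nothing computed yet
numbers = [int(v) for v in stripped]    # stage 2: drives stage 1 in one pass
print(numbers)  # [1, 2, 3]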
16 changes: 8 additions & 8 deletions pandas/core/frame.py
@@ -6999,19 +6999,19 @@ def sort_values(
                 f" != length of by ({len(by)})"
             )
         if len(by) > 1:
-            keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
+            keys = (self._get_label_or_level_values(x, axis=axis) for x in by)
 
             # need to rewrap columns in Series to apply key function
             if key is not None:
-                # error: List comprehension has incompatible type List[Series];
-                # expected List[ndarray]
-                keys = [
-                    Series(k, name=name)  # type: ignore[misc]
-                    for (k, name) in zip(keys, by)
-                ]
+                keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)]
+            else:
+                # error: Argument 1 to "list" has incompatible type
+                # "Generator[ExtensionArray | ndarray[Any, Any], None, None]";
+                # expected "Iterable[Series]"
+                keys_data = list(keys)  # type: ignore[arg-type]
 
             indexer = lexsort_indexer(
-                keys, orders=ascending, na_position=na_position, key=key
+                keys_data, orders=ascending, na_position=na_position, key=key
             )
         elif len(by):
             # len(by) == 1
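A generator can only be iterated once, so both branches now materialize it into keys_data before lexsort_indexer consumes it. A small sketch of the single-consumption pitfall the new name guards against:

# A generator is exhausted after one pass; reuse silently yields nothing.
keys = (x * x for x in [1, 2, 3])
labeled = [(k, name) for k, name in zip(keys, "abc")]
print(labeled)     # [(1, 'a'), (4, 'b'), (9, 'c')]
print(list(keys))  # [] -- already exhausted; materialize first if reused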
6 changes: 3 additions & 3 deletions pandas/core/groupby/generic.py
@@ -387,7 +387,7 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
             raise SpecificationError("nested renamer is not supported")
 
         if any(isinstance(x, (tuple, list)) for x in arg):
-            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
+            arg = ((x, x) if not isinstance(x, (tuple, list)) else x for x in arg)
         else:
             # list of functions / function names
             columns = (com.get_callable_name(f) or f for f in arg)
@@ -2077,7 +2077,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
 
         obj = self._obj_with_exclusions
         columns = obj.columns
-        sgbs = [
+        sgbs = (
             SeriesGroupBy(
                 obj.iloc[:, i],
                 selection=colname,
@@ -2086,7 +2086,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
                 observed=self.observed,
             )
             for i, colname in enumerate(obj.columns)
-        ]
+        )
         results = [func(sgb) for sgb in sgbs]
 
         if not len(results):
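The SeriesGroupBy wrappers are now built lazily and consumed immediately by the results comprehension, so no intermediate list of wrapper objects is ever held. A toy version of the pattern:

# Heavy wrapper objects created lazily, one at a time (toy example).
class Wrapper:
    def __init__(self, value):
        self.value = value

columns = ["a", "b", "c"]
sgbs = (Wrapper(c) for c in columns)       # nothing constructed yet
results = [w.value.upper() for w in sgbs]  # each Wrapper built and dropped in turn
print(results)  # ['A', 'B', 'C']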
9 changes: 5 additions & 4 deletions pandas/core/groupby/groupby.py
@@ -11,6 +11,7 @@ class providing the base-class of operations.
 
 from collections.abc import (
     Hashable,
+    Iterable,
     Iterator,
     Mapping,
     Sequence,
@@ -758,7 +759,7 @@ def get_converter(s):
                     )
                     raise ValueError(msg) from err
 
-            converters = [get_converter(s) for s in index_sample]
+            converters = (get_converter(s) for s in index_sample)
             names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)
 
         else:
@@ -2645,7 +2646,7 @@ def _value_counts(
         }
         if isinstance(obj, Series):
             _name = obj.name
-            keys = [] if _name in in_axis_names else [obj]
+            keys: Iterable[Series] = [] if _name in in_axis_names else [obj]
         else:
             unique_cols = set(obj.columns)
             if subset is not None:
@@ -2665,12 +2666,12 @@ def _value_counts(
             else:
                 subsetted = unique_cols
 
-            keys = [
+            keys = (
                 # Can't use .values because the column label needs to be preserved
                 obj.iloc[:, idx]
                 for idx, _name in enumerate(obj.columns)
                 if _name not in in_axis_names and _name in subsetted
-            ]
+            )
 
         groupings = list(self._grouper.groupings)
         for key in keys:
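Annotating keys as Iterable[Series] lets one branch assign a list and the other a generator without a mypy error, since both are Iterables. A minimal sketch of the same typing pattern:

from collections.abc import Iterable

def pick_keys(values: list[str], lazy: bool) -> Iterable[str]:
    # Widened declared type accepts a list or a generator in either branch.
    keys: Iterable[str] = []
    if lazy:
        keys = (v for v in values if v)  # generator branch
    else:
        keys = [v for v in values if v]  # list branch
    return keys

print(list(pick_keys(["a", "", "b"], lazy=True)))   # ['a', 'b']
print(list(pick_keys(["a", "", "b"], lazy=False)))  # ['a', 'b']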
15 changes: 10 additions & 5 deletions pandas/core/indexes/api.py
@@ -212,20 +212,25 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
     if kind == "special":
         result = indexes[0]
 
-        dtis = [x for x in indexes if isinstance(x, DatetimeIndex)]
-        dti_tzs = [x for x in dtis if x.tz is not None]
-        if len(dti_tzs) not in [0, len(dtis)]:
+        num_dtis = 0
+        num_dti_tzs = 0
+        for idx in indexes:
+            if isinstance(idx, DatetimeIndex):
+                num_dtis += 1
+                if idx.tz is not None:
+                    num_dti_tzs += 1
+        if num_dti_tzs not in [0, num_dtis]:
             # TODO: this behavior is not tested (so may not be desired),
             # but is kept in order to keep behavior the same when
             # deprecating union_many
             # test_frame_from_dict_with_mixed_indexes
             raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
 
-        if len(dtis) == len(indexes):
+        if num_dtis == len(indexes):
             sort = True
             result = indexes[0]
 
-        elif len(dtis) > 1:
+        elif num_dtis > 1:
             # If we have mixed timezones, our casting behavior may depend on
             # the order of indexes, which we don't want.
             sort = False
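Here two fully materialized lists, built only to take their lengths, become a single counting pass that allocates nothing. A standalone sketch of the same rewrite:

from datetime import datetime, timezone

stamps = [datetime(2024, 1, 1), datetime(2024, 1, 2, tzinfo=timezone.utc), 3]

# Before: two throwaway lists just to measure them.
dts = [x for x in stamps if isinstance(x, datetime)]
aware = [x for x in dts if x.tzinfo is not None]
assert (len(dts), len(aware)) == (2, 1)

# After: one pass, two counters, no intermediate allocations.
num_dts = num_aware = 0
for x in stamps:
    if isinstance(x, datetime):
        num_dts += 1
        if x.tzinfo is not None:
            num_aware += 1
print(num_dts, num_aware)  # 2 1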
6 changes: 3 additions & 3 deletions pandas/core/indexes/base.py
@@ -3140,7 +3140,7 @@ def _union(self, other: Index, sort: bool | None):
 
                 # worth making this faster? a very unusual case
                 value_set = set(lvals)
-                value_list.extend([x for x in rvals if x not in value_set])
+                value_list.extend(x for x in rvals if x not in value_set)
                 # If objects are unorderable, we must have object dtype.
                 return np.array(value_list, dtype=object)
 
@@ -7620,8 +7620,8 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
     list
         A list representing the unanimous 'names' found.
     """
-    name_tups = [tuple(i.names) for i in indexes]
-    name_sets = [{*ns} for ns in zip_longest(*name_tups)]
+    name_tups = (tuple(i.names) for i in indexes)
+    name_sets = ({*ns} for ns in zip_longest(*name_tups))
     names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
     return names
 
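list.extend accepts any iterable, so dropping the inner brackets avoids building a temporary list before extending. A quick sketch:

value_list = [1, 2, 3]
value_set = set(value_list)
rvals = [2, 3, 4, 5]

# The generator feeds extend() directly; no temporary list is built.
value_list.extend(x for x in rvals if x not in value_set)
print(value_list)  # [1, 2, 3, 4, 5]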
4 changes: 2 additions & 2 deletions pandas/core/indexes/multi.py
@@ -1387,7 +1387,7 @@ def _formatter_func(self, tup):
         """
         Formats each item in tup according to its level's formatter function.
         """
-        formatter_funcs = [level._formatter_func for level in self.levels]
+        formatter_funcs = (level._formatter_func for level in self.levels)
         return tuple(func(val) for func, val in zip(formatter_funcs, tup))
 
     def _get_values_for_csv(
@@ -1537,7 +1537,7 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
         if level is None:
             level = range(self.nlevels)
         else:
-            level = [self._get_level_number(lev) for lev in level]
+            level = (self._get_level_number(lev) for lev in level)
 
         # set the name
         for lev, name in zip(level, names):
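zip pulls from the generator of per-level formatter functions on demand, so the formatted tuple is produced in a single pass. A toy version:

levels = [str.upper, str.lower, str.title]
tup = ("ab", "CD", "ef")

# A generator of formatters, consumed lazily by zip.
formatter_funcs = (f for f in levels)
print(tuple(func(val) for func, val in zip(formatter_funcs, tup)))
# ('AB', 'cd', 'Ef')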
4 changes: 2 additions & 2 deletions pandas/core/reshape/concat.py
@@ -560,7 +560,7 @@ def get_result(self):
 
             # combine as columns in a frame
             else:
-                data = dict(zip(range(len(self.objs)), self.objs))
+                data = dict(enumerate(self.objs))
 
                 # GH28330 Preserves subclassed objects through concat
                 cons = sample._constructor_expanddim
@@ -874,7 +874,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:
 
     if isinstance(new_index, MultiIndex):
         new_levels.extend(new_index.levels)
-        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
+        new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes)
     else:
         new_levels.append(new_index.unique())
         single_codes = new_index.unique().get_indexer(new_index)
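dict(enumerate(objs)) builds the same integer-keyed mapping as zip(range(len(objs)), objs) without the extra range/zip plumbing:

objs = ["s1", "s2", "s3"]
assert dict(zip(range(len(objs)), objs)) == dict(enumerate(objs))
print(dict(enumerate(objs)))  # {0: 's1', 1: 's2', 2: 's3'}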
40 changes: 20 additions & 20 deletions pandas/core/reshape/reshape.py
@@ -137,24 +137,24 @@ def __init__(
         self.removed_level = self.removed_level.take(unique_codes)
         self.removed_level_full = self.removed_level_full.take(unique_codes)
 
-        # Bug fix GH 20601
-        # If the data frame is too big, the number of unique index combination
-        # will cause int32 overflow on windows environments.
-        # We want to check and raise an warning before this happens
-        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
-        num_columns = self.removed_level.size
-
-        # GH20601: This forces an overflow if the number of cells is too high.
-        num_cells = num_rows * num_columns
-
-        # GH 26314: Previous ValueError raised was too restrictive for many users.
-        if get_option("performance_warnings") and num_cells > np.iinfo(np.int32).max:
-            warnings.warn(
-                f"The following operation may generate {num_cells} cells "
-                f"in the resulting pandas object.",
-                PerformanceWarning,
-                stacklevel=find_stack_level(),
-            )
+        if get_option("performance_warnings"):
+            # Bug fix GH 20601
+            # If the data frame is too big, the number of unique index combination
+            # will cause int32 overflow on windows environments.
+            # We want to check and raise an warning before this happens
+            num_rows = max(index_level.size for index_level in self.new_index_levels)
+            num_columns = self.removed_level.size
+
+            # GH20601: This forces an overflow if the number of cells is too high.
+            # GH 26314: Previous ValueError raised was too restrictive for many users.
+            num_cells = num_rows * num_columns
+            if num_cells > np.iinfo(np.int32).max:
+                warnings.warn(
+                    f"The following operation may generate {num_cells} cells "
+                    f"in the resulting pandas object.",
+                    PerformanceWarning,
+                    stacklevel=find_stack_level(),
+                )
 
         self._make_selectors()
 
@@ -731,10 +731,10 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index:
     if len(columns.levels) <= 2:
         return columns.levels[0]._rename(name=columns.names[0])
 
-    levs = [
+    levs = (
         [lev[c] if c >= 0 else None for c in codes]
         for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
-    ]
+    )
 
     # Remove duplicate tuples in the MultiIndex.
     tuples = zip(*levs)
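Moving the size computation under the option check means users who disable performance_warnings pay nothing for it, and the builtin max() consumes the generator directly instead of np.max materializing a list first. A rough sketch of the guard pattern, with a hypothetical toy option store in place of get_option:

import numpy as np

options = {"performance_warnings": True}  # hypothetical stand-in for get_option
level_sizes = [10_000, 50_000, 70_000]

if options["performance_warnings"]:
    # max() with a generator avoids building the list np.max would need.
    num_rows = max(size for size in level_sizes)
    num_cells = num_rows * 40_000
    if num_cells > np.iinfo(np.int32).max:
        print(f"may generate {num_cells} cells")  # stands in for warnings.warn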
