Skip to content

Commit

Permalink
FEAT-modin-project#6398: Improved performance of list-like objects in…
Browse files Browse the repository at this point in the history
…sertion into DataFrames

Wrap a list-like object into a single-column query compiler before the insertion.

Signed-off-by: Andrey Pavlenko <andrey.a.pavlenko@gmail.com>
  • Loading branch information
AndreyPavlenko committed Aug 9, 2023
1 parent 58f850d commit ded13c3
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 4 deletions.
21 changes: 21 additions & 0 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2757,6 +2757,7 @@ def getitem_row_array(self, key):
)

def setitem(self, axis, key, value):
value = self._wrap_column_data(value)
return self._setitem(axis=axis, key=key, value=value, how=None)

def _setitem(self, axis, key, value, how="inner"):
Expand Down Expand Up @@ -2922,6 +2923,7 @@ def _compute_duplicated(df): # pragma: no cover
# return a new one from here and let the front end handle the inplace
# update.
def insert(self, loc, column, value):
value = self._wrap_column_data(value)
if isinstance(value, type(self)):
value.columns = [column]
return self.insert_item(axis=1, loc=loc, value=value, how=None)
Expand Down Expand Up @@ -2954,6 +2956,25 @@ def insert(df, internal_indices=[]): # pragma: no cover
)
return self.__constructor__(new_modin_frame)

def _wrap_column_data(self, data):
"""
If the data is list-like, create a single column query compiler.
Parameters
----------
data : any
Returns
-------
data or PandasQueryCompiler
"""
if is_list_like(data):
return self.from_pandas(
pandas.DataFrame(pandas.Series(data, index=self.index)),
data_cls=type(self._modin_frame),
)
return data

# END Insert

def explode(self, column):
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2511,7 +2511,7 @@ def setitem_unhashable_key(df, value):
value = value.T.reshape(-1)
if len(self) > 0:
value = value[: len(self)]
if not isinstance(value, (Series, Categorical, np.ndarray)):
if not isinstance(value, (Series, Categorical, np.ndarray, list, range)):
value = list(value)

if not self._query_compiler.lazy_execution and len(self.index) == 0:
Expand Down
33 changes: 33 additions & 0 deletions modin/pandas/test/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest

from modin.config import Engine, StorageFormat


def pytest_collection_modifyitems(items):
if (
Engine.get() in ("Ray", "Unidist", "Dask", "Python")
and StorageFormat.get() != "Base"
):
for item in items:
if item.name in (
"test_dataframe_dt_index[3s-both-DateCol-0]",
"test_dataframe_dt_index[3s-right-DateCol-0]",
):
item.add_marker(
pytest.mark.xfail(
reason="https://github.com/modin-project/modin/issues/6399"
)
)
17 changes: 15 additions & 2 deletions modin/pandas/test/dataframe/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
test_data_large_categorical_dataframe,
default_to_pandas_ignore_string,
)
from modin.config import NPartitions, StorageFormat
from modin.config import NPartitions, StorageFormat, Engine
from modin.test.test_utils import warns_that_defaulting_to_pandas

NPartitions.put(4)
Expand Down Expand Up @@ -850,7 +850,20 @@ def test_resampler_functions_with_arg(rule, axis, method_arg):
@pytest.mark.parametrize("rule", ["5T"])
@pytest.mark.parametrize("closed", ["left", "right"])
@pytest.mark.parametrize("label", ["right", "left"])
@pytest.mark.parametrize("on", [None, "DateColumn"])
@pytest.mark.parametrize(
"on",
[
None,
pytest.param(
"DateColumn",
marks=pytest.mark.xfail(
condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python")
and StorageFormat.get() != "Base",
reason="https://github.com/modin-project/modin/issues/6399",
),
),
],
)
@pytest.mark.parametrize("level", [None, 1])
def test_resample_specific(rule, closed, label, on, level):
data, index = (
Expand Down
6 changes: 5 additions & 1 deletion modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2748,7 +2748,11 @@ def test_rolling_timedelta_window(center, closed, as_index, on):
pd_df = md_df._to_pandas()

if StorageFormat.get() == "Pandas":
assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2
assert (
md_df._query_compiler._modin_frame._partitions.shape[1] == 2
if on is None
else 3
)

md_window = md_df.groupby("by", as_index=as_index).rolling(
datetime.timedelta(days=3), center=center, closed=closed, on=on
Expand Down

0 comments on commit ded13c3

Please sign in to comment.