FEAT-modin-project#6398: Improved performance of list-like objects in…

…sertion into DataFrames Wrap a list-like object into a single-column query compiler before the insertion. Signed-off-by: Andrey Pavlenko <andrey.a.pavlenko@gmail.com>
AndreyPavlenko · Aug 9, 2023 · ded13c3 · ded13c3
1 parent 58f850d
commit ded13c3
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 4 deletions.
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
@@ -2757,6 +2757,7 @@ def getitem_row_array(self, key):
         )
 
     def setitem(self, axis, key, value):
+        value = self._wrap_column_data(value)
         return self._setitem(axis=axis, key=key, value=value, how=None)
 
     def _setitem(self, axis, key, value, how="inner"):
@@ -2922,6 +2923,7 @@ def _compute_duplicated(df):  # pragma: no cover
     # return a new one from here and let the front end handle the inplace
     # update.
     def insert(self, loc, column, value):
+        value = self._wrap_column_data(value)
         if isinstance(value, type(self)):
             value.columns = [column]
             return self.insert_item(axis=1, loc=loc, value=value, how=None)
@@ -2954,6 +2956,25 @@ def insert(df, internal_indices=[]):  # pragma: no cover
         )
         return self.__constructor__(new_modin_frame)
 
+    def _wrap_column_data(self, data):
+        """
+        If the data is list-like, create a single column query compiler.
+
+        Parameters
+        ----------
+        data : any
+
+        Returns
+        -------
+        data or PandasQueryCompiler
+        """
+        if is_list_like(data):
+            return self.from_pandas(
+                pandas.DataFrame(pandas.Series(data, index=self.index)),
+                data_cls=type(self._modin_frame),
+            )
+        return data
+
     # END Insert
 
     def explode(self, column):

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -2511,7 +2511,7 @@ def setitem_unhashable_key(df, value):
                 value = value.T.reshape(-1)
                 if len(self) > 0:
                     value = value[: len(self)]
-            if not isinstance(value, (Series, Categorical, np.ndarray)):
+            if not isinstance(value, (Series, Categorical, np.ndarray, list, range)):
                 value = list(value)
 
         if not self._query_compiler.lazy_execution and len(self.index) == 0:

diff --git a/modin/pandas/test/conftest.py b/modin/pandas/test/conftest.py
@@ -0,0 +1,33 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.  The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+import pytest
+
+from modin.config import Engine, StorageFormat
+
+
+def pytest_collection_modifyitems(items):
+    if (
+        Engine.get() in ("Ray", "Unidist", "Dask", "Python")
+        and StorageFormat.get() != "Base"
+    ):
+        for item in items:
+            if item.name in (
+                "test_dataframe_dt_index[3s-both-DateCol-0]",
+                "test_dataframe_dt_index[3s-right-DateCol-0]",
+            ):
+                item.add_marker(
+                    pytest.mark.xfail(
+                        reason="https://github.com/modin-project/modin/issues/6399"
+                    )
+                )
diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py
@@ -46,7 +46,7 @@
     test_data_large_categorical_dataframe,
     default_to_pandas_ignore_string,
 )
-from modin.config import NPartitions, StorageFormat
+from modin.config import NPartitions, StorageFormat, Engine
 from modin.test.test_utils import warns_that_defaulting_to_pandas
 
 NPartitions.put(4)
@@ -850,7 +850,20 @@ def test_resampler_functions_with_arg(rule, axis, method_arg):
 @pytest.mark.parametrize("rule", ["5T"])
 @pytest.mark.parametrize("closed", ["left", "right"])
 @pytest.mark.parametrize("label", ["right", "left"])
-@pytest.mark.parametrize("on", [None, "DateColumn"])
+@pytest.mark.parametrize(
+    "on",
+    [
+        None,
+        pytest.param(
+            "DateColumn",
+            marks=pytest.mark.xfail(
+                condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python")
+                and StorageFormat.get() != "Base",
+                reason="https://github.com/modin-project/modin/issues/6399",
+            ),
+        ),
+    ],
+)
 @pytest.mark.parametrize("level", [None, 1])
 def test_resample_specific(rule, closed, label, on, level):
     data, index = (

diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
@@ -2748,7 +2748,11 @@ def test_rolling_timedelta_window(center, closed, as_index, on):
     pd_df = md_df._to_pandas()
 
     if StorageFormat.get() == "Pandas":
-        assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2
+        assert (
+            md_df._query_compiler._modin_frame._partitions.shape[1] == 2
+            if on is None
+            else 3
+        )
 
     md_window = md_df.groupby("by", as_index=as_index).rolling(
         datetime.timedelta(days=3), center=center, closed=closed, on=on