ClickHouse · alexey-milovidov · Apr 29, 2024 · Apr 8, 2024 · Apr 8, 2024 · Apr 9, 2024
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
@@ -50,6 +50,7 @@ class IColumn;
     M(MaxThreads, max_threads, 0, "The maximum number of threads to execute the request. By default, it is determined automatically.", 0) \
     M(Bool, use_concurrency_control, true, "Respect the server's concurrency control (see the `concurrent_threads_soft_limit_num` and `concurrent_threads_soft_limit_ratio_to_cores` global server settings). If disabled, it allows using a larger number of threads even if the server is overloaded (not recommended for normal usage, and needed mostly for tests).", 0) \
     M(MaxThreads, max_download_threads, 4, "The maximum number of threads to download data (e.g. for URL engine).", 0) \
+    M(MaxThreads, max_parsing_threads, 0, "The maximum number of threads to parse data in input formats that support parallel parsing. By default, it is determined automatically", 0) \
     M(UInt64, max_download_buffer_size, 10*1024*1024, "The maximal size of buffer for parallel downloading (e.g. for URL engine) per each thread.", 0) \
     M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \
     M(UInt64, max_read_buffer_size_local_fs, 128*1024, "The maximum size of the buffer to read from local filesystem. If set to 0 then max_read_buffer_size will be used.", 0) \

diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h
@@ -86,6 +86,7 @@ namespace SettingsChangesHistory
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
     {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"},
+              {"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"},
               {"lightweight_deletes_sync", 2, 2, "The same as 'mutation_sync', but controls only execution of lightweight deletes"},
               }},
     {"24.3", {{"s3_connect_timeout_ms", 1000, 1000, "Introduce new dedicated setting for s3 connection timeout"},

diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp
@@ -303,7 +303,7 @@ InputFormatPtr FormatFactory::getInput(
 
     auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
     const Settings & settings = context->getSettingsRef();
-    size_t max_parsing_threads = _max_parsing_threads.value_or(settings.max_threads);
+    size_t max_parsing_threads = _max_parsing_threads.value_or(settings.max_parsing_threads);
     size_t max_download_threads = _max_download_threads.value_or(settings.max_download_threads);
 
     RowInputFormatParams row_input_format_params;

diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp
@@ -1392,7 +1392,7 @@ Chunk StorageFileSource::generate()
 
             chassert(file_num > 0);
 
-            const auto max_parsing_threads = std::max<size_t>(settings.max_threads / file_num, 1UL);
+            const auto max_parsing_threads = std::max<size_t>(settings.max_parsing_threads / file_num, 1UL);
             input_format = FormatFactory::instance().getInput(
                 storage->format_name, *read_buf, block_for_format, getContext(), max_block_size, storage->format_settings,
                 max_parsing_threads, std::nullopt, /*is_remote_fs*/ false, CompressionMethod::None, need_only_count);

diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
@@ -1206,8 +1206,8 @@ void ReadFromStorageS3Step::initializePipeline(QueryPipelineBuilder & pipeline,
         /// Disclosed glob iterator can underestimate the amount of keys in some cases. We will keep one stream for this particular case.
         num_streams = 1;
 
-    const size_t max_threads = context->getSettingsRef().max_threads;
-    const size_t max_parsing_threads = num_streams >= max_threads ? 1 : (max_threads / std::max(num_streams, 1ul));
+    const auto & settings = context->getSettingsRef();
+    const size_t max_parsing_threads = num_streams >= settings.max_parsing_threads ? 1 : (settings.max_parsing_threads / std::max(num_streams, 1ul));
     LOG_DEBUG(getLogger("StorageS3"), "Reading in {} streams, {} threads per stream", num_streams, max_parsing_threads);
 
     Pipes pipes;

diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp
@@ -1172,8 +1172,8 @@ void ReadFromURL::initializePipeline(QueryPipelineBuilder & pipeline, const Buil
     Pipes pipes;
     pipes.reserve(num_streams);
 
-    const size_t max_threads = context->getSettingsRef().max_threads;
-    const size_t max_parsing_threads = num_streams >= max_threads ? 1 : (max_threads / num_streams);
+    const auto & settings = context->getSettingsRef();
+    const size_t max_parsing_threads = num_streams >= settings.max_parsing_threads ? 1 : (settings.max_parsing_threads  / num_streams);
 
     for (size_t i = 0; i < num_streams; ++i)
     {
@@ -1204,7 +1204,7 @@ void ReadFromURL::initializePipeline(QueryPipelineBuilder & pipeline, const Buil
 
     auto pipe = Pipe::unitePipes(std::move(pipes));
     size_t output_ports = pipe.numOutputPorts();
-    const bool parallelize_output = context->getSettingsRef().parallelize_output_from_storages;
+    const bool parallelize_output = settings.parallelize_output_from_storages;
     if (parallelize_output && storage->parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < max_num_streams)
         pipe.resize(max_num_streams);
 

diff --git a/tests/performance/trivial_insert_select_from_files.xml b/tests/performance/trivial_insert_select_from_files.xml
@@ -0,0 +1,29 @@
+<test>
+<!-- INSERT ... SELECT from a File-engine table into a table declared AS test.hits, using each text format listed in the substitution below. -->
+<substitutions>
+    <substitution>
+        <name>format</name> <!-- referenced as {format} in the table names and the File() engine argument below -->
+        <values>
+            <value>TabSeparated</value>
+            <value>TabSeparatedWithNames</value>
+            <value>TabSeparatedWithNamesAndTypes</value>
+            <value>CSV</value>
+            <value>CSVWithNames</value>
+            <value>JSONEachRow</value>
+            <value>JSONCompactEachRow</value>
+            <value>JSONCompactEachRowWithNamesAndTypes</value>
+            <value>TSKV</value>
+        </values>
+    </substitution>
+</substitutions>
+<!-- Setup: a File({format}) source table and a destination table, both declared AS test.hits; the source is filled with 100000 rows of test.hits. -->
+<create_query>CREATE TABLE IF NOT EXISTS table_src_{format} ENGINE = File({format}) AS test.hits</create_query>
+<create_query>CREATE TABLE IF NOT EXISTS table_dst_{format} AS test.hits</create_query>
+<fill_query>INSERT INTO table_src_{format} SELECT * FROM test.hits LIMIT 100000</fill_query>
+<!-- Query under test: copy every row from the File-backed source table into the destination table. -->
+<query>INSERT INTO table_dst_{format} SELECT * FROM table_src_{format}</query>
+<!-- Cleanup: drop both tables created above. -->
+<drop_query>DROP TABLE IF EXISTS table_src_{format}</drop_query>
+<drop_query>DROP TABLE IF EXISTS table_dst_{format}</drop_query>
+
+</test>