ClickHouse · Avogar · Feb 19, 2024 · Feb 1, 2024 · Feb 13, 2024 · Feb 13, 2024
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
@@ -1008,6 +1008,7 @@ class IColumn;
     M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
     M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
     M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
+    M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats", 0) \
     M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
     M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
     M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \

diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h
@@ -84,6 +84,7 @@ namespace SettingsChangesHistory
 /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
+    {"24.2", {{"input_format_try_infer_exponent_floats", true, false, "Don't infer floats in exponential notation by default"}}},
     {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."},
               {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"},
               {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"},

diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp
@@ -226,6 +226,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
     format_settings.try_infer_integers = settings.input_format_try_infer_integers;
     format_settings.try_infer_dates = settings.input_format_try_infer_dates;
     format_settings.try_infer_datetimes = settings.input_format_try_infer_datetimes;
+    format_settings.try_infer_exponent_floats = settings.input_format_try_infer_exponent_floats;
     format_settings.markdown.escape_special_characters = settings.output_format_markdown_escape_special_characters;
     format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string;
     format_settings.bson.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_bson_skip_fields_with_unsupported_types_in_schema_inference;

diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h
@@ -46,6 +46,7 @@ struct FormatSettings
     bool try_infer_integers = false;
     bool try_infer_dates = false;
     bool try_infer_datetimes = false;
+    bool try_infer_exponent_floats = false;
 
     enum class DateTimeInputFormat
     {

diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp
@@ -17,6 +17,7 @@
 #include <IO/ReadHelpers.h>
 #include <IO/parseDateTimeBestEffort.h>
 #include <IO/PeekableReadBuffer.h>
+#include <IO/readFloatText.h>
 
 #include <Core/Block.h>
 #include <Common/assert_cast.h>
@@ -865,6 +866,13 @@ namespace
         return std::make_shared<DataTypeTuple>(nested_types);
     }
 
+    /// Float parsing for schema inference: the exponent part ('e'/'E') is read only when try_infer_exponent_floats is enabled.
+    bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings)
+    {
+        return settings.try_infer_exponent_floats ? tryReadFloatText(value, buf)
+                                                  : tryReadFloatTextNoExponent(value, buf);
+    }
+
     DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings)
     {
         if (buf.eof())
@@ -903,7 +911,7 @@ namespace
                     buf.position() = number_start;
                 }
 
-                if (tryReadFloatText(tmp_float, buf))
+                if (tryReadFloat(tmp_float, buf, settings))
                 {
                     if (read_int && buf.position() == int_end)
                         return std::make_shared<DataTypeInt64>();
@@ -937,7 +945,7 @@ namespace
                 peekable_buf.rollbackToCheckpoint(true);
             }
 
-            if (tryReadFloatText(tmp_float, peekable_buf))
+            if (tryReadFloat(tmp_float, peekable_buf, settings))
             {
                 /// Float parsing reads no fewer bytes than integer parsing,
                 /// so position of the buffer is either the same, or further.
@@ -949,7 +957,7 @@ namespace
                 return std::make_shared<DataTypeFloat64>();
             }
         }
-        else if (tryReadFloatText(tmp_float, buf))
+        else if (tryReadFloat(tmp_float, buf, settings))
         {
             return std::make_shared<DataTypeFloat64>();
         }
@@ -1390,7 +1398,7 @@ DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSetting
     buf.position() = buf.buffer().begin();
 
     Float64 tmp;
-    if (tryReadFloatText(tmp, buf) && buf.eof())
+    if (tryReadFloat(tmp, buf, settings) && buf.eof())
         return std::make_shared<DataTypeFloat64>();
 
     return nullptr;

diff --git a/src/IO/readFloatText.cpp b/src/IO/readFloatText.cpp
@@ -67,4 +67,7 @@ template void readFloatText<Float64>(Float64 &, ReadBuffer &);
 template bool tryReadFloatText<Float32>(Float32 &, ReadBuffer &);
 template bool tryReadFloatText<Float64>(Float64 &, ReadBuffer &);
 
+template bool tryReadFloatTextNoExponent<Float32>(Float32 &, ReadBuffer &);
+template bool tryReadFloatTextNoExponent<Float64>(Float64 &, ReadBuffer &);
+
 }
diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h
@@ -324,7 +324,7 @@ static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf)
 }
 
 
-template <typename T, typename ReturnType>
+template <typename T, typename ReturnType, bool allow_exponent = true>
 ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
 {
     static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double");
@@ -395,30 +395,33 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in)
         after_point_exponent = (read_digits > significant_digits ? -significant_digits : static_cast<int>(-read_digits)) - after_point_num_leading_zeros;
     }
 
-    if (checkChar('e', in) || checkChar('E', in))
+    if constexpr (allow_exponent)
     {
-        if (in.eof())
+        if (checkChar('e', in) || checkChar('E', in))
         {
-            if constexpr (throw_exception)
-                throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent");
-            else
-                return false;
-        }
+            if (in.eof())
+            {
+                if constexpr (throw_exception)
+                    throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent");
+                else
+                    return false;
+            }
 
-        bool exponent_negative = false;
-        if (*in.position() == '-')
-        {
-            exponent_negative = true;
-            ++in.position();
-        }
-        else if (*in.position() == '+')
-        {
-            ++in.position();
-        }
+            bool exponent_negative = false;
+            if (*in.position() == '-')
+            {
+                exponent_negative = true;
+                ++in.position();
+            }
+            else if (*in.position() == '+')
+            {
+                ++in.position();
+            }
 
-        readUIntTextUpToNSignificantDigits<4>(exponent, in);
-        if (exponent_negative)
-            exponent = -exponent;
+            readUIntTextUpToNSignificantDigits<4>(exponent, in);
+            if (exponent_negative)
+                exponent = -exponent;
+        }
     }
 
     if (after_point)
@@ -604,4 +607,7 @@ template <typename T> bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { retu
 template <typename T> void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); }
 template <typename T> bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); }
 
+/// Don't read exponent part of the number: allow_exponent = false compiles out the 'e'/'E' branch, so such a suffix is left unread.
+template <typename T> bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl<T, bool, /* allow_exponent = */ false>(x, in); }
+
 }
diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference
@@ -0,0 +1,2 @@
+c1	Nullable(String)					
+c1	Nullable(Float64)					
diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql
@@ -0,0 +1,3 @@
+DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 0; -- exponent floats are not inferred: expected type is Nullable(String)
+DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 1; -- exponent floats are inferred: expected type is Nullable(Float64)
+