Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clickhouse-format improvements: support VALUES, comments, max_line_length #58246

Merged
merged 3 commits into from Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/en/operations/utilities/clickhouse-format.md
Expand Up @@ -11,6 +11,8 @@ Keys:
- `--query` — Format queries of any length and complexity.
- `--hilite` — Add syntax highlight with ANSI terminal escape sequences.
- `--oneline` — Format in a single line.
- `--max_line_length` — Format queries shorter than the specified length in a single line.
- `--comments` — Keep comments in the output.
- `--quiet` or `-q` — Just check syntax, no output on success.
- `--multiquery` or `-n` — Allow multiple queries in the same file.
- `--obfuscate` — Obfuscate instead of formatting.
Expand Down
155 changes: 114 additions & 41 deletions programs/format/Format.cpp
Expand Up @@ -3,6 +3,7 @@
#include <string_view>
#include <boost/program_options.hpp>

#include <IO/copyData.h>
#include <IO/ReadBufferFromFileDescriptor.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFileDescriptor.h>
Expand All @@ -13,6 +14,7 @@
#include <Parsers/obfuscateQueries.h>
#include <Parsers/parseQuery.h>
#include <Common/ErrorCodes.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/TerminalSize.h>

#include <Interpreters/Context.h>
Expand All @@ -29,22 +31,49 @@
#include <DataTypes/DataTypeFactory.h>
#include <Formats/FormatFactory.h>
#include <Formats/registerFormats.h>
#include <Processors/Transforms/getSourceFromASTInsertQuery.h>


#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wmissing-declarations"

extern const char * auto_time_zones[];

/// Declaration of the error code thrown by this file when an INSERT query carries
/// inline data in a format other than Values; the constant itself is defined elsewhere.
namespace DB::ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}

namespace DB
namespace
{
namespace ErrorCodes

/// Advances `pos` past any run of whitespace and `--` single-line comments.
///
/// On return `pos` is either `end` or points at the first character that is neither
/// whitespace nor the start of a `--` comment.
///
/// @param pos            Current position in the query text; updated in place.
/// @param end            One-past-the-end of the query text.
/// @param print_comments When true, every skipped comment is written to stdout
///                       (without its line break) followed by a newline.
void skipSpacesAndComments(const char *& pos, const char * end, bool print_comments)
{
    while (true)
    {
        /// Skip whitespace so that blanks after the last query do not trigger a parse error.
        /// The cast matters: passing a negative `char` (any non-ASCII byte) to std::isspace
        /// is undefined behaviour.
        while (pos != end && std::isspace(static_cast<unsigned char>(*pos)))
            ++pos;

        /// Anything that is not a `--` comment is left for the caller to parse.
        /// Two remaining characters are enough, so a bare `--` at the very end of the
        /// input is recognised as a comment too.
        if (end - pos < 2 || pos[0] != '-' || pos[1] != '-')
            break;

        const char * comment_begin = pos;
        pos += 2;

        /// The comment extends up to, but not including, the end of the line.
        while (pos != end && *pos != '\n')
            ++pos;

        if (print_comments)
            std::cout << std::string_view(comment_begin, static_cast<size_t>(pos - comment_begin)) << "\n";
    }
}

}

#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wmissing-declarations"

extern const char * auto_time_zones[];

int mainEntryClickHouseFormat(int argc, char ** argv)
{
using namespace DB;
Expand All @@ -55,8 +84,10 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
desc.add_options()
("query", po::value<std::string>(), "query to format")
("help,h", "produce help message")
("comments", "keep comments in the output")
("hilite", "add syntax highlight with ANSI terminal escape sequences")
("oneline", "format in single line")
("max_line_length", po::value<size_t>()->default_value(0), "format in single line queries with length less than specified")
("quiet,q", "just check syntax, no output on success")
("multiquery,n", "allow multiple queries in the same file")
("obfuscate", "obfuscate instead of formatting")
Expand Down Expand Up @@ -88,6 +119,8 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
bool oneline = options.count("oneline");
bool quiet = options.count("quiet");
bool multiple = options.count("multiquery");
bool print_comments = options.count("comments");
size_t max_line_length = options["max_line_length"].as<size_t>();
bool obfuscate = options.count("obfuscate");
bool backslash = options.count("backslash");
bool allow_settings_after_format_in_insert = options.count("allow_settings_after_format_in_insert");
Expand All @@ -104,6 +137,19 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
return 2;
}

if (oneline && max_line_length)
{
std::cerr << "Options 'oneline' and 'max_line_length' are mutually exclusive." << std::endl;
return 2;
}

if (max_line_length > 255)
{
std::cerr << "Option 'max_line_length' must be less than 256." << std::endl;
return 2;
}


String query;

if (options.count("query"))
Expand All @@ -124,7 +170,6 @@ int mainEntryClickHouseFormat(int argc, char ** argv)

if (options.count("seed"))
{
std::string seed;
hash_func.update(options["seed"].as<std::string>());
}

Expand Down Expand Up @@ -179,30 +224,75 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
{
const char * pos = query.data();
const char * end = pos + query.size();
skipSpacesAndComments(pos, end, print_comments);

ParserQuery parser(end, allow_settings_after_format_in_insert);
do
while (pos != end)
{
size_t approx_query_length = multiple ? find_first_symbols<';'>(pos, end) - pos : end - pos;

ASTPtr res = parseQueryAndMovePosition(
parser, pos, end, "query", multiple, cmd_settings.max_query_size, cmd_settings.max_parser_depth);

/// For insert query with data(INSERT INTO ... VALUES ...), that will lead to the formatting failure,
/// we should throw an exception early, and make exception message more readable.
if (const auto * insert_query = res->as<ASTInsertQuery>(); insert_query && insert_query->data)
std::unique_ptr<ReadBuffer> insert_query_payload = nullptr;
/// If the query is INSERT ... VALUES, then we will try to parse the data.
if (auto * insert_query = res->as<ASTInsertQuery>(); insert_query && insert_query->data)
{
throw Exception(DB::ErrorCodes::INVALID_FORMAT_INSERT_QUERY_WITH_DATA,
"Can't format ASTInsertQuery with data, since data will be lost");
if ("Values" != insert_query->format)
throw Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Can't format INSERT query with data format '{}'", insert_query->format);

/// Reset format to default to have `INSERT INTO table VALUES` instead of `INSERT INTO table VALUES FORMAT Values`
insert_query->format = {};

/// We assume that data ends with a newline character (same as client does)
const char * this_query_end = find_first_symbols<'\n'>(insert_query->data, end);
insert_query->end = this_query_end;
pos = this_query_end;
insert_query_payload = getReadBufferFromASTInsertQuery(res);
}

if (!quiet)
{
if (!backslash)
{
WriteBufferFromOStream res_buf(std::cout, 4096);
formatAST(*res, res_buf, hilite, oneline);
res_buf.finalize();
if (multiple)
std::cout << "\n;\n";
WriteBufferFromOwnString str_buf;
formatAST(*res, str_buf, hilite, oneline || approx_query_length < max_line_length);

if (insert_query_payload)
{
str_buf.write(' ');
copyData(*insert_query_payload, str_buf);
}

String res_string = str_buf.str();
const char * s_pos = res_string.data();
const char * s_end = s_pos + res_string.size();
/// remove trailing spaces
while (s_end > s_pos && isWhitespaceASCIIOneLine(*(s_end - 1)))
--s_end;
WriteBufferFromOStream res_cout(std::cout, 4096);
/// For multiline queries we print ';' at new line,
/// but for single line queries we print ';' at the same line
bool has_multiple_lines = false;
while (s_pos != s_end)
{
if (*s_pos == '\n')
has_multiple_lines = true;
res_cout.write(*s_pos++);
}
res_cout.finalize();

if (multiple && !insert_query_payload)
{
if (oneline || !has_multiple_lines)
std::cout << ";\n";
else
std::cout << "\n;\n";
}
else if (multiple && insert_query_payload)
/// Do not need to add ; because it's already in the insert_query_payload
std::cout << "\n";

std::cout << std::endl;
}
/// add additional '\' at the end of each line;
Expand Down Expand Up @@ -230,27 +320,10 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
std::cout << std::endl;
}
}

do
{
/// skip spaces to avoid throw exception after last query
while (pos != end && std::isspace(*pos))
++pos;

/// for skip comment after the last query and to not throw exception
if (end - pos > 2 && *pos == '-' && *(pos + 1) == '-')
{
pos += 2;
/// skip until the end of the line
while (pos != end && *pos != '\n')
++pos;
}
/// need to parse next sql
else
break;
} while (pos != end);

} while (multiple && pos != end);
skipSpacesAndComments(pos, end, print_comments);
if (!multiple)
break;
}
}
}
catch (...)
Expand Down
@@ -1,5 +1,4 @@
SELECT 1
;
SELECT 1;

SELECT 1
UNION ALL
Expand All @@ -10,8 +9,7 @@ UNION ALL
)
;

SELECT 1
;
SELECT 1;

SELECT 1
UNION ALL
Expand All @@ -22,4 +20,6 @@ UNION ALL
)
;

INSERT INTO t VALUES (1);

OK
4 changes: 3 additions & 1 deletion tests/queries/0_stateless/01753_fix_clickhouse_format.sh
Expand Up @@ -8,4 +8,6 @@ echo "select 1; select 1 union all (select 1 union distinct select 1); " | $CL

echo "select 1; select 1 union all (select 1 union distinct select 1); -- comment " | $CLICKHOUSE_FORMAT -n;

echo "insert into t values (1); " | $CLICKHOUSE_FORMAT -n 2>&1 \ | grep -F -q "Code: 578" && echo 'OK' || echo 'FAIL'
echo "insert into t values (1); " | $CLICKHOUSE_FORMAT -n

echo 'insert into t format JSONEachRow {"a":1};' | $CLICKHOUSE_FORMAT -n 2>&1 \ | grep -F -q "NOT_IMPLEMENTED" && echo 'OK' || echo 'FAIL'
@@ -1,7 +1,7 @@
[multi] insert into foo settings max_threads=1
Syntax error (query): failed at position 40 (end of query):
[multi] insert into foo format tsv settings max_threads=1
Can't format ASTInsertQuery with data, since data will be lost.
NOT_IMPLEMENTED
[multi] insert into foo format tsv settings max_threads=1
INSERT INTO foo
SETTINGS max_threads = 1
Expand Down
2 changes: 1 addition & 1 deletion tests/queries/0_stateless/02263_format_insert_settings.sh
Expand Up @@ -25,7 +25,7 @@ function run_format_both()
run_format 'insert into foo settings max_threads=1' |& grep --max-count 2 --only-matching -e "Syntax error (query): failed at position .* (end of query):" -e '^\[.*$'

# compatibility
run_format 'insert into foo format tsv settings max_threads=1' |& grep --max-count 2 --only-matching -e "Can't format ASTInsertQuery with data, since data will be lost." -e '^\[.*$'
run_format 'insert into foo format tsv settings max_threads=1' |& grep --max-count 2 --only-matching -e "NOT_IMPLEMENTED" -e '^\[.*$'
run_format_both 'insert into foo format tsv settings max_threads=1' --allow_settings_after_format_in_insert
run_format 'insert into foo settings max_threads=1 format tsv settings max_threads=1' --allow_settings_after_format_in_insert |& grep --max-count 2 --only-matching -e "You have SETTINGS before and after FORMAT" -e '^\[.*$'

Expand Down