From 8099fbf017f21db1445bdc354a00ef3bb55c0bd7 Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Tue, 9 Apr 2019 15:59:12 +0200
Subject: [PATCH 01/12] Handle json files (json array with one json per line,
 or a json file containing one json)

---
 json_to_csv/json_to_csv.py | 64 ++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index e45cdd1..ac1ed0c 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -190,7 +190,38 @@ def update_columns_list(columns_list, json_list, sep, int_to_float, remove_null)
     return columns_list
 
 
-def get_columns(list_data_paths, sep, logger, int_to_float, remove_null):
+def read_jsons_chunks(file_object, chunk_size=10000):
+    """Lazy function to read a json by chunk.
+    Default chunk size: 10k"""
+    # Check first element of a file
+    # If it is "[", that means we have a json array
+    first_line = file_object.readline()
+    if first_line[0] == '[':
+        while True:
+            # Parse the next real chunk_size lines
+            data = []
+            for i in range(chunk_size):
+                # Here it works with one json, or an array of jsons with one json in each line
+                # TODO Make it work with no assumption over json
+                # Strip the comma and move to the next line
+                line = file_object.readline().strip(',\n')
+                # If EOF or the end of the json array is reached, send what's left of the data
+                if line == "" or line == "]":
+                    yield data
+                    return
+                else:
+                    data.append(json.loads(line))
+            if not data:
+                break
+            yield data
+    # End of file obtained
+    elif file_object.read() == ']':
+        return None
+    # Otherwise, we have one json in the file
+    else:
+        yield [json.loads(first_line)]
+
+def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json=False):
     """
     Get the columns created according to a list of files containing json
 
@@ -199,6 +230,7 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null):
     :param logger: logger (used to print)
     :param int_to_float: if set to true int will be cast to float
     :param remove_null: if set to true, nulls will be removed from json arrays
+    :param is_json: if set to true, inputs are considered valid json
     :return: Exhaustive list of columns
 
     """
@@ -206,21 +238,41 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null):
     columns_list = []
     j = 0
+    chunk_size = 500000
     for data_file in list_data_paths:
         logger.info(data_file)
         json_list = []
-        with open(data_file) as f:
-            for i, line in enumerate(f):
+        # If we deal with a json (or json array) file
+        if is_json:
+            f = open(data_file)
+            # Read json file by chunk
+            for x in read_jsons_chunks(f, chunk_size=chunk_size):
                 j += 1
-                if (j % 500000 == 0):
+                if (j % chunk_size == 0):
                     columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null)
                     logger.info('Iteration ' + str(j) + ': Updating columns ===> ' + str(len(columns_list)) + ' columns found')
                     json_list = []
                 try:
-                    json_list.append(json.loads(line))
+                    json_list.extend(x)
+                    # Maximum of chunk_size elements were added
+                    j+=chunk_size - 1 # -1 because we add 1 at the beginning of the loop
                 except:
                     logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. Example was skipped")
Example was skipped") continue + # If we deal with ljson + else: + with open(data_file) as f: + for i, line in enumerate(f): + j += 1 + if (j % 500000 == 0): + columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) + logger.info('Iteration ' + str(j) + ': Updating columns ===> ' + str(len(columns_list)) + ' columns found') + json_list = [] + try: + json_list.append(json.loads(line)) + except: + logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. Example was skipped") + continue # A quicker solution would be to join directly to create a valid json if (len(json_list) > 0): columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) @@ -313,7 +365,7 @@ def main(): logger.info("Reading " + opt.path_data_jsonperline) data = [opt.path_data_jsonperline] - # Get list of columns if not in streaming + # Get list of columns if in streaming columns_list = None if opt.streaming: columns_list = get_columns(data, opt.sep, logger, opt.int_to_float, opt.remove_null) From 10918a5665371c1c926fb10a6dd7510677aeb0fe Mon Sep 17 00:00:00 2001 From: YaYaB Date: Tue, 9 Apr 2019 18:05:34 +0200 Subject: [PATCH 02/12] Adapt streaming flow for json files --- json_to_csv/json_to_csv.py | 63 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index ac1ed0c..68c5efa 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -25,6 +25,7 @@ def get_args(): parser.add_argument("--int_to_float", action='store_true', default=False, help="Cast int to float") parser.add_argument("--path_output", type=str, help="Path output") parser.add_argument("--remove_null", action='store_true', default=False, help="Remove null values (kept by default)") + parser.add_argument("--is_json", action='store_true', default=False, help="Indicate if input file is a json") args = parser.parse_args() return args @@ -221,7 +222,7 @@ def read_jsons_chunks(file_object, chunk_size=10000): else: yield [json.loads(first_line)] -def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json=False): +def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json): """ Get the columns created accordingly to a list of files containing json @@ -238,7 +239,7 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json columns_list = [] j = 0 - chunk_size = 500000 + chunk_size = 50000 for data_file in list_data_paths: logger.info(data_file) json_list = [] @@ -247,24 +248,23 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json f = open(data_file) # Read json file by chunk for x in read_jsons_chunks(f, chunk_size=chunk_size): - j += 1 - if (j % chunk_size == 0): + if j!=0 and (j % chunk_size == 0): columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) logger.info('Iteration ' + str(j) + ': Updating columns ===> ' + str(len(columns_list)) + ' columns found') json_list = [] try: json_list.extend(x) # Maximum of chunk_size elements were added - j+=chunk_size - 1 # -1 because we add 1 at the beginning of the loop + j += chunk_size except: - logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. Example was skipped") + logger.info("Json in line " + str(j) + " (in file: " + data_file + ") does not seem well formed. 
Example was skipped") continue # If we deal with ljson else: with open(data_file) as f: for i, line in enumerate(f): j += 1 - if (j % 500000 == 0): + if (j % 50000 == 0): columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) logger.info('Iteration ' + str(j) + ': Updating columns ===> ' + str(len(columns_list)) + ' columns found') json_list = [] @@ -283,7 +283,7 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json return columns_list -def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep='.', int_to_float=False, remove_null=False): +def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep='.', int_to_float=False, remove_null=False, is_json=False): """ Get dataframe from files containing one json per line @@ -294,30 +294,51 @@ def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep :param sep: separator to use when creating columns' names :param int_to_float: if set to true int will be casted to float :param remove_null: if set to true, will remove_null from json arrays + :param is_json: if set to true, inputs are considered as valid json :return: dataframe or nothing if the dataframe is generated while streaming the files """ json_list = [] j = 0 + chunk_size = 50000 for data_file in list_data_paths: logger.info(data_file) - with open(data_file) as f: - for i, line in enumerate(f): - j += 1 - if (j % 500000 == 0): + json_list = [] + # If we deal with json (or json array) file + if is_json: + f = open(data_file) + # Read json file by chunk + for x in read_jsons_chunks(f, chunk_size=chunk_size): + if j!=0 and (j % chunk_size == 0): + update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null) logger.info('Iteration ' + str(j) + ': Creating sub dataframe') - if columns: - update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null) - json_list.clear() - - if (j % 100000 == 0): - logger.info(str(i) + ' documents processed') + json_list = [] try: - json_list.append(json.loads(line)) + json_list.extend(x) + # Maximum of chunk_size elements were added + j += chunk_size # -1 because we add 1 at the beginning of the loop except: logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. Example was skipped") continue + # If we deal with ljson + else: + with open(data_file) as f: + for i, line in enumerate(f): + j += 1 + if (j % 50000 == 0): + logger.info('Iteration ' + str(j) + ': Creating sub dataframe') + if columns: + update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null) + json_list.clear() + + if (j % 100000 == 0): + logger.info(str(i) + ' documents processed') + try: + json_list.append(json.loads(line)) + except: + logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. 
Example was skipped") + continue # A quicker solution would be to join directly to create a valid json logger.info('Convert to DataFrame') @@ -368,7 +389,7 @@ def main(): # Get list of columns if in streaming columns_list = None if opt.streaming: - columns_list = get_columns(data, opt.sep, logger, opt.int_to_float, opt.remove_null) + columns_list = get_columns(data, opt.sep, logger, opt.int_to_float, opt.remove_null, opt.is_json) # Sort columns in alphabetical order columns_list.sort() df = pd.DataFrame(columns=columns_list) @@ -378,7 +399,7 @@ def main(): df.to_csv(opt.path_output, encoding="utf-8", index=None, quoting=1) # Get dataframe - df = get_dataframe(data, columns=columns_list, path_csv=opt.path_output, logger=logger, sep=opt.sep, int_to_float=opt.int_to_float, remove_null=opt.remove_null) + df = get_dataframe(data, columns=columns_list, path_csv=opt.path_output, logger=logger, sep=opt.sep, int_to_float=opt.int_to_float, remove_null=opt.remove_null, is_json=opt.is_json) if not opt.streaming: logger.info("saving data to " + opt.path_output) From c3969a394cd3547b371e57c227cce77c4c56e804 Mon Sep 17 00:00:00 2001 From: YaYaB Date: Tue, 9 Apr 2019 18:09:20 +0200 Subject: [PATCH 03/12] Update Readme --- README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index faae00f..0f066a6 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,14 @@ The csv column's will be team_captain, team_defend, team_str ``` +Up until now, ljson are handled. Jsons are handled if in the file indicated as input is in one of the following format: +- Json array with one element per line +- One json element in the first line + +TODO: + +It does not yet parse jsons formatted in another way. +But soon, it will be ## Installation @@ -44,14 +52,14 @@ optional arguments: every json in memory --sep SEP Separator used to create columns names --int_to_float Cast int to float - --path_output PATH_OUTPUT - Path output + --path_output PATH_OUTPUT Path output + --remove_null Remove null values (kept by default) + --is_json Indicate if input file is a json ``` Please refer to [here](https://github.com/Besedo/json-to-csv/examples) for examples. - ## Meta Distributed under the Apache license v2.0. See ``LICENSE`` for more information. 
From 424dffd0c62979e546f51ef6f2cbbcc02ebf153b Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Tue, 9 Apr 2019 18:09:30 +0200
Subject: [PATCH 04/12] Add requirements file

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fb6c7ed
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+pandas

From 5cc938adac6d62d984e2faadeaeef41c98dcfd7f Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Sun, 21 Apr 2019 18:45:59 +0200
Subject: [PATCH 05/12] Manage all json types

---
 json_to_csv/json_to_csv.py | 49 +++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 68c5efa..3eb4a92 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -203,21 +203,52 @@ def read_jsons_chunks(file_object, chunk_size=10000):
             data = []
             for i in range(chunk_size):
                 # Here it works with one json, or an array of jsons with one json in each line
-                # TODO Make it work with no assumption over json
-                # Strip the comma and move to the next line
-                line = file_object.readline().strip(',\n')
+                nb_bracket = 0
+                nb_quotes = 0
+                example = ""
+                c_bef = ""
+                while True:
+                    # Read one character
+                    c = file_object.read(1)
+                    # If we are at the end of the file
+                    if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                        break
+                    # If we are in between 2 json examples or at the end
+                    if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                        continue
+                    # Check beginning of brackets
+                    if c == '{':
+                        # That means that the '"' is a delimiter of field or value in json
+                        if c_bef != '\\':
+                            nb_bracket += 1
+                    # Check quoting
+                    elif c == '"':
+                        # That means that the '"' is a delimiter of field or value in json
+                        if c_bef != '\\':
+                            nb_quotes += 1
+                    # Check ending of brackets
+                    elif c == '}':
+                        # That means that the '"' is a delimiter of field or value in json
+                        if c_bef != '\\':
+                            nb_bracket -= 1
+                    # This means we have finished reading one json
+                    if nb_bracket == 0 and nb_quotes % 2 == 0:
+                        example += c
+                        break
+                    # Append character to the json example
+                    example += c
+                    # Set previous character
+                    c_bef = c
                 # If EOF or the end of the json array is reached, send what's left of the data
-                if line == "" or line == "]":
-                    yield data
+                if example == "" or example == "]":
+                    yield(data)
                     return
                 else:
-                    data.append(json.loads(line))
+                    data.append(json.loads(example))
             if not data:
                 break
             yield data
-    # End of file obtained
-    elif file_object.read() == ']':
-        return None
     # Otherwise, we have one json in the file
     else:
         yield [json.loads(first_line)]

From b9aaf593dfe78bc63aa400fef03758fbc7f890ff Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Sun, 21 Apr 2019 19:05:38 +0200
Subject: [PATCH 06/12] Manage one json split across several lines

---
 json_to_csv/json_to_csv.py | 109 +++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 58 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 3eb4a92..968c43d 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -194,64 +194,57 @@ def update_columns_list(columns_list, json_list, sep, int_to_float, remove_null)
 def read_jsons_chunks(file_object, chunk_size=10000):
     """Lazy function to read a json by chunk.
Default chunk size: 10k""" - # Check first element of a file - # If it is "[", that means we have a json array - first_line = file_object.readline() - if first_line[0] == '[': - while True: - # Parse the next real chunk_size lines - data = [] - for i in range(chunk_size): - # Here it works with one json, or an array of jsons with one json in each line - # Remove comma and to the next line - nb_bracket = 0 - nb_quotes = 0 - example = "" - c_bef = "" - while True: - # Read one character - c = file_object.read(1) - # If we are at the end of the file - if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0: - break - # If we are in between 2 json examples or a the end - if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: - continue - # Check beginning of brackets - if c == '{': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - nb_bracket += 1 - # Check quoting - elif c == '"': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - nb_quotes += 1 - # Check ending of brackets - elif c == '}': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - nb_bracket -= 1 - # This means we finished to read one json - if nb_bracket == 0 and nb_quotes % 2 == 0: - example += c - break - # Append character to the json example - example += c - # Set previous character - c_bef = c - # If EOF obtained or end of jsonarray send what's left of the data - if example == "" or example == "]": - yield(data) - return - else: - data.append(json.loads(example)) - if not data: - break - yield data - # Otherwise, we have one json in the file - else: - yield [json.loads(first_line)] + + while True: + # Parse the next real chunk_size lines + data = [] + for i in range(chunk_size): + nb_bracket = 0 + nb_quotes = 0 + example = "" + c_bef = "" + while True: + # Read one character + c = file_object.read(1) + # If we are at the end of the file + if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0: + break + # If we are in between 2 json examples or a the end + if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: + continue + # Check beginning of brackets + if c == '{': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_bracket += 1 + # Check quoting + elif c == '"': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_quotes += 1 + # Check ending of brackets + elif c == '}': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_bracket -= 1 + # This means we finished to read one json + if nb_bracket == 0 and nb_quotes % 2 == 0: + example += c + break + # Append character to the json example + example += c + # Set previous character + c_bef = c + # If EOF obtained or end of jsonarray send what's left of the data + if example == "" or example == "]": + yield(data) + return + else: + data.append(json.loads(example)) + if not data: + break + yield data + def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json): """ From 3e8a62986b635df528c1be6ce23cbdbfeb5567bc Mon Sep 17 00:00:00 2001 From: YaYaB Date: Sun, 21 Apr 2019 19:11:39 +0200 Subject: [PATCH 07/12] Fix order txt files --- json_to_csv/json_to_csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index 968c43d..9214140 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -209,8 +209,8 
                 # If we are at the end of the file
                 if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
                     break
-                # If we are in between 2 json examples or at the end
-                if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                # If we are in between 2 json examples or at the end or at the beginning
+                if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
                     continue
                 # Check beginning of brackets

From 5e851a085004a642c30ccdd3dca284db486fbf4e Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Tue, 23 Apr 2019 09:43:13 +0200
Subject: [PATCH 08/12] Fix parsing jsons with escaped brackets and quotes

---
 json_to_csv/json_to_csv.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 9214140..063118a 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -203,6 +203,7 @@ def read_jsons_chunks(file_object, chunk_size=10000):
             nb_quotes = 0
             example = ""
             c_bef = ""
+            c_2bef = ""
             while True:
                 # Read one character
                 c = file_object.read(1)
@@ -213,27 +214,29 @@ def read_jsons_chunks(file_object, chunk_size=10000):
                 if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
                     continue
                 # Check beginning of brackets
-                if c == '{':
+                if c == '{' and nb_quotes % 2 == 0:
                     # That means that the '"' is a delimiter of field or value in json
-                    if c_bef != '\\':
+                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_bracket += 1
                 # Check quoting
                 elif c == '"':
                     # That means that the '"' is a delimiter of field or value in json
-                    if c_bef != '\\':
+                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_quotes += 1
                 # Check ending of brackets
-                elif c == '}':
+                elif c == '}' and nb_quotes % 2 == 0:
                     # That means that the '"' is a delimiter of field or value in json
-                    if c_bef != '\\':
+                    # Issue here when we have two
+                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_bracket -= 1
-                # This means we have finished reading one json
-                if nb_bracket == 0 and nb_quotes % 2 == 0:
-                    example += c
-                    break
+                    # This means we have finished reading one json
+                    if nb_bracket == 0 and nb_quotes % 2 == 0:
+                        example += c
+                        break
                 # Append character to the json example
                 example += c
-                # Set previous character
+                # Set previous characters
+                c_2bef = c_bef
                 c_bef = c
                 # If EOF or the end of the json array is reached, send what's left of the data
                 if example == "" or example == "]":

From c5648ed8101d3a0759f910113ea1ae5ad5e40c20 Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Tue, 23 Apr 2019 10:34:06 +0200
Subject: [PATCH 09/12] Clean comments

---
 json_to_csv/json_to_csv.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 063118a..5506491 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -215,18 +215,17 @@ def read_jsons_chunks(file_object, chunk_size=10000):
                     continue
                 # Check beginning of brackets
                 if c == '{' and nb_quotes % 2 == 0:
-                    # That means that the '"' is a delimiter of field or value in json
+                    # Check only when '{' is a delimiter of field or value in json
                     if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_bracket += 1
                 # Check quoting
                 elif c == '"':
-                    # That means that the '"' is a delimiter of field or value in json
+                    # Check only when '"' is a delimiter of field or value in json
                     if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_quotes += 1
                 # Check ending of brackets
                 elif c == '}' and nb_quotes % 2 == 0:
-                    # That means that the '"' is a delimiter of field or value in json
-                    # Issue here when we have two
+                    # Check only when '}' is a delimiter of field or value in json
                     if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_bracket -= 1
                     # This means we have finished reading one json

From 2cbb061b649da3f2923861189ce11ec14d3ba543 Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Mon, 29 Apr 2019 15:38:09 +0200
Subject: [PATCH 10/12] Refactor and clean escaped elements

---
 json_to_csv/json_to_csv.py | 113 ++++++++++++++++++++++---------------
 1 file changed, 66 insertions(+), 47 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 5506491..4bdaabf 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -195,57 +195,76 @@ def read_jsons_chunks(file_object, chunk_size=10000):
     """Lazy function to read a json by chunk.
     Default chunk size: 10k"""
 
+    # Parse the next real chunk_size lines
+    chunk = file_object.read(1000000)
+    data = []
+    i = 0
+    nb_bracket = 0
+    nb_quotes = 0
+    example = ""
+    count_escape_char = 0
     while True:
-        # Parse the next real chunk_size lines
-        data = []
-        for i in range(chunk_size):
-            nb_bracket = 0
-            nb_quotes = 0
-            example = ""
-            c_bef = ""
-            c_2bef = ""
-            while True:
-                # Read one character
-                c = file_object.read(1)
-                # If we are at the end of the file
-                if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
-                    break
-                # If we are in between 2 json examples or at the end or at the beginning
-                if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
-                    continue
-                # Check beginning of brackets
-                if c == '{' and nb_quotes % 2 == 0:
-                    # Check only when '{' is a delimiter of field or value in json
-                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
-                        nb_bracket += 1
-                # Check quoting
-                elif c == '"':
-                    # Check only when '"' is a delimiter of field or value in json
-                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
-                        nb_quotes += 1
-                # Check ending of brackets
-                elif c == '}' and nb_quotes % 2 == 0:
-                    # Check only when '}' is a delimiter of field or value in json
-                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
-                        nb_bracket -= 1
-                    # This means we have finished reading one json
-                    if nb_bracket == 0 and nb_quotes % 2 == 0:
-                        example += c
-                        break
-                # Append character to the json example
-                example += c
-                # Set previous characters
-                c_2bef = c_bef
-                c_bef = c
-            # If EOF or the end of the json array is reached, send what's left of the data
-            if example == "" or example == "]":
-                yield(data)
-                return
+        # Read character by character
+        for k, c in enumerate(chunk):
+            # Check quoting
+            if c == '"':
+                # Check only when '"' is a delimiter of field or value in json
+                if count_escape_char % 2 == 0:
+                    nb_quotes += 1
+            # Check beginning of brackets
+            elif c == '{' and nb_quotes % 2 == 0:
+                # Check only when '{' is a delimiter of field or value in json
+                if count_escape_char % 2 == 0:
+                    nb_bracket += 1
+            # Check ending of brackets
+            elif c == '}' and nb_quotes % 2 == 0:
+                # Check only when '}' is a delimiter of field or value in json
+                if count_escape_char % 2 == 0:
+                    nb_bracket -= 1
+                # This means we have finished reading one json
+                if nb_bracket == 0 and nb_quotes % 2 == 0:
+                    example += c
+                    data.append(json.loads(example))
+                    i += 1
+                    # When chunk_size jsons obtained, dump those
+                    if i % chunk_size == 0:
+                        yield(data)
+                        data = []
+
+                    # Initialize those
+                    example = ""
+                    c_bef = ""
+                    c_2bef = ""
+                    continue
+            # If we are in between 2 json examples or at the beginning
+            elif c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                continue
+            # If we are at the end of the file
+            if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                # If EOF or the end of the json array is reached, send what's left of the data
+                if example == "" or example == "]":
+                    yield(data)
+                    return
+            if c == "\\":
+                count_escape_char += 1
             else:
-                data.append(json.loads(example))
-            if not data:
+                count_escape_char = 0
+            # Append character to the json example
+            example += c
+
+            # Set previous characters
+            c_2bef = c_bef
+            c_bef = c
+        # If at the end of the chunk, read new chunk
+        if k == len(chunk) - 1:
+            chunk = file_object.read(1000000)
+        # Keep what's left of the chunk
+        elif len(chunk) != 0:
+            chunk = chunk[k:]
+        # if k == 0 that means that we read the whole file
+        else:
             break
-        yield data
+

From f9311c9878a27242d738a4a151a0071d9ec1ebfd Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Mon, 29 Apr 2019 15:55:55 +0200
Subject: [PATCH 11/12] Remove unused variables

---
 json_to_csv/json_to_csv.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 4bdaabf..2cf78e6 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -233,8 +233,6 @@ def read_jsons_chunks(file_object, chunk_size=10000):
 
                     # Initialize those
                     example = ""
-                    c_bef = ""
-                    c_2bef = ""
                     continue
             # If we are in between 2 json examples or at the beginning
             elif c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
@@ -251,10 +249,7 @@ def read_jsons_chunks(file_object, chunk_size=10000):
                 count_escape_char = 0
             # Append character to the json example
             example += c
-
-            # Set previous characters
-            c_2bef = c_bef
-            c_bef = c
+
         # If at the end of the chunk, read new chunk
         if k == len(chunk) - 1:
            chunk = file_object.read(1000000)

From d343889b0249578cb73ccb4587209fd39f06bd2e Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Thu, 9 May 2019 13:06:33 +0200
Subject: [PATCH 12/12] Fix non-streaming transformation when the input is a json

---
 json_to_csv/json_to_csv.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 2cf78e6..dc70eb7 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -351,9 +351,10 @@ def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep
             # Read json file by chunk
             for x in read_jsons_chunks(f, chunk_size=chunk_size):
                 if j!=0 and (j % chunk_size == 0):
-                    update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null)
                     logger.info('Iteration ' + str(j) + ': Creating sub dataframe')
-                    json_list = []
+                    if columns:
+                        update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null)
+                        json_list = []
                 try:
                     json_list.extend(x)
                     # Maximum of chunk_size elements were added
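Taken together, patches 05 through 12 converge on one parsing strategy: scan the input character by character, track `{`/`}` depth and quote parity, and use the parity of the current backslash run to decide whether a quote is escaped, yielding parsed documents in batches. For experimentation outside the patch series, here is a self-contained sketch of that same idea. It is a simplified re-implementation under stated assumptions (every top-level document is a json object, as the patches assume; function and parameter names are illustrative), not the exact code from the commits.

```python
import json
from io import StringIO

def iter_json_docs(stream, batch_size=10000, read_size=65536):
    """Yield batches of parsed json documents from a stream holding a single
    document, one document per line, or a json array -- the same cases the
    patches handle, and likewise assuming top-level documents are objects."""
    batch, buf = [], []
    depth = 0          # unmatched '{' seen outside strings
    in_string = False  # currently inside a quoted string?
    escapes = 0        # length of the current backslash run
    while True:
        chunk = stream.read(read_size)
        if not chunk:
            break
        for c in chunk:
            if in_string:
                buf.append(c)
                if c == '"' and escapes % 2 == 0:
                    in_string = False  # an unescaped quote closes the string
                escapes = escapes + 1 if c == '\\' else 0
                continue
            if depth == 0 and c in '[],\n\r\t ':
                continue  # separators between documents
            buf.append(c)
            if c == '"':
                in_string = True
            elif c == '{':
                depth += 1
            elif c == '}':
                depth -= 1
                if depth == 0:  # one complete document read
                    batch.append(json.loads(''.join(buf)))
                    buf = []
                    if len(batch) >= batch_size:
                        yield batch
                        batch = []
            escapes = 0
    if batch:
        yield batch

# Two documents in a json array, split across lines and containing an
# escaped quote -- the cases patches 05-08 are concerned with.
sample = '[\n{"a": "x,\\"y"},\n{"b": {"c": 1}}\n]'
print(list(iter_json_docs(StringIO(sample), batch_size=2)))
```

Counting the parity of consecutive backslashes is what distinguishes an escaped quote (`\"`) from a closing quote preceded by an escaped backslash (`\\"`), which is the distinction patch 08 approximates with `c_bef`/`c_2bef` and patch 10 generalizes with `count_escape_char`.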