From 5cc938adac6d62d984e2faadeaeef41c98dcfd7f Mon Sep 17 00:00:00 2001 From: YaYaB Date: Sun, 21 Apr 2019 18:45:59 +0200 Subject: [PATCH 1/5] Manage all jsons type --- json_to_csv/json_to_csv.py | 49 +++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index 68c5efa..3eb4a92 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -203,21 +203,52 @@ def read_jsons_chunks(file_object, chunk_size=10000): data = [] for i in range(chunk_size): # Here it works with one json, or an array of jsons with one json in each line - # TODO Make it work with no assumption over json # Remove comma and to the next line - line = file_object.readline().strip(',\n') - # If EOF obtained or end of jsonarray send what's left of the data - if line == "" or line == "]": - yield data + nb_bracket = 0 + nb_quotes = 0 + example = "" + c_bef = "" + while True: + # Read one character + c = file_object.read(1) + # If we are at the end of the file + if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0: + break + # If we are in between 2 json examples or a the end + if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: + continue + # Check beginning of brackets + if c == '{': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_bracket += 1 + # Check quoting + elif c == '"': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_quotes += 1 + # Check ending of brackets + elif c == '}': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_bracket -= 1 + # This means we finished to read one json + if nb_bracket == 0 and nb_quotes % 2 == 0: + example += c + break + # Append character to the json example + example += c + # Set previous character + c_bef = c + # If EOF obtained or end of jsonarray send what's left of the data + if example == "" 
or example == "]": + yield(data) return else: - data.append(json.loads(line)) + data.append(json.loads(example)) if not data: break yield data - # End of file obtained - elif file_object.read() == ']': - return None # Otherwise, we have one json in the file else: yield [json.loads(first_line)] From b9aaf593dfe78bc63aa400fef03758fbc7f890ff Mon Sep 17 00:00:00 2001 From: YaYaB Date: Sun, 21 Apr 2019 19:05:38 +0200 Subject: [PATCH 2/5] Manage one json split in several lines --- json_to_csv/json_to_csv.py | 109 +++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 58 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index 3eb4a92..968c43d 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -194,64 +194,57 @@ def update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) def read_jsons_chunks(file_object, chunk_size=10000): """Lazy function to read a json by chunk. Default chunk size: 10k""" - # Check first element of a file - # If it is "[", that means we have a json array - first_line = file_object.readline() - if first_line[0] == '[': - while True: - # Parse the next real chunk_size lines - data = [] - for i in range(chunk_size): - # Here it works with one json, or an array of jsons with one json in each line - # Remove comma and to the next line - nb_bracket = 0 - nb_quotes = 0 - example = "" - c_bef = "" - while True: - # Read one character - c = file_object.read(1) - # If we are at the end of the file - if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0: - break - # If we are in between 2 json examples or a the end - if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: - continue - # Check beginning of brackets - if c == '{': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - nb_bracket += 1 - # Check quoting - elif c == '"': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - 
nb_quotes += 1 - # Check ending of brackets - elif c == '}': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - nb_bracket -= 1 - # This means we finished to read one json - if nb_bracket == 0 and nb_quotes % 2 == 0: - example += c - break - # Append character to the json example - example += c - # Set previous character - c_bef = c - # If EOF obtained or end of jsonarray send what's left of the data - if example == "" or example == "]": - yield(data) - return - else: - data.append(json.loads(example)) - if not data: - break - yield data - # Otherwise, we have one json in the file - else: - yield [json.loads(first_line)] + + while True: + # Parse the next real chunk_size lines + data = [] + for i in range(chunk_size): + nb_bracket = 0 + nb_quotes = 0 + example = "" + c_bef = "" + while True: + # Read one character + c = file_object.read(1) + # If we are at the end of the file + if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0: + break + # If we are in between 2 json examples or a the end + if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: + continue + # Check beginning of brackets + if c == '{': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_bracket += 1 + # Check quoting + elif c == '"': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_quotes += 1 + # Check ending of brackets + elif c == '}': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_bracket -= 1 + # This means we finished to read one json + if nb_bracket == 0 and nb_quotes % 2 == 0: + example += c + break + # Append character to the json example + example += c + # Set previous character + c_bef = c + # If EOF obtained or end of jsonarray send what's left of the data + if example == "" or example == "]": + yield(data) + return + else: + data.append(json.loads(example)) + if not data: + break + yield data + 
def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json): """ From 3e8a62986b635df528c1be6ce23cbdbfeb5567bc Mon Sep 17 00:00:00 2001 From: YaYaB Date: Sun, 21 Apr 2019 19:11:39 +0200 Subject: [PATCH 3/5] Fix order txt files --- json_to_csv/json_to_csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index 968c43d..9214140 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -209,8 +209,8 @@ def read_jsons_chunks(file_object, chunk_size=10000): # If we are at the end of the file if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0: break - # If we are in between 2 json examples or a the end - if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: + # If we are in between 2 json examples or a the end or at the beginning + if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: continue # Check beginning of brackets if c == '{': From 5e851a085004a642c30ccdd3dca284db486fbf4e Mon Sep 17 00:00:00 2001 From: YaYaB Date: Tue, 23 Apr 2019 09:43:13 +0200 Subject: [PATCH 4/5] Fix parsing jsons with escaped brackets and quotes --- json_to_csv/json_to_csv.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index 9214140..063118a 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -203,6 +203,7 @@ def read_jsons_chunks(file_object, chunk_size=10000): nb_quotes = 0 example = "" c_bef = "" + c_2bef = "" while True: # Read one character c = file_object.read(1) @@ -213,27 +214,29 @@ def read_jsons_chunks(file_object, chunk_size=10000): if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: continue # Check beginning of brackets - if c == '{': + if c == '{' and nb_quotes % 2 == 0: # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': + if c_bef != '\\' or c_bef
== '\\' and c_2bef == '\\': nb_bracket += 1 # Check quoting elif c == '"': # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': + if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\': nb_quotes += 1 # Check ending of brackets - elif c == '}': + elif c == '}' and nb_quotes % 2 == 0: # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': + # Issue here when we have two + if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\': nb_bracket -= 1 - # This means we finished to read one json - if nb_bracket == 0 and nb_quotes % 2 == 0: - example += c - break + # This means we finished to read one json + if nb_bracket == 0 and nb_quotes % 2 == 0: + example += c + break # Append character to the json example example += c - # Set previous character + # Set previous characters + c_2bef = c_bef c_bef = c # If EOF obtained or end of jsonarray send what's left of the data if example == "" or example == "]": From c5648ed8101d3a0759f910113ea1ae5ad5e40c20 Mon Sep 17 00:00:00 2001 From: YaYaB Date: Tue, 23 Apr 2019 10:34:06 +0200 Subject: [PATCH 5/5] Clean comments --- json_to_csv/json_to_csv.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index 063118a..5506491 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -215,18 +215,17 @@ def read_jsons_chunks(file_object, chunk_size=10000): continue # Check beginning of brackets if c == '{' and nb_quotes % 2 == 0: - # That means that the '"' is a delimiter of field or value in json + # Check only when '{' is a delimiter of field or value in json if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\': nb_bracket += 1 # Check quoting elif c == '"': - # That means that the '"' is a delimiter of field or value in json + # Check only when '"' is a delimiter of field or value in json if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\': nb_quotes += 1 # Check ending of brackets 
elif c == '}' and nb_quotes % 2 == 0: - # That means that the '"' is a delimiter of field or value in json - # Issue here when we have two + # Check only when '}' is a delimiter of field or value in json if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\': nb_bracket -= 1 # This means we finished to read one json