From 8099fbf017f21db1445bdc354a00ef3bb55c0bd7 Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Tue, 9 Apr 2019 15:59:12 +0200
Subject: [PATCH 01/12] Handle json files (json array with one json per line,
 or a json file containing one json)

---
 json_to_csv/json_to_csv.py | 64 ++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index e45cdd1..ac1ed0c 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -190,7 +190,38 @@ def update_columns_list(columns_list, json_list, sep, int_to_float, remove_null)
     return columns_list
 
 
-def get_columns(list_data_paths, sep, logger, int_to_float, remove_null):
+def read_jsons_chunks(file_object, chunk_size=10000):
+    """Lazy function to read a json by chunk.
+    Default chunk size: 10k"""
+    # Check first element of a file
+    # If it is "[", that means we have a json array
+    first_line = file_object.readline()
+    if first_line[0] == '[':
+        while True:
+            # Parse the next real chunk_size lines
+            data = []
+            for i in range(chunk_size):
+                # Here it works with one json, or an array of jsons with one json in each line
+                # TODO Make it work with no assumption over json
+                # Strip the comma and move to the next line
+                line = file_object.readline().strip(',\n')
+                # If EOF or the end of the json array is reached, send what's left of the data
+                if line == "" or line == "]":
+                    yield data
+                    return
+                else:
+                    data.append(json.loads(line))
+            if not data:
+                break
+            yield data
+    # End of file obtained
+    elif file_object.read() == ']':
+        return None
+    # Otherwise, we have one json in the file
+    else:
+        yield [json.loads(first_line)]
+
+def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json=False):
     """
     Get the columns created according to a list of files containing json
 
@@ -199,6 +230,7 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null):
     :param logger: logger (used to print)
     :param int_to_float: if set to true int will be cast to float
     :param remove_null: if set to true, nulls will be removed from json arrays
+    :param is_json: if set to true, inputs are considered valid json
     :return: Exhaustive list of columns
 
     """
@@ -206,21 +238,41 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null):
     columns_list = []
     j = 0
+    chunk_size = 500000
     for data_file in list_data_paths:
         logger.info(data_file)
         json_list = []
-        with open(data_file) as f:
-            for i, line in enumerate(f):
+        # If we deal with a json (or json array) file
+        if is_json:
+            f = open(data_file)
+            # Read json file by chunk
+            for x in read_jsons_chunks(f, chunk_size=chunk_size):
                 j += 1
-                if (j % 500000 == 0):
+                if (j % chunk_size == 0):
                     columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null)
                     logger.info('Iteration ' + str(j) + ': Updating columns ===> ' + str(len(columns_list)) + ' columns found')
                     json_list = []
                 try:
-                    json_list.append(json.loads(line))
+                    json_list.extend(x)
+                    # Maximum of chunk_size elements were added
+                    j+=chunk_size - 1 # -1 because we add 1 at the beginning of the loop
                 except:
                     logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. Example was skipped")
Example was skipped") continue + # If we deal with ljson + else: + with open(data_file) as f: + for i, line in enumerate(f): + j += 1 + if (j % 500000 == 0): + columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) + logger.info('Iteration ' + str(j) + ': Updating columns ===> ' + str(len(columns_list)) + ' columns found') + json_list = [] + try: + json_list.append(json.loads(line)) + except: + logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. Example was skipped") + continue # A quicker solution would be to join directly to create a valid json if (len(json_list) > 0): columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) @@ -313,7 +365,7 @@ def main(): logger.info("Reading " + opt.path_data_jsonperline) data = [opt.path_data_jsonperline] - # Get list of columns if not in streaming + # Get list of columns if in streaming columns_list = None if opt.streaming: columns_list = get_columns(data, opt.sep, logger, opt.int_to_float, opt.remove_null) From 10918a5665371c1c926fb10a6dd7510677aeb0fe Mon Sep 17 00:00:00 2001 From: YaYaB Date: Tue, 9 Apr 2019 18:05:34 +0200 Subject: [PATCH 02/12] Adapt streaming flow for json files --- json_to_csv/json_to_csv.py | 63 +++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index ac1ed0c..68c5efa 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -25,6 +25,7 @@ def get_args(): parser.add_argument("--int_to_float", action='store_true', default=False, help="Cast int to float") parser.add_argument("--path_output", type=str, help="Path output") parser.add_argument("--remove_null", action='store_true', default=False, help="Remove null values (kept by default)") + parser.add_argument("--is_json", action='store_true', default=False, help="Indicate if input file is a json") args = parser.parse_args() return args @@ -221,7 +222,7 @@ def read_jsons_chunks(file_object, chunk_size=10000): else: yield [json.loads(first_line)] -def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json=False): +def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json): """ Get the columns created accordingly to a list of files containing json @@ -238,7 +239,7 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json columns_list = [] j = 0 - chunk_size = 500000 + chunk_size = 50000 for data_file in list_data_paths: logger.info(data_file) json_list = [] @@ -247,24 +248,23 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json f = open(data_file) # Read json file by chunk for x in read_jsons_chunks(f, chunk_size=chunk_size): - j += 1 - if (j % chunk_size == 0): + if j!=0 and (j % chunk_size == 0): columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) logger.info('Iteration ' + str(j) + ': Updating columns ===> ' + str(len(columns_list)) + ' columns found') json_list = [] try: json_list.extend(x) # Maximum of chunk_size elements were added - j+=chunk_size - 1 # -1 because we add 1 at the beginning of the loop + j += chunk_size except: - logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. Example was skipped") + logger.info("Json in line " + str(j) + " (in file: " + data_file + ") does not seem well formed. 
Example was skipped") continue # If we deal with ljson else: with open(data_file) as f: for i, line in enumerate(f): j += 1 - if (j % 500000 == 0): + if (j % 50000 == 0): columns_list = update_columns_list(columns_list, json_list, sep, int_to_float, remove_null) logger.info('Iteration ' + str(j) + ': Updating columns ===> ' + str(len(columns_list)) + ' columns found') json_list = [] @@ -283,7 +283,7 @@ def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json return columns_list -def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep='.', int_to_float=False, remove_null=False): +def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep='.', int_to_float=False, remove_null=False, is_json=False): """ Get dataframe from files containing one json per line @@ -294,30 +294,51 @@ def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep :param sep: separator to use when creating columns' names :param int_to_float: if set to true int will be casted to float :param remove_null: if set to true, will remove_null from json arrays + :param is_json: if set to true, inputs are considered as valid json :return: dataframe or nothing if the dataframe is generated while streaming the files """ json_list = [] j = 0 + chunk_size = 50000 for data_file in list_data_paths: logger.info(data_file) - with open(data_file) as f: - for i, line in enumerate(f): - j += 1 - if (j % 500000 == 0): + json_list = [] + # If we deal with json (or json array) file + if is_json: + f = open(data_file) + # Read json file by chunk + for x in read_jsons_chunks(f, chunk_size=chunk_size): + if j!=0 and (j % chunk_size == 0): + update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null) logger.info('Iteration ' + str(j) + ': Creating sub dataframe') - if columns: - update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null) - json_list.clear() - - if (j % 100000 == 0): - logger.info(str(i) + ' documents processed') + json_list = [] try: - json_list.append(json.loads(line)) + json_list.extend(x) + # Maximum of chunk_size elements were added + j += chunk_size # -1 because we add 1 at the beginning of the loop except: logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. Example was skipped") continue + # If we deal with ljson + else: + with open(data_file) as f: + for i, line in enumerate(f): + j += 1 + if (j % 50000 == 0): + logger.info('Iteration ' + str(j) + ': Creating sub dataframe') + if columns: + update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null) + json_list.clear() + + if (j % 100000 == 0): + logger.info(str(i) + ' documents processed') + try: + json_list.append(json.loads(line)) + except: + logger.info("Json in line " + str(i) + " (in file: " + data_file + ") does not seem well formed. 
Example was skipped") + continue # A quicker solution would be to join directly to create a valid json logger.info('Convert to DataFrame') @@ -368,7 +389,7 @@ def main(): # Get list of columns if in streaming columns_list = None if opt.streaming: - columns_list = get_columns(data, opt.sep, logger, opt.int_to_float, opt.remove_null) + columns_list = get_columns(data, opt.sep, logger, opt.int_to_float, opt.remove_null, opt.is_json) # Sort columns in alphabetical order columns_list.sort() df = pd.DataFrame(columns=columns_list) @@ -378,7 +399,7 @@ def main(): df.to_csv(opt.path_output, encoding="utf-8", index=None, quoting=1) # Get dataframe - df = get_dataframe(data, columns=columns_list, path_csv=opt.path_output, logger=logger, sep=opt.sep, int_to_float=opt.int_to_float, remove_null=opt.remove_null) + df = get_dataframe(data, columns=columns_list, path_csv=opt.path_output, logger=logger, sep=opt.sep, int_to_float=opt.int_to_float, remove_null=opt.remove_null, is_json=opt.is_json) if not opt.streaming: logger.info("saving data to " + opt.path_output) From c3969a394cd3547b371e57c227cce77c4c56e804 Mon Sep 17 00:00:00 2001 From: YaYaB Date: Tue, 9 Apr 2019 18:09:20 +0200 Subject: [PATCH 03/12] Update Readme --- README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index faae00f..0f066a6 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,14 @@ The csv column's will be team_captain, team_defend, team_str ``` +Up until now, ljson are handled. Jsons are handled if in the file indicated as input is in one of the following format: +- Json array with one element per line +- One json element in the first line + +TODO: + +It does not yet parse jsons formatted in another way. +But soon, it will be ## Installation @@ -44,14 +52,14 @@ optional arguments: every json in memory --sep SEP Separator used to create columns names --int_to_float Cast int to float - --path_output PATH_OUTPUT - Path output + --path_output PATH_OUTPUT Path output + --remove_null Remove null values (kept by default) + --is_json Indicate if input file is a json ``` Please refer to [here](https://github.com/Besedo/json-to-csv/examples) for examples. - ## Meta Distributed under the Apache license v2.0. See ``LICENSE`` for more information. 
From 424dffd0c62979e546f51ef6f2cbbcc02ebf153b Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Tue, 9 Apr 2019 18:09:30 +0200
Subject: [PATCH 04/12] Add requirements file

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fb6c7ed
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+pandas

From 5cc938adac6d62d984e2faadeaeef41c98dcfd7f Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Sun, 21 Apr 2019 18:45:59 +0200
Subject: [PATCH 05/12] Manage all json types

---
 json_to_csv/json_to_csv.py | 49 +++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 68c5efa..3eb4a92 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -203,21 +203,52 @@ def read_jsons_chunks(file_object, chunk_size=10000):
             data = []
             for i in range(chunk_size):
                 # Here it works with one json, or an array of jsons with one json in each line
-                # TODO Make it work with no assumption over json
-                # Strip the comma and move to the next line
-                line = file_object.readline().strip(',\n')
+                nb_bracket = 0
+                nb_quotes = 0
+                example = ""
+                c_bef = ""
+                while True:
+                    # Read one character
+                    c = file_object.read(1)
+                    # If we are at the end of the file
+                    if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                        break
+                    # If we are in between 2 json examples or at the end
+                    if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                        continue
+                    # Check beginning of brackets
+                    if c == '{':
+                        # That means that the '"' is a delimiter of field or value in json
+                        if c_bef != '\\':
+                            nb_bracket += 1
+                    # Check quoting
+                    elif c == '"':
+                        # That means that the '"' is a delimiter of field or value in json
+                        if c_bef != '\\':
+                            nb_quotes += 1
+                    # Check ending of brackets
+                    elif c == '}':
+                        # That means that the '"' is a delimiter of field or value in json
+                        if c_bef != '\\':
+                            nb_bracket -= 1
+                    # This means we have finished reading one json
+                    if nb_bracket == 0 and nb_quotes % 2 == 0:
+                        example += c
+                        break
+                    # Append character to the json example
+                    example += c
+                    # Set previous character
+                    c_bef = c
                 # If EOF or the end of the json array is reached, send what's left of the data
-                if line == "" or line == "]":
-                    yield data
+                if example == "" or example == "]":
+                    yield(data)
                     return
                 else:
-                    data.append(json.loads(line))
+                    data.append(json.loads(example))
             if not data:
                 break
             yield data
-    # End of file obtained
-    elif file_object.read() == ']':
-        return None
     # Otherwise, we have one json in the file
     else:
         yield [json.loads(first_line)]

From b9aaf593dfe78bc63aa400fef03758fbc7f890ff Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Sun, 21 Apr 2019 19:05:38 +0200
Subject: [PATCH 06/12] Manage one json split across several lines

---
 json_to_csv/json_to_csv.py | 109 +++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 58 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 3eb4a92..968c43d 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -194,64 +194,57 @@ def update_columns_list(columns_list, json_list, sep, int_to_float, remove_null)
 def read_jsons_chunks(file_object, chunk_size=10000):
     """Lazy function to read a json by chunk.
Default chunk size: 10k""" - # Check first element of a file - # If it is "[", that means we have a json array - first_line = file_object.readline() - if first_line[0] == '[': - while True: - # Parse the next real chunk_size lines - data = [] - for i in range(chunk_size): - # Here it works with one json, or an array of jsons with one json in each line - # Remove comma and to the next line - nb_bracket = 0 - nb_quotes = 0 - example = "" - c_bef = "" - while True: - # Read one character - c = file_object.read(1) - # If we are at the end of the file - if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0: - break - # If we are in between 2 json examples or a the end - if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: - continue - # Check beginning of brackets - if c == '{': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - nb_bracket += 1 - # Check quoting - elif c == '"': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - nb_quotes += 1 - # Check ending of brackets - elif c == '}': - # That means that the '"' is a delimiter of field or value in json - if c_bef != '\\': - nb_bracket -= 1 - # This means we finished to read one json - if nb_bracket == 0 and nb_quotes % 2 == 0: - example += c - break - # Append character to the json example - example += c - # Set previous character - c_bef = c - # If EOF obtained or end of jsonarray send what's left of the data - if example == "" or example == "]": - yield(data) - return - else: - data.append(json.loads(example)) - if not data: - break - yield data - # Otherwise, we have one json in the file - else: - yield [json.loads(first_line)] + + while True: + # Parse the next real chunk_size lines + data = [] + for i in range(chunk_size): + nb_bracket = 0 + nb_quotes = 0 + example = "" + c_bef = "" + while True: + # Read one character + c = file_object.read(1) + # If we are at the end of the file + if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0: + break + # If we are in between 2 json examples or a the end + if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0: + continue + # Check beginning of brackets + if c == '{': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_bracket += 1 + # Check quoting + elif c == '"': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_quotes += 1 + # Check ending of brackets + elif c == '}': + # That means that the '"' is a delimiter of field or value in json + if c_bef != '\\': + nb_bracket -= 1 + # This means we finished to read one json + if nb_bracket == 0 and nb_quotes % 2 == 0: + example += c + break + # Append character to the json example + example += c + # Set previous character + c_bef = c + # If EOF obtained or end of jsonarray send what's left of the data + if example == "" or example == "]": + yield(data) + return + else: + data.append(json.loads(example)) + if not data: + break + yield data + def get_columns(list_data_paths, sep, logger, int_to_float, remove_null, is_json): """ From 3e8a62986b635df528c1be6ce23cbdbfeb5567bc Mon Sep 17 00:00:00 2001 From: YaYaB Date: Sun, 21 Apr 2019 19:11:39 +0200 Subject: [PATCH 07/12] Fix order txt files --- json_to_csv/json_to_csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index 968c43d..9214140 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -209,8 +209,8 
                 # If we are at the end of the file
                 if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
                     break
-                # If we are in between 2 json examples or at the end
-                if c in [',','\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                # If we are in between 2 json examples or at the end or at the beginning
+                if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
                     continue
                 # Check beginning of brackets

From 5e851a085004a642c30ccdd3dca284db486fbf4e Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Tue, 23 Apr 2019 09:43:13 +0200
Subject: [PATCH 08/12] Fix parsing jsons with escaped brackets and quotes

---
 json_to_csv/json_to_csv.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 9214140..063118a 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -203,6 +203,7 @@ def read_jsons_chunks(file_object, chunk_size=10000):
             nb_quotes = 0
             example = ""
             c_bef = ""
+            c_2bef = ""
             while True:
                 # Read one character
                 c = file_object.read(1)
@@ -213,27 +214,29 @@ def read_jsons_chunks(file_object, chunk_size=10000):
                 if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
                     continue
                 # Check beginning of brackets
-                if c == '{':
+                if c == '{' and nb_quotes % 2 == 0:
                     # That means that the '"' is a delimiter of field or value in json
-                    if c_bef != '\\':
+                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_bracket += 1
                 # Check quoting
                 elif c == '"':
                     # That means that the '"' is a delimiter of field or value in json
-                    if c_bef != '\\':
+                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_quotes += 1
                 # Check ending of brackets
-                elif c == '}':
+                elif c == '}' and nb_quotes % 2 == 0:
                     # That means that the '"' is a delimiter of field or value in json
-                    if c_bef != '\\':
+                    # Issue here when we have two
+                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_bracket -= 1
-                # This means we have finished reading one json
-                if nb_bracket == 0 and nb_quotes % 2 == 0:
-                    example += c
-                    break
+                    # This means we have finished reading one json
+                    if nb_bracket == 0 and nb_quotes % 2 == 0:
+                        example += c
+                        break
                 # Append character to the json example
                 example += c
-                # Set previous character
+                # Set previous characters
+                c_2bef = c_bef
                 c_bef = c
                 # If EOF or the end of the json array is reached, send what's left of the data
                 if example == "" or example == "]":

From c5648ed8101d3a0759f910113ea1ae5ad5e40c20 Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Tue, 23 Apr 2019 10:34:06 +0200
Subject: [PATCH 09/12] Clean comments

---
 json_to_csv/json_to_csv.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 063118a..5506491 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -215,18 +215,17 @@ def read_jsons_chunks(file_object, chunk_size=10000):
                     continue
                 # Check beginning of brackets
                 if c == '{' and nb_quotes % 2 == 0:
-                    # That means that the '"' is a delimiter of field or value in json
+                    # Check only when '{' is a delimiter of field or value in json
                     if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_bracket += 1
                 # Check quoting
                 elif c == '"':
-                    # That means that the '"' is a delimiter of field or value in json
+                    # Check only when '"' is a delimiter of field or value in json
                     if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_quotes += 1
                 # Check ending of brackets
                 elif c == '}' and nb_quotes % 2 == 0:
-                    # That means that the '"' is a delimiter of field or value in json
-                    # Issue here when we have two
+                    # Check only when '}' is a delimiter of field or value in json
                     if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
                         nb_bracket -= 1
                     # This means we have finished reading one json

From 2cbb061b649da3f2923861189ce11ec14d3ba543 Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Mon, 29 Apr 2019 15:38:09 +0200
Subject: [PATCH 10/12] Refactor and clean escaped elements

---
 json_to_csv/json_to_csv.py | 113 ++++++++++++++++++++++---------------
 1 file changed, 66 insertions(+), 47 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 5506491..4bdaabf 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -195,57 +195,76 @@ def read_jsons_chunks(file_object, chunk_size=10000):
     """Lazy function to read a json by chunk.
     Default chunk size: 10k"""
 
+    # Parse the next real chunk_size lines
+    chunk = file_object.read(1000000)
+    data = []
+    i = 0
+    nb_bracket = 0
+    nb_quotes = 0
+    example = ""
+    count_escape_char = 0
     while True:
-        # Parse the next real chunk_size lines
-        data = []
-        for i in range(chunk_size):
-            nb_bracket = 0
-            nb_quotes = 0
-            example = ""
-            c_bef = ""
-            c_2bef = ""
-            while True:
-                # Read one character
-                c = file_object.read(1)
-                # If we are at the end of the file
-                if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
-                    break
-                # If we are in between 2 json examples or at the end or at the beginning
-                if c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
-                    continue
-                # Check beginning of brackets
-                if c == '{' and nb_quotes % 2 == 0:
-                    # Check only when '{' is a delimiter of field or value in json
-                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
-                        nb_bracket += 1
-                # Check quoting
-                elif c == '"':
-                    # Check only when '"' is a delimiter of field or value in json
-                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
-                        nb_quotes += 1
-                # Check ending of brackets
-                elif c == '}' and nb_quotes % 2 == 0:
-                    # Check only when '}' is a delimiter of field or value in json
-                    if c_bef != '\\' or c_bef == '\\' and c_2bef == '\\':
-                        nb_bracket -= 1
-                    # This means we have finished reading one json
-                    if nb_bracket == 0 and nb_quotes % 2 == 0:
-                        example += c
-                        break
-                # Append character to the json example
-                example += c
-                # Set previous characters
-                c_2bef = c_bef
-                c_bef = c
-            # If EOF or the end of the json array is reached, send what's left of the data
-            if example == "" or example == "]":
-                yield(data)
-                return
+        # Read character by character
+        for k, c in enumerate(chunk):
+            # Check quoting
+            if c == '"':
+                # Check only when '"' is a delimiter of field or value in json
+                if count_escape_char % 2 == 0:
+                    nb_quotes += 1
+            # Check beginning of brackets
+            elif c == '{' and nb_quotes % 2 == 0:
+                # Check only when '{' is a delimiter of field or value in json
+                if count_escape_char % 2 == 0:
+                    nb_bracket += 1
+            # Check ending of brackets
+            elif c == '}' and nb_quotes % 2 == 0:
+                # Check only when '}' is a delimiter of field or value in json
+                if count_escape_char % 2 == 0:
+                    nb_bracket -= 1
+                # This means we have finished reading one json
+                if nb_bracket == 0 and nb_quotes % 2 == 0:
+                    example += c
+                    data.append(json.loads(example))
+                    i += 1
+                    # When chunk_size jsons obtained, dump those
+                    if i % chunk_size == 0:
+                        yield(data)
+                        data = []
+
+                    # Initialize those
+                    example = ""
+                    c_bef = ""
+                    c_2bef = ""
+                    continue
+            # If we are in between 2 json examples or at the beginning
+            elif c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                continue
+            # If we are at the end of the file
+            if c in [']', ''] and nb_bracket == 0 and nb_quotes % 2 == 0:
+                # If EOF or the end of the json array is reached, send what's left of the data
+                if example == "" or example == "]":
+                    yield(data)
+                    return
+            if c == "\\":
+                count_escape_char += 1
             else:
-                data.append(json.loads(example))
-            if not data:
+                count_escape_char = 0
+            # Append character to the json example
+            example += c
+
+            # Set previous characters
+            c_2bef = c_bef
+            c_bef = c
+        # If at the end of the chunk, read new chunk
+        if k == len(chunk) - 1:
+            chunk = file_object.read(1000000)
+        # Keep what's left of the chunk
+        elif len(chunk) != 0:
+            chunk = chunk[k:]
+        # if k == 0 that means that we read the whole file
+        else:
             break
-        yield data
+

From f9311c9878a27242d738a4a151a0071d9ec1ebfd Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Mon, 29 Apr 2019 15:55:55 +0200
Subject: [PATCH 11/12] Remove unused variables

---
 json_to_csv/json_to_csv.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 4bdaabf..2cf78e6 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -233,8 +233,6 @@ def read_jsons_chunks(file_object, chunk_size=10000):
 
                     # Initialize those
                     example = ""
-                    c_bef = ""
-                    c_2bef = ""
                     continue
             # If we are in between 2 json examples or at the beginning
             elif c in ['[', ',', '\n'] and nb_bracket == 0 and nb_quotes % 2 == 0:
@@ -251,10 +249,7 @@ def read_jsons_chunks(file_object, chunk_size=10000):
                 count_escape_char = 0
             # Append character to the json example
             example += c
-
-            # Set previous characters
-            c_2bef = c_bef
-            c_bef = c
+
         # If at the end of the chunk, read new chunk
         if k == len(chunk) - 1:
            chunk = file_object.read(1000000)

From d343889b0249578cb73ccb4587209fd39f06bd2e Mon Sep 17 00:00:00 2001
From: YaYaB
Date: Thu, 9 May 2019 13:06:33 +0200
Subject: [PATCH 12/12] Fix non-streaming transformation when the input is a json

---
 json_to_csv/json_to_csv.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py
index 2cf78e6..dc70eb7 100644
--- a/json_to_csv/json_to_csv.py
+++ b/json_to_csv/json_to_csv.py
@@ -351,9 +351,10 @@ def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep
             # Read json file by chunk
             for x in read_jsons_chunks(f, chunk_size=chunk_size):
                 if j!=0 and (j % chunk_size == 0):
-                    update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null)
                     logger.info('Iteration ' + str(j) + ': Creating sub dataframe')
-                    json_list = []
+                    if columns:
+                        update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null)
+                        json_list = []
                 try:
                     json_list.extend(x)
                     # Maximum of chunk_size elements were added
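Taken together, patches 05 through 12 converge on one parsing strategy: scan the input character by character, track `{`/`}` depth and quote parity, and use the parity of the current backslash run to decide whether a quote is escaped, yielding parsed documents in batches. For experimentation outside the patch series, here is a self-contained sketch of that same idea. It is a simplified re-implementation under stated assumptions (every top-level document is a json object, as the patches assume; function and parameter names are illustrative), not the exact code from the commits.

```python
import json
from io import StringIO

def iter_json_docs(stream, batch_size=10000, read_size=65536):
    """Yield batches of parsed json documents from a stream holding a single
    document, one document per line, or a json array -- the same cases the
    patches handle, and likewise assuming top-level documents are objects."""
    batch, buf = [], []
    depth = 0          # unmatched '{' seen outside strings
    in_string = False  # currently inside a quoted string?
    escapes = 0        # length of the current backslash run
    while True:
        chunk = stream.read(read_size)
        if not chunk:
            break
        for c in chunk:
            if in_string:
                buf.append(c)
                if c == '"' and escapes % 2 == 0:
                    in_string = False  # an unescaped quote closes the string
                escapes = escapes + 1 if c == '\\' else 0
                continue
            if depth == 0 and c in '[],\n\r\t ':
                continue  # separators between documents
            buf.append(c)
            if c == '"':
                in_string = True
            elif c == '{':
                depth += 1
            elif c == '}':
                depth -= 1
                if depth == 0:  # one complete document read
                    batch.append(json.loads(''.join(buf)))
                    buf = []
                    if len(batch) >= batch_size:
                        yield batch
                        batch = []
            escapes = 0
    if batch:
        yield batch

# Two documents in a json array, split across lines and containing an
# escaped quote -- the cases patches 05-08 are concerned with.
sample = '[\n{"a": "x,\\"y"},\n{"b": {"c": 1}}\n]'
print(list(iter_json_docs(StringIO(sample), batch_size=2)))
```

Counting the parity of consecutive backslashes is what distinguishes an escaped quote (`\"`) from a closing quote preceded by an escaped backslash (`\\"`), which is the distinction patch 08 approximates with `c_bef`/`c_2bef` and patch 10 generalizes with `count_escape_char`.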