From c1d9dabf8281b7598e88bdec798221a63b75f0e4 Mon Sep 17 00:00:00 2001 From: YaYaB Date: Tue, 5 Jun 2018 17:29:07 +0200 Subject: [PATCH] Add a flag to remove or no null values --- json_to_csv/json_to_csv.py | 40 +++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/json_to_csv/json_to_csv.py b/json_to_csv/json_to_csv.py index 4ae4ee1..2823d05 100644 --- a/json_to_csv/json_to_csv.py +++ b/json_to_csv/json_to_csv.py @@ -22,8 +22,10 @@ def get_args(): parser.add_argument("--path_data_jsonperline", type=str, help="File or folder of files containing one json per line") parser.add_argument("--streaming", action='store_true', default=False, help="Create the csv in a stream way instead of loading every json in memory") parser.add_argument("--sep", default='.', help="Separator used to create columns' names") - parser.add_argument("--int_to_float", action='store_true', default=False, help="Cast int to float") + parser.add_argument("--int_to_float", action='store_true', default=False, help="Cast int to float") parser.add_argument("--path_output", type=str, help="Path output") + parser.add_argument("--remove_null", action='store_true', default=False, help="Remove null values (kept by default)") + args = parser.parse_args() return args @@ -48,7 +50,7 @@ def setup_custom_logger(name): return logger -def _flatten(d, parent_key='', sep='_', int_to_float=False): +def _flatten(d, parent_key='', sep='_', int_to_float=False, remove_null=False): """ Flatten a nested dictionary to one leve dictionary (recursive function) @@ -69,13 +71,17 @@ def _flatten(d, parent_key='', sep='_', int_to_float=False): for w in v: my_elems_w = [] if isinstance(w, dict): - my_elems_w.extend(_flatten(w, sep=sep, int_to_float=int_to_float).items()) + my_elems_w.extend(_flatten(w, sep=sep, int_to_float=int_to_float, remove_null=remove_null).items()) elif isinstance(w, str): my_elems.append('"' + w + '"') continue - else: + elif w != None: my_elems.append(w) continue + else: + if not remove_null: + my_elems.append('null') + continue # Put in in alphabetical order my_elems_w = sorted(my_elems_w, key=lambda tup: tup[0]) @@ -100,17 +106,19 @@ def _flatten(d, parent_key='', sep='_', int_to_float=False): my_elems = '[' + ','.join(my_elems) + ']' items.append((new_key, my_elems)) elif isinstance(v, dict): - items.extend(_flatten(v, new_key, sep=sep, int_to_float=int_to_float).items()) + items.extend(_flatten(v, new_key, sep=sep, int_to_float=int_to_float, remove_null=remove_null).items()) else: if isinstance(v, int) and int_to_float: items.append((new_key, float(v))) else: if v != None: items.append((new_key, v)) + elif not remove_null: + items.append((new_key, 'null')) return dict(items) -def _transform_jsons(json_list, sep, int_to_float): +def _transform_jsons(json_list, sep, int_to_float, remove_null): """ Transform list of jsons by flattening those @@ -123,11 +131,11 @@ def _transform_jsons(json_list, sep, int_to_float): """ # Transform - new_jsons = [_flatten(j, sep=sep, int_to_float=int_to_float) for j in json_list] + new_jsons = [_flatten(j, sep=sep, int_to_float=int_to_float, remove_null=remove_null) for j in json_list] return new_jsons -def update_df_list(df_list, json_list, sep, int_to_float): +def update_df_list(df_list, json_list, sep, int_to_float, remove_null): """ Update list of dataframes with list of jsons @@ -140,7 +148,7 @@ def update_df_list(df_list, json_list, sep, int_to_float): :return: list of dataframes udpated """ - data = _transform_jsons(json_list, sep, int_to_float) + data = _transform_jsons(json_list, sep, int_to_float, remove_null) df = pd.DataFrame(data) df_list.append(df) @@ -148,7 +156,7 @@ def update_df_list(df_list, json_list, sep, int_to_float): return df_list -def update_csv(path_csv, json_list, columns, sep, int_to_float): +def update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null): """ Append a csv with json list @@ -159,7 +167,7 @@ def update_csv(path_csv, json_list, columns, sep, int_to_float): :param int_to_float: if set tu true int will be casted to float """ - data = _transform_jsons(json_list, sep, int_to_float) + data = _transform_jsons(json_list, sep, int_to_float, remove_null) df = pd.DataFrame(data) @@ -237,7 +245,7 @@ def get_columns(list_data_paths, sep, logger): return columns_list -def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep='.', int_to_float=False): +def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep='.', int_to_float=False, remove_null=False): """ Get dataframe from files containing one json per line @@ -262,7 +270,7 @@ def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep if (j % 500000 == 0): logger.info('Iteration ' + str(j) + ': Creating sub dataframe') if columns: - update_csv(path_csv, json_list, columns, sep, int_to_float) + update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null) json_list.clear() if (j % 100000 == 0): @@ -279,12 +287,12 @@ def get_dataframe(list_data_paths, columns=None, path_csv=None, logger=None, sep logger.info('Iteration ' + str(j) + ': Creating last sub dataframe') if columns: logger.info("updating csv with new data" + path_csv) - update_csv(path_csv, json_list, columns, sep, int_to_float) + update_csv(path_csv, json_list, columns, sep, int_to_float, remove_null) json_list.clear() if not columns: # Concatenate the dataframes created - list_of_dfs = update_df_list([], json_list, sep, int_to_float) + list_of_dfs = update_df_list([], json_list, sep, int_to_float, remove_null) logger.info('Concatenate ' + str(len(list_of_dfs)) + ' DataFrames') df = pd.concat(list_of_dfs) @@ -332,7 +340,7 @@ def main(): df.to_csv(opt.path_output, encoding="utf-8", index=None) # Get dataframe - df = get_dataframe(data, columns=columns_list, path_csv=opt.path_output, logger=logger, sep=opt.sep, int_to_float=opt.int_to_float) + df = get_dataframe(data, columns=columns_list, path_csv=opt.path_output, logger=logger, sep=opt.sep, int_to_float=opt.int_to_float, remove_null=opt.remove_null) if not opt.streaming: logger.info("saving data to " + opt.path_output)