diff --git a/chemicals/data_reader.py b/chemicals/data_reader.py
index 0feab07..c5986f2 100644
--- a/chemicals/data_reader.py
+++ b/chemicals/data_reader.py
@@ -32,12 +32,24 @@
            'list_available_methods_from_df']
 
 import os
-from math import isnan
+from math import isnan, nan
 try:
     path_join = os.path.join
 except: # pragma: no cover
     pass
-
+try:
+    import numpy as np
+    float64_dtype = np.dtype(np.float64)
+    float32_dtype = np.dtype(np.float32)
+    int64_dtype = np.dtype(np.int64)
+    int32_dtype = np.dtype(np.int32)
+    int16_dtype = np.dtype(np.int16)
+    int8_dtype = np.dtype(np.int8)
+    float_dtype_ids = set([id(float64_dtype), id(float32_dtype)])
+    int_dtype_ids = set([id(int64_dtype), id(int32_dtype), id(int16_dtype), id(int8_dtype)])
+
+except:
+    pass
 
 # %% Loading data from local databanks
 
 pd = None
@@ -45,18 +57,55 @@
 df_sources = {}
 load_cmds = {}
 
+def make_df_sparse(df, non_sparse_columns=()):
+    '''Convert mostly-missing floating-point columns of `df` into sparse
+    series in place, skipping `non_sparse_columns`; return the dataframe.
+    '''
+    for col, dtype in zip(df.columns, df.dtypes):
+        if col in non_sparse_columns:
+            continue
+        if id(dtype) in float_dtype_ids:
+            series_orig = df[col]
+            series_small = series_orig.astype(pd.SparseDtype("float", nan))
+            if series_small.memory_usage() < series_orig.memory_usage():
+                df[col] = series_small
+        elif id(dtype) in int_dtype_ids:
+            pass
+        else:
+            continue
+    return df
+
+
 def register_df_source(folder, name, sep='\t', index_col=0, csv_kwargs={},
-                       postload=None):
-    load_cmds[name] = (folder, name, sep, index_col, csv_kwargs, postload)
+                       postload=None, sparsify=False):
+    load_cmds[name] = (folder, name, sep, index_col, csv_kwargs, postload, sparsify)
+
+'''The following flags will strip out the excess memory usage of redundant
+chemical metadata information. 
+'''
+try:
+    low_mem = bool(int(os.environ.get('CHEDL_LOW_MEMORY', '0')))
+except:
+    low_mem = False
+spurious_columns = set(['name', 'formula', 'MW', 'InChI', 'InChI_key', 'Chemical',
+                        'Data Type', 'Uncertainty', 'Fluid', 'Name', 'Names', 'Name ',
+                        'Formula', 'Formula '])
 
 def load_df(key):
     global pd
     if pd is None:
         import pandas as pd
-    folder, name, sep, index_col, csv_kwargs, postload = load_cmds[key]
+    folder, name, sep, index_col, csv_kwargs, postload, sparsify = load_cmds[key]
     path = path_join(folder, name)
     df = pd.read_csv(path, sep=sep, index_col=index_col, **csv_kwargs)
     if postload: postload(df)
+    if sparsify:
+        df = make_df_sparse(df)
+    if low_mem:
+        for col_name in df.columns.values.tolist():
+            if col_name in spurious_columns:
+                del df[col_name]
+
     df_sources[key] = df
 
 def data_source(key):
diff --git a/chemicals/miscdata.py b/chemicals/miscdata.py
index 8c35c84..b5e4f96 100644
--- a/chemicals/miscdata.py
+++ b/chemicals/miscdata.py
@@ -50,9 +50,9 @@
 register_df_source(folder, 'Physical Constants of Inorganic Compounds.csv')
 register_df_source(folder, 'Physical Constants of Organic Compounds.csv')
 register_df_source(folder, 'joback_predictions.tsv')
-register_df_source(folder, 'wikidata_properties.tsv')
-register_df_source(folder, 'webbook_constants.tsv')
-register_df_source(folder, 'common_chemistry_data.tsv')
+register_df_source(folder, 'wikidata_properties.tsv', sparsify=True)
+register_df_source(folder, 'webbook_constants.tsv', sparsify=True)
+register_df_source(folder, 'common_chemistry_data.tsv', sparsify=True)
 
 JOBACK = 'JOBACK'
 WIKIDATA = 'WIKIDATA'