Add option to make some pandas dataframes sparse
CalebBell committed Dec 24, 2021
1 parent 9d36e54 commit 25dd6ed
Showing 2 changed files with 57 additions and 8 deletions.
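
For context, the memory savings here come from pandas' sparse extension dtype, which stores only the non-missing values of a column. A minimal standalone sketch (not part of the commit) of the kind of dense-versus-sparse comparison the new helper performs:

import numpy as np
import pandas as pd

# A float column that is ~75% missing, stored densely and then sparsely.
dense = pd.Series([1.5, np.nan, np.nan, np.nan] * 25_000)
sparse = dense.astype(pd.SparseDtype("float", np.nan))

print(dense.memory_usage())   # ~800 kB: 8 bytes per row, filled or not
print(sparse.memory_usage())  # ~300 kB: only the filled values plus their positions
print(sparse.dtype)           # Sparse[float64, nan]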
59 changes: 54 additions & 5 deletions chemicals/data_reader.py
@@ -32,31 +32,80 @@
'list_available_methods_from_df']

import os
-from math import isnan
+from math import isnan, nan
try:
    path_join = os.path.join
except: # pragma: no cover
    pass

try:
    import numpy as np
    float64_dtype = np.dtype(np.float64)
    float32_dtype = np.dtype(np.float32)
    int64_dtype = np.dtype(np.int64)
    int32_dtype = np.dtype(np.int32)
    int16_dtype = np.dtype(np.int16)
    int8_dtype = np.dtype(np.int8)
    float_dtype_ids = set([id(float64_dtype), id(float32_dtype)])
    int_dtype_ids = set([id(int64_dtype), id(int32_dtype), id(int16_dtype), id(int8_dtype)])

except:
    pass
# %% Loading data from local databanks

pd = None

df_sources = {}
load_cmds = {}

def make_df_sparse(df, non_sparse_columns=[]):
    '''Take a dataframe and convert any floating-point columns that are mostly
    missing into sparse series. Return the resulting dataframe.
    '''
    for col, dtype in zip(df.columns, df.dtypes):
        if col in non_sparse_columns:
            continue
        if id(dtype) in float_dtype_ids:
            series_orig = df[col]
            series_small = series_orig.astype(pd.SparseDtype("float", nan))
            if series_small.memory_usage() < series_orig.memory_usage():
                df[col] = series_small
        elif id(dtype) in int_dtype_ids:
            # integer columns are left unchanged for now
            pass
        else:
            continue
    return df


def register_df_source(folder, name, sep='\t', index_col=0, csv_kwargs={},
-                       postload=None):
-    load_cmds[name] = (folder, name, sep, index_col, csv_kwargs, postload)
+                       postload=None, sparsify=False):
+    load_cmds[name] = (folder, name, sep, index_col, csv_kwargs, postload, sparsify)

'''The following flags strip out redundant chemical metadata columns to reduce
memory usage.
'''
try:
    low_mem = bool(int(os.environ.get('CHEDL_LOW_MEMORY', '0')))
except:
    low_mem = False
spurious_columns = set(['name', 'formula', 'MW', 'InChI', 'InChI_key', 'Chemical',
                        'Data Type', 'Uncertainty', 'Fluid', 'Name', 'Names', 'Name ',
                        'Formula', 'Formula '])

def load_df(key):
    global pd
    if pd is None:
        import pandas as pd
-    folder, name, sep, index_col, csv_kwargs, postload = load_cmds[key]
+    folder, name, sep, index_col, csv_kwargs, postload, sparsify = load_cmds[key]
    path = path_join(folder, name)
    df = pd.read_csv(path, sep=sep, index_col=index_col, **csv_kwargs)
    if postload: postload(df)
    if sparsify:
        df = make_df_sparse(df)
    if low_mem:
        for col_name in df.columns.values.tolist():
            if col_name in spurious_columns:
                del df[col_name]

    df_sources[key] = df

def data_source(key):
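The memory check above is applied column by column; here is an illustrative use of the new make_df_sparse helper on a toy dataframe, assuming the chemicals package with this commit is installed. Because data_reader imports pandas lazily, the sketch assigns the module-level pd by hand before calling the helper directly (normally that happens inside the first load_df call).

import numpy as np
import pandas as pd

from chemicals import data_reader

data_reader.pd = pd  # normally set on the first load_df() call

df = pd.DataFrame({
    'Tb': [351.39] + [np.nan] * 999,        # mostly missing -> converted to sparse
    'Tc': np.linspace(300.0, 700.0, 1000),  # fully populated -> left dense
})
df = data_reader.make_df_sparse(df)
print(df.dtypes)  # Tb: Sparse[float64, nan], Tc: float64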
6 changes: 3 additions & 3 deletions chemicals/miscdata.py
@@ -50,9 +50,9 @@
register_df_source(folder, 'Physical Constants of Inorganic Compounds.csv')
register_df_source(folder, 'Physical Constants of Organic Compounds.csv')
register_df_source(folder, 'joback_predictions.tsv')
-register_df_source(folder, 'wikidata_properties.tsv')
-register_df_source(folder, 'webbook_constants.tsv')
-register_df_source(folder, 'common_chemistry_data.tsv')
+register_df_source(folder, 'wikidata_properties.tsv', sparsify=True)
+register_df_source(folder, 'webbook_constants.tsv', sparsify=True)
+register_df_source(folder, 'common_chemistry_data.tsv', sparsify=True)

JOBACK = 'JOBACK'
WIKIDATA = 'WIKIDATA'
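Downstream, the new sparsify flag only changes how a registered file is loaded. A hypothetical end-to-end sketch, assuming this commit is installed; the folder and file name below are placeholders, not files shipped with chemicals, and CHEDL_LOW_MEMORY has to be set before chemicals is imported because the flag is read at import time.

import os
os.environ['CHEDL_LOW_MEMORY'] = '1'  # optionally also drop redundant metadata columns

from chemicals import data_reader

folder = '/path/to/tsv/files'  # placeholder directory containing the data file
data_reader.register_df_source(folder, 'my_properties.tsv', sparsify=True)
data_reader.load_df('my_properties.tsv')

df = data_reader.df_sources['my_properties.tsv']
print(df.dtypes)          # mostly-missing float columns should show up as Sparse[float64, nan]
print(df.memory_usage())  # should be noticeably smaller than a fully dense load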
