Skip to content

Commit

Permalink
Add option for some dataframes to include CASs as integers, which costs 4x less memory
Browse files Browse the repository at this point in the history
  • Loading branch information
CalebBell committed Dec 25, 2021
1 parent 31ab0fd commit fa2a02e
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 21 deletions.
12 changes: 6 additions & 6 deletions chemicals/Reactions/Yaws Hf S0 (g).tsv
Original file line number Diff line number Diff line change
Expand Up @@ -3280,7 +3280,7 @@ CAS name Hfg S0g
23009-73-6 1-chloro-2-methyl-trans-2-butene -54840 373.45
23009-74-7 1-chloro-2-methyl-cis-2-butene -54840 373.45
23010-04-0 1,2-dichloro-2-methylbutane -186760 378.73
23010-05-I 1,1-dichloro-2-methylbutane -188570 398.56
23010-05-1 1,1-dichloro-2-methylbutane -188570 398.56
23010-07-3 1,3-dichloro-2-methylbutane -188570 398.56
23068-94-2 3-bromo-cis-2-pentene -12770 391.07
23068-95-3 4-bromo-trans-2-pentene -8260 385.7
Expand Down Expand Up @@ -3649,7 +3649,7 @@ CAS name Hfg S0g
53566-37-3 5-methyl-2-hexyne 74400 401.99
53594-82-4 2,2-dimethylheptadecane -444240 857.22
53731-22-9 1,1-difluoro-3-methylbutane -549310 395.1
53731-23-O 1,1-difluoro-2,2-dimethylpropane -552780 365.76
53731-23-0 1,1-difluoro-2,2-dimethylpropane -552780 365.76
53731-24-1 3-fluoro-2-methyl-1-butene -232280 364.17
53731-25-2 2,3-difluoro-2-methylbutane -552780 365.76
53750-52-0 4-iodo-2-methyl-1-butene 45980 397.62
Expand Down Expand Up @@ -4126,7 +4126,7 @@ CAS name Hfg S0g
62127-42-8 1,1-difluorononane -645200 560.54
62127-43-9 1,1-difluorodecane -665800 600.36
62127-44-0 1,1-difluoroundecane -686500 639.52
62127-45-I 1,1-difluorododecane -707100 679.34
62127-45-1 1,1-difluorododecane -707100 679.34
62127-46-2 1,3,3-tribromobutane -55650 426.08
62127-47-3 2,3,3-tribromobutane -60930 416.55
62127-48-4 1,3-dibromo-2-(bromomethyl)propane -52180 455.43
Expand Down Expand Up @@ -4794,7 +4794,7 @@ CAS name Hfg S0g
66553-15-9 2,3-dimethyl-1,2-butanediol -494400 413.79
66553-16-0 2-ethyl-1,2-butanediol -490200 419.69
66553-17-1 2-ethyl-1,3-butanediol -484400 465.03
66553-33-I 1,1-diiodododecane -142550 725.14
66553-33-1 1,1-diiodododecane -142550 725.14
66553-34-2 1-nitro-2,3-dimethylbutane -188490 450.96
66553-39-7 cis-1-bromo-1-dodecene -147460 667.9
66553-40-0 trans-1-bromo-1-dodecene -147460 667.9
Expand Down Expand Up @@ -4925,7 +4925,7 @@ CAS name Hfg S0g
66688-66-2 1,3-difluoro-2,2-dimethylpropane -547500 375.28
66688-67-3 1,1-difluoro-2-methylbutane -549310 395.1
66688-69-5 1,2-difluoro-3-methylbutane -549310 395.1
66688-6R-4 1,2-difluoro-2-methylbutane -547500 375.28
66688-68-4 1,2-difluoro-2-methylbutane -547500 375.28
66688-72-0 cis-1-chloro-1-nonene -127610 533.41
66688-73-1 trans-1-chloro-1-nonene -127610 533.41
66688-74-2 cis-1-fluoro-1-nonene -307980 531.69
Expand Down Expand Up @@ -4967,7 +4967,7 @@ CAS name Hfg S0g
66719-53-7 2,5-dimethyl-4-octanol -417800 567.84
66719-54-8 2,6-dimethyl-4-octanol -417800 567.84
66719-55-9 2,7-dimethyl-3-octanol -417800 567.84
66719-5I-5 4,4-dimethyl-3-isopropyl-1-pentanol -421270 538.49
66719-51-5 4,4-dimethyl-3-isopropyl-1-pentanol -421270 538.49
66731-94-0 4-methyl-5-ethyl-3-heptanol -417800 567.84
66731-95-1 5-methyl-2-nonanol -412520 577.37
66779-42-4 3-methyl-5-nonanol -412520 577.37
Expand Down
47 changes: 40 additions & 7 deletions chemicals/data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
pass
try:
import numpy as np
object_dtype = np.dtype(object)
float64_dtype = np.dtype(np.float64)
float32_dtype = np.dtype(np.float32)
int64_dtype = np.dtype(np.int64)
Expand All @@ -50,6 +51,7 @@

except:
pass
from chemicals.identifiers import CAS_to_int
# %% Loading data from local databanks

pd = None
Expand Down Expand Up @@ -77,8 +79,8 @@ def make_df_sparse(df, non_sparse_columns=[]):


def register_df_source(folder, name, sep='\t', index_col=0, csv_kwargs=None,
                       postload=None, sparsify=False, int_CAS=False):
    '''Record how to load a data file so it can be read later on demand.

    Nothing is read here; the arguments are stored as a tuple in
    ``load_cmds`` under `name`, in the order `load_df` unpacks them.

    Parameters
    ----------
    folder : str
        Directory containing the file; joined with `name` when loading.
    name : str
        File name; also the key used in ``load_cmds``.
    sep : str, optional
        Field separator forwarded to ``pd.read_csv``.
    index_col : int, optional
        Index column forwarded to ``pd.read_csv``.
    csv_kwargs : dict, optional
        Extra keyword arguments forwarded to ``pd.read_csv``. ``None``
        (the default) means no extra arguments.
    postload : callable, optional
        If given, called with the loaded DataFrame by `load_df`.
    sparsify : bool, optional
        Stored for `load_df`; presumably requests a sparse representation
        (see `make_df_sparse`) - confirm against the full `load_df` body.
    int_CAS : bool, optional
        If True, `load_df` replaces the DataFrame index with the result of
        ``CAS_to_int`` applied to each index entry.
    '''
    # A fresh dict per registration: a `{}` default would be one shared
    # object aliased by every entry stored in `load_cmds`.
    if csv_kwargs is None:
        csv_kwargs = {}
    load_cmds[name] = (folder, name, sep, index_col, csv_kwargs, postload,
                       sparsify, int_CAS)

'''The following flags will strip out the excess memory usage of redundant
chemical metadata information.
Expand All @@ -87,6 +89,7 @@ def register_df_source(folder, name, sep='\t', index_col=0, csv_kwargs={},
low_mem = bool(int(os.environ.get('CHEDL_LOW_MEMORY', '0')))
except:
low_mem = False

spurious_columns = set(['name', 'formula', 'MW', 'InChI', 'InChI_key', 'Chemical',
'Data Type', 'Uncertainty', 'Fluid', 'Name', 'Names', 'Name ',
'Formula', 'Formula '])
Expand All @@ -95,7 +98,7 @@ def load_df(key):
global pd
if pd is None:
import pandas as pd
folder, name, sep, index_col, csv_kwargs, postload, sparsify = load_cmds[key]
folder, name, sep, index_col, csv_kwargs, postload, sparsify, int_CAS = load_cmds[key]
path = path_join(folder, name)
df = pd.read_csv(path, sep=sep, index_col=index_col, **csv_kwargs)
if postload: postload(df)
Expand All @@ -105,6 +108,9 @@ def load_df(key):
for col_name in df.columns.values.tolist():
if col_name in spurious_columns:
del df[col_name]

if int_CAS:
df.index = pd.Index([CAS_to_int(s) for s in df.index])

df_sources[key] = df

Expand Down Expand Up @@ -134,13 +140,27 @@ def retrieve_any_from_df_dict(df_dict, index, key):
if value is not None: return value

def retrieve_from_df(df, index, key):
    '''Look up one or several column values for a row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to read from.
    index : str or int
        Row label. When the index dtype of `df` is not object, the label is
        first passed through ``CAS_to_int``; if that conversion fails the
        label is used unchanged.
    key : int, str, or iterable
        A single column label, or an iterable of column labels.

    Returns
    -------
    value
        For a single `key`, whatever ``get_value_from_df`` returns; for an
        iterable of keys, a list of floats. ``None`` if the row label is
        not present in the index.
    '''
    df_index = df.index
    if df_index.dtype is not object_dtype:
        # Non-object index: the table is keyed by converted CAS numbers.
        # Conversion is best-effort; `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt/SystemExit propagate.
        try:
            index = CAS_to_int(index)
        except Exception:
            pass

    if index not in df_index:
        return None
    if isinstance(key, (int, str)):
        return get_value_from_df(df, index, key)
    # Otherwise assume it's an iterable of column labels
    return [float(df.at[index, i]) for i in key]

def retrieve_any_from_df(df, index, keys):
df_index = df.index
if df_index.dtype is not object_dtype:
try:
index = CAS_to_int(index)
except:
pass

if index not in df.index: return None
for key in keys:
value = df.at[index, key]
Expand All @@ -159,8 +179,21 @@ def get_value_from_df(df, index, key):
return value

def list_available_methods_from_df_dict(df_dict, index, key):
    '''List the methods whose table holds a non-NaN value for a row.

    Parameters
    ----------
    df_dict : dict
        Maps method name to a pandas.DataFrame.
    index : str or int
        Row label to look for. For a frame whose index dtype is not
        object, the label is first passed through ``CAS_to_int``; if the
        conversion fails the label is used unchanged.
    key : str
        Column whose value must not be NaN.

    Returns
    -------
    methods : list
        Keys of `df_dict`, in iteration order, for which the row exists
        and ``isnan`` of the value in column `key` is False.
    '''
    methods = []
    for method, df in df_dict.items():
        df_index = df.index
        # Start from the raw label so a failed conversion can never leave
        # the lookup variable unbound or stale from a previous frame.
        lookup = index
        if df_index.dtype is not object_dtype:
            try:
                lookup = CAS_to_int(index)
            except Exception:
                pass
        if (lookup in df_index) and not isnan(df.at[lookup, key]):
            methods.append(method)
    return methods

def list_available_methods_from_df(df, index, keys_by_method):
if index in df.index:
Expand Down
8 changes: 4 additions & 4 deletions chemicals/miscdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@
### CRC Handbook general tables
register_df_source(folder, 'Physical Constants of Inorganic Compounds.csv')
register_df_source(folder, 'Physical Constants of Organic Compounds.csv')
register_df_source(folder, 'joback_predictions.tsv')
register_df_source(folder, 'wikidata_properties.tsv', sparsify=True)
register_df_source(folder, 'webbook_constants.tsv', sparsify=True)
register_df_source(folder, 'common_chemistry_data.tsv', sparsify=True)
register_df_source(folder, 'joback_predictions.tsv', int_CAS=True)
register_df_source(folder, 'wikidata_properties.tsv', sparsify=True, int_CAS=True)
register_df_source(folder, 'webbook_constants.tsv', sparsify=True, int_CAS=True)
register_df_source(folder, 'common_chemistry_data.tsv', sparsify=True, int_CAS=True)

JOBACK = 'JOBACK'
WIKIDATA = 'WIKIDATA'
Expand Down
9 changes: 5 additions & 4 deletions tests/test_critical.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import pandas as pd
from fluids.numerics import assert_close, assert_close1d
from chemicals.miscdata import webbook_data
from chemicals import int_to_CAS
from chemicals.critical import *
from chemicals.critical import (critical_data_IUPAC,
critical_data_Matthews,
Expand Down Expand Up @@ -203,7 +204,7 @@ def test_Tc_all_values():
for k in sources:
for i in k.index:
if pd.notnull(k.at[i, 'Tc']):
CASs.add(i)
CASs.add(i if type(i) is str else int_to_CAS(i))

# Use the default method for each chemical in this file
Tcs = [Tc(i) for i in CASs]
Expand Down Expand Up @@ -260,7 +261,7 @@ def test_Pc_all_values():
for k in sources:
for i in k.index:
if pd.notnull(k.at[i, 'Pc']):
CASs.add(i)
CASs.add(i if type(i) is str else int_to_CAS(i))

# Use the default method for each chemical in this file
Pcs = [Pc(i) for i in CASs]
Expand Down Expand Up @@ -291,7 +292,7 @@ def test_Vc_all_values():
for k in sources:
for i in k.index:
if pd.notnull(k.at[i, 'Vc']):
CASs.add(i)
CASs.add(i if type(i) is str else int_to_CAS(i))

# Use the default method for each chemical in this file
Vcs = [Vc(i) for i in CASs]
Expand Down Expand Up @@ -322,7 +323,7 @@ def test_Zc_all_values():
for k in sources:
for i in k.index:
if pd.notnull(k.at[i, 'Zc']):
CASs.add(i)
CASs.add(i if type(i) is str else int_to_CAS(i))

# Use the default method for each chemical in this file
Zcs = [Zc(i) for i in CASs]
Expand Down

0 comments on commit fa2a02e

Please sign in to comment.