Add option for some dataframes to include CASs as integers, which costs 4x less memory
CalebBell · Dec 25, 2021 · fa2a02e · fa2a02e
1 parent 31ab0fd
commit fa2a02e
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 21 deletions.
diff --git a/chemicals/Reactions/Yaws Hf S0 (g).tsv b/chemicals/Reactions/Yaws Hf S0 (g).tsv
@@ -3280,7 +3280,7 @@ CAS	name	Hfg	S0g
 23009-73-6	1-chloro-2-methyl-trans-2-butene	-54840	373.45
 23009-74-7	1-chloro-2-methyl-cis-2-butene	-54840	373.45
 23010-04-0	1,2-dichloro-2-methylbutane	-186760	378.73
-23010-05-I	1,1-dichloro-2-methylbutane	-188570	398.56
+23010-05-1	1,1-dichloro-2-methylbutane	-188570	398.56
 23010-07-3	1,3-dichloro-2-methylbutane	-188570	398.56
 23068-94-2	3-bromo-cis-2-pentene	-12770	391.07
 23068-95-3	4-bromo-trans-2-pentene	-8260	385.7
@@ -3649,7 +3649,7 @@ CAS	name	Hfg	S0g
 53566-37-3	5-methyl-2-hexyne	74400	401.99
 53594-82-4	2,2-dimethylheptadecane	-444240	857.22
 53731-22-9	1,1-difluoro-3-methylbutane	-549310	395.1
-53731-23-O	1,1-difluoro-2,2-dimethylpropane	-552780	365.76
+53731-23-0	1,1-difluoro-2,2-dimethylpropane	-552780	365.76
 53731-24-1	3-fluoro-2-methyl-1-butene	-232280	364.17
 53731-25-2	2,3-difluoro-2-methylbutane	-552780	365.76
 53750-52-0	4-iodo-2-methyl-1-butene	45980	397.62
@@ -4126,7 +4126,7 @@ CAS	name	Hfg	S0g
 62127-42-8	1,1-difluorononane	-645200	560.54
 62127-43-9	1,1-difluorodecane	-665800	600.36
 62127-44-0	1,1-difluoroundecane	-686500	639.52
-62127-45-I	1,1-difluorododecane	-707100	679.34
+62127-45-0	1,1-difluorododecane	-707100	679.34
 62127-46-2	1,3,3-tribromobutane	-55650	426.08
 62127-47-3	2,3,3-tribromobutane	-60930	416.55
 62127-48-4	1,3-dibromo-2-(bromomethyl)propane	-52180	455.43
@@ -4794,7 +4794,7 @@ CAS	name	Hfg	S0g
 66553-15-9	2,3-dimethyl-1,2-butanediol	-494400	413.79
 66553-16-0	2-ethyl-1,2-butanediol	-490200	419.69
 66553-17-1	2-ethyl-1,3-butanediol	-484400	465.03
-66553-33-I	1,1-diiodododecane	-142550	725.14
+66553-33-0	1,1-diiodododecane	-142550	725.14
 66553-34-2	1-nitro-2,3-dimethylbutane	-188490	450.96
 66553-39-7	cis-1-bromo-1-dodecene	-147460	667.9
 66553-40-0	trans-1-bromo-1-dodecene	-147460	667.9
@@ -4925,7 +4925,7 @@ CAS	name	Hfg	S0g
 66688-66-2	1,3-difluoro-2,2-dimethylpropane	-547500	375.28
 66688-67-3	1,1-difluoro-2-methylbutane	-549310	395.1
 66688-69-5	1,2-difluoro-3-methylbutane	-549310	395.1
-66688-6R-4	1,2-difluoro-2-methylbutane	-547500	375.28
+66688-68-4	1,2-difluoro-2-methylbutane	-547500	375.28
 66688-72-0	cis-1-chloro-1-nonene	-127610	533.41
 66688-73-1	trans-1-chloro-1-nonene	-127610	533.41
 66688-74-2	cis-1-fluoro-1-nonene	-307980	531.69
@@ -4967,7 +4967,7 @@ CAS	name	Hfg	S0g
 66719-53-7	2,5-dimethyl-4-octanol	-417800	567.84
 66719-54-8	2,6-dimethyl-4-octanol	-417800	567.84
 66719-55-9	2,7-dimethyl-3-octanol	-417800	567.84
-66719-5I-5	4,4-dimethyl-3-isopropyl-1-pentanol	-421270	538.49
+66719-51-5	4,4-dimethyl-3-isopropyl-1-pentanol	-421270	538.49
 66731-94-0	4-methyl-5-ethyl-3-heptanol	-417800	567.84
 66731-95-1	5-methyl-2-nonanol	-412520	577.37
 66779-42-4	3-methyl-5-nonanol	-412520	577.37

diff --git a/chemicals/data_reader.py b/chemicals/data_reader.py
@@ -39,6 +39,7 @@
     pass
 try:
     import numpy as np
+    object_dtype = np.dtype(object)
     float64_dtype = np.dtype(np.float64)
     float32_dtype = np.dtype(np.float32)
     int64_dtype = np.dtype(np.int64)
@@ -50,6 +51,7 @@
 
 except:
     pass
+from chemicals.identifiers import CAS_to_int
 # %% Loading data from local databanks
 
 pd = None
@@ -77,8 +79,8 @@ def make_df_sparse(df, non_sparse_columns=[]):
 
 
 def register_df_source(folder, name, sep='\t', index_col=0, csv_kwargs={},
-                       postload=None, sparsify=False):
-    load_cmds[name] = (folder, name, sep, index_col, csv_kwargs, postload, sparsify)
+                       postload=None, sparsify=False, int_CAS=False):
+    # Register a data file for later loading; nothing is read from disk here.
+    # The arguments are stored as one tuple in `load_cmds` under `name`, in
+    # exactly the order that `load_df` unpacks them.
+    # With `int_CAS=True`, `load_df` replaces the dataframe index with
+    # `CAS_to_int(...)` of every index entry after loading.
+    # NOTE(review): `csv_kwargs={}` is a shared mutable default; this is only
+    # safe as long as no caller or loader mutates the stored dict.
+    load_cmds[name] = (folder, name, sep, index_col, csv_kwargs, postload, sparsify, int_CAS)
 
 '''The following flags will strip out the excess memory usage of redundant 
 chemical metadata information.
@@ -87,6 +89,7 @@ def register_df_source(folder, name, sep='\t', index_col=0, csv_kwargs={},
     low_mem = bool(int(os.environ.get('CHEDL_LOW_MEMORY', '0')))
 except:
     low_mem = False
+
 spurious_columns = set(['name', 'formula', 'MW', 'InChI', 'InChI_key', 'Chemical',
                     'Data Type', 'Uncertainty', 'Fluid', 'Name', 'Names', 'Name ',
                     'Formula', 'Formula '])
@@ -95,7 +98,7 @@ def load_df(key):
     global pd
     if pd is None:
         import pandas as pd
-    folder, name, sep, index_col, csv_kwargs, postload, sparsify = load_cmds[key]
+    folder, name, sep, index_col, csv_kwargs, postload, sparsify, int_CAS = load_cmds[key]
     path = path_join(folder, name)
     df = pd.read_csv(path, sep=sep, index_col=index_col, **csv_kwargs)
     if postload: postload(df)
@@ -105,6 +108,9 @@ def load_df(key):
         for col_name in df.columns.values.tolist():
             if col_name in spurious_columns:
                 del df[col_name]
+
+    if int_CAS:
+        df.index = pd.Index([CAS_to_int(s) for s in df.index])
 
     df_sources[key] = df
 
@@ -134,13 +140,27 @@ def retrieve_any_from_df_dict(df_dict, index, key):
         if value is not None: return value
 
 def retrieve_from_df(df, index, key):
-    if index in df.index:
-        if isinstance(key, str):
+    """Return the value(s) stored in `df` at row `index` for `key`.
+
+    If the dataframe index does not have object dtype, `index` is first
+    passed through `CAS_to_int` so it can match the stored row labels.
+
+    Returns the result of `get_value_from_df` when `key` is a single int or
+    str, a list of floats (one per entry) when `key` is an iterable of
+    column names, and None (implicitly) when `index` is not in the index.
+    """
+    df_index = df.index
+    if df_index.dtype is not object_dtype:
+        try:
+            index = CAS_to_int(index)
+        except Exception:
+            # Best effort: if `index` cannot be converted, look it up
+            # unchanged. `Exception` (not a bare except) so that
+            # KeyboardInterrupt/SystemExit are not swallowed.
+            pass
+
+    if index in df_index:
+        if isinstance(key, (int, str)):
             return get_value_from_df(df, index, key)
         else: # Assume its an iterable of strings
             return [float(df.at[index, i]) for i in key]
 
 def retrieve_any_from_df(df, index, keys):
+    df_index = df.index
+    if df_index.dtype is not object_dtype:
+        try:
+            index = CAS_to_int(index)
+        except:
+            pass
+
     if index not in df.index: return None
     for key in keys:
         value = df.at[index, key]
@@ -159,8 +179,21 @@ def get_value_from_df(df, index, key):
         return value
 
 def list_available_methods_from_df_dict(df_dict, index, key):
-    return [method for method, df in df_dict.items()
-            if (index in df.index) and not pd.isnull(df.at[index, key])]
+    """Return the names in `df_dict` whose dataframe has a non-NaN `key`
+    value for `index`.
+
+    `df_dict` maps a method name to a dataframe. For dataframes whose index
+    does not have object dtype, `index` is converted with `CAS_to_int`
+    before the lookup; dataframes with an object-dtype index are queried
+    with `index` unchanged.
+    """
+    methods = []
+    for method, df in df_dict.items():
+        df_index = df.index
+        # Start from the caller's key on every iteration so that a failed
+        # conversion can never leave the lookup key unbound (NameError on the
+        # first dataframe) or stale from a previous dataframe.
+        lookup = index
+        if df_index.dtype is not object_dtype:
+            try:
+                lookup = CAS_to_int(index)
+            except Exception:
+                # Best effort, matching retrieve_from_df: fall back to the
+                # unconverted key, which simply will not be found if it is
+                # not a valid label.
+                pass
+        if (lookup in df_index) and not isnan(df.at[lookup, key]):
+            methods.append(method)
+
+    return methods
 
 def list_available_methods_from_df(df, index, keys_by_method):
     if index in df.index:

diff --git a/chemicals/miscdata.py b/chemicals/miscdata.py
@@ -49,10 +49,10 @@
 ### CRC Handbook general tables
 register_df_source(folder, 'Physical Constants of Inorganic Compounds.csv')
 register_df_source(folder, 'Physical Constants of Organic Compounds.csv')
-register_df_source(folder, 'joback_predictions.tsv')
-register_df_source(folder, 'wikidata_properties.tsv', sparsify=True)
-register_df_source(folder, 'webbook_constants.tsv', sparsify=True)
-register_df_source(folder, 'common_chemistry_data.tsv', sparsify=True)
+register_df_source(folder, 'joback_predictions.tsv', int_CAS=True)
+register_df_source(folder, 'wikidata_properties.tsv', sparsify=True, int_CAS=True)
+register_df_source(folder, 'webbook_constants.tsv', sparsify=True, int_CAS=True)
+register_df_source(folder, 'common_chemistry_data.tsv', sparsify=True, int_CAS=True)
 
 JOBACK = 'JOBACK'
 WIKIDATA = 'WIKIDATA'

diff --git a/tests/test_critical.py b/tests/test_critical.py
@@ -25,6 +25,7 @@
 import pandas as pd
 from fluids.numerics import assert_close, assert_close1d
 from chemicals.miscdata import webbook_data
+from chemicals import int_to_CAS
 from chemicals.critical import *
 from chemicals.critical import (critical_data_IUPAC,
                                 critical_data_Matthews,
@@ -203,7 +204,7 @@ def test_Tc_all_values():
     for k in sources:
         for i in k.index:
             if pd.notnull(k.at[i, 'Tc']):
-                CASs.add(i)
+                CASs.add(i if type(i) is str else int_to_CAS(i))
 
     # Use the default method for each chemical in this file
     Tcs = [Tc(i) for i in CASs]
@@ -260,7 +261,7 @@ def test_Pc_all_values():
     for k in sources:
         for i in k.index:
             if pd.notnull(k.at[i, 'Pc']):
-                CASs.add(i)
+                CASs.add(i if type(i) is str else int_to_CAS(i))
 
     # Use the default method for each chemical in this file
     Pcs = [Pc(i) for i in CASs]
@@ -291,7 +292,7 @@ def test_Vc_all_values():
     for k in sources:
         for i in k.index:
             if pd.notnull(k.at[i, 'Vc']):
-                CASs.add(i)
+                CASs.add(i if type(i) is str else int_to_CAS(i))
 
     # Use the default method for each chemical in this file
     Vcs = [Vc(i) for i in CASs]
@@ -322,7 +323,7 @@ def test_Zc_all_values():
     for k in sources:
         for i in k.index:
             if pd.notnull(k.at[i, 'Zc']):
-                CASs.add(i)
+                CASs.add(i if type(i) is str else int_to_CAS(i))
 
     # Use the default method for each chemical in this file
     Zcs = [Zc(i) for i in CASs]