Skip to content

Commit

Permalink
Merge branch 'feature/epathermostat_2.0' into feature/epa2.0_line_voltage
Browse files Browse the repository at this point in the history
  • Loading branch information
craigmaloney committed Mar 11, 2021
2 parents c65f0aa + 6bc6ebe commit dda7674
Show file tree
Hide file tree
Showing 14 changed files with 3,149 additions and 3,103 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ cache:

language: python
python:
- 3.5
- 3.6
- 3.7
- 3.8
- 3.9

notifications:
email: false
Expand Down
15 changes: 12 additions & 3 deletions scripts/multi_thermostat_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,19 @@ def main():
# console
logging.captureWarnings(True)

# data_dir = os.path.join("..", "tests", "data", "single_stage")
# This section finds the metadata files and data files for the thermostats.
# These point to examples of the various styles of files
# Single Stage
data_dir = os.path.join("..", "tests", "data", "single_stage")
metadata_filename = os.path.join(data_dir, "metadata.csv")

# Two Stage
# data_dir = os.path.join("..", "tests", "data", "two_stage")
data_dir = os.path.join("..", "tests", "data", "two_stage_ert")
metadata_filename = os.path.join(data_dir, "epa_two_stage_metadata.csv")
# metadata_filename = os.path.join(data_dir, "epa_two_stage_metadata.csv")

# Two Stage ERT
# data_dir = os.path.join("..", "tests", "data", "two_stage_ert")
# metadata_filename = os.path.join(data_dir, "epa_two_stage_metadata.csv")

# Use this to save the weather cache to local disk files
# thermostats = from_csv(metadata_filename, verbose=True, save_cache=True,
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
install_requires=[
'eemeter==2.5.2',
'eeweather==0.3.23',
'numpy < 1.20',
'pandas==0.25.3',
'numpy<=1.20',
'pandas<=1.2.0',
'sqlalchemy==1.3.1',
'zipcodes==1.1.2',
],
Expand Down
10 changes: 5 additions & 5 deletions tests/data/single_stage/thermostat_example_certification.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
product_id,sw_version,metric,filter,region,statistic,season,value
test_product,2.0.0a1,percent_savings_baseline_percentile,tau_cvrmse_savings_p01,national_weighted_mean,lower_bound_95,heating,
test_product,2.0.0a1,percent_savings_baseline_percentile,tau_cvrmse_savings_p01,national_weighted_mean,lower_bound_95,cooling,
test_product,2.0.0a1,percent_savings_baseline_percentile,tau_cvrmse_savings_p01,national_weighted_mean,q20,heating,
test_product,2.0.0a1,percent_savings_baseline_percentile,tau_cvrmse_savings_p01,national_weighted_mean,q20,cooling,
test_product,2.0.0a1,rhu_30F_to_45F,tau_cvrmse_savings_p01,all,upper_bound_95,heating,
test_product,2.0.0a2,percent_savings_baseline_percentile,tau_cvrmse_savings_p01,national_weighted_mean,lower_bound_95,heating,20.17
test_product,2.0.0a2,percent_savings_baseline_percentile,tau_cvrmse_savings_p01,national_weighted_mean,lower_bound_95,cooling,44.36
test_product,2.0.0a2,percent_savings_baseline_percentile,tau_cvrmse_savings_p01,national_weighted_mean,q20,heating,19.83
test_product,2.0.0a2,percent_savings_baseline_percentile,tau_cvrmse_savings_p01,national_weighted_mean,q20,cooling,38.44
test_product,2.0.0a2,rhu_30F_to_45F,tau_cvrmse_savings_p01,all,upper_bound_95,heating,0.09
4,268 changes: 2,134 additions & 2,134 deletions tests/data/single_stage/thermostat_example_stats.csv

Large diffs are not rendered by default.

1,748 changes: 874 additions & 874 deletions tests/data/two_stage_ert/thermostat_example_stats.csv

Large diffs are not rendered by default.

5 changes: 1 addition & 4 deletions tests/test_core_single_stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,8 @@ def test_pandas_warnings(thermostat_type_1):
with pytest.warns(Warning):
__pandas_warnings('0.21.0')

with pytest.warns(Warning):
__pandas_warnings('1.2.0')

with pytest.warns(None) as pytest_warnings:
__pandas_warnings('0.25.3')
__pandas_warnings('1.2.0')
assert not pytest_warnings

assert __pandas_warnings(None) is None
Expand Down
4 changes: 2 additions & 2 deletions tests/test_eeweather_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@


def test_get_indexed_temperatures_eeweather_empty_index():
empty_index = pd.DataFrame()
empty_index = pd.DataFrame([])
results = get_indexed_temperatures_eeweather('720648', empty_index)
assert results.empty is True


def test_get_index_temperatures_eeweather():
begin_timestamp = pd.Timestamp('2011-01-01 00:00:00')
periods = 8766
hourly_index = pd.date_range(begin_timestamp, periods=periods, freq="H")
hourly_index = pd.date_range(begin_timestamp, periods=periods, freq='H', tz='UTC')
results = get_indexed_temperatures_eeweather('720648', hourly_index)
assert results.shape == (8766,)
26 changes: 17 additions & 9 deletions thermostat/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,6 @@ def __pandas_warnings(pandas_version):
warnings.warn(
"WARNING: Pandas version 0.21.x has known issues and is not supported. "
"Please upgrade to the Pandas version 0.25.3.")
# Pandas 1.x causes issues. Need to warn about this at the moment.
if pd_major >= 1:
warnings.warn(
"WARNING: Pandas version 1.x has changed significantly, and causes "
"issues with this software. We are working on supporting Pandas 1.x in "
"a future release. Please downgrade to Pandas 0.25.3")

except Exception:
# If we can't figure out the version string then don't worry about it for now
Expand Down Expand Up @@ -655,7 +649,7 @@ def get_resistance_heat_utilization_bins(self, runtime_temp, bins, core_heating_

# Create the bins and group by them
runtime_temp['bins'] = pd.cut(runtime_temp['temperature'], bins)
runtime_rhu = runtime_temp.groupby('bins')['heat_runtime', 'aux_runtime', 'emg_runtime', 'total_minutes'].sum()
runtime_rhu = runtime_temp.groupby('bins')[['heat_runtime', 'aux_runtime', 'emg_runtime', 'total_minutes']].sum()

# Calculate the RHU based on the bins
runtime_rhu['rhu'] = (runtime_rhu['aux_runtime'] + runtime_rhu['emg_runtime']) / (runtime_rhu['heat_runtime'] + runtime_rhu['emg_runtime'])
Expand Down Expand Up @@ -863,7 +857,11 @@ def estimate_errors(tau_estimate):
mape = np.nanmean(np.absolute(errors / mean_daily_runtime))
mae = np.nanmean(np.absolute(errors))

return pd.Series(cdd, index=daily_index), tau_estimate, alpha_estimate, mse, rmse, cvrmse, mape, mae
demand = pd.Series(cdd, index=daily_index)
if demand.empty is True:
demand = np.nan

return demand, tau_estimate, alpha_estimate, mse, rmse, cvrmse, mape, mae

def get_heating_demand(self, core_heating_day_set):
"""
Expand Down Expand Up @@ -987,8 +985,12 @@ def estimate_errors(tau_estimate):
mape = np.nanmean(np.absolute(errors / mean_daily_runtime))
mae = np.nanmean(np.absolute(errors))

demand = pd.Series(hdd, index=daily_index)
if demand.empty is True:
demand = np.nan

return (
pd.Series(hdd, index=daily_index),
demand,
tau_estimate,
alpha_estimate,
mse,
Expand Down Expand Up @@ -1285,6 +1287,9 @@ def _calculate_cooling_epa_field_savings_metrics(
mae,
) = self.get_cooling_demand(core_cooling_day_set)

if demand.empty is True:
demand = np.nan

total_runtime_core_cooling = daily_runtime.sum()
n_days = core_cooling_day_set.daily.sum()
n_hours = core_cooling_day_set.hourly.sum()
Expand Down Expand Up @@ -1442,6 +1447,9 @@ def _calculate_heating_epa_field_savings_metrics(
mae,
) = self.get_heating_demand(core_heating_day_set)

if demand.empty is True:
demand = np.nan

total_runtime_core_heating = daily_runtime.sum()
n_days = core_heating_day_set.daily.sum()
n_hours = core_heating_day_set.hourly.sum()
Expand Down
2 changes: 1 addition & 1 deletion thermostat/eeweather_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def get_indexed_temperatures_eeweather(usaf_id, index):
"""

if index.shape == (0, 0) or index.shape == (0,):
return pd.Series([], index=index, dtype=float)
return pd.Series([], index=(), dtype=float)
years = sorted(index.groupby(index.year).keys())
start = pd.to_datetime(datetime(years[0], 1, 1), utc=True)
end = pd.to_datetime(datetime(years[-1], 12, 31, 23, 59), utc=True)
Expand Down
125 changes: 73 additions & 52 deletions thermostat/importers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
from thermostat.core import Thermostat
from thermostat.equipment_type import (
has_heating,
Expand Down Expand Up @@ -147,7 +148,8 @@ def normalize_utc_offset(utc_offset):
e))


def from_csv(metadata_filename, verbose=False, save_cache=False, shuffle=True, cache_path=None):
def from_csv(metadata_filename, verbose=False, save_cache=False, shuffle=True,
cache_path=None, log_error=True, log_error_filename='thermostat_import_errors.csv'):
"""
Creates Thermostat objects from data stored in CSV files.
Expand All @@ -162,7 +164,11 @@ def from_csv(metadata_filename, verbose=False, save_cache=False, shuffle=True, c
shuffle: boolean
Shuffles the thermostats to give them random ordering if desired (helps with caching).
cache_path: str
Directory path to save the cached data
Directory path to save the cached data.
log_error: boolean
Create a log file of thermostats that weren't imported and the reason they weren't imported.
log_error_filename: str
Name of the file to use for logging the thermostats that weren't imported.
Returns
-------
Expand All @@ -187,6 +193,7 @@ def from_csv(metadata_filename, verbose=False, save_cache=False, shuffle=True, c
)
metadata.fillna('', inplace=True)


# Shuffle the results to help alleviate cache issues
if shuffle:
logging.info("Metadata randomized to prevent collisions in cache.")
Expand All @@ -203,20 +210,27 @@ def from_csv(metadata_filename, verbose=False, save_cache=False, shuffle=True, c
p.close()
p.join()

# Bad thermostats return None so remove those.
results = [x for x in result_list if x is not None]

# Check for thermostats that were not loaded and log them
metadata_thermostat_ids = set(metadata.thermostat_id)
loaded_thermostat_ids = set([x.thermostat_id for x in results])
missing_thermostats = metadata_thermostat_ids.difference(loaded_thermostat_ids)
missing_thermostats_num = len(missing_thermostats)
if missing_thermostats_num > 0:
logging.warning("Unable to load {} thermostat records because of "
"errors. Please check the logs for the following thermostats:".format(
missing_thermostats_num))
for thermostat in missing_thermostats:
logging.warning(thermostat)
results = []
error_list = []

for result in result_list:
if result['thermostat'] is None:
for error in result['errors']:
logging.warning(result['thermostat_id'] + ': ' + error)
error_dict = {}
error_dict['thermostat_id'] = result['thermostat_id']
error_dict['error'] = error
error_list.append(error_dict)
else:
results.append(result['thermostat'])

if log_error and error_list:
fieldnames = ['thermostat_id', 'error']
with open(log_error_filename, 'w') as error_file:
writer = csv.DictWriter(error_file, fieldnames=fieldnames, dialect='excel')
writer.writeheader()
for thermostat_error in error_list:
writer.writerow(thermostat_error)

# Convert this to an iterator to maintain compatibility
return iter(results)
Expand All @@ -232,48 +246,54 @@ def _multiprocess_func(metadata, metadata_filename, verbose=False, save_cache=Fa

interval_data_filename = os.path.join(os.path.dirname(metadata_filename), row.interval_data_filename)

status_metadata = {
'thermostat_id': row.thermostat_id,
'errors': [],
'thermostat': None,
}
errors = []
thermostat = None

try:
thermostat = get_single_thermostat(
thermostat_id=row.thermostat_id,
zipcode=row.zipcode,
heat_type=row.heat_type,
heat_stage=row.heat_stage,
cool_type=row.cool_type,
cool_stage=row.cool_stage,
utc_offset=row.utc_offset,
interval_data_filename=interval_data_filename,
save_cache=save_cache,
cache_path=cache_path,
thermostat_id=row.thermostat_id,
zipcode=row.zipcode,
heat_type=row.heat_type,
heat_stage=row.heat_stage,
cool_type=row.cool_type,
cool_stage=row.cool_stage,
utc_offset=row.utc_offset,
interval_data_filename=interval_data_filename,
save_cache=save_cache,
cache_path=cache_path,
)
except ZCTAError as e:
# Could not locate a station for the thermostat. Warn and skip.
warnings.warn(
"Skipping import of thermostat (id={}) for which "
errors.append(
"Skipping import of thermostat because "
"a sufficient source of outdoor weather data could not"
"be located using the given ZIP code ({}). This is likely "
f"be located using the given ZIP code ({row.zipcode}). This is likely "
"due to the discrepancy between US Postal Service ZIP "
"codes (which do not always map well to locations) and "
"Census Bureau ZCTAs (which usually do). Please supply "
"a zipcode which corresponds to a US Census Bureau ZCTA."
"\nError Message: {}"
.format(row.thermostat_id, row.zipcode, e))
return
f"\nError Message: {e}"
)

except ISDDataNotAvailableError as e:
warnings.warn(
"Skipping import of thermostat(id={} because the NCDC "
"does not have data: {}"
.format(row.thermostat_id, e))
return
errors.append(
"Skipping import of thermostat because the NCDC "
f"does not have data: {e}"
)

except Exception as e:
warnings.warn(
"Skipping import of thermostat(id={}) because of "
"the following error: {}"
.format(row.thermostat_id, e))
return
errors.append(
f"Skipping import of thermostat because of "
f"the following error: {e}")

return thermostat
status_metadata['errors'] = errors
status_metadata['thermostat'] = thermostat
return status_metadata


def get_single_thermostat(thermostat_id, zipcode,
Expand Down Expand Up @@ -367,17 +387,18 @@ def get_single_thermostat(thermostat_id, zipcode,
enough_cool_runtime = True
enough_heat_runtime = True

# Currently checks hourly runtime, not daily
if cool_runtime is not None:
enough_cool_runtime = _enough_daily_runtume(cool_runtime)
enough_cool_runtime = _enough_runtume(cool_runtime)
if heat_runtime is not None:
enough_heat_runtime = _enough_daily_runtume(heat_runtime)
enough_heat_runtime = _enough_runtume(heat_runtime)

if not(enough_cool_runtime and enough_heat_runtime):
message = "Not enough runtime for thermostat %s\n" % thermostat_id
message = "Not enough runtime for thermostat "
if not enough_heat_runtime:
message += "Heat runtime has over 5% missing data.\n"
message += "(Heat runtime has over 5% missing data) "
if not enough_cool_runtime:
message += "Cool runtime has over 5% missing data.\n"
message += "(Cool runtime has over 5% missing data) "
raise ValueError(message)

# create thermostat instance
Expand Down Expand Up @@ -481,10 +502,10 @@ def _create_series(df, index):
return series


def _enough_daily_runtume(series):
def _enough_runtume(series):
if series is None:
return False

num_days = len(series)
num_dropped_days = len(series.dropna())
return (num_dropped_days / num_days) > 0.95
num_elements = len(series)
num_dropped_elements = len(series.dropna())
return (num_dropped_elements / num_elements) > 0.95
2 changes: 1 addition & 1 deletion thermostat/stations.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def get_closest_station_by_zipcode(zipcode):

if station is None:
zipcode_mapping = zipcodes.matching(zipcode)
warn("No station found for ZCTA / ZIP %s (%s, %s)." % (
warnings.warn("No station found for ZCTA / ZIP %s (%s, %s)." % (
zipcode,
zipcode_mapping[0].get('city'),
zipcode_mapping[0].get('state')
Expand Down
Loading

0 comments on commit dda7674

Please sign in to comment.