From f78ce4e60aa4db678149705737cb104e7d83e387 Mon Sep 17 00:00:00 2001 From: John Calderon <81483067+johncalesp@users.noreply.github.com> Date: Tue, 9 May 2023 11:29:35 -0400 Subject: [PATCH] added error messages and memory reset (#53) * added error messages and memory reset * renamed utils --- deepview_profile/analysis/runner.py | 12 +- deepview_profile/analysis/session.py | 269 +++++++++++---------- deepview_profile/profiler/iteration.py | 4 +- deepview_profile/protocol_gen/innpv_pb2.py | 110 +++++---- deepview_profile/utils.py | 10 + protocol/innpv.proto | 2 + 6 files changed, 229 insertions(+), 178 deletions(-) create mode 100644 deepview_profile/utils.py diff --git a/deepview_profile/analysis/runner.py b/deepview_profile/analysis/runner.py index d73101c..bd5aac8 100644 --- a/deepview_profile/analysis/runner.py +++ b/deepview_profile/analysis/runner.py @@ -5,23 +5,23 @@ import torch from deepview_profile.analysis.session import AnalysisSession from deepview_profile.nvml import NVML - +from deepview_profile.utils import release_memory def analyze_project(project_root, entry_point, nvml): - torch.cuda.empty_cache() + release_memory() session = AnalysisSession.new_from(project_root, entry_point) yield session.measure_breakdown(nvml) - torch.cuda.empty_cache() + release_memory() yield session.measure_throughput() - torch.cuda.empty_cache() + release_memory() print("analyze_project: running deepview_predict()") yield session.habitat_predict() - torch.cuda.empty_cache() + release_memory() print("analyze_project: running energy_compute()") yield session.energy_compute() - torch.cuda.empty_cache() + release_memory() def main(): diff --git a/deepview_profile/analysis/session.py b/deepview_profile/analysis/session.py index cc12088..3e92150 100644 --- a/deepview_profile/analysis/session.py +++ b/deepview_profile/analysis/session.py @@ -150,47 +150,53 @@ def energy_compute(self) -> pm.EnergyResponse: for _ in range(iterations): iteration(*inputs) energy_measurer.end_measurement() - except PermissionError as err: - # Remind user to set their CPU permissions - print(err) + resp.total_consumption = energy_measurer.total_energy()/float(iterations) + resp.batch_size = self._batch_size + + components = [] + components_joules = [] + + if energy_measurer.cpu_energy() is not None: + cpu_component = pm.EnergyConsumptionComponent() + cpu_component.component_type = pm.ENERGY_CPU_DRAM + cpu_component.consumption_joules = energy_measurer.cpu_energy()/float(iterations) + components.append(cpu_component) + components_joules.append(cpu_component.consumption_joules) + else: + cpu_component = pm.EnergyConsumptionComponent() + cpu_component.component_type = pm.ENERGY_CPU_DRAM + cpu_component.consumption_joules = 0.0 + components.append(cpu_component) + components_joules.append(cpu_component.consumption_joules) - resp.total_consumption = energy_measurer.total_energy()/float(iterations) - resp.batch_size = self._batch_size - - components = [] - components_joules = [] - - if energy_measurer.cpu_energy() is not None: - cpu_component = pm.EnergyConsumptionComponent() - cpu_component.component_type = pm.ENERGY_CPU_DRAM - cpu_component.consumption_joules = energy_measurer.cpu_energy()/float(iterations) - components.append(cpu_component) - components_joules.append(cpu_component.consumption_joules) - else: - cpu_component = pm.EnergyConsumptionComponent() - cpu_component.component_type = pm.ENERGY_CPU_DRAM - cpu_component.consumption_joules = 0.0 - components.append(cpu_component) - components_joules.append(cpu_component.consumption_joules) + gpu_component = pm.EnergyConsumptionComponent() + gpu_component.component_type = pm.ENERGY_NVIDIA + gpu_component.consumption_joules = energy_measurer.gpu_energy()/float(iterations) + components.append(gpu_component) + components_joules.append(gpu_component.consumption_joules) + + resp.components.extend(components) - gpu_component = pm.EnergyConsumptionComponent() - gpu_component.component_type = pm.ENERGY_NVIDIA - gpu_component.consumption_joules = energy_measurer.gpu_energy()/float(iterations) - components.append(gpu_component) - components_joules.append(gpu_component.consumption_joules) + # get last 10 runs if they exist + path_to_entry_point = os.path.join(self._project_root, self._entry_point) + past_runs = self._energy_table_interface.get_latest_n_entries_of_entry_point(10, path_to_entry_point) + resp.past_measurements.extend(_convert_to_energy_responses(past_runs)) + + # add current run to database + current_entry = [path_to_entry_point] + components_joules + current_entry.append(self._batch_size) + self._energy_table_interface.add_entry(current_entry) + except AnalysisError as ex: + message = str(ex) + logger.error(message) + resp.analysis_error.error_message = message + except: + logger.error("There was an error obtaining energy measurements") + resp.analysis_error.error_message = "There was an error obtaining energy measurements" + finally: + return resp + - resp.components.extend(components) - - # get last 10 runs if they exist - path_to_entry_point = os.path.join(self._project_root, self._entry_point) - past_runs = self._energy_table_interface.get_latest_n_entries_of_entry_point(10, path_to_entry_point) - resp.past_measurements.extend(_convert_to_energy_responses(past_runs)) - - # add current run to database - current_entry = [path_to_entry_point] + components_joules - current_entry.append(self._batch_size) - self._energy_table_interface.add_entry(current_entry) - return resp def habitat_compute_threshold(self, runnable, context): tracker = habitat.OperationTracker(context.origin_device) @@ -210,102 +216,115 @@ def habitat_compute_threshold(self, runnable, context): def habitat_predict(self): - resp = pm.HabitatResponse() + resp = pm.HabitatResponse() if not habitat_found: logger.debug("Skipping deepview predictions, returning empty response.") return resp - print("deepview_predict: begin") - DEVICES = [ - habitat.Device.P100, - habitat.Device.P4000, - habitat.Device.RTX2070, - habitat.Device.RTX2080Ti, - habitat.Device.T4, - habitat.Device.V100, - habitat.Device.A100, - habitat.Device.RTX3090, - habitat.Device.A40, - habitat.Device.A4000, - habitat.Device.RTX4000 - ] - - # Detect source GPU - pynvml.nvmlInit() - if pynvml.nvmlDeviceGetCount() == 0: - raise Exception("NVML failed to find a GPU. PLease ensure that you have a NVIDIA GPU installed and that the drivers are functioning correctly.") - - # TODO: Consider profiling on not only the first detected GPU - nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0) - source_device_name = pynvml.nvmlDeviceGetName(nvml_handle).decode("utf-8") - split_source_device_name = re.split(r"-|\s|_|\\|/", source_device_name) - source_device = None if logging.root.level > logging.DEBUG else habitat.Device.T4 - for device in DEVICES: - if device.name in split_source_device_name: - source_device = device - pynvml.nvmlShutdown() - if not source_device: - logger.debug("Skipping DeepView predictions, source not in list of supported GPUs.") - src = pm.HabitatDevicePrediction() - src.device_name = 'unavailable' - src.runtime_ms = -1 - resp.predictions.append(src) - return resp - - print("deepview_predict: detected source device", source_device.name) + try: + print("deepview_predict: begin") + DEVICES = [ + habitat.Device.P100, + habitat.Device.P4000, + habitat.Device.RTX2070, + habitat.Device.RTX2080Ti, + habitat.Device.T4, + habitat.Device.V100, + habitat.Device.A100, + habitat.Device.RTX3090, + habitat.Device.A40, + habitat.Device.A4000, + habitat.Device.RTX4000 + ] + + # Detect source GPU + pynvml.nvmlInit() + if pynvml.nvmlDeviceGetCount() == 0: + raise Exception("NVML failed to find a GPU. PLease ensure that you have a NVIDIA GPU installed and that the drivers are functioning correctly.") + + # TODO: Consider profiling on not only the first detected GPU + nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0) + source_device_name = pynvml.nvmlDeviceGetName(nvml_handle).decode("utf-8") + split_source_device_name = re.split(r"-|\s|_|\\|/", source_device_name) + source_device = None if logging.root.level > logging.DEBUG else habitat.Device.T4 + for device in DEVICES: + if device.name in split_source_device_name: + source_device = device + pynvml.nvmlShutdown() + if not source_device: + logger.debug("Skipping DeepView predictions, source not in list of supported GPUs.") + src = pm.HabitatDevicePrediction() + src.device_name = 'unavailable' + src.runtime_ms = -1 + resp.predictions.append(src) + return resp + + print("deepview_predict: detected source device", source_device.name) + + # get model + model = self._model_provider() + inputs = self._input_provider() + iteration = self._iteration_provider(model) - # get model - model = self._model_provider() - inputs = self._input_provider() - iteration = self._iteration_provider(model) + def runnable(): + iteration(*inputs) - def runnable(): - iteration(*inputs) + profiler = RunTimeProfiler() - profiler = RunTimeProfiler() + context = Context( + origin_device=source_device, + profiler=profiler, + percentile=99.5 + ) - context = Context( - origin_device=source_device, - profiler=profiler, - percentile=99.5 - ) + threshold = self.habitat_compute_threshold(runnable, context) + + tracker = habitat.OperationTracker( + device=context.origin_device, + metrics=[ + habitat.Metric.SinglePrecisionFLOPEfficiency, + habitat.Metric.DRAMReadBytes, + habitat.Metric.DRAMWriteBytes, + ], + metrics_threshold_ms=threshold, + ) - threshold = self.habitat_compute_threshold(runnable, context) - tracker = habitat.OperationTracker( - device=context.origin_device, - metrics=[ - habitat.Metric.SinglePrecisionFLOPEfficiency, - habitat.Metric.DRAMReadBytes, - habitat.Metric.DRAMWriteBytes, - ], - metrics_threshold_ms=threshold, - ) - - with tracker.track(): - iteration(*inputs) - - print("deepview_predict: tracing on origin device") - trace = tracker.get_tracked_trace() - - src = pm.HabitatDevicePrediction() - src.device_name = 'source' - src.runtime_ms = trace.run_time_ms - resp.predictions.append(src) - - for device in DEVICES: - print("deepview_predict: predicting for", device) - predicted_trace = trace.to_device(device) - - pred = pm.HabitatDevicePrediction() - pred.device_name = device.name - pred.runtime_ms = predicted_trace.run_time_ms - resp.predictions.append(pred) + with tracker.track(): + iteration(*inputs) + + print("deepview_predict: tracing on origin device") + trace = tracker.get_tracked_trace() - print(f"returning {len(resp.predictions)} predictions.") + src = pm.HabitatDevicePrediction() + src.device_name = 'source' + src.runtime_ms = trace.run_time_ms + resp.predictions.append(src) - return resp + for device in DEVICES: + print("deepview_predict: predicting for", device) + predicted_trace = trace.to_device(device) + + pred = pm.HabitatDevicePrediction() + pred.device_name = device.name + pred.runtime_ms = predicted_trace.run_time_ms + resp.predictions.append(pred) + + print(f"returning {len(resp.predictions)} predictions.") + except AnalysisError as ex: + message = str(ex) + logger.error(message) + resp.analysis_error.error_message = message + except: + logger.error("There was an error running DeepView Predict") + resp.analysis_error.error_message = "There was an error running DeepView Predict" + finally: + return resp + + + + def measure_breakdown(self, nvml): # 1. Measure the breakdown entries @@ -361,6 +380,7 @@ def measure_throughput(self): ) # 2. Begin filling in the throughput response + logger.debug("sampling results", samples) measured_throughput = ( samples[0].batch_size / samples[0].run_time_ms * 1000 ) @@ -405,18 +425,19 @@ def measure_throughput(self): throughput.peak_usage_bytes.bias = peak_usage_model[1] predicted_max_throughput = 1000.0 / run_time_model[0] - + # Our prediction can be inaccurate due to sampling error or incorrect # assumptions. In these cases, we ignore our prediction. At the very # minimum, a good linear model has a positive slope and bias. - if (run_time_model[0] < 1e-3 or run_time_model[1] < 1e-3 or + #if (run_time_model[0] < 1e-3 or run_time_model[1] < 1e-3 or + if (run_time_model[0] < 1e-3 or measured_throughput > predicted_max_throughput): return throughput throughput.predicted_max_samples_per_second = predicted_max_throughput throughput.run_time_ms.slope = run_time_model[0] throughput.run_time_ms.bias = run_time_model[1] - + return throughput def measure_peak_usage_bytes(self): diff --git a/deepview_profile/profiler/iteration.py b/deepview_profile/profiler/iteration.py index 824b453..831cc11 100644 --- a/deepview_profile/profiler/iteration.py +++ b/deepview_profile/profiler/iteration.py @@ -1,10 +1,10 @@ import collections import logging - import torch from deepview_profile.exceptions import AnalysisError from deepview_profile.user_code_utils import user_code_environment +from deepview_profile.utils import release_memory logger = logging.getLogger(__name__) @@ -49,6 +49,7 @@ def measure_run_time_ms(self, batch_size, initial_repetitions=None): NOTE: This method will raise a RuntimeError if there is not enough GPU memory to run the iteration. """ + with user_code_environment( self._path_to_entry_point_dir, self._project_root): inputs = self._input_provider(batch_size=batch_size) @@ -111,6 +112,7 @@ def measure_run_time_ms_catch_oom( self, batch_size, initial_repetitions=None): # This function is useful when we want to explicitly handle OOM errors # without aborting the profiling. + release_memory() try: return ( None, diff --git a/deepview_profile/protocol_gen/innpv_pb2.py b/deepview_profile/protocol_gen/innpv_pb2.py index 967c333..e2b266b 100644 --- a/deepview_profile/protocol_gen/innpv_pb2.py +++ b/deepview_profile/protocol_gen/innpv_pb2.py @@ -20,7 +20,7 @@ syntax='proto3', serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x0binnpv.proto\x12\x0einnpv.protocol\"\xcf\x01\n\nFromClient\x12\x17\n\x0fsequence_number\x18\x01 \x01(\r\x12\x37\n\ninitialize\x18\x02 \x01(\x0b\x32!.innpv.protocol.InitializeRequestH\x00\x12\x33\n\x08\x61nalysis\x18\x03 \x01(\x0b\x32\x1f.innpv.protocol.AnalysisRequestH\x00\x12/\n\x07generic\x18\x04 \x01(\x0b\x32\x1c.innpv.protocol.GenericEventH\x00\x42\t\n\x07payload\">\n\x0cGenericEvent\x12\x12\n\nevent_type\x18\x01 \x01(\t\x12\x1a\n\x12optional_arguments\x18\x02 \x01(\t\"X\n\x11InitializeRequest\x12\x18\n\x10protocol_version\x18\x01 \x01(\r\x12\x14\n\x0cproject_root\x18\x02 \x01(\t\x12\x13\n\x0b\x65ntry_point\x18\x03 \x01(\t\"(\n\x0f\x41nalysisRequest\x12\x15\n\rmock_response\x18\x01 \x01(\x08\"\xcf\x03\n\nFromServer\x12\x17\n\x0fsequence_number\x18\x01 \x01(\r\x12.\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x1d.innpv.protocol.ProtocolErrorH\x00\x12\x38\n\ninitialize\x18\x03 \x01(\x0b\x32\".innpv.protocol.InitializeResponseH\x00\x12\x37\n\x0e\x61nalysis_error\x18\x05 \x01(\x0b\x32\x1d.innpv.protocol.AnalysisErrorH\x00\x12\x38\n\nthroughput\x18\x06 \x01(\x0b\x32\".innpv.protocol.ThroughputResponseH\x00\x12\x36\n\tbreakdown\x18\x08 \x01(\x0b\x32!.innpv.protocol.BreakdownResponseH\x00\x12\x32\n\x07habitat\x18\t \x01(\x0b\x32\x1f.innpv.protocol.HabitatResponseH\x00\x12\x30\n\x06\x65nergy\x18\n \x01(\x0b\x32\x1e.innpv.protocol.EnergyResponseH\x00\x42\t\n\x07payloadJ\x04\x08\x04\x10\x05J\x04\x08\x07\x10\x08R\x0cmemory_usageR\x08run_time\"B\n\x17HabitatDevicePrediction\x12\x13\n\x0b\x64\x65vice_name\x18\x01 \x01(\t\x12\x12\n\nruntime_ms\x18\x02 \x01(\x02\"O\n\x0fHabitatResponse\x12<\n\x0bpredictions\x18\x01 \x03(\x0b\x32\'.innpv.protocol.HabitatDevicePrediction\"\xba\x01\n\x0e\x45nergyResponse\x12\x19\n\x11total_consumption\x18\x01 \x01(\x02\x12>\n\ncomponents\x18\x02 \x03(\x0b\x32*.innpv.protocol.EnergyConsumptionComponent\x12\x12\n\nbatch_size\x18\x03 \x01(\x05\x12\x39\n\x11past_measurements\x18\x04 \x03(\x0b\x32\x1e.innpv.protocol.EnergyResponse\"\x80\x01\n\x1a\x45nergyConsumptionComponent\x12\x46\n\x0e\x63omponent_type\x18\x01 \x01(\x0e\x32..innpv.protocol.EnergyConsumptionComponentType\x12\x1a\n\x12\x63onsumption_joules\x18\x02 \x01(\x02\"\x8c\x01\n\x12InitializeResponse\x12\x1b\n\x13server_project_root\x18\x01 \x01(\t\x12)\n\x0b\x65ntry_point\x18\x02 \x01(\x0b\x32\x14.innpv.protocol.Path\x12.\n\x08hardware\x18\x03 \x01(\x0b\x32\x1c.innpv.protocol.HardwareInfo\"[\n\rAnalysisError\x12\x15\n\rerror_message\x18\x01 \x01(\t\x12\x33\n\x0c\x66ile_context\x18\x02 \x01(\x0b\x32\x1d.innpv.protocol.FileReference\"\xa1\x02\n\x12ThroughputResponse\x12\x1a\n\x12samples_per_second\x18\x01 \x01(\x02\x12(\n predicted_max_samples_per_second\x18\x02 \x01(\x02\x12\x30\n\x0brun_time_ms\x18\x03 \x01(\x0b\x32\x1b.innpv.protocol.LinearModel\x12\x35\n\x10peak_usage_bytes\x18\x04 \x01(\x0b\x32\x1b.innpv.protocol.LinearModel\x12\x39\n\x12\x62\x61tch_size_context\x18\x05 \x01(\x0b\x32\x1d.innpv.protocol.FileReference\x12!\n\x19\x63\x61n_manipulate_batch_size\x18\x06 \x01(\x08\"\xea\x01\n\x11\x42reakdownResponse\x12\x18\n\x10peak_usage_bytes\x18\x01 \x01(\x04\x12\x1d\n\x15memory_capacity_bytes\x18\x02 \x01(\x04\x12\x1d\n\x15iteration_run_time_ms\x18\x03 \x01(\x02\x12\x12\n\nbatch_size\x18\x06 \x01(\r\x12\x35\n\x0eoperation_tree\x18\x04 \x03(\x0b\x32\x1d.innpv.protocol.BreakdownNode\x12\x32\n\x0bweight_tree\x18\x05 \x03(\x0b\x32\x1d.innpv.protocol.BreakdownNode\"\xca\x01\n\rProtocolError\x12;\n\nerror_code\x18\x01 \x01(\x0e\x32\'.innpv.protocol.ProtocolError.ErrorCode\"|\n\tErrorCode\x12\x0b\n\x07UNKNOWN\x10\x00\x12 \n\x1cUNSUPPORTED_PROTOCOL_VERSION\x10\x01\x12\x1c\n\x18UNINITIALIZED_CONNECTION\x10\x02\x12\"\n\x1e\x41LREADY_INITIALIZED_CONNECTION\x10\x03\"\x1a\n\x04Path\x12\x12\n\ncomponents\x18\x01 \x03(\t\"M\n\rFileReference\x12\'\n\tfile_path\x18\x01 \x01(\x0b\x32\x14.innpv.protocol.Path\x12\x13\n\x0bline_number\x18\x02 \x01(\r\"\xce\x01\n\rBreakdownNode\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x14\n\x0cnum_children\x18\x02 \x01(\r\x12/\n\x08\x63ontexts\x18\x03 \x03(\x0b\x32\x1d.innpv.protocol.FileReference\x12\x32\n\toperation\x18\x04 \x01(\x0b\x32\x1d.innpv.protocol.OperationDataH\x00\x12,\n\x06weight\x18\x05 \x01(\x0b\x32\x1a.innpv.protocol.WeightDataH\x00\x42\x06\n\x04\x64\x61ta\"{\n\x0b\x43ontextInfo\x12.\n\x07\x63ontext\x18\x01 \x01(\x0b\x32\x1d.innpv.protocol.FileReference\x12\x13\n\x0brun_time_ms\x18\x02 \x01(\x02\x12\x12\n\nsize_bytes\x18\x03 \x01(\x04\x12\x13\n\x0binvocations\x18\x04 \x01(\r\"\x83\x01\n\rOperationData\x12\x12\n\nforward_ms\x18\x01 \x01(\x02\x12\x13\n\x0b\x62\x61\x63kward_ms\x18\x02 \x01(\x02\x12\x12\n\nsize_bytes\x18\x03 \x01(\x04\x12\x35\n\x10\x63ontext_info_map\x18\x04 \x03(\x0b\x32\x1b.innpv.protocol.ContextInfo\"9\n\nWeightData\x12\x12\n\nsize_bytes\x18\x01 \x01(\x04\x12\x17\n\x0fgrad_size_bytes\x18\x02 \x01(\x04\"*\n\x0bLinearModel\x12\r\n\x05slope\x18\x01 \x01(\x01\x12\x0c\n\x04\x62ias\x18\x02 \x01(\x01\":\n\x0cHardwareInfo\x12\x10\n\x08hostname\x18\x01 \x01(\t\x12\n\n\x02os\x18\x02 \x01(\t\x12\x0c\n\x04gpus\x18\x03 \x03(\t\"\x1b\n\x13MemoryUsageResponseJ\x04\x08\x01\x10\x65\"\x17\n\x0fRunTimeResponseJ\x04\x08\x01\x10\x65\"\x17\n\x0f\x41\x63tivationEntryJ\x04\x08\x01\x10\x65\"\x13\n\x0bWeightEntryJ\x04\x08\x01\x10\x65\"\x14\n\x0cRunTimeEntryJ\x04\x08\x01\x10\x65*`\n\x1e\x45nergyConsumptionComponentType\x12\x16\n\x12\x45NERGY_UNSPECIFIED\x10\x00\x12\x13\n\x0f\x45NERGY_CPU_DRAM\x10\x01\x12\x11\n\rENERGY_NVIDIA\x10\x02\x62\x06proto3' + serialized_pb=b'\n\x0binnpv.proto\x12\x0einnpv.protocol\"\xcf\x01\n\nFromClient\x12\x17\n\x0fsequence_number\x18\x01 \x01(\r\x12\x37\n\ninitialize\x18\x02 \x01(\x0b\x32!.innpv.protocol.InitializeRequestH\x00\x12\x33\n\x08\x61nalysis\x18\x03 \x01(\x0b\x32\x1f.innpv.protocol.AnalysisRequestH\x00\x12/\n\x07generic\x18\x04 \x01(\x0b\x32\x1c.innpv.protocol.GenericEventH\x00\x42\t\n\x07payload\">\n\x0cGenericEvent\x12\x12\n\nevent_type\x18\x01 \x01(\t\x12\x1a\n\x12optional_arguments\x18\x02 \x01(\t\"X\n\x11InitializeRequest\x12\x18\n\x10protocol_version\x18\x01 \x01(\r\x12\x14\n\x0cproject_root\x18\x02 \x01(\t\x12\x13\n\x0b\x65ntry_point\x18\x03 \x01(\t\"(\n\x0f\x41nalysisRequest\x12\x15\n\rmock_response\x18\x01 \x01(\x08\"\xcf\x03\n\nFromServer\x12\x17\n\x0fsequence_number\x18\x01 \x01(\r\x12.\n\x05\x65rror\x18\x02 \x01(\x0b\x32\x1d.innpv.protocol.ProtocolErrorH\x00\x12\x38\n\ninitialize\x18\x03 \x01(\x0b\x32\".innpv.protocol.InitializeResponseH\x00\x12\x37\n\x0e\x61nalysis_error\x18\x05 \x01(\x0b\x32\x1d.innpv.protocol.AnalysisErrorH\x00\x12\x38\n\nthroughput\x18\x06 \x01(\x0b\x32\".innpv.protocol.ThroughputResponseH\x00\x12\x36\n\tbreakdown\x18\x08 \x01(\x0b\x32!.innpv.protocol.BreakdownResponseH\x00\x12\x32\n\x07habitat\x18\t \x01(\x0b\x32\x1f.innpv.protocol.HabitatResponseH\x00\x12\x30\n\x06\x65nergy\x18\n \x01(\x0b\x32\x1e.innpv.protocol.EnergyResponseH\x00\x42\t\n\x07payloadJ\x04\x08\x04\x10\x05J\x04\x08\x07\x10\x08R\x0cmemory_usageR\x08run_time\"B\n\x17HabitatDevicePrediction\x12\x13\n\x0b\x64\x65vice_name\x18\x01 \x01(\t\x12\x12\n\nruntime_ms\x18\x02 \x01(\x02\"\x86\x01\n\x0fHabitatResponse\x12<\n\x0bpredictions\x18\x01 \x03(\x0b\x32\'.innpv.protocol.HabitatDevicePrediction\x12\x35\n\x0e\x61nalysis_error\x18\x02 \x01(\x0b\x32\x1d.innpv.protocol.AnalysisError\"\xf1\x01\n\x0e\x45nergyResponse\x12\x19\n\x11total_consumption\x18\x01 \x01(\x02\x12>\n\ncomponents\x18\x02 \x03(\x0b\x32*.innpv.protocol.EnergyConsumptionComponent\x12\x12\n\nbatch_size\x18\x03 \x01(\x05\x12\x39\n\x11past_measurements\x18\x04 \x03(\x0b\x32\x1e.innpv.protocol.EnergyResponse\x12\x35\n\x0e\x61nalysis_error\x18\x05 \x01(\x0b\x32\x1d.innpv.protocol.AnalysisError\"\x80\x01\n\x1a\x45nergyConsumptionComponent\x12\x46\n\x0e\x63omponent_type\x18\x01 \x01(\x0e\x32..innpv.protocol.EnergyConsumptionComponentType\x12\x1a\n\x12\x63onsumption_joules\x18\x02 \x01(\x02\"\x8c\x01\n\x12InitializeResponse\x12\x1b\n\x13server_project_root\x18\x01 \x01(\t\x12)\n\x0b\x65ntry_point\x18\x02 \x01(\x0b\x32\x14.innpv.protocol.Path\x12.\n\x08hardware\x18\x03 \x01(\x0b\x32\x1c.innpv.protocol.HardwareInfo\"[\n\rAnalysisError\x12\x15\n\rerror_message\x18\x01 \x01(\t\x12\x33\n\x0c\x66ile_context\x18\x02 \x01(\x0b\x32\x1d.innpv.protocol.FileReference\"\xa1\x02\n\x12ThroughputResponse\x12\x1a\n\x12samples_per_second\x18\x01 \x01(\x02\x12(\n predicted_max_samples_per_second\x18\x02 \x01(\x02\x12\x30\n\x0brun_time_ms\x18\x03 \x01(\x0b\x32\x1b.innpv.protocol.LinearModel\x12\x35\n\x10peak_usage_bytes\x18\x04 \x01(\x0b\x32\x1b.innpv.protocol.LinearModel\x12\x39\n\x12\x62\x61tch_size_context\x18\x05 \x01(\x0b\x32\x1d.innpv.protocol.FileReference\x12!\n\x19\x63\x61n_manipulate_batch_size\x18\x06 \x01(\x08\"\xea\x01\n\x11\x42reakdownResponse\x12\x18\n\x10peak_usage_bytes\x18\x01 \x01(\x04\x12\x1d\n\x15memory_capacity_bytes\x18\x02 \x01(\x04\x12\x1d\n\x15iteration_run_time_ms\x18\x03 \x01(\x02\x12\x12\n\nbatch_size\x18\x06 \x01(\r\x12\x35\n\x0eoperation_tree\x18\x04 \x03(\x0b\x32\x1d.innpv.protocol.BreakdownNode\x12\x32\n\x0bweight_tree\x18\x05 \x03(\x0b\x32\x1d.innpv.protocol.BreakdownNode\"\xca\x01\n\rProtocolError\x12;\n\nerror_code\x18\x01 \x01(\x0e\x32\'.innpv.protocol.ProtocolError.ErrorCode\"|\n\tErrorCode\x12\x0b\n\x07UNKNOWN\x10\x00\x12 \n\x1cUNSUPPORTED_PROTOCOL_VERSION\x10\x01\x12\x1c\n\x18UNINITIALIZED_CONNECTION\x10\x02\x12\"\n\x1e\x41LREADY_INITIALIZED_CONNECTION\x10\x03\"\x1a\n\x04Path\x12\x12\n\ncomponents\x18\x01 \x03(\t\"M\n\rFileReference\x12\'\n\tfile_path\x18\x01 \x01(\x0b\x32\x14.innpv.protocol.Path\x12\x13\n\x0bline_number\x18\x02 \x01(\r\"\xce\x01\n\rBreakdownNode\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x14\n\x0cnum_children\x18\x02 \x01(\r\x12/\n\x08\x63ontexts\x18\x03 \x03(\x0b\x32\x1d.innpv.protocol.FileReference\x12\x32\n\toperation\x18\x04 \x01(\x0b\x32\x1d.innpv.protocol.OperationDataH\x00\x12,\n\x06weight\x18\x05 \x01(\x0b\x32\x1a.innpv.protocol.WeightDataH\x00\x42\x06\n\x04\x64\x61ta\"{\n\x0b\x43ontextInfo\x12.\n\x07\x63ontext\x18\x01 \x01(\x0b\x32\x1d.innpv.protocol.FileReference\x12\x13\n\x0brun_time_ms\x18\x02 \x01(\x02\x12\x12\n\nsize_bytes\x18\x03 \x01(\x04\x12\x13\n\x0binvocations\x18\x04 \x01(\r\"\x83\x01\n\rOperationData\x12\x12\n\nforward_ms\x18\x01 \x01(\x02\x12\x13\n\x0b\x62\x61\x63kward_ms\x18\x02 \x01(\x02\x12\x12\n\nsize_bytes\x18\x03 \x01(\x04\x12\x35\n\x10\x63ontext_info_map\x18\x04 \x03(\x0b\x32\x1b.innpv.protocol.ContextInfo\"9\n\nWeightData\x12\x12\n\nsize_bytes\x18\x01 \x01(\x04\x12\x17\n\x0fgrad_size_bytes\x18\x02 \x01(\x04\"*\n\x0bLinearModel\x12\r\n\x05slope\x18\x01 \x01(\x01\x12\x0c\n\x04\x62ias\x18\x02 \x01(\x01\":\n\x0cHardwareInfo\x12\x10\n\x08hostname\x18\x01 \x01(\t\x12\n\n\x02os\x18\x02 \x01(\t\x12\x0c\n\x04gpus\x18\x03 \x03(\t\"\x1b\n\x13MemoryUsageResponseJ\x04\x08\x01\x10\x65\"\x17\n\x0fRunTimeResponseJ\x04\x08\x01\x10\x65\"\x17\n\x0f\x41\x63tivationEntryJ\x04\x08\x01\x10\x65\"\x13\n\x0bWeightEntryJ\x04\x08\x01\x10\x65\"\x14\n\x0cRunTimeEntryJ\x04\x08\x01\x10\x65*`\n\x1e\x45nergyConsumptionComponentType\x12\x16\n\x12\x45NERGY_UNSPECIFIED\x10\x00\x12\x13\n\x0f\x45NERGY_CPU_DRAM\x10\x01\x12\x11\n\rENERGY_NVIDIA\x10\x02\x62\x06proto3' ) _ENERGYCONSUMPTIONCOMPONENTTYPE = _descriptor.EnumDescriptor( @@ -48,8 +48,8 @@ ], containing_type=None, serialized_options=None, - serialized_start=3202, - serialized_end=3298, + serialized_start=3313, + serialized_end=3409, ) _sym_db.RegisterEnumDescriptor(_ENERGYCONSUMPTIONCOMPONENTTYPE) @@ -89,8 +89,8 @@ ], containing_type=None, serialized_options=None, - serialized_start=2216, - serialized_end=2340, + serialized_start=2327, + serialized_end=2451, ) _sym_db.RegisterEnumDescriptor(_PROTOCOLERROR_ERRORCODE) @@ -410,6 +410,13 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='analysis_error', full_name='innpv.protocol.HabitatResponse.analysis_error', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -422,8 +429,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=971, - serialized_end=1050, + serialized_start=972, + serialized_end=1106, ) @@ -463,6 +470,13 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='analysis_error', full_name='innpv.protocol.EnergyResponse.analysis_error', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -475,8 +489,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1053, - serialized_end=1239, + serialized_start=1109, + serialized_end=1350, ) @@ -514,8 +528,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1242, - serialized_end=1370, + serialized_start=1353, + serialized_end=1481, ) @@ -560,8 +574,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1373, - serialized_end=1513, + serialized_start=1484, + serialized_end=1624, ) @@ -599,8 +613,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1515, - serialized_end=1606, + serialized_start=1626, + serialized_end=1717, ) @@ -666,8 +680,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1609, - serialized_end=1898, + serialized_start=1720, + serialized_end=2009, ) @@ -733,8 +747,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1901, - serialized_end=2135, + serialized_start=2012, + serialized_end=2246, ) @@ -766,8 +780,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=2138, - serialized_end=2340, + serialized_start=2249, + serialized_end=2451, ) @@ -798,8 +812,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=2342, - serialized_end=2368, + serialized_start=2453, + serialized_end=2479, ) @@ -837,8 +851,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=2370, - serialized_end=2447, + serialized_start=2481, + serialized_end=2558, ) @@ -902,8 +916,8 @@ create_key=_descriptor._internal_create_key, fields=[]), ], - serialized_start=2450, - serialized_end=2656, + serialized_start=2561, + serialized_end=2767, ) @@ -955,8 +969,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=2658, - serialized_end=2781, + serialized_start=2769, + serialized_end=2892, ) @@ -1008,8 +1022,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=2784, - serialized_end=2915, + serialized_start=2895, + serialized_end=3026, ) @@ -1047,8 +1061,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=2917, - serialized_end=2974, + serialized_start=3028, + serialized_end=3085, ) @@ -1086,8 +1100,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=2976, - serialized_end=3018, + serialized_start=3087, + serialized_end=3129, ) @@ -1132,8 +1146,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=3020, - serialized_end=3078, + serialized_start=3131, + serialized_end=3189, ) @@ -1157,8 +1171,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=3080, - serialized_end=3107, + serialized_start=3191, + serialized_end=3218, ) @@ -1182,8 +1196,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=3109, - serialized_end=3132, + serialized_start=3220, + serialized_end=3243, ) @@ -1207,8 +1221,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=3134, - serialized_end=3157, + serialized_start=3245, + serialized_end=3268, ) @@ -1232,8 +1246,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=3159, - serialized_end=3178, + serialized_start=3270, + serialized_end=3289, ) @@ -1257,8 +1271,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=3180, - serialized_end=3200, + serialized_start=3291, + serialized_end=3311, ) _FROMCLIENT.fields_by_name['initialize'].message_type = _INITIALIZEREQUEST @@ -1302,8 +1316,10 @@ _FROMSERVER.fields_by_name['energy']) _FROMSERVER.fields_by_name['energy'].containing_oneof = _FROMSERVER.oneofs_by_name['payload'] _HABITATRESPONSE.fields_by_name['predictions'].message_type = _HABITATDEVICEPREDICTION +_HABITATRESPONSE.fields_by_name['analysis_error'].message_type = _ANALYSISERROR _ENERGYRESPONSE.fields_by_name['components'].message_type = _ENERGYCONSUMPTIONCOMPONENT _ENERGYRESPONSE.fields_by_name['past_measurements'].message_type = _ENERGYRESPONSE +_ENERGYRESPONSE.fields_by_name['analysis_error'].message_type = _ANALYSISERROR _ENERGYCONSUMPTIONCOMPONENT.fields_by_name['component_type'].enum_type = _ENERGYCONSUMPTIONCOMPONENTTYPE _INITIALIZERESPONSE.fields_by_name['entry_point'].message_type = _PATH _INITIALIZERESPONSE.fields_by_name['hardware'].message_type = _HARDWAREINFO diff --git a/deepview_profile/utils.py b/deepview_profile/utils.py new file mode 100644 index 0000000..4f43bc6 --- /dev/null +++ b/deepview_profile/utils.py @@ -0,0 +1,10 @@ +import torch +import logging +import gc + +logger = logging.getLogger(__name__) + +def release_memory(): + logger.debug("Emptying cache") + gc.collect() + torch.cuda.empty_cache() \ No newline at end of file diff --git a/protocol/innpv.proto b/protocol/innpv.proto index ed5b73c..c74767b 100644 --- a/protocol/innpv.proto +++ b/protocol/innpv.proto @@ -96,6 +96,7 @@ message HabitatDevicePrediction { message HabitatResponse { repeated HabitatDevicePrediction predictions = 1; + AnalysisError analysis_error = 2; } // Energy messages @@ -108,6 +109,7 @@ message EnergyResponse { // A list of past energy measurements repeated EnergyResponse past_measurements = 4; + AnalysisError analysis_error = 5; } // Reports the energy consumption of one system component (e.g. CPU+DRAM or GPU)