From 4e5ec7b61a4f9eb33a5eaa9f64a9932e67a72c3e Mon Sep 17 00:00:00 2001 From: Clint Daniels Date: Thu, 23 Sep 2021 08:30:20 -0700 Subject: [PATCH 1/4] black and isort formatting --- activitysim/__init__.py | 4 +- activitysim/abm/__init__.py | 4 +- activitysim/abm/misc.py | 37 +- activitysim/abm/models/__init__.py | 70 +- activitysim/abm/models/accessibility.py | 121 ++- .../abm/models/atwork_subtour_destination.py | 88 +- .../abm/models/atwork_subtour_frequency.py | 67 +- .../abm/models/atwork_subtour_mode_choice.py | 167 +-- .../abm/models/atwork_subtour_scheduling.py | 88 +- activitysim/abm/models/auto_ownership.py | 39 +- activitysim/abm/models/cdap.py | 128 ++- activitysim/abm/models/free_parking.py | 49 +- activitysim/abm/models/initialize.py | 120 +-- activitysim/abm/models/initialize_los.py | 128 ++- activitysim/abm/models/initialize_tours.py | 104 +- .../abm/models/joint_tour_composition.py | 68 +- .../abm/models/joint_tour_destination.py | 68 +- .../abm/models/joint_tour_frequency.py | 103 +- .../abm/models/joint_tour_participation.py | 233 +++-- .../abm/models/joint_tour_scheduling.py | 77 +- activitysim/abm/models/location_choice.py | 647 +++++++----- .../abm/models/mandatory_scheduling.py | 75 +- .../abm/models/mandatory_tour_frequency.py | 99 +- .../abm/models/non_mandatory_destination.py | 74 +- .../abm/models/non_mandatory_scheduling.py | 60 +- .../models/non_mandatory_tour_frequency.py | 246 +++-- .../abm/models/parking_location_choice.py | 208 ++-- activitysim/abm/models/stop_frequency.py | 192 ++-- activitysim/abm/models/summarize.py | 24 +- activitysim/abm/models/tour_mode_choice.py | 194 ++-- .../abm/models/trip_departure_choice.py | 379 ++++--- activitysim/abm/models/trip_destination.py | 958 +++++++++++------- activitysim/abm/models/trip_matrices.py | 206 ++-- activitysim/abm/models/trip_mode_choice.py | 194 ++-- activitysim/abm/models/trip_purpose.py | 183 ++-- .../models/trip_purpose_and_destination.py | 138 +-- activitysim/abm/models/trip_scheduling.py | 348 ++++--- .../abm/models/trip_scheduling_choice.py | 191 ++-- activitysim/abm/models/util/canonical_ids.py | 101 +- activitysim/abm/models/util/cdap.py | 467 +++++---- activitysim/abm/models/util/estimation.py | 342 ++++--- activitysim/abm/models/util/logsums.py | 130 ++- activitysim/abm/models/util/mode.py | 86 +- activitysim/abm/models/util/overlap.py | 70 +- activitysim/abm/models/util/test/test_cdap.py | 151 +-- .../test/test_mandatory_tour_frequency.py | 56 +- .../test/test_non_mandatory_tour_frequency.py | 34 +- .../test/test_vectorize_tour_scheduling.py | 71 +- .../abm/models/util/tour_destination.py | 605 ++++++----- activitysim/abm/models/util/tour_frequency.py | 116 ++- .../abm/models/util/tour_scheduling.py | 105 +- activitysim/abm/models/util/trip.py | 48 +- .../models/util/vectorize_tour_scheduling.py | 800 +++++++++------ activitysim/abm/tables/__init__.py | 25 +- activitysim/abm/tables/accessibility.py | 11 +- activitysim/abm/tables/households.py | 57 +- activitysim/abm/tables/landuse.py | 4 +- activitysim/abm/tables/persons.py | 44 +- activitysim/abm/tables/shadow_pricing.py | 347 ++++--- activitysim/abm/tables/size_terms.py | 27 +- activitysim/abm/tables/skims.py | 22 +- activitysim/abm/tables/table_dict.py | 3 +- activitysim/abm/tables/time_windows.py | 21 +- activitysim/abm/tables/tours.py | 2 +- activitysim/abm/tables/trips.py | 3 +- activitysim/abm/test/test_misc/setup_utils.py | 41 +- .../test_load_cached_accessibility.py | 50 +- activitysim/abm/test/test_misc/test_misc.py | 6 +- .../test_misc/test_trip_departure_choice.py | 133 ++- .../test_misc/test_trip_scheduling_choice.py | 161 +-- .../abm/test/test_misc/test_trip_utils.py | 14 +- .../abm/test/test_pipeline/test_pipeline.py | 247 ++--- activitysim/cli/__init__.py | 3 +- activitysim/cli/cli.py | 11 +- activitysim/cli/create.py | 68 +- activitysim/cli/main.py | 30 +- activitysim/cli/run.py | 260 ++--- activitysim/cli/test/test_cli.py | 44 +- activitysim/core/assign.py | 121 ++- activitysim/core/chunk.py | 498 +++++---- activitysim/core/config.py | 247 +++-- activitysim/core/expressions.py | 74 +- activitysim/core/inject.py | 46 +- activitysim/core/input.py | 142 +-- activitysim/core/interaction_sample.py | 334 +++--- .../core/interaction_sample_simulate.py | 245 +++-- activitysim/core/interaction_simulate.py | 299 ++++-- activitysim/core/logit.py | 169 +-- activitysim/core/los.py | 401 +++++--- activitysim/core/mem.py | 131 ++- activitysim/core/mp_tasks.py | 648 +++++++----- activitysim/core/pathbuilder.py | 914 +++++++++++------ activitysim/core/pathbuilder_cache.py | 145 +-- activitysim/core/pipeline.py | 184 ++-- activitysim/core/random.py | 108 +- activitysim/core/simulate.py | 723 ++++++++----- activitysim/core/skim_dict_factory.py | 133 ++- activitysim/core/skim_dictionary.py | 143 ++- activitysim/core/steps/output.py | 122 ++- activitysim/core/test/extensions/__init__.py | 1 - activitysim/core/test/extensions/steps.py | 31 +- activitysim/core/test/test_assign.py | 99 +- activitysim/core/test/test_inject_defaults.py | 8 +- activitysim/core/test/test_input.py | 91 +- activitysim/core/test/test_logit.py | 128 +-- activitysim/core/test/test_los.py | 172 ++-- activitysim/core/test/test_pipeline.py | 57 +- activitysim/core/test/test_random.py | 54 +- activitysim/core/test/test_simulate.py | 48 +- activitysim/core/test/test_skim.py | 58 +- activitysim/core/test/test_timetable.py | 105 +- activitysim/core/test/test_tracing.py | 110 +- activitysim/core/test/test_util.py | 42 +- activitysim/core/test/utils_testing.py | 21 +- activitysim/core/timetable.py | 156 +-- activitysim/core/tracing.py | 290 ++++-- activitysim/core/util.py | 99 +- .../estimation/larch/auto_ownership.py | 10 +- activitysim/estimation/larch/cdap.py | 24 +- activitysim/estimation/larch/general.py | 28 +- .../estimation/larch/location_choice.py | 133 +-- activitysim/estimation/larch/mode_choice.py | 53 +- .../estimation/larch/nonmand_tour_freq.py | 25 +- activitysim/estimation/larch/scheduling.py | 61 +- .../estimation/larch/simple_simulate.py | 50 +- .../estimation/larch/stop_frequency.py | 70 +- .../estimation/test/test_larch_estimation.py | 92 +- .../examples/create_run_all_examples.py | 2 +- .../examples/example_arc/scripts/arc_crop.py | 80 +- .../examples/example_arc/simulation.py | 4 +- .../examples/example_arc/test/simulation.py | 4 +- .../examples/example_arc/test/test_arc.py | 38 +- .../build_example_data/build_stop_coeffs.py | 68 +- .../build_example_data/mode_choice_wrangle.py | 40 +- .../scripts/extract_survey_data.py | 113 ++- .../example_estimation/scripts/infer.py | 722 ++++++++----- .../example_marin/scripts/marin_crop.py | 106 +- .../example_marin/scripts/marin_fix.py | 31 +- .../marin_work_tour_mode_choice_data.py | 85 +- .../examples/example_marin/test/simulation.py | 4 +- .../examples/example_marin/test/test_marin.py | 38 +- .../examples/example_mtc/simulation.py | 5 +- .../examples/example_mtc/test/simulation.py | 4 +- .../examples/example_mtc/test/test_mtc.py | 57 +- .../scripts/three_zone_example_data.py | 134 +-- .../scripts/two_zone_example_data.py | 81 +- .../example_multiple_zone/test/simulation.py | 4 +- .../test/test_multiple_zone.py | 74 +- .../example_psrc/scripts/integrity.py | 85 +- .../example_psrc/scripts/psrc_crop.py | 130 ++- .../examples/example_psrc/test/simulation.py | 4 +- .../examples/example_psrc/test/test_psrc.py | 38 +- .../scripts/sandag_crop_1_zone.py | 64 +- .../scripts/sandag_crop_2_zone.py | 120 ++- .../scripts/sandag_crop_3_zone.py | 112 +- .../example_sandag/test/simulation.py | 4 +- .../example_sandag/test/test_sandag.py | 62 +- .../example_semcog/extensions/__init__.py | 10 +- .../extensions/telecommute_frequency.py | 46 +- .../extensions/transit_pass_ownership.py | 44 +- .../extensions/transit_pass_subsidy.py | 44 +- .../extensions/work_from_home.py | 127 ++- .../scripts/reindex_household_ids.py | 184 ++-- .../example_semcog/scripts/semcog_crop.py | 89 +- .../examples/example_semcog/simulation.py | 8 +- .../example_semcog/test/test_semcog.py | 38 +- .../examples/scan_examples_for_errors.py | 7 +- conda-environments/activitysim-dev.yml | 3 +- conda-environments/activitysim-test.yml | 3 +- docs/add_image_map.py | 5 +- docs/conf.py | 73 +- ez_setup.py | 129 ++- other_resources/scripts/build_omx.py | 65 +- other_resources/scripts/create_sf_example.py | 53 +- .../scripts/make_pipeline_output.py | 46 +- other_resources/scripts/omx32.py | 25 +- other_resources/scripts/simulation.py | 61 +- other_resources/scripts/verify_results.py | 499 ++++++--- other_resources/verification/simulation.py | 66 +- setup.py | 53 +- 180 files changed, 13657 insertions(+), 9272 deletions(-) diff --git a/activitysim/__init__.py b/activitysim/__init__.py index 17ade8cdc8..b22b37eee4 100644 --- a/activitysim/__init__.py +++ b/activitysim/__init__.py @@ -2,5 +2,5 @@ # See full license in LICENSE.txt. -__version__ = '1.0.3' -__doc__ = 'Activity-Based Travel Modeling' +__version__ = "1.0.3" +__doc__ = "Activity-Based Travel Modeling" diff --git a/activitysim/abm/__init__.py b/activitysim/abm/__init__.py index 98b4b2cb14..eb5a299611 100644 --- a/activitysim/abm/__init__.py +++ b/activitysim/abm/__init__.py @@ -1,5 +1,3 @@ # ActivitySim # See full license in LICENSE.txt. -from . import misc -from . import tables -from . import models +from . import misc, models, tables diff --git a/activitysim/abm/misc.py b/activitysim/abm/misc.py index 994346dfc2..528c8db937 100644 --- a/activitysim/abm/misc.py +++ b/activitysim/abm/misc.py @@ -4,8 +4,7 @@ import pandas as pd -from activitysim.core import config -from activitysim.core import inject +from activitysim.core import config, inject # FIXME # warnings.filterwarnings('ignore', category=pd.io.pytables.PerformanceWarning) @@ -18,7 +17,7 @@ def households_sample_size(settings, override_hh_ids): if override_hh_ids is None: - return settings.get('households_sample_size', 0) + return settings.get("households_sample_size", 0) else: return 0 if override_hh_ids is None else len(override_hh_ids) @@ -26,18 +25,20 @@ def households_sample_size(settings, override_hh_ids): @inject.injectable(cache=True) def override_hh_ids(settings): - hh_ids_filename = settings.get('hh_ids', None) + hh_ids_filename = settings.get("hh_ids", None) if hh_ids_filename is None: return None file_path = config.data_file_path(hh_ids_filename, mandatory=False) if not file_path: - logger.error("hh_ids file name '%s' specified in settings not found" % hh_ids_filename) + logger.error( + "hh_ids file name '%s' specified in settings not found" % hh_ids_filename + ) return None - df = pd.read_csv(file_path, comment='#') + df = pd.read_csv(file_path, comment="#") - if 'household_id' not in df.columns: + if "household_id" not in df.columns: logger.error("No 'household_id' column in hh_ids file %s" % hh_ids_filename) return None @@ -47,8 +48,10 @@ def override_hh_ids(settings): logger.error("No households in hh_ids file %s" % hh_ids_filename) return None - logger.info("Using hh_ids list with %s households from file %s" % - (len(household_ids), hh_ids_filename)) + logger.info( + "Using hh_ids list with %s households from file %s" + % (len(household_ids), hh_ids_filename) + ) return household_ids @@ -56,10 +59,12 @@ def override_hh_ids(settings): @inject.injectable(cache=True) def trace_hh_id(settings): - id = settings.get('trace_hh_id', None) + id = settings.get("trace_hh_id", None) if id and not isinstance(id, int): - logger.warning("setting trace_hh_id is wrong type, should be an int, but was %s" % type(id)) + logger.warning( + "setting trace_hh_id is wrong type, should be an int, but was %s" % type(id) + ) id = None return id @@ -68,9 +73,11 @@ def trace_hh_id(settings): @inject.injectable(cache=True) def trace_od(settings): - od = settings.get('trace_od', None) + od = settings.get("trace_od", None) - if od and not (isinstance(od, list) and len(od) == 2 and all(isinstance(x, int) for x in od)): + if od and not ( + isinstance(od, list) and len(od) == 2 and all(isinstance(x, int) for x in od) + ): logger.warning("setting trace_od should be a list of length 2, but was %s" % od) od = None @@ -79,11 +86,11 @@ def trace_od(settings): @inject.injectable(cache=True) def chunk_size(settings): - _chunk_size = int(settings.get('chunk_size', 0) or 0) + _chunk_size = int(settings.get("chunk_size", 0) or 0) return _chunk_size @inject.injectable(cache=True) def check_for_variability(settings): - return bool(settings.get('check_for_variability', False)) + return bool(settings.get("check_for_variability", False)) diff --git a/activitysim/abm/models/__init__.py b/activitysim/abm/models/__init__.py index 005496d4d2..4b115f3a6f 100644 --- a/activitysim/abm/models/__init__.py +++ b/activitysim/abm/models/__init__.py @@ -1,36 +1,38 @@ # ActivitySim # See full license in LICENSE.txt. -from . import accessibility -from . import atwork_subtour_destination -from . import atwork_subtour_frequency -from . import atwork_subtour_mode_choice -from . import atwork_subtour_scheduling -from . import auto_ownership -from . import cdap -from . import free_parking -from . import initialize -from . import initialize_tours -from . import initialize_los -from . import joint_tour_composition -from . import joint_tour_destination -from . import joint_tour_frequency -from . import joint_tour_participation -from . import joint_tour_scheduling -from . import location_choice -from . import mandatory_scheduling -from . import mandatory_tour_frequency -from . import non_mandatory_destination -from . import non_mandatory_scheduling -from . import non_mandatory_tour_frequency -from . import parking_location_choice -from . import stop_frequency -from . import tour_mode_choice -from . import trip_destination -from . import trip_mode_choice -from . import trip_purpose -from . import trip_purpose_and_destination -from . import trip_scheduling -from . import trip_departure_choice -from . import trip_scheduling_choice -from . import trip_matrices -from . import summarize +from . import ( + accessibility, + atwork_subtour_destination, + atwork_subtour_frequency, + atwork_subtour_mode_choice, + atwork_subtour_scheduling, + auto_ownership, + cdap, + free_parking, + initialize, + initialize_los, + initialize_tours, + joint_tour_composition, + joint_tour_destination, + joint_tour_frequency, + joint_tour_participation, + joint_tour_scheduling, + location_choice, + mandatory_scheduling, + mandatory_tour_frequency, + non_mandatory_destination, + non_mandatory_scheduling, + non_mandatory_tour_frequency, + parking_location_choice, + stop_frequency, + summarize, + tour_mode_choice, + trip_departure_choice, + trip_destination, + trip_matrices, + trip_mode_choice, + trip_purpose, + trip_purpose_and_destination, + trip_scheduling, + trip_scheduling_choice, +) diff --git a/activitysim/abm/models/accessibility.py b/activitysim/abm/models/accessibility.py index 16d0e85597..f66a2663b6 100644 --- a/activitysim/abm/models/accessibility.py +++ b/activitysim/abm/models/accessibility.py @@ -2,31 +2,24 @@ # See full license in LICENSE.txt. import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import assign -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import chunk -from activitysim.core import mem - -from activitysim.core import los +from activitysim.core import assign, chunk, config, inject, los, mem, pipeline, tracing from activitysim.core.pathbuilder import TransitVirtualPathBuilder logger = logging.getLogger(__name__) def compute_accessibilities_for_zones( - accessibility_df, - land_use_df, - assignment_spec, - constants, - network_los, - trace_od, - trace_label): + accessibility_df, + land_use_df, + assignment_spec, + constants, + network_los, + trace_od, + trace_label, +): orig_zones = accessibility_df.index.values dest_zones = land_use_df.index.values @@ -34,14 +27,16 @@ def compute_accessibilities_for_zones( orig_zone_count = len(orig_zones) dest_zone_count = len(dest_zones) - logger.info("Running %s with %d orig zones %d dest zones" % - (trace_label, orig_zone_count, dest_zone_count)) + logger.info( + "Running %s with %d orig zones %d dest zones" + % (trace_label, orig_zone_count, dest_zone_count) + ) # create OD dataframe od_df = pd.DataFrame( data={ - 'orig': np.repeat(orig_zones, dest_zone_count), - 'dest': np.tile(dest_zones, orig_zone_count) + "orig": np.repeat(orig_zones, dest_zone_count), + "dest": np.tile(dest_zones, orig_zone_count), } ) @@ -52,26 +47,31 @@ def compute_accessibilities_for_zones( trace_od_rows = None # merge land_use_columns into od_df - od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index() + od_df = pd.merge(od_df, land_use_df, left_on="dest", right_index=True).sort_index() chunk.log_df(trace_label, "od_df", od_df) locals_d = { - 'log': np.log, - 'exp': np.exp, - 'network_los': network_los, + "log": np.log, + "exp": np.exp, + "network_los": network_los, } locals_d.update(constants) skim_dict = network_los.get_default_skim_dict() - locals_d['skim_od'] = skim_dict.wrap('orig', 'dest').set_df(od_df) - locals_d['skim_do'] = skim_dict.wrap('dest', 'orig').set_df(od_df) + locals_d["skim_od"] = skim_dict.wrap("orig", "dest").set_df(od_df) + locals_d["skim_do"] = skim_dict.wrap("dest", "orig").set_df(od_df) if network_los.zone_system == los.THREE_ZONE: - locals_d['tvpb'] = network_los.tvpb + locals_d["tvpb"] = network_los.tvpb - results, trace_results, trace_assigned_locals \ - = assign.assign_variables(assignment_spec, od_df, locals_d, - trace_rows=trace_od_rows, trace_label=trace_label, chunk_log=True) + results, trace_results, trace_assigned_locals = assign.assign_variables( + assignment_spec, + od_df, + locals_d, + trace_rows=trace_od_rows, + trace_label=trace_label, + chunk_log=True, + ) chunk.log_df(trace_label, "results", results) @@ -84,23 +84,29 @@ def compute_accessibilities_for_zones( if trace_od: if not trace_od_rows.any(): - logger.warning(f"trace_od not found origin = {trace_orig}, dest = {trace_dest}") + logger.warning( + f"trace_od not found origin = {trace_orig}, dest = {trace_dest}" + ) else: # add OD columns to trace results df = pd.concat([od_df[trace_od_rows], trace_results], axis=1) # dump the trace results table (with _temp variables) to aid debugging - tracing.trace_df(df, - label='accessibility', - index_label='skim_offset', - slicer='NONE', - warn_if_empty=True) + tracing.trace_df( + df, + label="accessibility", + index_label="skim_offset", + slicer="NONE", + warn_if_empty=True, + ) if trace_assigned_locals: - tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals") + tracing.write_csv( + trace_assigned_locals, file_name="accessibility_locals" + ) - return(accessibility_df) + return accessibility_df @inject.step() @@ -122,32 +128,45 @@ def compute_accessibility(land_use, accessibility, network_los, chunk_size, trac steeper than automobile or transit. The minimum accessibility is zero. """ - trace_label = 'compute_accessibility' - model_settings = config.read_model_settings('accessibility.yaml') - assignment_spec = assign.read_assignment_spec(config.config_file_path('accessibility.csv')) + trace_label = "compute_accessibility" + model_settings = config.read_model_settings("accessibility.yaml") + assignment_spec = assign.read_assignment_spec( + config.config_file_path("accessibility.csv") + ) accessibility_df = accessibility.to_frame() if len(accessibility_df.columns) > 0: - logger.warning(f"accessibility table is not empty. Columns:{list(accessibility_df.columns)}") + logger.warning( + f"accessibility table is not empty. Columns:{list(accessibility_df.columns)}" + ) raise RuntimeError(f"accessibility table is not empty.") constants = config.get_model_constants(model_settings) # only include the land_use columns needed by spec, as specified by land_use_columns model_setting - land_use_columns = model_settings.get('land_use_columns', []) + land_use_columns = model_settings.get("land_use_columns", []) land_use_df = land_use.to_frame() land_use_df = land_use_df[land_use_columns] - logger.info(f"Running {trace_label} with {len(accessibility_df.index)} orig zones {len(land_use_df)} dest zones") + logger.info( + f"Running {trace_label} with {len(accessibility_df.index)} orig zones {len(land_use_df)} dest zones" + ) accessibilities_list = [] - for i, chooser_chunk, chunk_trace_label in \ - chunk.adaptive_chunked_choosers(accessibility_df, chunk_size, trace_label): - - accessibilities = \ - compute_accessibilities_for_zones(chooser_chunk, land_use_df, assignment_spec, - constants, network_los, trace_od, trace_label) + for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + accessibility_df, chunk_size, trace_label + ): + + accessibilities = compute_accessibilities_for_zones( + chooser_chunk, + land_use_df, + assignment_spec, + constants, + network_los, + trace_od, + trace_label, + ) accessibilities_list.append(accessibilities) accessibility_df = pd.concat(accessibilities_list) diff --git a/activitysim/abm/models/atwork_subtour_destination.py b/activitysim/abm/models/atwork_subtour_destination.py index b7f2ae8208..1b69cde775 100644 --- a/activitysim/abm/models/atwork_subtour_destination.py +++ b/activitysim/abm/models/atwork_subtour_destination.py @@ -4,18 +4,12 @@ import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject - -from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.core import config, inject, pipeline, simulate, tracing from activitysim.core.interaction_sample import interaction_sample +from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.util import assign_in_place -from .util import tour_destination -from .util import estimation +from .util import estimation, tour_destination logger = logging.getLogger(__name__) DUMP = False @@ -23,47 +17,54 @@ @inject.step() def atwork_subtour_destination( - tours, - persons_merged, - network_los, - chunk_size, trace_hh_id): + tours, persons_merged, network_los, chunk_size, trace_hh_id +): - trace_label = 'atwork_subtour_destination' - model_settings_file_name = 'atwork_subtour_destination.yaml' + trace_label = "atwork_subtour_destination" + model_settings_file_name = "atwork_subtour_destination.yaml" model_settings = config.read_model_settings(model_settings_file_name) future_settings = { - 'SIZE_TERM_SELECTOR': 'atwork', - 'SEGMENTS': ['atwork'], - 'ORIG_ZONE_ID': 'workplace_zone_id' + "SIZE_TERM_SELECTOR": "atwork", + "SEGMENTS": ["atwork"], + "ORIG_ZONE_ID": "workplace_zone_id", } - model_settings = config.future_model_settings(model_settings_file_name, model_settings, future_settings) + model_settings = config.future_model_settings( + model_settings_file_name, model_settings, future_settings + ) - destination_column_name = 'destination' - logsum_column_name = model_settings.get('DEST_CHOICE_LOGSUM_COLUMN_NAME') + destination_column_name = "destination" + logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None - sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') - want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None + sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") + want_sample_table = ( + config.setting("want_dest_choice_sample_tables") + and sample_table_name is not None + ) persons_merged = persons_merged.to_frame() tours = tours.to_frame() - subtours = tours[tours.tour_category == 'atwork'] + subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours if subtours.shape[0] == 0: - tracing.no_results('atwork_subtour_destination') + tracing.no_results("atwork_subtour_destination") return - estimator = estimation.manager.begin_estimation('atwork_subtour_destination') + estimator = estimation.manager.begin_estimation("atwork_subtour_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') - estimator.write_spec(model_settings, tag='SPEC') + estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) - estimator.write_table(inject.get_injectable('size_terms'), 'size_terms', append=False) - estimator.write_table(inject.get_table('land_use').to_frame(), 'landuse', append=False) + estimator.write_table( + inject.get_injectable("size_terms"), "size_terms", append=False + ) + estimator.write_table( + inject.get_table("land_use").to_frame(), "landuse", append=False + ) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( @@ -74,26 +75,31 @@ def atwork_subtour_destination( model_settings, network_los, estimator, - chunk_size, trace_hh_id, trace_label) + chunk_size, + trace_hh_id, + trace_label, + ) if estimator: - estimator.write_choices(choices_df['choice']) - choices_df['choice'] = estimator.get_survey_values(choices_df['choice'], 'tours', 'destination') - estimator.write_override_choices(choices_df['choice']) + estimator.write_choices(choices_df["choice"]) + choices_df["choice"] = estimator.get_survey_values( + choices_df["choice"], "tours", "destination" + ) + estimator.write_override_choices(choices_df["choice"]) estimator.end_estimation() - subtours[destination_column_name] = choices_df['choice'] + subtours[destination_column_name] = choices_df["choice"] assign_in_place(tours, subtours[[destination_column_name]]) if want_logsums: - subtours[logsum_column_name] = choices_df['logsum'] + subtours[logsum_column_name] = choices_df["logsum"] assign_in_place(tours, subtours[[logsum_column_name]]) pipeline.replace_table("tours", tours) - tracing.print_summary(destination_column_name, - subtours[destination_column_name], - describe=True) + tracing.print_summary( + destination_column_name, subtours[destination_column_name], describe=True + ) if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) @@ -101,6 +107,6 @@ def atwork_subtour_destination( pipeline.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df(tours, - label='atwork_subtour_destination', - columns=['destination']) + tracing.trace_df( + tours, label="atwork_subtour_destination", columns=["destination"] + ) diff --git a/activitysim/abm/models/atwork_subtour_frequency.py b/activitysim/abm/models/atwork_subtour_frequency.py index cb9d3ca4c7..d42b97fdc3 100644 --- a/activitysim/abm/models/atwork_subtour_frequency.py +++ b/activitysim/abm/models/atwork_subtour_frequency.py @@ -2,18 +2,12 @@ # See full license in LICENSE.txt. import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import expressions +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing from .util import estimation - from .util.tour_frequency import process_atwork_subtours logger = logging.getLogger(__name__) @@ -21,26 +15,23 @@ def add_null_results(trace_label, tours): logger.info("Skipping %s: add_null_results", trace_label) - tours['atwork_subtour_frequency'] = np.nan + tours["atwork_subtour_frequency"] = np.nan pipeline.replace_table("tours", tours) @inject.step() -def atwork_subtour_frequency(tours, - persons_merged, - chunk_size, - trace_hh_id): +def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id): """ This model predicts the frequency of making at-work subtour tours (alternatives for this model come from a separate csv file which is configured by the user). """ - trace_label = 'atwork_subtour_frequency' - model_settings_file_name = 'atwork_subtour_frequency.yaml' + trace_label = "atwork_subtour_frequency" + model_settings_file_name = "atwork_subtour_frequency.yaml" tours = tours.to_frame() - work_tours = tours[tours.tour_type == 'work'] + work_tours = tours[tours.tour_type == "work"] # - if no work_tours if len(work_tours) == 0: @@ -48,17 +39,21 @@ def atwork_subtour_frequency(tours, return model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation('atwork_subtour_frequency') + estimator = estimation.manager.begin_estimation("atwork_subtour_frequency") - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) - alternatives = simulate.read_model_alts('atwork_subtour_frequency_alternatives.csv', set_index='alt') + alternatives = simulate.read_model_alts( + "atwork_subtour_frequency_alternatives.csv", set_index="alt" + ) # merge persons into work_tours persons_merged = persons_merged.to_frame() - work_tours = pd.merge(work_tours, persons_merged, left_on='person_id', right_index=True) + work_tours = pd.merge( + work_tours, persons_merged, left_on="person_id", right_index=True + ) logger.info("Running atwork_subtour_frequency with %d work tours", len(work_tours)) @@ -66,13 +61,12 @@ def atwork_subtour_frequency(tours, constants = config.get_model_constants(model_settings) # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: expressions.assign_columns( - df=work_tours, - model_settings=preprocessor_settings, - trace_label=trace_label) + df=work_tours, model_settings=preprocessor_settings, trace_label=trace_label + ) if estimator: estimator.write_spec(model_settings) @@ -87,37 +81,40 @@ def atwork_subtour_frequency(tours, locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='atwork_subtour_frequency', - estimator=estimator) + trace_choice_name="atwork_subtour_frequency", + estimator=estimator, + ) # convert indexes to alternative names choices = pd.Series(model_spec.columns[choices.values], index=choices.index) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'tours', 'atwork_subtour_frequency') + choices = estimator.get_survey_values( + choices, "tours", "atwork_subtour_frequency" + ) estimator.write_override_choices(choices) estimator.end_estimation() # add atwork_subtour_frequency column to tours # reindex since we are working with a subset of tours - tours['atwork_subtour_frequency'] = choices.reindex(tours.index) + tours["atwork_subtour_frequency"] = choices.reindex(tours.index) pipeline.replace_table("tours", tours) # - create atwork_subtours based on atwork_subtour_frequency choice names - work_tours = tours[tours.tour_type == 'work'] + work_tours = tours[tours.tour_type == "work"] assert not work_tours.atwork_subtour_frequency.isnull().any() subtours = process_atwork_subtours(work_tours, alternatives) tours = pipeline.extend_table("tours", subtours) - tracing.register_traceable_table('tours', subtours) - pipeline.get_rn_generator().add_channel('tours', subtours) + tracing.register_traceable_table("tours", subtours) + pipeline.get_rn_generator().add_channel("tours", subtours) - tracing.print_summary('atwork_subtour_frequency', tours.atwork_subtour_frequency, - value_counts=True) + tracing.print_summary( + "atwork_subtour_frequency", tours.atwork_subtour_frequency, value_counts=True + ) if trace_hh_id: - tracing.trace_df(tours, - label='atwork_subtour_frequency.tours') + tracing.trace_df(tours, label="atwork_subtour_frequency.tours") diff --git a/activitysim/abm/models/atwork_subtour_mode_choice.py b/activitysim/abm/models/atwork_subtour_mode_choice.py index b9e7d0c444..cff086693c 100644 --- a/activitysim/abm/models/atwork_subtour_mode_choice.py +++ b/activitysim/abm/models/atwork_subtour_mode_choice.py @@ -2,61 +2,56 @@ # See full license in LICENSE.txt. import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import simulate - -from activitysim.core import los +from activitysim.core import config, inject, los, pipeline, simulate, tracing from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from activitysim.core.util import assign_in_place -from .util.mode import run_tour_mode_choice_simulate from .util import estimation - -from activitysim.core.util import assign_in_place +from .util.mode import run_tour_mode_choice_simulate logger = logging.getLogger(__name__) @inject.step() def atwork_subtour_mode_choice( - tours, - persons_merged, - network_los, - chunk_size, - trace_hh_id): + tours, persons_merged, network_los, chunk_size, trace_hh_id +): """ At-work subtour mode choice simulate """ - trace_label = 'atwork_subtour_mode_choice' + trace_label = "atwork_subtour_mode_choice" - model_settings_file_name = 'tour_mode_choice.yaml' + model_settings_file_name = "tour_mode_choice.yaml" model_settings = config.read_model_settings(model_settings_file_name) - logsum_column_name = model_settings.get('MODE_CHOICE_LOGSUM_COLUMN_NAME') - mode_column_name = 'tour_mode' + logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") + mode_column_name = "tour_mode" tours = tours.to_frame() - subtours = tours[tours.tour_category == 'atwork'] + subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours if subtours.shape[0] == 0: tracing.no_results(trace_label) return - subtours_merged = \ - pd.merge(subtours, persons_merged.to_frame(), - left_on='person_id', right_index=True, how='left') + subtours_merged = pd.merge( + subtours, + persons_merged.to_frame(), + left_on="person_id", + right_index=True, + how="left", + ) logger.info("Running %s with %d subtours" % (trace_label, subtours_merged.shape[0])) - tracing.print_summary('%s tour_type' % trace_label, - subtours_merged.tour_type, value_counts=True) + tracing.print_summary( + "%s tour_type" % trace_label, subtours_merged.tour_type, value_counts=True + ) constants = {} constants.update(config.get_model_constants(model_settings)) @@ -64,18 +59,22 @@ def atwork_subtour_mode_choice( skim_dict = network_los.get_default_skim_dict() # setup skim keys - orig_col_name = 'workplace_zone_id' - dest_col_name = 'destination' - out_time_col_name = 'start' - in_time_col_name = 'end' - odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name, - dim3_key='out_period') - dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name, - dim3_key='in_period') - odr_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name, - dim3_key='in_period') - dor_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name, - dim3_key='out_period') + orig_col_name = "workplace_zone_id" + dest_col_name = "destination" + out_time_col_name = "start" + in_time_col_name = "end" + odt_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col_name, dest_key=dest_col_name, dim3_key="out_period" + ) + dot_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col_name, dest_key=orig_col_name, dim3_key="in_period" + ) + odr_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col_name, dest_key=dest_col_name, dim3_key="in_period" + ) + dor_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col_name, dest_key=orig_col_name, dim3_key="out_period" + ) od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name) skims = { @@ -84,34 +83,45 @@ def atwork_subtour_mode_choice( "odr_skims": odr_skim_stack_wrapper, "dor_skims": dor_skim_stack_wrapper, "od_skims": od_skim_stack_wrapper, - 'orig_col_name': orig_col_name, - 'dest_col_name': dest_col_name, - 'out_time_col_name': out_time_col_name, - 'in_time_col_name': in_time_col_name + "orig_col_name": orig_col_name, + "dest_col_name": dest_col_name, + "out_time_col_name": out_time_col_name, + "in_time_col_name": in_time_col_name, } if network_los.zone_system == los.THREE_ZONE: # fixme - is this a lightweight object? tvpb = network_los.tvpb - tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col_name, dest_key=dest_col_name, - tod_key='out_period', segment_key='demographic_segment', - cache_choices=True, - trace_label=trace_label, tag='tvpb_logsum_odt') - tvpb_logsum_dot = tvpb.wrap_logsum(orig_key=dest_col_name, dest_key=orig_col_name, - tod_key='in_period', segment_key='demographic_segment', - cache_choices=True, - trace_label=trace_label, tag='tvpb_logsum_dot') - - skims.update({ - 'tvpb_logsum_odt': tvpb_logsum_odt, - 'tvpb_logsum_dot': tvpb_logsum_dot - }) + tvpb_logsum_odt = tvpb.wrap_logsum( + orig_key=orig_col_name, + dest_key=dest_col_name, + tod_key="out_period", + segment_key="demographic_segment", + cache_choices=True, + trace_label=trace_label, + tag="tvpb_logsum_odt", + ) + tvpb_logsum_dot = tvpb.wrap_logsum( + orig_key=dest_col_name, + dest_key=orig_col_name, + tod_key="in_period", + segment_key="demographic_segment", + cache_choices=True, + trace_label=trace_label, + tag="tvpb_logsum_dot", + ) + + skims.update( + {"tvpb_logsum_odt": tvpb_logsum_odt, "tvpb_logsum_dot": tvpb_logsum_dot} + ) # TVPB constants can appear in expressions - constants.update(network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS')) + constants.update( + network_los.setting("TVPB_SETTINGS.tour_mode_choice.CONSTANTS") + ) - estimator = estimation.manager.begin_estimation('atwork_subtour_mode_choice') + estimator = estimation.manager.begin_estimation("atwork_subtour_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) @@ -121,7 +131,8 @@ def atwork_subtour_mode_choice( choices_df = run_tour_mode_choice_simulate( subtours_merged, - tour_purpose='atwork', model_settings=model_settings, + tour_purpose="atwork", + model_settings=model_settings, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, network_los=network_los, @@ -130,15 +141,18 @@ def atwork_subtour_mode_choice( estimator=estimator, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='tour_mode_choice') + trace_choice_name="tour_mode_choice", + ) # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get('tvpb_mode_path_types') + tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") for mode, path_types in tvpb_mode_path_types.items(): - for direction, skim in zip(['od', 'do'], [tvpb_logsum_odt, tvpb_logsum_dot]): + for direction, skim in zip( + ["od", "do"], [tvpb_logsum_odt, tvpb_logsum_dot] + ): path_type = path_types[direction] skim_cache = skim.cache[path_type] @@ -147,26 +161,37 @@ def atwork_subtour_mode_choice( for c in skim_cache: - dest_col = f'{direction}_{c}' + dest_col = f"{direction}_{c}" if dest_col not in choices_df: - choices_df[dest_col] = np.nan if pd.api.types.is_numeric_dtype(skim_cache[c]) else '' - choices_df[dest_col].where(choices_df.tour_mode != mode, skim_cache[c], inplace=True) + choices_df[dest_col] = ( + np.nan + if pd.api.types.is_numeric_dtype(skim_cache[c]) + else "" + ) + choices_df[dest_col].where( + choices_df.tour_mode != mode, skim_cache[c], inplace=True + ) if estimator: estimator.write_choices(choices_df[mode_column_name]) - choices_df[mode_column_name] = \ - estimator.get_survey_values(choices_df[mode_column_name], 'tours', mode_column_name) + choices_df[mode_column_name] = estimator.get_survey_values( + choices_df[mode_column_name], "tours", mode_column_name + ) estimator.write_override_choices(choices_df[mode_column_name]) estimator.end_estimation() - tracing.print_summary('%s choices' % trace_label, choices_df[mode_column_name], value_counts=True) + tracing.print_summary( + "%s choices" % trace_label, choices_df[mode_column_name], value_counts=True + ) assign_in_place(tours, choices_df) pipeline.replace_table("tours", tours) if trace_hh_id: - tracing.trace_df(tours[tours.tour_category == 'atwork'], - label=tracing.extend_trace_label(trace_label, mode_column_name), - slicer='tour_id', - index_label='tour_id') + tracing.trace_df( + tours[tours.tour_category == "atwork"], + label=tracing.extend_trace_label(trace_label, mode_column_name), + slicer="tour_id", + index_label="tour_id", + ) diff --git a/activitysim/abm/models/atwork_subtour_scheduling.py b/activitysim/abm/models/atwork_subtour_scheduling.py index f7a61b3a4b..188cfaeaaf 100644 --- a/activitysim/abm/models/atwork_subtour_scheduling.py +++ b/activitysim/abm/models/atwork_subtour_scheduling.py @@ -2,22 +2,16 @@ # See full license in LICENSE.txt. import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject +from activitysim.core import config, expressions, inject, pipeline, simulate from activitysim.core import timetable as tt -from activitysim.core import expressions - -from .util.vectorize_tour_scheduling import vectorize_subtour_scheduling +from activitysim.core import tracing +from activitysim.core.util import assign_in_place from .util import estimation - -from activitysim.core.util import assign_in_place +from .util.vectorize_tour_scheduling import vectorize_subtour_scheduling logger = logging.getLogger(__name__) @@ -26,21 +20,17 @@ @inject.step() def atwork_subtour_scheduling( - tours, - persons_merged, - tdd_alts, - skim_dict, - chunk_size, - trace_hh_id): + tours, persons_merged, tdd_alts, skim_dict, chunk_size, trace_hh_id +): """ This model predicts the departure time and duration of each activity for at work subtours tours """ - trace_label = 'atwork_subtour_scheduling' - model_settings_file_name = 'tour_scheduling_atwork.yaml' + trace_label = "atwork_subtour_scheduling" + model_settings_file_name = "tour_scheduling_atwork.yaml" tours = tours.to_frame() - subtours = tours[tours.tour_category == 'atwork'] + subtours = tours[tours.tour_category == "atwork"] # - if no atwork subtours if subtours.shape[0] == 0: @@ -48,9 +38,9 @@ def atwork_subtour_scheduling( return model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation('atwork_subtour_scheduling') + estimator = estimation.manager.begin_estimation("atwork_subtour_scheduling") - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -60,18 +50,18 @@ def atwork_subtour_scheduling( # preprocessor constants = config.get_model_constants(model_settings) - od_skim_wrapper = skim_dict.wrap('origin', 'destination') + od_skim_wrapper = skim_dict.wrap("origin", "destination") skims = { "od_skims": od_skim_wrapper, } expressions.annotate_preprocessors( - subtours, constants, skims, - model_settings, trace_label) + subtours, constants, skims, model_settings, trace_label + ) # parent_tours table with columns ['tour_id', 'tdd'] index = tour_id parent_tour_ids = subtours.parent_tour_id.astype(np.int64).unique() - parent_tours = pd.DataFrame({'tour_id': parent_tour_ids}, index=parent_tour_ids) - parent_tours = parent_tours.merge(tours[['tdd']], left_index=True, right_index=True) + parent_tours = pd.DataFrame({"tour_id": parent_tour_ids}, index=parent_tour_ids) + parent_tours = parent_tours.merge(tours[["tdd"]], left_index=True, right_index=True) if estimator: estimator.write_model_settings(model_settings, model_settings_file_name) @@ -83,42 +73,52 @@ def atwork_subtour_scheduling( parent_tours, subtours, persons_merged, - tdd_alts, model_spec, + tdd_alts, + model_spec, model_settings, estimator=estimator, chunk_size=chunk_size, - trace_label=trace_label) + trace_label=trace_label, + ) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'tours', 'tdd') + choices = estimator.get_survey_values(choices, "tours", "tdd") estimator.write_override_choices(choices) estimator.end_estimation() # choices are tdd alternative ids # we want to add start, end, and duration columns to tours, which we have in tdd_alts table - tdd_choices = pd.merge(choices.to_frame('tdd'), tdd_alts, left_on=['tdd'], right_index=True, how='left') + tdd_choices = pd.merge( + choices.to_frame("tdd"), tdd_alts, left_on=["tdd"], right_index=True, how="left" + ) assign_in_place(tours, tdd_choices) pipeline.replace_table("tours", tours) if trace_hh_id: - tracing.trace_df(tours[tours.tour_category == 'atwork'], - label="atwork_subtour_scheduling", - slicer='person_id', - index_label='tour_id', - columns=None) + tracing.trace_df( + tours[tours.tour_category == "atwork"], + label="atwork_subtour_scheduling", + slicer="person_id", + index_label="tour_id", + columns=None, + ) if DUMP: - subtours = tours[tours.tour_category == 'atwork'] + subtours = tours[tours.tour_category == "atwork"] parent_tours = tours[tours.index.isin(subtours.parent_tour_id)] - tracing.dump_df(DUMP, subtours, trace_label, 'sub_tours') - tracing.dump_df(DUMP, parent_tours, trace_label, 'parent_tours') + tracing.dump_df(DUMP, subtours, trace_label, "sub_tours") + tracing.dump_df(DUMP, parent_tours, trace_label, "parent_tours") - parent_tours['parent_tour_id'] = parent_tours.index + parent_tours["parent_tour_id"] = parent_tours.index subtours = pd.concat([parent_tours, subtours]) - tracing.dump_df(DUMP, - tt.tour_map(parent_tours, subtours, tdd_alts, - persons_id_col='parent_tour_id'), - trace_label, 'tour_map') + tracing.dump_df( + DUMP, + tt.tour_map( + parent_tours, subtours, tdd_alts, persons_id_col="parent_tour_id" + ), + trace_label, + "tour_map", + ) diff --git a/activitysim/abm/models/auto_ownership.py b/activitysim/abm/models/auto_ownership.py index 12f8fbd2bb..564d6f94b6 100644 --- a/activitysim/abm/models/auto_ownership.py +++ b/activitysim/abm/models/auto_ownership.py @@ -2,34 +2,26 @@ # See full license in LICENSE.txt. import logging -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject +from activitysim.core import config, inject, pipeline, simulate, tracing from .util import estimation - logger = logging.getLogger(__name__) @inject.step() -def auto_ownership_simulate(households, - households_merged, - chunk_size, - trace_hh_id): +def auto_ownership_simulate(households, households_merged, chunk_size, trace_hh_id): """ Auto ownership is a standard model which predicts how many cars a household with given characteristics owns """ - trace_label = 'auto_ownership_simulate' - model_settings_file_name = 'auto_ownership.yaml' + trace_label = "auto_ownership_simulate" + model_settings_file_name = "auto_ownership.yaml" model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation('auto_ownership') + estimator = estimation.manager.begin_estimation("auto_ownership") - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -46,7 +38,7 @@ def auto_ownership_simulate(households, estimator.write_coefficients(coefficients_df, model_settings) estimator.write_choosers(choosers) - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) choices = simulate.simple_simulate( choosers=choosers, @@ -55,26 +47,27 @@ def auto_ownership_simulate(households, locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='auto_ownership', + trace_choice_name="auto_ownership", log_alt_losers=log_alt_losers, - estimator=estimator) + estimator=estimator, + ) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'households', 'auto_ownership') + choices = estimator.get_survey_values(choices, "households", "auto_ownership") estimator.write_override_choices(choices) estimator.end_estimation() households = households.to_frame() # no need to reindex as we used all households - households['auto_ownership'] = choices + households["auto_ownership"] = choices pipeline.replace_table("households", households) - tracing.print_summary('auto_ownership', households.auto_ownership, value_counts=True) + tracing.print_summary( + "auto_ownership", households.auto_ownership, value_counts=True + ) if trace_hh_id: - tracing.trace_df(households, - label='auto_ownership', - warn_if_empty=True) + tracing.trace_df(households, label="auto_ownership", warn_if_empty=True) diff --git a/activitysim/abm/models/cdap.py b/activitysim/abm/models/cdap.py index 3f29891a71..b37cf8a9a4 100644 --- a/activitysim/abm/models/cdap.py +++ b/activitysim/abm/models/cdap.py @@ -4,24 +4,16 @@ import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import expressions - +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing from activitysim.core.util import reindex -from .util import cdap -from .util import estimation +from .util import cdap, estimation logger = logging.getLogger(__name__) @inject.step() -def cdap_simulate(persons_merged, persons, households, - chunk_size, trace_hh_id): +def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): """ CDAP stands for Coordinated Daily Activity Pattern, which is a choice of high-level activity pattern for each person, in a coordinated way with other @@ -32,32 +24,46 @@ def cdap_simulate(persons_merged, persons, households, simply applies those utilities using the simulation framework. """ - trace_label = 'cdap' - model_settings = config.read_model_settings('cdap.yaml') - person_type_map = model_settings.get('PERSON_TYPE_MAP', None) - assert person_type_map is not None, f"Expected to find PERSON_TYPE_MAP setting in cdap.yaml" - estimator = estimation.manager.begin_estimation('cdap') + trace_label = "cdap" + model_settings = config.read_model_settings("cdap.yaml") + person_type_map = model_settings.get("PERSON_TYPE_MAP", None) + assert ( + person_type_map is not None + ), f"Expected to find PERSON_TYPE_MAP setting in cdap.yaml" + estimator = estimation.manager.begin_estimation("cdap") - cdap_indiv_spec = simulate.read_model_spec(file_name=model_settings['INDIV_AND_HHSIZE1_SPEC']) + cdap_indiv_spec = simulate.read_model_spec( + file_name=model_settings["INDIV_AND_HHSIZE1_SPEC"] + ) coefficients_df = simulate.read_model_coefficients(model_settings) - cdap_indiv_spec = simulate.eval_coefficients(cdap_indiv_spec, coefficients_df, estimator) + cdap_indiv_spec = simulate.eval_coefficients( + cdap_indiv_spec, coefficients_df, estimator + ) # Rules and coefficients for generating interaction specs for different household sizes - interaction_coefficients_file_name = \ - model_settings.get('INTERACTION_COEFFICIENTS', 'cdap_interaction_coefficients.csv') - cdap_interaction_coefficients = \ - pd.read_csv(config.config_file_path(interaction_coefficients_file_name), comment='#') + interaction_coefficients_file_name = model_settings.get( + "INTERACTION_COEFFICIENTS", "cdap_interaction_coefficients.csv" + ) + cdap_interaction_coefficients = pd.read_csv( + config.config_file_path(interaction_coefficients_file_name), comment="#" + ) # replace cdap_interaction_coefficients coefficient labels with numeric values # for backward compatibility, use where() to allow hard-coded coefficients and dummy (empty) coefficients_file - coefficients = cdap_interaction_coefficients.coefficient.map(coefficients_df.value.to_dict()) - coefficients = cdap_interaction_coefficients.coefficient.where(coefficients.isnull(), coefficients) - coefficients = pd.to_numeric(coefficients, errors='coerce').astype(float) + coefficients = cdap_interaction_coefficients.coefficient.map( + coefficients_df.value.to_dict() + ) + coefficients = cdap_interaction_coefficients.coefficient.where( + coefficients.isnull(), coefficients + ) + coefficients = pd.to_numeric(coefficients, errors="coerce").astype(float) if coefficients.isnull().any(): # show them the offending lines from interaction_coefficients_file - logger.warning(f"bad coefficients in INTERACTION_COEFFICIENTS {interaction_coefficients_file_name}\n" - f"{cdap_interaction_coefficients[coefficients.isnull()]}") + logger.warning( + f"bad coefficients in INTERACTION_COEFFICIENTS {interaction_coefficients_file_name}\n" + f"{cdap_interaction_coefficients[coefficients.isnull()]}" + ) assert not coefficients.isnull().any() cdap_interaction_coefficients.coefficient = coefficients @@ -68,21 +74,27 @@ def cdap_simulate(persons_merged, persons, households, EXCEPT that the values computed are relative proportions, not utilities (i.e. values are not exponentiated before being normalized to probabilities summing to 1.0) """ - cdap_fixed_relative_proportions = \ - simulate.read_model_spec(file_name=model_settings['FIXED_RELATIVE_PROPORTIONS_SPEC']) + cdap_fixed_relative_proportions = simulate.read_model_spec( + file_name=model_settings["FIXED_RELATIVE_PROPORTIONS_SPEC"] + ) persons_merged = persons_merged.to_frame() # add tour-based chunk_id so we can chunk all trips in tour together - assert 'chunk_id' not in persons_merged.columns + assert "chunk_id" not in persons_merged.columns unique_household_ids = persons_merged.household_id.unique() - household_chunk_ids = pd.Series(range(len(unique_household_ids)), index=unique_household_ids) - persons_merged['chunk_id'] = reindex(household_chunk_ids, persons_merged.household_id) + household_chunk_ids = pd.Series( + range(len(unique_household_ids)), index=unique_household_ids + ) + persons_merged["chunk_id"] = reindex( + household_chunk_ids, persons_merged.household_id + ) constants = config.get_model_constants(model_settings) - cdap_interaction_coefficients = \ - cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients) + cdap_interaction_coefficients = cdap.preprocess_interaction_coefficients( + cdap_interaction_coefficients + ) # specs are built just-in-time on demand and cached as injectables # prebuilding here allows us to write them to the output directory @@ -90,19 +102,28 @@ def cdap_simulate(persons_merged, persons, households, logger.info("Pre-building cdap specs") for hhsize in range(2, cdap.MAX_HHSIZE + 1): spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True) - if inject.get_injectable('locutor', False): - spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize), index=True) + if inject.get_injectable("locutor", False): + spec.to_csv( + config.output_file_path("cdap_spec_%s.csv" % hhsize), index=True + ) if estimator: - estimator.write_model_settings(model_settings, 'cdap.yaml') - estimator.write_spec(model_settings, tag='INDIV_AND_HHSIZE1_SPEC') - estimator.write_spec(model_settings=model_settings, tag='FIXED_RELATIVE_PROPORTIONS_SPEC') + estimator.write_model_settings(model_settings, "cdap.yaml") + estimator.write_spec(model_settings, tag="INDIV_AND_HHSIZE1_SPEC") + estimator.write_spec( + model_settings=model_settings, tag="FIXED_RELATIVE_PROPORTIONS_SPEC" + ) estimator.write_coefficients(coefficients_df, model_settings) - estimator.write_table(cdap_interaction_coefficients, 'interaction_coefficients', index=False, append=False) + estimator.write_table( + cdap_interaction_coefficients, + "interaction_coefficients", + index=False, + append=False, + ) estimator.write_choosers(persons_merged) for hhsize in range(2, cdap.MAX_HHSIZE + 1): spec = cdap.get_cached_spec(hhsize) - estimator.write_table(spec, 'spec_%s' % hhsize, append=False) + estimator.write_table(spec, "spec_%s" % hhsize, append=False) logger.info("Running cdap_simulate with %d persons", len(persons_merged.index)) @@ -115,11 +136,12 @@ def cdap_simulate(persons_merged, persons, households, locals_d=constants, chunk_size=chunk_size, trace_hh_id=trace_hh_id, - trace_label=trace_label) + trace_label=trace_label, + ) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'persons', 'cdap_activity') + choices = estimator.get_survey_values(choices, "persons", "cdap_activity") estimator.write_override_choices(choices) estimator.end_estimation() @@ -127,12 +149,13 @@ def cdap_simulate(persons_merged, persons, households, persons = persons.to_frame() choices = choices.reindex(persons.index) - persons['cdap_activity'] = choices + persons["cdap_activity"] = choices expressions.assign_columns( df=persons, - model_settings=model_settings.get('annotate_persons'), - trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) + model_settings=model_settings.get("annotate_persons"), + trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), + ) pipeline.replace_table("persons", persons) @@ -140,10 +163,13 @@ def cdap_simulate(persons_merged, persons, households, households = households.to_frame() expressions.assign_columns( df=households, - model_settings=model_settings.get('annotate_households'), - trace_label=tracing.extend_trace_label(trace_label, 'annotate_households')) + model_settings=model_settings.get("annotate_households"), + trace_label=tracing.extend_trace_label(trace_label, "annotate_households"), + ) pipeline.replace_table("households", households) - tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True) - logger.info("cdap crosstabs:\n%s" % - pd.crosstab(persons.ptype, persons.cdap_activity, margins=True)) + tracing.print_summary("cdap_activity", persons.cdap_activity, value_counts=True) + logger.info( + "cdap crosstabs:\n%s" + % pd.crosstab(persons.ptype, persons.cdap_activity, margins=True) + ) diff --git a/activitysim/abm/models/free_parking.py b/activitysim/abm/models/free_parking.py index b144ed0543..c9f2f30b69 100644 --- a/activitysim/abm/models/free_parking.py +++ b/activitysim/abm/models/free_parking.py @@ -2,12 +2,7 @@ # See full license in LICENSE.txt. import logging -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import expressions +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing from .util import estimation @@ -15,27 +10,25 @@ @inject.step() -def free_parking( - persons_merged, persons, - chunk_size, trace_hh_id): +def free_parking(persons_merged, persons, chunk_size, trace_hh_id): """ """ - trace_label = 'free_parking' - model_settings_file_name = 'free_parking.yaml' + trace_label = "free_parking" + model_settings_file_name = "free_parking.yaml" choosers = persons_merged.to_frame() choosers = choosers[choosers.workplace_zone_id > -1] logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation('free_parking') + estimator = estimation.manager.begin_estimation("free_parking") constants = config.get_model_constants(model_settings) # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_d = {} @@ -46,9 +39,10 @@ def free_parking( df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -67,26 +61,31 @@ def free_parking( locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='free_parking_at_work', - estimator=estimator) + trace_choice_name="free_parking_at_work", + estimator=estimator, + ) - free_parking_alt = model_settings['FREE_PARKING_ALT'] - choices = (choices == free_parking_alt) + free_parking_alt = model_settings["FREE_PARKING_ALT"] + choices = choices == free_parking_alt if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'persons', 'free_parking_at_work') + choices = estimator.get_survey_values( + choices, "persons", "free_parking_at_work" + ) estimator.write_override_choices(choices) estimator.end_estimation() persons = persons.to_frame() - persons['free_parking_at_work'] = choices.reindex(persons.index).fillna(0).astype(bool) + persons["free_parking_at_work"] = ( + choices.reindex(persons.index).fillna(0).astype(bool) + ) pipeline.replace_table("persons", persons) - tracing.print_summary('free_parking', persons.free_parking_at_work, value_counts=True) + tracing.print_summary( + "free_parking", persons.free_parking_at_work, value_counts=True + ) if trace_hh_id: - tracing.trace_df(persons, - label=trace_label, - warn_if_empty=True) + tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/abm/models/initialize.py b/activitysim/abm/models/initialize.py index d72a88f9da..9205e7822c 100644 --- a/activitysim/abm/models/initialize.py +++ b/activitysim/abm/models/initialize.py @@ -1,38 +1,31 @@ # ActivitySim # See full license in LICENSE.txt. import logging -import warnings import os -import pandas as pd - -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import expressions -from activitysim.core import chunk -from activitysim.core import mem +import warnings -from activitysim.core.steps.output import write_data_dictionary -from activitysim.core.steps.output import write_tables -from activitysim.core.steps.output import track_skim_usage +import pandas as pd from activitysim.abm.tables import shadow_pricing +from activitysim.core import chunk, config, expressions, inject, mem, pipeline, tracing +from activitysim.core.steps.output import ( + track_skim_usage, + write_data_dictionary, + write_tables, +) # We are using the naming conventions in the mtc_asim.h5 example # file for our default list. This provides backwards compatibility # with previous versions of ActivitySim in which only 'input_store' # is given in the settings file. DEFAULT_TABLE_LIST = [ - {'tablename': 'households', - 'h5_tablename': 'households', - 'index_col': 'household_id'}, - {'tablename': 'persons', - 'h5_tablename': 'persons', - 'index_col': 'person_id'}, - {'tablename': 'land_use', - 'h5_tablename': 'land_use_taz', - 'index_col': 'TAZ'} + { + "tablename": "households", + "h5_tablename": "households", + "index_col": "household_id", + }, + {"tablename": "persons", "h5_tablename": "persons", "index_col": "person_id"}, + {"tablename": "land_use", "h5_tablename": "land_use_taz", "index_col": "TAZ"}, ] logger = logging.getLogger(__name__) @@ -40,23 +33,26 @@ def annotate_tables(model_settings, trace_label): - trace_label = tracing.extend_trace_label(trace_label, 'annotate_tables') + trace_label = tracing.extend_trace_label(trace_label, "annotate_tables") chunk.log_rss(trace_label) - annotate_tables = model_settings.get('annotate_tables', []) + annotate_tables = model_settings.get("annotate_tables", []) if not annotate_tables: - logger.warning(f"{trace_label} - annotate_tables setting is empty - nothing to do!") + logger.warning( + f"{trace_label} - annotate_tables setting is empty - nothing to do!" + ) - assert isinstance(annotate_tables, list), \ - f"annotate_tables settings should be a list but is {type(annotate_tables)}" + assert isinstance( + annotate_tables, list + ), f"annotate_tables settings should be a list but is {type(annotate_tables)}" t0 = tracing.print_elapsed_time() for table_info in annotate_tables: - tablename = table_info['tablename'] + tablename = table_info["tablename"] chunk.log_rss(f"{trace_label}.pre-get_table.{tablename}") @@ -64,24 +60,27 @@ def annotate_tables(model_settings, trace_label): chunk.log_df(trace_label, tablename, df) # - rename columns - column_map = table_info.get('column_map', None) + column_map = table_info.get("column_map", None) if column_map: - warnings.warn(f"Setting 'column_map' has been changed to 'rename_columns'. " - f"Support for 'column_map' in annotate_tables will be removed in future versions.", - FutureWarning) + warnings.warn( + f"Setting 'column_map' has been changed to 'rename_columns'. " + f"Support for 'column_map' in annotate_tables will be removed in future versions.", + FutureWarning, + ) logger.info(f"{trace_label} - renaming {tablename} columns {column_map}") df.rename(columns=column_map, inplace=True) # - annotate - annotate = table_info.get('annotate', None) + annotate = table_info.get("annotate", None) if annotate: - logger.info(f"{trace_label} - annotating {tablename} SPEC {annotate['SPEC']}") + logger.info( + f"{trace_label} - annotating {tablename} SPEC {annotate['SPEC']}" + ) expressions.assign_columns( - df=df, - model_settings=annotate, - trace_label=trace_label) + df=df, model_settings=annotate, trace_label=trace_label + ) chunk.log_df(trace_label, tablename, df) @@ -95,53 +94,57 @@ def annotate_tables(model_settings, trace_label): @inject.step() def initialize_landuse(): - trace_label = 'initialize_landuse' + trace_label = "initialize_landuse" with chunk.chunk_log(trace_label, base=True): - model_settings = config.read_model_settings('initialize_landuse.yaml', mandatory=True) + model_settings = config.read_model_settings( + "initialize_landuse.yaml", mandatory=True + ) annotate_tables(model_settings, trace_label) # instantiate accessibility (must be checkpointed to be be used to slice accessibility) - accessibility = pipeline.get_table('accessibility') + accessibility = pipeline.get_table("accessibility") chunk.log_df(trace_label, "accessibility", accessibility) @inject.step() def initialize_households(): - trace_label = 'initialize_households' + trace_label = "initialize_households" with chunk.chunk_log(trace_label, base=True): chunk.log_rss(f"{trace_label}.inside-yield") - households = inject.get_table('households').to_frame() + households = inject.get_table("households").to_frame() assert not households._is_view chunk.log_df(trace_label, "households", households) del households chunk.log_df(trace_label, "households", None) - persons = inject.get_table('persons').to_frame() + persons = inject.get_table("persons").to_frame() assert not persons._is_view chunk.log_df(trace_label, "persons", persons) del persons chunk.log_df(trace_label, "persons", None) - model_settings = config.read_model_settings('initialize_households.yaml', mandatory=True) + model_settings = config.read_model_settings( + "initialize_households.yaml", mandatory=True + ) annotate_tables(model_settings, trace_label) # - initialize shadow_pricing size tables after annotating household and person tables # since these are scaled to model size, they have to be created while single-process # this can now be called as a stand alone model step instead, add_size_tables - add_size_tables = model_settings.get('add_size_tables', True) + add_size_tables = model_settings.get("add_size_tables", True) if add_size_tables: # warnings.warn(f"Calling add_size_tables from initialize will be removed in the future.", FutureWarning) shadow_pricing.add_size_tables() # - preload person_windows - person_windows = inject.get_table('person_windows').to_frame() + person_windows = inject.get_table("person_windows").to_frame() chunk.log_df(trace_label, "person_windows", person_windows) @@ -153,34 +156,35 @@ def preload_injectables(): logger.info("preload_injectables") - inject.add_step('track_skim_usage', track_skim_usage) - inject.add_step('write_data_dictionary', write_data_dictionary) - inject.add_step('write_tables', write_tables) + inject.add_step("track_skim_usage", track_skim_usage) + inject.add_step("write_data_dictionary", write_data_dictionary) + inject.add_step("write_tables", write_tables) - table_list = config.setting('input_table_list') + table_list = config.setting("input_table_list") # default ActivitySim table names and indices if table_list is None: logger.warning( "No 'input_table_list' found in settings. This will be a " - "required setting in upcoming versions of ActivitySim.") + "required setting in upcoming versions of ActivitySim." + ) - new_settings = inject.get_injectable('settings') - new_settings['input_table_list'] = DEFAULT_TABLE_LIST - inject.add_injectable('settings', new_settings) + new_settings = inject.get_injectable("settings") + new_settings["input_table_list"] = DEFAULT_TABLE_LIST + inject.add_injectable("settings", new_settings) # FIXME undocumented feature - if config.setting('write_raw_tables'): + if config.setting("write_raw_tables"): # write raw input tables as csv (before annotation) - csv_dir = config.output_file_path('raw_tables') + csv_dir = config.output_file_path("raw_tables") if not os.path.exists(csv_dir): os.makedirs(csv_dir) # make directory if needed - table_names = [t['tablename'] for t in table_list] + table_names = [t["tablename"] for t in table_list] for t in table_names: df = inject.get_table(t).to_frame() - df.to_csv(os.path.join(csv_dir, '%s.csv' % t), index=True) + df.to_csv(os.path.join(csv_dir, "%s.csv" % t), index=True) t0 = tracing.print_elapsed_time() diff --git a/activitysim/abm/models/initialize_los.py b/activitysim/abm/models/initialize_los.py index aaae57443b..fd7630feee 100644 --- a/activitysim/abm/models/initialize_los.py +++ b/activitysim/abm/models/initialize_los.py @@ -1,26 +1,26 @@ # ActivitySim # See full license in LICENSE.txt. import logging +import multiprocessing import os import time -import multiprocessing -import numba - from contextlib import contextmanager -import pandas as pd +import numba import numpy as np +import pandas as pd -from activitysim.core import assign -from activitysim.core import config -from activitysim.core import simulate -from activitysim.core import pipeline -from activitysim.core import tracing -from activitysim.core import chunk -from activitysim.core import inject -from activitysim.core import los - -from activitysim.core import pathbuilder +from activitysim.core import ( + assign, + chunk, + config, + inject, + los, + pathbuilder, + pipeline, + simulate, + tracing, +) logger = logging.getLogger(__name__) @@ -81,7 +81,7 @@ def initialize_los(network_los): FIXME - to instantiate attribute_combinations_df if the pipeline table version were not available. """ - trace_label = 'initialize_los' + trace_label = "initialize_los" if network_los.zone_system == los.THREE_ZONE: @@ -90,7 +90,7 @@ def initialize_los(network_los): attribute_combinations_df = uid_calculator.scalar_attribute_combinations() # - write table to pipeline (so we can slice it, when multiprocessing) - pipeline.replace_table('attribute_combinations', attribute_combinations_df) + pipeline.replace_table("attribute_combinations", attribute_combinations_df) # clean up any unwanted cache files from previous run if network_los.rebuild_tvpb_cache: @@ -99,7 +99,12 @@ def initialize_los(network_los): # if multiprocessing make sure shared cache is filled with np.nan # so that initialize_tvpb subprocesses can detect when cache is fully populated if network_los.multiprocess(): - data, lock = tap_cache.get_data_and_lock_from_buffers() # don't need lock here since single process + ( + data, + lock, + ) = ( + tap_cache.get_data_and_lock_from_buffers() + ) # don't need lock here since single process if os.path.isfile(tap_cache.cache_path): # fully populated cache should have been loaded from saved cache @@ -112,7 +117,9 @@ def initialize_los(network_los): np.copyto(data, np.nan) -def compute_utilities_for_atttribute_tuple(network_los, scalar_attributes, data, chunk_size, trace_label): +def compute_utilities_for_atttribute_tuple( + network_los, scalar_attributes, data, chunk_size, trace_label +): # scalar_attributes is a dict of attribute name/value pairs for this combination # (e.g. {'demographic_segment': 0, 'tod': 'AM', 'access_mode': 'walk'}) @@ -121,12 +128,15 @@ def compute_utilities_for_atttribute_tuple(network_los, scalar_attributes, data, uid_calculator = network_los.tvpb.uid_calculator - attributes_as_columns = \ - network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attributes_as_columns', []) - model_settings = \ - network_los.setting(f'TVPB_SETTINGS.tour_mode_choice.tap_tap_settings') - model_constants = \ - network_los.setting(f'TVPB_SETTINGS.tour_mode_choice.CONSTANTS').copy() + attributes_as_columns = network_los.setting( + "TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attributes_as_columns", [] + ) + model_settings = network_los.setting( + f"TVPB_SETTINGS.tour_mode_choice.tap_tap_settings" + ) + model_constants = network_los.setting( + f"TVPB_SETTINGS.tour_mode_choice.CONSTANTS" + ).copy() model_constants.update(scalar_attributes) data = data.reshape(uid_calculator.fully_populated_shape) @@ -138,30 +148,32 @@ def compute_utilities_for_atttribute_tuple(network_los, scalar_attributes, data, # since it is created outside of adaptive_chunked_choosers and so will show up in baseline assert not chunk.chunk_logging() # otherwise we should chunk_log this - chunk_tag = 'initialize_tvpb' # all attribute_combinations can use same cached data for row_size calc + chunk_tag = "initialize_tvpb" # all attribute_combinations can use same cached data for row_size calc - for i, chooser_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers(choosers_df, chunk_size, trace_label, chunk_tag=chunk_tag): + for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + choosers_df, chunk_size, trace_label, chunk_tag=chunk_tag + ): # we should count choosers_df as chunk overhead since its pretty big and was custom made for compute_utilities assert chooser_chunk._is_view # otherwise copying it is wasteful chooser_chunk = chooser_chunk.copy() - chunk.log_df(trace_label, 'attribute_chooser_chunk', chooser_chunk) + chunk.log_df(trace_label, "attribute_chooser_chunk", chooser_chunk) # add any attribute columns specified as column attributes in settings (the rest will be scalars in locals_dict) for attribute_name in attributes_as_columns: chooser_chunk[attribute_name] = scalar_attributes[attribute_name] - chunk.log_df(trace_label, 'attribute_chooser_chunk', chooser_chunk) + chunk.log_df(trace_label, "attribute_chooser_chunk", chooser_chunk) - utilities_df = \ - pathbuilder.compute_utilities(network_los, - model_settings=model_settings, - choosers=chooser_chunk, - model_constants=model_constants, - trace_label=trace_label) + utilities_df = pathbuilder.compute_utilities( + network_los, + model_settings=model_settings, + choosers=chooser_chunk, + model_constants=model_constants, + trace_label=trace_label, + ) - chunk.log_df(trace_label, 'utilities_df', utilities_df) + chunk.log_df(trace_label, "utilities_df", utilities_df) assert len(utilities_df) == len(chooser_chunk) assert len(utilities_df.columns) == data.shape[1] @@ -170,7 +182,7 @@ def compute_utilities_for_atttribute_tuple(network_los, scalar_attributes, data, data[chooser_chunk.index.values, :] = utilities_df.values del chooser_chunk - chunk.log_df(trace_label, 'attribute_chooser_chunk', None) + chunk.log_df(trace_label, "attribute_chooser_chunk", None) logger.debug(f"{trace_label} updated utilities") @@ -193,10 +205,12 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): FIXME - if we did not close this, we could avoid having to reload it from mmap when single-process? """ - trace_label = 'initialize_tvpb' + trace_label = "initialize_tvpb" if network_los.zone_system != los.THREE_ZONE: - logger.info(f"{trace_label} - skipping step because zone_system is not THREE_ZONE") + logger.info( + f"{trace_label} - skipping step because zone_system is not THREE_ZONE" + ) return attribute_combinations_df = attribute_combinations.to_frame() @@ -210,8 +224,10 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): if os.path.isfile(tap_cache.cache_path): # otherwise should have been deleted by TVPBCache.cleanup in initialize_los step assert not network_los.rebuild_tvpb_cache - logger.info(f"{trace_label} skipping rebuild of STATIC cache because rebuild_tvpb_cache setting is False" - f" and cache already exists: {tap_cache.cache_path}") + logger.info( + f"{trace_label} skipping rebuild of STATIC cache because rebuild_tvpb_cache setting is False" + f" and cache already exists: {tap_cache.cache_path}" + ) return if multiprocess: @@ -221,24 +237,32 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): data = tap_cache.allocate_data_buffer(shared=False) lock = None - logger.debug(f"{trace_label} processing {len(attribute_combinations_df)} attribute_combinations") - logger.debug(f"{trace_label} compute utilities for attribute_combinations_df\n{attribute_combinations_df}") + logger.debug( + f"{trace_label} processing {len(attribute_combinations_df)} attribute_combinations" + ) + logger.debug( + f"{trace_label} compute utilities for attribute_combinations_df\n{attribute_combinations_df}" + ) - for offset, scalar_attributes in attribute_combinations_df.to_dict('index').items(): + for offset, scalar_attributes in attribute_combinations_df.to_dict("index").items(): # compute utilities for this 'skim' with a single full set of scalar attributes offset = network_los.tvpb.uid_calculator.get_skim_offset(scalar_attributes) - tuple_trace_label = tracing.extend_trace_label(trace_label, f'offset{offset}') + tuple_trace_label = tracing.extend_trace_label(trace_label, f"offset{offset}") - compute_utilities_for_atttribute_tuple(network_los, scalar_attributes, data, chunk_size, tuple_trace_label) + compute_utilities_for_atttribute_tuple( + network_los, scalar_attributes, data, chunk_size, tuple_trace_label + ) # make sure we populated the entire offset - assert not any_uninitialized(data.reshape(uid_calculator.skim_shape)[offset], lock) + assert not any_uninitialized( + data.reshape(uid_calculator.skim_shape)[offset], lock + ) - if multiprocess and not inject.get_injectable('locutor', False): + if multiprocess and not inject.get_injectable("locutor", False): return - write_results = not multiprocess or inject.get_injectable('locutor', False) + write_results = not multiprocess or inject.get_injectable("locutor", False) if write_results: if multiprocess: @@ -247,8 +271,10 @@ def initialize_tvpb(network_los, attribute_combinations, chunk_size): # and they must wait to coalesce at the end of the multiprocessing_step) # FIXME testing entire array is costly in terms of RAM) while any_uninitialized(data, lock): - logger.debug(f"{trace_label}.{multiprocessing.current_process().name} waiting for other processes" - f" to populate {num_uninitialized(data, lock)} uninitialized data values") + logger.debug( + f"{trace_label}.{multiprocessing.current_process().name} waiting for other processes" + f" to populate {num_uninitialized(data, lock)} uninitialized data values" + ) time.sleep(5) logger.info(f"{trace_label} writing static cache.") diff --git a/activitysim/abm/models/initialize_tours.py b/activitysim/abm/models/initialize_tours.py index af354a6906..0d8069c960 100644 --- a/activitysim/abm/models/initialize_tours.py +++ b/activitysim/abm/models/initialize_tours.py @@ -1,71 +1,75 @@ # ActivitySim # See full license in LICENSE.txt. import logging -import warnings import os -import pandas as pd - -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import expressions +import warnings -from activitysim.core.input import read_input_table +import pandas as pd from activitysim.abm.models.util import tour_frequency as tf +from activitysim.core import config, expressions, inject, pipeline, tracing +from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) -SURVEY_TOUR_ID = 'external_tour_id' -SURVEY_PARENT_TOUR_ID = 'external_parent_tour_id' -SURVEY_PARTICIPANT_ID = 'external_participant_id' -ASIM_TOUR_ID = 'tour_id' -ASIM_PARENT_TOUR_ID = 'parent_tour_id' -REQUIRED_TOUR_COLUMNS = set(['person_id', 'tour_category', 'tour_type']) +SURVEY_TOUR_ID = "external_tour_id" +SURVEY_PARENT_TOUR_ID = "external_parent_tour_id" +SURVEY_PARTICIPANT_ID = "external_participant_id" +ASIM_TOUR_ID = "tour_id" +ASIM_PARENT_TOUR_ID = "parent_tour_id" +REQUIRED_TOUR_COLUMNS = set(["person_id", "tour_category", "tour_type"]) def patch_tour_ids(tours): - def set_tour_index(tours, parent_tour_num_col, is_joint): - group_cols = ['person_id', 'tour_category', 'tour_type'] + group_cols = ["person_id", "tour_category", "tour_type"] - if 'parent_tour_num' in tours: - group_cols += ['parent_tour_num'] + if "parent_tour_num" in tours: + group_cols += ["parent_tour_num"] - tours['tour_type_num'] = \ + tours["tour_type_num"] = ( tours.sort_values(by=group_cols).groupby(group_cols).cumcount() + 1 + ) - return tf.set_tour_index(tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint) + return tf.set_tour_index( + tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint + ) - assert REQUIRED_TOUR_COLUMNS.issubset(set(tours.columns)), \ - f"Required columns missing from tours table: {REQUIRED_TOUR_COLUMNS.difference(set(tours.columns))}" + assert REQUIRED_TOUR_COLUMNS.issubset( + set(tours.columns) + ), f"Required columns missing from tours table: {REQUIRED_TOUR_COLUMNS.difference(set(tours.columns))}" # replace tour index with asim standard tour_ids (which are based on person_id and tour_type) if tours.index.name is not None: - tours.insert(loc=0, column='legacy_index', value=tours.index) + tours.insert(loc=0, column="legacy_index", value=tours.index) # FIXME - for now, only grok simple tours - assert set(tours.tour_category.unique()).issubset({'mandatory', 'non_mandatory'}) + assert set(tours.tour_category.unique()).issubset({"mandatory", "non_mandatory"}) # mandatory tours - mandatory_tours = \ - set_tour_index(tours[tours.tour_category == 'mandatory'], parent_tour_num_col=None, is_joint=False) + mandatory_tours = set_tour_index( + tours[tours.tour_category == "mandatory"], + parent_tour_num_col=None, + is_joint=False, + ) - assert mandatory_tours.index.name == 'tour_id' + assert mandatory_tours.index.name == "tour_id" # FIXME joint tours not implemented - assert not (tours.tour_category == 'joint').any() + assert not (tours.tour_category == "joint").any() # non_mandatory tours - non_mandatory_tours = \ - set_tour_index(tours[tours.tour_category == 'non_mandatory'], parent_tour_num_col=None, is_joint=False) + non_mandatory_tours = set_tour_index( + tours[tours.tour_category == "non_mandatory"], + parent_tour_num_col=None, + is_joint=False, + ) # FIXME atwork tours ot implemented - assert not (tours.tour_category == 'atwork').any() + assert not (tours.tour_category == "atwork").any() patched_tours = pd.concat([mandatory_tours, non_mandatory_tours]) - del patched_tours['tour_type_num'] + del patched_tours["tour_type_num"] return patched_tours @@ -73,36 +77,38 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): @inject.step() def initialize_tours(network_los, households, persons, trace_hh_id): - trace_label = 'initialize_tours' + trace_label = "initialize_tours" tours = read_input_table("tours") # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after. # FIXME could just always slice... - slice_happened = \ - inject.get_injectable('households_sample_size', 0) > 0 \ - or inject.get_injectable('households_sample_size', 0) > 0 + slice_happened = ( + inject.get_injectable("households_sample_size", 0) > 0 + or inject.get_injectable("households_sample_size", 0) > 0 + ) if slice_happened: logger.info("slicing tours %s" % (tours.shape,)) # keep all persons in the sampled households tours = tours[tours.person_id.isin(persons.index)] # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above - model_settings = config.read_model_settings('initialize_tours.yaml', mandatory=True) + model_settings = config.read_model_settings("initialize_tours.yaml", mandatory=True) expressions.assign_columns( df=tours, - model_settings=model_settings.get('annotate_tours'), - trace_label=tracing.extend_trace_label(trace_label, 'annotate_tours')) + model_settings=model_settings.get("annotate_tours"), + trace_label=tracing.extend_trace_label(trace_label, "annotate_tours"), + ) tours = patch_tour_ids(tours) - assert tours.index.name == 'tour_id' + assert tours.index.name == "tour_id" # replace table function with dataframe - inject.add_table('tours', tours) + inject.add_table("tours", tours) - pipeline.get_rn_generator().add_channel('tours', tours) + pipeline.get_rn_generator().add_channel("tours", tours) - tracing.register_traceable_table('tours', tours) + tracing.register_traceable_table("tours", tours) logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours") logger.debug(f"{len(households.index.unique())} unique household_ids in households") @@ -110,11 +116,11 @@ def initialize_tours(network_los, households, persons, trace_hh_id): tours_without_persons = ~tours.person_id.isin(persons.index) if tours_without_persons.any(): - logger.error(f"{tours_without_persons.sum()} tours out of {len(persons)} without persons\n" - f"{pd.Series({'person_id': tours_without_persons.index.values})}") + logger.error( + f"{tours_without_persons.sum()} tours out of {len(persons)} without persons\n" + f"{pd.Series({'person_id': tours_without_persons.index.values})}" + ) raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id") if trace_hh_id: - tracing.trace_df(tours, - label='initialize_tours', - warn_if_empty=True) + tracing.trace_df(tours, label="initialize_tours", warn_if_empty=True) diff --git a/activitysim/abm/models/joint_tour_composition.py b/activitysim/abm/models/joint_tour_composition.py index f21b8f8677..a041fb9e83 100644 --- a/activitysim/abm/models/joint_tour_composition.py +++ b/activitysim/abm/models/joint_tour_composition.py @@ -4,40 +4,30 @@ import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import expressions +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing from .util import estimation - from .util.overlap import hh_time_window_overlap - logger = logging.getLogger(__name__) def add_null_results(trace_label, tours): logger.info("Skipping %s: add_null_results" % trace_label) - tours['composition'] = '' + tours["composition"] = "" pipeline.replace_table("tours", tours) @inject.step() -def joint_tour_composition( - tours, households, persons, - chunk_size, - trace_hh_id): +def joint_tour_composition(tours, households, persons, chunk_size, trace_hh_id): """ This model predicts the makeup of the travel party (adults, children, or mixed). """ - trace_label = 'joint_tour_composition' - model_settings_file_name = 'joint_tour_composition.yaml' + trace_label = "joint_tour_composition" + model_settings_file_name = "joint_tour_composition.yaml" tours = tours.to_frame() - joint_tours = tours[tours.tour_category == 'joint'] + joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours if joint_tours.shape[0] == 0: @@ -45,7 +35,7 @@ def joint_tour_composition( return model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation('joint_tour_composition') + estimator = estimation.manager.begin_estimation("joint_tour_composition") # - only interested in households with joint_tours households = households.to_frame() @@ -54,28 +44,32 @@ def joint_tour_composition( persons = persons.to_frame() persons = persons[persons.household_id.isin(households.index)] - logger.info("Running joint_tour_composition with %d joint tours" % joint_tours.shape[0]) + logger.info( + "Running joint_tour_composition with %d joint tours" % joint_tours.shape[0] + ) # - run preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_dict = { - 'persons': persons, - 'hh_time_window_overlap': hh_time_window_overlap + "persons": persons, + "hh_time_window_overlap": hh_time_window_overlap, } expressions.assign_columns( df=households, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) - joint_tours_merged = pd.merge(joint_tours, households, - left_on='household_id', right_index=True, how='left') + joint_tours_merged = pd.merge( + joint_tours, households, left_on="household_id", right_index=True, how="left" + ) # - simple_simulate - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -95,29 +89,33 @@ def joint_tour_composition( locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='composition', - estimator=estimator) + trace_choice_name="composition", + estimator=estimator, + ) # convert indexes to alternative names choices = pd.Series(model_spec.columns[choices.values], index=choices.index) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'tours', 'composition') + choices = estimator.get_survey_values(choices, "tours", "composition") estimator.write_override_choices(choices) estimator.end_estimation() # add composition column to tours for tracing - joint_tours['composition'] = choices + joint_tours["composition"] = choices # reindex since we ran model on a subset of households - tours['composition'] = choices.reindex(tours.index).fillna('').astype(str) + tours["composition"] = choices.reindex(tours.index).fillna("").astype(str) pipeline.replace_table("tours", tours) - tracing.print_summary('joint_tour_composition', joint_tours.composition, - value_counts=True) + tracing.print_summary( + "joint_tour_composition", joint_tours.composition, value_counts=True + ) if trace_hh_id: - tracing.trace_df(joint_tours, - label="joint_tour_composition.joint_tours", - slicer='household_id') + tracing.trace_df( + joint_tours, + label="joint_tour_composition.joint_tours", + slicer="household_id", + ) diff --git a/activitysim/abm/models/joint_tour_destination.py b/activitysim/abm/models/joint_tour_destination.py index 9154fdfedc..02651d2a44 100644 --- a/activitysim/abm/models/joint_tour_destination.py +++ b/activitysim/abm/models/joint_tour_destination.py @@ -4,29 +4,18 @@ import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import simulate - +from activitysim.core import config, inject, pipeline, simulate, tracing from activitysim.core.util import assign_in_place -from .util import tour_destination -from .util import estimation - +from .util import estimation, tour_destination logger = logging.getLogger(__name__) @inject.step() def joint_tour_destination( - tours, - persons_merged, - households_merged, - network_los, - chunk_size, - trace_hh_id): + tours, persons_merged, households_merged, network_los, chunk_size, trace_hh_id +): """ Given the tour generation from the above, each tour needs to have a @@ -34,35 +23,42 @@ def joint_tour_destination( person that's making the tour) """ - trace_label = 'joint_tour_destination' - model_settings_file_name = 'joint_tour_destination.yaml' + trace_label = "joint_tour_destination" + model_settings_file_name = "joint_tour_destination.yaml" model_settings = config.read_model_settings(model_settings_file_name) - logsum_column_name = model_settings.get('DEST_CHOICE_LOGSUM_COLUMN_NAME') + logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None - sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') - want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None + sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") + want_sample_table = ( + config.setting("want_dest_choice_sample_tables") + and sample_table_name is not None + ) # choosers are tours - in a sense tours are choosing their destination tours = tours.to_frame() - joint_tours = tours[tours.tour_category == 'joint'] + joint_tours = tours[tours.tour_category == "joint"] persons_merged = persons_merged.to_frame() # - if no joint tours if joint_tours.shape[0] == 0: - tracing.no_results('joint_tour_destination') + tracing.no_results("joint_tour_destination") return - estimator = estimation.manager.begin_estimation('joint_tour_destination') + estimator = estimation.manager.begin_estimation("joint_tour_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') - estimator.write_spec(model_settings, tag='SPEC') + estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) - estimator.write_table(inject.get_injectable('size_terms'), 'size_terms', append=False) - estimator.write_table(inject.get_table('land_use').to_frame(), 'landuse', append=False) + estimator.write_table( + inject.get_injectable("size_terms"), "size_terms", append=False + ) + estimator.write_table( + inject.get_table("land_use").to_frame(), "landuse", append=False + ) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( @@ -73,24 +69,29 @@ def joint_tour_destination( model_settings, network_los, estimator, - chunk_size, trace_hh_id, trace_label) + chunk_size, + trace_hh_id, + trace_label, + ) if estimator: estimator.write_choices(choices_df.choice) - choices_df.choice = estimator.get_survey_values(choices_df.choice, 'tours', 'destination') + choices_df.choice = estimator.get_survey_values( + choices_df.choice, "tours", "destination" + ) estimator.write_override_choices(choices_df.choice) estimator.end_estimation() # add column as we want joint_tours table for tracing. - joint_tours['destination'] = choices_df.choice - assign_in_place(tours, joint_tours[['destination']]) + joint_tours["destination"] = choices_df.choice + assign_in_place(tours, joint_tours[["destination"]]) pipeline.replace_table("tours", tours) if want_logsums: - joint_tours[logsum_column_name] = choices_df['logsum'] + joint_tours[logsum_column_name] = choices_df["logsum"] assign_in_place(tours, joint_tours[[logsum_column_name]]) - tracing.print_summary('destination', joint_tours.destination, describe=True) + tracing.print_summary("destination", joint_tours.destination, describe=True) if want_sample_table: assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) @@ -98,5 +99,4 @@ def joint_tour_destination( pipeline.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df(joint_tours, - label="joint_tour_destination.joint_tours") + tracing.trace_df(joint_tours, label="joint_tour_destination.joint_tours") diff --git a/activitysim/abm/models/joint_tour_frequency.py b/activitysim/abm/models/joint_tour_frequency.py index d8930d9834..1039646651 100644 --- a/activitysim/abm/models/joint_tour_frequency.py +++ b/activitysim/abm/models/joint_tour_frequency.py @@ -5,15 +5,9 @@ import numpy as np import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import expressions +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing from .util import estimation - from .util.overlap import hh_time_window_overlap from .util.tour_frequency import process_joint_tours @@ -21,22 +15,21 @@ @inject.step() -def joint_tour_frequency( - households, persons, - chunk_size, - trace_hh_id): +def joint_tour_frequency(households, persons, chunk_size, trace_hh_id): """ This model predicts the frequency of making fully joint trips (see the alternatives above). """ - trace_label = 'joint_tour_frequency' - model_settings_file_name = 'joint_tour_frequency.yaml' + trace_label = "joint_tour_frequency" + model_settings_file_name = "joint_tour_frequency.yaml" - estimator = estimation.manager.begin_estimation('joint_tour_frequency') + estimator = estimation.manager.begin_estimation("joint_tour_frequency") model_settings = config.read_model_settings(model_settings_file_name) - alternatives = simulate.read_model_alts('joint_tour_frequency_alternatives.csv', set_index='alt') + alternatives = simulate.read_model_alts( + "joint_tour_frequency_alternatives.csv", set_index="alt" + ) # - only interested in households with more than one cdap travel_active person and # - at least one non-preschooler @@ -48,25 +41,28 @@ def joint_tour_frequency( persons = persons.to_frame() persons = persons[persons.household_id.isin(multi_person_households.index)] - logger.info("Running joint_tour_frequency with %d multi-person households" % - multi_person_households.shape[0]) + logger.info( + "Running joint_tour_frequency with %d multi-person households" + % multi_person_households.shape[0] + ) # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_dict = { - 'persons': persons, - 'hh_time_window_overlap': hh_time_window_overlap + "persons": persons, + "hh_time_window_overlap": hh_time_window_overlap, } expressions.assign_columns( df=multi_person_households, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -86,15 +82,18 @@ def joint_tour_frequency( locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='joint_tour_frequency', - estimator=estimator) + trace_choice_name="joint_tour_frequency", + estimator=estimator, + ) # convert indexes to alternative names choices = pd.Series(model_spec.columns[choices.values], index=choices.index) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'households', 'joint_tour_frequency') + choices = estimator.get_survey_values( + choices, "households", "joint_tour_frequency" + ) estimator.write_override_choices(choices) estimator.end_estimation() @@ -105,53 +104,63 @@ def joint_tour_frequency( # - so we arbitrarily choose the first person in the household # - to be point person for the purpose of generating an index and setting origin temp_point_persons = persons.loc[persons.PNUM == 1] - temp_point_persons['person_id'] = temp_point_persons.index - temp_point_persons = temp_point_persons.set_index('household_id') - temp_point_persons = temp_point_persons[['person_id', 'home_zone_id']] + temp_point_persons["person_id"] = temp_point_persons.index + temp_point_persons = temp_point_persons.set_index("household_id") + temp_point_persons = temp_point_persons[["person_id", "home_zone_id"]] - joint_tours = \ - process_joint_tours(choices, alternatives, temp_point_persons) + joint_tours = process_joint_tours(choices, alternatives, temp_point_persons) tours = pipeline.extend_table("tours", joint_tours) - tracing.register_traceable_table('tours', joint_tours) - pipeline.get_rn_generator().add_channel('tours', joint_tours) + tracing.register_traceable_table("tours", joint_tours) + pipeline.get_rn_generator().add_channel("tours", joint_tours) # - annotate households # we expect there to be an alt with no tours - which we can use to backfill non-travelers no_tours_alt = (alternatives.sum(axis=1) == 0).index[0] - households['joint_tour_frequency'] = choices.reindex(households.index).fillna(no_tours_alt).astype(str) - - households['num_hh_joint_tours'] = joint_tours.groupby('household_id').size().\ - reindex(households.index).fillna(0).astype(np.int8) + households["joint_tour_frequency"] = ( + choices.reindex(households.index).fillna(no_tours_alt).astype(str) + ) + + households["num_hh_joint_tours"] = ( + joint_tours.groupby("household_id") + .size() + .reindex(households.index) + .fillna(0) + .astype(np.int8) + ) pipeline.replace_table("households", households) - tracing.print_summary('joint_tour_frequency', households.joint_tour_frequency, - value_counts=True) + tracing.print_summary( + "joint_tour_frequency", households.joint_tour_frequency, value_counts=True + ) if trace_hh_id: - tracing.trace_df(households, - label="joint_tour_frequency.households") + tracing.trace_df(households, label="joint_tour_frequency.households") - tracing.trace_df(joint_tours, - label="joint_tour_frequency.joint_tours", - slicer='household_id') + tracing.trace_df( + joint_tours, label="joint_tour_frequency.joint_tours", slicer="household_id" + ) if estimator: - survey_tours = estimation.manager.get_survey_table('tours') - survey_tours = survey_tours[survey_tours.tour_category == 'joint'] + survey_tours = estimation.manager.get_survey_table("tours") + survey_tours = survey_tours[survey_tours.tour_category == "joint"] print(f"len(survey_tours) {len(survey_tours)}") print(f"len(joint_tours) {len(joint_tours)}") different = False - survey_tours_not_in_tours = survey_tours[~survey_tours.index.isin(joint_tours.index)] + survey_tours_not_in_tours = survey_tours[ + ~survey_tours.index.isin(joint_tours.index) + ] if len(survey_tours_not_in_tours) > 0: print(f"survey_tours_not_in_tours\n{survey_tours_not_in_tours}") different = True - tours_not_in_survey_tours = joint_tours[~joint_tours.index.isin(survey_tours.index)] + tours_not_in_survey_tours = joint_tours[ + ~joint_tours.index.isin(survey_tours.index) + ] if len(survey_tours_not_in_tours) > 0: print(f"tours_not_in_survey_tours\n{tours_not_in_survey_tours}") different = True diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index c69b5093c5..f60e1c2bbe 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -4,25 +4,22 @@ import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import logit -from activitysim.core import expressions -from activitysim.core import chunk - -from activitysim.core.util import assign_in_place +from activitysim.abm.models.util.canonical_ids import MAX_PARTICIPANT_PNUM +from activitysim.core import ( + chunk, + config, + expressions, + inject, + logit, + pipeline, + simulate, + tracing, +) +from activitysim.core.util import assign_in_place, reindex from .util import estimation - -from activitysim.core.util import reindex from .util.overlap import person_time_window_overlap -from activitysim.abm.models.util.canonical_ids import MAX_PARTICIPANT_PNUM - - logger = logging.getLogger(__name__) @@ -40,17 +37,21 @@ def joint_tour_participation_candidates(joint_tours, persons_merged): # - create candidates table candidates = pd.merge( - joint_tours.reset_index().rename(columns={'person_id': 'point_person_id'}), - persons_merged.reset_index().rename(columns={persons_merged.index.name: 'person_id'}), - left_on=['household_id'], right_on=['household_id']) + joint_tours.reset_index().rename(columns={"person_id": "point_person_id"}), + persons_merged.reset_index().rename( + columns={persons_merged.index.name: "person_id"} + ), + left_on=["household_id"], + right_on=["household_id"], + ) # should have all joint_tours - assert len(candidates['tour_id'].unique()) == joint_tours.shape[0] + assert len(candidates["tour_id"].unique()) == joint_tours.shape[0] # - filter out ineligible candidates (adults for children-only tours, and vice-versa) eligible = ~( - ((candidates.composition == 'adults') & ~candidates.adult) | - ((candidates.composition == 'children') & candidates.adult) + ((candidates.composition == "adults") & ~candidates.adult) + | ((candidates.composition == "children") & candidates.adult) ) candidates = candidates[eligible] @@ -58,10 +59,15 @@ def joint_tour_participation_candidates(joint_tours, persons_merged): # if this happens, participant_id may not be unique # channel random seeds will overlap at MAX_PARTICIPANT_PNUM (not probably a big deal) # and estimation infer will fail - assert candidates.PNUM.max() < MAX_PARTICIPANT_PNUM, \ - f"max persons.PNUM ({candidates.PNUM.max()}) > MAX_PARTICIPANT_PNUM ({MAX_PARTICIPANT_PNUM})" - candidates['participant_id'] = (candidates[joint_tours.index.name] * MAX_PARTICIPANT_PNUM) + candidates.PNUM - candidates.set_index('participant_id', drop=True, inplace=True, verify_integrity=True) + assert ( + candidates.PNUM.max() < MAX_PARTICIPANT_PNUM + ), f"max persons.PNUM ({candidates.PNUM.max()}) > MAX_PARTICIPANT_PNUM ({MAX_PARTICIPANT_PNUM})" + candidates["participant_id"] = ( + candidates[joint_tours.index.name] * MAX_PARTICIPANT_PNUM + ) + candidates.PNUM + candidates.set_index( + "participant_id", drop=True, inplace=True, verify_integrity=True + ) return candidates @@ -75,27 +81,34 @@ def get_tour_satisfaction(candidates, participate): candidates = candidates[participate] # if this happens, we would need to filter them out! - assert not ((candidates.composition == 'adults') & ~candidates.adult).any() - assert not ((candidates.composition == 'children') & candidates.adult).any() + assert not ((candidates.composition == "adults") & ~candidates.adult).any() + assert not ((candidates.composition == "children") & candidates.adult).any() # FIXME tour satisfaction - hack # annotate_households_cdap.csv says there has to be at least one non-preschooler in household # so presumably there also has to be at least one non-preschooler in joint tour # participates_in_jtf_model,(num_travel_active > 1) & (num_travel_active_non_preschoolers > 0) - cols = ['tour_id', 'composition', 'adult', 'person_is_preschool'] - - x = candidates[cols].groupby(['tour_id', 'composition'])\ - .agg(participants=('adult', 'size'), adults=('adult', 'sum'), preschoolers=('person_is_preschool', 'sum'))\ - .reset_index('composition') + cols = ["tour_id", "composition", "adult", "person_is_preschool"] + + x = ( + candidates[cols] + .groupby(["tour_id", "composition"]) + .agg( + participants=("adult", "size"), + adults=("adult", "sum"), + preschoolers=("person_is_preschool", "sum"), + ) + .reset_index("composition") + ) # satisfaction = \ # (x.composition == 'adults') & (x.participants > 1) | \ # (x.composition == 'children') & (x.participants > 1) & (x.preschoolers < x.participants) | \ # (x.composition == 'mixed') & (x.adults > 0) & (x.participants > x.adults) - satisfaction = \ - (x.composition != 'mixed') & (x.participants > 1) | \ - (x.composition == 'mixed') & (x.adults > 0) & (x.participants > x.adults) + satisfaction = (x.composition != "mixed") & (x.participants > 1) | ( + x.composition == "mixed" + ) & (x.adults > 0) & (x.participants > x.adults) satisfaction = satisfaction.reindex(tour_ids).fillna(False).astype(bool) @@ -144,22 +157,25 @@ def participants_chooser(probs, choosers, spec, trace_label): assert probs.index.equals(choosers.index) # choice is boolean (participate or not) - model_settings = config.read_model_settings('joint_tour_participation.yaml') + model_settings = config.read_model_settings("joint_tour_participation.yaml") - choice_col = model_settings.get('participation_choice', 'participate') - assert choice_col in spec.columns, \ - "couldn't find participation choice column '%s' in spec" + choice_col = model_settings.get("participation_choice", "participate") + assert ( + choice_col in spec.columns + ), "couldn't find participation choice column '%s' in spec" PARTICIPATE_CHOICE = spec.columns.get_loc(choice_col) - MAX_ITERATIONS = model_settings.get('max_participation_choice_iterations', 5000) + MAX_ITERATIONS = model_settings.get("max_participation_choice_iterations", 5000) - trace_label = tracing.extend_trace_label(trace_label, 'participants_chooser') + trace_label = tracing.extend_trace_label(trace_label, "participants_chooser") candidates = choosers.copy() choices_list = [] rands_list = [] num_tours_remaining = len(candidates.tour_id.unique()) - logger.info('%s %s joint tours to satisfy.', trace_label, num_tours_remaining,) + logger.info( + "%s %s joint tours to satisfy.", trace_label, num_tours_remaining, + ) iter = 0 while candidates.shape[0] > 0: @@ -167,16 +183,23 @@ def participants_chooser(probs, choosers, spec, trace_label): iter += 1 if iter > MAX_ITERATIONS: - logger.warning('%s max iterations exceeded (%s).', trace_label, MAX_ITERATIONS) - diagnostic_cols = ['tour_id', 'household_id', 'composition', 'adult'] + logger.warning( + "%s max iterations exceeded (%s).", trace_label, MAX_ITERATIONS + ) + diagnostic_cols = ["tour_id", "household_id", "composition", "adult"] unsatisfied_candidates = candidates[diagnostic_cols].join(probs) - tracing.write_csv(unsatisfied_candidates, - file_name='%s.UNSATISFIED' % trace_label, transpose=False) + tracing.write_csv( + unsatisfied_candidates, + file_name="%s.UNSATISFIED" % trace_label, + transpose=False, + ) print(unsatisfied_candidates.head(20)) assert False - choices, rands = logit.make_choices(probs, trace_label=trace_label, trace_choosers=choosers) - participate = (choices == PARTICIPATE_CHOICE) + choices, rands = logit.make_choices( + probs, trace_label=trace_label, trace_choosers=choosers + ) + participate = choices == PARTICIPATE_CHOICE # satisfaction indexed by tour_id tour_satisfaction = get_tour_satisfaction(candidates, participate) @@ -195,8 +218,10 @@ def participants_chooser(probs, choosers, spec, trace_label): probs = probs[~satisfied] candidates = candidates[~satisfied] - logger.debug(f"{trace_label} iteration {iter} : " - f"{num_tours_satisfied_this_iter} joint tours satisfied {num_tours_remaining} remaining") + logger.debug( + f"{trace_label} iteration {iter} : " + f"{num_tours_satisfied_this_iter} joint tours satisfied {num_tours_remaining} remaining" + ) choices = pd.concat(choices_list) rands = pd.concat(rands_list).reindex(choosers.index) @@ -207,7 +232,9 @@ def participants_chooser(probs, choosers, spec, trace_label): assert choices.index.equals(choosers.index) assert rands.index.equals(choosers.index) - logger.info('%s %s iterations to satisfy all joint tours.', trace_label, iter,) + logger.info( + "%s %s iterations to satisfy all joint tours.", trace_label, iter, + ) return choices, rands @@ -215,11 +242,12 @@ def participants_chooser(probs, choosers, spec, trace_label): def annotate_jtp(model_settings, trace_label): # - annotate persons - persons = inject.get_table('persons').to_frame() + persons = inject.get_table("persons").to_frame() expressions.assign_columns( df=persons, - model_settings=model_settings.get('annotate_persons'), - trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) + model_settings=model_settings.get("annotate_persons"), + trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), + ) pipeline.replace_table("persons", persons) @@ -227,10 +255,10 @@ def add_null_results(model_settings, trace_label): logger.info("Skipping %s: joint tours", trace_label) # participants table is used downstream in non-joint tour expressions - PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id', 'participant_num'] + PARTICIPANT_COLS = ["tour_id", "household_id", "person_id", "participant_num"] participants = pd.DataFrame(columns=PARTICIPANT_COLS) - participants.index.name = 'participant_id' + participants.index.name = "participant_id" pipeline.replace_table("joint_tour_participants", participants) # - run annotations @@ -238,19 +266,16 @@ def add_null_results(model_settings, trace_label): @inject.step() -def joint_tour_participation( - tours, persons_merged, - chunk_size, - trace_hh_id): +def joint_tour_participation(tours, persons_merged, chunk_size, trace_hh_id): """ Predicts for each eligible person to participate or not participate in each joint tour. """ - trace_label = 'joint_tour_participation' - model_settings_file_name = 'joint_tour_participation.yaml' + trace_label = "joint_tour_participation" + model_settings_file_name = "joint_tour_participation.yaml" model_settings = config.read_model_settings(model_settings_file_name) tours = tours.to_frame() - joint_tours = tours[tours.tour_category == 'joint'] + joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours if joint_tours.shape[0] == 0: @@ -261,32 +286,35 @@ def joint_tour_participation( # - create joint_tour_participation_candidates table candidates = joint_tour_participation_candidates(joint_tours, persons_merged) - tracing.register_traceable_table('joint_tour_participants', candidates) - pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates) + tracing.register_traceable_table("joint_tour_participants", candidates) + pipeline.get_rn_generator().add_channel("joint_tour_participants", candidates) - logger.info("Running joint_tours_participation with %d potential participants (candidates)" % - candidates.shape[0]) + logger.info( + "Running joint_tours_participation with %d potential participants (candidates)" + % candidates.shape[0] + ) # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_dict = { - 'person_time_window_overlap': person_time_window_overlap, - 'persons': persons_merged + "person_time_window_overlap": person_time_window_overlap, + "persons": persons_merged, } expressions.assign_columns( df=candidates, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) # - simple_simulate - estimator = estimation.manager.begin_estimation('joint_tour_participation') + estimator = estimation.manager.begin_estimation("joint_tour_participation") - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -300,10 +328,12 @@ def joint_tour_participation( estimator.write_choosers(candidates) # add tour-based chunk_id so we can chunk all trips in tour together - assert 'chunk_id' not in candidates.columns + assert "chunk_id" not in candidates.columns unique_household_ids = candidates.household_id.unique() - household_chunk_ids = pd.Series(range(len(unique_household_ids)), index=unique_household_ids) - candidates['chunk_id'] = reindex(household_chunk_ids, candidates.household_id) + household_chunk_ids = pd.Series( + range(len(unique_household_ids)), index=unique_household_ids + ) + candidates["chunk_id"] = reindex(household_chunk_ids, candidates.household_id) choices = simulate.simple_simulate_by_chunk_id( choosers=candidates, @@ -312,28 +342,34 @@ def joint_tour_participation( locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='participation', + trace_choice_name="participation", custom_chooser=participants_chooser, - estimator=estimator) + estimator=estimator, + ) # choice is boolean (participate or not) - choice_col = model_settings.get('participation_choice', 'participate') - assert choice_col in model_spec.columns, \ - "couldn't find participation choice column '%s' in spec" + choice_col = model_settings.get("participation_choice", "participate") + assert ( + choice_col in model_spec.columns + ), "couldn't find participation choice column '%s' in spec" PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col) - participate = (choices == PARTICIPATE_CHOICE) + participate = choices == PARTICIPATE_CHOICE if estimator: estimator.write_choices(choices) # we override the 'participate' boolean series, instead of raw alternative index in 'choices' series # its value depends on whether the candidate's 'participant_id' is in the joint_tour_participant index - survey_participants_df = estimator.get_survey_table('joint_tour_participants') - participate = pd.Series(choices.index.isin(survey_participants_df.index.values), index=choices.index) + survey_participants_df = estimator.get_survey_table("joint_tour_participants") + participate = pd.Series( + choices.index.isin(survey_participants_df.index.values), index=choices.index + ) # but estimation software wants to know the choices value (alternative index) - choices = participate.replace({True: PARTICIPATE_CHOICE, False: 1-PARTICIPATE_CHOICE}) + choices = participate.replace( + {True: PARTICIPATE_CHOICE, False: 1 - PARTICIPATE_CHOICE} + ) # estimator.write_override_choices(participate) # write choices as boolean participate estimator.write_override_choices(choices) # write choices as int alt indexes @@ -344,30 +380,33 @@ def joint_tour_participation( assert tour_satisfaction.all() - candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id) + candidates["satisfied"] = reindex(tour_satisfaction, candidates.tour_id) - PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id'] + PARTICIPANT_COLS = ["tour_id", "household_id", "person_id"] participants = candidates[participate][PARTICIPANT_COLS].copy() # assign participant_num # FIXME do we want something smarter than the participant with the lowest person_id? - participants['participant_num'] = \ - participants.sort_values(by=['tour_id', 'person_id']).\ - groupby('tour_id').cumcount() + 1 + participants["participant_num"] = ( + participants.sort_values(by=["tour_id", "person_id"]) + .groupby("tour_id") + .cumcount() + + 1 + ) pipeline.replace_table("joint_tour_participants", participants) # drop channel as we aren't using any more (and it has candidates that weren't chosen) - pipeline.get_rn_generator().drop_channel('joint_tour_participants') + pipeline.get_rn_generator().drop_channel("joint_tour_participants") # - assign joint tour 'point person' (participant_num == 1) point_persons = participants[participants.participant_num == 1] - joint_tours['person_id'] = point_persons.set_index('tour_id').person_id + joint_tours["person_id"] = point_persons.set_index("tour_id").person_id # update number_of_participants which was initialized to 1 - joint_tours['number_of_participants'] = participants.groupby('tour_id').size() + joint_tours["number_of_participants"] = participants.groupby("tour_id").size() - assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']]) + assign_in_place(tours, joint_tours[["person_id", "number_of_participants"]]) pipeline.replace_table("tours", tours) @@ -375,8 +414,6 @@ def joint_tour_participation( annotate_jtp(model_settings, trace_label) if trace_hh_id: - tracing.trace_df(participants, - label="joint_tour_participation.participants") + tracing.trace_df(participants, label="joint_tour_participation.participants") - tracing.trace_df(joint_tours, - label="joint_tour_participation.joint_tours") + tracing.trace_df(joint_tours, label="joint_tour_participation.joint_tours") diff --git a/activitysim/abm/models/joint_tour_scheduling.py b/activitysim/abm/models/joint_tour_scheduling.py index 3b41f898e5..32c444c76c 100644 --- a/activitysim/abm/models/joint_tour_scheduling.py +++ b/activitysim/abm/models/joint_tour_scheduling.py @@ -4,39 +4,27 @@ import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import expressions +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core.util import assign_in_place, reindex from .util import estimation - from .util.vectorize_tour_scheduling import vectorize_joint_tour_scheduling -from activitysim.core.util import assign_in_place -from activitysim.core.util import reindex logger = logging.getLogger(__name__) @inject.step() -def joint_tour_scheduling( - tours, - persons_merged, - tdd_alts, - chunk_size, - trace_hh_id): +def joint_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_id): """ This model predicts the departure time and duration of each joint tour """ - trace_label = 'joint_tour_scheduling' + trace_label = "joint_tour_scheduling" - model_settings_file_name = 'joint_tour_scheduling.yaml' + model_settings_file_name = "joint_tour_scheduling.yaml" model_settings = config.read_model_settings(model_settings_file_name) tours = tours.to_frame() - joint_tours = tours[tours.tour_category == 'joint'] + joint_tours = tours[tours.tour_category == "joint"] # - if no joint tours if joint_tours.shape[0] == 0: @@ -44,7 +32,7 @@ def joint_tour_scheduling( return # use inject.get_table as this won't exist if there are no joint_tours - joint_tour_participants = inject.get_table('joint_tour_participants').to_frame() + joint_tour_participants = inject.get_table("joint_tour_participants").to_frame() persons_merged = persons_merged.to_frame() @@ -64,7 +52,7 @@ def joint_tour_scheduling( constants = config.get_model_constants(model_settings) # - run preprocessor to annotate choosers - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_d = {} @@ -75,13 +63,14 @@ def joint_tour_scheduling( df=joint_tours, model_settings=preprocessor_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) timetable = inject.get_injectable("timetable") - estimator = estimation.manager.begin_estimation('joint_tour_scheduling') + estimator = estimation.manager.begin_estimation("joint_tour_scheduling") - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -92,45 +81,55 @@ def joint_tour_scheduling( timetable.begin_transaction(estimator) choices = vectorize_joint_tour_scheduling( - joint_tours, joint_tour_participants, + joint_tours, + joint_tour_participants, persons_merged, - tdd_alts, timetable, + tdd_alts, + timetable, spec=model_spec, model_settings=model_settings, estimator=estimator, chunk_size=chunk_size, - trace_label=trace_label) + trace_label=trace_label, + ) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'tours', 'tdd') + choices = estimator.get_survey_values(choices, "tours", "tdd") estimator.write_override_choices(choices) estimator.end_estimation() # update timetable to reflect the override choices (assign tours in tour_num order) timetable.rollback() - for tour_num, nth_tours in joint_tours.groupby('tour_num', sort=True): - nth_participants = \ - joint_tour_participants[joint_tour_participants.tour_id.isin(nth_tours.index)] - - estimator.log("assign timetable for %s participants in %s tours with tour_num %s" % - (len(nth_participants), len(nth_tours), tour_num)) + for tour_num, nth_tours in joint_tours.groupby("tour_num", sort=True): + nth_participants = joint_tour_participants[ + joint_tour_participants.tour_id.isin(nth_tours.index) + ] + + estimator.log( + "assign timetable for %s participants in %s tours with tour_num %s" + % (len(nth_participants), len(nth_tours), tour_num) + ) # - update timetables of all joint tour participants - timetable.assign(nth_participants.person_id, reindex(choices, nth_participants.tour_id)) + timetable.assign( + nth_participants.person_id, reindex(choices, nth_participants.tour_id) + ) timetable.replace_table() # choices are tdd alternative ids # we want to add start, end, and duration columns to tours, which we have in tdd_alts table - choices = pd.merge(choices.to_frame('tdd'), tdd_alts, left_on=['tdd'], right_index=True, how='left') + choices = pd.merge( + choices.to_frame("tdd"), tdd_alts, left_on=["tdd"], right_index=True, how="left" + ) assign_in_place(tours, choices) pipeline.replace_table("tours", tours) # updated df for tracing - joint_tours = tours[tours.tour_category == 'joint'] + joint_tours = tours[tours.tour_category == "joint"] if trace_hh_id: - tracing.trace_df(joint_tours, - label="joint_tour_scheduling", - slicer='household_id') + tracing.trace_df( + joint_tours, label="joint_tour_scheduling", slicer="household_id" + ) diff --git a/activitysim/abm/models/location_choice.py b/activitysim/abm/models/location_choice.py index 5b3b7d1e47..c433baf4b9 100644 --- a/activitysim/abm/models/location_choice.py +++ b/activitysim/abm/models/location_choice.py @@ -2,29 +2,30 @@ # See full license in LICENSE.txt. import logging -# import multiprocessing - -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import mem -from activitysim.core import expressions -from activitysim.core import los -from activitysim.core import logit - -from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.abm.tables import shadow_pricing +from activitysim.core import ( + config, + expressions, + inject, + logit, + los, + mem, + pipeline, + simulate, + tracing, +) from activitysim.core.interaction_sample import interaction_sample +from activitysim.core.interaction_sample_simulate import interaction_sample_simulate -from .util import logsums as logsum from .util import estimation +from .util import logsums as logsum from .util import tour_destination -from activitysim.abm.tables import shadow_pricing +# import multiprocessing + """ The school/workplace location model predicts the zones in which various people will @@ -78,7 +79,7 @@ logger = logging.getLogger(__name__) # column name of logsum in df returned by run_location_logsums (here because used in more than one place) -ALT_LOGSUM = 'mode_choice_logsum' +ALT_LOGSUM = "mode_choice_logsum" def write_estimation_specs(estimator, model_settings, settings_file): @@ -93,23 +94,29 @@ def write_estimation_specs(estimator, model_settings, settings_file): estimator.write_model_settings(model_settings, settings_file) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') - estimator.write_spec(model_settings, tag='SPEC') + estimator.write_spec(model_settings, tag="SPEC") estimator.write_coefficients(model_settings=model_settings) - estimator.write_table(inject.get_injectable('size_terms'), 'size_terms', append=False) - estimator.write_table(inject.get_table('land_use').to_frame(), 'landuse', append=False) + estimator.write_table( + inject.get_injectable("size_terms"), "size_terms", append=False + ) + estimator.write_table( + inject.get_table("land_use").to_frame(), "landuse", append=False + ) def _location_sample( - segment_name, - choosers, - alternatives, - skims, - estimator, - model_settings, - alt_dest_col_name, - chunk_size, chunk_tag, - trace_label): + segment_name, + choosers, + alternatives, + skims, + estimator, + model_settings, + alt_dest_col_name, + chunk_size, + chunk_tag, + trace_label, +): """ select a sample of alternative locations. @@ -132,23 +139,29 @@ def _location_sample( logger.info("Running %s with %d persons" % (trace_label, len(choosers.index))) sample_size = model_settings["SAMPLE_SIZE"] - if config.setting('disable_destination_sampling', False) or (estimator and estimator.want_unsampled_alternatives): + if config.setting("disable_destination_sampling", False) or ( + estimator and estimator.want_unsampled_alternatives + ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count - logger.info("Estimation mode for %s using unsampled alternatives short_circuit_choices" % (trace_label,)) + logger.info( + "Estimation mode for %s using unsampled alternatives short_circuit_choices" + % (trace_label,) + ) sample_size = 0 - locals_d = { - 'skims': skims, - 'segment_size': segment_name - } + locals_d = {"skims": skims, "segment_size": segment_name} constants = config.get_model_constants(model_settings) locals_d.update(constants) - spec = simulate.spec_for_segment(model_settings, spec_id='SAMPLE_SPEC', - segment_name=segment_name, estimator=estimator) + spec = simulate.spec_for_segment( + model_settings, + spec_id="SAMPLE_SPEC", + segment_name=segment_name, + estimator=estimator, + ) # here since presumably we want this when called for either sample or presample - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_sample( choosers, @@ -161,23 +174,26 @@ def _location_sample( locals_d=locals_d, chunk_size=chunk_size, chunk_tag=chunk_tag, - trace_label=trace_label) + trace_label=trace_label, + ) return choices def location_sample( - segment_name, - persons_merged, - network_los, - dest_size_terms, - estimator, - model_settings, - chunk_size, chunk_tag, - trace_label): + segment_name, + persons_merged, + network_los, + dest_size_terms, + estimator, + model_settings, + chunk_size, + chunk_tag, + trace_label, +): # FIXME - MEMORY HACK - only include columns actually used in spec - chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS'] + chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] choosers = persons_merged[chooser_columns] # create wrapper with keys for this lookup - in this case there is a home_zone_id in the choosers @@ -185,9 +201,9 @@ def location_sample( # (logit.interaction_dataset suffixes duplicate chooser column with '_chooser') # the skims will be available under the name "skims" for any @ expressions skim_dict = network_los.get_default_skim_dict() - skims = skim_dict.wrap('home_zone_id', 'zone_id') + skims = skim_dict.wrap("home_zone_id", "zone_id") - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _location_sample( segment_name, @@ -197,16 +213,18 @@ def location_sample( estimator, model_settings, alt_dest_col_name, - chunk_size, chunk_tag, - trace_label) + chunk_size, + chunk_tag, + trace_label, + ) return choices -DEST_TAZ = 'dest_TAZ' -HOME_TAZ = 'TAZ' -HOME_MAZ = 'home_zone_id' -DEST_MAZ = 'dest_MAZ' +DEST_TAZ = "dest_TAZ" +HOME_TAZ = "TAZ" +HOME_MAZ = "home_zone_id" +DEST_MAZ = "dest_MAZ" def aggregate_size_terms(dest_size_terms, network_los): @@ -217,23 +235,36 @@ def aggregate_size_terms(dest_size_terms, network_los): MAZ_size_terms = dest_size_terms.copy() # add crosswalk DEST_TAZ column to MAZ_size_terms - maz_to_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].set_index('MAZ').sort_values(by='TAZ').TAZ + maz_to_taz = ( + network_los.maz_taz_df[["MAZ", "TAZ"]] + .set_index("MAZ") + .sort_values(by="TAZ") + .TAZ + ) MAZ_size_terms[DEST_TAZ] = MAZ_size_terms.index.map(maz_to_taz) - weighted_average_cols = ['shadow_price_size_term_adjustment', 'shadow_price_utility_adjustment'] + weighted_average_cols = [ + "shadow_price_size_term_adjustment", + "shadow_price_utility_adjustment", + ] for c in weighted_average_cols: - MAZ_size_terms[c] *= MAZ_size_terms['size_term'] # weighted average + MAZ_size_terms[c] *= MAZ_size_terms["size_term"] # weighted average TAZ_size_terms = MAZ_size_terms.groupby(DEST_TAZ).agg( - {'size_term': 'sum', - 'shadow_price_size_term_adjustment': 'sum', - 'shadow_price_utility_adjustment': 'sum'}) + { + "size_term": "sum", + "shadow_price_size_term_adjustment": "sum", + "shadow_price_utility_adjustment": "sum", + } + ) for c in weighted_average_cols: - TAZ_size_terms[c] /= TAZ_size_terms['size_term'] # weighted average + TAZ_size_terms[c] /= TAZ_size_terms["size_term"] # weighted average if TAZ_size_terms.isna().any(axis=None): - logger.warning(f"TAZ_size_terms with NAN values\n{TAZ_size_terms[TAZ_size_terms.isna().any(axis=1)]}") + logger.warning( + f"TAZ_size_terms with NAN values\n{TAZ_size_terms[TAZ_size_terms.isna().any(axis=1)]}" + ) assert not TAZ_size_terms.isna(axis=None).any() # print(f"TAZ_size_terms\n{TAZ_size_terms}") @@ -243,8 +274,10 @@ def aggregate_size_terms(dest_size_terms, network_los): # 3 20.511 1.0 0 # 4 19.737 1.0 0 - MAZ_size_terms = MAZ_size_terms[[DEST_TAZ, 'size_term']].reset_index(drop=False) - MAZ_size_terms = MAZ_size_terms.sort_values([DEST_TAZ, 'zone_id']).reset_index(drop=True) + MAZ_size_terms = MAZ_size_terms[[DEST_TAZ, "size_term"]].reset_index(drop=False) + MAZ_size_terms = MAZ_size_terms.sort_values([DEST_TAZ, "zone_id"]).reset_index( + drop=True + ) # print(f"MAZ_size_terms\n{MAZ_size_terms}") # zone_id dest_TAZ size_term @@ -257,20 +290,22 @@ def aggregate_size_terms(dest_size_terms, network_los): def location_presample( - segment_name, - persons_merged, - network_los, - dest_size_terms, - estimator, - model_settings, - chunk_size, chunk_tag, - trace_label): - - trace_label = tracing.extend_trace_label(trace_label, 'presample') + segment_name, + persons_merged, + network_los, + dest_size_terms, + estimator, + model_settings, + chunk_size, + chunk_tag, + trace_label, +): + + trace_label = tracing.extend_trace_label(trace_label, "presample") logger.info(f"{trace_label} location_presample") - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] assert DEST_TAZ != alt_dest_col_name MAZ_size_terms, TAZ_size_terms = aggregate_size_terms(dest_size_terms, network_los) @@ -278,19 +313,21 @@ def location_presample( # convert MAZ zone_id to 'TAZ' in choosers (persons_merged) # persons_merged[HOME_TAZ] = persons_merged[HOME_MAZ].map(maz_to_taz) assert HOME_MAZ in persons_merged - assert HOME_TAZ in persons_merged # 'TAZ' should already be in persons_merged from land_use + assert ( + HOME_TAZ in persons_merged + ) # 'TAZ' should already be in persons_merged from land_use # FIXME - MEMORY HACK - only include columns actually used in spec # FIXME we don't actually require that land_use provide a TAZ crosswalk # FIXME maybe we should add it for multi-zone (from maz_taz) if missing? - chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS'] + chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] chooser_columns = [HOME_TAZ if c == HOME_MAZ else c for c in chooser_columns] choosers = persons_merged[chooser_columns] # create wrapper with keys for this lookup - in this case there is a HOME_TAZ in the choosers # and a DEST_TAZ in the alternatives which get merged during interaction # the skims will be available under the name "skims" for any @ expressions - skim_dict = network_los.get_skim_dict('taz') + skim_dict = network_los.get_skim_dict("taz") skims = skim_dict.wrap(HOME_TAZ, DEST_TAZ) taz_sample = _location_sample( @@ -301,8 +338,10 @@ def location_presample( estimator, model_settings, DEST_TAZ, - chunk_size, chunk_tag, - trace_label) + chunk_size, + chunk_tag, + trace_label, + ) # print(f"taz_sample\n{taz_sample}") # dest_TAZ prob pick_count @@ -313,7 +352,9 @@ def location_presample( # 55227 20 0.035548 3 # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total - maz_choices = tour_destination.choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label) + maz_choices = tour_destination.choose_MAZ_for_TAZ( + taz_sample, MAZ_size_terms, trace_label + ) assert DEST_MAZ in maz_choices maz_choices = maz_choices.rename(columns={DEST_MAZ: alt_dest_col_name}) @@ -322,14 +363,16 @@ def location_presample( def run_location_sample( - segment_name, - persons_merged, - network_los, - dest_size_terms, - estimator, - model_settings, - chunk_size, chunk_tag, - trace_label): + segment_name, + persons_merged, + network_los, + dest_size_terms, + estimator, + model_settings, + chunk_size, + chunk_tag, + trace_label, +): """ select a sample of alternative locations. @@ -348,20 +391,27 @@ def run_location_sample( 23751, 14, 0.972732479292, 2 """ - logger.debug(f"dropping {(~(dest_size_terms.size_term > 0)).sum()} " - f"of {len(dest_size_terms)} rows where size_term is zero") + logger.debug( + f"dropping {(~(dest_size_terms.size_term > 0)).sum()} " + f"of {len(dest_size_terms)} rows where size_term is zero" + ) dest_size_terms = dest_size_terms[dest_size_terms.size_term > 0] # by default, enable presampling for multizone systems, unless they disable it in settings file pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting('want_dest_choice_presampling', True): + if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): pre_sample_taz = False - logger.info(f"Disabled destination zone presampling for {trace_label} " - f"because 'want_dest_choice_presampling' setting is False") + logger.info( + f"Disabled destination zone presampling for {trace_label} " + f"because 'want_dest_choice_presampling' setting is False" + ) if pre_sample_taz: - logger.info("Running %s location_presample with %d persons" % (trace_label, len(persons_merged))) + logger.info( + "Running %s location_presample with %d persons" + % (trace_label, len(persons_merged)) + ) choices = location_presample( segment_name, @@ -371,8 +421,9 @@ def run_location_sample( estimator, model_settings, chunk_size, - chunk_tag=f'{chunk_tag}.presample', - trace_label=trace_label) + chunk_tag=f"{chunk_tag}.presample", + trace_label=trace_label, + ) else: @@ -384,20 +435,23 @@ def run_location_sample( estimator, model_settings, chunk_size, - chunk_tag=f'{chunk_tag}.sample', - trace_label=trace_label) + chunk_tag=f"{chunk_tag}.sample", + trace_label=trace_label, + ) return choices def run_location_logsums( - segment_name, - persons_merged_df, - network_los, - location_sample_df, - model_settings, - chunk_size, chunk_tag, - trace_label): + segment_name, + persons_merged_df, + network_los, + location_sample_df, + model_settings, + chunk_size, + chunk_tag, + trace_label, +): """ add logsum column to existing location_sample table @@ -421,28 +475,33 @@ def run_location_logsums( assert not location_sample_df.empty - logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS']) + logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) # FIXME - MEMORY HACK - only include columns actually used in spec - persons_merged_df = \ - logsum.filter_chooser_columns(persons_merged_df, logsum_settings, model_settings) + persons_merged_df = logsum.filter_chooser_columns( + persons_merged_df, logsum_settings, model_settings + ) - logger.info("Running %s with %s rows" % (trace_label, len(location_sample_df.index))) + logger.info( + "Running %s with %s rows" % (trace_label, len(location_sample_df.index)) + ) - choosers = location_sample_df.join(persons_merged_df, how='left') + choosers = location_sample_df.join(persons_merged_df, how="left") - tour_purpose = model_settings['LOGSUM_TOUR_PURPOSE'] + tour_purpose = model_settings["LOGSUM_TOUR_PURPOSE"] if isinstance(tour_purpose, dict): tour_purpose = tour_purpose[segment_name] logsums = logsum.compute_logsums( choosers, tour_purpose, - logsum_settings, model_settings, + logsum_settings, + model_settings, network_los, chunk_size, chunk_tag, - trace_label) + trace_label, + ) # "add_column series should have an index matching the table to which it is being added" # when the index has duplicates, however, in the special case that the series index exactly @@ -454,16 +513,18 @@ def run_location_logsums( def run_location_simulate( - segment_name, - persons_merged, - location_sample_df, - network_los, - dest_size_terms, - want_logsums, - estimator, - model_settings, - chunk_size, chunk_tag, - trace_label): + segment_name, + persons_merged, + location_sample_df, + network_los, + dest_size_terms, + want_logsums, + estimator, + model_settings, + chunk_size, + chunk_tag, + trace_label, +): """ run location model on location_sample annotated with mode_choice logsum to select a dest zone from sample alternatives @@ -479,16 +540,20 @@ def run_location_simulate( assert not persons_merged.empty # FIXME - MEMORY HACK - only include columns actually used in spec - chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS'] + chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] choosers = persons_merged[chooser_columns] - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] # alternatives are pre-sampled and annotated with logsums and pick_count # but we have to merge additional alt columns into alt sample list - alternatives = \ - pd.merge(location_sample_df, dest_size_terms, - left_on=alt_dest_col_name, right_index=True, how="left") + alternatives = pd.merge( + location_sample_df, + dest_size_terms, + left_on=alt_dest_col_name, + right_index=True, + how="left", + ) logger.info("Running %s with %d persons" % (trace_label, len(choosers))) @@ -496,12 +561,9 @@ def run_location_simulate( # and a zone_id in the alternatives which get merged during interaction # the skims will be available under the name "skims" for any @ expressions skim_dict = network_los.get_default_skim_dict() - skims = skim_dict.wrap('home_zone_id', alt_dest_col_name) + skims = skim_dict.wrap("home_zone_id", alt_dest_col_name) - locals_d = { - 'skims': skims, - 'segment_size': segment_name - } + locals_d = {"skims": skims, "segment_size": segment_name} constants = config.get_model_constants(model_settings) if constants is not None: locals_d.update(constants) @@ -512,9 +574,11 @@ def run_location_simulate( estimator.set_alt_id(alt_dest_col_name) estimator.write_interaction_sample_alternatives(alternatives) - spec = simulate.spec_for_segment(model_settings, spec_id='SPEC', segment_name=segment_name, estimator=estimator) + spec = simulate.spec_for_segment( + model_settings, spec_id="SPEC", segment_name=segment_name, estimator=estimator + ) - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_sample_simulate( choosers, @@ -525,15 +589,17 @@ def run_location_simulate( want_logsums=want_logsums, skims=skims, locals_d=locals_d, - chunk_size=chunk_size, chunk_tag=chunk_tag, + chunk_size=chunk_size, + chunk_tag=chunk_tag, trace_label=trace_label, - trace_choice_name=model_settings['DEST_CHOICE_COLUMN_NAME'], - estimator=estimator) + trace_choice_name=model_settings["DEST_CHOICE_COLUMN_NAME"], + estimator=estimator, + ) if not want_logsums: # for consistency, always return a dataframe with canonical column name assert isinstance(choices, pd.Series) - choices = choices.to_frame('choice') + choices = choices.to_frame("choice") assert isinstance(choices, pd.DataFrame) @@ -541,16 +607,18 @@ def run_location_simulate( def run_location_choice( - persons_merged_df, - network_los, - shadow_price_calculator, - want_logsums, - want_sample_table, - estimator, - model_settings, - chunk_size, chunk_tag, - trace_hh_id, trace_label - ): + persons_merged_df, + network_los, + shadow_price_calculator, + want_logsums, + want_sample_table, + estimator, + model_settings, + chunk_size, + chunk_tag, + trace_hh_id, + trace_label, +): """ Run the three-part location choice algorithm to generate a location choice for each chooser @@ -580,74 +648,89 @@ def run_location_choice( logsums optional & only returned if DEST_CHOICE_LOGSUM_COLUMN_NAME specified in model_settings """ - chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME'] + chooser_segment_column = model_settings["CHOOSER_SEGMENT_COLUMN_NAME"] # maps segment names to compact (integer) ids - segment_ids = model_settings['SEGMENT_IDS'] + segment_ids = model_settings["SEGMENT_IDS"] choices_list = [] sample_list = [] for segment_name, segment_id in segment_ids.items(): - choosers = persons_merged_df[persons_merged_df[chooser_segment_column] == segment_id] + choosers = persons_merged_df[ + persons_merged_df[chooser_segment_column] == segment_id + ] # size_term and shadow price adjustment - one row per zone dest_size_terms = shadow_price_calculator.dest_size_terms(segment_name) - assert dest_size_terms.index.is_monotonic_increasing, \ - f"shadow_price_calculator.dest_size_terms({segment_name}) not monotonic_increasing" + assert ( + dest_size_terms.index.is_monotonic_increasing + ), f"shadow_price_calculator.dest_size_terms({segment_name}) not monotonic_increasing" if choosers.shape[0] == 0: logger.info(f"{trace_label} skipping segment {segment_name}: no choosers") continue # - location_sample - location_sample_df = \ - run_location_sample( - segment_name, - choosers, - network_los, - dest_size_terms, - estimator, - model_settings, - chunk_size, - chunk_tag, # run_location_sample will add appropriate suffix for sample or presample - trace_label=tracing.extend_trace_label(trace_label, 'sample.%s' % segment_name)) + location_sample_df = run_location_sample( + segment_name, + choosers, + network_los, + dest_size_terms, + estimator, + model_settings, + chunk_size, + chunk_tag, # run_location_sample will add appropriate suffix for sample or presample + trace_label=tracing.extend_trace_label( + trace_label, "sample.%s" % segment_name + ), + ) # - location_logsums - location_sample_df = \ - run_location_logsums( - segment_name, - choosers, - network_los, - location_sample_df, - model_settings, - chunk_size, chunk_tag=f'{chunk_tag}.logsums', - trace_label=tracing.extend_trace_label(trace_label, 'logsums.%s' % segment_name)) + location_sample_df = run_location_logsums( + segment_name, + choosers, + network_los, + location_sample_df, + model_settings, + chunk_size, + chunk_tag=f"{chunk_tag}.logsums", + trace_label=tracing.extend_trace_label( + trace_label, "logsums.%s" % segment_name + ), + ) # - location_simulate - choices_df = \ - run_location_simulate( - segment_name, - choosers, - location_sample_df, - network_los, - dest_size_terms, - want_logsums, - estimator, - model_settings, - chunk_size, chunk_tag=f'{chunk_tag}.simulate', - trace_label=tracing.extend_trace_label(trace_label, 'simulate.%s' % segment_name)) + choices_df = run_location_simulate( + segment_name, + choosers, + location_sample_df, + network_los, + dest_size_terms, + want_logsums, + estimator, + model_settings, + chunk_size, + chunk_tag=f"{chunk_tag}.simulate", + trace_label=tracing.extend_trace_label( + trace_label, "simulate.%s" % segment_name + ), + ) if estimator: if trace_hh_id: - estimation_trace_label = \ - tracing.extend_trace_label(trace_label, f'estimation.{segment_name}.modeled_choices') + estimation_trace_label = tracing.extend_trace_label( + trace_label, f"estimation.{segment_name}.modeled_choices" + ) tracing.trace_df(choices_df, label=estimation_trace_label) estimator.write_choices(choices_df.choice) - choices_df.choice = estimator.get_survey_values(choices_df.choice, 'persons', - column_names=model_settings['DEST_CHOICE_COLUMN_NAME']) + choices_df.choice = estimator.get_survey_values( + choices_df.choice, + "persons", + column_names=model_settings["DEST_CHOICE_COLUMN_NAME"], + ) estimator.write_override_choices(choices_df.choice) if want_logsums: @@ -659,31 +742,38 @@ def run_location_choice( # merge mode_choice_logsum for the overridden location # alt_logsums columns: ['person_id', 'choice', 'logsum'] - alt_dest_col = model_settings['ALT_DEST_COL_NAME'] - alt_logsums = \ - location_sample_df[[alt_dest_col, ALT_LOGSUM]]\ - .rename(columns={alt_dest_col: 'choice', ALT_LOGSUM: 'logsum'})\ + alt_dest_col = model_settings["ALT_DEST_COL_NAME"] + alt_logsums = ( + location_sample_df[[alt_dest_col, ALT_LOGSUM]] + .rename(columns={alt_dest_col: "choice", ALT_LOGSUM: "logsum"}) .reset_index() + ) # choices_df columns: ['person_id', 'choice'] - choices_df = choices_df[['choice']].reset_index() + choices_df = choices_df[["choice"]].reset_index() # choices_df columns: ['person_id', 'choice', 'logsum'] - choices_df = pd.merge(choices_df, alt_logsums, how='left').set_index('person_id') + choices_df = pd.merge(choices_df, alt_logsums, how="left").set_index( + "person_id" + ) - logger.debug(f"{trace_label} segment {segment_name} estimation: override logsums") + logger.debug( + f"{trace_label} segment {segment_name} estimation: override logsums" + ) if trace_hh_id: - estimation_trace_label = \ - tracing.extend_trace_label(trace_label, f'estimation.{segment_name}.survey_choices') + estimation_trace_label = tracing.extend_trace_label( + trace_label, f"estimation.{segment_name}.survey_choices" + ) tracing.trace_df(choices_df, estimation_trace_label) choices_list.append(choices_df) if want_sample_table: # FIXME - sample_table - location_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], - append=True, inplace=True) + location_sample_df.set_index( + model_settings["ALT_DEST_COL_NAME"], append=True, inplace=True + ) sample_list.append(location_sample_df) else: # del this so we dont hold active reference to it while run_location_sample is creating its replacement @@ -694,7 +784,7 @@ def run_location_choice( else: # this will only happen with small samples (e.g. singleton) with no (e.g.) school segs logger.warning("%s no choices", trace_label) - choices_df = pd.DataFrame(columns=['choice', 'logsum']) + choices_df = pd.DataFrame(columns=["choice", "logsum"]) if len(sample_list) > 0: save_sample_df = pd.concat(sample_list) @@ -706,12 +796,17 @@ def run_location_choice( def iterate_location_choice( - model_settings, - persons_merged, persons, households, - network_los, - estimator, - chunk_size, trace_hh_id, locutor, - trace_label): + model_settings, + persons_merged, + persons, + households, + network_los, + estimator, + chunk_size, + trace_hh_id, + locutor, + trace_label, +): """ iterate run_location_choice updating shadow pricing until convergence criteria satisfied or max_iterations reached. @@ -740,25 +835,31 @@ def iterate_location_choice( chunk_tag = trace_label # boolean to filter out persons not needing location modeling (e.g. is_worker, is_student) - chooser_filter_column = model_settings['CHOOSER_FILTER_COLUMN_NAME'] + chooser_filter_column = model_settings["CHOOSER_FILTER_COLUMN_NAME"] - dest_choice_column_name = model_settings['DEST_CHOICE_COLUMN_NAME'] - logsum_column_name = model_settings.get('DEST_CHOICE_LOGSUM_COLUMN_NAME') + dest_choice_column_name = model_settings["DEST_CHOICE_COLUMN_NAME"] + logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") - sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') - want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None + sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") + want_sample_table = ( + config.setting("want_dest_choice_sample_tables") + and sample_table_name is not None + ) persons_merged_df = persons_merged.to_frame() persons_merged_df = persons_merged_df[persons_merged[chooser_filter_column]] - persons_merged_df.sort_index(inplace=True) # interaction_sample expects chooser index to be monotonic increasing + persons_merged_df.sort_index( + inplace=True + ) # interaction_sample expects chooser index to be monotonic increasing # chooser segmentation allows different sets coefficients for e.g. different income_segments or tour_types - chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME'] + chooser_segment_column = model_settings["CHOOSER_SEGMENT_COLUMN_NAME"] - assert chooser_segment_column in persons_merged_df, \ - f"CHOOSER_SEGMENT_COLUMN '{chooser_segment_column}' not in persons_merged table." + assert ( + chooser_segment_column in persons_merged_df + ), f"CHOOSER_SEGMENT_COLUMN '{chooser_segment_column}' not in persons_merged table." spc = shadow_pricing.load_shadow_price_calculator(model_settings) max_iterations = spc.max_iterations @@ -779,17 +880,22 @@ def iterate_location_choice( want_sample_table=want_sample_table, estimator=estimator, model_settings=model_settings, - chunk_size=chunk_size, chunk_tag=chunk_tag, + chunk_size=chunk_size, + chunk_tag=chunk_tag, trace_hh_id=trace_hh_id, - trace_label=tracing.extend_trace_label(trace_label, 'i%s' % iteration)) + trace_label=tracing.extend_trace_label(trace_label, "i%s" % iteration), + ) # choices_df is a pandas DataFrame with columns 'choice' and (optionally) 'logsum' if choices_df is None: break spc.set_choices( - choices=choices_df['choice'], - segment_ids=persons_merged_df[chooser_segment_column].reindex(choices_df.index)) + choices=choices_df["choice"], + segment_ids=persons_merged_df[chooser_segment_column].reindex( + choices_df.index + ), + ) if locutor: spc.write_trace_files(iteration) @@ -800,10 +906,10 @@ def iterate_location_choice( # - shadow price table if locutor: - if spc.use_shadow_pricing and 'SHADOW_PRICE_TABLE' in model_settings: - inject.add_table(model_settings['SHADOW_PRICE_TABLE'], spc.shadow_prices) - if 'MODELED_SIZE_TABLE' in model_settings: - inject.add_table(model_settings['MODELED_SIZE_TABLE'], spc.modeled_size) + if spc.use_shadow_pricing and "SHADOW_PRICE_TABLE" in model_settings: + inject.add_table(model_settings["SHADOW_PRICE_TABLE"], spc.shadow_prices) + if "MODELED_SIZE_TABLE" in model_settings: + inject.add_table(model_settings["MODELED_SIZE_TABLE"], spc.modeled_size) persons_df = persons.to_frame() @@ -812,74 +918,77 @@ def iterate_location_choice( # so we backfill the empty choices with -1 to code as no school location # names for location choice and (optional) logsums columns NO_DEST_ZONE = -1 - persons_df[dest_choice_column_name] = \ - choices_df['choice'].reindex(persons_df.index).fillna(NO_DEST_ZONE).astype(int) + persons_df[dest_choice_column_name] = ( + choices_df["choice"].reindex(persons_df.index).fillna(NO_DEST_ZONE).astype(int) + ) # add the dest_choice_logsum column to persons dataframe if logsum_column_name: - persons_df[logsum_column_name] = \ - choices_df['logsum'].reindex(persons_df.index).astype('float') + persons_df[logsum_column_name] = ( + choices_df["logsum"].reindex(persons_df.index).astype("float") + ) if save_sample_df is not None: # might be None for tiny samples even if sample_table_name was specified assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # lest they try to put school and workplace samples into the same table if pipeline.is_table(sample_table_name): - raise RuntimeError("dest choice sample table %s already exists" % sample_table_name) + raise RuntimeError( + "dest choice sample table %s already exists" % sample_table_name + ) pipeline.extend_table(sample_table_name, save_sample_df) # - annotate persons table - if 'annotate_persons' in model_settings: + if "annotate_persons" in model_settings: expressions.assign_columns( df=persons_df, - model_settings=model_settings.get('annotate_persons'), - trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) + model_settings=model_settings.get("annotate_persons"), + trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), + ) pipeline.replace_table("persons", persons_df) if trace_hh_id: - tracing.trace_df(persons_df, - label=trace_label, - warn_if_empty=True) + tracing.trace_df(persons_df, label=trace_label, warn_if_empty=True) # - annotate households table - if 'annotate_households' in model_settings: + if "annotate_households" in model_settings: households_df = households.to_frame() expressions.assign_columns( df=households_df, - model_settings=model_settings.get('annotate_households'), - trace_label=tracing.extend_trace_label(trace_label, 'annotate_households')) + model_settings=model_settings.get("annotate_households"), + trace_label=tracing.extend_trace_label(trace_label, "annotate_households"), + ) pipeline.replace_table("households", households_df) if trace_hh_id: - tracing.trace_df(households_df, - label=trace_label, - warn_if_empty=True) + tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) if logsum_column_name: - tracing.print_summary(logsum_column_name, choices_df['logsum'], value_counts=True) + tracing.print_summary( + logsum_column_name, choices_df["logsum"], value_counts=True + ) return persons_df @inject.step() def workplace_location( - persons_merged, persons, households, - network_los, - chunk_size, trace_hh_id, locutor): + persons_merged, persons, households, network_los, chunk_size, trace_hh_id, locutor +): """ workplace location choice model iterate_location_choice adds location choice column and annotations to persons table """ - trace_label = 'workplace_location' - model_settings = config.read_model_settings('workplace_location.yaml') + trace_label = "workplace_location" + model_settings = config.read_model_settings("workplace_location.yaml") - estimator = estimation.manager.begin_estimation('workplace_location') + estimator = estimation.manager.begin_estimation("workplace_location") if estimator: - write_estimation_specs(estimator, model_settings, 'workplace_location.yaml') + write_estimation_specs(estimator, model_settings, "workplace_location.yaml") # FIXME - debugging code to test multiprocessing failure handling # process_name = multiprocessing.current_process().name @@ -888,10 +997,15 @@ def workplace_location( iterate_location_choice( model_settings, - persons_merged, persons, households, + persons_merged, + persons, + households, network_los, estimator, - chunk_size, trace_hh_id, locutor, trace_label + chunk_size, + trace_hh_id, + locutor, + trace_label, ) if estimator: @@ -900,29 +1014,32 @@ def workplace_location( @inject.step() def school_location( - persons_merged, persons, households, - network_los, - chunk_size, trace_hh_id, locutor - ): + persons_merged, persons, households, network_los, chunk_size, trace_hh_id, locutor +): """ School location choice model iterate_location_choice adds location choice column and annotations to persons table """ - trace_label = 'school_location' - model_settings = config.read_model_settings('school_location.yaml') + trace_label = "school_location" + model_settings = config.read_model_settings("school_location.yaml") - estimator = estimation.manager.begin_estimation('school_location') + estimator = estimation.manager.begin_estimation("school_location") if estimator: - write_estimation_specs(estimator, model_settings, 'school_location.yaml') + write_estimation_specs(estimator, model_settings, "school_location.yaml") iterate_location_choice( model_settings, - persons_merged, persons, households, + persons_merged, + persons, + households, network_los, estimator, - chunk_size, trace_hh_id, locutor, trace_label + chunk_size, + trace_hh_id, + locutor, + trace_label, ) if estimator: diff --git a/activitysim/abm/models/mandatory_scheduling.py b/activitysim/abm/models/mandatory_scheduling.py index 34f25cf778..6a9618874d 100644 --- a/activitysim/abm/models/mandatory_scheduling.py +++ b/activitysim/abm/models/mandatory_scheduling.py @@ -4,45 +4,33 @@ import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline +from activitysim.core import config, expressions, inject, pipeline, simulate from activitysim.core import timetable as tt -from activitysim.core import expressions - -from activitysim.core.util import reindex +from activitysim.core import tracing +from activitysim.core.util import assign_in_place, reindex from .util import estimation from .util import vectorize_tour_scheduling as vts from .util.tour_scheduling import run_tour_scheduling -from activitysim.core.util import assign_in_place - - logger = logging.getLogger(__name__) DUMP = False @inject.step() -def mandatory_tour_scheduling(tours, - persons_merged, - tdd_alts, - chunk_size, - trace_hh_id): +def mandatory_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_id): """ This model predicts the departure time and duration of each activity for mandatory tours """ - model_name = 'mandatory_tour_scheduling' + model_name = "mandatory_tour_scheduling" trace_label = model_name persons_merged = persons_merged.to_frame() tours = tours.to_frame() - mandatory_tours = tours[tours.tour_category == 'mandatory'] + mandatory_tours = tours[tours.tour_category == "mandatory"] # - if no mandatory_tours if mandatory_tours.shape[0] == 0: @@ -56,31 +44,44 @@ def mandatory_tour_scheduling(tours, # (i.e. there are different logsum coefficients for work, school, univ primary_purposes # for simplicity managing these different segmentation schemes, # we conflate them by segmenting tour processing to align with primary_purpose - tour_segment_col = 'mandatory_tour_seg' + tour_segment_col = "mandatory_tour_seg" assert tour_segment_col not in mandatory_tours - is_university_tour = \ - (mandatory_tours.tour_type == 'school') & \ - reindex(persons_merged.is_university, mandatory_tours.person_id) - mandatory_tours[tour_segment_col] = \ - mandatory_tours.tour_type.where(~is_university_tour, 'univ') - - choices = run_tour_scheduling(model_name, mandatory_tours, persons_merged, tdd_alts, - tour_segment_col, chunk_size, trace_hh_id) + is_university_tour = (mandatory_tours.tour_type == "school") & reindex( + persons_merged.is_university, mandatory_tours.person_id + ) + mandatory_tours[tour_segment_col] = mandatory_tours.tour_type.where( + ~is_university_tour, "univ" + ) + + choices = run_tour_scheduling( + model_name, + mandatory_tours, + persons_merged, + tdd_alts, + tour_segment_col, + chunk_size, + trace_hh_id, + ) assign_in_place(tours, choices) pipeline.replace_table("tours", tours) # updated df for tracing - mandatory_tours = tours[tours.tour_category == 'mandatory'] + mandatory_tours = tours[tours.tour_category == "mandatory"] - tracing.dump_df(DUMP, - tt.tour_map(persons_merged, mandatory_tours, tdd_alts), - trace_label, 'tour_map') + tracing.dump_df( + DUMP, + tt.tour_map(persons_merged, mandatory_tours, tdd_alts), + trace_label, + "tour_map", + ) if trace_hh_id: - tracing.trace_df(mandatory_tours, - label=trace_label, - slicer='person_id', - index_label='tour', - columns=None, - warn_if_empty=True) + tracing.trace_df( + mandatory_tours, + label=trace_label, + slicer="person_id", + index_label="tour", + columns=None, + warn_if_empty=True, + ) diff --git a/activitysim/abm/models/mandatory_tour_frequency.py b/activitysim/abm/models/mandatory_tour_frequency.py index bbe1df599e..727a591f0c 100644 --- a/activitysim/abm/models/mandatory_tour_frequency.py +++ b/activitysim/abm/models/mandatory_tour_frequency.py @@ -4,15 +4,10 @@ import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import expressions +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing -from .util.tour_frequency import process_mandatory_tours from .util import estimation +from .util.tour_frequency import process_mandatory_tours logger = logging.getLogger(__name__) @@ -20,40 +15,39 @@ def add_null_results(trace_label, mandatory_tour_frequency_settings): logger.info("Skipping %s: add_null_results", trace_label) - persons = inject.get_table('persons').to_frame() - persons['mandatory_tour_frequency'] = '' + persons = inject.get_table("persons").to_frame() + persons["mandatory_tour_frequency"] = "" tours = pd.DataFrame() - tours['tour_category'] = None - tours['tour_type'] = None - tours['person_id'] = None - tours.index.name = 'tour_id' + tours["tour_category"] = None + tours["tour_type"] = None + tours["person_id"] = None + tours.index.name = "tour_id" pipeline.replace_table("tours", tours) expressions.assign_columns( df=persons, - model_settings=mandatory_tour_frequency_settings.get('annotate_persons'), - trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) + model_settings=mandatory_tour_frequency_settings.get("annotate_persons"), + trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), + ) pipeline.replace_table("persons", persons) @inject.step() -def mandatory_tour_frequency(persons_merged, - chunk_size, - trace_hh_id): +def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): """ This model predicts the frequency of making mandatory trips (see the alternatives above) - these trips include work and school in some combination. """ - trace_label = 'mandatory_tour_frequency' - model_settings_file_name = 'mandatory_tour_frequency.yaml' + trace_label = "mandatory_tour_frequency" + model_settings_file_name = "mandatory_tour_frequency.yaml" model_settings = config.read_model_settings(model_settings_file_name) choosers = persons_merged.to_frame() # filter based on results of CDAP - choosers = choosers[choosers.cdap_activity == 'M'] + choosers = choosers[choosers.cdap_activity == "M"] logger.info("Running mandatory_tour_frequency with %d persons", len(choosers)) # - if no mandatory tours @@ -62,7 +56,7 @@ def mandatory_tour_frequency(persons_merged, return # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_dict = {} @@ -71,11 +65,12 @@ def mandatory_tour_frequency(persons_merged, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) - estimator = estimation.manager.begin_estimation('mandatory_tour_frequency') + estimator = estimation.manager.begin_estimation("mandatory_tour_frequency") - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -95,15 +90,18 @@ def mandatory_tour_frequency(persons_merged, locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='mandatory_tour_frequency', - estimator=estimator) + trace_choice_name="mandatory_tour_frequency", + estimator=estimator, + ) # convert indexes to alternative names choices = pd.Series(model_spec.columns[choices.values], index=choices.index) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'persons', 'mandatory_tour_frequency') + choices = estimator.get_survey_values( + choices, "persons", "mandatory_tour_frequency" + ) estimator.write_override_choices(choices) estimator.end_estimation() @@ -113,39 +111,46 @@ def mandatory_tour_frequency(persons_merged, alternatives into an actual dataframe of tours. Ending format is the same as got non_mandatory_tours except trip types are "work" and "school" """ - alternatives = simulate.read_model_alts('mandatory_tour_frequency_alternatives.csv', set_index='alt') - choosers['mandatory_tour_frequency'] = choices.reindex(choosers.index) + alternatives = simulate.read_model_alts( + "mandatory_tour_frequency_alternatives.csv", set_index="alt" + ) + choosers["mandatory_tour_frequency"] = choices.reindex(choosers.index) mandatory_tours = process_mandatory_tours( - persons=choosers, - mandatory_tour_frequency_alts=alternatives + persons=choosers, mandatory_tour_frequency_alts=alternatives ) tours = pipeline.extend_table("tours", mandatory_tours) - tracing.register_traceable_table('tours', mandatory_tours) - pipeline.get_rn_generator().add_channel('tours', mandatory_tours) + tracing.register_traceable_table("tours", mandatory_tours) + pipeline.get_rn_generator().add_channel("tours", mandatory_tours) # - annotate persons - persons = inject.get_table('persons').to_frame() + persons = inject.get_table("persons").to_frame() # need to reindex as we only handled persons with cdap_activity == 'M' - persons['mandatory_tour_frequency'] = choices.reindex(persons.index).fillna('').astype(str) + persons["mandatory_tour_frequency"] = ( + choices.reindex(persons.index).fillna("").astype(str) + ) expressions.assign_columns( df=persons, - model_settings=model_settings.get('annotate_persons'), - trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) + model_settings=model_settings.get("annotate_persons"), + trace_label=tracing.extend_trace_label(trace_label, "annotate_persons"), + ) pipeline.replace_table("persons", persons) - tracing.print_summary('mandatory_tour_frequency', persons.mandatory_tour_frequency, - value_counts=True) + tracing.print_summary( + "mandatory_tour_frequency", persons.mandatory_tour_frequency, value_counts=True + ) if trace_hh_id: - tracing.trace_df(mandatory_tours, - label="mandatory_tour_frequency.mandatory_tours", - warn_if_empty=True) - - tracing.trace_df(persons, - label="mandatory_tour_frequency.persons", - warn_if_empty=True) + tracing.trace_df( + mandatory_tours, + label="mandatory_tour_frequency.mandatory_tours", + warn_if_empty=True, + ) + + tracing.trace_df( + persons, label="mandatory_tour_frequency.persons", warn_if_empty=True + ) diff --git a/activitysim/abm/models/non_mandatory_destination.py b/activitysim/abm/models/non_mandatory_destination.py index ca3493dd21..322d0ed325 100644 --- a/activitysim/abm/models/non_mandatory_destination.py +++ b/activitysim/abm/models/non_mandatory_destination.py @@ -4,28 +4,18 @@ import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import simulate - +from activitysim.core import config, inject, pipeline, simulate, tracing from activitysim.core.util import assign_in_place -from .util import tour_destination -from .util import estimation - +from .util import estimation, tour_destination logger = logging.getLogger(__name__) @inject.step() def non_mandatory_tour_destination( - tours, - persons_merged, - network_los, - chunk_size, - trace_hh_id): + tours, persons_merged, network_los, chunk_size, trace_hh_id +): """ Given the tour generation from the above, each tour needs to have a @@ -33,36 +23,43 @@ def non_mandatory_tour_destination( person that's making the tour) """ - trace_label = 'non_mandatory_tour_destination' - model_settings_file_name = 'non_mandatory_tour_destination.yaml' + trace_label = "non_mandatory_tour_destination" + model_settings_file_name = "non_mandatory_tour_destination.yaml" model_settings = config.read_model_settings(model_settings_file_name) - logsum_column_name = model_settings.get('DEST_CHOICE_LOGSUM_COLUMN_NAME') + logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None - sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') - want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None + sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") + want_sample_table = ( + config.setting("want_dest_choice_sample_tables") + and sample_table_name is not None + ) tours = tours.to_frame() persons_merged = persons_merged.to_frame() # choosers are tours - in a sense tours are choosing their destination - non_mandatory_tours = tours[tours.tour_category == 'non_mandatory'] + non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] # - if no mandatory_tours if non_mandatory_tours.shape[0] == 0: tracing.no_results(trace_label) return - estimator = estimation.manager.begin_estimation('non_mandatory_tour_destination') + estimator = estimation.manager.begin_estimation("non_mandatory_tour_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') - estimator.write_spec(model_settings, tag='SPEC') + estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) - estimator.write_table(inject.get_injectable('size_terms'), 'size_terms', append=False) - estimator.write_table(inject.get_table('land_use').to_frame(), 'landuse', append=False) + estimator.write_table( + inject.get_injectable("size_terms"), "size_terms", append=False + ) + estimator.write_table( + inject.get_table("land_use").to_frame(), "landuse", append=False + ) estimator.write_model_settings(model_settings, model_settings_file_name) choices_df, save_sample_df = tour_destination.run_tour_destination( @@ -73,20 +70,25 @@ def non_mandatory_tour_destination( model_settings, network_los, estimator, - chunk_size, trace_hh_id, trace_label) + chunk_size, + trace_hh_id, + trace_label, + ) if estimator: estimator.write_choices(choices_df.choice) - choices_df.choice = estimator.get_survey_values(choices_df.choice, 'tours', 'destination') + choices_df.choice = estimator.get_survey_values( + choices_df.choice, "tours", "destination" + ) estimator.write_override_choices(choices_df.choice) estimator.end_estimation() - non_mandatory_tours['destination'] = choices_df.choice + non_mandatory_tours["destination"] = choices_df.choice - assign_in_place(tours, non_mandatory_tours[['destination']]) + assign_in_place(tours, non_mandatory_tours[["destination"]]) if want_logsums: - non_mandatory_tours[logsum_column_name] = choices_df['logsum'] + non_mandatory_tours[logsum_column_name] = choices_df["logsum"] assign_in_place(tours, non_mandatory_tours[[logsum_column_name]]) pipeline.replace_table("tours", tours) @@ -97,9 +99,11 @@ def non_mandatory_tour_destination( pipeline.extend_table(sample_table_name, save_sample_df) if trace_hh_id: - tracing.trace_df(tours[tours.tour_category == 'non_mandatory'], - label="non_mandatory_tour_destination", - slicer='person_id', - index_label='tour', - columns=None, - warn_if_empty=True) + tracing.trace_df( + tours[tours.tour_category == "non_mandatory"], + label="non_mandatory_tour_destination", + slicer="person_id", + index_label="tour", + columns=None, + warn_if_empty=True, + ) diff --git a/activitysim/abm/models/non_mandatory_scheduling.py b/activitysim/abm/models/non_mandatory_scheduling.py index d8f32d61ef..b827f18536 100644 --- a/activitysim/abm/models/non_mandatory_scheduling.py +++ b/activitysim/abm/models/non_mandatory_scheduling.py @@ -4,42 +4,34 @@ import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline +from activitysim.core import config, expressions, inject, pipeline, simulate from activitysim.core import timetable as tt -from activitysim.core import simulate -from activitysim.core import expressions +from activitysim.core import tracing +from activitysim.core.util import assign_in_place from .util import estimation from .util.tour_scheduling import run_tour_scheduling - from .util.vectorize_tour_scheduling import vectorize_tour_scheduling -from activitysim.core.util import assign_in_place - logger = logging.getLogger(__name__) DUMP = False @inject.step() -def non_mandatory_tour_scheduling(tours, - persons_merged, - tdd_alts, - chunk_size, - trace_hh_id): +def non_mandatory_tour_scheduling( + tours, persons_merged, tdd_alts, chunk_size, trace_hh_id +): """ This model predicts the departure time and duration of each activity for non-mandatory tours """ - model_name = 'non_mandatory_tour_scheduling' + model_name = "non_mandatory_tour_scheduling" trace_label = model_name persons_merged = persons_merged.to_frame() tours = tours.to_frame() - non_mandatory_tours = tours[tours.tour_category == 'non_mandatory'] + non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] # - if no mandatory_tours if non_mandatory_tours.shape[0] == 0: @@ -48,23 +40,35 @@ def non_mandatory_tour_scheduling(tours, tour_segment_col = None - choices = run_tour_scheduling(model_name, non_mandatory_tours, persons_merged, tdd_alts, - tour_segment_col, chunk_size, trace_hh_id) + choices = run_tour_scheduling( + model_name, + non_mandatory_tours, + persons_merged, + tdd_alts, + tour_segment_col, + chunk_size, + trace_hh_id, + ) assign_in_place(tours, choices) pipeline.replace_table("tours", tours) # updated df for tracing - non_mandatory_tours = tours[tours.tour_category == 'non_mandatory'] + non_mandatory_tours = tours[tours.tour_category == "non_mandatory"] - tracing.dump_df(DUMP, - tt.tour_map(persons_merged, non_mandatory_tours, tdd_alts), - trace_label, 'tour_map') + tracing.dump_df( + DUMP, + tt.tour_map(persons_merged, non_mandatory_tours, tdd_alts), + trace_label, + "tour_map", + ) if trace_hh_id: - tracing.trace_df(non_mandatory_tours, - label=trace_label, - slicer='person_id', - index_label='tour_id', - columns=None, - warn_if_empty=True) + tracing.trace_df( + non_mandatory_tours, + label=trace_label, + slicer="person_id", + index_label="tour_id", + columns=None, + warn_if_empty=True, + ) diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index 814bec97e6..813bc7de63 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -5,18 +5,18 @@ import numpy as np import pandas as pd +from activitysim.core import ( + config, + expressions, + inject, + logit, + pipeline, + simulate, + tracing, +) from activitysim.core.interaction_simulate import interaction_simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import simulate -from activitysim.core import logit -from activitysim.core import expressions - from .util import estimation - from .util.overlap import person_max_window from .util.tour_frequency import process_non_mandatory_tours @@ -24,12 +24,12 @@ def extension_probs(): - f = config.config_file_path('non_mandatory_tour_frequency_extension_probs.csv') - df = pd.read_csv(f, comment='#') + f = config.config_file_path("non_mandatory_tour_frequency_extension_probs.csv") + df = pd.read_csv(f, comment="#") # convert cum probs to individual probs - df['2_tours'] = df['2_tours'] - df['1_tours'] - df['1_tours'] = df['1_tours'] - df['0_tours'] + df["2_tours"] = df["2_tours"] - df["1_tours"] + df["1_tours"] = df["1_tours"] - df["0_tours"] return df @@ -70,9 +70,9 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la assert tour_counts.index.name == persons.index.name - PROBABILITY_COLUMNS = ['0_tours', '1_tours', '2_tours'] - JOIN_COLUMNS = ['ptype', 'has_mandatory_tour', 'has_joint_tour'] - TOUR_TYPE_COL = 'nonmandatory_tour_type' + PROBABILITY_COLUMNS = ["0_tours", "1_tours", "2_tours"] + JOIN_COLUMNS = ["ptype", "has_mandatory_tour", "has_joint_tour"] + TOUR_TYPE_COL = "nonmandatory_tour_type" probs_spec = extension_probs() persons = persons[JOIN_COLUMNS] @@ -91,8 +91,9 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la tour_type_trace_label = tracing.extend_trace_label(trace_label, tour_type) # - only extend tour if frequency is max possible frequency for this tour type - tour_type_is_maxed = \ - extend_tour_counts & (tour_counts[tour_type] == alternatives[tour_type].max()) + tour_type_is_maxed = extend_tour_counts & ( + tour_counts[tour_type] == alternatives[tour_type].max() + ) maxed_tour_count_idx = tour_counts.index[tour_type_is_maxed] if len(maxed_tour_count_idx) == 0: @@ -103,7 +104,7 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la persons.loc[maxed_tour_count_idx], probs_spec[probs_spec[TOUR_TYPE_COL] == i_tour_type], on=JOIN_COLUMNS, - how='left' + how="left", ).set_index(maxed_tour_count_idx) assert choosers.index.name == tour_counts.index.name @@ -111,27 +112,30 @@ def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_la choices, rands = logit.make_choices( choosers[PROBABILITY_COLUMNS], trace_label=tour_type_trace_label, - trace_choosers=choosers) + trace_choosers=choosers, + ) # - extend tour_count (0-based prob alternative choice equals magnitude of extension) if choices.any(): tour_counts.loc[choices.index, tour_type] += choices if have_trace_targets: - tracing.trace_df(choices, - tracing.extend_trace_label(tour_type_trace_label, 'choices'), - columns=[None, 'choice']) - tracing.trace_df(rands, - tracing.extend_trace_label(tour_type_trace_label, 'rands'), - columns=[None, 'rand']) + tracing.trace_df( + choices, + tracing.extend_trace_label(tour_type_trace_label, "choices"), + columns=[None, "choice"], + ) + tracing.trace_df( + rands, + tracing.extend_trace_label(tour_type_trace_label, "rands"), + columns=[None, "rand"], + ) return tour_counts @inject.step() -def non_mandatory_tour_frequency(persons, persons_merged, - chunk_size, - trace_hh_id): +def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_id): """ This model predicts the frequency of making non-mandatory trips (alternatives for this model come from a separate csv file which is @@ -139,68 +143,76 @@ def non_mandatory_tour_frequency(persons, persons_merged, othdiscr, eatout, and social trips in various combination. """ - trace_label = 'non_mandatory_tour_frequency' - model_settings_file_name = 'non_mandatory_tour_frequency.yaml' + trace_label = "non_mandatory_tour_frequency" + model_settings_file_name = "non_mandatory_tour_frequency.yaml" model_settings = config.read_model_settings(model_settings_file_name) # FIXME kind of tacky both that we know to add this here and del it below # 'tot_tours' is used in model_spec expressions - alternatives = simulate.read_model_alts('non_mandatory_tour_frequency_alternatives.csv', set_index=None) - alternatives['tot_tours'] = alternatives.sum(axis=1) + alternatives = simulate.read_model_alts( + "non_mandatory_tour_frequency_alternatives.csv", set_index=None + ) + alternatives["tot_tours"] = alternatives.sum(axis=1) # filter based on results of CDAP choosers = persons_merged.to_frame() - choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])] + choosers = choosers[choosers.cdap_activity.isin(["M", "N"])] # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_dict = { - 'person_max_window': person_max_window - } + locals_dict = {"person_max_window": person_max_window} expressions.assign_columns( df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers)) constants = config.get_model_constants(model_settings) - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) - spec_segments = model_settings.get('SPEC_SEGMENTS', {}) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) + spec_segments = model_settings.get("SPEC_SEGMENTS", {}) # segment by person type and pick the right spec for each person type choices_list = [] for segment_settings in spec_segments: - segment_name = segment_settings['NAME'] - ptype = segment_settings['PTYPE'] + segment_name = segment_settings["NAME"] + ptype = segment_settings["PTYPE"] # pick the spec column for the segment segment_spec = model_spec[[segment_name]] chooser_segment = choosers[choosers.ptype == ptype] - logger.info("Running segment '%s' of size %d", segment_name, len(chooser_segment)) + logger.info( + "Running segment '%s' of size %d", segment_name, len(chooser_segment) + ) if len(chooser_segment) == 0: # skip empty segments continue - estimator = \ - estimation.manager.begin_estimation(model_name=segment_name, bundle_name='non_mandatory_tour_frequency') + estimator = estimation.manager.begin_estimation( + model_name=segment_name, bundle_name="non_mandatory_tour_frequency" + ) coefficients_df = simulate.read_model_coefficients(segment_settings) - segment_spec = simulate.eval_coefficients(segment_spec, coefficients_df, estimator) + segment_spec = simulate.eval_coefficients( + segment_spec, coefficients_df, estimator + ) if estimator: estimator.write_spec(model_settings, bundle_directory=True) - estimator.write_model_settings(model_settings, model_settings_file_name, bundle_directory=True) + estimator.write_model_settings( + model_settings, model_settings_file_name, bundle_directory=True + ) # preserving coefficients file name makes bringing back updated coefficients more straightforward estimator.write_coefficients(coefficients_df, segment_settings) estimator.write_choosers(chooser_segment) @@ -211,16 +223,16 @@ def non_mandatory_tour_frequency(persons, persons_merged, # chooser index must be duplicated in column or it will be omitted from interaction_dataset # estimation requires that chooser_id is either in index or a column of interaction_dataset # so it can be reformatted (melted) and indexed by chooser_id and alt_id - assert chooser_segment.index.name == 'person_id' - assert 'person_id' not in chooser_segment.columns - chooser_segment['person_id'] = chooser_segment.index + assert chooser_segment.index.name == "person_id" + assert "person_id" not in chooser_segment.columns + chooser_segment["person_id"] = chooser_segment.index # FIXME set_alt_id - do we need this for interaction_simulate estimation bundle tables? - estimator.set_alt_id('alt_id') + estimator.set_alt_id("alt_id") estimator.set_chooser_id(chooser_segment.index.name) - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_simulate( chooser_segment, @@ -229,19 +241,22 @@ def non_mandatory_tour_frequency(persons, persons_merged, log_alt_losers=log_alt_losers, locals_d=constants, chunk_size=chunk_size, - trace_label='non_mandatory_tour_frequency.%s' % segment_name, - trace_choice_name='non_mandatory_tour_frequency', - estimator=estimator) + trace_label="non_mandatory_tour_frequency.%s" % segment_name, + trace_choice_name="non_mandatory_tour_frequency", + estimator=estimator, + ) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'persons', 'non_mandatory_tour_frequency') + choices = estimator.get_survey_values( + choices, "persons", "non_mandatory_tour_frequency" + ) estimator.write_override_choices(choices) estimator.end_estimation() choices_list.append(choices) - del alternatives['tot_tours'] # del tot_tours column we added above + del alternatives["tot_tours"] # del tot_tours column we added above # The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate # is the index value of the chosen alternative in the alternatives table. @@ -252,8 +267,9 @@ def non_mandatory_tour_frequency(persons, persons_merged, # we expect there to be an alt with no tours - which we can use to backfill non-travelers no_tours_alt = (alternatives.sum(axis=1) == 0).index[0] # need to reindex as we only handled persons with cdap_activity in ['M', 'N'] - persons['non_mandatory_tour_frequency'] = \ + persons["non_mandatory_tour_frequency"] = ( choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8) + ) """ We have now generated non-mandatory tour frequencies, but they are attributes of the person table @@ -279,27 +295,41 @@ def non_mandatory_tour_frequency(persons, persons_merged, modeled_tour_counts.index = choices.index # assign person ids to the index # - extend_tour_counts - probabalistic - extended_tour_counts = \ - extend_tour_counts(choosers, modeled_tour_counts.copy(), alternatives, - trace_hh_id, tracing.extend_trace_label(trace_label, 'extend_tour_counts')) + extended_tour_counts = extend_tour_counts( + choosers, + modeled_tour_counts.copy(), + alternatives, + trace_hh_id, + tracing.extend_trace_label(trace_label, "extend_tour_counts"), + ) num_modeled_tours = modeled_tour_counts.sum().sum() num_extended_tours = extended_tour_counts.sum().sum() - logger.info("extend_tour_counts increased tour count by %s from %s to %s" % - (num_extended_tours - num_modeled_tours, num_modeled_tours, num_extended_tours)) + logger.info( + "extend_tour_counts increased tour count by %s from %s to %s" + % ( + num_extended_tours - num_modeled_tours, + num_modeled_tours, + num_extended_tours, + ) + ) """ create the non_mandatory tours based on extended_tour_counts """ if estimator: - override_tour_counts = \ - estimation.manager.get_survey_values(extended_tour_counts, - table_name='persons', - column_names=['_%s' % c for c in extended_tour_counts.columns]) - override_tour_counts = \ - override_tour_counts.rename(columns={('_%s' % c): c for c in extended_tour_counts.columns}) - logger.info("estimation get_survey_values override_tour_counts %s changed cells" % - (override_tour_counts != extended_tour_counts).sum().sum()) + override_tour_counts = estimation.manager.get_survey_values( + extended_tour_counts, + table_name="persons", + column_names=["_%s" % c for c in extended_tour_counts.columns], + ) + override_tour_counts = override_tour_counts.rename( + columns={("_%s" % c): c for c in extended_tour_counts.columns} + ) + logger.info( + "estimation get_survey_values override_tour_counts %s changed cells" + % (override_tour_counts != extended_tour_counts).sum().sum() + ) extended_tour_counts = override_tour_counts """ @@ -311,52 +341,68 @@ def non_mandatory_tour_frequency(persons, persons_merged, if estimator: # make sure they created the right tours - survey_tours = estimation.manager.get_survey_table('tours').sort_index() - non_mandatory_survey_tours = survey_tours[survey_tours.tour_category == 'non_mandatory'] + survey_tours = estimation.manager.get_survey_table("tours").sort_index() + non_mandatory_survey_tours = survey_tours[ + survey_tours.tour_category == "non_mandatory" + ] assert len(non_mandatory_survey_tours) == len(non_mandatory_tours) - assert non_mandatory_survey_tours.index.equals(non_mandatory_tours.sort_index().index) + assert non_mandatory_survey_tours.index.equals( + non_mandatory_tours.sort_index().index + ) # make sure they created tours with the expected tour_ids - columns = ['person_id', 'household_id', 'tour_type', 'tour_category'] - survey_tours = \ - estimation.manager.get_survey_values(non_mandatory_tours, - table_name='tours', - column_names=columns) + columns = ["person_id", "household_id", "tour_type", "tour_category"] + survey_tours = estimation.manager.get_survey_values( + non_mandatory_tours, table_name="tours", column_names=columns + ) - tours_differ = (non_mandatory_tours[columns] != survey_tours[columns]).any(axis=1) + tours_differ = (non_mandatory_tours[columns] != survey_tours[columns]).any( + axis=1 + ) if tours_differ.any(): print("tours_differ\n%s" % tours_differ) print("%s of %s tours differ" % (tours_differ.sum(), len(tours_differ))) print("differing survey_tours\n%s" % survey_tours[tours_differ]) - print("differing modeled_tours\n%s" % non_mandatory_tours[columns][tours_differ]) + print( + "differing modeled_tours\n%s" + % non_mandatory_tours[columns][tours_differ] + ) - assert(not tours_differ.any()) + assert not tours_differ.any() pipeline.extend_table("tours", non_mandatory_tours) - tracing.register_traceable_table('tours', non_mandatory_tours) - pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours) + tracing.register_traceable_table("tours", non_mandatory_tours) + pipeline.get_rn_generator().add_channel("tours", non_mandatory_tours) expressions.assign_columns( df=persons, - model_settings=model_settings.get('annotate_persons'), - trace_label=trace_label) + model_settings=model_settings.get("annotate_persons"), + trace_label=trace_label, + ) pipeline.replace_table("persons", persons) - tracing.print_summary('non_mandatory_tour_frequency', - persons.non_mandatory_tour_frequency, value_counts=True) + tracing.print_summary( + "non_mandatory_tour_frequency", + persons.non_mandatory_tour_frequency, + value_counts=True, + ) if trace_hh_id: - tracing.trace_df(non_mandatory_tours, - label="non_mandatory_tour_frequency.non_mandatory_tours", - warn_if_empty=True) - - tracing.trace_df(choosers, - label="non_mandatory_tour_frequency.choosers", - warn_if_empty=True) - - tracing.trace_df(persons, - label="non_mandatory_tour_frequency.annotated_persons", - warn_if_empty=True) + tracing.trace_df( + non_mandatory_tours, + label="non_mandatory_tour_frequency.non_mandatory_tours", + warn_if_empty=True, + ) + + tracing.trace_df( + choosers, label="non_mandatory_tour_frequency.choosers", warn_if_empty=True + ) + + tracing.trace_df( + persons, + label="non_mandatory_tour_frequency.annotated_persons", + warn_if_empty=True, + ) diff --git a/activitysim/abm/models/parking_location_choice.py b/activitysim/abm/models/parking_location_choice.py index 5c13969eed..bddbc65190 100644 --- a/activitysim/abm/models/parking_location_choice.py +++ b/activitysim/abm/models/parking_location_choice.py @@ -5,21 +5,21 @@ import numpy as np import pandas as pd -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import logit - -from activitysim.core import expressions +from activitysim.core import ( + config, + expressions, + inject, + logit, + pipeline, + simulate, + tracing, +) from activitysim.core.interaction_sample_simulate import interaction_sample_simulate -from activitysim.core.util import assign_in_place from activitysim.core.tracing import print_elapsed_time +from activitysim.core.util import assign_in_place from .util import estimation - logger = logging.getLogger(__name__) NO_DESTINATION = -1 @@ -50,19 +50,27 @@ def wrap_skims(model_settings): dict containing skims, keyed by canonical names relative to tour orientation """ - network_los = inject.get_injectable('network_los') + network_los = inject.get_injectable("network_los") skim_dict = network_los.get_default_skim_dict() - origin = model_settings['TRIP_ORIGIN'] - park_zone = model_settings['ALT_DEST_COL_NAME'] - destination = model_settings['TRIP_DESTINATION'] - time_period = model_settings['TRIP_DEPARTURE_PERIOD'] + origin = model_settings["TRIP_ORIGIN"] + park_zone = model_settings["ALT_DEST_COL_NAME"] + destination = model_settings["TRIP_DESTINATION"] + time_period = model_settings["TRIP_DEPARTURE_PERIOD"] skims = { - "odt_skims": skim_dict.wrap_3d(orig_key=origin, dest_key=destination, dim3_key=time_period), - "dot_skims": skim_dict.wrap_3d(orig_key=destination, dest_key=origin, dim3_key=time_period), - "opt_skims": skim_dict.wrap_3d(orig_key=origin, dest_key=park_zone, dim3_key=time_period), - "pdt_skims": skim_dict.wrap_3d(orig_key=park_zone, dest_key=destination, dim3_key=time_period), + "odt_skims": skim_dict.wrap_3d( + orig_key=origin, dest_key=destination, dim3_key=time_period + ), + "dot_skims": skim_dict.wrap_3d( + orig_key=destination, dest_key=origin, dim3_key=time_period + ), + "opt_skims": skim_dict.wrap_3d( + orig_key=origin, dest_key=park_zone, dim3_key=time_period + ), + "pdt_skims": skim_dict.wrap_3d( + orig_key=park_zone, dest_key=destination, dim3_key=time_period + ), "od_skims": skim_dict.wrap(origin, destination), "do_skims": skim_dict.wrap(destination, origin), "op_skims": skim_dict.wrap(origin, park_zone), @@ -86,13 +94,15 @@ def get_spec_for_segment(model_settings, spec_name, segment): def parking_destination_simulate( - segment_name, - trips, - destination_sample, - model_settings, - skims, - chunk_size, trace_hh_id, - trace_label): + segment_name, + trips, + destination_sample, + model_settings, + skims, + chunk_size, + trace_hh_id, + trace_label, +): """ Chose destination from destination_sample (with od_logsum and dp_logsum columns added) @@ -102,11 +112,11 @@ def parking_destination_simulate( choices - pandas.Series destination alt chosen """ - trace_label = tracing.extend_trace_label(trace_label, 'trip_destination_simulate') + trace_label = tracing.extend_trace_label(trace_label, "trip_destination_simulate") - spec = get_spec_for_segment(model_settings, 'SPECIFICATION', segment_name) + spec = get_spec_for_segment(model_settings, "SPECIFICATION", segment_name) - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] logger.info("Running trip_destination_simulate with %d trips", len(trips)) @@ -119,37 +129,46 @@ def parking_destination_simulate( spec=spec, choice_column=alt_dest_col_name, want_logsums=False, - allow_zero_probs=True, zero_prob_choice_val=NO_DESTINATION, + allow_zero_probs=True, + zero_prob_choice_val=NO_DESTINATION, skims=skims, locals_d=locals_dict, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='parking_loc') + trace_choice_name="parking_loc", + ) # drop any failed zero_prob destinations if (parking_locations == NO_DESTINATION).any(): - logger.debug("dropping %s failed parking locations", (parking_locations == NO_DESTINATION).sum()) + logger.debug( + "dropping %s failed parking locations", + (parking_locations == NO_DESTINATION).sum(), + ) parking_locations = parking_locations[parking_locations != NO_DESTINATION] return parking_locations def choose_parking_location( - segment_name, - trips, - alternatives, - model_settings, - want_sample_table, - skims, - chunk_size, trace_hh_id, - trace_label): + segment_name, + trips, + alternatives, + model_settings, + want_sample_table, + skims, + chunk_size, + trace_hh_id, + trace_label, +): logger.info("choose_parking_location %s with %d trips", trace_label, trips.shape[0]) t0 = print_elapsed_time() - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] - destination_sample = logit.interaction_dataset(trips, alternatives, alt_index_id=alt_dest_col_name) + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] + destination_sample = logit.interaction_dataset( + trips, alternatives, alt_index_id=alt_dest_col_name + ) destination_sample.index = np.repeat(trips.index.values, len(alternatives)) destination_sample.index.name = trips.index.name destination_sample = destination_sample[[alt_dest_col_name]].copy() @@ -161,12 +180,16 @@ def choose_parking_location( destination_sample=destination_sample, model_settings=model_settings, skims=skims, - chunk_size=chunk_size, trace_hh_id=trace_hh_id, - trace_label=trace_label) + chunk_size=chunk_size, + trace_hh_id=trace_hh_id, + trace_label=trace_label, + ) if want_sample_table: # FIXME - sample_table - destination_sample.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) + destination_sample.set_index( + model_settings["ALT_DEST_COL_NAME"], append=True, inplace=True + ) else: destination_sample = None @@ -176,18 +199,24 @@ def choose_parking_location( def run_parking_destination( - model_settings, - trips, land_use, - chunk_size, trace_hh_id, - trace_label, - fail_some_trips_for_testing=False): - - chooser_filter_column = model_settings.get('CHOOSER_FILTER_COLUMN_NAME') - chooser_segment_column = model_settings.get('CHOOSER_SEGMENT_COLUMN_NAME') - - parking_location_column_name = model_settings['ALT_DEST_COL_NAME'] - sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') - want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None + model_settings, + trips, + land_use, + chunk_size, + trace_hh_id, + trace_label, + fail_some_trips_for_testing=False, +): + + chooser_filter_column = model_settings.get("CHOOSER_FILTER_COLUMN_NAME") + chooser_segment_column = model_settings.get("CHOOSER_SEGMENT_COLUMN_NAME") + + parking_location_column_name = model_settings["ALT_DEST_COL_NAME"] + sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") + want_sample_table = ( + config.setting("want_dest_choice_sample_tables") + and sample_table_name is not None + ) choosers = trips[trips[chooser_filter_column]] choosers = choosers.sort_index() @@ -197,7 +226,7 @@ def run_parking_destination( skims = wrap_skims(model_settings) - alt_column_filter_name = model_settings.get('ALTERNATIVE_FILTER_COLUMN_NAME') + alt_column_filter_name = model_settings.get("ALTERNATIVE_FILTER_COLUMN_NAME") alternatives = land_use[land_use[alt_column_filter_name]] # don't need size terms in alternatives, just TAZ index @@ -208,7 +237,9 @@ def run_parking_destination( sample_list = [] for segment_name, chooser_segment in choosers.groupby(chooser_segment_column): if chooser_segment.shape[0] == 0: - logger.info("%s skipping segment %s: no choosers", trace_label, segment_name) + logger.info( + "%s skipping segment %s: no choosers", trace_label, segment_name + ) continue choices, destination_sample = choose_parking_location( @@ -218,8 +249,10 @@ def run_parking_destination( model_settings, want_sample_table, skims, - chunk_size, trace_hh_id, - trace_label=tracing.extend_trace_label(trace_label, segment_name)) + chunk_size, + trace_hh_id, + trace_label=tracing.extend_trace_label(trace_label, segment_name), + ) choices_list.append(choices) if want_sample_table: @@ -233,7 +266,9 @@ def run_parking_destination( parking_df = parking_df.drop(parking_df.index[0]) assign_in_place(trips, parking_df.to_frame(parking_location_column_name)) - trips[parking_location_column_name] = trips[parking_location_column_name].fillna(-1) + trips[parking_location_column_name] = trips[ + parking_location_column_name + ].fillna(-1) else: trips[parking_location_column_name] = -1 @@ -244,30 +279,24 @@ def run_parking_destination( @inject.step() def parking_location( - trips, - trips_merged, - land_use, - network_los, - chunk_size, - trace_hh_id): + trips, trips_merged, land_use, network_los, chunk_size, trace_hh_id +): """ Given a set of trips, each trip needs to have a parking location if it is eligible for remote parking. """ - trace_label = 'parking_location' - model_settings = config.read_model_settings('parking_location_choice.yaml') - alt_destination_col_name = model_settings['ALT_DEST_COL_NAME'] + trace_label = "parking_location" + model_settings = config.read_model_settings("parking_location_choice.yaml") + alt_destination_col_name = model_settings["ALT_DEST_COL_NAME"] - preprocessor_settings = model_settings.get('PREPROCESSOR', None) + preprocessor_settings = model_settings.get("PREPROCESSOR", None) trips_df = trips.to_frame() trips_merged_df = trips_merged.to_frame() land_use_df = land_use.to_frame() - locals_dict = { - 'network_los': network_los - } + locals_dict = {"network_los": network_los} locals_dict.update(config.get_model_constants(model_settings)) if preprocessor_settings: @@ -275,11 +304,13 @@ def parking_location( df=trips_merged_df, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) parking_locations, save_sample_df = run_parking_destination( model_settings, - trips_merged_df, land_use_df, + trips_merged_df, + land_use_df, chunk_size=chunk_size, trace_hh_id=trace_hh_id, trace_label=trace_label, @@ -290,20 +321,25 @@ def parking_location( pipeline.replace_table("trips", trips_df) if trace_hh_id: - tracing.trace_df(trips_df, - label=trace_label, - slicer='trip_id', - index_label='trip_id', - warn_if_empty=True) + tracing.trace_df( + trips_df, + label=trace_label, + slicer="trip_id", + index_label="trip_id", + warn_if_empty=True, + ) if save_sample_df is not None: - assert len(save_sample_df.index.get_level_values(0).unique()) == \ - len(trips_df[trips_df.trip_num < trips_df.trip_count]) + assert len(save_sample_df.index.get_level_values(0).unique()) == len( + trips_df[trips_df.trip_num < trips_df.trip_count] + ) - sample_table_name = model_settings.get('PARKING_LOCATION_SAMPLE_TABLE_NAME') + sample_table_name = model_settings.get("PARKING_LOCATION_SAMPLE_TABLE_NAME") assert sample_table_name is not None - logger.info("adding %s samples to %s" % (len(save_sample_df), sample_table_name)) + logger.info( + "adding %s samples to %s" % (len(save_sample_df), sample_table_name) + ) # lest they try to put tour samples into the same table if pipeline.is_table(sample_table_name): diff --git a/activitysim/abm/models/stop_frequency.py b/activitysim/abm/models/stop_frequency.py index 50eda3e950..1372582466 100644 --- a/activitysim/abm/models/stop_frequency.py +++ b/activitysim/abm/models/stop_frequency.py @@ -5,36 +5,27 @@ import numpy as np import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import expressions - from activitysim.abm.models.util.canonical_ids import set_trip_index - -from activitysim.core.util import assign_in_place -from activitysim.core.util import reindex +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing +from activitysim.core.util import assign_in_place, reindex from .util import estimation - logger = logging.getLogger(__name__) @inject.injectable() def stop_frequency_alts(): # alt file for building trips even though simulation is simple_simulate not interaction_simulate - file_path = config.config_file_path('stop_frequency_alternatives.csv') - df = pd.read_csv(file_path, comment='#') - df.set_index('alt', inplace=True) + file_path = config.config_file_path("stop_frequency_alternatives.csv") + df = pd.read_csv(file_path, comment="#") + df.set_index("alt", inplace=True) return df def process_trips(tours, stop_frequency_alts): - OUTBOUND_ALT = 'out' + OUTBOUND_ALT = "out" assert OUTBOUND_ALT in stop_frequency_alts.columns # get the actual alternatives for each person - have to go back to the @@ -60,13 +51,13 @@ def process_trips(tours, stop_frequency_alts): # reformat with the columns given below trips = trips.stack().reset_index() - trips.columns = ['tour_id', 'direction', 'trip_count'] + trips.columns = ["tour_id", "direction", "trip_count"] # tours legs have one more leg than stop trips.trip_count += 1 # prefer direction as boolean - trips['outbound'] = trips.direction == OUTBOUND_ALT + trips["outbound"] = trips.direction == OUTBOUND_ALT """ tour_id direction trip_count outbound @@ -81,17 +72,26 @@ def process_trips(tours, stop_frequency_alts): trips = trips.take(np.repeat(trips.index.values, trips.trip_count.values)) trips = trips.reset_index(drop=True) - grouped = trips.groupby(['tour_id', 'outbound']) - trips['trip_num'] = grouped.cumcount() + 1 + grouped = trips.groupby(["tour_id", "outbound"]) + trips["trip_num"] = grouped.cumcount() + 1 - trips['person_id'] = reindex(tours.person_id, trips.tour_id) - trips['household_id'] = reindex(tours.household_id, trips.tour_id) + trips["person_id"] = reindex(tours.person_id, trips.tour_id) + trips["household_id"] = reindex(tours.household_id, trips.tour_id) - trips['primary_purpose'] = reindex(tours.primary_purpose, trips.tour_id) + trips["primary_purpose"] = reindex(tours.primary_purpose, trips.tour_id) # reorder columns and drop 'direction' - trips = trips[['person_id', 'household_id', 'tour_id', 'primary_purpose', - 'trip_num', 'outbound', 'trip_count']] + trips = trips[ + [ + "person_id", + "household_id", + "tour_id", + "primary_purpose", + "trip_num", + "outbound", + "trip_count", + ] + ] """ person_id household_id tour_id primary_purpose trip_num outbound trip_count @@ -112,11 +112,8 @@ def process_trips(tours, stop_frequency_alts): @inject.step() def stop_frequency( - tours, tours_merged, - stop_frequency_alts, - network_los, - chunk_size, - trace_hh_id): + tours, tours_merged, stop_frequency_alts, network_los, chunk_size, trace_hh_id +): """ stop frequency model @@ -143,8 +140,8 @@ def stop_frequency( """ - trace_label = 'stop_frequency' - model_settings_file_name = 'stop_frequency.yaml' + trace_label = "stop_frequency" + model_settings_file_name = "stop_frequency.yaml" model_settings = config.read_model_settings(model_settings_file_name) @@ -158,19 +155,18 @@ def stop_frequency( constants = config.get_model_constants(model_settings) # - run preprocessor to annotate tours_merged - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: # hack: preprocessor adds origin column in place if it does not exist already - assert 'origin' in tours_merged - assert 'destination' in tours_merged - od_skim_stack_wrapper = network_los.get_default_skim_dict().wrap('origin', 'destination') + assert "origin" in tours_merged + assert "destination" in tours_merged + od_skim_stack_wrapper = network_los.get_default_skim_dict().wrap( + "origin", "destination" + ) skims = [od_skim_stack_wrapper] - locals_dict = { - "od_skims": od_skim_stack_wrapper, - 'network_los': network_los - } + locals_dict = {"od_skims": od_skim_stack_wrapper, "network_los": network_los} locals_dict.update(constants) simulate.set_skim_wrapper_targets(tours_merged, skims) @@ -180,17 +176,23 @@ def stop_frequency( df=tours_merged, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) assign_in_place(tours_merged, annotations) - tracing.print_summary('stop_frequency segments', - tours_merged.primary_purpose, value_counts=True) + tracing.print_summary( + "stop_frequency segments", tours_merged.primary_purpose, value_counts=True + ) - spec_segments = model_settings.get('SPEC_SEGMENTS') - assert spec_segments is not None, f"SPEC_SEGMENTS setting not found in model settings: {model_settings_file_name}" - segment_col = model_settings.get('SEGMENT_COL') - assert segment_col is not None, f"SEGMENT_COL setting not found in model settings: {model_settings_file_name}" + spec_segments = model_settings.get("SPEC_SEGMENTS") + assert ( + spec_segments is not None + ), f"SPEC_SEGMENTS setting not found in model settings: {model_settings_file_name}" + segment_col = model_settings.get("SEGMENT_COL") + assert ( + segment_col is not None + ), f"SEGMENT_COL setting not found in model settings: {model_settings_file_name}" nest_spec = config.get_logit_model_settings(model_settings) @@ -206,20 +208,32 @@ def stop_frequency( logging.info(f"{trace_label} skipping empty segment {segment_name}") continue - logging.info(f"{trace_label} running segment {segment_name} with {chooser_segment.shape[0]} chooser rows") + logging.info( + f"{trace_label} running segment {segment_name} with {chooser_segment.shape[0]} chooser rows" + ) - estimator = estimation.manager.begin_estimation(model_name=segment_name, bundle_name='stop_frequency') + estimator = estimation.manager.begin_estimation( + model_name=segment_name, bundle_name="stop_frequency" + ) - segment_spec = simulate.read_model_spec(file_name=segment_settings['SPEC']) - assert segment_spec is not None, "spec for segment_type %s not found" % segment_name + segment_spec = simulate.read_model_spec(file_name=segment_settings["SPEC"]) + assert segment_spec is not None, ( + "spec for segment_type %s not found" % segment_name + ) - coefficients_file_name = segment_settings['COEFFICIENTS'] - coefficients_df = simulate.read_model_coefficients(file_name=coefficients_file_name) - segment_spec = simulate.eval_coefficients(segment_spec, coefficients_df, estimator) + coefficients_file_name = segment_settings["COEFFICIENTS"] + coefficients_df = simulate.read_model_coefficients( + file_name=coefficients_file_name + ) + segment_spec = simulate.eval_coefficients( + segment_spec, coefficients_df, estimator + ) if estimator: estimator.write_spec(segment_settings, bundle_directory=False) - estimator.write_model_settings(model_settings, model_settings_file_name, bundle_directory=True) + estimator.write_model_settings( + model_settings, model_settings_file_name, bundle_directory=True + ) estimator.write_coefficients(coefficients_df, segment_settings) estimator.write_choosers(chooser_segment) @@ -232,15 +246,18 @@ def stop_frequency( locals_d=constants, chunk_size=chunk_size, trace_label=tracing.extend_trace_label(trace_label, segment_name), - trace_choice_name='stops', - estimator=estimator) + trace_choice_name="stops", + estimator=estimator, + ) # convert indexes to alternative names choices = pd.Series(segment_spec.columns[choices.values], index=choices.index) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'tours', 'stop_frequency') # override choices + choices = estimator.get_survey_values( + choices, "tours", "stop_frequency" + ) # override choices estimator.write_override_choices(choices) estimator.end_estimation() @@ -248,30 +265,30 @@ def stop_frequency( choices = pd.concat(choices_list) - tracing.print_summary('stop_frequency', choices, value_counts=True) + tracing.print_summary("stop_frequency", choices, value_counts=True) # add stop_frequency choices to tours table - assign_in_place(tours, choices.to_frame('stop_frequency')) + assign_in_place(tours, choices.to_frame("stop_frequency")) # FIXME should have added this when tours created? - assert 'primary_purpose' not in tours - if 'primary_purpose' not in tours.columns: + assert "primary_purpose" not in tours + if "primary_purpose" not in tours.columns: # if not already there, then it will have been added by annotate tours preprocessor - assign_in_place(tours, tours_merged[['primary_purpose']]) + assign_in_place(tours, tours_merged[["primary_purpose"]]) pipeline.replace_table("tours", tours) # create trips table trips = process_trips(tours, stop_frequency_alts) trips = pipeline.extend_table("trips", trips) - tracing.register_traceable_table('trips', trips) - pipeline.get_rn_generator().add_channel('trips', trips) + tracing.register_traceable_table("trips", trips) + pipeline.get_rn_generator().add_channel("trips", trips) if estimator: # make sure they created trips with the expected tour_ids - columns = ['person_id', 'household_id', 'tour_id', 'outbound'] + columns = ["person_id", "household_id", "tour_id", "outbound"] - survey_trips = estimation.manager.get_survey_table(table_name='trips') + survey_trips = estimation.manager.get_survey_table(table_name="trips") different = False survey_trips_not_in_trips = survey_trips[~survey_trips.index.isin(trips.index)] if len(survey_trips_not_in_trips) > 0: @@ -283,10 +300,9 @@ def stop_frequency( different = True assert not different - survey_trips = \ - estimation.manager.get_survey_values(trips, - table_name='trips', - column_names=columns) + survey_trips = estimation.manager.get_survey_values( + trips, table_name="trips", column_names=columns + ) trips_differ = (trips[columns] != survey_trips[columns]).any(axis=1) @@ -296,24 +312,22 @@ def stop_frequency( print("differing survey_trips\n%s" % survey_trips[trips_differ]) print("differing modeled_trips\n%s" % trips[columns][trips_differ]) - assert(not trips_differ.any()) + assert not trips_differ.any() if trace_hh_id: - tracing.trace_df(tours, - label="stop_frequency.tours", - slicer='person_id', - columns=None) - - tracing.trace_df(trips, - label="stop_frequency.trips", - slicer='person_id', - columns=None) - - tracing.trace_df(annotations, - label="stop_frequency.annotations", - columns=None) - - tracing.trace_df(tours_merged, - label="stop_frequency.tours_merged", - slicer='person_id', - columns=None) + tracing.trace_df( + tours, label="stop_frequency.tours", slicer="person_id", columns=None + ) + + tracing.trace_df( + trips, label="stop_frequency.trips", slicer="person_id", columns=None + ) + + tracing.trace_df(annotations, label="stop_frequency.annotations", columns=None) + + tracing.trace_df( + tours_merged, + label="stop_frequency.tours_merged", + slicer="person_id", + columns=None, + ) diff --git a/activitysim/abm/models/summarize.py b/activitysim/abm/models/summarize.py index 8d224c8754..87c50314af 100644 --- a/activitysim/abm/models/summarize.py +++ b/activitysim/abm/models/summarize.py @@ -2,12 +2,10 @@ # See full license in LICENSE.txt. import logging import sys -import pandas as pd -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import config +import pandas as pd +from activitysim.core import config, inject, pipeline from activitysim.core.config import setting logger = logging.getLogger(__name__) @@ -16,18 +14,20 @@ @inject.step() def write_summaries(output_dir): - summary_settings_name = 'output_summaries' - summary_file_name = 'summaries.txt' + summary_settings_name = "output_summaries" + summary_file_name = "summaries.txt" summary_settings = setting(summary_settings_name) if summary_settings is None: - logger.info("No {summary_settings_name} specified in settings file. Nothing to write.") + logger.info( + "No {summary_settings_name} specified in settings file. Nothing to write." + ) return summary_dict = summary_settings - mode = 'wb' if sys.version_info < (3,) else 'w' + mode = "wb" if sys.version_info < (3,) else "w" with open(config.output_file_path(summary_file_name), mode) as output_file: for table_name, column_names in summary_dict.items(): @@ -36,8 +36,10 @@ def write_summaries(output_dir): for c in column_names: n = 100 - empty = (df[c] == '') | df[c].isnull() + empty = (df[c] == "") | df[c].isnull() - print(f"\n### {table_name}.{c} type: {df.dtypes[c]} rows: {len(df)} ({empty.sum()} empty)\n\n", - file=output_file) + print( + f"\n### {table_name}.{c} type: {df.dtypes[c]} rows: {len(df)} ({empty.sum()} empty)\n\n", + file=output_file, + ) print(df[c].value_counts().nlargest(n), file=output_file) diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index 48dc7ef333..4d2dbef55e 100644 --- a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -2,20 +2,14 @@ # See full license in LICENSE.txt. import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline - +from activitysim.core import config, inject, los, pipeline, tracing from activitysim.core.util import assign_in_place -from activitysim.core import los - -from .util.mode import run_tour_mode_choice_simulate from .util import estimation +from .util.mode import run_tour_mode_choice_simulate logger = logging.getLogger(__name__) @@ -26,31 +20,35 @@ @inject.step() -def tour_mode_choice_simulate(tours, persons_merged, - network_los, - chunk_size, - trace_hh_id): +def tour_mode_choice_simulate( + tours, persons_merged, network_los, chunk_size, trace_hh_id +): """ Tour mode choice simulate """ - trace_label = 'tour_mode_choice' - model_settings_file_name = 'tour_mode_choice.yaml' + trace_label = "tour_mode_choice" + model_settings_file_name = "tour_mode_choice.yaml" model_settings = config.read_model_settings(model_settings_file_name) - logsum_column_name = model_settings.get('MODE_CHOICE_LOGSUM_COLUMN_NAME') - mode_column_name = 'tour_mode' + logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") + mode_column_name = "tour_mode" primary_tours = tours.to_frame() - assert not (primary_tours.tour_category == 'atwork').any() + assert not (primary_tours.tour_category == "atwork").any() logger.info("Running %s with %d tours" % (trace_label, primary_tours.shape[0])) - tracing.print_summary('tour_types', - primary_tours.tour_type, value_counts=True) + tracing.print_summary("tour_types", primary_tours.tour_type, value_counts=True) persons_merged = persons_merged.to_frame() - primary_tours_merged = pd.merge(primary_tours, persons_merged, left_on='person_id', - right_index=True, how='left', suffixes=('', '_r')) + primary_tours_merged = pd.merge( + primary_tours, + persons_merged, + left_on="person_id", + right_index=True, + how="left", + suffixes=("", "_r"), + ) constants = {} # model_constants can appear in expressions @@ -59,19 +57,23 @@ def tour_mode_choice_simulate(tours, persons_merged, skim_dict = network_los.get_default_skim_dict() # setup skim keys - orig_col_name = 'home_zone_id' - dest_col_name = 'destination' - - out_time_col_name = 'start' - in_time_col_name = 'end' - odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name, - dim3_key='out_period') - dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name, - dim3_key='in_period') - odr_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name, - dim3_key='in_period') - dor_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name, - dim3_key='out_period') + orig_col_name = "home_zone_id" + dest_col_name = "destination" + + out_time_col_name = "start" + in_time_col_name = "end" + odt_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col_name, dest_key=dest_col_name, dim3_key="out_period" + ) + dot_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col_name, dest_key=orig_col_name, dim3_key="in_period" + ) + odr_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col_name, dest_key=dest_col_name, dim3_key="in_period" + ) + dor_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col_name, dest_key=orig_col_name, dim3_key="out_period" + ) od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name) skims = { @@ -80,10 +82,10 @@ def tour_mode_choice_simulate(tours, persons_merged, "odr_skims": odr_skim_stack_wrapper, # dot return skims for e.g. TNC bridge return fare "dor_skims": dor_skim_stack_wrapper, # odt return skims for e.g. TNC bridge return fare "od_skims": od_skim_stack_wrapper, - 'orig_col_name': orig_col_name, - 'dest_col_name': dest_col_name, - 'out_time_col_name': out_time_col_name, - 'in_time_col_name': in_time_col_name + "orig_col_name": orig_col_name, + "dest_col_name": dest_col_name, + "out_time_col_name": out_time_col_name, + "in_time_col_name": in_time_col_name, } if network_los.zone_system == los.THREE_ZONE: @@ -91,24 +93,35 @@ def tour_mode_choice_simulate(tours, persons_merged, tvpb = network_los.tvpb - tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col_name, dest_key=dest_col_name, - tod_key='out_period', segment_key='demographic_segment', - cache_choices=True, - trace_label=trace_label, tag='tvpb_logsum_odt') - tvpb_logsum_dot = tvpb.wrap_logsum(orig_key=dest_col_name, dest_key=orig_col_name, - tod_key='in_period', segment_key='demographic_segment', - cache_choices=True, - trace_label=trace_label, tag='tvpb_logsum_dot') - - skims.update({ - 'tvpb_logsum_odt': tvpb_logsum_odt, - 'tvpb_logsum_dot': tvpb_logsum_dot - }) + tvpb_logsum_odt = tvpb.wrap_logsum( + orig_key=orig_col_name, + dest_key=dest_col_name, + tod_key="out_period", + segment_key="demographic_segment", + cache_choices=True, + trace_label=trace_label, + tag="tvpb_logsum_odt", + ) + tvpb_logsum_dot = tvpb.wrap_logsum( + orig_key=dest_col_name, + dest_key=orig_col_name, + tod_key="in_period", + segment_key="demographic_segment", + cache_choices=True, + trace_label=trace_label, + tag="tvpb_logsum_dot", + ) + + skims.update( + {"tvpb_logsum_odt": tvpb_logsum_odt, "tvpb_logsum_dot": tvpb_logsum_dot} + ) # TVPB constants can appear in expressions - constants.update(network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS')) + constants.update( + network_los.setting("TVPB_SETTINGS.tour_mode_choice.CONSTANTS") + ) - estimator = estimation.manager.begin_estimation('tour_mode_choice') + estimator = estimation.manager.begin_estimation("tour_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) @@ -119,26 +132,32 @@ def tour_mode_choice_simulate(tours, persons_merged, # FIXME should normalize handling of tour_type and tour_purpose # mtctm1 school tour_type includes univ, which has different coefficients from elementary and HS # we should either add this column when tours created or add univ to tour_types - not_university = (primary_tours_merged.tour_type != 'school') | ~primary_tours_merged.is_university - primary_tours_merged['tour_purpose'] = \ - primary_tours_merged.tour_type.where(not_university, 'univ') + not_university = ( + primary_tours_merged.tour_type != "school" + ) | ~primary_tours_merged.is_university + primary_tours_merged["tour_purpose"] = primary_tours_merged.tour_type.where( + not_university, "univ" + ) choices_list = [] - for tour_purpose, tours_segment in primary_tours_merged.groupby('tour_purpose'): + for tour_purpose, tours_segment in primary_tours_merged.groupby("tour_purpose"): - logger.info("tour_mode_choice_simulate tour_type '%s' (%s tours)" % - (tour_purpose, len(tours_segment.index), )) + logger.info( + "tour_mode_choice_simulate tour_type '%s' (%s tours)" + % (tour_purpose, len(tours_segment.index),) + ) if network_los.zone_system == los.THREE_ZONE: tvpb_logsum_odt.extend_trace_label(tour_purpose) tvpb_logsum_dot.extend_trace_label(tour_purpose) # name index so tracing knows how to slice - assert tours_segment.index.name == 'tour_id' + assert tours_segment.index.name == "tour_id" choices_df = run_tour_mode_choice_simulate( tours_segment, - tour_purpose, model_settings, + tour_purpose, + model_settings, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, network_los=network_los, @@ -147,10 +166,14 @@ def tour_mode_choice_simulate(tours, persons_merged, estimator=estimator, chunk_size=chunk_size, trace_label=tracing.extend_trace_label(trace_label, tour_purpose), - trace_choice_name='tour_mode_choice') + trace_choice_name="tour_mode_choice", + ) - tracing.print_summary('tour_mode_choice_simulate %s choices_df' % tour_purpose, - choices_df.tour_mode, value_counts=True) + tracing.print_summary( + "tour_mode_choice_simulate %s choices_df" % tour_purpose, + choices_df.tour_mode, + value_counts=True, + ) choices_list.append(choices_df) @@ -159,10 +182,12 @@ def tour_mode_choice_simulate(tours, persons_merged, # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get('tvpb_mode_path_types') + tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") for mode, path_types in tvpb_mode_path_types.items(): - for direction, skim in zip(['od', 'do'], [tvpb_logsum_odt, tvpb_logsum_dot]): + for direction, skim in zip( + ["od", "do"], [tvpb_logsum_odt, tvpb_logsum_dot] + ): path_type = path_types[direction] skim_cache = skim.cache[path_type] @@ -171,20 +196,31 @@ def tour_mode_choice_simulate(tours, persons_merged, for c in skim_cache: - dest_col = f'{direction}_{c}' + dest_col = f"{direction}_{c}" if dest_col not in choices_df: - choices_df[dest_col] = np.nan if pd.api.types.is_numeric_dtype(skim_cache[c]) else '' - choices_df[dest_col].where(choices_df.tour_mode != mode, skim_cache[c], inplace=True) + choices_df[dest_col] = ( + np.nan + if pd.api.types.is_numeric_dtype(skim_cache[c]) + else "" + ) + choices_df[dest_col].where( + choices_df.tour_mode != mode, skim_cache[c], inplace=True + ) if estimator: estimator.write_choices(choices_df.tour_mode) - choices_df.tour_mode = estimator.get_survey_values(choices_df.tour_mode, 'tours', 'tour_mode') + choices_df.tour_mode = estimator.get_survey_values( + choices_df.tour_mode, "tours", "tour_mode" + ) estimator.write_override_choices(choices_df.tour_mode) estimator.end_estimation() - tracing.print_summary('tour_mode_choice_simulate all tour type choices', - choices_df.tour_mode, value_counts=True) + tracing.print_summary( + "tour_mode_choice_simulate all tour type choices", + choices_df.tour_mode, + value_counts=True, + ) # so we can trace with annotations assign_in_place(primary_tours, choices_df) @@ -196,8 +232,10 @@ def tour_mode_choice_simulate(tours, persons_merged, pipeline.replace_table("tours", all_tours) if trace_hh_id: - tracing.trace_df(primary_tours, - label=tracing.extend_trace_label(trace_label, mode_column_name), - slicer='tour_id', - index_label='tour_id', - warn_if_empty=True) + tracing.trace_df( + primary_tours, + label=tracing.extend_trace_label(trace_label, mode_column_name), + slicer="tour_id", + index_label="tour_id", + warn_if_empty=True, + ) diff --git a/activitysim/abm/models/trip_departure_choice.py b/activitysim/abm/models/trip_departure_choice.py index 80eb9b912a..640a00f8f0 100644 --- a/activitysim/abm/models/trip_departure_choice.py +++ b/activitysim/abm/models/trip_departure_choice.py @@ -3,42 +3,44 @@ import numpy as np import pandas as pd -from activitysim.core import chunk -from activitysim.core import config -from activitysim.core import expressions -from activitysim.core import inject -from activitysim.core import logit -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import tracing - from activitysim.abm.models.util.trip import get_time_windows -from activitysim.core import interaction_simulate +from activitysim.core import ( + chunk, + config, + expressions, + inject, + interaction_simulate, + logit, + pipeline, + simulate, + tracing, +) from activitysim.core.simulate import set_skim_wrapper_targets from activitysim.core.util import reindex - logger = logging.getLogger(__name__) -MAIN_LEG_DURATION = 'main_leg_duration' -IB_DURATION = 'inbound_duration' -OB_DURATION = 'outbound_duration' +MAIN_LEG_DURATION = "main_leg_duration" +IB_DURATION = "inbound_duration" +OB_DURATION = "outbound_duration" -TOUR_ID = 'tour_id' -TRIP_ID = 'trip_id' -TOUR_LEG_ID = 'tour_leg_id' -PATTERN_ID = 'pattern_id' -TRIP_DURATION = 'trip_duration' -STOP_TIME_DURATION = 'stop_time_duration' -TRIP_NUM = 'trip_num' -TRIP_COUNT = 'trip_count' -OUTBOUND = 'outbound' +TOUR_ID = "tour_id" +TRIP_ID = "trip_id" +TOUR_LEG_ID = "tour_leg_id" +PATTERN_ID = "pattern_id" +TRIP_DURATION = "trip_duration" +STOP_TIME_DURATION = "stop_time_duration" +TRIP_NUM = "trip_num" +TRIP_COUNT = "trip_count" +OUTBOUND = "outbound" MAX_TOUR_ID = int(1e9) def generate_tour_leg_id(tour_leg_row): - return tour_leg_row.tour_id + (int(MAX_TOUR_ID) if tour_leg_row.outbound else int(2 * MAX_TOUR_ID)) + return tour_leg_row.tour_id + ( + int(MAX_TOUR_ID) if tour_leg_row.outbound else int(2 * MAX_TOUR_ID) + ) def get_tour_legs(trips): @@ -47,6 +49,7 @@ def get_tour_legs(trips): tour_legs = tour_legs.set_index(TOUR_LEG_ID) return tour_legs + # def trip_departure_rpc(chunk_size, choosers, trace_label): # # # NOTE we chunk chunk_id @@ -77,16 +80,21 @@ def generate_alternatives(trips, alternative_col_name): leg_alts = None durations = np.where(legs[OUTBOUND], legs[OB_DURATION], legs[IB_DURATION]) if len(durations) > 0: - leg_alts = pd.Series(np.concatenate([np.arange(0, duration + 1) for duration in durations]), - np.repeat(legs.index, durations + 1), - name=alternative_col_name).to_frame() + leg_alts = pd.Series( + np.concatenate([np.arange(0, duration + 1) for duration in durations]), + np.repeat(legs.index, durations + 1), + name=alternative_col_name, + ).to_frame() single_trips = trips[trips[TRIP_COUNT] == 1] single_alts = None - durations = np.where(single_trips[OUTBOUND], single_trips[OB_DURATION], single_trips[IB_DURATION]) + durations = np.where( + single_trips[OUTBOUND], single_trips[OB_DURATION], single_trips[IB_DURATION] + ) if len(durations) > 0: - single_alts = pd.Series(durations, single_trips.index, - name=alternative_col_name).to_frame() + single_alts = pd.Series( + durations, single_trips.index, name=alternative_col_name + ).to_frame() if not legs.empty and not single_trips.empty: return pd.concat([leg_alts, single_alts]) @@ -107,15 +115,20 @@ def build_patterns(trips, time_windows): pattern_sizes = [] for duration, trip_count in duration_and_counts: - possible_windows = time_windows[:trip_count-1, np.where(time_windows[:trip_count-1].sum(axis=0) == duration)[0]] + possible_windows = time_windows[ + : trip_count - 1, + np.where(time_windows[: trip_count - 1].sum(axis=0) == duration)[0], + ] possible_windows = np.unique(possible_windows, axis=1).transpose() filler = np.full((possible_windows.shape[0], max_trip_count), np.nan) - filler[:possible_windows.shape[0], :possible_windows.shape[1]] = possible_windows + filler[ + : possible_windows.shape[0], : possible_windows.shape[1] + ] = possible_windows patterns.append(filler) pattern_sizes.append(filler.shape[0]) patterns = np.concatenate(patterns) - pattern_names = ['_'.join('%0.0f' % x for x in y[~np.isnan(y)]) for y in patterns] + pattern_names = ["_".join("%0.0f" % x for x in y[~np.isnan(y)]) for y in patterns] indexes = np.repeat(tours.index, pattern_sizes) # If we've done everything right, the indexes @@ -127,15 +140,22 @@ def build_patterns(trips, time_windows): patterns.index.name = tours.index.name patterns[PATTERN_ID] = pattern_names - patterns = patterns.melt(id_vars=PATTERN_ID, value_name=STOP_TIME_DURATION, - var_name=TRIP_NUM, ignore_index=False).reset_index() + patterns = patterns.melt( + id_vars=PATTERN_ID, + value_name=STOP_TIME_DURATION, + var_name=TRIP_NUM, + ignore_index=False, + ).reset_index() patterns = patterns[~patterns[STOP_TIME_DURATION].isnull()].copy() patterns[TRIP_NUM] = patterns[TRIP_NUM] + 1 patterns[STOP_TIME_DURATION] = patterns[STOP_TIME_DURATION].astype(int) - patterns = pd.merge(patterns, trips.reset_index()[[TOUR_ID, TRIP_ID, TRIP_NUM, OUTBOUND]], - on=[TOUR_ID, TRIP_NUM]) + patterns = pd.merge( + patterns, + trips.reset_index()[[TOUR_ID, TRIP_ID, TRIP_NUM, OUTBOUND]], + on=[TOUR_ID, TRIP_NUM], + ) patterns.index = patterns.apply(generate_tour_leg_id, axis=1) patterns.index.name = TOUR_LEG_ID @@ -154,19 +174,22 @@ def get_spec_for_segment(omnibus_spec, segment): return spec -def choose_tour_leg_pattern(trip_segment, - patterns, spec, - trace_label='trace_label'): +def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label="trace_label"): alternatives = generate_alternatives(trip_segment, STOP_TIME_DURATION).sort_index() have_trace_targets = tracing.has_trace_targets(trip_segment) if have_trace_targets: - tracing.trace_df(trip_segment, tracing.extend_trace_label(trace_label, 'choosers')) - tracing.trace_df(alternatives, tracing.extend_trace_label(trace_label, 'alternatives'), - transpose=False) + tracing.trace_df( + trip_segment, tracing.extend_trace_label(trace_label, "choosers") + ) + tracing.trace_df( + alternatives, + tracing.extend_trace_label(trace_label, "alternatives"), + transpose=False, + ) if len(spec.columns) > 1: - raise RuntimeError('spec must have only one column') + raise RuntimeError("spec must have only one column") # - join choosers and alts # in vanilla interaction_simulate interaction_df is cross join of choosers and alternatives @@ -176,49 +199,68 @@ def choose_tour_leg_pattern(trip_segment, # so we just need to left join alternatives with choosers assert alternatives.index.name == trip_segment.index.name - interaction_df = alternatives.join(trip_segment, how='left', rsuffix='_chooser') + interaction_df = alternatives.join(trip_segment, how="left", rsuffix="_chooser") - chunk.log_df(trace_label, 'interaction_df', interaction_df) + chunk.log_df(trace_label, "interaction_df", interaction_df) if have_trace_targets: - trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, trip_segment) - - tracing.trace_df(interaction_df, - tracing.extend_trace_label(trace_label, 'interaction_df'), - transpose=False) + trace_rows, trace_ids = tracing.interaction_trace_rows( + interaction_df, trip_segment + ) + + tracing.trace_df( + interaction_df, + tracing.extend_trace_label(trace_label, "interaction_df"), + transpose=False, + ) else: trace_rows = trace_ids = None - interaction_utilities, trace_eval_results \ - = interaction_simulate.eval_interaction_utilities(spec, interaction_df, None, trace_label, trace_rows, - estimator=None) - - interaction_utilities = pd.concat([interaction_df[STOP_TIME_DURATION], interaction_utilities], axis=1) - chunk.log_df(trace_label, 'interaction_utilities', interaction_utilities) - - interaction_utilities = pd.merge(interaction_utilities.reset_index(), - patterns[patterns[TRIP_ID].isin(interaction_utilities.index)], - on=[TRIP_ID, STOP_TIME_DURATION], how='left') + ( + interaction_utilities, + trace_eval_results, + ) = interaction_simulate.eval_interaction_utilities( + spec, interaction_df, None, trace_label, trace_rows, estimator=None + ) + + interaction_utilities = pd.concat( + [interaction_df[STOP_TIME_DURATION], interaction_utilities], axis=1 + ) + chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) + + interaction_utilities = pd.merge( + interaction_utilities.reset_index(), + patterns[patterns[TRIP_ID].isin(interaction_utilities.index)], + on=[TRIP_ID, STOP_TIME_DURATION], + how="left", + ) if have_trace_targets: - tracing.trace_interaction_eval_results(trace_eval_results, trace_ids, - tracing.extend_trace_label(trace_label, 'eval')) - - tracing.trace_df(interaction_utilities, - tracing.extend_trace_label(trace_label, 'interaction_utilities'), - transpose=False) + tracing.trace_interaction_eval_results( + trace_eval_results, + trace_ids, + tracing.extend_trace_label(trace_label, "eval"), + ) + + tracing.trace_df( + interaction_utilities, + tracing.extend_trace_label(trace_label, "interaction_utilities"), + transpose=False, + ) del interaction_df - chunk.log_df(trace_label, 'interaction_df', None) + chunk.log_df(trace_label, "interaction_df", None) - interaction_utilities = interaction_utilities.groupby([TOUR_ID, OUTBOUND, PATTERN_ID], - as_index=False)[['utility']].sum() + interaction_utilities = interaction_utilities.groupby( + [TOUR_ID, OUTBOUND, PATTERN_ID], as_index=False + )[["utility"]].sum() - interaction_utilities[TOUR_LEG_ID] = \ - interaction_utilities.apply(generate_tour_leg_id, axis=1) + interaction_utilities[TOUR_LEG_ID] = interaction_utilities.apply( + generate_tour_leg_id, axis=1 + ) tour_choosers = interaction_utilities.set_index(TOUR_LEG_ID) - interaction_utilities = tour_choosers[['utility']].copy() + interaction_utilities = tour_choosers[["utility"]].copy() # reshape utilities (one utility column and one row per row in model_design) # to a dataframe with one row per chooser and one column per alternative @@ -226,8 +268,10 @@ def choose_tour_leg_pattern(trip_segment, # so we need to pad with dummy utilities so low that they are never chosen # number of samples per chooser - sample_counts = interaction_utilities.groupby(interaction_utilities.index).size().values - chunk.log_df(trace_label, 'sample_counts', sample_counts) + sample_counts = ( + interaction_utilities.groupby(interaction_utilities.index).size().values + ) + chunk.log_df(trace_label, "sample_counts", sample_counts) # max number of alternatvies for any chooser max_sample_count = sample_counts.max() @@ -242,57 +286,63 @@ def choose_tour_leg_pattern(trip_segment, inserts = np.repeat(last_row_offsets, max_sample_count - sample_counts) del sample_counts - chunk.log_df(trace_label, 'sample_counts', None) + chunk.log_df(trace_label, "sample_counts", None) # insert the zero-prob utilities to pad each alternative set to same size padded_utilities = np.insert(interaction_utilities.utility.values, inserts, -999) del inserts del interaction_utilities - chunk.log_df(trace_label, 'interaction_utilities', None) + chunk.log_df(trace_label, "interaction_utilities", None) # reshape to array with one row per chooser, one column per alternative padded_utilities = padded_utilities.reshape(-1, max_sample_count) - chunk.log_df(trace_label, 'padded_utilities', padded_utilities) + chunk.log_df(trace_label, "padded_utilities", padded_utilities) # convert to a dataframe with one row per chooser and one column per alternative - utilities_df = pd.DataFrame( - padded_utilities, - index=tour_choosers.index.unique()) - chunk.log_df(trace_label, 'utilities_df', utilities_df) + utilities_df = pd.DataFrame(padded_utilities, index=tour_choosers.index.unique()) + chunk.log_df(trace_label, "utilities_df", utilities_df) del padded_utilities - chunk.log_df(trace_label, 'padded_utilities', None) + chunk.log_df(trace_label, "padded_utilities", None) if have_trace_targets: - tracing.trace_df(utilities_df, tracing.extend_trace_label(trace_label, 'utilities'), - column_labels=['alternative', 'utility']) + tracing.trace_df( + utilities_df, + tracing.extend_trace_label(trace_label, "utilities"), + column_labels=["alternative", "utility"], + ) # convert to probabilities (utilities exponentiated and normalized to probs) # probs is same shape as utilities, one row per chooser and one column for alternative - probs = logit.utils_to_probs(utilities_df, - trace_label=trace_label, trace_choosers=trip_segment) + probs = logit.utils_to_probs( + utilities_df, trace_label=trace_label, trace_choosers=trip_segment + ) - chunk.log_df(trace_label, 'probs', probs) + chunk.log_df(trace_label, "probs", probs) del utilities_df - chunk.log_df(trace_label, 'utilities_df', None) + chunk.log_df(trace_label, "utilities_df", None) if have_trace_targets: - tracing.trace_df(probs, tracing.extend_trace_label(trace_label, 'probs'), - column_labels=['alternative', 'probability']) + tracing.trace_df( + probs, + tracing.extend_trace_label(trace_label, "probs"), + column_labels=["alternative", "probability"], + ) # make choices # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample - positions, rands = \ - logit.make_choices(probs, trace_label=trace_label, trace_choosers=trip_segment) + positions, rands = logit.make_choices( + probs, trace_label=trace_label, trace_choosers=trip_segment + ) - chunk.log_df(trace_label, 'positions', positions) - chunk.log_df(trace_label, 'rands', rands) + chunk.log_df(trace_label, "positions", positions) + chunk.log_df(trace_label, "rands", rands) del probs - chunk.log_df(trace_label, 'probs', None) + chunk.log_df(trace_label, "probs", None) # shouldn't have chosen any of the dummy pad utilities assert positions.max() < max_sample_count @@ -304,13 +354,19 @@ def choose_tour_leg_pattern(trip_segment, # resulting pandas Int64Index has one element per chooser row and is in same order as choosers choices = tour_choosers[PATTERN_ID].take(positions + first_row_offsets) - chunk.log_df(trace_label, 'choices', choices) + chunk.log_df(trace_label, "choices", choices) if have_trace_targets: - tracing.trace_df(choices, tracing.extend_trace_label(trace_label, 'choices'), - columns=[None, PATTERN_ID]) - tracing.trace_df(rands, tracing.extend_trace_label(trace_label, 'rands'), - columns=[None, 'rand']) + tracing.trace_df( + choices, + tracing.extend_trace_label(trace_label, "choices"), + columns=[None, PATTERN_ID], + ) + tracing.trace_df( + rands, + tracing.extend_trace_label(trace_label, "rands"), + columns=[None, "rand"], + ) return choices @@ -321,116 +377,139 @@ def apply_stage_two_model(omnibus_spec, trips, chunk_size, trace_label): trips = trips.sort_index() # Assign the duration of the appropriate leg to the trip - trips[TRIP_DURATION] = np.where(trips[OUTBOUND], trips[OB_DURATION], trips[IB_DURATION]) + trips[TRIP_DURATION] = np.where( + trips[OUTBOUND], trips[OB_DURATION], trips[IB_DURATION] + ) - trips['depart'] = -1 + trips["depart"] = -1 # If this is the first outbound trip, the choice is easy, assign the depart time # to equal the tour start time. - trips.loc[(trips['trip_num'] == 1) & (trips[OUTBOUND]), 'depart'] = trips['start'] + trips.loc[(trips["trip_num"] == 1) & (trips[OUTBOUND]), "depart"] = trips["start"] # If its the first return leg, it is easy too. Just assign the trip start time to the # end time minus the IB duration - trips.loc[(trips['trip_num'] == 1) & (~trips[OUTBOUND]), 'depart'] = trips['end'] - trips[IB_DURATION] + trips.loc[(trips["trip_num"] == 1) & (~trips[OUTBOUND]), "depart"] = ( + trips["end"] - trips[IB_DURATION] + ) # The last leg of the outbound tour needs to begin at the start plus OB duration - trips.loc[(trips['trip_count'] == trips['trip_num']) & (trips[OUTBOUND]), 'depart'] = \ - trips['start'] + trips[OB_DURATION] + trips.loc[ + (trips["trip_count"] == trips["trip_num"]) & (trips[OUTBOUND]), "depart" + ] = (trips["start"] + trips[OB_DURATION]) # The last leg of the inbound tour needs to begin at the end time of the tour - trips.loc[(trips['trip_count'] == trips['trip_num']) & (~trips[OUTBOUND]), 'depart'] = \ - trips['end'] + trips.loc[ + (trips["trip_count"] == trips["trip_num"]) & (~trips[OUTBOUND]), "depart" + ] = trips["end"] # Slice off the remaining trips with an intermediate stops to deal with. # Hopefully, with the tricks above we've sliced off a lot of choices. # This slice should only include trip numbers greater than 2 since the - side_trips = trips[(trips['trip_num'] != 1) & (trips['trip_count'] != trips['trip_num'])] + side_trips = trips[ + (trips["trip_num"] != 1) & (trips["trip_count"] != trips["trip_num"]) + ] # No processing needs to be done because we have simple trips / tours if side_trips.empty: - assert trips['depart'].notnull().all - return trips['depart'].astype(int) + assert trips["depart"].notnull().all + return trips["depart"].astype(int) # Get the potential time windows - time_windows = get_time_windows(side_trips[TRIP_DURATION].max(), side_trips[TRIP_COUNT].max() - 1) + time_windows = get_time_windows( + side_trips[TRIP_DURATION].max(), side_trips[TRIP_COUNT].max() - 1 + ) trip_list = [] - for i, chooser_chunk, chunk_trace_label in \ - chunk.adaptive_chunked_choosers_by_chunk_id(side_trips, chunk_size, trace_label): + for ( + i, + chooser_chunk, + chunk_trace_label, + ) in chunk.adaptive_chunked_choosers_by_chunk_id( + side_trips, chunk_size, trace_label + ): for is_outbound, trip_segment in chooser_chunk.groupby(OUTBOUND): - direction = OUTBOUND if is_outbound else 'inbound' + direction = OUTBOUND if is_outbound else "inbound" spec = get_spec_for_segment(omnibus_spec, direction) - segment_trace_label = '{}_{}'.format(direction, chunk_trace_label) + segment_trace_label = "{}_{}".format(direction, chunk_trace_label) patterns = build_patterns(trip_segment, time_windows) - choices = choose_tour_leg_pattern(trip_segment, - patterns, spec, trace_label=segment_trace_label) + choices = choose_tour_leg_pattern( + trip_segment, patterns, spec, trace_label=segment_trace_label + ) - choices = pd.merge(choices.reset_index(), patterns.reset_index(), - on=[TOUR_LEG_ID, PATTERN_ID], how='left') + choices = pd.merge( + choices.reset_index(), + patterns.reset_index(), + on=[TOUR_LEG_ID, PATTERN_ID], + how="left", + ) - choices = choices[['trip_id', 'stop_time_duration']].copy() + choices = choices[["trip_id", "stop_time_duration"]].copy() trip_list.append(choices) - trip_list = pd.concat(trip_list, sort=True).set_index('trip_id') - trips['stop_time_duration'] = 0 + trip_list = pd.concat(trip_list, sort=True).set_index("trip_id") + trips["stop_time_duration"] = 0 trips.update(trip_list) - trips.loc[trips['trip_num'] == 1, 'stop_time_duration'] = trips['depart'] - trips.sort_values(['tour_id', 'outbound', 'trip_num']) - trips['stop_time_duration'] = trips.groupby(['tour_id', 'outbound'])['stop_time_duration'].cumsum() - trips.loc[trips['trip_num'] != trips['trip_count'], 'depart'] = trips['stop_time_duration'] - return trips['depart'].astype(int) + trips.loc[trips["trip_num"] == 1, "stop_time_duration"] = trips["depart"] + trips.sort_values(["tour_id", "outbound", "trip_num"]) + trips["stop_time_duration"] = trips.groupby(["tour_id", "outbound"])[ + "stop_time_duration" + ].cumsum() + trips.loc[trips["trip_num"] != trips["trip_count"], "depart"] = trips[ + "stop_time_duration" + ] + return trips["depart"].astype(int) @inject.step() -def trip_departure_choice( - trips, - trips_merged, - skim_dict, - chunk_size, - trace_hh_id): +def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_id): - trace_label = 'trip_departure_choice' - model_settings = config.read_model_settings('trip_departure_choice.yaml') + trace_label = "trip_departure_choice" + model_settings = config.read_model_settings("trip_departure_choice.yaml") - spec = simulate.read_model_spec(file_name=model_settings['SPECIFICATION']) + spec = simulate.read_model_spec(file_name=model_settings["SPECIFICATION"]) trips_merged_df = trips_merged.to_frame() # add tour-based chunk_id so we can chunk all trips in tour together tour_ids = trips_merged[TOUR_ID].unique() - trips_merged_df['chunk_id'] = reindex(pd.Series(list(range(len(tour_ids))), tour_ids), trips_merged_df.tour_id) + trips_merged_df["chunk_id"] = reindex( + pd.Series(list(range(len(tour_ids))), tour_ids), trips_merged_df.tour_id + ) max_tour_id = trips_merged[TOUR_ID].max() - trip_departure_choice.MAX_TOUR_ID = int(np.power(10, np.ceil(np.log10(max_tour_id)))) + trip_departure_choice.MAX_TOUR_ID = int( + np.power(10, np.ceil(np.log10(max_tour_id))) + ) locals_d = config.get_model_constants(model_settings).copy() - preprocessor_settings = model_settings.get('PREPROCESSOR', None) + preprocessor_settings = model_settings.get("PREPROCESSOR", None) tour_legs = get_tour_legs(trips_merged_df) - pipeline.get_rn_generator().add_channel('tour_legs', tour_legs) + pipeline.get_rn_generator().add_channel("tour_legs", tour_legs) if preprocessor_settings: - od_skim = skim_dict.wrap('origin', 'destination') - do_skim = skim_dict.wrap('destination', 'origin') + od_skim = skim_dict.wrap("origin", "destination") + do_skim = skim_dict.wrap("destination", "origin") skims = [od_skim, do_skim] simulate.set_skim_wrapper_targets(trips_merged_df, skims) - locals_d.update({ - "od_skims": od_skim, - "do_skims": do_skim, - }) + locals_d.update( + {"od_skims": od_skim, "do_skims": do_skim,} + ) expressions.assign_columns( df=trips_merged_df, model_settings=preprocessor_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) choices = apply_stage_two_model(spec, trips_merged_df, chunk_size, trace_label) @@ -438,6 +517,6 @@ def trip_departure_choice( trip_length = len(trips_df) trips_df = pd.concat([trips_df, choices], axis=1) assert len(trips_df) == trip_length - assert trips_df[trips_df['depart'].isnull()].empty + assert trips_df[trips_df["depart"].isnull()].empty pipeline.replace_table("trips", trips_df) diff --git a/activitysim/abm/models/trip_destination.py b/activitysim/abm/models/trip_destination.py index 30a2521f61..eb6075aa96 100644 --- a/activitysim/abm/models/trip_destination.py +++ b/activitysim/abm/models/trip_destination.py @@ -1,63 +1,58 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range - import logging +from builtins import range import numpy as np import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import los -from activitysim.core import assign -from activitysim.core import expressions - -from activitysim.core.tracing import print_elapsed_time - -from activitysim.core.util import reindex -from activitysim.core.util import assign_in_place - -from activitysim.core.pathbuilder import TransitVirtualPathBuilder - +from activitysim.abm.models.util.trip import ( + cleanup_failed_trips, + flag_failed_trip_leg_mates, +) from activitysim.abm.tables.size_terms import tour_destination_size_terms - -from activitysim.core.skim_dictionary import DataFrameMatrix - -from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.core import ( + assign, + config, + expressions, + inject, + los, + pipeline, + simulate, + tracing, +) from activitysim.core.interaction_sample import interaction_sample +from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.core.pathbuilder import TransitVirtualPathBuilder +from activitysim.core.skim_dictionary import DataFrameMatrix +from activitysim.core.tracing import print_elapsed_time +from activitysim.core.util import assign_in_place, reindex from .util import estimation -from activitysim.abm.models.util.trip import cleanup_failed_trips -from activitysim.abm.models.util.trip import flag_failed_trip_leg_mates - - logger = logging.getLogger(__name__) NO_DESTINATION = -1 -TRIP_ORIG_TAZ = 'TAZ' -ALT_DEST_TAZ = 'ALT_DEST_TAZ' -PRIMARY_DEST_TAZ = 'PRIMARY_DEST_TAZ' -DEST_MAZ = 'dest_maz' +TRIP_ORIG_TAZ = "TAZ" +ALT_DEST_TAZ = "ALT_DEST_TAZ" +PRIMARY_DEST_TAZ = "PRIMARY_DEST_TAZ" +DEST_MAZ = "dest_maz" def _destination_sample( - primary_purpose, - trips, - alternatives, - model_settings, - size_term_matrix, - skims, - alt_dest_col_name, - estimator, - chunk_size, - chunk_tag, - trace_label): + primary_purpose, + trips, + alternatives, + model_settings, + size_term_matrix, + skims, + alt_dest_col_name, + estimator, + chunk_size, + chunk_tag, + trace_label, +): """ Note: trips with no viable destination receive no sample rows @@ -74,13 +69,22 @@ def _destination_sample( 102829169 3193 0.002628 1 """ - spec = simulate.spec_for_segment(model_settings, spec_id='DESTINATION_SAMPLE_SPEC', - segment_name=primary_purpose, estimator=estimator) + spec = simulate.spec_for_segment( + model_settings, + spec_id="DESTINATION_SAMPLE_SPEC", + segment_name=primary_purpose, + estimator=estimator, + ) - sample_size = model_settings['SAMPLE_SIZE'] - if config.setting('disable_destination_sampling', False) or (estimator and estimator.want_unsampled_alternatives): + sample_size = model_settings["SAMPLE_SIZE"] + if config.setting("disable_destination_sampling", False) or ( + estimator and estimator.want_unsampled_alternatives + ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count - logger.info("Estimation mode for %s using unsampled alternatives short_circuit_choices" % (trace_label,)) + logger.info( + "Estimation mode for %s using unsampled alternatives short_circuit_choices" + % (trace_label,) + ) sample_size = 0 locals_dict = config.get_model_constants(model_settings).copy() @@ -90,12 +94,10 @@ def _destination_sample( # cannot be determined until after choosers are joined with alternatives # (unless we iterate over trip.purpose - which we could, though we are already iterating over trip_num) # so, instead, expressions determine row-specific size_term by a call to: size_terms.get(df.alt_dest, df.purpose) - locals_dict.update({ - 'size_terms': size_term_matrix - }) + locals_dict.update({"size_terms": size_term_matrix}) locals_dict.update(skims) - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_sample( choosers=trips, @@ -107,28 +109,30 @@ def _destination_sample( spec=spec, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, chunk_tag=chunk_tag, - trace_label=trace_label - ) + chunk_size=chunk_size, + chunk_tag=chunk_tag, + trace_label=trace_label, + ) return choices def destination_sample( - primary_purpose, - trips, - alternatives, - model_settings, - size_term_matrix, - skim_hotel, - estimator, - chunk_size, - trace_label): - - chunk_tag = 'trip_destination.sample' + primary_purpose, + trips, + alternatives, + model_settings, + size_term_matrix, + skim_hotel, + estimator, + chunk_size, + trace_label, +): + + chunk_tag = "trip_destination.sample" skims = skim_hotel.sample_skims(presample=False) - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _destination_sample( primary_purpose, @@ -141,7 +145,8 @@ def destination_sample( estimator, chunk_size, chunk_tag=chunk_tag, - trace_label=trace_label) + trace_label=trace_label, + ) return choices @@ -159,7 +164,9 @@ def aggregate_size_term_matrix(maz_size_term_matrix, maz_taz): return maz_size_term_matrix -def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_col_name, trace_label): +def choose_MAZ_for_TAZ( + taz_sample, MAZ_size_terms, trips, network_los, alt_dest_col_name, trace_label +): """ Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ choose MAZ probabilistically (proportionally by size_term) from set of MAZ zones in parent TAZ @@ -189,13 +196,15 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_ trace_hh_id = inject.get_injectable("trace_hh_id", None) have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample) if have_trace_targets: - trace_label = tracing.extend_trace_label(trace_label, 'choose_MAZ_for_TAZ') + trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") # write taz choices, pick_counts, probs trace_targets = tracing.trace_targets(taz_sample) - tracing.trace_df(taz_sample[trace_targets], - label=tracing.extend_trace_label(trace_label, 'taz_sample'), - transpose=False) + tracing.trace_df( + taz_sample[trace_targets], + label=tracing.extend_trace_label(trace_label, "taz_sample"), + transpose=False, + ) # print(f"taz_sample\n{taz_sample}") # alt_dest_TAZ prob pick_count @@ -203,9 +212,11 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_ # 4343721 12 0.000054 1 # 4343721 20 0.001864 2 - taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False) - taz_choices = taz_choices.reindex(taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True) - taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'}) + taz_choices = taz_sample[[DEST_TAZ, "prob"]].reset_index(drop=False) + taz_choices = taz_choices.reindex( + taz_choices.index.repeat(taz_sample.pick_count) + ).reset_index(drop=True) + taz_choices = taz_choices.rename(columns={"prob": "TAZ_prob"}) # print(f"taz_choices\n{taz_choices}") # trip_id alt_dest_TAZ prob @@ -221,7 +232,9 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_ # 4 0.0 1.879 0.023 0.000 0.023 0.023 5.796 0.023 # just to make it clear we are siloing choices by chooser_id - chooser_id_col = taz_sample.index.name # should be canonical chooser index name (e.g. 'trip_id') + chooser_id_col = ( + taz_sample.index.name + ) # should be canonical chooser index name (e.g. 'trip_id') # for random_for_df, we need df with de-duplicated chooser canonical index chooser_df = pd.DataFrame(index=taz_sample.index[~taz_sample.index.duplicated()]) @@ -233,20 +246,27 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_ taz_sample_size = taz_choices.groupby(chooser_id_col)[DEST_TAZ].count().max() # taz_choices index values should be contiguous - assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index, taz_sample_size)).all() + assert ( + taz_choices[chooser_id_col] == np.repeat(chooser_df.index, taz_sample_size) + ).all() # we need to choose a MAZ for each DEST_TAZ choice # probability of choosing MAZ based on MAZ size_term fraction of TAZ total # there will be a different set (and number) of candidate MAZs for each TAZ # (preserve index, which will have duplicates as result of join) - maz_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].rename(columns={'TAZ': DEST_TAZ, 'MAZ': DEST_MAZ}) - maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(), - maz_taz, - how='left', on=DEST_TAZ).set_index('index') + maz_taz = network_los.maz_taz_df[["MAZ", "TAZ"]].rename( + columns={"TAZ": DEST_TAZ, "MAZ": DEST_MAZ} + ) + maz_sizes = pd.merge( + taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(), + maz_taz, + how="left", + on=DEST_TAZ, + ).set_index("index") - purpose = maz_sizes['trip_id'].map(trips.purpose) # size term varies by purpose - maz_sizes['size_term'] = MAZ_size_terms.get(maz_sizes[DEST_MAZ], purpose) + purpose = maz_sizes["trip_id"].map(trips.purpose) # size term varies by purpose + maz_sizes["size_term"] = MAZ_size_terms.get(maz_sizes[DEST_MAZ], purpose) # print(f"maz_sizes\n{maz_sizes}") # trip_id alt_dest_TAZ alt_dest size_term @@ -257,11 +277,13 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_ if have_trace_targets: # write maz_sizes: maz_sizes[index,trip_id,dest_TAZ,zone_id,size_term] - maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer='trip_id') + maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer="trip_id") trace_maz_sizes = maz_sizes[maz_sizes_trace_targets] - tracing.trace_df(trace_maz_sizes, - label=tracing.extend_trace_label(trace_label, 'maz_sizes'), - transpose=False) + tracing.trace_df( + trace_maz_sizes, + label=tracing.extend_trace_label(trace_label, "maz_sizes"), + transpose=False, + ) # number of DEST_TAZ candidates per chooser maz_counts = maz_sizes.groupby(maz_sizes.index).size().values @@ -289,7 +311,11 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_ maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1)) assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count) - rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size).reshape(-1, 1) + rands = ( + pipeline.get_rn_generator() + .random_for_df(chooser_df, n=taz_sample_size) + .reshape(-1, 1) + ) assert len(rands) == num_choosers * taz_sample_size assert len(rands) == maz_probs.shape[0] @@ -302,49 +328,79 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_ assert (positions < maz_counts).all() taz_choices[DEST_MAZ] = maz_sizes[DEST_MAZ].take(positions + first_row_offsets) - taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]), positions] - taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob'] + taz_choices["MAZ_prob"] = maz_probs[np.arange(maz_probs.shape[0]), positions] + taz_choices["prob"] = taz_choices["TAZ_prob"] * taz_choices["MAZ_prob"] if have_trace_targets: - taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer='trip_id') + taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer="trip_id") trace_taz_choices_df = taz_choices[taz_choices_trace_targets] - tracing.trace_df(trace_taz_choices_df, - label=tracing.extend_trace_label(trace_label, 'taz_choices'), - transpose=False) + tracing.trace_df( + trace_taz_choices_df, + label=tracing.extend_trace_label(trace_label, "taz_choices"), + transpose=False, + ) - lhs_df = trace_taz_choices_df[['trip_id', DEST_TAZ]] - alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)] + lhs_df = trace_taz_choices_df[["trip_id", DEST_TAZ]] + alt_dest_columns = [f"dest_maz_{c}" for c in range(max_maz_count)] # following the same logic as the full code, but for trace cutout trace_maz_counts = maz_counts[taz_choices_trace_targets] trace_last_row_offsets = maz_counts[taz_choices_trace_targets].cumsum() - trace_inserts = np.repeat(trace_last_row_offsets, max_maz_count - trace_maz_counts) + trace_inserts = np.repeat( + trace_last_row_offsets, max_maz_count - trace_maz_counts + ) # trace dest_maz_alts - padded_maz_sizes = np.insert(trace_maz_sizes[DEST_MAZ].values, trace_inserts, 0.0).reshape(-1, max_maz_count) - df = pd.DataFrame(data=padded_maz_sizes, - columns=alt_dest_columns, index=trace_taz_choices_df.index) + padded_maz_sizes = np.insert( + trace_maz_sizes[DEST_MAZ].values, trace_inserts, 0.0 + ).reshape(-1, max_maz_count) + df = pd.DataFrame( + data=padded_maz_sizes, + columns=alt_dest_columns, + index=trace_taz_choices_df.index, + ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_alts'), transpose=False) + tracing.trace_df( + df, + label=tracing.extend_trace_label(trace_label, "dest_maz_alts"), + transpose=False, + ) # trace dest_maz_size_terms - padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values, trace_inserts, 0.0).reshape(-1, max_maz_count) - df = pd.DataFrame(data=padded_maz_sizes, - columns=alt_dest_columns, index=trace_taz_choices_df.index) + padded_maz_sizes = np.insert( + trace_maz_sizes["size_term"].values, trace_inserts, 0.0 + ).reshape(-1, max_maz_count) + df = pd.DataFrame( + data=padded_maz_sizes, + columns=alt_dest_columns, + index=trace_taz_choices_df.index, + ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_size_terms'), transpose=False) + tracing.trace_df( + df, + label=tracing.extend_trace_label(trace_label, "dest_maz_size_terms"), + transpose=False, + ) # trace dest_maz_probs - df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets], - columns=alt_dest_columns, index=trace_taz_choices_df.index) + df = pd.DataFrame( + data=maz_probs[taz_choices_trace_targets], + columns=alt_dest_columns, + index=trace_taz_choices_df.index, + ) df = pd.concat([lhs_df, df], axis=1) - df['rand'] = rands[taz_choices_trace_targets] - tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_probs'), transpose=False) + df["rand"] = rands[taz_choices_trace_targets] + tracing.trace_df( + df, + label=tracing.extend_trace_label(trace_label, "dest_maz_probs"), + transpose=False, + ) - taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob']) - taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ]).agg(prob=('prob', 'max'), - pick_count=('prob', 'count')) + taz_choices = taz_choices.drop(columns=["TAZ_prob", "MAZ_prob"]) + taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ]).agg( + prob=("prob", "max"), pick_count=("prob", "count") + ) taz_choices.reset_index(level=DEST_MAZ, inplace=True) @@ -352,27 +408,29 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_ def destination_presample( - primary_purpose, - trips, - alternatives, - model_settings, - size_term_matrix, - skim_hotel, - network_los, - estimator, - chunk_size, trace_hh_id, - trace_label): - - trace_label = tracing.extend_trace_label(trace_label, 'presample') - chunk_tag = 'trip_destination.presample' # distinguish from trip_destination.sample - - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] - maz_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].set_index('MAZ').TAZ + primary_purpose, + trips, + alternatives, + model_settings, + size_term_matrix, + skim_hotel, + network_los, + estimator, + chunk_size, + trace_hh_id, + trace_label, +): + + trace_label = tracing.extend_trace_label(trace_label, "presample") + chunk_tag = "trip_destination.presample" # distinguish from trip_destination.sample + + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] + maz_taz = network_los.maz_taz_df[["MAZ", "TAZ"]].set_index("MAZ").TAZ TAZ_size_term_matrix = aggregate_size_term_matrix(size_term_matrix, maz_taz) - TRIP_ORIGIN = model_settings['TRIP_ORIGIN'] - PRIMARY_DEST = model_settings['PRIMARY_DEST'] + TRIP_ORIGIN = model_settings["TRIP_ORIGIN"] + PRIMARY_DEST = model_settings["PRIMARY_DEST"] trips = trips.copy() trips[TRIP_ORIGIN] = trips[TRIP_ORIGIN].map(maz_taz) @@ -395,10 +453,13 @@ def destination_presample( estimator, chunk_size, chunk_tag=chunk_tag, - trace_label=trace_label) + trace_label=trace_label, + ) # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total - maz_sample = choose_MAZ_for_TAZ(taz_sample, size_term_matrix, trips, network_los, alt_dest_col_name, trace_label) + maz_sample = choose_MAZ_for_TAZ( + taz_sample, size_term_matrix, trips, network_los, alt_dest_col_name, trace_label + ) assert alt_dest_col_name in maz_sample @@ -406,15 +467,17 @@ def destination_presample( def trip_destination_sample( - primary_purpose, - trips, - alternatives, - model_settings, - size_term_matrix, - skim_hotel, - estimator, - chunk_size, trace_hh_id, - trace_label): + primary_purpose, + trips, + alternatives, + model_settings, + size_term_matrix, + skim_hotel, + estimator, + chunk_size, + trace_hh_id, + trace_label, +): """ Returns @@ -431,22 +494,27 @@ def trip_destination_sample( pick_count : int number of duplicate picks for chooser, alt """ - trace_label = tracing.extend_trace_label(trace_label, 'sample') + trace_label = tracing.extend_trace_label(trace_label, "sample") assert len(trips) > 0 assert len(alternatives) > 0 # by default, enable presampling for multizone systems, unless they disable it in settings file - network_los = inject.get_injectable('network_los') + network_los = inject.get_injectable("network_los") pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting('want_dest_choice_presampling', True): + if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): pre_sample_taz = False - logger.info(f"Disabled destination zone presampling for {trace_label} " - f"because 'want_dest_choice_presampling' setting is False") + logger.info( + f"Disabled destination zone presampling for {trace_label} " + f"because 'want_dest_choice_presampling' setting is False" + ) if pre_sample_taz: - logger.info("Running %s trip_destination_presample with %d trips" % (trace_label, len(trips))) + logger.info( + "Running %s trip_destination_presample with %d trips" + % (trace_label, len(trips)) + ) choices = destination_presample( primary_purpose, @@ -457,8 +525,10 @@ def trip_destination_sample( skim_hotel, network_los, estimator, - chunk_size, trace_hh_id, - trace_label) + chunk_size, + trace_hh_id, + trace_label, + ) else: choices = destination_sample( @@ -470,20 +540,23 @@ def trip_destination_sample( skim_hotel, estimator, chunk_size, - trace_label) + trace_label, + ) return choices def compute_ood_logsums( - choosers, - logsum_settings, - nest_spec, logsum_spec, - od_skims, - locals_dict, - chunk_size, - trace_label, - chunk_tag): + choosers, + logsum_settings, + nest_spec, + logsum_spec, + od_skims, + locals_dict, + chunk_size, + trace_label, + chunk_tag, +): """ Compute one (of two) out-of-direction logsums for destination alternatives @@ -493,9 +566,8 @@ def compute_ood_logsums( locals_dict.update(od_skims) expressions.annotate_preprocessors( - choosers, locals_dict, od_skims, - logsum_settings, - trace_label) + choosers, locals_dict, od_skims, logsum_settings, trace_label + ) logsums = simulate.simple_simulate_logsums( choosers, @@ -505,7 +577,8 @@ def compute_ood_logsums( locals_d=locals_dict, chunk_size=chunk_size, trace_label=trace_label, - chunk_tag=chunk_tag) + chunk_tag=chunk_tag, + ) assert logsums.index.equals(choosers.index) @@ -516,14 +589,15 @@ def compute_ood_logsums( def compute_logsums( - primary_purpose, - trips, - destination_sample, - tours_merged, - model_settings, - skim_hotel, - chunk_size, - trace_label): + primary_purpose, + trips, + destination_sample, + tours_merged, + model_settings, + skim_hotel, + chunk_size, + trace_label, +): """ Calculate mode choice logsums using the same recipe as for trip_mode_choice, but do it twice for each alternative since we need out-of-direction logsum @@ -533,41 +607,40 @@ def compute_logsums( ------- adds od_logsum and dp_logsum columns to trips (in place) """ - trace_label = tracing.extend_trace_label(trace_label, 'compute_logsums') + trace_label = tracing.extend_trace_label(trace_label, "compute_logsums") logger.info("Running %s with %d samples", trace_label, destination_sample.shape[0]) # chunk usage is uniform so better to combine - chunk_tag = 'trip_destination.compute_logsums' + chunk_tag = "trip_destination.compute_logsums" # FIXME should pass this in? - network_los = inject.get_injectable('network_los') + network_los = inject.get_injectable("network_los") # - trips_merged - merge trips and tours_merged trips_merged = pd.merge( - trips, - tours_merged, - left_on='tour_id', - right_index=True, - how="left") + trips, tours_merged, left_on="tour_id", right_index=True, how="left" + ) assert trips_merged.index.equals(trips.index) # - choosers - merge destination_sample and trips_merged # re/set index because pandas merge does not preserve left index if it has duplicate values! - choosers = pd.merge(destination_sample, - trips_merged.reset_index(), - left_index=True, - right_on='trip_id', - how="left", - suffixes=('', '_r')).set_index('trip_id') + choosers = pd.merge( + destination_sample, + trips_merged.reset_index(), + left_index=True, + right_on="trip_id", + how="left", + suffixes=("", "_r"), + ).set_index("trip_id") assert choosers.index.equals(destination_sample.index) - logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS']) + logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) coefficients = simulate.get_segment_coefficients(logsum_settings, primary_purpose) nest_spec = config.get_logit_model_settings(logsum_settings) nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) - logsum_spec = simulate.read_model_spec(file_name=logsum_settings['SPEC']) + logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) logsum_spec = simulate.eval_coefficients(logsum_spec, coefficients, estimator=None) locals_dict = {} @@ -579,69 +652,81 @@ def compute_logsums( skims = skim_hotel.logsum_skims() if network_los.zone_system == los.THREE_ZONE: # TVPB constants can appear in expressions - locals_dict.update(network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS')) + locals_dict.update( + network_los.setting("TVPB_SETTINGS.tour_mode_choice.CONSTANTS") + ) # - od_logsums od_skims = { - 'ORIGIN': model_settings['TRIP_ORIGIN'], - 'DESTINATION': model_settings['ALT_DEST_COL_NAME'], - "odt_skims": skims['odt_skims'], - "dot_skims": skims['dot_skims'], - "od_skims": skims['od_skims'], + "ORIGIN": model_settings["TRIP_ORIGIN"], + "DESTINATION": model_settings["ALT_DEST_COL_NAME"], + "odt_skims": skims["odt_skims"], + "dot_skims": skims["dot_skims"], + "od_skims": skims["od_skims"], } if network_los.zone_system == los.THREE_ZONE: - od_skims.update({ - 'tvpb_logsum_odt': skims['tvpb_logsum_odt'], - 'tvpb_logsum_dot': skims['tvpb_logsum_dot'] - }) - destination_sample['od_logsum'] = compute_ood_logsums( + od_skims.update( + { + "tvpb_logsum_odt": skims["tvpb_logsum_odt"], + "tvpb_logsum_dot": skims["tvpb_logsum_dot"], + } + ) + destination_sample["od_logsum"] = compute_ood_logsums( choosers, logsum_settings, - nest_spec, logsum_spec, + nest_spec, + logsum_spec, od_skims, locals_dict, chunk_size, - trace_label=tracing.extend_trace_label(trace_label, 'od'), - chunk_tag=chunk_tag) + trace_label=tracing.extend_trace_label(trace_label, "od"), + chunk_tag=chunk_tag, + ) # - dp_logsums dp_skims = { - 'ORIGIN': model_settings['ALT_DEST_COL_NAME'], - 'DESTINATION': model_settings['PRIMARY_DEST'], - "odt_skims": skims['dpt_skims'], - "dot_skims": skims['pdt_skims'], - "od_skims": skims['dp_skims'], + "ORIGIN": model_settings["ALT_DEST_COL_NAME"], + "DESTINATION": model_settings["PRIMARY_DEST"], + "odt_skims": skims["dpt_skims"], + "dot_skims": skims["pdt_skims"], + "od_skims": skims["dp_skims"], } if network_los.zone_system == los.THREE_ZONE: - dp_skims.update({ - 'tvpb_logsum_odt': skims['tvpb_logsum_dpt'], - 'tvpb_logsum_dot': skims['tvpb_logsum_pdt'] - }) + dp_skims.update( + { + "tvpb_logsum_odt": skims["tvpb_logsum_dpt"], + "tvpb_logsum_dot": skims["tvpb_logsum_pdt"], + } + ) - destination_sample['dp_logsum'] = compute_ood_logsums( + destination_sample["dp_logsum"] = compute_ood_logsums( choosers, logsum_settings, - nest_spec, logsum_spec, + nest_spec, + logsum_spec, dp_skims, locals_dict, chunk_size, - trace_label=tracing.extend_trace_label(trace_label, 'dp'), - chunk_tag=chunk_tag) + trace_label=tracing.extend_trace_label(trace_label, "dp"), + chunk_tag=chunk_tag, + ) return destination_sample def trip_destination_simulate( - primary_purpose, - trips, - destination_sample, - model_settings, - want_logsums, - size_term_matrix, - skim_hotel, - estimator, - chunk_size, trace_hh_id, - trace_label): + primary_purpose, + trips, + destination_sample, + model_settings, + want_logsums, + size_term_matrix, + skim_hotel, + estimator, + chunk_size, + trace_hh_id, + trace_label, +): """ Chose destination from destination_sample (with od_logsum and dp_logsum columns added) @@ -651,28 +736,30 @@ def trip_destination_simulate( choices - pandas.Series destination alt chosen """ - trace_label = tracing.extend_trace_label(trace_label, 'trip_dest_simulate') - chunk_tag = 'trip_destination.simulate' + trace_label = tracing.extend_trace_label(trace_label, "trip_dest_simulate") + chunk_tag = "trip_destination.simulate" - spec = simulate.spec_for_segment(model_settings, spec_id='DESTINATION_SPEC', - segment_name=primary_purpose, estimator=estimator) + spec = simulate.spec_for_segment( + model_settings, + spec_id="DESTINATION_SPEC", + segment_name=primary_purpose, + estimator=estimator, + ) if estimator: estimator.write_choosers(trips) - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] logger.info("Running trip_destination_simulate with %d trips", len(trips)) skims = skim_hotel.sample_skims(presample=False) locals_dict = config.get_model_constants(model_settings).copy() - locals_dict.update({ - 'size_terms': size_term_matrix - }) + locals_dict.update({"size_terms": size_term_matrix}) locals_dict.update(skims) - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) destinations = interaction_sample_simulate( choosers=trips, @@ -681,24 +768,29 @@ def trip_destination_simulate( choice_column=alt_dest_col_name, log_alt_losers=log_alt_losers, want_logsums=want_logsums, - allow_zero_probs=True, zero_prob_choice_val=NO_DESTINATION, + allow_zero_probs=True, + zero_prob_choice_val=NO_DESTINATION, skims=skims, locals_d=locals_dict, - chunk_size=chunk_size, chunk_tag=chunk_tag, + chunk_size=chunk_size, + chunk_tag=chunk_tag, trace_label=trace_label, - trace_choice_name='trip_dest', - estimator=estimator) + trace_choice_name="trip_dest", + estimator=estimator, + ) if not want_logsums: # for consistency, always return a dataframe with canonical column name assert isinstance(destinations, pd.Series) - destinations = destinations.to_frame('choice') + destinations = destinations.to_frame("choice") if estimator: # need to overwrite choices here before any failed choices are suppressed estimator.write_choices(destinations.choice) - destinations.choice = estimator.get_survey_values(destinations.choice, 'trips', 'destination') + destinations.choice = estimator.get_survey_values( + destinations.choice, "trips", "destination" + ) estimator.write_override_choices(destinations.choice) # drop any failed zero_prob destinations @@ -710,17 +802,20 @@ def trip_destination_simulate( def choose_trip_destination( - primary_purpose, - trips, - alternatives, - tours_merged, - model_settings, - want_logsums, - want_sample_table, - size_term_matrix, skim_hotel, - estimator, - chunk_size, trace_hh_id, - trace_label): + primary_purpose, + trips, + alternatives, + tours_merged, + model_settings, + want_logsums, + want_sample_table, + size_term_matrix, + skim_hotel, + estimator, + chunk_size, + trace_hh_id, + trace_label, +): logger.info("choose_trip_destination %s with %d trips", trace_label, trips.shape[0]) @@ -735,20 +830,24 @@ def choose_trip_destination( size_term_matrix=size_term_matrix, skim_hotel=skim_hotel, estimator=estimator, - chunk_size=chunk_size, trace_hh_id=trace_hh_id, - trace_label=trace_label) + chunk_size=chunk_size, + trace_hh_id=trace_hh_id, + trace_label=trace_label, + ) dropped_trips = ~trips.index.isin(destination_sample.index.unique()) if dropped_trips.any(): - logger.warning("%s trip_destination_sample %s trips " - "without viable destination alternatives" % - (trace_label, dropped_trips.sum())) + logger.warning( + "%s trip_destination_sample %s trips " + "without viable destination alternatives" + % (trace_label, dropped_trips.sum()) + ) trips = trips[~dropped_trips] t0 = print_elapsed_time("%s.trip_destination_sample" % trace_label, t0) if trips.empty: - return pd.Series(index=trips.index).to_frame('choice'), None + return pd.Series(index=trips.index).to_frame("choice"), None # - compute logsums destination_sample = compute_logsums( @@ -759,7 +858,8 @@ def choose_trip_destination( model_settings=model_settings, skim_hotel=skim_hotel, chunk_size=chunk_size, - trace_label=trace_label) + trace_label=trace_label, + ) t0 = print_elapsed_time("%s.compute_logsums" % trace_label, t0) @@ -773,18 +873,24 @@ def choose_trip_destination( size_term_matrix=size_term_matrix, skim_hotel=skim_hotel, estimator=estimator, - chunk_size=chunk_size, trace_hh_id=trace_hh_id, - trace_label=trace_label) + chunk_size=chunk_size, + trace_hh_id=trace_hh_id, + trace_label=trace_label, + ) dropped_trips = ~trips.index.isin(destinations.index) if dropped_trips.any(): - logger.warning("%s trip_destination_simulate %s trips " - "without viable destination alternatives" % - (trace_label, dropped_trips.sum())) + logger.warning( + "%s trip_destination_simulate %s trips " + "without viable destination alternatives" + % (trace_label, dropped_trips.sum()) + ) if want_sample_table: # FIXME - sample_table - destination_sample.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) + destination_sample.set_index( + model_settings["ALT_DEST_COL_NAME"], append=True, inplace=True + ) else: destination_sample = None @@ -794,51 +900,65 @@ def choose_trip_destination( class SkimHotel(object): - def __init__(self, model_settings, network_los, trace_label): self.model_settings = model_settings - self.trace_label = tracing.extend_trace_label(trace_label, 'skim_hotel') + self.trace_label = tracing.extend_trace_label(trace_label, "skim_hotel") self.network_los = network_los self.zone_system = network_los.zone_system def sample_skims(self, presample): - o = self.model_settings['TRIP_ORIGIN'] - d = self.model_settings['ALT_DEST_COL_NAME'] - p = self.model_settings['PRIMARY_DEST'] + o = self.model_settings["TRIP_ORIGIN"] + d = self.model_settings["ALT_DEST_COL_NAME"] + p = self.model_settings["PRIMARY_DEST"] if presample: assert not (self.zone_system == los.ONE_ZONE) - skim_dict = self.network_los.get_skim_dict('taz') + skim_dict = self.network_los.get_skim_dict("taz") else: skim_dict = self.network_los.get_default_skim_dict() skims = { "od_skims": skim_dict.wrap(o, d), "dp_skims": skim_dict.wrap(d, p), - - "odt_skims": skim_dict.wrap_3d(orig_key=o, dest_key=d, dim3_key='trip_period'), - "dot_skims": skim_dict.wrap_3d(orig_key=d, dest_key=o, dim3_key='trip_period'), - "dpt_skims": skim_dict.wrap_3d(orig_key=d, dest_key=p, dim3_key='trip_period'), - "pdt_skims": skim_dict.wrap_3d(orig_key=p, dest_key=d, dim3_key='trip_period'), + "odt_skims": skim_dict.wrap_3d( + orig_key=o, dest_key=d, dim3_key="trip_period" + ), + "dot_skims": skim_dict.wrap_3d( + orig_key=d, dest_key=o, dim3_key="trip_period" + ), + "dpt_skims": skim_dict.wrap_3d( + orig_key=d, dest_key=p, dim3_key="trip_period" + ), + "pdt_skims": skim_dict.wrap_3d( + orig_key=p, dest_key=d, dim3_key="trip_period" + ), } return skims def logsum_skims(self): - o = self.model_settings['TRIP_ORIGIN'] - d = self.model_settings['ALT_DEST_COL_NAME'] - p = self.model_settings['PRIMARY_DEST'] + o = self.model_settings["TRIP_ORIGIN"] + d = self.model_settings["ALT_DEST_COL_NAME"] + p = self.model_settings["PRIMARY_DEST"] skim_dict = self.network_los.get_default_skim_dict() skims = { - "odt_skims": skim_dict.wrap_3d(orig_key=o, dest_key=d, dim3_key='trip_period'), - "dot_skims": skim_dict.wrap_3d(orig_key=d, dest_key=o, dim3_key='trip_period'), - "dpt_skims": skim_dict.wrap_3d(orig_key=d, dest_key=p, dim3_key='trip_period'), - "pdt_skims": skim_dict.wrap_3d(orig_key=p, dest_key=d, dim3_key='trip_period'), + "odt_skims": skim_dict.wrap_3d( + orig_key=o, dest_key=d, dim3_key="trip_period" + ), + "dot_skims": skim_dict.wrap_3d( + orig_key=d, dest_key=o, dim3_key="trip_period" + ), + "dpt_skims": skim_dict.wrap_3d( + orig_key=d, dest_key=p, dim3_key="trip_period" + ), + "pdt_skims": skim_dict.wrap_3d( + orig_key=p, dest_key=d, dim3_key="trip_period" + ), "od_skims": skim_dict.wrap(o, d), "dp_skims": skim_dict.wrap(d, p), } @@ -847,36 +967,60 @@ def logsum_skims(self): # fixme - is this a lightweight object? tvpb = self.network_los.tvpb - tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=o, dest_key=d, - tod_key='trip_period', segment_key='demographic_segment', - trace_label=self.trace_label, tag='tvpb_logsum_odt') - tvpb_logsum_dot = tvpb.wrap_logsum(orig_key=d, dest_key=o, - tod_key='trip_period', segment_key='demographic_segment', - trace_label=self.trace_label, tag='tvpb_logsum_dot') - tvpb_logsum_dpt = tvpb.wrap_logsum(orig_key=d, dest_key=p, - tod_key='trip_period', segment_key='demographic_segment', - trace_label=self.trace_label, tag='tvpb_logsum_dpt') - tvpb_logsum_pdt = tvpb.wrap_logsum(orig_key=p, dest_key=d, - tod_key='trip_period', segment_key='demographic_segment', - trace_label=self.trace_label, tag='tvpb_logsum_pdt') - - skims.update({ - 'tvpb_logsum_odt': tvpb_logsum_odt, - 'tvpb_logsum_dot': tvpb_logsum_dot, - 'tvpb_logsum_dpt': tvpb_logsum_dpt, - 'tvpb_logsum_pdt': tvpb_logsum_pdt - }) + tvpb_logsum_odt = tvpb.wrap_logsum( + orig_key=o, + dest_key=d, + tod_key="trip_period", + segment_key="demographic_segment", + trace_label=self.trace_label, + tag="tvpb_logsum_odt", + ) + tvpb_logsum_dot = tvpb.wrap_logsum( + orig_key=d, + dest_key=o, + tod_key="trip_period", + segment_key="demographic_segment", + trace_label=self.trace_label, + tag="tvpb_logsum_dot", + ) + tvpb_logsum_dpt = tvpb.wrap_logsum( + orig_key=d, + dest_key=p, + tod_key="trip_period", + segment_key="demographic_segment", + trace_label=self.trace_label, + tag="tvpb_logsum_dpt", + ) + tvpb_logsum_pdt = tvpb.wrap_logsum( + orig_key=p, + dest_key=d, + tod_key="trip_period", + segment_key="demographic_segment", + trace_label=self.trace_label, + tag="tvpb_logsum_pdt", + ) + + skims.update( + { + "tvpb_logsum_odt": tvpb_logsum_odt, + "tvpb_logsum_dot": tvpb_logsum_dot, + "tvpb_logsum_dpt": tvpb_logsum_dpt, + "tvpb_logsum_pdt": tvpb_logsum_pdt, + } + ) return skims def run_trip_destination( - trips, - tours_merged, - estimator, - chunk_size, trace_hh_id, - trace_label, - fail_some_trips_for_testing=False): + trips, + tours_merged, + estimator, + chunk_size, + trace_hh_id, + trace_label, + fail_some_trips_for_testing=False, +): """ trip destination - main functionality separated from model step so it can be called iteratively @@ -900,32 +1044,35 @@ def run_trip_destination( """ - model_settings_file_name = 'trip_destination.yaml' + model_settings_file_name = "trip_destination.yaml" model_settings = config.read_model_settings(model_settings_file_name) - preprocessor_settings = model_settings.get('preprocessor', None) - logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS']) + preprocessor_settings = model_settings.get("preprocessor", None) + logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) - logsum_column_name = model_settings.get('DEST_CHOICE_LOGSUM_COLUMN_NAME') + logsum_column_name = model_settings.get("DEST_CHOICE_LOGSUM_COLUMN_NAME") want_logsums = logsum_column_name is not None - sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') - want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None + sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") + want_sample_table = ( + config.setting("want_dest_choice_sample_tables") + and sample_table_name is not None + ) - land_use = inject.get_table('land_use') - size_terms = inject.get_injectable('size_terms') - network_los = inject.get_injectable('network_los') + land_use = inject.get_table("land_use") + size_terms = inject.get_injectable("size_terms") + network_los = inject.get_injectable("network_los") trips = trips.sort_index() - trips['next_trip_id'] = np.roll(trips.index, -1) + trips["next_trip_id"] = np.roll(trips.index, -1) trips.next_trip_id = trips.next_trip_id.where(trips.trip_num < trips.trip_count, 0) # - initialize trip origin and destination to those of half-tour # (we will sequentially adjust intermediate trips origin and destination as we choose them) tour_destination = reindex(tours_merged.destination, trips.tour_id).astype(np.int64) tour_origin = reindex(tours_merged.origin, trips.tour_id).astype(np.int64) - trips['destination'] = np.where(trips.outbound, tour_destination, tour_origin) - trips['origin'] = np.where(trips.outbound, tour_origin, tour_destination) - trips['failed'] = False + trips["destination"] = np.where(trips.outbound, tour_destination, tour_origin) + trips["origin"] = np.where(trips.outbound, tour_origin, tour_destination) + trips["failed"] = False if estimator: # need to check or override non-intermediate trip destination @@ -933,30 +1080,38 @@ def run_trip_destination( # FIXME if not consistent, do we fail or override? (seems weird to override them to bad values?) # expect all the same trips - survey_trips = estimator.get_survey_table('trips').sort_index() + survey_trips = estimator.get_survey_table("trips").sort_index() assert survey_trips.index.equals(trips.index) - first = (survey_trips.trip_num == 1) - last = (survey_trips.trip_num == trips.trip_count) + first = survey_trips.trip_num == 1 + last = survey_trips.trip_num == trips.trip_count # expect survey's outbound first trip origin to be same as half tour origin - assert (survey_trips.origin[survey_trips.outbound & first] - == tour_origin[survey_trips.outbound & first]).all() + assert ( + survey_trips.origin[survey_trips.outbound & first] + == tour_origin[survey_trips.outbound & first] + ).all() # expect outbound last trip destination to be same as half tour destination - assert (survey_trips.destination[survey_trips.outbound & last] - == tour_destination[survey_trips.outbound & last]).all() + assert ( + survey_trips.destination[survey_trips.outbound & last] + == tour_destination[survey_trips.outbound & last] + ).all() # expect inbound first trip origin to be same as half tour destination - assert (survey_trips.origin[~survey_trips.outbound & first] - == tour_destination[~survey_trips.outbound & first]).all() + assert ( + survey_trips.origin[~survey_trips.outbound & first] + == tour_destination[~survey_trips.outbound & first] + ).all() # expect inbound last trip destination to be same as half tour origin - assert (survey_trips.destination[~survey_trips.outbound & last] - == tour_origin[~survey_trips.outbound & last]).all() + assert ( + survey_trips.destination[~survey_trips.outbound & last] + == tour_origin[~survey_trips.outbound & last] + ).all() # - filter tours_merged (AFTER copying destination and origin columns to trips) # tours_merged is used for logsums, we filter it here upfront to save space and time - tours_merged_cols = logsum_settings['TOURS_MERGED_CHOOSER_COLUMNS'] - redundant_cols = model_settings.get('REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS', []) + tours_merged_cols = logsum_settings["TOURS_MERGED_CHOOSER_COLUMNS"] + redundant_cols = model_settings.get("REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS", []) if redundant_cols: tours_merged_cols = [c for c in tours_merged_cols if c not in redundant_cols] @@ -966,7 +1121,7 @@ def run_trip_destination( skim_hotel = SkimHotel(model_settings, network_los, trace_label) # - size_terms and alternatives - alternatives = tour_destination_size_terms(land_use, size_terms, 'trip') + alternatives = tour_destination_size_terms(land_use, size_terms, "trip") # DataFrameMatrix alows us to treat dataframe as virtual a 2-D array, indexed by zone_id, purpose # e.g. size_terms.get(df.dest_zone_id, df.purpose) @@ -975,7 +1130,7 @@ def run_trip_destination( # don't need size terms in alternatives, just zone_id index alternatives = alternatives.drop(alternatives.columns, axis=1) - alternatives.index.name = model_settings['ALT_DEST_COL_NAME'] + alternatives.index.name = model_settings["ALT_DEST_COL_NAME"] sample_list = [] @@ -990,11 +1145,11 @@ def run_trip_destination( for trip_num in range(first_trip_num, last_trip_num + 1): nth_trips = trips[intermediate & (trips.trip_num == trip_num)] - nth_trace_label = tracing.extend_trace_label(trace_label, 'trip_num_%s' % trip_num) + nth_trace_label = tracing.extend_trace_label( + trace_label, "trip_num_%s" % trip_num + ) - locals_dict = { - 'network_los': network_los - } + locals_dict = {"network_los": network_los} locals_dict.update(config.get_model_constants(model_settings)) # - annotate nth_trips @@ -1003,13 +1158,14 @@ def run_trip_destination( df=nth_trips, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=nth_trace_label) + trace_label=nth_trace_label, + ) logger.info("Running %s with %d trips", nth_trace_label, nth_trips.shape[0]) # - choose destination for nth_trips, segmented by primary_purpose choices_list = [] - for primary_purpose, trips_segment in nth_trips.groupby('primary_purpose'): + for primary_purpose, trips_segment in nth_trips.groupby("primary_purpose"): choices, destination_sample = choose_trip_destination( primary_purpose, trips_segment, @@ -1018,10 +1174,15 @@ def run_trip_destination( model_settings, want_logsums, want_sample_table, - size_term_matrix, skim_hotel, + size_term_matrix, + skim_hotel, estimator, - chunk_size, trace_hh_id, - trace_label=tracing.extend_trace_label(nth_trace_label, primary_purpose)) + chunk_size, + trace_hh_id, + trace_label=tracing.extend_trace_label( + nth_trace_label, primary_purpose + ), + ) choices_list.append(choices) if want_sample_table: @@ -1036,31 +1197,41 @@ def run_trip_destination( failed_trip_ids = nth_trips.index.difference(destinations_df.index) if failed_trip_ids.any(): - logger.warning("%s sidelining %s trips without viable destination alternatives" % - (nth_trace_label, failed_trip_ids.shape[0])) + logger.warning( + "%s sidelining %s trips without viable destination alternatives" + % (nth_trace_label, failed_trip_ids.shape[0]) + ) next_trip_ids = nth_trips.next_trip_id.reindex(failed_trip_ids) - trips.loc[failed_trip_ids, 'failed'] = True - trips.loc[failed_trip_ids, 'destination'] = -1 - trips.loc[next_trip_ids, 'origin'] = trips.loc[failed_trip_ids].origin.values + trips.loc[failed_trip_ids, "failed"] = True + trips.loc[failed_trip_ids, "destination"] = -1 + trips.loc[next_trip_ids, "origin"] = trips.loc[ + failed_trip_ids + ].origin.values if len(destinations_df) == 0: assert failed_trip_ids.all() - logger.warning(f"all {len(nth_trips)} {primary_purpose} trip_num {trip_num} trips failed") + logger.warning( + f"all {len(nth_trips)} {primary_purpose} trip_num {trip_num} trips failed" + ) if len(destinations_df) > 0: # - assign choices to this trip's destinations # if estimator, then the choices will already have been overridden by trip_destination_simulate # because we need to overwrite choices before any failed choices are suppressed - assign_in_place(trips, destinations_df.choice.to_frame('destination')) + assign_in_place(trips, destinations_df.choice.to_frame("destination")) if want_logsums: - assert 'logsum' in destinations_df.columns - assign_in_place(trips, destinations_df.logsum.to_frame(logsum_column_name)) + assert "logsum" in destinations_df.columns + assign_in_place( + trips, destinations_df.logsum.to_frame(logsum_column_name) + ) # - assign choice to next trip's origin - destinations_df.index = nth_trips.next_trip_id.reindex(destinations_df.index) - assign_in_place(trips, destinations_df.choice.to_frame('origin')) + destinations_df.index = nth_trips.next_trip_id.reindex( + destinations_df.index + ) + assign_in_place(trips, destinations_df.choice.to_frame("origin")) - del trips['next_trip_id'] + del trips["next_trip_id"] if len(sample_list) > 0: save_sample_df = pd.concat(sample_list) @@ -1072,10 +1243,7 @@ def run_trip_destination( @inject.step() -def trip_destination( - trips, - tours_merged, - chunk_size, trace_hh_id): +def trip_destination(trips, tours_merged, chunk_size, trace_hh_id): """ Choose a destination for all 'intermediate' trips based on trip purpose. @@ -1084,26 +1252,32 @@ def trip_destination( """ - trace_label = 'trip_destination' + trace_label = "trip_destination" - model_settings_file_name = 'trip_destination.yaml' + model_settings_file_name = "trip_destination.yaml" model_settings = config.read_model_settings(model_settings_file_name) - CLEANUP = model_settings.get('CLEANUP', True) - fail_some_trips_for_testing = model_settings.get('fail_some_trips_for_testing', False) + CLEANUP = model_settings.get("CLEANUP", True) + fail_some_trips_for_testing = model_settings.get( + "fail_some_trips_for_testing", False + ) trips_df = trips.to_frame() tours_merged_df = tours_merged.to_frame() - estimator = estimation.manager.begin_estimation('trip_destination') + estimator = estimation.manager.begin_estimation("trip_destination") if estimator: estimator.write_coefficients(model_settings=model_settings) # estimator.write_spec(model_settings, tag='SAMPLE_SPEC') - estimator.write_spec(model_settings, tag='SPEC') + estimator.write_spec(model_settings, tag="SPEC") estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"]) - estimator.write_table(inject.get_injectable('size_terms'), 'size_terms', append=False) - estimator.write_table(inject.get_table('land_use').to_frame(), 'landuse', append=False) + estimator.write_table( + inject.get_injectable("size_terms"), "size_terms", append=False + ) + estimator.write_table( + inject.get_table("land_use").to_frame(), "landuse", append=False + ) estimator.write_model_settings(model_settings, model_settings_file_name) logger.info("Running %s with %d trips", trace_label, trips_df.shape[0]) @@ -1115,25 +1289,34 @@ def trip_destination( chunk_size=chunk_size, trace_hh_id=trace_hh_id, trace_label=trace_label, - fail_some_trips_for_testing=fail_some_trips_for_testing) + fail_some_trips_for_testing=fail_some_trips_for_testing, + ) # testing feature t0 make sure at least one trip fails so trip_purpose_and_destination model is run - if config.setting('testing_fail_trip_destination', False) and not trips_df.failed.any(): + if ( + config.setting("testing_fail_trip_destination", False) + and not trips_df.failed.any() + ): if (trips_df.trip_num < trips_df.trip_count).sum() == 0: - raise RuntimeError(f"can't honor 'testing_fail_trip_destination' setting because no intermediate trips") + raise RuntimeError( + f"can't honor 'testing_fail_trip_destination' setting because no intermediate trips" + ) fail_o = trips_df[trips_df.trip_num < trips_df.trip_count].origin.max() - trips_df.failed = (trips_df.origin == fail_o) & \ - (trips_df.trip_num < trips_df.trip_count) + trips_df.failed = (trips_df.origin == fail_o) & ( + trips_df.trip_num < trips_df.trip_count + ) if trips_df.failed.any(): logger.warning("%s %s failed trips", trace_label, trips_df.failed.sum()) - if inject.get_injectable('pipeline_file_prefix', None): + if inject.get_injectable("pipeline_file_prefix", None): file_name = f"{trace_label}_failed_trips_{inject.get_injectable('pipeline_file_prefix')}" else: file_name = f"{trace_label}_failed_trips" logger.info("writing failed trips to %s", file_name) - tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False) + tracing.write_csv( + trips_df[trips_df.failed], file_name=file_name, transpose=False + ) if estimator: estimator.end_estimation() @@ -1143,35 +1326,42 @@ def trip_destination( if CLEANUP: if trips_df.failed.any(): - flag_failed_trip_leg_mates(trips_df, 'failed') + flag_failed_trip_leg_mates(trips_df, "failed") if save_sample_df is not None: - save_sample_df.drop(trips_df.index[trips_df.failed], level='trip_id', inplace=True) + save_sample_df.drop( + trips_df.index[trips_df.failed], level="trip_id", inplace=True + ) trips_df = cleanup_failed_trips(trips_df) - trips_df.drop(columns='failed', inplace=True, errors='ignore') + trips_df.drop(columns="failed", inplace=True, errors="ignore") pipeline.replace_table("trips", trips_df) if trace_hh_id: - tracing.trace_df(trips_df, - label=trace_label, - slicer='trip_id', - index_label='trip_id', - warn_if_empty=True) + tracing.trace_df( + trips_df, + label=trace_label, + slicer="trip_id", + index_label="trip_id", + warn_if_empty=True, + ) if save_sample_df is not None: # might be none if want_sample_table but there are no intermediate trips # expect samples only for intermediate trip destinations - assert len(save_sample_df.index.get_level_values(0).unique()) == \ - len(trips_df[trips_df.trip_num < trips_df.trip_count]) + assert len(save_sample_df.index.get_level_values(0).unique()) == len( + trips_df[trips_df.trip_num < trips_df.trip_count] + ) - sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') + sample_table_name = model_settings.get("DEST_CHOICE_SAMPLE_TABLE_NAME") assert sample_table_name is not None - logger.info("adding %s samples to %s" % (len(save_sample_df), sample_table_name)) + logger.info( + "adding %s samples to %s" % (len(save_sample_df), sample_table_name) + ) # lest they try to put tour samples into the same table if pipeline.is_table(sample_table_name): diff --git a/activitysim/abm/models/trip_matrices.py b/activitysim/abm/models/trip_matrices.py index 2df091c012..0f33e69e69 100644 --- a/activitysim/abm/models/trip_matrices.py +++ b/activitysim/abm/models/trip_matrices.py @@ -3,15 +3,11 @@ import logging +import numpy as np import openmatrix as omx import pandas as pd -import numpy as np -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import expressions -from activitysim.core import los +from activitysim.core import config, expressions, inject, los, pipeline logger = logging.getLogger(__name__) @@ -36,117 +32,154 @@ def write_trip_matrices(network_los): """ - trips = inject.get_table('trips', None) + trips = inject.get_table("trips", None) if trips is None: # this step is a NOP if there is no trips table # this might legitimately happen if they comment out some steps to debug but still want write_tables # this saves them the hassle of remembering to comment out this step - logger.warning(f"write_trip_matrices returning empty-handed because there is no trips table") + logger.warning( + f"write_trip_matrices returning empty-handed because there is no trips table" + ) return - model_settings = config.read_model_settings('write_trip_matrices.yaml') + model_settings = config.read_model_settings("write_trip_matrices.yaml") trips_df = annotate_trips(trips, network_los, model_settings) - if bool(model_settings.get('SAVE_TRIPS_TABLE')): - pipeline.replace_table('trips', trips_df) + if bool(model_settings.get("SAVE_TRIPS_TABLE")): + pipeline.replace_table("trips", trips_df) - if 'parking_location' in config.setting('models'): - parking_settings = config.read_model_settings('parking_location_choice.yaml') - parking_taz_col_name = parking_settings['ALT_DEST_COL_NAME'] + if "parking_location" in config.setting("models"): + parking_settings = config.read_model_settings("parking_location_choice.yaml") + parking_taz_col_name = parking_settings["ALT_DEST_COL_NAME"] if parking_taz_col_name in trips_df: - trips_df.loc[trips_df[parking_taz_col_name] > 0, 'destination'] = trips_df[parking_taz_col_name] + trips_df.loc[trips_df[parking_taz_col_name] > 0, "destination"] = trips_df[ + parking_taz_col_name + ] # Also need address the return trip # write matrices by zone system type if network_los.zone_system == los.ONE_ZONE: # taz trips written to taz matrices - logger.info('aggregating trips one zone...') - aggregate_trips = trips_df.groupby(['origin', 'destination'], sort=False).sum() + logger.info("aggregating trips one zone...") + aggregate_trips = trips_df.groupby(["origin", "destination"], sort=False).sum() # use the average household weight for all trips in the origin destination pair - hh_weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL') - aggregate_weight = trips_df[['origin', 'destination', hh_weight_col]].groupby(['origin', 'destination'], - sort=False).mean() + hh_weight_col = model_settings.get("HH_EXPANSION_WEIGHT_COL") + aggregate_weight = ( + trips_df[["origin", "destination", hh_weight_col]] + .groupby(["origin", "destination"], sort=False) + .mean() + ) aggregate_trips[hh_weight_col] = aggregate_weight[hh_weight_col] - orig_vals = aggregate_trips.index.get_level_values('origin') - dest_vals = aggregate_trips.index.get_level_values('destination') + orig_vals = aggregate_trips.index.get_level_values("origin") + dest_vals = aggregate_trips.index.get_level_values("destination") # use the land use table for the set of possible tazs - zone_index = pipeline.get_table('land_use').index + zone_index = pipeline.get_table("land_use").index assert all(zone in zone_index for zone in orig_vals) assert all(zone in zone_index for zone in dest_vals) _, orig_index = zone_index.reindex(orig_vals) _, dest_index = zone_index.reindex(dest_vals) - write_matrices(aggregate_trips, zone_index, orig_index, dest_index, model_settings) + write_matrices( + aggregate_trips, zone_index, orig_index, dest_index, model_settings + ) elif network_los.zone_system == los.TWO_ZONE: # maz trips written to taz matrices - logger.info('aggregating trips two zone...') - trips_df["otaz"] = pipeline.get_table('land_use').reindex(trips_df['origin']).TAZ.tolist() - trips_df["dtaz"] = pipeline.get_table('land_use').reindex(trips_df['destination']).TAZ.tolist() - aggregate_trips = trips_df.groupby(['otaz', 'dtaz'], sort=False).sum() + logger.info("aggregating trips two zone...") + trips_df["otaz"] = ( + pipeline.get_table("land_use").reindex(trips_df["origin"]).TAZ.tolist() + ) + trips_df["dtaz"] = ( + pipeline.get_table("land_use").reindex(trips_df["destination"]).TAZ.tolist() + ) + aggregate_trips = trips_df.groupby(["otaz", "dtaz"], sort=False).sum() # use the average household weight for all trips in the origin destination pair - hh_weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL') - aggregate_weight = trips_df[['otaz', 'dtaz', hh_weight_col]].groupby(['otaz', 'dtaz'], sort=False).mean() + hh_weight_col = model_settings.get("HH_EXPANSION_WEIGHT_COL") + aggregate_weight = ( + trips_df[["otaz", "dtaz", hh_weight_col]] + .groupby(["otaz", "dtaz"], sort=False) + .mean() + ) aggregate_trips[hh_weight_col] = aggregate_weight[hh_weight_col] - orig_vals = aggregate_trips.index.get_level_values('otaz') - dest_vals = aggregate_trips.index.get_level_values('dtaz') + orig_vals = aggregate_trips.index.get_level_values("otaz") + dest_vals = aggregate_trips.index.get_level_values("dtaz") - zone_index = pd.Index(network_los.get_tazs(), name='TAZ') + zone_index = pd.Index(network_los.get_tazs(), name="TAZ") assert all(zone in zone_index for zone in orig_vals) assert all(zone in zone_index for zone in dest_vals) _, orig_index = zone_index.reindex(orig_vals) _, dest_index = zone_index.reindex(dest_vals) - write_matrices(aggregate_trips, zone_index, orig_index, dest_index, model_settings) + write_matrices( + aggregate_trips, zone_index, orig_index, dest_index, model_settings + ) - elif network_los.zone_system == los.THREE_ZONE: # maz trips written to taz and tap matrices + elif ( + network_los.zone_system == los.THREE_ZONE + ): # maz trips written to taz and tap matrices - logger.info('aggregating trips three zone taz...') - trips_df["otaz"] = pipeline.get_table('land_use').reindex(trips_df['origin']).TAZ.tolist() - trips_df["dtaz"] = pipeline.get_table('land_use').reindex(trips_df['destination']).TAZ.tolist() - aggregate_trips = trips_df.groupby(['otaz', 'dtaz'], sort=False).sum() + logger.info("aggregating trips three zone taz...") + trips_df["otaz"] = ( + pipeline.get_table("land_use").reindex(trips_df["origin"]).TAZ.tolist() + ) + trips_df["dtaz"] = ( + pipeline.get_table("land_use").reindex(trips_df["destination"]).TAZ.tolist() + ) + aggregate_trips = trips_df.groupby(["otaz", "dtaz"], sort=False).sum() # use the average household weight for all trips in the origin destination pair - hh_weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL') - aggregate_weight = trips_df[['otaz', 'dtaz', hh_weight_col]].groupby(['otaz', 'dtaz'], sort=False).mean() + hh_weight_col = model_settings.get("HH_EXPANSION_WEIGHT_COL") + aggregate_weight = ( + trips_df[["otaz", "dtaz", hh_weight_col]] + .groupby(["otaz", "dtaz"], sort=False) + .mean() + ) aggregate_trips[hh_weight_col] = aggregate_weight[hh_weight_col] - orig_vals = aggregate_trips.index.get_level_values('otaz') - dest_vals = aggregate_trips.index.get_level_values('dtaz') + orig_vals = aggregate_trips.index.get_level_values("otaz") + dest_vals = aggregate_trips.index.get_level_values("dtaz") - zone_index = pd.Index(network_los.get_tazs(), name='TAZ') + zone_index = pd.Index(network_los.get_tazs(), name="TAZ") assert all(zone in zone_index for zone in orig_vals) assert all(zone in zone_index for zone in dest_vals) _, orig_index = zone_index.reindex(orig_vals) _, dest_index = zone_index.reindex(dest_vals) - write_matrices(aggregate_trips, zone_index, orig_index, dest_index, model_settings) + write_matrices( + aggregate_trips, zone_index, orig_index, dest_index, model_settings + ) - logger.info('aggregating trips three zone tap...') - aggregate_trips = trips_df.groupby(['btap', 'atap'], sort=False).sum() + logger.info("aggregating trips three zone tap...") + aggregate_trips = trips_df.groupby(["btap", "atap"], sort=False).sum() # use the average household weight for all trips in the origin destination pair - hh_weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL') - aggregate_weight = trips_df[['btap', 'atap', hh_weight_col]].groupby(['btap', 'atap'], sort=False).mean() + hh_weight_col = model_settings.get("HH_EXPANSION_WEIGHT_COL") + aggregate_weight = ( + trips_df[["btap", "atap", hh_weight_col]] + .groupby(["btap", "atap"], sort=False) + .mean() + ) aggregate_trips[hh_weight_col] = aggregate_weight[hh_weight_col] - orig_vals = aggregate_trips.index.get_level_values('btap') - dest_vals = aggregate_trips.index.get_level_values('atap') + orig_vals = aggregate_trips.index.get_level_values("btap") + dest_vals = aggregate_trips.index.get_level_values("atap") - zone_index = pd.Index(network_los.get_taps(), name='TAP') + zone_index = pd.Index(network_los.get_taps(), name="TAP") assert all(zone in zone_index for zone in orig_vals) assert all(zone in zone_index for zone in dest_vals) _, orig_index = zone_index.reindex(orig_vals) _, dest_index = zone_index.reindex(dest_vals) - write_matrices(aggregate_trips, zone_index, orig_index, dest_index, model_settings, True) + write_matrices( + aggregate_trips, zone_index, orig_index, dest_index, model_settings, True + ) def annotate_trips(trips, network_los, model_settings): @@ -161,19 +194,18 @@ def annotate_trips(trips, network_los, model_settings): trips_df = trips.to_frame() - trace_label = 'trip_matrices' + trace_label = "trip_matrices" skim_dict = network_los.get_default_skim_dict() # setup skim keys - if 'trip_period' not in trips_df: - trips_df['trip_period'] = network_los.skim_time_period_label(trips_df.depart) - od_skim_wrapper = skim_dict.wrap('origin', 'destination') - odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key='origin', dest_key='destination', dim3_key='trip_period') - skims = { - 'od_skims': od_skim_wrapper, - "odt_skims": odt_skim_stack_wrapper - } + if "trip_period" not in trips_df: + trips_df["trip_period"] = network_los.skim_time_period_label(trips_df.depart) + od_skim_wrapper = skim_dict.wrap("origin", "destination") + odt_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key="origin", dest_key="destination", dim3_key="trip_period" + ) + skims = {"od_skims": od_skim_wrapper, "odt_skims": odt_skim_stack_wrapper} locals_dict = {} constants = config.get_model_constants(model_settings) @@ -181,22 +213,24 @@ def annotate_trips(trips, network_los, model_settings): locals_dict.update(constants) expressions.annotate_preprocessors( - trips_df, locals_dict, skims, - model_settings, trace_label) + trips_df, locals_dict, skims, model_settings, trace_label + ) # Data will be expanded by an expansion weight column from # the households pipeline table, if specified in the model settings. - hh_weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL') + hh_weight_col = model_settings.get("HH_EXPANSION_WEIGHT_COL") if hh_weight_col and hh_weight_col not in trips_df: logger.info("adding '%s' from households to trips table" % hh_weight_col) - household_weights = pipeline.get_table('households')[hh_weight_col] + household_weights = pipeline.get_table("households")[hh_weight_col] trips_df[hh_weight_col] = trips_df.household_id.map(household_weights) return trips_df -def write_matrices(aggregate_trips, zone_index, orig_index, dest_index, model_settings, is_tap=False): +def write_matrices( + aggregate_trips, zone_index, orig_index, dest_index, model_settings, is_tap=False +): """ Write aggregated trips to OMX format. @@ -209,42 +243,48 @@ def write_matrices(aggregate_trips, zone_index, orig_index, dest_index, model_se but the table 'data_field's must be summable types: ints, floats, bools. """ - matrix_settings = model_settings.get('MATRICES') + matrix_settings = model_settings.get("MATRICES") if not matrix_settings: - logger.error('Missing MATRICES setting in write_trip_matrices.yaml') + logger.error("Missing MATRICES setting in write_trip_matrices.yaml") for matrix in matrix_settings: - matrix_is_tap = matrix.get('is_tap', False) + matrix_is_tap = matrix.get("is_tap", False) if matrix_is_tap == is_tap: # only write tap matrices to tap matrix files - filename = matrix.get('file_name') + filename = matrix.get("file_name") filepath = config.output_file_path(filename) - logger.info('opening %s' % filepath) - file = omx.open_file(filepath, 'w') # possibly overwrite existing file - table_settings = matrix.get('tables') + logger.info("opening %s" % filepath) + file = omx.open_file(filepath, "w") # possibly overwrite existing file + table_settings = matrix.get("tables") for table in table_settings: - table_name = table.get('name') - col = table.get('data_field') + table_name = table.get("name") + col = table.get("data_field") if col not in aggregate_trips: - logger.error(f'missing {col} column in aggregate_trips DataFrame') + logger.error(f"missing {col} column in aggregate_trips DataFrame") return - hh_weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL') + hh_weight_col = model_settings.get("HH_EXPANSION_WEIGHT_COL") if hh_weight_col: - aggregate_trips[col] = aggregate_trips[col] / aggregate_trips[hh_weight_col] + aggregate_trips[col] = ( + aggregate_trips[col] / aggregate_trips[hh_weight_col] + ) data = np.zeros((len(zone_index), len(zone_index))) data[orig_index, dest_index] = aggregate_trips[col] - logger.debug('writing %s sum %0.2f' % (table_name, aggregate_trips[col].sum())) + logger.debug( + "writing %s sum %0.2f" % (table_name, aggregate_trips[col].sum()) + ) file[table_name] = data # write to file # include the index-to-zone map in the file - logger.info('adding %s mapping for %s zones to %s' % - (zone_index.name, zone_index.size, filename)) + logger.info( + "adding %s mapping for %s zones to %s" + % (zone_index.name, zone_index.size, filename) + ) file.create_mapping(zone_index.name, zone_index.to_numpy()) - logger.info('closing %s' % filepath) + logger.info("closing %s" % filepath) file.close() diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py index af009b9795..8f830df613 100644 --- a/activitysim/abm/models/trip_mode_choice.py +++ b/activitysim/abm/models/trip_mode_choice.py @@ -3,34 +3,29 @@ import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import expressions - -from activitysim.core import assign -from activitysim.core import los - +from activitysim.core import ( + assign, + config, + expressions, + inject, + los, + pipeline, + simulate, + tracing, +) from activitysim.core.util import assign_in_place -from .util.mode import mode_choice_simulate from .util import estimation - +from .util.mode import mode_choice_simulate logger = logging.getLogger(__name__) @inject.step() -def trip_mode_choice( - trips, - tours_merged, - network_los, - chunk_size, trace_hh_id): +def trip_mode_choice(trips, tours_merged, network_los, chunk_size, trace_hh_id): """ Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip. @@ -39,52 +34,51 @@ def trip_mode_choice( Adds trip_mode column to trip table """ - trace_label = 'trip_mode_choice' - model_settings_file_name = 'trip_mode_choice.yaml' + trace_label = "trip_mode_choice" + model_settings_file_name = "trip_mode_choice.yaml" model_settings = config.read_model_settings(model_settings_file_name) - logsum_column_name = model_settings.get('MODE_CHOICE_LOGSUM_COLUMN_NAME') - mode_column_name = 'trip_mode' + logsum_column_name = model_settings.get("MODE_CHOICE_LOGSUM_COLUMN_NAME") + mode_column_name = "trip_mode" trips_df = trips.to_frame() logger.info("Running %s with %d trips", trace_label, trips_df.shape[0]) tours_merged = tours_merged.to_frame() - tours_merged = tours_merged[model_settings['TOURS_MERGED_CHOOSER_COLUMNS']] + tours_merged = tours_merged[model_settings["TOURS_MERGED_CHOOSER_COLUMNS"]] - tracing.print_summary('primary_purpose', - trips_df.primary_purpose, value_counts=True) + tracing.print_summary( + "primary_purpose", trips_df.primary_purpose, value_counts=True + ) # - trips_merged - merge trips and tours_merged trips_merged = pd.merge( - trips_df, - tours_merged, - left_on='tour_id', - right_index=True, - how="left") + trips_df, tours_merged, left_on="tour_id", right_index=True, how="left" + ) assert trips_merged.index.equals(trips.index) # setup skim keys - assert ('trip_period' not in trips_merged) - trips_merged['trip_period'] = network_los.skim_time_period_label(trips_merged.depart) + assert "trip_period" not in trips_merged + trips_merged["trip_period"] = network_los.skim_time_period_label( + trips_merged.depart + ) - orig_col = 'origin' - dest_col = 'destination' + orig_col = "origin" + dest_col = "destination" constants = {} constants.update(config.get_model_constants(model_settings)) - constants.update({ - 'ORIGIN': orig_col, - 'DESTINATION': dest_col - }) + constants.update({"ORIGIN": orig_col, "DESTINATION": dest_col}) skim_dict = network_los.get_default_skim_dict() - odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col, dest_key=dest_col, - dim3_key='trip_period') - dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col, dest_key=orig_col, - dim3_key='trip_period') - od_skim_wrapper = skim_dict.wrap('origin', 'destination') + odt_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col, dest_key=dest_col, dim3_key="trip_period" + ) + dot_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col, dest_key=orig_col, dim3_key="trip_period" + ) + od_skim_wrapper = skim_dict.wrap("origin", "destination") skims = { "odt_skims": odt_skim_stack_wrapper, @@ -96,52 +90,65 @@ def trip_mode_choice( # fixme - is this a lightweight object? tvpb = network_los.tvpb - tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col, dest_key=dest_col, - tod_key='trip_period', segment_key='demographic_segment', - cache_choices=True, - trace_label=trace_label, tag='tvpb_logsum_odt') - skims.update({ - 'tvpb_logsum_odt': tvpb_logsum_odt, - # 'tvpb_logsum_dot': tvpb_logsum_dot - }) + tvpb_logsum_odt = tvpb.wrap_logsum( + orig_key=orig_col, + dest_key=dest_col, + tod_key="trip_period", + segment_key="demographic_segment", + cache_choices=True, + trace_label=trace_label, + tag="tvpb_logsum_odt", + ) + skims.update( + { + "tvpb_logsum_odt": tvpb_logsum_odt, + # 'tvpb_logsum_dot': tvpb_logsum_dot + } + ) # TVPB constants can appear in expressions - constants.update(network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS')) + constants.update( + network_los.setting("TVPB_SETTINGS.tour_mode_choice.CONSTANTS") + ) - estimator = estimation.manager.begin_estimation('trip_mode_choice') + estimator = estimation.manager.begin_estimation("trip_mode_choice") if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) estimator.write_spec(model_settings) estimator.write_model_settings(model_settings, model_settings_file_name) - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) nest_spec = config.get_logit_model_settings(model_settings) choices_list = [] - for primary_purpose, trips_segment in trips_merged.groupby('primary_purpose'): + for primary_purpose, trips_segment in trips_merged.groupby("primary_purpose"): segment_trace_label = tracing.extend_trace_label(trace_label, primary_purpose) - logger.info("trip_mode_choice tour_type '%s' (%s trips)" % - (primary_purpose, len(trips_segment.index), )) + logger.info( + "trip_mode_choice tour_type '%s' (%s trips)" + % (primary_purpose, len(trips_segment.index),) + ) # name index so tracing knows how to slice - assert trips_segment.index.name == 'trip_id' + assert trips_segment.index.name == "trip_id" if network_los.zone_system == los.THREE_ZONE: tvpb_logsum_odt.extend_trace_label(primary_purpose) # tvpb_logsum_dot.extend_trace_label(primary_purpose) - coefficients = simulate.get_segment_coefficients(model_settings, primary_purpose) + coefficients = simulate.get_segment_coefficients( + model_settings, primary_purpose + ) locals_dict = {} locals_dict.update(constants) locals_dict.update(coefficients) expressions.annotate_preprocessors( - trips_segment, locals_dict, skims, - model_settings, segment_trace_label) + trips_segment, locals_dict, skims, model_settings, segment_trace_label + ) if estimator: # write choosers after annotation @@ -152,31 +159,38 @@ def trip_mode_choice( choices = mode_choice_simulate( choosers=trips_segment, spec=simulate.eval_coefficients(model_spec, coefficients, estimator), - nest_spec=simulate.eval_nest_coefficients(nest_spec, coefficients, segment_trace_label), + nest_spec=simulate.eval_nest_coefficients( + nest_spec, coefficients, segment_trace_label + ), skims=skims, locals_d=locals_dict, chunk_size=chunk_size, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, trace_label=segment_trace_label, - trace_choice_name='trip_mode_choice', - estimator=estimator) + trace_choice_name="trip_mode_choice", + estimator=estimator, + ) if trace_hh_id: # trace the coefficients - tracing.trace_df(pd.Series(locals_dict), - label=tracing.extend_trace_label(segment_trace_label, 'constants'), - transpose=False, - slicer='NONE') + tracing.trace_df( + pd.Series(locals_dict), + label=tracing.extend_trace_label(segment_trace_label, "constants"), + transpose=False, + slicer="NONE", + ) # so we can trace with annotations assign_in_place(trips_segment, choices) - tracing.trace_df(trips_segment, - label=tracing.extend_trace_label(segment_trace_label, 'trip_mode'), - slicer='tour_id', - index_label='tour_id', - warn_if_empty=True) + tracing.trace_df( + trips_segment, + label=tracing.extend_trace_label(segment_trace_label, "trip_mode"), + slicer="tour_id", + index_label="tour_id", + warn_if_empty=True, + ) choices_list.append(choices) @@ -185,7 +199,7 @@ def trip_mode_choice( # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: - tvpb_mode_path_types = model_settings.get('tvpb_mode_path_types') + tvpb_mode_path_types = model_settings.get("tvpb_mode_path_types") for mode, path_type in tvpb_mode_path_types.items(): skim_cache = tvpb_logsum_odt.cache[path_type] @@ -193,12 +207,18 @@ def trip_mode_choice( for c in skim_cache: dest_col = c if dest_col not in choices_df: - choices_df[dest_col] = np.nan if pd.api.types.is_numeric_dtype(skim_cache[c]) else '' - choices_df[dest_col].where(choices_df[mode_column_name] != mode, skim_cache[c], inplace=True) + choices_df[dest_col] = ( + np.nan if pd.api.types.is_numeric_dtype(skim_cache[c]) else "" + ) + choices_df[dest_col].where( + choices_df[mode_column_name] != mode, skim_cache[c], inplace=True + ) if estimator: estimator.write_choices(choices_df.trip_mode) - choices_df.trip_mode = estimator.get_survey_values(choices_df.trip_mode, 'trips', 'trip_mode') + choices_df.trip_mode = estimator.get_survey_values( + choices_df.trip_mode, "trips", "trip_mode" + ) estimator.write_override_choices(choices_df.trip_mode) estimator.end_estimation() @@ -206,19 +226,21 @@ def trip_mode_choice( trips_df = trips.to_frame() assign_in_place(trips_df, choices_df) - tracing.print_summary('trip_modes', - trips_merged.tour_mode, value_counts=True) + tracing.print_summary("trip_modes", trips_merged.tour_mode, value_counts=True) - tracing.print_summary('trip_mode_choice choices', - trips_df[mode_column_name], value_counts=True) + tracing.print_summary( + "trip_mode_choice choices", trips_df[mode_column_name], value_counts=True + ) assert not trips_df[mode_column_name].isnull().any() pipeline.replace_table("trips", trips_df) if trace_hh_id: - tracing.trace_df(trips_df, - label=tracing.extend_trace_label(trace_label, 'trip_mode'), - slicer='trip_id', - index_label='trip_id', - warn_if_empty=True) + tracing.trace_df( + trips_df, + label=tracing.extend_trace_label(trace_label, "trip_mode"), + slicer="trip_id", + index_label="trip_id", + warn_if_empty=True, + ) diff --git a/activitysim/abm/models/trip_purpose.py b/activitysim/abm/models/trip_purpose.py index 78e3a2b873..938da1eb1f 100644 --- a/activitysim/abm/models/trip_purpose.py +++ b/activitysim/abm/models/trip_purpose.py @@ -5,30 +5,33 @@ import numpy as np import pandas as pd -from activitysim.core import logit -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import tracing -from activitysim.core import chunk -from activitysim.core import pipeline -from activitysim.core import expressions -from activitysim.core import simulate +from activitysim.core import ( + chunk, + config, + expressions, + inject, + logit, + pipeline, + simulate, + tracing, +) from .util import estimation logger = logging.getLogger(__name__) -PROBS_JOIN_COLUMNS = ['primary_purpose', 'outbound', 'person_type'] +PROBS_JOIN_COLUMNS = ["primary_purpose", "outbound", "person_type"] def map_coefficients(spec, coefficients): if isinstance(coefficients, pd.DataFrame): - assert ('value' in coefficients.columns) - coefficients = coefficients['value'].to_dict() + assert "value" in coefficients.columns + coefficients = coefficients["value"].to_dict() - assert isinstance(coefficients, dict), \ - "map_coefficients doesn't grok type of coefficients: %s" % (type(coefficients)) + assert isinstance( + coefficients, dict + ), "map_coefficients doesn't grok type of coefficients: %s" % (type(coefficients)) for c in spec.columns: if c == simulate.SPEC_LABEL_NAME: @@ -40,7 +43,9 @@ def map_coefficients(spec, coefficients): return spec -def choose_intermediate_trip_purpose(trips, probs_spec, estimator, trace_hh_id, trace_label): +def choose_intermediate_trip_purpose( + trips, probs_spec, estimator, trace_hh_id, trace_label +): """ chose purpose for intermediate trips based on probs_spec which assigns relative weights (summing to 1) to the possible purpose choices @@ -50,7 +55,7 @@ def choose_intermediate_trip_purpose(trips, probs_spec, estimator, trace_hh_id, purpose: pandas.Series of purpose (str) indexed by trip_id """ - non_purpose_cols = PROBS_JOIN_COLUMNS + ['depart_range_start', 'depart_range_end'] + non_purpose_cols = PROBS_JOIN_COLUMNS + ["depart_range_start", "depart_range_end"] purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols] num_trips = len(trips.index) @@ -58,36 +63,64 @@ def choose_intermediate_trip_purpose(trips, probs_spec, estimator, trace_hh_id, # probs should sum to 1 across rows sum_probs = probs_spec[purpose_cols].sum(axis=1) - probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0) + probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div( + sum_probs, axis=0 + ) # left join trips to probs (there may be multiple rows per trip for multiple depart ranges) - choosers = pd.merge(trips.reset_index(), probs_spec, on=PROBS_JOIN_COLUMNS, - how='left').set_index('trip_id') - chunk.log_df(trace_label, 'choosers', choosers) + choosers = pd.merge( + trips.reset_index(), probs_spec, on=PROBS_JOIN_COLUMNS, how="left" + ).set_index("trip_id") + chunk.log_df(trace_label, "choosers", choosers) # select the matching depart range (this should result on in exactly one chooser row per trip) - chooser_probs = \ - (choosers.start >= choosers['depart_range_start']) & (choosers.start <= choosers['depart_range_end']) + chooser_probs = (choosers.start >= choosers["depart_range_start"]) & ( + choosers.start <= choosers["depart_range_end"] + ) # if we failed to match a row in probs_spec if chooser_probs.sum() < num_trips: # this can happen if the spec doesn't have probs for the trips matching a trip's probs_join_cols - missing_trip_ids = trips.index[~trips.index.isin(choosers.index[chooser_probs])].values + missing_trip_ids = trips.index[ + ~trips.index.isin(choosers.index[chooser_probs]) + ].values unmatched_choosers = choosers[choosers.index.isin(missing_trip_ids)] - unmatched_choosers = unmatched_choosers[['person_id', 'start'] + non_purpose_cols] + unmatched_choosers = unmatched_choosers[ + ["person_id", "start"] + non_purpose_cols + ] # join to persons for better diagnostics - persons = inject.get_table('persons').to_frame() - persons_cols = ['age', 'is_worker', 'is_student', 'is_gradeschool', 'is_highschool', 'is_university'] - unmatched_choosers = pd.merge(unmatched_choosers, persons[persons_cols], - left_on='person_id', right_index=True, how='left') - - file_name = '%s.UNMATCHED_PROBS' % trace_label - logger.error("%s %s of %s intermediate trips could not be matched to probs based on join columns %s" % - (trace_label, len(unmatched_choosers), len(choosers), PROBS_JOIN_COLUMNS)) - logger.info("Writing %s unmatched choosers to %s" % (len(unmatched_choosers), file_name,)) + persons = inject.get_table("persons").to_frame() + persons_cols = [ + "age", + "is_worker", + "is_student", + "is_gradeschool", + "is_highschool", + "is_university", + ] + unmatched_choosers = pd.merge( + unmatched_choosers, + persons[persons_cols], + left_on="person_id", + right_index=True, + how="left", + ) + + file_name = "%s.UNMATCHED_PROBS" % trace_label + logger.error( + "%s %s of %s intermediate trips could not be matched to probs based on join columns %s" + % (trace_label, len(unmatched_choosers), len(choosers), PROBS_JOIN_COLUMNS) + ) + logger.info( + "Writing %s unmatched choosers to %s" + % (len(unmatched_choosers), file_name,) + ) tracing.write_csv(unmatched_choosers, file_name=file_name, transpose=False) - raise RuntimeError("Some trips could not be matched to probs based on join columns %s." % PROBS_JOIN_COLUMNS) + raise RuntimeError( + "Some trips could not be matched to probs based on join columns %s." + % PROBS_JOIN_COLUMNS + ) # select the matching depart range (this should result on in exactly one chooser row per trip) choosers = choosers[chooser_probs] @@ -98,26 +131,23 @@ def choose_intermediate_trip_purpose(trips, probs_spec, estimator, trace_hh_id, if estimator: probs_cols = list(probs_spec.columns) print(choosers[probs_cols]) - estimator.write_table(choosers[probs_cols], 'probs', append=True) + estimator.write_table(choosers[probs_cols], "probs", append=True) choices, rands = logit.make_choices( - choosers[purpose_cols], - trace_label=trace_label, trace_choosers=choosers) + choosers[purpose_cols], trace_label=trace_label, trace_choosers=choosers + ) if have_trace_targets: - tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose']) - tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand']) + tracing.trace_df( + choices, "%s.choices" % trace_label, columns=[None, "trip_purpose"] + ) + tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) choices = choices.map(pd.Series(purpose_cols)) return choices -def run_trip_purpose( - trips_df, - estimator, - chunk_size, - trace_hh_id, - trace_label): +def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): """ trip purpose - main functionality separated from model step so it can be called iteratively @@ -133,27 +163,27 @@ def run_trip_purpose( """ # uniform across trip_purpose - chunk_tag = 'trip_purpose' + chunk_tag = "trip_purpose" - model_settings_file_name = 'trip_purpose.yaml' + model_settings_file_name = "trip_purpose.yaml" model_settings = config.read_model_settings(model_settings_file_name) - spec_file_name = model_settings.get('PROBS_SPEC', 'trip_purpose_probs.csv') - probs_spec = pd.read_csv(config.config_file_path(spec_file_name), comment='#') + spec_file_name = model_settings.get("PROBS_SPEC", "trip_purpose_probs.csv") + probs_spec = pd.read_csv(config.config_file_path(spec_file_name), comment="#") # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation # coefficients_df = simulate.read_model_coefficients(model_settings) # probs_spec = map_coefficients(probs_spec, coefficients_df) if estimator: - estimator.write_spec(model_settings, tag='PROBS_SPEC') + estimator.write_spec(model_settings, tag="PROBS_SPEC") estimator.write_model_settings(model_settings, model_settings_file_name) # estimator.write_coefficients(coefficients_df, model_settings) result_list = [] # - last trip of outbound tour gets primary_purpose - last_trip = (trips_df.trip_num == trips_df.trip_count) + last_trip = trips_df.trip_num == trips_df.trip_count purpose = trips_df.primary_purpose[last_trip & trips_df.outbound] result_list.append(purpose) logger.info("assign purpose to %s last outbound trips", purpose.shape[0]) @@ -161,7 +191,9 @@ def run_trip_purpose( # - last trip of inbound tour gets home (or work for atwork subtours) purpose = trips_df.primary_purpose[last_trip & ~trips_df.outbound] # FIXME should be lower case for consistency? - purpose = pd.Series(np.where(purpose == 'atwork', 'Work', 'Home'), index=purpose.index) + purpose = pd.Series( + np.where(purpose == "atwork", "Work", "Home"), index=purpose.index + ) result_list.append(purpose) logger.info("assign purpose to %s last inbound trips", purpose.shape[0]) @@ -169,28 +201,31 @@ def run_trip_purpose( trips_df = trips_df[~last_trip] logger.info("assign purpose to %s intermediate trips", trips_df.shape[0]) - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_dict = config.get_model_constants(model_settings) expressions.assign_columns( df=trips_df, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) - for i, trips_chunk, chunk_trace_label in \ - chunk.adaptive_chunked_choosers(trips_df, chunk_size, chunk_tag, trace_label): + for i, trips_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + trips_df, chunk_size, chunk_tag, trace_label + ): choices = choose_intermediate_trip_purpose( trips_chunk, probs_spec, estimator, trace_hh_id, - trace_label=chunk_trace_label) + trace_label=chunk_trace_label, + ) result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) if len(result_list) > 1: choices = pd.concat(result_list) @@ -199,10 +234,7 @@ def run_trip_purpose( @inject.step() -def trip_purpose( - trips, - chunk_size, - trace_hh_id): +def trip_purpose(trips, chunk_size, trace_hh_id): """ trip purpose model step - calls run_trip_purpose to run the actual model @@ -213,9 +245,14 @@ def trip_purpose( trips_df = trips.to_frame() - estimator = estimation.manager.begin_estimation('trip_purpose') + estimator = estimation.manager.begin_estimation("trip_purpose") if estimator: - chooser_cols_for_estimation = ['person_id', 'household_id', 'tour_id', 'trip_num'] + chooser_cols_for_estimation = [ + "person_id", + "household_id", + "tour_id", + "trip_num", + ] estimator.write_choosers(trips_df[chooser_cols_for_estimation]) choices = run_trip_purpose( @@ -223,16 +260,18 @@ def trip_purpose( estimator, chunk_size=chunk_size, trace_hh_id=trace_hh_id, - trace_label=trace_label + trace_label=trace_label, ) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'trips', 'purpose') # override choices + choices = estimator.get_survey_values( + choices, "trips", "purpose" + ) # override choices estimator.write_override_choices(choices) estimator.end_estimation() - trips_df['purpose'] = choices + trips_df["purpose"] = choices # we should have assigned a purpose to all trips assert not trips_df.purpose.isnull().any() @@ -240,8 +279,10 @@ def trip_purpose( pipeline.replace_table("trips", trips_df) if trace_hh_id: - tracing.trace_df(trips_df, - label=trace_label, - slicer='trip_id', - index_label='trip_id', - warn_if_empty=True) + tracing.trace_df( + trips_df, + label=trace_label, + slicer="trip_id", + index_label="trip_id", + warn_if_empty=True, + ) diff --git a/activitysim/abm/models/trip_purpose_and_destination.py b/activitysim/abm/models/trip_purpose_and_destination.py index 19ac2c4691..31bca977ec 100644 --- a/activitysim/abm/models/trip_purpose_and_destination.py +++ b/activitysim/abm/models/trip_purpose_and_destination.py @@ -4,18 +4,14 @@ import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import inject - -from activitysim.core.util import assign_in_place - -from activitysim.abm.models.trip_purpose import run_trip_purpose from activitysim.abm.models.trip_destination import run_trip_destination - -from activitysim.abm.models.util.trip import flag_failed_trip_leg_mates -from activitysim.abm.models.util.trip import cleanup_failed_trips +from activitysim.abm.models.trip_purpose import run_trip_purpose +from activitysim.abm.models.util.trip import ( + cleanup_failed_trips, + flag_failed_trip_leg_mates, +) +from activitysim.core import config, inject, pipeline, tracing +from activitysim.core.util import assign_in_place from .util import estimation @@ -23,11 +19,8 @@ def run_trip_purpose_and_destination( - trips_df, - tours_merged_df, - chunk_size, - trace_hh_id, - trace_label): + trips_df, tours_merged_df, chunk_size, trace_hh_id, trace_label +): assert not trips_df.empty @@ -36,10 +29,10 @@ def run_trip_purpose_and_destination( estimator=None, chunk_size=chunk_size, trace_hh_id=trace_hh_id, - trace_label=tracing.extend_trace_label(trace_label, 'purpose') + trace_label=tracing.extend_trace_label(trace_label, "purpose"), ) - trips_df['purpose'] = choices + trips_df["purpose"] = choices trips_df, save_sample_df = run_trip_destination( trips_df, @@ -47,27 +40,31 @@ def run_trip_purpose_and_destination( estimator=None, chunk_size=chunk_size, trace_hh_id=trace_hh_id, - trace_label=tracing.extend_trace_label(trace_label, 'destination')) + trace_label=tracing.extend_trace_label(trace_label, "destination"), + ) return trips_df, save_sample_df @inject.step() -def trip_purpose_and_destination( - trips, - tours_merged, - chunk_size, - trace_hh_id): +def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id): trace_label = "trip_purpose_and_destination" - model_settings = config.read_model_settings('trip_purpose_and_destination.yaml') + model_settings = config.read_model_settings("trip_purpose_and_destination.yaml") # for consistency, read sample_table_name setting from trip_destination settings file - trip_destination_model_settings = config.read_model_settings('trip_destination.yaml') - sample_table_name = trip_destination_model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') - want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None + trip_destination_model_settings = config.read_model_settings( + "trip_destination.yaml" + ) + sample_table_name = trip_destination_model_settings.get( + "DEST_CHOICE_SAMPLE_TABLE_NAME" + ) + want_sample_table = ( + config.setting("want_dest_choice_sample_tables") + and sample_table_name is not None + ) - MAX_ITERATIONS = model_settings.get('MAX_ITERATIONS', 5) + MAX_ITERATIONS = model_settings.get("MAX_ITERATIONS", 5) trips_df = trips.to_frame() tours_merged_df = tours_merged.to_frame() @@ -78,12 +75,12 @@ def trip_purpose_and_destination( # FIXME could allow MAX_ITERATIONS=0 to allow for cleanup-only run # in which case, we would need to drop bad trips, WITHOUT failing bad_trip leg_mates - assert (MAX_ITERATIONS > 0) + assert MAX_ITERATIONS > 0 # if trip_destination has been run before, keep only failed trips (and leg_mates) to retry - if 'destination' in trips_df: + if "destination" in trips_df: - if 'failed' not in trips_df.columns: + if "failed" not in trips_df.columns: # trip_destination model cleaned up any failed trips logger.info("%s - no failed column from prior model run." % trace_label) return @@ -91,35 +88,39 @@ def trip_purpose_and_destination( elif not trips_df.failed.any(): # 'failed' column but no failed trips from prior run of trip_destination logger.info("%s - no failed trips from prior model run." % trace_label) - trips_df.drop(columns='failed', inplace=True) + trips_df.drop(columns="failed", inplace=True) pipeline.replace_table("trips", trips_df) return else: logger.info("trip_destination has already been run. Rerunning failed trips") - flag_failed_trip_leg_mates(trips_df, 'failed') + flag_failed_trip_leg_mates(trips_df, "failed") trips_df = trips_df[trips_df.failed] - tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)] + tours_merged_df = tours_merged_df[ + tours_merged_df.index.isin(trips_df.tour_id) + ] logger.info("Rerunning %s failed trips and leg-mates" % trips_df.shape[0]) # drop any previously saved samples of failed trips if want_sample_table and pipeline.is_table(sample_table_name): logger.info("Dropping any previously saved samples of failed trips") save_sample_df = pipeline.get_table(sample_table_name) - save_sample_df.drop(trips_df.index, level='trip_id', inplace=True) + save_sample_df.drop(trips_df.index, level="trip_id", inplace=True) pipeline.replace_table(sample_table_name, save_sample_df) del save_sample_df # if we estimated trip_destination, there should have been no failed trips # if we didn't, but it is enabled, it is probably a configuration error # if we just estimated trip_purpose, it isn't clear what they are trying to do , nor how to handle it - assert not (estimation.manager.begin_estimation('trip_purpose') - or estimation.manager.begin_estimation('trip_destination')) + assert not ( + estimation.manager.begin_estimation("trip_purpose") + or estimation.manager.begin_estimation("trip_destination") + ) processed_trips = [] save_samples = [] i = 0 - TRIP_RESULT_COLUMNS = ['purpose', 'destination', 'origin', 'failed'] + TRIP_RESULT_COLUMNS = ["purpose", "destination", "origin", "failed"] while True: i += 1 @@ -133,14 +134,19 @@ def trip_purpose_and_destination( tours_merged_df, chunk_size=chunk_size, trace_hh_id=trace_hh_id, - trace_label=tracing.extend_trace_label(trace_label, "i%s" % i)) + trace_label=tracing.extend_trace_label(trace_label, "i%s" % i), + ) # # if testing, make sure at least one trip fails - if config.setting('testing_fail_trip_destination', False) \ - and (i == 1) and not trips_df.failed.any(): + if ( + config.setting("testing_fail_trip_destination", False) + and (i == 1) + and not trips_df.failed.any() + ): fail_o = trips_df[trips_df.trip_num < trips_df.trip_count].origin.max() - trips_df.failed = (trips_df.origin == fail_o) & \ - (trips_df.trip_num < trips_df.trip_count) + trips_df.failed = (trips_df.origin == fail_o) & ( + trips_df.trip_num < trips_df.trip_count + ) num_failed_trips = trips_df.failed.sum() @@ -151,10 +157,14 @@ def trip_purpose_and_destination( save_samples.append(save_sample_df) break - logger.warning("%s %s failed trips in iteration %s" % (trace_label, num_failed_trips, i)) + logger.warning( + "%s %s failed trips in iteration %s" % (trace_label, num_failed_trips, i) + ) file_name = "%s_i%s_failed_trips" % (trace_label, i) logger.info("writing failed trips to %s" % file_name) - tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False) + tracing.write_csv( + trips_df[trips_df.failed], file_name=file_name, transpose=False + ) # if max iterations reached, add remaining trips to processed_trips and give up # note that we do this BEFORE failing leg_mates so resulting trip legs are complete @@ -162,12 +172,14 @@ def trip_purpose_and_destination( logger.warning("%s too many iterations %s" % (trace_label, i)) processed_trips.append(trips_df[TRIP_RESULT_COLUMNS]) if save_sample_df is not None: - save_sample_df.drop(trips_df[trips_df.failed].index, level='trip_id', inplace=True) + save_sample_df.drop( + trips_df[trips_df.failed].index, level="trip_id", inplace=True + ) save_samples.append(save_sample_df) break # otherwise, if any trips failed, then their leg-mates trips must also fail - flag_failed_trip_leg_mates(trips_df, 'failed') + flag_failed_trip_leg_mates(trips_df, "failed") # add the good trips to processed_trips processed_trips.append(trips_df[~trips_df.failed][TRIP_RESULT_COLUMNS]) @@ -179,7 +191,7 @@ def trip_purpose_and_destination( # add trip samples of processed_trips to processed_samples if save_sample_df is not None: # drop failed trip samples - save_sample_df.drop(trips_df.index, level='trip_id', inplace=True) + save_sample_df.drop(trips_df.index, level="trip_id", inplace=True) save_samples.append(save_sample_df) # - assign result columns to trips @@ -187,11 +199,15 @@ def trip_purpose_and_destination( if len(save_samples) > 0: save_sample_df = pd.concat(save_samples) - logger.info("adding %s samples to %s" % (len(save_sample_df), sample_table_name)) + logger.info( + "adding %s samples to %s" % (len(save_sample_df), sample_table_name) + ) pipeline.extend_table(sample_table_name, save_sample_df) - logger.info("%s %s failed trips after %s iterations" % - (trace_label, processed_trips.failed.sum(), i)) + logger.info( + "%s %s failed trips after %s iterations" + % (trace_label, processed_trips.failed.sum(), i) + ) trips_df = trips.to_frame() assign_in_place(trips_df, processed_trips) @@ -207,14 +223,16 @@ def trip_purpose_and_destination( # once we discard failed trips, we should samples for all trips save_sample_df = pipeline.get_table(sample_table_name) # expect samples only for intermediate trip destinatinos - assert \ - len(save_sample_df.index.get_level_values(0).unique()) == \ - len(trips_df[trips_df.trip_num < trips_df.trip_count]) + assert len(save_sample_df.index.get_level_values(0).unique()) == len( + trips_df[trips_df.trip_num < trips_df.trip_count] + ) del save_sample_df if trace_hh_id: - tracing.trace_df(trips_df, - label=trace_label, - slicer='trip_id', - index_label='trip_id', - warn_if_empty=True) + tracing.trace_df( + trips_df, + label=trace_label, + slicer="trip_id", + index_label="trip_id", + warn_if_empty=True, + ) diff --git a/activitysim/abm/models/trip_scheduling.py b/activitysim/abm/models/trip_scheduling.py index 07ec0fcade..876207f6ff 100644 --- a/activitysim/abm/models/trip_scheduling.py +++ b/activitysim/abm/models/trip_scheduling.py @@ -1,26 +1,15 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range - import logging +from builtins import range import numpy as np import pandas as pd -from activitysim.core import logit -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import tracing -from activitysim.core import chunk -from activitysim.core import pipeline - -from activitysim.core.util import reindex - -from activitysim.abm.models.util.trip import failed_trip_cohorts -from activitysim.abm.models.util.trip import cleanup_failed_trips - from activitysim.abm.models.util import estimation - +from activitysim.abm.models.util.trip import cleanup_failed_trips, failed_trip_cohorts +from activitysim.core import chunk, config, inject, logit, pipeline, tracing +from activitysim.core.util import reindex logger = logging.getLogger(__name__) @@ -35,14 +24,14 @@ NO_TRIP_ID = 0 NO_DEPART = 0 -DEPART_ALT_BASE = 'DEPART_ALT_BASE' +DEPART_ALT_BASE = "DEPART_ALT_BASE" -FAILFIX = 'FAILFIX' -FAILFIX_CHOOSE_MOST_INITIAL = 'choose_most_initial' -FAILFIX_DROP_AND_CLEANUP = 'drop_and_cleanup' +FAILFIX = "FAILFIX" +FAILFIX_CHOOSE_MOST_INITIAL = "choose_most_initial" +FAILFIX_DROP_AND_CLEANUP = "drop_and_cleanup" FAILFIX_DEFAULT = FAILFIX_CHOOSE_MOST_INITIAL -PROBS_JOIN_COLUMNS = ['primary_purpose', 'outbound', 'tour_hour', 'trip_num'] +PROBS_JOIN_COLUMNS = ["primary_purpose", "outbound", "tour_hour", "trip_num"] def set_tour_hour(trips, tours): @@ -60,33 +49,37 @@ def set_tour_hour(trips, tours): """ # all trips must depart between tour start and end - trips['earliest'] = reindex(tours.start, trips.tour_id) - trips['latest'] = reindex(tours.end, trips.tour_id) + trips["earliest"] = reindex(tours.start, trips.tour_id) + trips["latest"] = reindex(tours.end, trips.tour_id) # tour_hour is start for outbound trips, and end for inbound trips - trips['tour_hour'] = np.where( - trips.outbound, - trips['earliest'], - trips['latest']).astype(np.int8) + trips["tour_hour"] = np.where( + trips.outbound, trips["earliest"], trips["latest"] + ).astype(np.int8) # subtours indexed by parent_tour_id - subtours = tours.loc[tours.primary_purpose == 'atwork', - ['tour_num', 'tour_count', 'parent_tour_id', 'start', 'end']] + subtours = tours.loc[ + tours.primary_purpose == "atwork", + ["tour_num", "tour_count", "parent_tour_id", "start", "end"], + ] subtours.parent_tour_id = subtours.parent_tour_id.astype(np.int64) - subtours = subtours.set_index('parent_tour_id') + subtours = subtours.set_index("parent_tour_id") subtours = subtours.astype(np.int16) # remaining columns are all small ints # bool series trip_has_subtours = trips.tour_id.isin(subtours.index) outbound = trip_has_subtours & trips.outbound - trips.loc[outbound, 'latest'] = \ - reindex(subtours[subtours.tour_num == 1]['start'], trips[outbound].tour_id) + trips.loc[outbound, "latest"] = reindex( + subtours[subtours.tour_num == 1]["start"], trips[outbound].tour_id + ) inbound = trip_has_subtours & ~trips.outbound - trips.loc[inbound, 'earliest'] = \ - reindex(subtours[subtours.tour_num == subtours.tour_count]['end'], trips[inbound].tour_id) + trips.loc[inbound, "earliest"] = reindex( + subtours[subtours.tour_num == subtours.tour_count]["end"], + trips[inbound].tour_id, + ) def clip_probs(trips, probs, model_settings): @@ -119,18 +112,23 @@ def clip_probs(trips, probs, model_settings): probs = probs.div(probs.sum(axis=1), axis=0) num_rows, num_cols = probs.shape - ix_map = np.tile(np.arange(0, num_cols), num_rows).reshape(num_rows, num_cols) + depart_alt_base + ix_map = ( + np.tile(np.arange(0, num_cols), num_rows).reshape(num_rows, num_cols) + + depart_alt_base + ) # 5 6 7 8 9 10... # 5 6 7 8 9 10... # 5 6 7 8 9 10... - clip_mask = ((ix_map >= trips.earliest.values.reshape(num_rows, 1)) & - (ix_map <= trips.latest.values.reshape(num_rows, 1))) * 1 + clip_mask = ( + (ix_map >= trips.earliest.values.reshape(num_rows, 1)) + & (ix_map <= trips.latest.values.reshape(num_rows, 1)) + ) * 1 # [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0] # [0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] # [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]... - probs = probs*clip_mask + probs = probs * clip_mask return probs @@ -155,7 +153,7 @@ def report_bad_choices(bad_row_map, df, filename, trace_label, trace_choosers=No hh_ids = tracing.hh_id_for_chooser(df.index, df) else: hh_ids = tracing.hh_id_for_chooser(df.index, trace_choosers) - df['household_id'] = hh_ids + df["household_id"] = hh_ids filename = "%s.%s" % (trace_label, filename) @@ -166,20 +164,25 @@ def report_bad_choices(bad_row_map, df, filename, trace_label, trace_choosers=No MAX_PRINT = 0 for idx in df.index[:MAX_PRINT].values: - row_msg = "%s : failed %s = %s (hh_id = %s)" % \ - (trace_label, df.index.name, idx, df.household_id.loc[idx]) + row_msg = "%s : failed %s = %s (hh_id = %s)" % ( + trace_label, + df.index.name, + idx, + df.household_id.loc[idx], + ) logger.warning(row_msg) def schedule_nth_trips( - trips, - probs_spec, - model_settings, - first_trip_in_leg, - report_failed_trips, - trace_hh_id, - trace_label): + trips, + probs_spec, + model_settings, + first_trip_in_leg, + report_failed_trips, + trace_hh_id, + trace_label, +): """ We join each trip with the appropriate row in probs_spec by joining on probs_join_cols, which should exist in both trips, probs_spec dataframe. @@ -204,17 +207,18 @@ def schedule_nth_trips( time periods depart choices, one per trip (except for trips with zero probs) """ - depart_alt_base = model_settings.get('DEPART_ALT_BASE') + depart_alt_base = model_settings.get("DEPART_ALT_BASE") probs_cols = [c for c in probs_spec.columns if c not in PROBS_JOIN_COLUMNS] # left join trips to probs (there may be multiple rows per trip for multiple depart ranges) - choosers = pd.merge(trips.reset_index(), probs_spec, on=PROBS_JOIN_COLUMNS, - how='left').set_index('trip_id') + choosers = pd.merge( + trips.reset_index(), probs_spec, on=PROBS_JOIN_COLUMNS, how="left" + ).set_index("trip_id") chunk.log_df(trace_label, "choosers", choosers) if trace_hh_id and tracing.has_trace_targets(trips): - tracing.trace_df(choosers, '%s.choosers' % trace_label) + tracing.trace_df(choosers, "%s.choosers" % trace_label) # choosers should now match trips row for row assert choosers.index.is_unique @@ -229,23 +233,25 @@ def schedule_nth_trips( chooser_probs = chooser_probs.div(chooser_probs.sum(axis=1), axis=0).fillna(0) # probs should sum to 1 with residual probs resulting in choice of 'fail' - chooser_probs['fail'] = 1 - chooser_probs.sum(axis=1).clip(0, 1) + chooser_probs["fail"] = 1 - chooser_probs.sum(axis=1).clip(0, 1) chunk.log_df(trace_label, "chooser_probs", chooser_probs) if trace_hh_id and tracing.has_trace_targets(trips): - tracing.trace_df(chooser_probs, '%s.chooser_probs' % trace_label) + tracing.trace_df(chooser_probs, "%s.chooser_probs" % trace_label) - choices, rands = logit.make_choices(chooser_probs, trace_label=trace_label, trace_choosers=choosers) + choices, rands = logit.make_choices( + chooser_probs, trace_label=trace_label, trace_choosers=choosers + ) chunk.log_df(trace_label, "choices", choices) chunk.log_df(trace_label, "rands", rands) if trace_hh_id and tracing.has_trace_targets(trips): - tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'depart']) - tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand']) + tracing.trace_df(choices, "%s.choices" % trace_label, columns=[None, "depart"]) + tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) # convert alt choice index to depart time (setting failed choices to -1) - failed = (choices == chooser_probs.columns.get_loc('fail')) + failed = choices == chooser_probs.columns.get_loc("fail") choices = (choices + depart_alt_base).where(~failed, -1) chunk.log_df(trace_label, "failed", failed) @@ -255,14 +261,15 @@ def schedule_nth_trips( report_bad_choices( bad_row_map=failed, df=choosers, - filename='failed_choosers', + filename="failed_choosers", trace_label=trace_label, - trace_choosers=None) + trace_choosers=None, + ) # trace before removing failures if trace_hh_id and tracing.has_trace_targets(trips): - tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'depart']) - tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand']) + tracing.trace_df(choices, "%s.choices" % trace_label, columns=[None, "depart"]) + tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) # remove any failed choices if failed.any(): @@ -275,12 +282,14 @@ def schedule_nth_trips( def schedule_trips_in_leg( - outbound, - trips, - probs_spec, - model_settings, - is_last_iteration, - trace_hh_id, trace_label): + outbound, + trips, + probs_spec, + model_settings, + is_last_iteration, + trace_hh_id, + trace_label, +): """ Parameters @@ -308,8 +317,10 @@ def schedule_trips_in_leg( assert (trips.outbound == outbound).all() # initial trip of leg and all atwork trips get tour_hour - is_initial = (trips.trip_num == 1) if outbound else (trips.trip_num == trips.trip_count) - no_scheduling = is_initial | (trips.primary_purpose == 'atwork') + is_initial = ( + (trips.trip_num == 1) if outbound else (trips.trip_num == trips.trip_count) + ) + no_scheduling = is_initial | (trips.primary_purpose == "atwork") choices = trips.tour_hour[no_scheduling] if no_scheduling.all(): @@ -321,8 +332,10 @@ def schedule_trips_in_leg( # add next_trip_id temp column (temp as trips is now a copy, as result of slicing) trips = trips.sort_index() - trips['next_trip_id'] = np.roll(trips.index, -1 if outbound else 1) - is_final = (trips.trip_num == trips.trip_count) if outbound else (trips.trip_num == 1) + trips["next_trip_id"] = np.roll(trips.index, -1 if outbound else 1) + is_final = ( + (trips.trip_num == trips.trip_count) if outbound else (trips.trip_num == 1) + ) trips.next_trip_id = trips.next_trip_id.where(~is_final, NO_TRIP_ID) # iterate over outbound trips in ascending trip_num order, skipping the initial trip @@ -335,7 +348,7 @@ def schedule_trips_in_leg( else: nth_trips = trips[trips.trip_num == trips.trip_count - i] - nth_trace_label = tracing.extend_trace_label(trace_label, 'num_%s' % i) + nth_trace_label = tracing.extend_trace_label(trace_label, "num_%s" % i) choices = schedule_nth_trips( nth_trips, @@ -344,30 +357,36 @@ def schedule_trips_in_leg( first_trip_in_leg=first_trip_in_leg, report_failed_trips=is_last_iteration, trace_hh_id=trace_hh_id, - trace_label=nth_trace_label) + trace_label=nth_trace_label, + ) # if outbound, this trip's depart constrains next trip's earliest depart option # if inbound, we are handling in reverse order, so it constrains latest depart instead - ADJUST_NEXT_DEPART_COL = 'earliest' if outbound else 'latest' + ADJUST_NEXT_DEPART_COL = "earliest" if outbound else "latest" # most initial departure (when no choice was made because all probs were zero) if is_last_iteration and (failfix == FAILFIX_CHOOSE_MOST_INITIAL): choices = choices.reindex(nth_trips.index) - logger.warning("%s coercing %s depart choices to most initial" % - (nth_trace_label, choices.isna().sum())) + logger.warning( + "%s coercing %s depart choices to most initial" + % (nth_trace_label, choices.isna().sum()) + ) choices = choices.fillna(trips[ADJUST_NEXT_DEPART_COL]) # adjust allowed depart range of next trip - has_next_trip = (nth_trips.next_trip_id != NO_TRIP_ID) + has_next_trip = nth_trips.next_trip_id != NO_TRIP_ID if has_next_trip.any(): next_trip_ids = nth_trips.next_trip_id[has_next_trip] # patch choice any trips with next_trips that weren't scheduled - trips.loc[next_trip_ids, ADJUST_NEXT_DEPART_COL] = \ - choices.reindex(next_trip_ids.index).fillna(trips[ADJUST_NEXT_DEPART_COL]).values + trips.loc[next_trip_ids, ADJUST_NEXT_DEPART_COL] = ( + choices.reindex(next_trip_ids.index) + .fillna(trips[ADJUST_NEXT_DEPART_COL]) + .values + ) result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) first_trip_in_leg = False @@ -378,16 +397,17 @@ def schedule_trips_in_leg( def run_trip_scheduling( - trips_chunk, - tours, - probs_spec, - model_settings, - estimator, - is_last_iteration, - chunk_size, - chunk_tag, - trace_hh_id, - trace_label): + trips_chunk, + tours, + probs_spec, + model_settings, + estimator, + is_last_iteration, + chunk_size, + chunk_tag, + trace_hh_id, + trace_label, +): # only non-initial trips require scheduling, segment handing first such trip in tour will use most space # is_outbound_chooser = (trips.trip_num > 1) & trips.outbound & (trips.primary_purpose != 'atwork') @@ -398,35 +418,35 @@ def run_trip_scheduling( if trips_chunk.outbound.any(): leg_chunk = trips_chunk[trips_chunk.outbound] - leg_trace_label = tracing.extend_trace_label(trace_label, 'outbound') - choices = \ - schedule_trips_in_leg( - outbound=True, - trips=leg_chunk, - probs_spec=probs_spec, - model_settings=model_settings, - is_last_iteration=is_last_iteration, - trace_hh_id=trace_hh_id, - trace_label=leg_trace_label) + leg_trace_label = tracing.extend_trace_label(trace_label, "outbound") + choices = schedule_trips_in_leg( + outbound=True, + trips=leg_chunk, + probs_spec=probs_spec, + model_settings=model_settings, + is_last_iteration=is_last_iteration, + trace_hh_id=trace_hh_id, + trace_label=leg_trace_label, + ) result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) if (~trips_chunk.outbound).any(): leg_chunk = trips_chunk[~trips_chunk.outbound] - leg_trace_label = tracing.extend_trace_label(trace_label, 'inbound') - choices = \ - schedule_trips_in_leg( - outbound=False, - trips=leg_chunk, - probs_spec=probs_spec, - model_settings=model_settings, - is_last_iteration=is_last_iteration, - trace_hh_id=trace_hh_id, - trace_label=leg_trace_label) + leg_trace_label = tracing.extend_trace_label(trace_label, "inbound") + choices = schedule_trips_in_leg( + outbound=False, + trips=leg_chunk, + probs_spec=probs_spec, + model_settings=model_settings, + is_last_iteration=is_last_iteration, + trace_hh_id=trace_hh_id, + trace_label=leg_trace_label, + ) result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) choices = pd.concat(result_list) @@ -434,11 +454,7 @@ def run_trip_scheduling( @inject.step() -def trip_scheduling( - trips, - tours, - chunk_size, - trace_hh_id): +def trip_scheduling(trips, tours, chunk_size, trace_hh_id): """ Trip scheduling assigns depart times for trips within the start, end limits of the tour. @@ -486,7 +502,7 @@ def trip_scheduling( """ trace_label = "trip_scheduling" - model_settings_file_name = 'trip_scheduling.yaml' + model_settings_file_name = "trip_scheduling.yaml" model_settings = config.read_model_settings(model_settings_file_name) trips_df = trips.to_frame() @@ -497,35 +513,52 @@ def trip_scheduling( # trip_scheduling is a probabilistic model ane we don't support estimation, # but we do need to override choices in estimation mode - estimator = estimation.manager.begin_estimation('trip_scheduling') + estimator = estimation.manager.begin_estimation("trip_scheduling") if estimator: - estimator.write_spec(model_settings, tag='PROBS_SPEC') + estimator.write_spec(model_settings, tag="PROBS_SPEC") estimator.write_model_settings(model_settings, model_settings_file_name) - chooser_cols_for_estimation = ['person_id', 'household_id', 'tour_id', 'trip_num', 'trip_count', - 'primary_purpose', 'outbound', 'earliest', 'latest', 'tour_hour', ] + chooser_cols_for_estimation = [ + "person_id", + "household_id", + "tour_id", + "trip_num", + "trip_count", + "primary_purpose", + "outbound", + "earliest", + "latest", + "tour_hour", + ] estimator.write_choosers(trips_df[chooser_cols_for_estimation]) - probs_spec = pd.read_csv(config.config_file_path('trip_scheduling_probs.csv'), comment='#') + probs_spec = pd.read_csv( + config.config_file_path("trip_scheduling_probs.csv"), comment="#" + ) # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation # coefficients_df = simulate.read_model_coefficients(model_settings) # probs_spec = map_coefficients(probs_spec, coefficients_df) # add tour-based chunk_id so we can chunk all trips in tour together - trips_df['chunk_id'] = reindex(pd.Series(list(range(len(tours))), tours.index), trips_df.tour_id) + trips_df["chunk_id"] = reindex( + pd.Series(list(range(len(tours))), tours.index), trips_df.tour_id + ) - assert 'DEPART_ALT_BASE' in model_settings + assert "DEPART_ALT_BASE" in model_settings failfix = model_settings.get(FAILFIX, FAILFIX_DEFAULT) - max_iterations = model_settings.get('MAX_ITERATIONS', 1) + max_iterations = model_settings.get("MAX_ITERATIONS", 1) assert max_iterations > 0 choices_list = [] - for chunk_i, trips_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers_by_chunk_id(trips_df, - chunk_size, - trace_label, - trace_label): + for ( + chunk_i, + trips_chunk, + chunk_trace_label, + ) in chunk.adaptive_chunked_choosers_by_chunk_id( + trips_df, chunk_size, trace_label, trace_label + ): i = 0 while (i < max_iterations) and not trips_chunk.empty: @@ -534,23 +567,28 @@ def trip_scheduling( with chunk.chunk_log(trace_label) if i == 0 else chunk.chunk_log_skip(): i += 1 - is_last_iteration = (i == max_iterations) + is_last_iteration = i == max_iterations trace_label_i = tracing.extend_trace_label(trace_label, "i%s" % i) - logger.info("%s scheduling %s trips within chunk %s", trace_label_i, trips_chunk.shape[0], chunk_i) - - choices = \ - run_trip_scheduling( - trips_chunk, - tours, - probs_spec, - model_settings, - estimator=estimator, - is_last_iteration=is_last_iteration, - trace_hh_id=trace_hh_id, - chunk_size=chunk_size, - chunk_tag=trace_label, - trace_label=trace_label_i) + logger.info( + "%s scheduling %s trips within chunk %s", + trace_label_i, + trips_chunk.shape[0], + chunk_i, + ) + + choices = run_trip_scheduling( + trips_chunk, + tours, + probs_spec, + model_settings, + estimator=estimator, + is_last_iteration=is_last_iteration, + trace_hh_id=trace_hh_id, + chunk_size=chunk_size, + chunk_tag=trace_label, + trace_label=trace_label_i, + ) # boolean series of trips whose individual trip scheduling failed failed = choices.reindex(trips_chunk.index).isnull() @@ -571,24 +609,30 @@ def trip_scheduling( if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'trips', 'depart') # override choices + choices = estimator.get_survey_values( + choices, "trips", "depart" + ) # override choices estimator.write_override_choices(choices) estimator.end_estimation() assert not choices.isnull().any() if choices.isnull().any(): - logger.warning("%s of %s trips could not be scheduled after %s iterations" % - (choices.isnull().sum(), trips_df.shape[0], i)) + logger.warning( + "%s of %s trips could not be scheduled after %s iterations" + % (choices.isnull().sum(), trips_df.shape[0], i) + ) if failfix != FAILFIX_DROP_AND_CLEANUP: - raise RuntimeError("%s setting '%s' not enabled in settings" % - (FAILFIX, FAILFIX_DROP_AND_CLEANUP)) + raise RuntimeError( + "%s setting '%s' not enabled in settings" + % (FAILFIX, FAILFIX_DROP_AND_CLEANUP) + ) - trips_df['failed'] = choices.isnull() + trips_df["failed"] = choices.isnull() trips_df = cleanup_failed_trips(trips_df) choices = choices.reindex(trips_df.index) - trips_df['depart'] = choices + trips_df["depart"] = choices assert not trips_df.depart.isnull().any() diff --git a/activitysim/abm/models/trip_scheduling_choice.py b/activitysim/abm/models/trip_scheduling_choice.py index c950b0726f..463a00ee76 100644 --- a/activitysim/abm/models/trip_scheduling_choice.py +++ b/activitysim/abm/models/trip_scheduling_choice.py @@ -3,39 +3,40 @@ import numpy as np import pandas as pd -from activitysim.core import chunk -from activitysim.core import config -from activitysim.core import expressions -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import tracing - -from activitysim.abm.models.util.trip import generate_alternative_sizes, get_time_windows +from activitysim.abm.models.util.trip import ( + generate_alternative_sizes, + get_time_windows, +) +from activitysim.core import ( + chunk, + config, + expressions, + inject, + pipeline, + simulate, + tracing, +) from activitysim.core.interaction_sample_simulate import _interaction_sample_simulate logger = logging.getLogger(__name__) -TOUR_DURATION_COLUMN = 'duration' -NUM_ALTERNATIVES = 'num_alts' -MAIN_LEG_DURATION = 'main_leg_duration' -IB_DURATION = 'inbound_duration' -OB_DURATION = 'outbound_duration' -NUM_OB_STOPS = 'num_outbound_stops' -NUM_IB_STOPS = 'num_inbound_stops' -HAS_OB_STOPS = 'has_outbound_stops' -HAS_IB_STOPS = 'has_inbound_stops' -LAST_OB_STOP = 'last_outbound_stop' -FIRST_IB_STOP = 'last_inbound_stop' +TOUR_DURATION_COLUMN = "duration" +NUM_ALTERNATIVES = "num_alts" +MAIN_LEG_DURATION = "main_leg_duration" +IB_DURATION = "inbound_duration" +OB_DURATION = "outbound_duration" +NUM_OB_STOPS = "num_outbound_stops" +NUM_IB_STOPS = "num_inbound_stops" +HAS_OB_STOPS = "has_outbound_stops" +HAS_IB_STOPS = "has_inbound_stops" +LAST_OB_STOP = "last_outbound_stop" +FIRST_IB_STOP = "last_inbound_stop" -SCHEDULE_ID = 'schedule_id' +SCHEDULE_ID = "schedule_id" -OUTBOUND_FLAG = 'outbound' +OUTBOUND_FLAG = "outbound" -TEMP_COLS = [NUM_OB_STOPS, LAST_OB_STOP, - NUM_IB_STOPS, FIRST_IB_STOP, - NUM_ALTERNATIVES - ] +TEMP_COLS = [NUM_OB_STOPS, LAST_OB_STOP, NUM_IB_STOPS, FIRST_IB_STOP, NUM_ALTERNATIVES] def generate_schedule_alternatives(tours): @@ -66,7 +67,9 @@ def generate_schedule_alternatives(tours): stops. :return: pd.Dataframe: Potential time duration windows. """ - assert set([NUM_IB_STOPS, NUM_OB_STOPS, TOUR_DURATION_COLUMN]).issubset(tours.columns) + assert set([NUM_IB_STOPS, NUM_OB_STOPS, TOUR_DURATION_COLUMN]).issubset( + tours.columns + ) stop_pattern = tours[HAS_OB_STOPS].astype(int) + tours[HAS_IB_STOPS].astype(int) @@ -87,7 +90,9 @@ def no_stops_patterns(tours): :param tours: pd.Dataframe: Tours with no intermediate stops. :return: pd.Dataframe: Main leg duration, outbound leg duration, and inbound leg duration """ - alternatives = tours[[TOUR_DURATION_COLUMN]].rename(columns={TOUR_DURATION_COLUMN: MAIN_LEG_DURATION}) + alternatives = tours[[TOUR_DURATION_COLUMN]].rename( + columns={TOUR_DURATION_COLUMN: MAIN_LEG_DURATION} + ) alternatives[[IB_DURATION, OB_DURATION]] = 0 return alternatives.astype(int) @@ -106,15 +111,19 @@ def stop_one_way_only_patterns(tours, travel_duration_col=TOUR_DURATION_COLUMN): assert travel_duration_col in tours.columns - indexes, patterns, pattern_sizes = get_pattern_index_and_arrays(tours.index, tours[travel_duration_col], - one_way=True) + indexes, patterns, pattern_sizes = get_pattern_index_and_arrays( + tours.index, tours[travel_duration_col], one_way=True + ) direction = np.repeat(tours[HAS_OB_STOPS], pattern_sizes) inbound = np.where(direction == 0, patterns[:, 1], 0) outbound = np.where(direction == 1, patterns[:, 1], 0) - patterns = pd.DataFrame(index=indexes, data=np.column_stack((patterns[:, 0], outbound, inbound)), - columns=[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]) + patterns = pd.DataFrame( + index=indexes, + data=np.column_stack((patterns[:, 0], outbound, inbound)), + columns=[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION], + ) patterns.index.name = tours.index.name return patterns @@ -134,10 +143,15 @@ def stop_two_way_only_patterns(tours, travel_duration_col=TOUR_DURATION_COLUMN): assert travel_duration_col in tours.columns - indexes, patterns, _ = get_pattern_index_and_arrays(tours.index, tours[travel_duration_col], one_way=False) + indexes, patterns, _ = get_pattern_index_and_arrays( + tours.index, tours[travel_duration_col], one_way=False + ) - patterns = pd.DataFrame(index=indexes, data=patterns, - columns=[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]) + patterns = pd.DataFrame( + index=indexes, + data=patterns, + columns=[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION], + ) patterns.index.name = tours.index.name return patterns @@ -163,7 +177,9 @@ def get_pattern_index_and_arrays(tour_indexes, durations, one_way=True): pattern_sizes = [] for duration in durations: - possible_windows = time_windows[:max_columns, np.where(time_windows.sum(axis=0) == duration)[0]] + possible_windows = time_windows[ + :max_columns, np.where(time_windows.sum(axis=0) == duration)[0] + ] possible_windows = np.unique(possible_windows, axis=1).transpose() patterns.append(possible_windows) pattern_sizes.append(possible_windows.shape[0]) @@ -199,11 +215,12 @@ def get_spec_for_segment(model_settings, spec_name, segment): return spec -def run_trip_scheduling_choice(spec, tours, skims, locals_dict, - chunk_size, trace_hh_id, trace_label): +def run_trip_scheduling_choice( + spec, tours, skims, locals_dict, chunk_size, trace_hh_id, trace_label +): NUM_TOUR_LEGS = 3 - trace_label = tracing.extend_trace_label(trace_label, 'interaction_sample_simulate') + trace_label = tracing.extend_trace_label(trace_label, "interaction_sample_simulate") # FIXME: The duration, start, and end should be ints well before we get here... tours[TOUR_DURATION_COLUMN] = tours[TOUR_DURATION_COLUMN].astype(np.int8) @@ -221,14 +238,21 @@ def run_trip_scheduling_choice(spec, tours, skims, locals_dict, # Assert the number of tour leg schedule alternatives for each tour tours[NUM_ALTERNATIVES] = 1 - tours.loc[tours[HAS_OB_STOPS] != tours[HAS_IB_STOPS], NUM_ALTERNATIVES] = tours[TOUR_DURATION_COLUMN] + 1 - tours.loc[tours[HAS_OB_STOPS] & tours[HAS_IB_STOPS], NUM_ALTERNATIVES] = \ - tours.apply(lambda x: alt_sizes[1, x.duration], axis=1) + tours.loc[tours[HAS_OB_STOPS] != tours[HAS_IB_STOPS], NUM_ALTERNATIVES] = ( + tours[TOUR_DURATION_COLUMN] + 1 + ) + tours.loc[ + tours[HAS_OB_STOPS] & tours[HAS_IB_STOPS], NUM_ALTERNATIVES + ] = tours.apply(lambda x: alt_sizes[1, x.duration], axis=1) # If no intermediate stops on the tour, then then main leg duration # equals the tour duration and the intermediate durations are zero - tours.loc[~tours[HAS_OB_STOPS] & ~tours[HAS_IB_STOPS], MAIN_LEG_DURATION] = tours[TOUR_DURATION_COLUMN] - tours.loc[~tours[HAS_OB_STOPS] & ~tours[HAS_IB_STOPS], [IB_DURATION, OB_DURATION]] = 0 + tours.loc[~tours[HAS_OB_STOPS] & ~tours[HAS_IB_STOPS], MAIN_LEG_DURATION] = tours[ + TOUR_DURATION_COLUMN + ] + tours.loc[ + ~tours[HAS_OB_STOPS] & ~tours[HAS_IB_STOPS], [IB_DURATION, OB_DURATION] + ] = 0 # We only need to determine schedules for tours with intermediate stops indirect_tours = tours.loc[tours[HAS_OB_STOPS] | tours[HAS_IB_STOPS]] @@ -237,8 +261,9 @@ def run_trip_scheduling_choice(spec, tours, skims, locals_dict, # Iterate through the chunks result_list = [] - for i, choosers, chunk_trace_label in \ - chunk.adaptive_chunked_choosers(indirect_tours, chunk_size, trace_label): + for i, choosers, chunk_trace_label in chunk.adaptive_chunked_choosers( + indirect_tours, chunk_size, trace_label + ): # Sort the choosers and get the schedule alternatives choosers = choosers.sort_index() @@ -254,14 +279,15 @@ def run_trip_scheduling_choice(spec, tours, skims, locals_dict, alternatives=schedules, spec=spec, choice_column=SCHEDULE_ID, - allow_zero_probs=True, zero_prob_choice_val=-999, + allow_zero_probs=True, + zero_prob_choice_val=-999, log_alt_losers=False, want_logsums=False, skims=skims, locals_d=locals_dict, trace_label=chunk_trace_label, - trace_choice_name='trip_schedule_stage_1', - estimator=None + trace_choice_name="trip_schedule_stage_1", + estimator=None, ) assert len(choices.index) == len(choosers.index) @@ -270,7 +296,7 @@ def run_trip_scheduling_choice(spec, tours, skims, locals_dict, result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -285,24 +311,20 @@ def run_trip_scheduling_choice(spec, tours, skims, locals_dict, tours.update(choices[[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]]) # Cleanup data types and drop temporary columns - tours[[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]] = \ - tours[[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]].astype(np.int8) + tours[[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]] = tours[ + [MAIN_LEG_DURATION, OB_DURATION, IB_DURATION] + ].astype(np.int8) tours = tours.drop(columns=TEMP_COLS) return tours @inject.step() -def trip_scheduling_choice( - trips, - tours, - skim_dict, - chunk_size, - trace_hh_id): +def trip_scheduling_choice(trips, tours, skim_dict, chunk_size, trace_hh_id): - trace_label = 'trip_scheduling_choice' - model_settings = config.read_model_settings('trip_scheduling_choice.yaml') - spec = get_spec_for_segment(model_settings, 'SPECIFICATION', 'stage_one') + trace_label = "trip_scheduling_choice" + model_settings = config.read_model_settings("trip_scheduling_choice.yaml") + spec = get_spec_for_segment(model_settings, "SPECIFICATION", "stage_one") trips_df = trips.to_frame() tours_df = tours.to_frame() @@ -310,20 +332,36 @@ def trip_scheduling_choice( outbound_trips = trips_df[trips_df[OUTBOUND_FLAG]] inbound_trips = trips_df[~trips_df[OUTBOUND_FLAG]] - last_outbound_trip = trips_df.loc[outbound_trips.groupby('tour_id')['trip_num'].idxmax()] - first_inbound_trip = trips_df.loc[inbound_trips.groupby('tour_id')['trip_num'].idxmin()] - - tours_df[NUM_OB_STOPS] = outbound_trips.groupby('tour_id').size().reindex(tours.index) - 1 - tours_df[NUM_IB_STOPS] = inbound_trips.groupby('tour_id').size().reindex(tours.index) - 1 - tours_df[LAST_OB_STOP] = last_outbound_trip[['tour_id', 'origin']].set_index('tour_id').reindex(tours.index) - tours_df[FIRST_IB_STOP] = first_inbound_trip[['tour_id', 'destination']].set_index('tour_id').reindex(tours.index) - - preprocessor_settings = model_settings.get('PREPROCESSOR', None) + last_outbound_trip = trips_df.loc[ + outbound_trips.groupby("tour_id")["trip_num"].idxmax() + ] + first_inbound_trip = trips_df.loc[ + inbound_trips.groupby("tour_id")["trip_num"].idxmin() + ] + + tours_df[NUM_OB_STOPS] = ( + outbound_trips.groupby("tour_id").size().reindex(tours.index) - 1 + ) + tours_df[NUM_IB_STOPS] = ( + inbound_trips.groupby("tour_id").size().reindex(tours.index) - 1 + ) + tours_df[LAST_OB_STOP] = ( + last_outbound_trip[["tour_id", "origin"]] + .set_index("tour_id") + .reindex(tours.index) + ) + tours_df[FIRST_IB_STOP] = ( + first_inbound_trip[["tour_id", "destination"]] + .set_index("tour_id") + .reindex(tours.index) + ) + + preprocessor_settings = model_settings.get("PREPROCESSOR", None) if preprocessor_settings: # hack: preprocessor adds origin column in place if it does not exist already - od_skim_stack_wrapper = skim_dict.wrap('origin', 'destination') - do_skim_stack_wrapper = skim_dict.wrap('destination', 'origin') + od_skim_stack_wrapper = skim_dict.wrap("origin", "destination") + do_skim_stack_wrapper = skim_dict.wrap("destination", "origin") obib_skim_stack_wrapper = skim_dict.wrap(LAST_OB_STOP, FIRST_IB_STOP) skims = [od_skim_stack_wrapper, do_skim_stack_wrapper, obib_skim_stack_wrapper] @@ -331,7 +369,7 @@ def trip_scheduling_choice( locals_dict = { "od_skims": od_skim_stack_wrapper, "do_skims": do_skim_stack_wrapper, - "obib_skims": obib_skim_stack_wrapper + "obib_skims": obib_skim_stack_wrapper, } simulate.set_skim_wrapper_targets(tours_df, skims) @@ -340,8 +378,11 @@ def trip_scheduling_choice( df=tours_df, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) - tours_df = run_trip_scheduling_choice(spec, tours_df, skims, locals_dict, chunk_size, trace_hh_id, trace_label) + tours_df = run_trip_scheduling_choice( + spec, tours_df, skims, locals_dict, chunk_size, trace_hh_id, trace_label + ) pipeline.replace_table("tours", tours_df) diff --git a/activitysim/abm/models/util/canonical_ids.py b/activitysim/abm/models/util/canonical_ids.py index 569e8e5f03..d703809c37 100644 --- a/activitysim/abm/models/util/canonical_ids.py +++ b/activitysim/abm/models/util/canonical_ids.py @@ -10,16 +10,22 @@ logger = logging.getLogger(__name__) -RANDOM_CHANNELS = ['households', 'persons', 'tours', 'joint_tour_participants', 'trips'] -TRACEABLE_TABLES = ['households', 'persons', 'tours', 'joint_tour_participants', 'trips'] +RANDOM_CHANNELS = ["households", "persons", "tours", "joint_tour_participants", "trips"] +TRACEABLE_TABLES = [ + "households", + "persons", + "tours", + "joint_tour_participants", + "trips", +] CANONICAL_TABLE_INDEX_NAMES = { - 'households': 'household_id', - 'persons': 'person_id', - 'tours': 'tour_id', - 'joint_tour_participants': 'participant_id', - 'trips': 'trip_id', - 'land_use': 'zone_id' + "households": "household_id", + "persons": "person_id", + "tours": "tour_id", + "joint_tour_participants": "participant_id", + "trips": "trip_id", + "land_use": "zone_id", } # unfortunately the two places this is needed (joint_tour_participation and estimation.infer @@ -31,9 +37,11 @@ def enumerate_tour_types(tour_flavors): # tour_flavors: {'eat': 1, 'business': 2, 'maint': 1} # channels: ['eat1', 'business1', 'business2', 'maint1'] - channels = [tour_type + str(tour_num) - for tour_type, max_count in tour_flavors.items() - for tour_num in range(1, max_count + 1)] + channels = [ + tour_type + str(tour_num) + for tour_type, max_count in tour_flavors.items() + for tour_num in range(1, max_count + 1) + ] return channels @@ -53,35 +61,49 @@ def canonical_tours(): # - non_mandatory_channels MAX_EXTENSION = 2 - non_mandatory_tour_flavors = {'escort': 2 + MAX_EXTENSION, - 'shopping': 1 + MAX_EXTENSION, - 'othmaint': 1 + MAX_EXTENSION, - 'othdiscr': 1 + MAX_EXTENSION, - 'eatout': 1 + MAX_EXTENSION, - 'social': 1 + MAX_EXTENSION} + non_mandatory_tour_flavors = { + "escort": 2 + MAX_EXTENSION, + "shopping": 1 + MAX_EXTENSION, + "othmaint": 1 + MAX_EXTENSION, + "othdiscr": 1 + MAX_EXTENSION, + "eatout": 1 + MAX_EXTENSION, + "social": 1 + MAX_EXTENSION, + } non_mandatory_channels = enumerate_tour_types(non_mandatory_tour_flavors) # - mandatory_channels - mandatory_tour_flavors = {'work': 2, 'school': 2} + mandatory_tour_flavors = {"work": 2, "school": 2} mandatory_channels = enumerate_tour_types(mandatory_tour_flavors) # - atwork_subtour_channels # we need to distinguish between subtours of different work tours # (e.g. eat1_1 is eat subtour for parent work tour 1 and eat1_2 is for work tour 2) - atwork_subtour_flavors = {'eat': 1, 'business': 2, 'maint': 1} + atwork_subtour_flavors = {"eat": 1, "business": 2, "maint": 1} atwork_subtour_channels = enumerate_tour_types(atwork_subtour_flavors) - max_work_tours = mandatory_tour_flavors['work'] - atwork_subtour_channels = ['%s_%s' % (c, i+1) - for c in atwork_subtour_channels - for i in range(max_work_tours)] + max_work_tours = mandatory_tour_flavors["work"] + atwork_subtour_channels = [ + "%s_%s" % (c, i + 1) + for c in atwork_subtour_channels + for i in range(max_work_tours) + ] # - joint_tour_channels - joint_tour_flavors = {'shopping': 2, 'othmaint': 2, 'othdiscr': 2, 'eatout': 2, 'social': 2} + joint_tour_flavors = { + "shopping": 2, + "othmaint": 2, + "othdiscr": 2, + "eatout": 2, + "social": 2, + } joint_tour_channels = enumerate_tour_types(joint_tour_flavors) - joint_tour_channels = ['j_%s' % c for c in joint_tour_channels] + joint_tour_channels = ["j_%s" % c for c in joint_tour_channels] - sub_channels = \ - non_mandatory_channels + mandatory_channels + atwork_subtour_channels + joint_tour_channels + sub_channels = ( + non_mandatory_channels + + mandatory_channels + + atwork_subtour_channels + + joint_tour_channels + ) sub_channels.sort() @@ -106,36 +128,37 @@ def set_tour_index(tours, parent_tour_num_col=None, is_joint=False): Tours dataframe to reindex. """ - tour_num_col = 'tour_type_num' + tour_num_col = "tour_type_num" possible_tours = canonical_tours() possible_tours_count = len(possible_tours) assert tour_num_col in tours.columns # create string tour_id corresonding to keys in possible_tours (e.g. 'work1', 'j_shopping2') - tours['tour_id'] = tours.tour_type + tours[tour_num_col].map(str) + tours["tour_id"] = tours.tour_type + tours[tour_num_col].map(str) if parent_tour_num_col: # we need to distinguish between subtours of different work tours # (e.g. eat1_1 is eat subtour for parent work tour 1 and eat1_2 is for work tour 2) parent_tour_num = tours[parent_tour_num_col] - if parent_tour_num.dtype != 'int64': + if parent_tour_num.dtype != "int64": # might get converted to float if non-subtours rows are None (but we try to avoid this) - logger.error('parent_tour_num.dtype: %s' % parent_tour_num.dtype) + logger.error("parent_tour_num.dtype: %s" % parent_tour_num.dtype) parent_tour_num = parent_tour_num.astype(np.int64) - tours['tour_id'] = tours['tour_id'] + '_' + parent_tour_num.map(str) + tours["tour_id"] = tours["tour_id"] + "_" + parent_tour_num.map(str) if is_joint: - tours['tour_id'] = 'j_' + tours['tour_id'] + tours["tour_id"] = "j_" + tours["tour_id"] # map recognized strings to ints - tours.tour_id = tours.tour_id.replace(to_replace=possible_tours, - value=list(range(possible_tours_count))) + tours.tour_id = tours.tour_id.replace( + to_replace=possible_tours, value=list(range(possible_tours_count)) + ) # convert to numeric - shouldn't be any NaNs - this will raise error if there are - tours.tour_id = pd.to_numeric(tours.tour_id, errors='raise').astype(np.int64) + tours.tour_id = pd.to_numeric(tours.tour_id, errors="raise").astype(np.int64) tours.tour_id = (tours.person_id * possible_tours_count) + tours.tour_id @@ -144,7 +167,7 @@ def set_tour_index(tours, parent_tour_num_col=None, is_joint=False): # print(tours[tours.tour_id.duplicated(keep=False)][['survey_tour_id', 'tour_type', 'tour_category']]) assert not tours.tour_id.duplicated().any() - tours.set_index('tour_id', inplace=True, verify_integrity=True) + tours.set_index("tour_id", inplace=True, verify_integrity=True) # we modify tours in place, but return the dataframe for the convenience of the caller return tours @@ -156,8 +179,8 @@ def set_trip_index(trips): # canonical_trip_num: 1st trip out = 1, 2nd trip out = 2, 1st in = 5, etc. canonical_trip_num = (~trips.outbound * MAX_TRIPS_PER_LEG) + trips.trip_num - trips['trip_id'] = trips.tour_id * (2 * MAX_TRIPS_PER_LEG) + canonical_trip_num - trips.set_index('trip_id', inplace=True, verify_integrity=True) + trips["trip_id"] = trips.tour_id * (2 * MAX_TRIPS_PER_LEG) + canonical_trip_num + trips.set_index("trip_id", inplace=True, verify_integrity=True) # we modify trips in place, but return the dataframe for the convenience of the caller return trips diff --git a/activitysim/abm/models/util/cdap.py b/activitysim/abm/models/util/cdap.py index fdc34961ae..7e0f551860 100644 --- a/activitysim/abm/models/util/cdap.py +++ b/activitysim/abm/models/util/cdap.py @@ -1,33 +1,26 @@ # ActivitySim # See full license in LICENSE.txt. -import logging import itertools +import logging import os import numpy as np import pandas as pd -from activitysim.core import simulate -from activitysim.core import pipeline - -from activitysim.core import chunk -from activitysim.core import logit -from activitysim.core import tracing -from activitysim.core import inject -from activitysim.core import config +from activitysim.core import chunk, config, inject, logit, pipeline, simulate, tracing logger = logging.getLogger(__name__) # FIXME - this allows us to turn some dev debug table dump code on and off - eventually remove? # DUMP = False -_persons_index_ = 'person_id' -_hh_index_ = 'household_id' -_hh_size_ = 'hhsize' +_persons_index_ = "person_id" +_hh_index_ = "household_id" +_hh_size_ = "hhsize" -_hh_id_ = 'household_id' -_ptype_ = 'ptype' -_age_ = 'age' +_hh_id_ = "household_id" +_ptype_ = "ptype" +_age_ = "age" # For clarity, the named constant MAX_HHSIZE refers to the cdap 5 person threshold figure. MAX_HHSIZE = 5 @@ -50,9 +43,9 @@ def add_pn(col, pnum): e.g. M_p1, ptype_p2 but leave _hh_id_ column unchanged """ if type(col) is str: - return col if col == _hh_id_ else '%s_p%s' % (col, pnum) + return col if col == _hh_id_ else "%s_p%s" % (col, pnum) elif isinstance(col, (list, tuple)): - return [c if c == _hh_id_ else '%s_p%s' % (c, pnum) for c in col] + return [c if c == _hh_id_ else "%s_p%s" % (c, pnum) for c in col] else: raise RuntimeError("add_pn col not list or str") @@ -101,24 +94,32 @@ def assign_cdap_rank(persons, person_type_map, trace_hh_id=None, trace_label=Non RANK_CHILD = 2 RANK_BACKFILL = 3 RANK_UNASSIGNED = 9 - persons['cdap_rank'] = RANK_UNASSIGNED + persons["cdap_rank"] = RANK_UNASSIGNED # choose up to 2 workers, preferring full over part, older over younger - workers = \ - persons.loc[persons[_ptype_].isin(person_type_map['WORKER']), [_hh_id_, _ptype_]]\ - .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True])\ - .groupby(_hh_id_).head(2) + workers = ( + persons.loc[ + persons[_ptype_].isin(person_type_map["WORKER"]), [_hh_id_, _ptype_] + ] + .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True]) + .groupby(_hh_id_) + .head(2) + ) # tag the selected workers - persons.loc[workers.index, 'cdap_rank'] = RANK_WORKER + persons.loc[workers.index, "cdap_rank"] = RANK_WORKER del workers # choose up to 3, preferring youngest - children = \ - persons.loc[persons[_ptype_].isin(person_type_map['CHILD']), [_hh_id_, _ptype_, _age_]]\ - .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True])\ - .groupby(_hh_id_).head(3) + children = ( + persons.loc[ + persons[_ptype_].isin(person_type_map["CHILD"]), [_hh_id_, _ptype_, _age_] + ] + .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True]) + .groupby(_hh_id_) + .head(3) + ) # tag the selected children - persons.loc[children.index, 'cdap_rank'] = RANK_CHILD + persons.loc[children.index, "cdap_rank"] = RANK_CHILD del children # choose up to MAX_HHSIZE, preferring anyone already chosen @@ -128,44 +129,45 @@ def assign_cdap_rank(persons, person_type_map, trace_hh_id=None, trace_label=Non # .groupby(_hh_id_).head(MAX_HHSIZE) # choose up to MAX_HHSIZE, choosing randomly - others = persons[[_hh_id_, 'cdap_rank']].copy() - others['random_order'] = pipeline.get_rn_generator().random_for_df(persons) - others = \ - others\ - .sort_values(by=[_hh_id_, 'random_order'], ascending=[True, True])\ - .groupby(_hh_id_).head(MAX_HHSIZE) + others = persons[[_hh_id_, "cdap_rank"]].copy() + others["random_order"] = pipeline.get_rn_generator().random_for_df(persons) + others = ( + others.sort_values(by=[_hh_id_, "random_order"], ascending=[True, True]) + .groupby(_hh_id_) + .head(MAX_HHSIZE) + ) # tag the backfilled persons - persons.loc[others[others.cdap_rank == RANK_UNASSIGNED].index, 'cdap_rank'] \ - = RANK_BACKFILL + persons.loc[ + others[others.cdap_rank == RANK_UNASSIGNED].index, "cdap_rank" + ] = RANK_BACKFILL del others # assign person number in cdapPersonArray preference order # i.e. convert cdap_rank from category to index in order of category rank within household # groupby rank() is slow, so we compute rank artisanally # save time by sorting only the columns we need (persons is big, and sort moves data) - p = persons[[_hh_id_, 'cdap_rank', _age_]]\ - .sort_values(by=[_hh_id_, 'cdap_rank', _age_], ascending=[True, True, True]) + p = persons[[_hh_id_, "cdap_rank", _age_]].sort_values( + by=[_hh_id_, "cdap_rank", _age_], ascending=[True, True, True] + ) rank = p.groupby(_hh_id_).size().map(range) - rank = [item+1 for sublist in rank for item in sublist] - p['cdap_rank'] = rank - persons['cdap_rank'] = p['cdap_rank'] # assignment aligns on index values + rank = [item + 1 for sublist in rank for item in sublist] + p["cdap_rank"] = rank + persons["cdap_rank"] = p["cdap_rank"] # assignment aligns on index values # if DUMP: # tracing.trace_df(persons, '%s.DUMP.cdap_person_array' % trace_label, # transpose=False, slicer='NONE') if trace_hh_id: - tracing.trace_df(persons, '%s.cdap_rank' % trace_label) + tracing.trace_df(persons, "%s.cdap_rank" % trace_label) - return persons['cdap_rank'] + return persons["cdap_rank"] def individual_utilities( - persons, - cdap_indiv_spec, - locals_d, - trace_hh_id=None, trace_label=None): + persons, cdap_indiv_spec, locals_d, trace_hh_id=None, trace_label=None +): """ Calculate CDAP utilities for all individuals. @@ -185,15 +187,20 @@ def individual_utilities( """ # calculate single person utilities - indiv_utils = simulate.eval_utilities(cdap_indiv_spec, persons, locals_d, trace_label=trace_label) + indiv_utils = simulate.eval_utilities( + cdap_indiv_spec, persons, locals_d, trace_label=trace_label + ) # add columns from persons to facilitate building household interactions - useful_columns = [_hh_id_, _ptype_, 'cdap_rank', _hh_size_] + useful_columns = [_hh_id_, _ptype_, "cdap_rank", _hh_size_] indiv_utils[useful_columns] = persons[useful_columns] if trace_hh_id: - tracing.trace_df(indiv_utils, '%s.indiv_utils' % trace_label, - column_labels=['activity', 'person']) + tracing.trace_df( + indiv_utils, + "%s.indiv_utils" % trace_label, + column_labels=["activity", "person"], + ) return indiv_utils @@ -225,25 +232,31 @@ def preprocess_interaction_coefficients(interaction_coefficients): # make a copy coefficients = interaction_coefficients.copy() - if not coefficients['activity'].isin(['M', 'N', 'H']).all(): - msg = "Error in cdap_interaction_coefficients at row %s. Expect only M, N, or H!" \ - % coefficients[~coefficients['activity'].isin(['M', 'N', 'H'])].index.values + if not coefficients["activity"].isin(["M", "N", "H"]).all(): + msg = ( + "Error in cdap_interaction_coefficients at row %s. Expect only M, N, or H!" + % coefficients[~coefficients["activity"].isin(["M", "N", "H"])].index.values + ) raise RuntimeError(msg) - coefficients['cardinality'] = coefficients['interaction_ptypes'].astype(str).str.len() + coefficients["cardinality"] = ( + coefficients["interaction_ptypes"].astype(str).str.len() + ) - wildcards = coefficients.interaction_ptypes == coefficients.cardinality.map(lambda x: x*'*') - coefficients.loc[wildcards, 'interaction_ptypes'] = '' + wildcards = coefficients.interaction_ptypes == coefficients.cardinality.map( + lambda x: x * "*" + ) + coefficients.loc[wildcards, "interaction_ptypes"] = "" - coefficients['slug'] = \ - coefficients['activity'] * coefficients['cardinality'] \ - + coefficients['interaction_ptypes'].astype(str) + coefficients["slug"] = coefficients["activity"] * coefficients[ + "cardinality" + ] + coefficients["interaction_ptypes"].astype(str) return coefficients def cached_spec_name(hhsize): - return 'cdap_spec_%s' % hhsize + return "cdap_spec_%s" % hhsize def get_cached_spec(hhsize): @@ -274,8 +287,9 @@ def cache_spec(hhsize, spec): inject.add_injectable(spec_name, spec) -def build_cdap_spec(interaction_coefficients, hhsize, - trace_spec=False, trace_label=None, cache=True): +def build_cdap_spec( + interaction_coefficients, hhsize, trace_spec=False, trace_label=None, cache=True +): """ Build a spec file for computing utilities of alternative household member interaction patterns for households of specified size. @@ -339,7 +353,7 @@ def build_cdap_spec(interaction_coefficients, hhsize, # generate a list of activity pattern alternatives for this hhsize # e.g. ['HH', 'HM', 'HN', 'MH', 'MM', 'MN', 'NH', 'NM', 'NN'] for hhsize=2 - alternatives = [''.join(tup) for tup in itertools.product('HMN', repeat=hhsize)] + alternatives = ["".join(tup) for tup in itertools.product("HMN", repeat=hhsize)] # spec df has expression column plus a column for each alternative spec = pd.DataFrame(columns=[expression_name] + alternatives) @@ -353,15 +367,17 @@ def build_cdap_spec(interaction_coefficients, hhsize, # Expression MM MN MH NM NN NH HM HN HH # M_p1 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 # N_p1 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0 - for pnum in range(1, hhsize+1): - for activity in ['M', 'N', 'H']: + for pnum in range(1, hhsize + 1): + for activity in ["M", "N", "H"]: new_row_index = len(spec) spec.loc[new_row_index, expression_name] = add_pn(activity, pnum) # list of alternative columns where person pnum has expression activity # e.g. for M_p1 we want the columns where activity M is in position p1 - alternative_columns = [alt for alt in alternatives if alt[pnum - 1] == activity] + alternative_columns = [ + alt for alt in alternatives if alt[pnum - 1] == activity + ] spec.loc[new_row_index, alternative_columns] = 1 # ignore rows whose cardinality exceeds hhsize @@ -383,24 +399,26 @@ def build_cdap_spec(interaction_coefficients, hhsize, # FIXME - should we be doing this for greater than HH_MAXSIZE households? if row.slug in alternatives: - spec.loc[len(spec), [expression_name, row.slug]] = ['1', row.slug] + spec.loc[len(spec), [expression_name, row.slug]] = ["1", row.slug] continue if not (0 <= row.cardinality <= MAX_INTERACTION_CARDINALITY): - raise RuntimeError("Bad row cardinality %d for %s" % (row.cardinality, row.slug)) + raise RuntimeError( + "Bad row cardinality %d for %s" % (row.cardinality, row.slug) + ) # for all other interaction rules, we need to generate a row in the spec for each # possible combination of interacting persons # e.g. for (1, 2), (1,3), (2,3) for a coefficient with cardinality 2 in hhsize 3 - for tup in itertools.combinations(list(range(1, hhsize+1)), row.cardinality): + for tup in itertools.combinations(list(range(1, hhsize + 1)), row.cardinality): # determine the name of the chooser column with the ptypes for this interaction if row.cardinality == 1: interaction_column = "ptype_p%d" % tup[0] else: # column named (e.g.) p1_p3 for an interaction between p1 and p3 - interaction_column = '_'.join(['p%s' % pnum for pnum in tup]) + interaction_column = "_".join(["p%s" % pnum for pnum in tup]) # build expression that evaluates True iff the interaction is between specified ptypes # (e.g.) p1_p3==13 for an interaction between p1 and p3 of ptypes 1 and 3 (or 3 and1 ) @@ -410,13 +428,16 @@ def build_cdap_spec(interaction_coefficients, hhsize, # e.g. ['MMM', 'MMN', 'MMH'] for an interaction between p1 and p3 with activity 'M' # alternative_columns = \ # filter(lambda alt: all([alt[p - 1] == row.activity for p in tup]), alternatives) - alternative_columns = \ - [alt for alt in alternatives if all([alt[p - 1] == row.activity for p in tup])] + alternative_columns = [ + alt + for alt in alternatives + if all([alt[p - 1] == row.activity for p in tup]) + ] # a row for this interaction may already exist, # e.g. if there are rules for both HH13 and MM13, we don't need to add rows for both # since they are triggered by the same expressions (e.g. p1_p2==13, p1_p3=13,...) - existing_row_index = (spec[expression_name] == expression) + existing_row_index = spec[expression_name] == expression if (existing_row_index).any(): # if the rows exist, simply update the appropriate alternative columns in spec spec.loc[existing_row_index, alternative_columns] = row.slug @@ -433,18 +454,25 @@ def build_cdap_spec(interaction_coefficients, hhsize, simulate.uniquify_spec_index(spec) if trace_spec: - tracing.trace_df(spec, '%s.hhsize%d_spec' % (trace_label, hhsize), - transpose=False, slicer='NONE') + tracing.trace_df( + spec, + "%s.hhsize%d_spec" % (trace_label, hhsize), + transpose=False, + slicer="NONE", + ) # replace slug with coefficient - d = interaction_coefficients.set_index('slug')['coefficient'].to_dict() + d = interaction_coefficients.set_index("slug")["coefficient"].to_dict() for c in spec.columns: - spec[c] =\ - spec[c].map(lambda x: d.get(x, x or 0.0)).fillna(0) + spec[c] = spec[c].map(lambda x: d.get(x, x or 0.0)).fillna(0) if trace_spec: - tracing.trace_df(spec, '%s.hhsize%d_spec_patched' % (trace_label, hhsize), - transpose=False, slicer='NONE') + tracing.trace_df( + spec, + "%s.hhsize%d_spec_patched" % (trace_label, hhsize), + transpose=False, + slicer="NONE", + ) if cache: cache_spec(hhsize, spec) @@ -497,15 +525,19 @@ def add_interaction_column(choosers, p_tup): # I couldn't figure out a good way to do this in pandas, but we want to do something like: # choosers['p1_p3'] = choosers['ptype_p1'].astype(str) + choosers['ptype_p3'].astype(str) - dest_col = '_'.join(['p%s' % pnum for pnum in p_tup]) + dest_col = "_".join(["p%s" % pnum for pnum in p_tup]) # build a string concatenating the ptypes of the persons in the order they appear in p_tup - choosers[dest_col] = choosers[add_pn('ptype', p_tup[0])].astype(str) + choosers[dest_col] = choosers[add_pn("ptype", p_tup[0])].astype(str) for pnum in p_tup[1:]: - choosers[dest_col] = choosers[dest_col] + choosers[add_pn('ptype', pnum)].astype(str) + choosers[dest_col] = choosers[dest_col] + choosers[ + add_pn("ptype", pnum) + ].astype(str) # sort the list of ptypes so it is in increasing ptype order, then convert to int - choosers[dest_col] = choosers[dest_col].apply(lambda x: ''.join(sorted(x))).astype(int) + choosers[dest_col] = ( + choosers[dest_col].apply(lambda x: "".join(sorted(x))).astype(int) + ) def hh_choosers(indiv_utils, hhsize): @@ -544,27 +576,31 @@ def hh_choosers(indiv_utils, hhsize): """ # we want to merge the ptype and M, N, and H utilities for each individual in the household - merge_cols = [_hh_id_, _ptype_, 'M', 'N', 'H'] + merge_cols = [_hh_id_, _ptype_, "M", "N", "H"] if hhsize > MAX_HHSIZE: raise RuntimeError("hh_choosers hhsize > MAX_HHSIZE") if hhsize < MAX_HHSIZE: - include_households = (indiv_utils[_hh_size_] == hhsize) + include_households = indiv_utils[_hh_size_] == hhsize else: # we want to include larger households along with MAX_HHSIZE households - include_households = (indiv_utils[_hh_size_] >= MAX_HHSIZE) + include_households = indiv_utils[_hh_size_] >= MAX_HHSIZE # start with all the individuals with cdap_rank of 1 (thus there will be one row per household) - choosers = indiv_utils.loc[include_households & (indiv_utils['cdap_rank'] == 1), merge_cols] + choosers = indiv_utils.loc[ + include_households & (indiv_utils["cdap_rank"] == 1), merge_cols + ] # rename columns, adding pn suffix (e.g. ptype_p1, M_p1) to all columns except hh_id choosers.columns = add_pn(merge_cols, 1) # for each of the higher cdap_ranks - for pnum in range(2, hhsize+1): + for pnum in range(2, hhsize + 1): # df with merge columns for indiv with cdap_rank of pnum - rhs = indiv_utils.loc[include_households & (indiv_utils['cdap_rank'] == pnum), merge_cols] + rhs = indiv_utils.loc[ + include_households & (indiv_utils["cdap_rank"] == pnum), merge_cols + ] # rename columns, adding pn suffix (e.g. ptype_p1, M_p1) to all columns except hh_id rhs.columns = add_pn(merge_cols, pnum) @@ -575,20 +611,21 @@ def hh_choosers(indiv_utils, hhsize): set_hh_index(choosers) # coerce utilities to float (merge apparently makes column type objects) - for pnum in range(1, hhsize+1): - pn_cols = add_pn(['M', 'N', 'H'], pnum) + for pnum in range(1, hhsize + 1): + pn_cols = add_pn(["M", "N", "H"], pnum) choosers[pn_cols] = choosers[pn_cols].astype(float) # add interaction columns for all 2 and 3 person interactions - for i in range(2, min(hhsize, MAX_INTERACTION_CARDINALITY)+1): - for tup in itertools.combinations(list(range(1, hhsize+1)), i): + for i in range(2, min(hhsize, MAX_INTERACTION_CARDINALITY) + 1): + for tup in itertools.combinations(list(range(1, hhsize + 1)), i): add_interaction_column(choosers, tup) return choosers -def household_activity_choices(indiv_utils, interaction_coefficients, hhsize, - trace_hh_id=None, trace_label=None): +def household_activity_choices( + indiv_utils, interaction_coefficients, hhsize, trace_hh_id=None, trace_label=None +): """ Calculate household utilities for each activity pattern alternative for households of hhsize The resulting activity pattern for each household will be coded as a string of activity codes. @@ -620,21 +657,24 @@ def household_activity_choices(indiv_utils, interaction_coefficients, hhsize, # and the household utils are the same as the individual utils choosers = vars = None # extract the individual utilities for individuals from hhsize 1 households - utils = indiv_utils.loc[indiv_utils[_hh_size_] == 1, [_hh_id_, 'M', 'N', 'H']] + utils = indiv_utils.loc[indiv_utils[_hh_size_] == 1, [_hh_id_, "M", "N", "H"]] # index on household_id, not person_id set_hh_index(utils) else: choosers = hh_choosers(indiv_utils, hhsize=hhsize) - spec = build_cdap_spec(interaction_coefficients, hhsize, - trace_spec=(trace_hh_id in choosers.index), - trace_label=trace_label) + spec = build_cdap_spec( + interaction_coefficients, + hhsize, + trace_spec=(trace_hh_id in choosers.index), + trace_label=trace_label, + ) utils = simulate.eval_utilities(spec, choosers, trace_label=trace_label) if len(utils.index) == 0: - return pd.Series(dtype='float64') + return pd.Series(dtype="float64") probs = logit.utils_to_probs(utils, trace_label=trace_label) @@ -648,23 +688,35 @@ def household_activity_choices(indiv_utils, interaction_coefficients, hhsize, if trace_hh_id: if hhsize > 1: - tracing.trace_df(choosers, '%s.hhsize%d_choosers' % (trace_label, hhsize), - column_labels=['expression', 'person']) - - tracing.trace_df(utils, '%s.hhsize%d_utils' % (trace_label, hhsize), - column_labels=['expression', 'household']) - tracing.trace_df(probs, '%s.hhsize%d_probs' % (trace_label, hhsize), - column_labels=['expression', 'household']) - tracing.trace_df(choices, '%s.hhsize%d_activity_choices' % (trace_label, hhsize), - column_labels=['expression', 'household']) - tracing.trace_df(rands, '%s.hhsize%d_rands' % (trace_label, hhsize), - columns=[None, 'rand']) + tracing.trace_df( + choosers, + "%s.hhsize%d_choosers" % (trace_label, hhsize), + column_labels=["expression", "person"], + ) + + tracing.trace_df( + utils, + "%s.hhsize%d_utils" % (trace_label, hhsize), + column_labels=["expression", "household"], + ) + tracing.trace_df( + probs, + "%s.hhsize%d_probs" % (trace_label, hhsize), + column_labels=["expression", "household"], + ) + tracing.trace_df( + choices, + "%s.hhsize%d_activity_choices" % (trace_label, hhsize), + column_labels=["expression", "household"], + ) + tracing.trace_df( + rands, "%s.hhsize%d_rands" % (trace_label, hhsize), columns=[None, "rand"] + ) return choices -def unpack_cdap_indiv_activity_choices(persons, hh_choices, - trace_hh_id, trace_label): +def unpack_cdap_indiv_activity_choices(persons, hh_choices, trace_hh_id, trace_label): """ Unpack the household activity choice list into choices for each (non-extra) household member @@ -683,25 +735,27 @@ def unpack_cdap_indiv_activity_choices(persons, hh_choices, series contains one activity per individual hh member, indexed on _persons_index_ """ - cdap_indivs = persons['cdap_rank'] <= MAX_HHSIZE + cdap_indivs = persons["cdap_rank"] <= MAX_HHSIZE indiv_activity = pd.merge( - left=persons.loc[cdap_indivs, [_hh_id_, 'cdap_rank']], - right=hh_choices.to_frame(name='hh_choices'), + left=persons.loc[cdap_indivs, [_hh_id_, "cdap_rank"]], + right=hh_choices.to_frame(name="hh_choices"), left_on=_hh_id_, - right_index=True + right_index=True, ) # resulting dataframe has columns _hh_id_,'cdap_rank', hh_choices indexed on _persons_index_ - indiv_activity['cdap_activity'] = '' + indiv_activity["cdap_activity"] = "" # for each cdap_rank (1..5) for i in range(MAX_HHSIZE): - pnum_i = (indiv_activity['cdap_rank'] == i+1) - indiv_activity.loc[pnum_i, ['cdap_activity']] = indiv_activity[pnum_i]['hh_choices'].str[i] + pnum_i = indiv_activity["cdap_rank"] == i + 1 + indiv_activity.loc[pnum_i, ["cdap_activity"]] = indiv_activity[pnum_i][ + "hh_choices" + ].str[i] - cdap_indiv_activity_choices = indiv_activity['cdap_activity'] + cdap_indiv_activity_choices = indiv_activity["cdap_activity"] # if DUMP: # tracing.trace_df(cdap_indiv_activity_choices, @@ -711,8 +765,9 @@ def unpack_cdap_indiv_activity_choices(persons, hh_choices, return cdap_indiv_activity_choices -def extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d, - trace_hh_id, trace_label): +def extra_hh_member_choices( + persons, cdap_fixed_relative_proportions, locals_d, trace_hh_id, trace_label +): """ Generate the activity choices for the 'extra' household members who weren't handled by cdap @@ -742,16 +797,18 @@ def extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d, list of alternatives chosen for all extra members, indexed by _persons_index_ """ - trace_label = tracing.extend_trace_label(trace_label, 'extra_hh_member_choices') + trace_label = tracing.extend_trace_label(trace_label, "extra_hh_member_choices") # extra household members have cdap_ran > MAX_HHSIZE - choosers = persons[persons['cdap_rank'] > MAX_HHSIZE] + choosers = persons[persons["cdap_rank"] > MAX_HHSIZE] if len(choosers.index) == 0: - return pd.Series(dtype='float64') + return pd.Series(dtype="float64") # eval the expression file - values = simulate.eval_variables(cdap_fixed_relative_proportions.index, choosers, locals_d) + values = simulate.eval_variables( + cdap_fixed_relative_proportions.index, choosers, locals_d + ) # cdap_fixed_relative_proportions computes relative proportions by ptype, not utilities proportions = values.dot(cdap_fixed_relative_proportions) @@ -777,26 +834,40 @@ def extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d, # slicer='NONE') if trace_hh_id: - tracing.trace_df(proportions, '%s.extra_hh_member_choices_proportions' % trace_label, - column_labels=['expression', 'person']) - tracing.trace_df(probs, '%s.extra_hh_member_choices_probs' % trace_label, - column_labels=['expression', 'person']) - tracing.trace_df(choices, '%s.extra_hh_member_choices_choices' % trace_label, - column_labels=['expression', 'person']) - tracing.trace_df(rands, '%s.extra_hh_member_choices_rands' % trace_label, - columns=[None, 'rand']) + tracing.trace_df( + proportions, + "%s.extra_hh_member_choices_proportions" % trace_label, + column_labels=["expression", "person"], + ) + tracing.trace_df( + probs, + "%s.extra_hh_member_choices_probs" % trace_label, + column_labels=["expression", "person"], + ) + tracing.trace_df( + choices, + "%s.extra_hh_member_choices_choices" % trace_label, + column_labels=["expression", "person"], + ) + tracing.trace_df( + rands, + "%s.extra_hh_member_choices_rands" % trace_label, + columns=[None, "rand"], + ) return choices def _run_cdap( - persons, - person_type_map, - cdap_indiv_spec, - interaction_coefficients, - cdap_fixed_relative_proportions, - locals_d, - trace_hh_id, trace_label): + persons, + person_type_map, + cdap_indiv_spec, + interaction_coefficients, + cdap_fixed_relative_proportions, + locals_d, + trace_hh_id, + trace_label, +): """ Implements core run_cdap functionality on persons df (or chunked subset thereof) Aside from chunking of persons df, params are passed through from run_cdap unchanged @@ -813,53 +884,61 @@ def _run_cdap( # persons with cdap_rank 1..MAX_HHSIZE will be have their activities chose by CDAP model # extra household members, will have activities assigned by in fixed proportions assign_cdap_rank(persons, person_type_map, trace_hh_id, trace_label) - chunk.log_df(trace_label, 'persons', persons) + chunk.log_df(trace_label, "persons", persons) # Calculate CDAP utilities for each individual, ignoring interactions # ind_utils has index of 'person_id' and a column for each alternative # i.e. three columns 'M' (Mandatory), 'N' (NonMandatory), 'H' (Home) - indiv_utils = individual_utilities(persons[persons.cdap_rank <= MAX_HHSIZE], - cdap_indiv_spec, locals_d, - trace_hh_id, trace_label) - chunk.log_df(trace_label, 'indiv_utils', indiv_utils) + indiv_utils = individual_utilities( + persons[persons.cdap_rank <= MAX_HHSIZE], + cdap_indiv_spec, + locals_d, + trace_hh_id, + trace_label, + ) + chunk.log_df(trace_label, "indiv_utils", indiv_utils) # compute interaction utilities, probabilities, and hh activity pattern choices # for each size household separately in turn up to MAX_HHSIZE hh_choices_list = [] - for hhsize in range(1, MAX_HHSIZE+1): + for hhsize in range(1, MAX_HHSIZE + 1): choices = household_activity_choices( - indiv_utils, interaction_coefficients, hhsize=hhsize, - trace_hh_id=trace_hh_id, trace_label=trace_label) + indiv_utils, + interaction_coefficients, + hhsize=hhsize, + trace_hh_id=trace_hh_id, + trace_label=trace_label, + ) hh_choices_list.append(choices) del indiv_utils - chunk.log_df(trace_label, 'indiv_utils', None) + chunk.log_df(trace_label, "indiv_utils", None) # concat all the household choices into a single series indexed on _hh_index_ hh_activity_choices = pd.concat(hh_choices_list) - chunk.log_df(trace_label, 'hh_activity_choices', hh_activity_choices) + chunk.log_df(trace_label, "hh_activity_choices", hh_activity_choices) # unpack the household activity choice list into choices for each (non-extra) household member # resulting series contains one activity per individual hh member, indexed on _persons_index_ - cdap_person_choices \ - = unpack_cdap_indiv_activity_choices(persons, hh_activity_choices, - trace_hh_id, trace_label) + cdap_person_choices = unpack_cdap_indiv_activity_choices( + persons, hh_activity_choices, trace_hh_id, trace_label + ) # assign activities to extra household members (with cdap_rank > MAX_HHSIZE) # resulting series contains one activity per individual hh member, indexed on _persons_index_ - extra_person_choices \ - = extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d, - trace_hh_id, trace_label) + extra_person_choices = extra_hh_member_choices( + persons, cdap_fixed_relative_proportions, locals_d, trace_hh_id, trace_label + ) # concat cdap and extra persoin choices into a single series # this series will be the same length as the persons dataframe and be indexed on _persons_index_ person_choices = pd.concat([cdap_person_choices, extra_person_choices]) - persons['cdap_activity'] = person_choices - chunk.log_df(trace_label, 'persons', persons) + persons["cdap_activity"] = person_choices + chunk.log_df(trace_label, "persons", persons) # if DUMP: # tracing.trace_df(hh_activity_choices, '%s.DUMP.hh_activity_choices' % trace_label, @@ -867,22 +946,25 @@ def _run_cdap( # tracing.trace_df(cdap_results, '%s.DUMP.cdap_results' % trace_label, # transpose=False, slicer='NONE') - result = persons[['cdap_rank', 'cdap_activity']] + result = persons[["cdap_rank", "cdap_activity"]] del persons - chunk.log_df(trace_label, 'persons', None) + chunk.log_df(trace_label, "persons", None) return result def run_cdap( - persons, - person_type_map, - cdap_indiv_spec, - cdap_interaction_coefficients, - cdap_fixed_relative_proportions, - locals_d, - chunk_size=0, trace_hh_id=None, trace_label=None): + persons, + person_type_map, + cdap_indiv_spec, + cdap_interaction_coefficients, + cdap_fixed_relative_proportions, + locals_d, + chunk_size=0, + trace_hh_id=None, + trace_label=None, +): """ Choose individual activity patterns for persons. @@ -919,25 +1001,30 @@ def run_cdap( activity for that person expressed as 'M', 'N', 'H' """ - trace_label = tracing.extend_trace_label(trace_label, 'cdap') + trace_label = tracing.extend_trace_label(trace_label, "cdap") result_list = [] # segment by person type and pick the right spec for each person type - for i, persons_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers_by_chunk_id(persons, chunk_size, trace_label): - - cdap_results = \ - _run_cdap(persons_chunk, - person_type_map, - cdap_indiv_spec, - cdap_interaction_coefficients, - cdap_fixed_relative_proportions, - locals_d, - trace_hh_id, chunk_trace_label) + for ( + i, + persons_chunk, + chunk_trace_label, + ) in chunk.adaptive_chunked_choosers_by_chunk_id(persons, chunk_size, trace_label): + + cdap_results = _run_cdap( + persons_chunk, + person_type_map, + cdap_indiv_spec, + cdap_interaction_coefficients, + cdap_fixed_relative_proportions, + locals_d, + trace_hh_id, + chunk_trace_label, + ) result_list.append(cdap_results) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -947,10 +1034,12 @@ def run_cdap( if trace_hh_id: - tracing.trace_df(cdap_results, - label="cdap", - columns=['cdap_rank', 'cdap_activity'], - warn_if_empty=True) + tracing.trace_df( + cdap_results, + label="cdap", + columns=["cdap_rank", "cdap_activity"], + warn_if_empty=True, + ) # return choices column as series - return cdap_results['cdap_activity'] + return cdap_results["cdap_activity"] diff --git a/activitysim/abm/models/util/estimation.py b/activitysim/abm/models/util/estimation.py index 7cd98372b6..0bf7419f80 100644 --- a/activitysim/abm/models/util/estimation.py +++ b/activitysim/abm/models/util/estimation.py @@ -1,27 +1,23 @@ # ActivitySim # See full license in LICENSE.txt. +import logging import os import shutil -import logging - -import yaml - import pandas as pd +import yaml -from activitysim.core import config -from activitysim.core import simulate - -from activitysim.core.util import reindex from activitysim.abm.models.util import canonical_ids as cid +from activitysim.core import config, simulate +from activitysim.core.util import reindex -logger = logging.getLogger('estimation') +logger = logging.getLogger("estimation") -ESTIMATION_SETTINGS_FILE_NAME = 'estimation.yaml' +ESTIMATION_SETTINGS_FILE_NAME = "estimation.yaml" -def unlink_files(directory_path, file_types=('csv', 'yaml')): +def unlink_files(directory_path, file_types=("csv", "yaml")): for file_name in os.listdir(directory_path): if file_name.endswith(file_types): file_path = os.path.join(directory_path, file_name) @@ -34,7 +30,6 @@ def unlink_files(directory_path, file_types=('csv', 'yaml')): class Estimator(object): - def __init__(self, bundle_name, model_name, estimation_table_recipes): logger.info("Initialize Estimator for'%s'" % (model_name,)) @@ -51,19 +46,27 @@ def __init__(self, bundle_name, model_name, estimation_table_recipes): os.makedirs(output_dir) # make directory if needed # delete estimation files - unlink_files(self.output_directory(), file_types=('csv', 'yaml')) + unlink_files(self.output_directory(), file_types=("csv", "yaml")) if self.bundle_name != self.model_name: # kind of inelegant to always delete these, but ok as they are redundantly recreated for each sub model - unlink_files(self.output_directory(bundle_directory=True), file_types=('csv', 'yaml')) + unlink_files( + self.output_directory(bundle_directory=True), file_types=("csv", "yaml") + ) # FIXME - not required? # assert 'override_choices' in self.model_settings, \ # "override_choices not found for %s in %s." % (model_name, ESTIMATION_SETTINGS_FILE_NAME) - self.omnibus_tables = self.estimation_table_recipes['omnibus_tables'] - self.omnibus_tables_append_columns = self.estimation_table_recipes['omnibus_tables_append_columns'] + self.omnibus_tables = self.estimation_table_recipes["omnibus_tables"] + self.omnibus_tables_append_columns = self.estimation_table_recipes[ + "omnibus_tables_append_columns" + ] self.tables = {} - self.tables_to_cache = [table_name for tables in self.omnibus_tables.values() for table_name in tables] + self.tables_to_cache = [ + table_name + for tables in self.omnibus_tables.values() + for table_name in tables + ] self.alt_id_column_name = None self.chooser_id_column_name = None @@ -121,7 +124,9 @@ def output_directory(self, bundle_directory=False): assert self.estimating assert self.model_name is not None - dir = os.path.join(config.output_file_path('estimation_data_bundle'), self.bundle_name) + dir = os.path.join( + config.output_file_path("estimation_data_bundle"), self.bundle_name + ) if bundle_directory: # shouldn't be asking - probably confused @@ -152,7 +157,9 @@ def output_file_path(self, table_name, file_type=None, bundle_directory=False): return os.path.join(output_dir, file_name) - def write_table(self, df, table_name, index=True, append=True, bundle_directory=False): + def write_table( + self, df, table_name, index=True, append=True, bundle_directory=False + ): """ Parameters @@ -168,22 +175,29 @@ def write_table(self, df, table_name, index=True, append=True, bundle_directory= def cache_table(df, table_name, append): if table_name in self.tables and not append: - raise RuntimeError("cache_table %s append=False and table exists" % (table_name,)) + raise RuntimeError( + "cache_table %s append=False and table exists" % (table_name,) + ) if table_name in self.tables: self.tables[table_name] = pd.concat([self.tables[table_name], df]) else: self.tables[table_name] = df.copy() def write_table(df, table_name, index, append, bundle_directory): - if table_name.endswith('.csv'): + if table_name.endswith(".csv"): # pass through filename without adding model or bundle name prefix - file_path = os.path.join(self.output_directory(bundle_directory), table_name) + file_path = os.path.join( + self.output_directory(bundle_directory), table_name + ) else: - file_path = self.output_file_path(table_name, 'csv', bundle_directory) + file_path = self.output_file_path(table_name, "csv", bundle_directory) file_exists = os.path.isfile(file_path) if file_exists and not append: - raise RuntimeError("write_table %s append=False and file exists: %s" % (table_name, file_path)) - df.to_csv(file_path, mode='a', index=index, header=(not file_exists)) + raise RuntimeError( + "write_table %s append=False and file exists: %s" + % (table_name, file_path) + ) + df.to_csv(file_path, mode="a", index=index, header=(not file_exists)) assert self.estimating @@ -193,11 +207,11 @@ def write_table(df, table_name, index, append, bundle_directory): if cache: cache_table(df, table_name, append) - self.debug('write_table cache: %s' % table_name) + self.debug("write_table cache: %s" % table_name) if write: write_table(df, table_name, index, append, bundle_directory) - self.debug('write_table write: %s' % table_name) + self.debug("write_table write: %s" % table_name) def write_omnibus_table(self): @@ -206,41 +220,50 @@ def write_omnibus_table(self): for omnibus_table, table_names in self.omnibus_tables.items(): - self.debug("write_omnibus_table: %s table_names: %s" % (omnibus_table, table_names)) + self.debug( + "write_omnibus_table: %s table_names: %s" % (omnibus_table, table_names) + ) for t in table_names: if t not in self.tables: - self.warning("write_omnibus_table: %s table '%s' not found" % (omnibus_table, t)) + self.warning( + "write_omnibus_table: %s table '%s' not found" + % (omnibus_table, t) + ) # ignore any tables not in cache table_names = [t for t in table_names if t in self.tables] - concat_axis = 1 if omnibus_table in self.omnibus_tables_append_columns else 0 + concat_axis = ( + 1 if omnibus_table in self.omnibus_tables_append_columns else 0 + ) df = pd.concat([self.tables[t] for t in table_names], axis=concat_axis) - df.sort_index(ascending=True, inplace=True, kind='mergesort') + df.sort_index(ascending=True, inplace=True, kind="mergesort") - file_path = self.output_file_path(omnibus_table, 'csv') + file_path = self.output_file_path(omnibus_table, "csv") assert not os.path.isfile(file_path) - df.to_csv(file_path, mode='a', index=True, header=True) + df.to_csv(file_path, mode="a", index=True, header=True) - self.debug('write_omnibus_choosers: %s' % file_path) + self.debug("write_omnibus_choosers: %s" % file_path) def write_dict(self, d, dict_name, bundle_directory): assert self.estimating - file_path = self.output_file_path(dict_name, 'yaml', bundle_directory) + file_path = self.output_file_path(dict_name, "yaml", bundle_directory) # we don't know how to concat, and afraid to overwrite assert not os.path.isfile(file_path) - with open(file_path, 'w') as f: + with open(file_path, "w") as f: # write ordered dict as array yaml.dump(d, f) self.debug("estimate.write_dict: %s" % file_path) - def write_coefficients(self, coefficients_df=None, model_settings=None, file_name=None): + def write_coefficients( + self, coefficients_df=None, model_settings=None, file_name=None + ): """ Because the whole point of estimation is to generate new coefficient values we want to make it easy to put the coefficients file back in configs @@ -249,7 +272,7 @@ def write_coefficients(self, coefficients_df=None, model_settings=None, file_nam if model_settings is not None: assert file_name is None - file_name = model_settings['COEFFICIENTS'] + file_name = model_settings["COEFFICIENTS"] assert file_name is not None @@ -266,60 +289,74 @@ def write_coefficients_template(self, model_settings): assert self.estimating coefficients_df = simulate.read_model_coefficient_template(model_settings) - tag = 'coefficients_template' + tag = "coefficients_template" self.write_table(coefficients_df, tag, append=False) def write_choosers(self, choosers_df): - self.write_table(choosers_df, 'choosers', append=True) + self.write_table(choosers_df, "choosers", append=True) def write_choices(self, choices): if isinstance(choices, pd.Series): - choices = choices.to_frame(name='model_choice') - assert(list(choices.columns) == ['model_choice']) - self.write_table(choices, 'choices', append=True) + choices = choices.to_frame(name="model_choice") + assert list(choices.columns) == ["model_choice"] + self.write_table(choices, "choices", append=True) def write_override_choices(self, choices): if isinstance(choices, pd.Series): - choices = choices.to_frame(name='override_choice') - assert(list(choices.columns) == ['override_choice']) - self.write_table(choices, 'override_choices', append=True) + choices = choices.to_frame(name="override_choice") + assert list(choices.columns) == ["override_choice"] + self.write_table(choices, "override_choices", append=True) def write_constants(self, constants): - self.write_dict(self, constants, 'model_constants') + self.write_dict(self, constants, "model_constants") def write_nest_spec(self, nest_spec): - self.write_dict(self, nest_spec, 'nest_spec') + self.write_dict(self, nest_spec, "nest_spec") - def copy_model_settings(self, settings_file_name, tag='model_settings', bundle_directory=False): + def copy_model_settings( + self, settings_file_name, tag="model_settings", bundle_directory=False + ): input_path = config.base_settings_file_path(settings_file_name) - output_path = self.output_file_path(tag, 'yaml', bundle_directory) + output_path = self.output_file_path(tag, "yaml", bundle_directory) shutil.copy(input_path, output_path) - def write_model_settings(self, model_settings, settings_file_name, bundle_directory=False): + def write_model_settings( + self, model_settings, settings_file_name, bundle_directory=False + ): - if 'include_settings' in model_settings: - file_path = self.output_file_path('model_settings', 'yaml', bundle_directory) + if "include_settings" in model_settings: + file_path = self.output_file_path( + "model_settings", "yaml", bundle_directory + ) assert not os.path.isfile(file_path) - with open(file_path, 'w') as f: + with open(file_path, "w") as f: yaml.dump(model_settings, f) else: - self.copy_model_settings(settings_file_name, bundle_directory=bundle_directory) - if 'inherit_settings' in model_settings: - self.write_dict(model_settings, 'inherited_model_settings', bundle_directory) + self.copy_model_settings( + settings_file_name, bundle_directory=bundle_directory + ) + if "inherit_settings" in model_settings: + self.write_dict( + model_settings, "inherited_model_settings", bundle_directory + ) def melt_alternatives(self, df): alt_id_name = self.alt_id_column_name - assert alt_id_name is not None, \ - "alt_id not set. Did you forget to call set_alt_id()? (%s)" % self.model_name - assert alt_id_name in df, \ - "alt_id_column_name '%s' not in alternatives table (%s)" % (alt_id_name, self.model_name) + assert alt_id_name is not None, ( + "alt_id not set. Did you forget to call set_alt_id()? (%s)" + % self.model_name + ) + assert alt_id_name in df, ( + "alt_id_column_name '%s' not in alternatives table (%s)" + % (alt_id_name, self.model_name) + ) - variable_column = 'variable' + variable_column = "variable" # alt_dest util_dist_0_1 util_dist_1_2 ... # person_id ... @@ -337,16 +374,20 @@ def melt_alternatives(self, df): assert chooser_name in df # mergesort is the only stable sort, and we want the expressions to appear in original df column order - melt_df = pd.melt(df, id_vars=[chooser_name, alt_id_name]) \ - .sort_values(by=chooser_name, kind='mergesort') \ - .rename(columns={'variable': variable_column}) + melt_df = ( + pd.melt(df, id_vars=[chooser_name, alt_id_name]) + .sort_values(by=chooser_name, kind="mergesort") + .rename(columns={"variable": variable_column}) + ) # person_id,alt_dest,expression,value # 31153,1,util_dist_0_1,1.0 # 31153,2,util_dist_0_1,1.0 # 31153,3,util_dist_0_1,1.0 - melt_df = melt_df.set_index([chooser_name, variable_column, alt_id_name]).unstack(2) + melt_df = melt_df.set_index( + [chooser_name, variable_column, alt_id_name] + ).unstack(2) melt_df.columns = melt_df.columns.droplevel(0) melt_df = melt_df.reset_index(1) @@ -359,21 +400,30 @@ def melt_alternatives(self, df): def write_interaction_expression_values(self, df): df = self.melt_alternatives(df) - self.write_table(df, 'interaction_expression_values', append=True) + self.write_table(df, "interaction_expression_values", append=True) def write_expression_values(self, df): - self.write_table(df, 'expression_values', append=True) + self.write_table(df, "expression_values", append=True) def write_alternatives(self, alternatives_df, bundle_directory=False): - self.write_table(alternatives_df, 'alternatives', append=True, bundle_directory=bundle_directory) + self.write_table( + alternatives_df, + "alternatives", + append=True, + bundle_directory=bundle_directory, + ) def write_interaction_sample_alternatives(self, alternatives_df): alternatives_df = self.melt_alternatives(alternatives_df) - self.write_table(alternatives_df, 'interaction_sample_alternatives', append=True) + self.write_table( + alternatives_df, "interaction_sample_alternatives", append=True + ) def write_interaction_simulate_alternatives(self, interaction_df): interaction_df = self.melt_alternatives(interaction_df) - self.write_table(interaction_df, 'interaction_simulate_alternatives', append=True) + self.write_table( + interaction_df, "interaction_simulate_alternatives", append=True + ) def get_survey_values(self, model_values, table_name, column_names): # convenience method so deep callers don't need to import estimation @@ -385,7 +435,9 @@ def get_survey_table(self, table_name): assert self.estimating return manager.get_survey_table(table_name) - def write_spec(self, model_settings=None, file_name=None, tag='SPEC', bundle_directory=False): + def write_spec( + self, model_settings=None, file_name=None, tag="SPEC", bundle_directory=False + ): if model_settings is not None: assert file_name is None @@ -394,13 +446,12 @@ def write_spec(self, model_settings=None, file_name=None, tag='SPEC', bundle_dir input_path = config.config_file_path(file_name) table_name = tag # more readable than full spec file_name - output_path = self.output_file_path(table_name, 'csv', bundle_directory) + output_path = self.output_file_path(table_name, "csv", bundle_directory) shutil.copy(input_path, output_path) self.debug("estimate.write_spec: %s" % output_path) class EstimationManager(object): - def __init__(self): self.settings_initialized = False @@ -417,30 +468,38 @@ def initialize_settings(self): assert not self.settings_initialized settings = config.read_model_settings(ESTIMATION_SETTINGS_FILE_NAME) - self.enabled = settings.get('enable', 'True') - self.bundles = settings.get('bundles', []) + self.enabled = settings.get("enable", "True") + self.bundles = settings.get("bundles", []) - self.model_estimation_table_types = settings.get('model_estimation_table_types', {}) - self.estimation_table_recipes = settings.get('estimation_table_recipes', {}) + self.model_estimation_table_types = settings.get( + "model_estimation_table_types", {} + ) + self.estimation_table_recipes = settings.get("estimation_table_recipes", {}) if self.enabled: - self.survey_tables = settings.get('survey_tables', {}) + self.survey_tables = settings.get("survey_tables", {}) for table_name, table_info in self.survey_tables.items(): - assert 'file_name' in table_info, \ - "No file name specified for survey_table '%s' in %s" % (table_name, ESTIMATION_SETTINGS_FILE_NAME) - file_path = config.data_file_path(table_info['file_name'], mandatory=True) - assert os.path.exists(file_path), \ - "File for survey table '%s' not found: %s" % (table_name, file_path) + assert "file_name" in table_info, ( + "No file name specified for survey_table '%s' in %s" + % (table_name, ESTIMATION_SETTINGS_FILE_NAME) + ) + file_path = config.data_file_path( + table_info["file_name"], mandatory=True + ) + assert os.path.exists( + file_path + ), "File for survey table '%s' not found: %s" % (table_name, file_path) df = pd.read_csv(file_path) - index_col = table_info.get('index_col') + index_col = table_info.get("index_col") if index_col is not None: - assert index_col in df.columns, \ - "Index col '%s' not in survey_table '%s' in file: %s % (index_col, table_name, file_path)" + assert ( + index_col in df.columns + ), "Index col '%s' not in survey_table '%s' in file: %s % (index_col, table_name, file_path)" df.set_index(index_col, inplace=True) # add the table df to survey_tables - table_info['df'] = df + table_info["df"] = df self.settings_initialized = True @@ -467,25 +526,35 @@ def begin_estimation(self, model_name, bundle_name=None): bundle_name = bundle_name or model_name if bundle_name not in self.bundles: - logger.warning(f"estimation bundle {bundle_name} not in settings file {ESTIMATION_SETTINGS_FILE_NAME}") + logger.warning( + f"estimation bundle {bundle_name} not in settings file {ESTIMATION_SETTINGS_FILE_NAME}" + ) return None # can't estimate the same model simultaneously - assert model_name not in self.estimating, \ - "Cant begin estimating %s - already estimating that model." % (model_name, ) + assert ( + model_name not in self.estimating + ), "Cant begin estimating %s - already estimating that model." % (model_name,) - assert bundle_name in self.model_estimation_table_types, \ - "No estimation_table_type for %s in %s." % (bundle_name, ESTIMATION_SETTINGS_FILE_NAME) + assert bundle_name in self.model_estimation_table_types, ( + "No estimation_table_type for %s in %s." + % (bundle_name, ESTIMATION_SETTINGS_FILE_NAME) + ) model_estimation_table_type = self.model_estimation_table_types[bundle_name] - assert model_estimation_table_type in self.estimation_table_recipes, \ - "model_estimation_table_type '%s' for model %s no in %s." % \ - (model_estimation_table_type, model_name, ESTIMATION_SETTINGS_FILE_NAME) + assert model_estimation_table_type in self.estimation_table_recipes, ( + "model_estimation_table_type '%s' for model %s no in %s." + % (model_estimation_table_type, model_name, ESTIMATION_SETTINGS_FILE_NAME) + ) - self.estimating[model_name] = \ - Estimator(bundle_name, model_name, - estimation_table_recipes=self.estimation_table_recipes[model_estimation_table_type]) + self.estimating[model_name] = Estimator( + bundle_name, + model_name, + estimation_table_recipes=self.estimation_table_recipes[ + model_estimation_table_type + ], + ) return self.estimating[model_name] @@ -496,22 +565,31 @@ def release(self, estimator): def get_survey_table(self, table_name): assert self.enabled if table_name not in self.survey_tables: - logger.warning("EstimationManager. get_survey_table: survey table '%s' not in survey_tables" % table_name) - df = self.survey_tables[table_name].get('df') + logger.warning( + "EstimationManager. get_survey_table: survey table '%s' not in survey_tables" + % table_name + ) + df = self.survey_tables[table_name].get("df") return df def get_survey_values(self, model_values, table_name, column_names): - assert isinstance(model_values, (pd.Series, pd.DataFrame, pd.Index)), \ - "get_survey_values model_values has unrecognized type %s" % type(model_values) + assert isinstance( + model_values, (pd.Series, pd.DataFrame, pd.Index) + ), "get_survey_values model_values has unrecognized type %s" % type( + model_values + ) - dest_index = model_values if isinstance(model_values, (pd.Index)) else model_values.index + dest_index = ( + model_values if isinstance(model_values, (pd.Index)) else model_values.index + ) # read override_df table survey_df = manager.get_survey_table(table_name) - assert survey_df is not None, \ - "get_survey_values: table '%s' not found" % (table_name,) + assert survey_df is not None, "get_survey_values: table '%s' not found" % ( + table_name, + ) column_name = column_names if isinstance(column_names, str) else None if column_name: @@ -519,47 +597,67 @@ def get_survey_values(self, model_values, table_name, column_names): if not set(column_names).issubset(set(survey_df.columns)): missing_columns = list(set(column_names) - set(survey_df.columns)) - logger.error("missing columns (%s) in survey table %s" % (missing_columns, table_name)) - print("survey table columns: %s" % (survey_df.columns, )) - raise RuntimeError("missing columns (%s) in survey table %s" % (missing_columns, table_name)) - - assert set(column_names).issubset(set(survey_df.columns)), \ - f"missing columns ({list(set(column_names) - set(survey_df.columns))}) " \ + logger.error( + "missing columns (%s) in survey table %s" + % (missing_columns, table_name) + ) + print("survey table columns: %s" % (survey_df.columns,)) + raise RuntimeError( + "missing columns (%s) in survey table %s" + % (missing_columns, table_name) + ) + + assert set(column_names).issubset(set(survey_df.columns)), ( + f"missing columns ({list(set(column_names) - set(survey_df.columns))}) " f"in survey table {table_name} {list(survey_df.columns)}" + ) # for now tour_id is asim_tour_id in survey_df asim_df_index_name = dest_index.name if asim_df_index_name == survey_df.index.name: # survey table has same index as activitysim - survey_df_index_column = 'index' + survey_df_index_column = "index" elif asim_df_index_name in survey_df.columns: # survey table has activitysim index as column survey_df_index_column = asim_df_index_name - elif 'asim_%s' % asim_df_index_name in survey_df.columns: + elif "asim_%s" % asim_df_index_name in survey_df.columns: # survey table has activitysim index as column with asim_ prefix - survey_df_index_column = 'asim_%s' % asim_df_index_name + survey_df_index_column = "asim_%s" % asim_df_index_name else: - logger.error("get_survey_values:index '%s' not in survey table" % dest_index.name) + logger.error( + "get_survey_values:index '%s' not in survey table" % dest_index.name + ) # raise RuntimeError("index '%s' not in survey table %s" % (dest_index.name, table_name) survey_df_index_column = None - logger.debug("get_survey_values: reindexing using %s.%s" % (table_name, survey_df_index_column)) + logger.debug( + "get_survey_values: reindexing using %s.%s" + % (table_name, survey_df_index_column) + ) values = pd.DataFrame(index=dest_index) for c in column_names: - if survey_df_index_column == 'index': + if survey_df_index_column == "index": survey_values = survey_df[c] else: - survey_values = pd.Series(survey_df[c].values, index=survey_df[survey_df_index_column]) + survey_values = pd.Series( + survey_df[c].values, index=survey_df[survey_df_index_column] + ) survey_values = reindex(survey_values, dest_index) # shouldn't be any choices we can't override missing_values = survey_values.isna() if missing_values.any(): - logger.error("missing survey_values for %s\n%s" % (c, dest_index[missing_values])) - logger.error("couldn't get_survey_values for %s in %s\n" % (c, table_name)) - raise RuntimeError("couldn't get_survey_values for %s in %s\n" % (c, table_name)) + logger.error( + "missing survey_values for %s\n%s" % (c, dest_index[missing_values]) + ) + logger.error( + "couldn't get_survey_values for %s in %s\n" % (c, table_name) + ) + raise RuntimeError( + "couldn't get_survey_values for %s in %s\n" % (c, table_name) + ) values[c] = survey_values diff --git a/activitysim/abm/models/util/logsums.py b/activitysim/abm/models/util/logsums.py index 1441d73be3..eded14320b 100644 --- a/activitysim/abm/models/util/logsums.py +++ b/activitysim/abm/models/util/logsums.py @@ -2,12 +2,7 @@ # See full license in LICENSE.txt. import logging -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import los -from activitysim.core import expressions - +from activitysim.core import config, expressions, los, simulate, tracing from activitysim.core.pathbuilder import TransitVirtualPathBuilder logger = logging.getLogger(__name__) @@ -15,14 +10,19 @@ def filter_chooser_columns(choosers, logsum_settings, model_settings): - chooser_columns = logsum_settings.get('LOGSUM_CHOOSER_COLUMNS', []) + chooser_columns = logsum_settings.get("LOGSUM_CHOOSER_COLUMNS", []) - if 'CHOOSER_ORIG_COL_NAME' in model_settings and model_settings['CHOOSER_ORIG_COL_NAME'] not in chooser_columns: - chooser_columns.append(model_settings['CHOOSER_ORIG_COL_NAME']) + if ( + "CHOOSER_ORIG_COL_NAME" in model_settings + and model_settings["CHOOSER_ORIG_COL_NAME"] not in chooser_columns + ): + chooser_columns.append(model_settings["CHOOSER_ORIG_COL_NAME"]) missing_columns = [c for c in chooser_columns if c not in choosers] if missing_columns: - logger.debug("logsum.filter_chooser_columns missing_columns %s" % missing_columns) + logger.debug( + "logsum.filter_chooser_columns missing_columns %s" % missing_columns + ) # ignore any columns not appearing in choosers df chooser_columns = [c for c in chooser_columns if c in choosers] @@ -31,13 +31,16 @@ def filter_chooser_columns(choosers, logsum_settings, model_settings): return choosers -def compute_logsums(choosers, - tour_purpose, - logsum_settings, model_settings, - network_los, - chunk_size, - chunk_tag, - trace_label): +def compute_logsums( + choosers, + tour_purpose, + logsum_settings, + model_settings, + network_los, + chunk_size, + chunk_tag, + trace_label, +): """ Parameters @@ -57,22 +60,26 @@ def compute_logsums(choosers, computed logsums with same index as choosers """ - trace_label = tracing.extend_trace_label(trace_label, 'compute_logsums') + trace_label = tracing.extend_trace_label(trace_label, "compute_logsums") logger.debug("Running compute_logsums with %d choosers" % choosers.shape[0]) # compute_logsums needs to know name of dest column in interaction_sample - orig_col_name = model_settings['CHOOSER_ORIG_COL_NAME'] - dest_col_name = model_settings['ALT_DEST_COL_NAME'] + orig_col_name = model_settings["CHOOSER_ORIG_COL_NAME"] + dest_col_name = model_settings["ALT_DEST_COL_NAME"] # FIXME - are we ok with altering choosers (so caller doesn't have to set these)? - assert ('in_period' not in choosers) and ('out_period' not in choosers) - choosers['in_period'] = network_los.skim_time_period_label(model_settings['IN_PERIOD']) - choosers['out_period'] = network_los.skim_time_period_label(model_settings['OUT_PERIOD']) - - assert ('duration' not in choosers) - choosers['duration'] = model_settings['IN_PERIOD'] - model_settings['OUT_PERIOD'] - - logsum_spec = simulate.read_model_spec(file_name=logsum_settings['SPEC']) + assert ("in_period" not in choosers) and ("out_period" not in choosers) + choosers["in_period"] = network_los.skim_time_period_label( + model_settings["IN_PERIOD"] + ) + choosers["out_period"] = network_los.skim_time_period_label( + model_settings["OUT_PERIOD"] + ) + + assert "duration" not in choosers + choosers["duration"] = model_settings["IN_PERIOD"] - model_settings["OUT_PERIOD"] + + logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) coefficients = simulate.get_segment_coefficients(logsum_settings, tour_purpose) logsum_spec = simulate.eval_coefficients(logsum_spec, coefficients, estimator=None) @@ -89,14 +96,18 @@ def compute_logsums(choosers, # setup skim keys skim_dict = network_los.get_default_skim_dict() - odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name, - dim3_key='out_period') - dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name, - dim3_key='in_period') - odr_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name, - dim3_key='in_period') - dor_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name, - dim3_key='out_period') + odt_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col_name, dest_key=dest_col_name, dim3_key="out_period" + ) + dot_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col_name, dest_key=orig_col_name, dim3_key="in_period" + ) + odr_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col_name, dest_key=dest_col_name, dim3_key="in_period" + ) + dor_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col_name, dest_key=orig_col_name, dim3_key="out_period" + ) od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name) skims = { @@ -105,34 +116,45 @@ def compute_logsums(choosers, "odr_skims": odr_skim_stack_wrapper, "dor_skims": dor_skim_stack_wrapper, "od_skims": od_skim_stack_wrapper, - 'orig_col_name': orig_col_name, - 'dest_col_name': dest_col_name + "orig_col_name": orig_col_name, + "dest_col_name": dest_col_name, } if network_los.zone_system == los.THREE_ZONE: # fixme - is this a lightweight object? tvpb = network_los.tvpb - tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col_name, dest_key=dest_col_name, - tod_key='out_period', segment_key='demographic_segment', - trace_label=trace_label, tag='tvpb_logsum_odt') - tvpb_logsum_dot = tvpb.wrap_logsum(orig_key=dest_col_name, dest_key=orig_col_name, - tod_key='in_period', segment_key='demographic_segment', - trace_label=trace_label, tag='tvpb_logsum_dot') - - skims.update({ - 'tvpb_logsum_odt': tvpb_logsum_odt, - 'tvpb_logsum_dot': tvpb_logsum_dot - }) + tvpb_logsum_odt = tvpb.wrap_logsum( + orig_key=orig_col_name, + dest_key=dest_col_name, + tod_key="out_period", + segment_key="demographic_segment", + trace_label=trace_label, + tag="tvpb_logsum_odt", + ) + tvpb_logsum_dot = tvpb.wrap_logsum( + orig_key=dest_col_name, + dest_key=orig_col_name, + tod_key="in_period", + segment_key="demographic_segment", + trace_label=trace_label, + tag="tvpb_logsum_dot", + ) + + skims.update( + {"tvpb_logsum_odt": tvpb_logsum_odt, "tvpb_logsum_dot": tvpb_logsum_dot} + ) # TVPB constants can appear in expressions - locals_dict.update(network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS')) + locals_dict.update( + network_los.setting("TVPB_SETTINGS.tour_mode_choice.CONSTANTS") + ) locals_dict.update(skims) # - run preprocessor to annotate choosers # allow specification of alternate preprocessor for nontour choosers - preprocessor = model_settings.get('LOGSUM_PREPROCESSOR', 'preprocessor') + preprocessor = model_settings.get("LOGSUM_PREPROCESSOR", "preprocessor") preprocessor_settings = logsum_settings[preprocessor] if preprocessor_settings: @@ -143,7 +165,8 @@ def compute_logsums(choosers, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) logsums = simulate.simple_simulate_logsums( choosers, @@ -153,6 +176,7 @@ def compute_logsums(choosers, locals_d=locals_dict, chunk_size=chunk_size, chunk_tag=chunk_tag, - trace_label=trace_label) + trace_label=trace_label, + ) return logsums diff --git a/activitysim/abm/models/util/mode.py b/activitysim/abm/models/util/mode.py index 542cf528d4..07f54bee4e 100644 --- a/activitysim/abm/models/util/mode.py +++ b/activitysim/abm/models/util/mode.py @@ -2,10 +2,7 @@ # See full license in LICENSE.txt. import pandas as pd -from activitysim.core import simulate -from activitysim.core import config -from activitysim.core import expressions -from activitysim.core import tracing +from activitysim.core import config, expressions, simulate, tracing """ At this time, these utilities are mostly for transforming the mode choice @@ -15,14 +12,19 @@ def mode_choice_simulate( - choosers, spec, nest_spec, skims, locals_d, - chunk_size, - mode_column_name, - logsum_column_name, - trace_label, - trace_choice_name, - trace_column_names=None, - estimator=None): + choosers, + spec, + nest_spec, + skims, + locals_d, + chunk_size, + mode_column_name, + logsum_column_name, + trace_label, + trace_choice_name, + trace_column_names=None, + estimator=None, +): """ common method for both tour_mode_choice and trip_mode_choice @@ -57,34 +59,39 @@ def mode_choice_simulate( trace_label=trace_label, trace_choice_name=trace_choice_name, estimator=estimator, - trace_column_names=trace_column_names) + trace_column_names=trace_column_names, + ) # for consistency, always return dataframe, whether or not logsums were requested if isinstance(choices, pd.Series): - choices = choices.to_frame('choice') + choices = choices.to_frame("choice") - choices.rename(columns={'logsum': logsum_column_name, - 'choice': mode_column_name}, - inplace=True) + choices.rename( + columns={"logsum": logsum_column_name, "choice": mode_column_name}, inplace=True + ) alts = spec.columns - choices[mode_column_name] = \ - choices[mode_column_name].map(dict(list(zip(list(range(len(alts))), alts)))) + choices[mode_column_name] = choices[mode_column_name].map( + dict(list(zip(list(range(len(alts))), alts))) + ) return choices def run_tour_mode_choice_simulate( - choosers, - tour_purpose, model_settings, - mode_column_name, - logsum_column_name, - network_los, - skims, - constants, - estimator, - chunk_size, - trace_label=None, trace_choice_name=None): + choosers, + tour_purpose, + model_settings, + mode_column_name, + logsum_column_name, + network_los, + skims, + constants, + estimator, + chunk_size, + trace_label=None, + trace_choice_name=None, +): """ This is a utility to run a mode choice model for each segment (usually segments are tour/trip purposes). Pass in the tours/trip that need a mode, @@ -92,7 +99,7 @@ def run_tour_mode_choice_simulate( you want to use in the evaluation of variables. """ - spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients = simulate.get_segment_coefficients(model_settings, tour_purpose) spec = simulate.eval_coefficients(spec, coefficients, estimator) @@ -107,18 +114,18 @@ def run_tour_mode_choice_simulate( # coefficients can appear in expressions locals_dict.update(coefficients) - assert ('in_period' not in choosers) and ('out_period' not in choosers) - in_time = skims['in_time_col_name'] - out_time = skims['out_time_col_name'] - choosers['in_period'] = network_los.skim_time_period_label(choosers[in_time]) - choosers['out_period'] = network_los.skim_time_period_label(choosers[out_time]) + assert ("in_period" not in choosers) and ("out_period" not in choosers) + in_time = skims["in_time_col_name"] + out_time = skims["out_time_col_name"] + choosers["in_period"] = network_los.skim_time_period_label(choosers[in_time]) + choosers["out_period"] = network_los.skim_time_period_label(choosers[out_time]) expressions.annotate_preprocessors( - choosers, locals_dict, skims, - model_settings, trace_label) + choosers, locals_dict, skims, model_settings, trace_label + ) trace_column_names = choosers.index.name - assert trace_column_names == 'tour_id' + assert trace_column_names == "tour_id" if trace_column_names not in choosers: choosers[trace_column_names] = choosers.index @@ -138,6 +145,7 @@ def run_tour_mode_choice_simulate( trace_label=trace_label, trace_choice_name=trace_choice_name, trace_column_names=trace_column_names, - estimator=estimator) + estimator=estimator, + ) return choices diff --git a/activitysim/abm/models/util/overlap.py b/activitysim/abm/models/util/overlap.py index 433dc3a0bb..70fadfbd43 100644 --- a/activitysim/abm/models/util/overlap.py +++ b/activitysim/abm/models/util/overlap.py @@ -2,12 +2,10 @@ # See full license in LICENSE.txt. import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import tracing -from activitysim.core import inject - +from activitysim.core import inject, tracing logger = logging.getLogger(__name__) @@ -122,11 +120,11 @@ def p2p_time_window_overlap(p1_ids, p2_ids): row_ids = row_ids[target_rows] run_length = run_length[target_rows] - df = pd.DataFrame({'row_ids': row_ids, 'run_length': run_length}) + df = pd.DataFrame({"row_ids": row_ids, "run_length": run_length}) # groupby index of row_ids match the numpy row indexes of timetable.pairwise_available ndarray # but there may be missing values of any no-overlap persons pairs - max_overlap = df.groupby('row_ids').run_length.max() + max_overlap = df.groupby("row_ids").run_length.max() # fill in any missing values to align with input arrays input_row_ids = np.arange(len(p1_ids)) max_overlap = max_overlap.reindex(input_row_ids).fillna(0) @@ -139,23 +137,28 @@ def p2p_time_window_overlap(p1_ids, p2_ids): def person_pairs(persons): - p = persons[['household_id', 'adult']].reset_index() - p2p = pd.merge(p, p, left_on='household_id', right_on='household_id', how='outer') + p = persons[["household_id", "adult"]].reset_index() + p2p = pd.merge(p, p, left_on="household_id", right_on="household_id", how="outer") # we desire well known non-contingent column names - p2p.rename(columns={ - '%s_x' % persons.index.name: 'person1', - '%s_y' % persons.index.name: 'person2', - }, inplace=True) + p2p.rename( + columns={ + "%s_x" % persons.index.name: "person1", + "%s_y" % persons.index.name: "person2", + }, + inplace=True, + ) p2p = p2p[p2p.person1 < p2p.person2] # index is meaningless, but might as well be tidy p2p.reset_index(drop=True, inplace=True) - p2p['p2p_type'] = (p2p.adult_x * 1 + p2p.adult_y * 1).map({0: 'cc', 1: 'ac', 2: 'aa'}) + p2p["p2p_type"] = (p2p.adult_x * 1 + p2p.adult_y * 1).map( + {0: "cc", 1: "ac", 2: "aa"} + ) - p2p = p2p[['household_id', 'person1', 'person2', 'p2p_type']] + p2p = p2p[["household_id", "person1", "person2", "p2p_type"]] return p2p @@ -164,16 +167,19 @@ def hh_time_window_overlap(households, persons): p2p = person_pairs(persons) - p2p['max_overlap'] = p2p_time_window_overlap(p2p.person1, p2p.person2) + p2p["max_overlap"] = p2p_time_window_overlap(p2p.person1, p2p.person2) - hh_overlap = \ - p2p.groupby(['household_id', 'p2p_type']).max_overlap.max().unstack(level=-1, fill_value=0) + hh_overlap = ( + p2p.groupby(["household_id", "p2p_type"]) + .max_overlap.max() + .unstack(level=-1, fill_value=0) + ) # fill in missing households (in case there were no overlaps) hh_overlap = hh_overlap.reindex(households.index).fillna(0).astype(np.int8) # make sure we have all p2p_types (if there were none to unstack, then column will be missing) - for c in ['aa', 'cc', 'ac']: + for c in ["aa", "cc", "ac"]: if c not in hh_overlap.columns: hh_overlap[c] = 0 @@ -184,18 +190,28 @@ def person_time_window_overlap(persons): p2p = person_pairs(persons) - p2p['max_overlap'] = p2p_time_window_overlap(p2p.person1, p2p.person2) - - p_overlap = pd.concat([ - p2p[['person1', 'p2p_type', 'max_overlap']].rename(columns={'person1': 'person_id'}), - p2p[['person2', 'p2p_type', 'max_overlap']].rename(columns={'person2': 'person_id'}) - ]).groupby(['person_id', 'p2p_type']).max_overlap.max() + p2p["max_overlap"] = p2p_time_window_overlap(p2p.person1, p2p.person2) + + p_overlap = ( + pd.concat( + [ + p2p[["person1", "p2p_type", "max_overlap"]].rename( + columns={"person1": "person_id"} + ), + p2p[["person2", "p2p_type", "max_overlap"]].rename( + columns={"person2": "person_id"} + ), + ] + ) + .groupby(["person_id", "p2p_type"]) + .max_overlap.max() + ) # unstack to create columns for each p2p_type (aa, cc, and ac) p_overlap = p_overlap.unstack(level=-1, fill_value=0) # make sure we have columns for all p2p_types (in case there were none of a p2ptype to unstack) - for c in ['aa', 'cc', 'ac']: + for c in ["aa", "cc", "ac"]: if c not in p_overlap.columns: p_overlap[c] = 0 @@ -221,11 +237,11 @@ def person_max_window(persons): row_ids = row_ids[target_rows] run_length = run_length[target_rows] - df = pd.DataFrame({'row_ids': row_ids, 'run_length': run_length}) + df = pd.DataFrame({"row_ids": row_ids, "run_length": run_length}) # groupby index of row_ids match the numpy row indexes of timetable.pairwise_available ndarray # but there may be missing values of any no-overlap persons pairs - max_overlap = df.groupby('row_ids').run_length.max() + max_overlap = df.groupby("row_ids").run_length.max() # fill in any missing values to align with input arrays input_row_ids = np.arange(persons.shape[0]) max_window = max_overlap.reindex(input_row_ids).fillna(0) diff --git a/activitysim/abm/models/util/test/test_cdap.py b/activitysim/abm/models/util/test/test_cdap.py index 9056264b14..0e4bd68392 100644 --- a/activitysim/abm/models/util/test/test_cdap.py +++ b/activitysim/abm/models/util/test/test_cdap.py @@ -2,30 +2,25 @@ # See full license in LICENSE.txt. import os.path -import yaml import pandas as pd import pandas.testing as pdt import pytest +import yaml -from .. import cdap +from activitysim.core import chunk, config, inject, simulate -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import config -from activitysim.core import chunk +from .. import cdap -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data_dir(): - return os.path.join(os.path.dirname(__file__), 'data') + return os.path.join(os.path.dirname(__file__), "data") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def people(data_dir): - return pd.read_csv( - os.path.join(data_dir, 'people.csv'), - index_col='id') + return pd.read_csv(os.path.join(data_dir, "people.csv"), index_col="id") def teardown_function(func): @@ -33,32 +28,34 @@ def teardown_function(func): inject.reinject_decorated_tables() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def model_settings(configs_dir): - yml_file = os.path.join(configs_dir, 'cdap.yaml') + yml_file = os.path.join(configs_dir, "cdap.yaml") with open(yml_file) as f: model_settings = yaml.load(f, Loader=yaml.loader.SafeLoader) return model_settings -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def configs_dir(): - return os.path.join(os.path.dirname(__file__), 'configs') + return os.path.join(os.path.dirname(__file__), "configs") def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), 'output') + output_dir = os.path.join(os.path.dirname(__file__), "output") inject.add_injectable("output_dir", output_dir) def test_bad_coefficients(): - coefficients = pd.read_csv(config.config_file_path('cdap_interaction_coefficients.csv'), comment='#') + coefficients = pd.read_csv( + config.config_file_path("cdap_interaction_coefficients.csv"), comment="#" + ) coefficients = cdap.preprocess_interaction_coefficients(coefficients) - coefficients.loc[2, 'activity'] = 'AA' + coefficients.loc[2, "activity"] = "AA" with pytest.raises(RuntimeError) as excinfo: coefficients = cdap.preprocess_interaction_coefficients(coefficients) @@ -67,75 +64,95 @@ def test_bad_coefficients(): def test_assign_cdap_rank(people, model_settings): - person_type_map = model_settings.get('PERSON_TYPE_MAP', {}) + person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log('test_assign_cdap_rank', base=True): + with chunk.chunk_log("test_assign_cdap_rank", base=True): cdap.assign_cdap_rank(people, person_type_map) expected = pd.Series( - [1, 1, 1, 2, 2, 1, 3, 1, 2, 1, 3, 2, 1, 3, 2, 4, 1, 3, 4, 2], - index=people.index + [1, 1, 1, 2, 2, 1, 3, 1, 2, 1, 3, 2, 1, 3, 2, 4, 1, 3, 4, 2], index=people.index ) - pdt.assert_series_equal(people['cdap_rank'], expected, check_dtype=False, check_names=False) + pdt.assert_series_equal( + people["cdap_rank"], expected, check_dtype=False, check_names=False + ) def test_individual_utilities(people, model_settings): - cdap_indiv_and_hhsize1 = simulate.read_model_spec(file_name='cdap_indiv_and_hhsize1.csv') + cdap_indiv_and_hhsize1 = simulate.read_model_spec( + file_name="cdap_indiv_and_hhsize1.csv" + ) - person_type_map = model_settings.get('PERSON_TYPE_MAP', {}) + person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log('test_individual_utilities', base=True): + with chunk.chunk_log("test_individual_utilities", base=True): cdap.assign_cdap_rank(people, person_type_map) - individual_utils = cdap.individual_utilities(people, cdap_indiv_and_hhsize1, locals_d=None) - - individual_utils = individual_utils[['M', 'N', 'H']] - - expected = pd.DataFrame([ - [2, 0, 0], # person 1 - [0, 0, 1], # person 2 - [3, 0, 0], # person 3 - [3, 0, 0], # person 4 - [0, 1, 0], # person 5 - [1, 0, 0], # person 6 - [1, 0, 0], # person 7 - [0, 2, 0], # person 8 - [0, 0, 1], # person 9 - [2, 0, 0], # person 10 - [0, 0, 3], # person 11 - [0, 0, 2], # person 12 - [3, 0, 0], # person 13 - [1, 0, 0], # person 14 - [0, 4, 0], # person 15 - [0, 4, 0], # person 16 - [0, 0, 4], # person 17 - [0, 0, 5], # person 18 - [50, 0, 4], # person 19 - [2, 0, 0] # person 20 - ], index=people.index, columns=cdap_indiv_and_hhsize1.columns) + individual_utils = cdap.individual_utilities( + people, cdap_indiv_and_hhsize1, locals_d=None + ) + + individual_utils = individual_utils[["M", "N", "H"]] + + expected = pd.DataFrame( + [ + [2, 0, 0], # person 1 + [0, 0, 1], # person 2 + [3, 0, 0], # person 3 + [3, 0, 0], # person 4 + [0, 1, 0], # person 5 + [1, 0, 0], # person 6 + [1, 0, 0], # person 7 + [0, 2, 0], # person 8 + [0, 0, 1], # person 9 + [2, 0, 0], # person 10 + [0, 0, 3], # person 11 + [0, 0, 2], # person 12 + [3, 0, 0], # person 13 + [1, 0, 0], # person 14 + [0, 4, 0], # person 15 + [0, 4, 0], # person 16 + [0, 0, 4], # person 17 + [0, 0, 5], # person 18 + [50, 0, 4], # person 19 + [2, 0, 0], # person 20 + ], + index=people.index, + columns=cdap_indiv_and_hhsize1.columns, + ) pdt.assert_frame_equal( - individual_utils, expected, check_dtype=False, check_names=False) + individual_utils, expected, check_dtype=False, check_names=False + ) def test_build_cdap_spec_hhsize2(people, model_settings): hhsize = 2 - cdap_indiv_and_hhsize1 = simulate.read_model_spec(file_name='cdap_indiv_and_hhsize1.csv') + cdap_indiv_and_hhsize1 = simulate.read_model_spec( + file_name="cdap_indiv_and_hhsize1.csv" + ) - interaction_coefficients = pd.read_csv(config.config_file_path('cdap_interaction_coefficients.csv'), comment='#') - interaction_coefficients = cdap.preprocess_interaction_coefficients(interaction_coefficients) + interaction_coefficients = pd.read_csv( + config.config_file_path("cdap_interaction_coefficients.csv"), comment="#" + ) + interaction_coefficients = cdap.preprocess_interaction_coefficients( + interaction_coefficients + ) - person_type_map = model_settings.get('PERSON_TYPE_MAP', {}) + person_type_map = model_settings.get("PERSON_TYPE_MAP", {}) - with chunk.chunk_log('test_build_cdap_spec_hhsize2', base=True): + with chunk.chunk_log("test_build_cdap_spec_hhsize2", base=True): cdap.assign_cdap_rank(people, person_type_map) - indiv_utils = cdap.individual_utilities(people, cdap_indiv_and_hhsize1, locals_d=None) + indiv_utils = cdap.individual_utilities( + people, cdap_indiv_and_hhsize1, locals_d=None + ) choosers = cdap.hh_choosers(indiv_utils, hhsize=hhsize) - spec = cdap.build_cdap_spec(interaction_coefficients, hhsize=hhsize, cache=False) + spec = cdap.build_cdap_spec( + interaction_coefficients, hhsize=hhsize, cache=False + ) # pandas.dot depends on column names of expression_values matching spec index values # expressions should have been uniquified when spec was read @@ -148,11 +165,13 @@ def test_build_cdap_spec_hhsize2(people, model_settings): utils = vars.dot(spec) - expected = pd.DataFrame([ - [0, 3, 0, 3, 7, 3, 0, 3, 0], # household 3 - [0, 0, 1, 1, 1, 2, 0, 0, 2], # household 4 + expected = pd.DataFrame( + [ + [0, 3, 0, 3, 7, 3, 0, 3, 0], # household 3 + [0, 0, 1, 1, 1, 2, 0, 0, 2], # household 4 ], index=[3, 4], - columns=['HH', 'HM', 'HN', 'MH', 'MM', 'MN', 'NH', 'NM', 'NN']).astype('float') + columns=["HH", "HM", "HN", "MH", "MM", "MN", "NH", "NM", "NN"], + ).astype("float") pdt.assert_frame_equal(utils, expected, check_names=False) diff --git a/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py b/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py index 74aba531bc..2739e1a401 100644 --- a/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py +++ b/activitysim/abm/models/util/test/test_mandatory_tour_frequency.py @@ -2,30 +2,40 @@ # See full license in LICENSE.txt. -import pytest import os + import pandas as pd import pandas.testing as pdt +import pytest + from ..tour_frequency import process_mandatory_tours def mandatory_tour_frequency_alternatives(): - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') - f = os.path.join(configs_dir, 'mandatory_tour_frequency_alternatives.csv') - df = pd.read_csv(f, comment='#') - df.set_index('alt', inplace=True) + configs_dir = os.path.join(os.path.dirname(__file__), "configs") + f = os.path.join(configs_dir, "mandatory_tour_frequency_alternatives.csv") + df = pd.read_csv(f, comment="#") + df.set_index("alt", inplace=True) return df def test_mtf(): - persons = pd.DataFrame({ - "is_worker": [True, True, False, False], - "mandatory_tour_frequency": ["work1", "work_and_school", "work_and_school", "school2"], - "school_zone_id": [1, 2, 3, 4], - "workplace_zone_id": [10, 20, 30, 40], - "home_zone_id": [100, 200, 300, 400], - "household_id": [1, 2, 2, 4] - }, index=[10, 20, 30, 40]) + persons = pd.DataFrame( + { + "is_worker": [True, True, False, False], + "mandatory_tour_frequency": [ + "work1", + "work_and_school", + "work_and_school", + "school2", + ], + "school_zone_id": [1, 2, 3, 4], + "workplace_zone_id": [10, 20, 30, 40], + "home_zone_id": [100, 200, 300, 400], + "household_id": [1, 2, 2, 4], + }, + index=[10, 20, 30, 40], + ) tour_frequency_alternatives = mandatory_tour_frequency_alternatives() @@ -35,22 +45,30 @@ def test_mtf(): pdt.assert_series_equal( mandatory_tours.person_id, - pd.Series([10, 20, 20, 30, 30, 40, 40], index=idx, name='person_id')) + pd.Series([10, 20, 20, 30, 30, 40, 40], index=idx, name="person_id"), + ) pdt.assert_series_equal( mandatory_tours.tour_type, - pd.Series(['work', 'work', 'school', 'work', 'school', 'school', 'school'], - index=idx, name='tour_type')) + pd.Series( + ["work", "work", "school", "work", "school", "school", "school"], + index=idx, + name="tour_type", + ), + ) # tour_nums for work_and_school non-worker should be flipped pdt.assert_series_equal( mandatory_tours.tour_num, - pd.Series([1, 1, 2, 2, 1, 1, 2], index=idx, name='tour_num')) + pd.Series([1, 1, 2, 2, 1, 1, 2], index=idx, name="tour_num"), + ) pdt.assert_series_equal( mandatory_tours.destination, - pd.Series([10, 20, 2, 30, 3, 4, 4], index=idx, name='destination')) + pd.Series([10, 20, 2, 30, 3, 4, 4], index=idx, name="destination"), + ) pdt.assert_series_equal( mandatory_tours.origin, - pd.Series([100, 200, 200, 300, 300, 400, 400], index=idx, name='origin')) + pd.Series([100, 200, 200, 300, 300, 400, 400], index=idx, name="origin"), + ) diff --git a/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py b/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py index a3574b6744..5ae895630f 100644 --- a/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/util/test/test_non_mandatory_tour_frequency.py @@ -2,10 +2,12 @@ # See full license in LICENSE.txt. -import pytest import os + import pandas as pd import pandas.testing as pdt +import pytest + from ..tour_frequency import process_non_mandatory_tours @@ -13,23 +15,21 @@ def test_nmtf(): persons = pd.DataFrame( { - 'non_mandatory_tour_frequency': [0, 3, 2, 1], - 'household_id': [1, 1, 2, 4], - 'home_zone_id': [100, 100, 200, 400] + "non_mandatory_tour_frequency": [0, 3, 2, 1], + "household_id": [1, 1, 2, 4], + "home_zone_id": [100, 100, 200, 400], }, - index=[0, 1, 2, 3] + index=[0, 1, 2, 3], ) non_mandatory_tour_frequency_alts = pd.DataFrame( - { - "escort": [0, 0, 2, 0], - "shopping": [1, 0, 0, 0], - "othmaint": [0, 1, 0, 0] - }, - index=[0, 1, 2, 3] + {"escort": [0, 0, 2, 0], "shopping": [1, 0, 0, 0], "othmaint": [0, 1, 0, 0]}, + index=[0, 1, 2, 3], ) - tour_counts = non_mandatory_tour_frequency_alts.loc[persons.non_mandatory_tour_frequency] + tour_counts = non_mandatory_tour_frequency_alts.loc[ + persons.non_mandatory_tour_frequency + ] tour_counts.index = persons.index # assign person ids to the index # - create the non_mandatory tours @@ -38,12 +38,12 @@ def test_nmtf(): idx = nmt.index pdt.assert_series_equal( - nmt.person_id, - pd.Series( - [0, 2, 2, 3], index=idx, name='person_id')) + nmt.person_id, pd.Series([0, 2, 2, 3], index=idx, name="person_id") + ) pdt.assert_series_equal( nmt.tour_type, pd.Series( - ["shopping", "escort", "escort", "othmaint"], - index=idx, name='tour_type')) + ["shopping", "escort", "escort", "othmaint"], index=idx, name="tour_type" + ), + ) diff --git a/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py b/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py index 3e9b61b89c..4e4325b056 100644 --- a/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/test/test_vectorize_tour_scheduling.py @@ -2,16 +2,18 @@ # See full license in LICENSE.txt. import os -import pytest -import pandas as pd -import numpy as np +import numpy as np +import pandas as pd import pandas.testing as pdt +import pytest from activitysim.core import inject -from ..vectorize_tour_scheduling import get_previous_tour_by_tourid, \ - vectorize_tour_scheduling +from ..vectorize_tour_scheduling import ( + get_previous_tour_by_tourid, + vectorize_tour_scheduling, +) def teardown_function(func): @@ -20,7 +22,7 @@ def teardown_function(func): def setup_function(): - output_dir = os.path.join(os.path.dirname(__file__), 'output') + output_dir = os.path.join(os.path.dirname(__file__), "output") inject.add_injectable("output_dir", output_dir) @@ -29,45 +31,41 @@ def test_vts(): inject.add_injectable("settings", {}) # note: need 0 duration tour on one end of day to guarantee at least one available tour - alts = pd.DataFrame({ - "start": [1, 1, 2, 3], - "end": [1, 4, 5, 6] - }) - alts['duration'] = alts.end - alts.start + alts = pd.DataFrame({"start": [1, 1, 2, 3], "end": [1, 4, 5, 6]}) + alts["duration"] = alts.end - alts.start inject.add_injectable("tdd_alts", alts) - current_tour_person_ids = pd.Series(['b', 'c'], - index=['d', 'e']) + current_tour_person_ids = pd.Series(["b", "c"], index=["d", "e"]) - previous_tour_by_personid = pd.Series([2, 2, 1], - index=['a', 'b', 'c']) + previous_tour_by_personid = pd.Series([2, 2, 1], index=["a", "b", "c"]) - prev_tour_attrs = get_previous_tour_by_tourid(current_tour_person_ids, - previous_tour_by_personid, - alts) + prev_tour_attrs = get_previous_tour_by_tourid( + current_tour_person_ids, previous_tour_by_personid, alts + ) pdt.assert_series_equal( prev_tour_attrs.start_previous, - pd.Series([2, 1], index=['d', 'e'], name='start_previous')) + pd.Series([2, 1], index=["d", "e"], name="start_previous"), + ) pdt.assert_series_equal( prev_tour_attrs.end_previous, - pd.Series([5, 4], index=['d', 'e'], name='end_previous')) + pd.Series([5, 4], index=["d", "e"], name="end_previous"), + ) - tours = pd.DataFrame({ - "person_id": [1, 1, 2, 3, 3], - "tour_num": [1, 2, 1, 1, 2], - "tour_type": ['x', 'x', 'x', 'x', 'x'] - }) + tours = pd.DataFrame( + { + "person_id": [1, 1, 2, 3, 3], + "tour_num": [1, 2, 1, 1, 2], + "tour_type": ["x", "x", "x", "x", "x"], + } + ) - persons = pd.DataFrame({ - "income": [20, 30, 25] - }, index=[1, 2, 3]) + persons = pd.DataFrame({"income": [20, 30, 25]}, index=[1, 2, 3]) - inject.add_table('persons', persons) + inject.add_table("persons", persons) - spec = pd.DataFrame({"Coefficient": [1.2]}, - index=["income"]) + spec = pd.DataFrame({"Coefficient": [1.2]}, index=["income"]) spec.index.name = "Expression" inject.add_injectable("check_for_variability", True) @@ -75,11 +73,16 @@ def test_vts(): timetable = inject.get_injectable("timetable") tdd_choices = vectorize_tour_scheduling( - tours, persons, alts, timetable, - tour_segments={'spec': spec}, + tours, + persons, + alts, + timetable, + tour_segments={"spec": spec}, tour_segment_col=None, model_settings={}, - chunk_size=0, trace_label='test_vts') + chunk_size=0, + trace_label="test_vts", + ) # FIXME - dead reckoning regression # there's no real logic here - this is just what came out of the monte carlo diff --git a/activitysim/abm/models/util/tour_destination.py b/activitysim/abm/models/util/tour_destination.py index 7482aeb6db..a39037b479 100644 --- a/activitysim/abm/models/util/tour_destination.py +++ b/activitysim/abm/models/util/tour_destination.py @@ -2,23 +2,16 @@ # See full license in LICENSE.txt. import logging -import pandas as pd import numpy as np +import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import los -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import pipeline - -from activitysim.core.util import reindex - -from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.abm.tables.size_terms import tour_destination_size_terms +from activitysim.core import config, inject, los, pipeline, simulate, tracing from activitysim.core.interaction_sample import interaction_sample +from activitysim.core.interaction_sample_simulate import interaction_sample_simulate +from activitysim.core.util import reindex from . import logsums as logsum -from activitysim.abm.tables.size_terms import tour_destination_size_terms logger = logging.getLogger(__name__) DUMP = False @@ -34,10 +27,11 @@ class SizeTermCalculator(object): def __init__(self, size_term_selector): # do this once so they can request size_terms for various segments (tour_type or purpose) - land_use = inject.get_table('land_use') - size_terms = inject.get_injectable('size_terms') - self.destination_size_terms = \ - tour_destination_size_terms(land_use, size_terms, size_term_selector) + land_use = inject.get_table("land_use") + size_terms = inject.get_injectable("size_terms") + self.destination_size_terms = tour_destination_size_terms( + land_use, size_terms, size_term_selector + ) assert not self.destination_size_terms.isna().any(axis=None) @@ -49,15 +43,19 @@ def dest_size_terms_df(self, segment_name, trace_label): # convenient if creating or merging with alts size_terms = self.destination_size_terms[[segment_name]].copy() - size_terms.columns = ['size_term'] + size_terms.columns = ["size_term"] # FIXME - no point in considering impossible alternatives (where dest size term is zero) - logger.debug(f"SizeTermCalculator dropping {(~(size_terms.size_term > 0)).sum()} " - f"of {len(size_terms)} rows where size_term is zero for {segment_name}") + logger.debug( + f"SizeTermCalculator dropping {(~(size_terms.size_term > 0)).sum()} " + f"of {len(size_terms)} rows where size_term is zero for {segment_name}" + ) size_terms = size_terms[size_terms.size_term > 0] if len(size_terms) == 0: - logger.warning(f"SizeTermCalculator: no zones with non-zero size terms for {segment_name} in {trace_label}") + logger.warning( + f"SizeTermCalculator: no zones with non-zero size terms for {segment_name} in {trace_label}" + ) return size_terms @@ -68,36 +66,44 @@ def dest_size_terms_df(self, segment_name, trace_label): def _destination_sample( - spec_segment_name, - choosers, - destination_size_terms, - skims, - estimator, + spec_segment_name, + choosers, + destination_size_terms, + skims, + estimator, + model_settings, + alt_dest_col_name, + chunk_size, + chunk_tag, + trace_label, +): + + model_spec = simulate.spec_for_segment( model_settings, - alt_dest_col_name, - chunk_size, - chunk_tag, - trace_label): - - model_spec = simulate.spec_for_segment(model_settings, spec_id='SAMPLE_SPEC', - segment_name=spec_segment_name, estimator=estimator) + spec_id="SAMPLE_SPEC", + segment_name=spec_segment_name, + estimator=estimator, + ) logger.info("running %s with %d tours", trace_label, len(choosers)) - sample_size = model_settings['SAMPLE_SIZE'] - if config.setting('disable_destination_sampling', False) or (estimator and estimator.want_unsampled_alternatives): + sample_size = model_settings["SAMPLE_SIZE"] + if config.setting("disable_destination_sampling", False) or ( + estimator and estimator.want_unsampled_alternatives + ): # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count - logger.info("Estimation mode for %s using unsampled alternatives short_circuit_choices" % (trace_label,)) + logger.info( + "Estimation mode for %s using unsampled alternatives short_circuit_choices" + % (trace_label,) + ) sample_size = 0 - locals_d = { - 'skims': skims - } + locals_d = {"skims": skims} constants = config.get_model_constants(model_settings) if constants is not None: locals_d.update(constants) - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_sample( choosers, @@ -110,39 +116,42 @@ def _destination_sample( locals_d=locals_d, chunk_size=chunk_size, chunk_tag=chunk_tag, - trace_label=trace_label) + trace_label=trace_label, + ) # remember person_id in chosen alts so we can merge with persons in subsequent steps # (broadcasts person_id onto all alternatives sharing the same tour_id index value) - choices['person_id'] = choosers.person_id + choices["person_id"] = choosers.person_id return choices def destination_sample( - spec_segment_name, - choosers, - model_settings, - network_los, - destination_size_terms, - estimator, - chunk_size, trace_label): - - chunk_tag = 'tour_destination.sample' + spec_segment_name, + choosers, + model_settings, + network_los, + destination_size_terms, + estimator, + chunk_size, + trace_label, +): + + chunk_tag = "tour_destination.sample" # create wrapper with keys for this lookup # the skims will be available under the name "skims" for any @ expressions - skim_origin_col_name = model_settings['CHOOSER_ORIG_COL_NAME'] + skim_origin_col_name = model_settings["CHOOSER_ORIG_COL_NAME"] skim_dest_col_name = destination_size_terms.index.name # (logit.interaction_dataset suffixes duplicate chooser column with '_chooser') - if (skim_origin_col_name == skim_dest_col_name): - skim_origin_col_name = f'{skim_origin_col_name}_chooser' + if skim_origin_col_name == skim_dest_col_name: + skim_origin_col_name = f"{skim_origin_col_name}_chooser" skim_dict = network_los.get_default_skim_dict() skims = skim_dict.wrap(skim_origin_col_name, skim_dest_col_name) # the name of the dest column to be returned in choices - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] choices = _destination_sample( spec_segment_name, @@ -152,20 +161,22 @@ def destination_sample( estimator, model_settings, alt_dest_col_name, - chunk_size, chunk_tag=chunk_tag, - trace_label=trace_label) + chunk_size, + chunk_tag=chunk_tag, + trace_label=trace_label, + ) return choices # temp column names for presampling -DEST_MAZ = 'dest_MAZ' -DEST_TAZ = 'dest_TAZ' -ORIG_TAZ = 'TAZ' # likewise a temp, but if already in choosers, we assume we can use it opportunistically +DEST_MAZ = "dest_MAZ" +DEST_TAZ = "dest_TAZ" +ORIG_TAZ = "TAZ" # likewise a temp, but if already in choosers, we assume we can use it opportunistically def map_maz_to_taz(s, network_los): - maz_to_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].set_index('MAZ').TAZ + maz_to_taz = network_los.maz_taz_df[["MAZ", "TAZ"]].set_index("MAZ").TAZ return s.map(maz_to_taz) @@ -180,9 +191,9 @@ def aggregate_size_terms(dest_size_terms, network_los): MAZ_size_terms[DEST_TAZ] = map_maz_to_taz(MAZ_size_terms.index, network_los) # aggregate to TAZ - TAZ_size_terms = MAZ_size_terms.groupby(DEST_TAZ).agg({'size_term': 'sum'}) + TAZ_size_terms = MAZ_size_terms.groupby(DEST_TAZ).agg({"size_term": "sum"}) TAZ_size_terms[DEST_TAZ] = TAZ_size_terms.index - assert not TAZ_size_terms['size_term'].isna().any() + assert not TAZ_size_terms["size_term"].isna().any() # size_term # dest_TAZ @@ -192,8 +203,10 @@ def aggregate_size_terms(dest_size_terms, network_los): # add crosswalk DEST_TAZ column to MAZ_size_terms # MAZ_size_terms = MAZ_size_terms.sort_values([DEST_TAZ, 'size_term']) # maybe helpful for debugging - MAZ_size_terms = MAZ_size_terms[[DEST_TAZ, 'size_term']].reset_index(drop=False) - MAZ_size_terms = MAZ_size_terms.sort_values([DEST_TAZ, 'zone_id']).reset_index(drop=True) + MAZ_size_terms = MAZ_size_terms[[DEST_TAZ, "size_term"]].reset_index(drop=False) + MAZ_size_terms = MAZ_size_terms.sort_values([DEST_TAZ, "zone_id"]).reset_index( + drop=True + ) # zone_id dest_TAZ size_term # 0 6097 2 10.0 @@ -231,21 +244,27 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): trace_hh_id = inject.get_injectable("trace_hh_id", None) have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample) if have_trace_targets: - trace_label = tracing.extend_trace_label(trace_label, 'choose_MAZ_for_TAZ') + trace_label = tracing.extend_trace_label(trace_label, "choose_MAZ_for_TAZ") - CHOOSER_ID = taz_sample.index.name # zone_id for tours, but person_id for location choice + CHOOSER_ID = ( + taz_sample.index.name + ) # zone_id for tours, but person_id for location choice assert CHOOSER_ID is not None # write taz choices, pick_counts, probs trace_targets = tracing.trace_targets(taz_sample) - tracing.trace_df(taz_sample[trace_targets], - label=tracing.extend_trace_label(trace_label, 'taz_sample'), - transpose=False) + tracing.trace_df( + taz_sample[trace_targets], + label=tracing.extend_trace_label(trace_label, "taz_sample"), + transpose=False, + ) # redupe taz_sample[[DEST_TAZ, 'prob']] using pick_count to repeat rows - taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False) - taz_choices = taz_choices.reindex(taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True) - taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'}) + taz_choices = taz_sample[[DEST_TAZ, "prob"]].reset_index(drop=False) + taz_choices = taz_choices.reindex( + taz_choices.index.repeat(taz_sample.pick_count) + ).reset_index(drop=True) + taz_choices = taz_choices.rename(columns={"prob": "TAZ_prob"}) # print(f"taz_choices\n{taz_choices}") # tour_id dest_TAZ TAZ_prob @@ -261,7 +280,9 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): # 2 24251 2 10.904 # just to make it clear we are siloing choices by chooser_id - chooser_id_col = taz_sample.index.name # should be canonical chooser index name (e.g. 'person_id') + chooser_id_col = ( + taz_sample.index.name + ) # should be canonical chooser index name (e.g. 'person_id') # for random_for_df, we need df with de-duplicated chooser canonical index chooser_df = pd.DataFrame(index=taz_sample.index[~taz_sample.index.duplicated()]) @@ -273,15 +294,21 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): taz_sample_size = taz_choices.groupby(chooser_id_col)[DEST_TAZ].count().max() # taz_choices index values should be contiguous - assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index, taz_sample_size)).all() + assert ( + taz_choices[chooser_id_col] == np.repeat(chooser_df.index, taz_sample_size) + ).all() # we need to choose a MAZ for each DEST_TAZ choice # probability of choosing MAZ based on MAZ size_term fraction of TAZ total # there will be a different set (and number) of candidate MAZs for each TAZ # (preserve index, which will have duplicates as result of join) # maz_sizes.index is the integer offset into taz_choices of the taz for which the maz_size row is a candidate) - maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(), - MAZ_size_terms, how='left', on=DEST_TAZ).set_index('index') + maz_sizes = pd.merge( + taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(), + MAZ_size_terms, + how="left", + on=DEST_TAZ, + ).set_index("index") # tour_id dest_TAZ zone_id size_term # index @@ -296,9 +323,11 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer=CHOOSER_ID) trace_maz_sizes = maz_sizes[maz_sizes_trace_targets] - tracing.trace_df(trace_maz_sizes, - label=tracing.extend_trace_label(trace_label, 'maz_sizes'), - transpose=False) + tracing.trace_df( + trace_maz_sizes, + label=tracing.extend_trace_label(trace_label, "maz_sizes"), + transpose=False, + ) # number of DEST_TAZ candidates per chooser maz_counts = maz_sizes.groupby(maz_sizes.index).size().values @@ -316,7 +345,9 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts) # insert zero filler to pad each alternative set to same size - padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0).reshape(-1, max_maz_count) + padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0).reshape( + -1, max_maz_count + ) # prob array with one row TAZ_choice, one column per alternative row_sums = padded_maz_sizes.sum(axis=1) @@ -336,50 +367,82 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): # shouldn't have chosen any of the dummy pad positions assert (positions < maz_counts).all() - taz_choices[DEST_MAZ] = maz_sizes['zone_id'].take(positions + first_row_offsets) - taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]), positions] - taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob'] + taz_choices[DEST_MAZ] = maz_sizes["zone_id"].take(positions + first_row_offsets) + taz_choices["MAZ_prob"] = maz_probs[np.arange(maz_probs.shape[0]), positions] + taz_choices["prob"] = taz_choices["TAZ_prob"] * taz_choices["MAZ_prob"] if have_trace_targets: - taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer=CHOOSER_ID) + taz_choices_trace_targets = tracing.trace_targets( + taz_choices, slicer=CHOOSER_ID + ) trace_taz_choices_df = taz_choices[taz_choices_trace_targets] - tracing.trace_df(trace_taz_choices_df, - label=tracing.extend_trace_label(trace_label, 'taz_choices'), - transpose=False) + tracing.trace_df( + trace_taz_choices_df, + label=tracing.extend_trace_label(trace_label, "taz_choices"), + transpose=False, + ) lhs_df = trace_taz_choices_df[[CHOOSER_ID, DEST_TAZ]] - alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)] + alt_dest_columns = [f"dest_maz_{c}" for c in range(max_maz_count)] # following the same logic as the full code, but for trace cutout trace_maz_counts = maz_counts[taz_choices_trace_targets] trace_last_row_offsets = maz_counts[taz_choices_trace_targets].cumsum() - trace_inserts = np.repeat(trace_last_row_offsets, max_maz_count - trace_maz_counts) + trace_inserts = np.repeat( + trace_last_row_offsets, max_maz_count - trace_maz_counts + ) # trace dest_maz_alts - padded_maz_sizes = np.insert(trace_maz_sizes[CHOOSER_ID].values, trace_inserts, 0.0).reshape(-1, max_maz_count) - df = pd.DataFrame(data=padded_maz_sizes, - columns=alt_dest_columns, index=trace_taz_choices_df.index) + padded_maz_sizes = np.insert( + trace_maz_sizes[CHOOSER_ID].values, trace_inserts, 0.0 + ).reshape(-1, max_maz_count) + df = pd.DataFrame( + data=padded_maz_sizes, + columns=alt_dest_columns, + index=trace_taz_choices_df.index, + ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_alts'), transpose=False) + tracing.trace_df( + df, + label=tracing.extend_trace_label(trace_label, "dest_maz_alts"), + transpose=False, + ) # trace dest_maz_size_terms - padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values, trace_inserts, 0.0).reshape(-1, max_maz_count) - df = pd.DataFrame(data=padded_maz_sizes, - columns=alt_dest_columns, index=trace_taz_choices_df.index) + padded_maz_sizes = np.insert( + trace_maz_sizes["size_term"].values, trace_inserts, 0.0 + ).reshape(-1, max_maz_count) + df = pd.DataFrame( + data=padded_maz_sizes, + columns=alt_dest_columns, + index=trace_taz_choices_df.index, + ) df = pd.concat([lhs_df, df], axis=1) - tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_size_terms'), transpose=False) + tracing.trace_df( + df, + label=tracing.extend_trace_label(trace_label, "dest_maz_size_terms"), + transpose=False, + ) # trace dest_maz_probs - df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets], - columns=alt_dest_columns, index=trace_taz_choices_df.index) + df = pd.DataFrame( + data=maz_probs[taz_choices_trace_targets], + columns=alt_dest_columns, + index=trace_taz_choices_df.index, + ) df = pd.concat([lhs_df, df], axis=1) - df['rand'] = rands[taz_choices_trace_targets] - tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_probs'), transpose=False) - - taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob']) - taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ]).agg(prob=('prob', 'max'), - pick_count=('prob', 'count')) + df["rand"] = rands[taz_choices_trace_targets] + tracing.trace_df( + df, + label=tracing.extend_trace_label(trace_label, "dest_maz_probs"), + transpose=False, + ) + + taz_choices = taz_choices.drop(columns=["TAZ_prob", "MAZ_prob"]) + taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ]).agg( + prob=("prob", "max"), pick_count=("prob", "count") + ) taz_choices.reset_index(level=DEST_MAZ, inplace=True) @@ -387,25 +450,29 @@ def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label): def destination_presample( - spec_segment_name, - choosers, - model_settings, - network_los, - destination_size_terms, - estimator, - chunk_size, trace_label): - - trace_label = tracing.extend_trace_label(trace_label, 'presample') - chunk_tag = 'tour_destination.presample' + spec_segment_name, + choosers, + model_settings, + network_los, + destination_size_terms, + estimator, + chunk_size, + trace_label, +): + + trace_label = tracing.extend_trace_label(trace_label, "presample") + chunk_tag = "tour_destination.presample" logger.info(f"{trace_label} location_presample") - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] assert DEST_TAZ != alt_dest_col_name - MAZ_size_terms, TAZ_size_terms = aggregate_size_terms(destination_size_terms, network_los) + MAZ_size_terms, TAZ_size_terms = aggregate_size_terms( + destination_size_terms, network_los + ) - orig_maz = model_settings['CHOOSER_ORIG_COL_NAME'] + orig_maz = model_settings["CHOOSER_ORIG_COL_NAME"] assert orig_maz in choosers if ORIG_TAZ not in choosers: choosers[ORIG_TAZ] = map_maz_to_taz(choosers[orig_maz], network_los) @@ -413,7 +480,7 @@ def destination_presample( # create wrapper with keys for this lookup - in this case there is a HOME_TAZ in the choosers # and a DEST_TAZ in the alternatives which get merged during interaction # the skims will be available under the name "skims" for any @ expressions - skim_dict = network_los.get_skim_dict('taz') + skim_dict = network_los.get_skim_dict("taz") skims = skim_dict.wrap(ORIG_TAZ, DEST_TAZ) taz_sample = _destination_sample( @@ -424,8 +491,10 @@ def destination_presample( estimator, model_settings, DEST_TAZ, - chunk_size, chunk_tag=chunk_tag, - trace_label=trace_label) + chunk_size, + chunk_tag=chunk_tag, + trace_label=trace_label, + ) # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total maz_choices = choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label) @@ -437,36 +506,50 @@ def destination_presample( def run_destination_sample( - spec_segment_name, - tours, - persons_merged, - model_settings, - network_los, - destination_size_terms, - estimator, - chunk_size, trace_label): + spec_segment_name, + tours, + persons_merged, + model_settings, + network_los, + destination_size_terms, + estimator, + chunk_size, + trace_label, +): # FIXME - MEMORY HACK - only include columns actually used in spec (omit them pre-merge) - chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS'] - persons_merged = persons_merged[[c for c in persons_merged.columns if c in chooser_columns]] - tours = tours[[c for c in tours.columns if c in chooser_columns or c == 'person_id']] - choosers = pd.merge(tours, persons_merged, left_on='person_id', right_index=True, how='left') + chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] + persons_merged = persons_merged[ + [c for c in persons_merged.columns if c in chooser_columns] + ] + tours = tours[ + [c for c in tours.columns if c in chooser_columns or c == "person_id"] + ] + choosers = pd.merge( + tours, persons_merged, left_on="person_id", right_index=True, how="left" + ) # interaction_sample requires that choosers.index.is_monotonic_increasing if not choosers.index.is_monotonic_increasing: - logger.debug(f"run_destination_sample {trace_label} sorting choosers because not monotonic_increasing") + logger.debug( + f"run_destination_sample {trace_label} sorting choosers because not monotonic_increasing" + ) choosers = choosers.sort_index() # by default, enable presampling for multizone systems, unless they disable it in settings file pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE) - if pre_sample_taz and not config.setting('want_dest_choice_presampling', True): + if pre_sample_taz and not config.setting("want_dest_choice_presampling", True): pre_sample_taz = False - logger.info(f"Disabled destination zone presampling for {trace_label} " - f"because 'want_dest_choice_presampling' setting is False") + logger.info( + f"Disabled destination zone presampling for {trace_label} " + f"because 'want_dest_choice_presampling' setting is False" + ) if pre_sample_taz: - logger.info("Running %s destination_presample with %d tours" % (trace_label, len(tours))) + logger.info( + "Running %s destination_presample with %d tours" % (trace_label, len(tours)) + ) choices = destination_presample( spec_segment_name, @@ -475,7 +558,9 @@ def run_destination_sample( network_los, destination_size_terms, estimator, - chunk_size, trace_label) + chunk_size, + trace_label, + ) else: choices = destination_sample( @@ -485,23 +570,26 @@ def run_destination_sample( network_los, destination_size_terms, estimator, - chunk_size, trace_label) + chunk_size, + trace_label, + ) # remember person_id in chosen alts so we can merge with persons in subsequent steps # (broadcasts person_id onto all alternatives sharing the same tour_id index value) - choices['person_id'] = tours.person_id + choices["person_id"] = tours.person_id return choices def run_destination_logsums( - tour_purpose, - persons_merged, - destination_sample, - model_settings, - network_los, - chunk_size, - trace_label): + tour_purpose, + persons_merged, + destination_sample, + model_settings, + network_los, + chunk_size, + trace_label, +): """ add logsum column to existing tour_destination_sample table @@ -523,82 +611,103 @@ def run_destination_logsums( +-----------+--------------+----------------+------------+----------------+ """ - logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS']) + logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) - chunk_tag = 'tour_destination.logsums' + chunk_tag = "tour_destination.logsums" # FIXME - MEMORY HACK - only include columns actually used in spec - persons_merged = logsum.filter_chooser_columns(persons_merged, logsum_settings, model_settings) + persons_merged = logsum.filter_chooser_columns( + persons_merged, logsum_settings, model_settings + ) # merge persons into tours - choosers = pd.merge(destination_sample, - persons_merged, - left_on='person_id', - right_index=True, - how="left") + choosers = pd.merge( + destination_sample, + persons_merged, + left_on="person_id", + right_index=True, + how="left", + ) logger.info("Running %s with %s rows", trace_label, len(choosers)) - tracing.dump_df(DUMP, persons_merged, trace_label, 'persons_merged') - tracing.dump_df(DUMP, choosers, trace_label, 'choosers') + tracing.dump_df(DUMP, persons_merged, trace_label, "persons_merged") + tracing.dump_df(DUMP, choosers, trace_label, "choosers") logsums = logsum.compute_logsums( choosers, tour_purpose, - logsum_settings, model_settings, + logsum_settings, + model_settings, network_los, chunk_size, chunk_tag, - trace_label) + trace_label, + ) - destination_sample['mode_choice_logsum'] = logsums + destination_sample["mode_choice_logsum"] = logsums return destination_sample def run_destination_simulate( - spec_segment_name, - tours, - persons_merged, - destination_sample, - want_logsums, - model_settings, - network_los, - destination_size_terms, - estimator, - chunk_size, trace_label): + spec_segment_name, + tours, + persons_merged, + destination_sample, + want_logsums, + model_settings, + network_los, + destination_size_terms, + estimator, + chunk_size, + trace_label, +): """ run destination_simulate on tour_destination_sample annotated with mode_choice logsum to select a destination from sample alternatives """ - chunk_tag = 'tour_destination.simulate' + chunk_tag = "tour_destination.simulate" - model_spec = simulate.spec_for_segment(model_settings, spec_id='SPEC', - segment_name=spec_segment_name, estimator=estimator) + model_spec = simulate.spec_for_segment( + model_settings, + spec_id="SPEC", + segment_name=spec_segment_name, + estimator=estimator, + ) # FIXME - MEMORY HACK - only include columns actually used in spec (omit them pre-merge) - chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS'] - persons_merged = persons_merged[[c for c in persons_merged.columns if c in chooser_columns]] - tours = tours[[c for c in tours.columns if c in chooser_columns or c == 'person_id']] - choosers = pd.merge(tours, persons_merged, left_on='person_id', right_index=True, how='left') + chooser_columns = model_settings["SIMULATE_CHOOSER_COLUMNS"] + persons_merged = persons_merged[ + [c for c in persons_merged.columns if c in chooser_columns] + ] + tours = tours[ + [c for c in tours.columns if c in chooser_columns or c == "person_id"] + ] + choosers = pd.merge( + tours, persons_merged, left_on="person_id", right_index=True, how="left" + ) # interaction_sample requires that choosers.index.is_monotonic_increasing if not choosers.index.is_monotonic_increasing: - logger.debug(f"run_destination_simulate {trace_label} sorting choosers because not monotonic_increasing") + logger.debug( + f"run_destination_simulate {trace_label} sorting choosers because not monotonic_increasing" + ) choosers = choosers.sort_index() if estimator: estimator.write_choosers(choosers) - alt_dest_col_name = model_settings['ALT_DEST_COL_NAME'] - origin_col_name = model_settings['CHOOSER_ORIG_COL_NAME'] + alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"] + origin_col_name = model_settings["CHOOSER_ORIG_COL_NAME"] # alternatives are pre-sampled and annotated with logsums and pick_count # but we have to merge size_terms column into alt sample list - destination_sample['size_term'] = \ - reindex(destination_size_terms.size_term, destination_sample[alt_dest_col_name]) + destination_sample["size_term"] = reindex( + destination_size_terms.size_term, destination_sample[alt_dest_col_name] + ) - tracing.dump_df(DUMP, destination_sample, trace_label, 'alternatives') + tracing.dump_df(DUMP, destination_sample, trace_label, "alternatives") constants = config.get_model_constants(model_settings) @@ -611,14 +720,14 @@ def run_destination_simulate( skims = skim_dict.wrap(origin_col_name, alt_dest_col_name) locals_d = { - 'skims': skims, + "skims": skims, } if constants is not None: locals_d.update(constants) - tracing.dump_df(DUMP, choosers, trace_label, 'choosers') + tracing.dump_df(DUMP, choosers, trace_label, "choosers") - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_sample_simulate( choosers, @@ -629,38 +738,44 @@ def run_destination_simulate( want_logsums=want_logsums, skims=skims, locals_d=locals_d, - chunk_size=chunk_size, chunk_tag=chunk_tag, + chunk_size=chunk_size, + chunk_tag=chunk_tag, trace_label=trace_label, - trace_choice_name='destination', - estimator=estimator) + trace_choice_name="destination", + estimator=estimator, + ) if not want_logsums: # for consistency, always return a dataframe with canonical column name assert isinstance(choices, pd.Series) - choices = choices.to_frame('choice') + choices = choices.to_frame("choice") return choices def run_tour_destination( - tours, - persons_merged, - want_logsums, - want_sample_table, - model_settings, - network_los, - estimator, - chunk_size, trace_hh_id, trace_label): - - size_term_calculator = SizeTermCalculator(model_settings['SIZE_TERM_SELECTOR']) + tours, + persons_merged, + want_logsums, + want_sample_table, + model_settings, + network_los, + estimator, + chunk_size, + trace_hh_id, + trace_label, +): + + size_term_calculator = SizeTermCalculator(model_settings["SIZE_TERM_SELECTOR"]) # maps segment names to compact (integer) ids - segments = model_settings['SEGMENTS'] + segments = model_settings["SEGMENTS"] - chooser_segment_column = model_settings.get('CHOOSER_SEGMENT_COLUMN_NAME', None) + chooser_segment_column = model_settings.get("CHOOSER_SEGMENT_COLUMN_NAME", None) if chooser_segment_column is None: - assert len(segments) == 1, \ - f"CHOOSER_SEGMENT_COLUMN_NAME not specified in model_settings to slice SEGMENTS: {segments}" + assert ( + len(segments) == 1 + ), f"CHOOSER_SEGMENT_COLUMN_NAME not specified in model_settings to slice SEGMENTS: {segments}" choices_list = [] sample_list = [] @@ -674,59 +789,65 @@ def run_tour_destination( choosers = tours.copy() # Note: size_term_calculator omits zones with impossible alternatives (where dest size term is zero) - segment_destination_size_terms = size_term_calculator.dest_size_terms_df(segment_name, segment_trace_label) + segment_destination_size_terms = size_term_calculator.dest_size_terms_df( + segment_name, segment_trace_label + ) if choosers.shape[0] == 0: - logger.info("%s skipping segment %s: no choosers", trace_label, segment_name) + logger.info( + "%s skipping segment %s: no choosers", trace_label, segment_name + ) continue # - destination_sample spec_segment_name = segment_name # spec_segment_name is segment_name - location_sample_df = \ - run_destination_sample( - spec_segment_name, - choosers, - persons_merged, - model_settings, - network_los, - segment_destination_size_terms, - estimator, - chunk_size=chunk_size, - trace_label=tracing.extend_trace_label(segment_trace_label, 'sample')) + location_sample_df = run_destination_sample( + spec_segment_name, + choosers, + persons_merged, + model_settings, + network_los, + segment_destination_size_terms, + estimator, + chunk_size=chunk_size, + trace_label=tracing.extend_trace_label(segment_trace_label, "sample"), + ) # - destination_logsums tour_purpose = segment_name # tour_purpose is segment_name - location_sample_df = \ - run_destination_logsums( - tour_purpose, - persons_merged, - location_sample_df, - model_settings, - network_los, - chunk_size=chunk_size, - trace_label=tracing.extend_trace_label(segment_trace_label, 'logsums')) + location_sample_df = run_destination_logsums( + tour_purpose, + persons_merged, + location_sample_df, + model_settings, + network_los, + chunk_size=chunk_size, + trace_label=tracing.extend_trace_label(segment_trace_label, "logsums"), + ) # - destination_simulate spec_segment_name = segment_name # spec_segment_name is segment_name - choices = \ - run_destination_simulate( - spec_segment_name, - choosers, - persons_merged, - destination_sample=location_sample_df, - want_logsums=want_logsums, - model_settings=model_settings, - network_los=network_los, - destination_size_terms=segment_destination_size_terms, - estimator=estimator, - chunk_size=chunk_size, - trace_label=tracing.extend_trace_label(segment_trace_label, 'simulate')) + choices = run_destination_simulate( + spec_segment_name, + choosers, + persons_merged, + destination_sample=location_sample_df, + want_logsums=want_logsums, + model_settings=model_settings, + network_los=network_los, + destination_size_terms=segment_destination_size_terms, + estimator=estimator, + chunk_size=chunk_size, + trace_label=tracing.extend_trace_label(segment_trace_label, "simulate"), + ) choices_list.append(choices) if want_sample_table: # FIXME - sample_table - location_sample_df.set_index(model_settings['ALT_DEST_COL_NAME'], append=True, inplace=True) + location_sample_df.set_index( + model_settings["ALT_DEST_COL_NAME"], append=True, inplace=True + ) sample_list.append(location_sample_df) else: # del this so we dont hold active reference to it while run_location_sample is creating its replacement @@ -737,7 +858,7 @@ def run_tour_destination( else: # this will only happen with small samples (e.g. singleton) with no (e.g.) school segs logger.warning("%s no choices", trace_label) - choices_df = pd.DataFrame(columns=['choice', 'logsum']) + choices_df = pd.DataFrame(columns=["choice", "logsum"]) if len(sample_list) > 0: save_sample_df = pd.concat(sample_list) diff --git a/activitysim/abm/models/util/tour_frequency.py b/activitysim/abm/models/util/tour_frequency.py index fe4ed03647..37108b564c 100644 --- a/activitysim/abm/models/util/tour_frequency.py +++ b/activitysim/abm/models/util/tour_frequency.py @@ -5,13 +5,13 @@ import numpy as np import pandas as pd -from activitysim.core.util import reindex from activitysim.abm.models.util.canonical_ids import set_tour_index +from activitysim.core.util import reindex logger = logging.getLogger(__name__) -def create_tours(tour_counts, tour_category, parent_col='person_id'): +def create_tours(tour_counts, tour_category, parent_col="person_id"): """ This method processes the tour_frequency column that comes out of the model of the same name and turns into a DataFrame that @@ -73,13 +73,15 @@ def create_tours(tour_counts, tour_category, parent_col='person_id'): # now have two rows, and zero trips yields zero rows tours = tours.take(np.repeat(tours.index.values, tours.tour_type_count.values)) - grouped = tours.groupby([parent_col, 'tour_type']) - tours['tour_type_num'] = grouped.cumcount() + 1 - tours['tour_type_count'] = tours['tour_type_num'] + grouped.cumcount(ascending=False) + grouped = tours.groupby([parent_col, "tour_type"]) + tours["tour_type_num"] = grouped.cumcount() + 1 + tours["tour_type_count"] = tours["tour_type_num"] + grouped.cumcount( + ascending=False + ) grouped = tours.groupby(parent_col) - tours['tour_num'] = grouped.cumcount() + 1 - tours['tour_count'] = tours['tour_num'] + grouped.cumcount(ascending=False) + tours["tour_num"] = grouped.cumcount() + 1 + tours["tour_count"] = tours["tour_num"] + grouped.cumcount(ascending=False) """ tour_type tour_type_num tour_type_count tour_num tour_count @@ -90,11 +92,11 @@ def create_tours(tour_counts, tour_category, parent_col='person_id'): """ # set these here to ensure consistency across different tour categories - assert tour_category in ['mandatory', 'non_mandatory', 'atwork', 'joint'] - tours['tour_category'] = tour_category + assert tour_category in ["mandatory", "non_mandatory", "atwork", "joint"] + tours["tour_category"] = tour_category # for joint tours, the correct number will be filled in after participation step - tours['number_of_participants'] = 1 + tours["number_of_participants"] = 1 # index is arbitrary but don't want any duplicates in index tours.reset_index(drop=True, inplace=True) @@ -102,7 +104,9 @@ def create_tours(tour_counts, tour_category, parent_col='person_id'): return tours -def process_tours(tour_frequency, tour_frequency_alts, tour_category, parent_col='person_id'): +def process_tours( + tour_frequency, tour_frequency_alts, tour_category, parent_col="person_id" +): """ This method processes the tour_frequency column that comes out of the model of the same name and turns into a DataFrame that @@ -191,32 +195,47 @@ def process_mandatory_tours(persons, mandatory_tour_frequency_alts): depends on the is_worker column: work tours first for workers, second for non-workers """ - person_columns = ['mandatory_tour_frequency', 'is_worker', - 'school_zone_id', 'workplace_zone_id', 'home_zone_id', 'household_id'] + person_columns = [ + "mandatory_tour_frequency", + "is_worker", + "school_zone_id", + "workplace_zone_id", + "home_zone_id", + "household_id", + ] assert not persons.mandatory_tour_frequency.isnull().any() - tours = process_tours(persons.mandatory_tour_frequency.dropna(), - mandatory_tour_frequency_alts, - tour_category='mandatory') + tours = process_tours( + persons.mandatory_tour_frequency.dropna(), + mandatory_tour_frequency_alts, + tour_category="mandatory", + ) - tours_merged = pd.merge(tours[['person_id', 'tour_type']], - persons[person_columns], - left_on='person_id', right_index=True) + tours_merged = pd.merge( + tours[["person_id", "tour_type"]], + persons[person_columns], + left_on="person_id", + right_index=True, + ) # by default work tours are first for work_and_school tours # swap tour_nums for non-workers so school tour is 1 and work is 2 - work_and_school_and_student = \ - (tours_merged.mandatory_tour_frequency == 'work_and_school') & ~tours_merged.is_worker + work_and_school_and_student = ( + tours_merged.mandatory_tour_frequency == "work_and_school" + ) & ~tours_merged.is_worker - tours.tour_num = tours.tour_num.where(~work_and_school_and_student, 3 - tours.tour_num) + tours.tour_num = tours.tour_num.where( + ~work_and_school_and_student, 3 - tours.tour_num + ) # work tours destination is workplace_zone_id, school tours destination is school_zone_id - tours['destination'] = \ - tours_merged.workplace_zone_id.where((tours_merged.tour_type == 'work'), tours_merged.school_zone_id) + tours["destination"] = tours_merged.workplace_zone_id.where( + (tours_merged.tour_type == "work"), tours_merged.school_zone_id + ) - tours['origin'] = tours_merged.home_zone_id + tours["origin"] = tours_merged.home_zone_id - tours['household_id'] = tours_merged.household_id + tours["household_id"] = tours_merged.household_id # assign stable (predictable) tour_id set_tour_index(tours) @@ -266,10 +285,10 @@ def process_non_mandatory_tours(persons, tour_counts): column names of the alternatives DataFrame supplied above. """ - tours = create_tours(tour_counts, tour_category='non_mandatory') + tours = create_tours(tour_counts, tour_category="non_mandatory") - tours['household_id'] = reindex(persons.household_id, tours.person_id) - tours['origin'] = reindex(persons.home_zone_id, tours.person_id) + tours["household_id"] = reindex(persons.household_id, tours.person_id) + tours["origin"] = reindex(persons.home_zone_id, tours.person_id) # assign stable (predictable) tour_id set_tour_index(tours) @@ -328,11 +347,13 @@ def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): eat_business 1 1 0 """ - parent_col = 'parent_tour_id' - tours = process_tours(work_tours.atwork_subtour_frequency.dropna(), - atwork_subtour_frequency_alts, - tour_category='atwork', - parent_col=parent_col) + parent_col = "parent_tour_id" + tours = process_tours( + work_tours.atwork_subtour_frequency.dropna(), + atwork_subtour_frequency_alts, + tour_category="atwork", + parent_col=parent_col, + ) # print tours """ @@ -349,13 +370,14 @@ def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): """ # merge fields from parent work_tours (note parent tour destination becomes subtour origin) - work_tours = work_tours[['person_id', 'household_id', 'tour_num', 'destination']] - work_tours.rename(columns={'tour_num': 'parent_tour_num', - 'destination': 'origin'}, inplace=True) + work_tours = work_tours[["person_id", "household_id", "tour_num", "destination"]] + work_tours.rename( + columns={"tour_num": "parent_tour_num", "destination": "origin"}, inplace=True + ) tours = pd.merge(tours, work_tours, left_on=parent_col, right_index=True) # assign stable (predictable) tour_id - set_tour_index(tours, parent_tour_num_col='parent_tour_num') + set_tour_index(tours, parent_tour_num_col="parent_tour_num") """ person_id tour_type tour_type_count tour_type_num tour_num tour_count @@ -371,7 +393,7 @@ def process_atwork_subtours(work_tours, atwork_subtour_frequency_alts): """ # don't need this once we have computed index - del tours['parent_tour_num'] + del tours["parent_tour_num"] return tours @@ -405,17 +427,19 @@ def process_joint_tours(joint_tour_frequency, joint_tour_frequency_alts, point_p assert not joint_tour_frequency.isnull().any() - tours = process_tours(joint_tour_frequency.dropna(), - joint_tour_frequency_alts, - tour_category='joint', - parent_col='household_id') + tours = process_tours( + joint_tour_frequency.dropna(), + joint_tour_frequency_alts, + tour_category="joint", + parent_col="household_id", + ) assert not tours.index.duplicated().any() - assert point_persons.index.name == 'household_id' + assert point_persons.index.name == "household_id" # - assign a temp point person to tour so we can create stable index - tours['person_id'] = reindex(point_persons.person_id, tours.household_id) - tours['origin'] = reindex(point_persons.home_zone_id, tours.household_id) + tours["person_id"] = reindex(point_persons.person_id, tours.household_id) + tours["origin"] = reindex(point_persons.home_zone_id, tours.household_id) # assign stable (predictable) tour_id set_tour_index(tours, is_joint=True) diff --git a/activitysim/abm/models/util/tour_scheduling.py b/activitysim/abm/models/util/tour_scheduling.py index d482410280..5936ae81e0 100644 --- a/activitysim/abm/models/util/tour_scheduling.py +++ b/activitysim/abm/models/util/tour_scheduling.py @@ -4,11 +4,7 @@ import pandas as pd -from activitysim.core import simulate -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import expressions +from activitysim.core import config, expressions, inject, simulate, tracing from . import estimation from . import vectorize_tour_scheduling as vts @@ -16,57 +12,70 @@ logger = logging.getLogger(__name__) -def run_tour_scheduling(model_name, chooser_tours, persons_merged, tdd_alts, tour_segment_col, chunk_size, trace_hh_id): +def run_tour_scheduling( + model_name, + chooser_tours, + persons_merged, + tdd_alts, + tour_segment_col, + chunk_size, + trace_hh_id, +): trace_label = model_name - model_settings_file_name = f'{model_name}.yaml' + model_settings_file_name = f"{model_name}.yaml" model_settings = config.read_model_settings(model_settings_file_name) - if 'LOGSUM_SETTINGS' in model_settings: - logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS']) - logsum_columns = logsum_settings.get('LOGSUM_CHOOSER_COLUMNS', []) + if "LOGSUM_SETTINGS" in model_settings: + logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) + logsum_columns = logsum_settings.get("LOGSUM_CHOOSER_COLUMNS", []) else: logsum_columns = [] # - filter chooser columns for both logsums and simulate - model_columns = model_settings.get('SIMULATE_CHOOSER_COLUMNS', []) - chooser_columns = logsum_columns + [c for c in model_columns if c not in logsum_columns] + model_columns = model_settings.get("SIMULATE_CHOOSER_COLUMNS", []) + chooser_columns = logsum_columns + [ + c for c in model_columns if c not in logsum_columns + ] persons_merged = expressions.filter_chooser_columns(persons_merged, chooser_columns) timetable = inject.get_injectable("timetable") # - run preprocessor to annotate choosers - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: - locals_d = { - 'tt': timetable - } + locals_d = {"tt": timetable} locals_d.update(config.get_model_constants(model_settings)) expressions.assign_columns( df=chooser_tours, model_settings=preprocessor_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) estimators = {} - if 'TOUR_SPEC_SEGMENTS' in model_settings: + if "TOUR_SPEC_SEGMENTS" in model_settings: # load segmented specs - spec_segment_settings = model_settings.get('SPEC_SEGMENTS', {}) + spec_segment_settings = model_settings.get("SPEC_SEGMENTS", {}) specs = {} for spec_segment_name, spec_settings in spec_segment_settings.items(): - bundle_name = f'{model_name}_{spec_segment_name}' + bundle_name = f"{model_name}_{spec_segment_name}" # estimator for this tour_segment - estimator = estimation.manager.begin_estimation(model_name=bundle_name, bundle_name=bundle_name) + estimator = estimation.manager.begin_estimation( + model_name=bundle_name, bundle_name=bundle_name + ) - spec_file_name = spec_settings['SPEC'] + spec_file_name = spec_settings["SPEC"] model_spec = simulate.read_model_spec(file_name=spec_file_name) coefficients_df = simulate.read_model_coefficients(spec_settings) - specs[spec_segment_name] = simulate.eval_coefficients(model_spec, coefficients_df, estimator) + specs[spec_segment_name] = simulate.eval_coefficients( + model_spec, coefficients_df, estimator + ) if estimator: estimators[spec_segment_name] = estimator # add to local list @@ -75,27 +84,29 @@ def run_tour_scheduling(model_name, chooser_tours, persons_merged, tdd_alts, tou estimator.write_coefficients(coefficients_df, spec_settings) # - spec dict segmented by primary_purpose - tour_segment_settings = model_settings.get('TOUR_SPEC_SEGMENTS', {}) + tour_segment_settings = model_settings.get("TOUR_SPEC_SEGMENTS", {}) tour_segments = {} for tour_segment_name, spec_segment_name in tour_segment_settings.items(): tour_segments[tour_segment_name] = {} - tour_segments[tour_segment_name]['spec_segment_name'] = spec_segment_name - tour_segments[tour_segment_name]['spec'] = specs[spec_segment_name] - tour_segments[tour_segment_name]['estimator'] = estimators.get(spec_segment_name) + tour_segments[tour_segment_name]["spec_segment_name"] = spec_segment_name + tour_segments[tour_segment_name]["spec"] = specs[spec_segment_name] + tour_segments[tour_segment_name]["estimator"] = estimators.get( + spec_segment_name + ) # default tour_segment_col to 'tour_type' if segmented spec and tour_segment_col not specified if tour_segment_col is None and tour_segments: - tour_segment_col = 'tour_type' + tour_segment_col = "tour_type" else: # unsegmented spec - assert 'SPEC_SEGMENTS' not in model_settings - assert 'TOUR_SPEC_SEGMENTS' not in model_settings + assert "SPEC_SEGMENTS" not in model_settings + assert "TOUR_SPEC_SEGMENTS" not in model_settings assert tour_segment_col is None estimator = estimation.manager.begin_estimation(model_name) - spec_file_name = model_settings['SPEC'] + spec_file_name = model_settings["SPEC"] model_spec = simulate.read_model_spec(file_name=spec_file_name) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -107,22 +118,23 @@ def run_tour_scheduling(model_name, chooser_tours, persons_merged, tdd_alts, tou estimator.write_coefficients(coefficients_df, model_settings) # - non_mandatory tour scheduling is not segmented by tour type - tour_segments = { - 'spec': model_spec, - 'estimator': estimator - } + tour_segments = {"spec": model_spec, "estimator": estimator} if estimators: timetable.begin_transaction(list(estimators.values())) logger.info(f"Running {model_name} with %d tours", len(chooser_tours)) choices = vts.vectorize_tour_scheduling( - chooser_tours, persons_merged, - tdd_alts, timetable, - tour_segments=tour_segments, tour_segment_col=tour_segment_col, + chooser_tours, + persons_merged, + tdd_alts, + timetable, + tour_segments=tour_segments, + tour_segment_col=tour_segment_col, model_settings=model_settings, chunk_size=chunk_size, - trace_label=trace_label) + trace_label=trace_label, + ) if estimators: # overrride choices for all estimators @@ -134,7 +146,9 @@ def run_tour_scheduling(model_name, chooser_tours, persons_merged, tdd_alts, tou model_choices = choices estimator.write_choices(model_choices) - override_choices = estimator.get_survey_values(model_choices, 'tours', 'tdd') + override_choices = estimator.get_survey_values( + model_choices, "tours", "tdd" + ) estimator.write_override_choices(override_choices) choices_list.append(override_choices) @@ -143,13 +157,18 @@ def run_tour_scheduling(model_name, chooser_tours, persons_merged, tdd_alts, tou # update timetable to reflect the override choices (assign tours in tour_num order) timetable.rollback() - for tour_num, nth_tours in chooser_tours.groupby('tour_num', sort=True): - timetable.assign(window_row_ids=nth_tours['person_id'], tdds=choices.reindex(nth_tours.index)) + for tour_num, nth_tours in chooser_tours.groupby("tour_num", sort=True): + timetable.assign( + window_row_ids=nth_tours["person_id"], + tdds=choices.reindex(nth_tours.index), + ) timetable.replace_table() # choices are tdd alternative ids # we want to add start, end, and duration columns to tours, which we have in tdd_alts table - choices = pd.merge(choices.to_frame('tdd'), tdd_alts, left_on=['tdd'], right_index=True, how='left') + choices = pd.merge( + choices.to_frame("tdd"), tdd_alts, left_on=["tdd"], right_index=True, how="left" + ) return choices diff --git a/activitysim/abm/models/util/trip.py b/activitysim/abm/models/util/trip.py index 5021bd531f..f321e0e3b3 100644 --- a/activitysim/abm/models/util/trip.py +++ b/activitysim/abm/models/util/trip.py @@ -6,19 +6,20 @@ from activitysim.core.util import assign_in_place - logger = logging.getLogger(__name__) def failed_trip_cohorts(trips, failed): # outbound trips in a tour with a failed outbound trip - bad_outbound_trips = \ - trips.outbound & (trips.tour_id.isin(trips.tour_id[failed & trips.outbound])) + bad_outbound_trips = trips.outbound & ( + trips.tour_id.isin(trips.tour_id[failed & trips.outbound]) + ) # inbound trips in a tour with a failed inbound trip - bad_inbound_trips = \ - ~trips.outbound & (trips.tour_id.isin(trips.tour_id[failed & ~trips.outbound])) + bad_inbound_trips = ~trips.outbound & ( + trips.tour_id.isin(trips.tour_id[failed & ~trips.outbound]) + ) bad_trips = bad_outbound_trips | bad_inbound_trips @@ -30,7 +31,9 @@ def flag_failed_trip_leg_mates(trips_df, col_name): set boolean flag column of specified name to identify failed trip leg_mates in place """ - failed_trip_leg_mates = failed_trip_cohorts(trips_df, trips_df.failed) & ~trips_df.failed + failed_trip_leg_mates = ( + failed_trip_cohorts(trips_df, trips_df.failed) & ~trips_df.failed + ) trips_df.loc[failed_trip_leg_mates, col_name] = True # handle outbound and inbound legs independently @@ -55,10 +58,12 @@ def cleanup_failed_trips(trips): """ if trips.failed.any(): - logger.warning("cleanup_failed_trips dropping %s failed trips" % trips.failed.sum()) + logger.warning( + "cleanup_failed_trips dropping %s failed trips" % trips.failed.sum() + ) - trips['patch'] = False - flag_failed_trip_leg_mates(trips, 'patch') + trips["patch"] = False + flag_failed_trip_leg_mates(trips, "patch") # drop the original failures trips = trips[~trips.failed] @@ -67,16 +72,18 @@ def cleanup_failed_trips(trips): patch_trips = trips[trips.patch].sort_index() # recompute fields dependent on trip_num sequence - grouped = patch_trips.groupby(['tour_id', 'outbound']) - patch_trips['trip_num'] = grouped.cumcount() + 1 + grouped = patch_trips.groupby(["tour_id", "outbound"]) + patch_trips["trip_num"] = grouped.cumcount() + 1 # FIXME - 'clever' hack to avoid regroup - implementation dependent optimization that could change - patch_trips['trip_count'] = patch_trips['trip_num'] + grouped.cumcount(ascending=False) + patch_trips["trip_count"] = patch_trips["trip_num"] + grouped.cumcount( + ascending=False + ) - assign_in_place(trips, patch_trips[['trip_num', 'trip_count']]) + assign_in_place(trips, patch_trips[["trip_num", "trip_count"]]) - del trips['patch'] + del trips["patch"] - del trips['failed'] + del trips["failed"] return trips @@ -90,20 +97,25 @@ def generate_alternative_sizes(max_duration, max_trips): :param max_trips: :return: """ + def np_shift(xs, n, fill_zero=True): if n >= 0: shift_array = np.concatenate((np.full(n, np.nan), xs[:-n])) else: shift_array = np.concatenate((xs[-n:], np.full(-n, np.nan))) - return np.nan_to_num(shift_array, np.nan).astype(int) if fill_zero else shift_array + return ( + np.nan_to_num(shift_array, np.nan).astype(int) if fill_zero else shift_array + ) levels = np.empty([max_trips, max_duration + max_trips]) levels[0] = np.arange(1, max_duration + max_trips + 1) for level in np.arange(1, max_trips): - levels[level] = np_shift(np.cumsum(np_shift(levels[level - 1], 1)), -1, fill_zero=False) + levels[level] = np_shift( + np.cumsum(np_shift(levels[level - 1], 1)), -1, fill_zero=False + ) - return levels[:, :max_duration+1].astype(int) + return levels[:, : max_duration + 1].astype(int) def get_time_windows(residual, level): diff --git a/activitysim/abm/models/util/vectorize_tour_scheduling.py b/activitysim/abm/models/util/vectorize_tour_scheduling.py index 358850750c..766cc9a812 100644 --- a/activitysim/abm/models/util/vectorize_tour_scheduling.py +++ b/activitysim/abm/models/util/vectorize_tour_scheduling.py @@ -5,28 +5,25 @@ import numpy as np import pandas as pd -from activitysim.core.interaction_sample_simulate import interaction_sample_simulate -from activitysim.core import config -from activitysim.core import tracing -from activitysim.core import inject -from activitysim.core import mem - -from activitysim.core import chunk -from activitysim.core import simulate -from activitysim.core import logit -from activitysim.core import los - +from activitysim.core import ( + chunk, + config, + expressions, + inject, + logit, + los, + mem, + simulate, +) from activitysim.core import timetable as tt - -from activitysim.core.util import reindex -from activitysim.core import expressions - +from activitysim.core import tracing +from activitysim.core.interaction_sample_simulate import interaction_sample_simulate from activitysim.core.pathbuilder import TransitVirtualPathBuilder - +from activitysim.core.util import reindex logger = logging.getLogger(__name__) -TDD_CHOICE_COLUMN = 'tdd' +TDD_CHOICE_COLUMN = "tdd" USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS = False RUN_ALTS_PREPROCESSOR_BEFORE_MERGE = True # see FIXME below before changing this @@ -34,30 +31,36 @@ def skims_for_logsums(tour_purpose, model_settings, trace_label): - assert 'LOGSUM_SETTINGS' in model_settings + assert "LOGSUM_SETTINGS" in model_settings - network_los = inject.get_injectable('network_los') + network_los = inject.get_injectable("network_los") skim_dict = network_los.get_default_skim_dict() - orig_col_name = 'home_zone_id' + orig_col_name = "home_zone_id" - destination_for_tour_purpose = model_settings.get('DESTINATION_FOR_TOUR_PURPOSE') + destination_for_tour_purpose = model_settings.get("DESTINATION_FOR_TOUR_PURPOSE") if isinstance(destination_for_tour_purpose, str): dest_col_name = destination_for_tour_purpose elif isinstance(destination_for_tour_purpose, dict): dest_col_name = destination_for_tour_purpose.get(tour_purpose) else: - raise RuntimeError(f"expected string or dict DESTINATION_FOR_TOUR_PURPOSE model_setting for {tour_purpose}") - - odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name, - dim3_key='out_period') - dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name, - dim3_key='in_period') - odr_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name, - dim3_key='in_period') - dor_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name, - dim3_key='out_period') + raise RuntimeError( + f"expected string or dict DESTINATION_FOR_TOUR_PURPOSE model_setting for {tour_purpose}" + ) + + odt_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col_name, dest_key=dest_col_name, dim3_key="out_period" + ) + dot_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col_name, dest_key=orig_col_name, dim3_key="in_period" + ) + odr_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=orig_col_name, dest_key=dest_col_name, dim3_key="in_period" + ) + dor_skim_stack_wrapper = skim_dict.wrap_3d( + orig_key=dest_col_name, dest_key=orig_col_name, dim3_key="out_period" + ) od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name) skims = { @@ -66,41 +69,54 @@ def skims_for_logsums(tour_purpose, model_settings, trace_label): "odr_skims": odr_skim_stack_wrapper, "dor_skims": dor_skim_stack_wrapper, "od_skims": od_skim_stack_wrapper, - 'orig_col_name': orig_col_name, - 'dest_col_name': dest_col_name, + "orig_col_name": orig_col_name, + "dest_col_name": dest_col_name, } if network_los.zone_system == los.THREE_ZONE: # fixme - is this a lightweight object? tvpb = network_los.tvpb - tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col_name, dest_key=dest_col_name, - tod_key='out_period', segment_key='demographic_segment', - trace_label=trace_label, tag='tvpb_logsum_odt') - tvpb_logsum_dot = tvpb.wrap_logsum(orig_key=dest_col_name, dest_key=orig_col_name, - tod_key='in_period', segment_key='demographic_segment', - trace_label=trace_label, tag='tvpb_logsum_dot') - - skims.update({ - 'tvpb_logsum_odt': tvpb_logsum_odt, - 'tvpb_logsum_dot': tvpb_logsum_dot - }) + tvpb_logsum_odt = tvpb.wrap_logsum( + orig_key=orig_col_name, + dest_key=dest_col_name, + tod_key="out_period", + segment_key="demographic_segment", + trace_label=trace_label, + tag="tvpb_logsum_odt", + ) + tvpb_logsum_dot = tvpb.wrap_logsum( + orig_key=dest_col_name, + dest_key=orig_col_name, + tod_key="in_period", + segment_key="demographic_segment", + trace_label=trace_label, + tag="tvpb_logsum_dot", + ) + + skims.update( + {"tvpb_logsum_odt": tvpb_logsum_odt, "tvpb_logsum_dot": tvpb_logsum_dot} + ) return skims -def _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, network_los, skims, trace_label): +def _compute_logsums( + alt_tdd, tours_merged, tour_purpose, model_settings, network_los, skims, trace_label +): """ compute logsums for tours using skims for alt_tdd out_period and in_period """ - trace_label = tracing.extend_trace_label(trace_label, 'logsums') + trace_label = tracing.extend_trace_label(trace_label, "logsums") with chunk.chunk_log(trace_label): - logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS']) + logsum_settings = config.read_model_settings(model_settings["LOGSUM_SETTINGS"]) - choosers = alt_tdd.join(tours_merged, how='left', rsuffix='_chooser') - logger.info(f"{trace_label} compute_logsums for {choosers.shape[0]} choosers {alt_tdd.shape[0]} alts") + choosers = alt_tdd.join(tours_merged, how="left", rsuffix="_chooser") + logger.info( + f"{trace_label} compute_logsums for {choosers.shape[0]} choosers {alt_tdd.shape[0]} alts" + ) # - locals_dict constants = config.get_model_constants(logsum_settings) @@ -109,7 +125,9 @@ def _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, networ if network_los.zone_system == los.THREE_ZONE: # TVPB constants can appear in expressions - locals_dict.update(network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS')) + locals_dict.update( + network_los.setting("TVPB_SETTINGS.tour_mode_choice.CONSTANTS") + ) locals_dict.update(skims) @@ -119,7 +137,7 @@ def _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, networ # - run preprocessor to annotate choosers # allow specification of alternate preprocessor for nontour choosers - preprocessor = model_settings.get('LOGSUM_PREPROCESSOR', 'preprocessor') + preprocessor = model_settings.get("LOGSUM_PREPROCESSOR", "preprocessor") preprocessor_settings = logsum_settings[preprocessor] if preprocessor_settings: @@ -130,14 +148,19 @@ def _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, networ df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) # - compute logsums - logsum_spec = simulate.read_model_spec(file_name=logsum_settings['SPEC']) - logsum_spec = simulate.eval_coefficients(logsum_spec, coefficients, estimator=None) + logsum_spec = simulate.read_model_spec(file_name=logsum_settings["SPEC"]) + logsum_spec = simulate.eval_coefficients( + logsum_spec, coefficients, estimator=None + ) nest_spec = config.get_logit_model_settings(logsum_settings) - nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients, trace_label) + nest_spec = simulate.eval_nest_coefficients( + nest_spec, coefficients, trace_label + ) logsums = simulate.simple_simulate_logsums( choosers, @@ -146,98 +169,137 @@ def _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, networ skims=skims, locals_d=locals_dict, chunk_size=0, - trace_label=trace_label) + trace_label=trace_label, + ) return logsums def dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label): - tdd_segments = inject.get_injectable('tdd_alt_segments', None) + tdd_segments = inject.get_injectable("tdd_alt_segments", None) alt_tdd_periods = None logger.info(f"tdd_alt_segments specified for representative logsums") - with chunk.chunk_log(tracing.extend_trace_label(trace_label, 'dedupe_alt_tdd')): + with chunk.chunk_log(tracing.extend_trace_label(trace_label, "dedupe_alt_tdd")): if tdd_segments is not None: - dedupe_columns = ['out_period', 'in_period'] + dedupe_columns = ["out_period", "in_period"] # tdd_alt_segments is optionally segmented by tour purpose - if 'tour_purpose' in tdd_segments: + if "tour_purpose" in tdd_segments: - is_tdd_for_tour_purpose = (tdd_segments.tour_purpose == tour_purpose) + is_tdd_for_tour_purpose = tdd_segments.tour_purpose == tour_purpose if not is_tdd_for_tour_purpose.any(): is_tdd_for_tour_purpose = tdd_segments.tour_purpose.isnull() - assert is_tdd_for_tour_purpose.any(), \ - f"no segments found for tour purpose {tour_purpose} in tour_departure_and_duration_segments" + assert ( + is_tdd_for_tour_purpose.any() + ), f"no segments found for tour purpose {tour_purpose} in tour_departure_and_duration_segments" - tdd_segments = tdd_segments[is_tdd_for_tour_purpose].drop(columns=['tour_purpose']) - assert len(tdd_segments) > 0, f"tour_purpose '{tour_purpose}' not in tdd_alt_segments" + tdd_segments = tdd_segments[is_tdd_for_tour_purpose].drop( + columns=["tour_purpose"] + ) + assert ( + len(tdd_segments) > 0 + ), f"tour_purpose '{tour_purpose}' not in tdd_alt_segments" # left join representative start on out_period - alt_tdd_periods = \ - pd.merge(alt_tdd[['out_period', 'in_period']].reset_index(), - tdd_segments[['time_period', 'start']].rename(columns={'time_period': 'out_period'}), - how='left', on='out_period') + alt_tdd_periods = pd.merge( + alt_tdd[["out_period", "in_period"]].reset_index(), + tdd_segments[["time_period", "start"]].rename( + columns={"time_period": "out_period"} + ), + how="left", + on="out_period", + ) chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) # left join representative end on in_period - alt_tdd_periods = \ - pd.merge(alt_tdd_periods, - tdd_segments[['time_period', 'end']].rename(columns={'time_period': 'in_period'}), - how='left', on=['in_period']) + alt_tdd_periods = pd.merge( + alt_tdd_periods, + tdd_segments[["time_period", "end"]].rename( + columns={"time_period": "in_period"} + ), + how="left", + on=["in_period"], + ) chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) if tdd_segments.start.isnull().any(): - missing_periods = tdd_segments.out_period[tdd_segments.start.isnull()].unique() - logger.warning(f"missing out_periods in tdd_alt_segments: {missing_periods}") + missing_periods = tdd_segments.out_period[ + tdd_segments.start.isnull() + ].unique() + logger.warning( + f"missing out_periods in tdd_alt_segments: {missing_periods}" + ) if tdd_segments.end.isnull().any(): - missing_periods = tdd_segments.in_period[tdd_segments.end.isnull()].unique() - logger.warning(f"missing in_periods in tdd_alt_segments: {missing_periods}") + missing_periods = tdd_segments.in_period[ + tdd_segments.end.isnull() + ].unique() + logger.warning( + f"missing in_periods in tdd_alt_segments: {missing_periods}" + ) assert not tdd_segments.start.isnull().any() assert not tdd_segments.end.isnull().any() # drop duplicates - alt_tdd_periods = alt_tdd_periods.drop_duplicates().set_index(alt_tdd.index.name) + alt_tdd_periods = alt_tdd_periods.drop_duplicates().set_index( + alt_tdd.index.name + ) chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) # representative duration - alt_tdd_periods['duration'] = alt_tdd_periods['end'] - alt_tdd_periods['start'] + alt_tdd_periods["duration"] = ( + alt_tdd_periods["end"] - alt_tdd_periods["start"] + ) chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) - logger.debug(f"{trace_label} " - f"dedupe_alt_tdd.tdd_alt_segments reduced number of rows by " - f"{round(100 * (len(alt_tdd) - len(alt_tdd_periods)) / len(alt_tdd), 2)}% " - f"from {len(alt_tdd)} to {len(alt_tdd_periods)}") + logger.debug( + f"{trace_label} " + f"dedupe_alt_tdd.tdd_alt_segments reduced number of rows by " + f"{round(100 * (len(alt_tdd) - len(alt_tdd_periods)) / len(alt_tdd), 2)}% " + f"from {len(alt_tdd)} to {len(alt_tdd_periods)}" + ) # if there is no tdd_alt_segments file, we can at least dedupe on 'out_period', 'in_period', 'duration' if alt_tdd_periods is None: # FIXME This won't work if they reference start or end in logsum calculations # for MTC only duration is used (to calculate all_day parking cost) - dedupe_columns = ['out_period', 'in_period', 'duration'] + dedupe_columns = ["out_period", "in_period", "duration"] - logger.warning(f"No tdd_alt_segments for representative logsums so fallback to " - f"deduping tdd_alts by time_period and duration") + logger.warning( + f"No tdd_alt_segments for representative logsums so fallback to " + f"deduping tdd_alts by time_period and duration" + ) # - get list of unique (tour_id, out_period, in_period, duration) in alt_tdd_periods # we can cut the number of alts roughly in half (for mtctm1) by conflating duplicates - alt_tdd_periods = alt_tdd[dedupe_columns].reset_index().drop_duplicates().set_index(alt_tdd.index.name) + alt_tdd_periods = ( + alt_tdd[dedupe_columns] + .reset_index() + .drop_duplicates() + .set_index(alt_tdd.index.name) + ) chunk.log_df(trace_label, "alt_tdd_periods", alt_tdd_periods) - logger.debug(f"{trace_label} " - f"dedupe_alt_tdd.drop_duplicates reduced number of rows by " - f"{round(100 * (len(alt_tdd) - len(alt_tdd_periods)) / len(alt_tdd), 2)}% " - f"from {len(alt_tdd)} to {len(alt_tdd_periods)}") + logger.debug( + f"{trace_label} " + f"dedupe_alt_tdd.drop_duplicates reduced number of rows by " + f"{round(100 * (len(alt_tdd) - len(alt_tdd_periods)) / len(alt_tdd), 2)}% " + f"from {len(alt_tdd)} to {len(alt_tdd_periods)}" + ) return alt_tdd_periods, dedupe_columns -def compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, skims, trace_label): +def compute_logsums( + alt_tdd, tours_merged, tour_purpose, model_settings, skims, trace_label +): """ Compute logsums for the tour alt_tdds, which will differ based on their different start, stop times of day, which translate to different odt_skim out_period and in_periods. @@ -250,15 +312,15 @@ def compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, skims, (out-period, in-period) pairs and then join them back to the alt_tdds. """ - trace_label = tracing.extend_trace_label(trace_label, 'compute_logsums') - network_los = inject.get_injectable('network_los') + trace_label = tracing.extend_trace_label(trace_label, "compute_logsums") + network_los = inject.get_injectable("network_los") # - in_period and out_period - assert 'out_period' not in alt_tdd - assert 'in_period' not in alt_tdd - alt_tdd['out_period'] = network_los.skim_time_period_label(alt_tdd['start']) - alt_tdd['in_period'] = network_los.skim_time_period_label(alt_tdd['end']) - alt_tdd['duration'] = alt_tdd['end'] - alt_tdd['start'] + assert "out_period" not in alt_tdd + assert "in_period" not in alt_tdd + alt_tdd["out_period"] = network_los.skim_time_period_label(alt_tdd["start"]) + alt_tdd["in_period"] = network_los.skim_time_period_label(alt_tdd["end"]) + alt_tdd["duration"] = alt_tdd["end"] - alt_tdd["start"] # outside chunk_log context because we extend log_df call for alt_tdd made by our only caller _schedule_tours chunk.log_df(trace_label, "alt_tdd", alt_tdd) @@ -267,35 +329,56 @@ def compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, skims, if USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS: # compute logsums for all the tour alt_tdds (inefficient) - logsums = _compute_logsums(alt_tdd, tours_merged, tour_purpose, - model_settings, network_los, skims, trace_label) + logsums = _compute_logsums( + alt_tdd, + tours_merged, + tour_purpose, + model_settings, + network_los, + skims, + trace_label, + ) return logsums index_name = alt_tdd.index.name - deduped_alt_tdds, redupe_columns = dedupe_alt_tdd(alt_tdd, tour_purpose, trace_label) + deduped_alt_tdds, redupe_columns = dedupe_alt_tdd( + alt_tdd, tour_purpose, trace_label + ) chunk.log_df(trace_label, "deduped_alt_tdds", deduped_alt_tdds) - logger.info(f"{trace_label} compute_logsums " - f"deduped_alt_tdds reduced number of rows by " - f"{round(100 * (len(alt_tdd) - len(deduped_alt_tdds)) / len(alt_tdd), 2)}% " - f"from {len(alt_tdd)} to {len(deduped_alt_tdds)} compared to USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS") + logger.info( + f"{trace_label} compute_logsums " + f"deduped_alt_tdds reduced number of rows by " + f"{round(100 * (len(alt_tdd) - len(deduped_alt_tdds)) / len(alt_tdd), 2)}% " + f"from {len(alt_tdd)} to {len(deduped_alt_tdds)} compared to USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS" + ) t0 = tracing.print_elapsed_time() # - compute logsums for the alt_tdd_periods - deduped_alt_tdds['logsums'] = \ - _compute_logsums(deduped_alt_tdds, tours_merged, tour_purpose, - model_settings, network_los, skims, trace_label) + deduped_alt_tdds["logsums"] = _compute_logsums( + deduped_alt_tdds, + tours_merged, + tour_purpose, + model_settings, + network_los, + skims, + trace_label, + ) # tracing.log_runtime(model_name=trace_label, start_time=t0) # redupe - join the alt_tdd_period logsums to alt_tdd to get logsums for alt_tdd - logsums = pd.merge( - alt_tdd.reset_index(), - deduped_alt_tdds.reset_index(), - on=[index_name] + redupe_columns, - how='left' - ).set_index(index_name).logsums + logsums = ( + pd.merge( + alt_tdd.reset_index(), + deduped_alt_tdds.reset_index(), + on=[index_name] + redupe_columns, + how="left", + ) + .set_index(index_name) + .logsums + ) chunk.log_df(trace_label, "logsums", logsums) del deduped_alt_tdds @@ -304,20 +387,30 @@ def compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, skims, # this is really expensive TRACE = False if TRACE: - trace_logsums_df = logsums.to_frame('representative_logsum') - trace_logsums_df['brute_force_logsum'] = \ - _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, network_los, skims, trace_label) - tracing.trace_df(trace_logsums_df, - label=tracing.extend_trace_label(trace_label, 'representative_logsums'), - slicer='NONE', transpose=False) + trace_logsums_df = logsums.to_frame("representative_logsum") + trace_logsums_df["brute_force_logsum"] = _compute_logsums( + alt_tdd, + tours_merged, + tour_purpose, + model_settings, + network_los, + skims, + trace_label, + ) + tracing.trace_df( + trace_logsums_df, + label=tracing.extend_trace_label(trace_label, "representative_logsums"), + slicer="NONE", + transpose=False, + ) # leave it to our caller to pick up logsums with call to chunk.log_df return logsums -def get_previous_tour_by_tourid(current_tour_window_ids, - previous_tour_by_window_id, - alts): +def get_previous_tour_by_tourid( + current_tour_window_ids, previous_tour_by_window_id, alts +): """ Matches current tours with attributes of previous tours for the same person. See the return value below for more information. @@ -344,20 +437,21 @@ def get_previous_tour_by_tourid(current_tour_window_ids, interaction. """ - PREV_TOUR_COLUMNS = ['start', 'end'] + PREV_TOUR_COLUMNS = ["start", "end"] - previous_tour_by_tourid = \ - previous_tour_by_window_id.loc[current_tour_window_ids] + previous_tour_by_tourid = previous_tour_by_window_id.loc[current_tour_window_ids] previous_tour_by_tourid = alts.loc[previous_tour_by_tourid, PREV_TOUR_COLUMNS] previous_tour_by_tourid.index = current_tour_window_ids.index - previous_tour_by_tourid.columns = [x+'_previous' for x in PREV_TOUR_COLUMNS] + previous_tour_by_tourid.columns = [x + "_previous" for x in PREV_TOUR_COLUMNS] return previous_tour_by_tourid -def tdd_interaction_dataset(tours, alts, timetable, choice_column, window_id_col, trace_label): +def tdd_interaction_dataset( + tours, alts, timetable, choice_column, window_id_col, trace_label +): """ interaction_sample_simulate expects alts index same as choosers (e.g. tour_id) @@ -383,11 +477,11 @@ def tdd_interaction_dataset(tours, alts, timetable, choice_column, window_id_col """ - trace_label = tracing.extend_trace_label(trace_label, 'tdd_interaction_dataset') + trace_label = tracing.extend_trace_label(trace_label, "tdd_interaction_dataset") with chunk.chunk_log(trace_label): alts_ids = np.tile(alts.index, len(tours.index)) - chunk.log_df(trace_label, 'alts_ids', alts_ids) + chunk.log_df(trace_label, "alts_ids", alts_ids) tour_ids = np.repeat(tours.index, len(alts.index)) window_row_ids = np.repeat(tours[window_id_col], len(alts.index)) @@ -402,15 +496,21 @@ def tdd_interaction_dataset(tours, alts, timetable, choice_column, window_id_col alt_tdd.insert(loc=0, column=choice_column, value=alts_ids) # slice out all non-available tours - available = timetable.tour_available(alt_tdd[window_id_col], alt_tdd[choice_column]) - logger.debug(f"tdd_interaction_dataset keeping {available.sum()} of ({len(available)}) available alt_tdds") + available = timetable.tour_available( + alt_tdd[window_id_col], alt_tdd[choice_column] + ) + logger.debug( + f"tdd_interaction_dataset keeping {available.sum()} of ({len(available)}) available alt_tdds" + ) assert available.any() - chunk.log_df(trace_label, 'alt_tdd', alt_tdd) # catch this before we slice on available + chunk.log_df( + trace_label, "alt_tdd", alt_tdd + ) # catch this before we slice on available alt_tdd = alt_tdd[available] - chunk.log_df(trace_label, 'alt_tdd', alt_tdd) + chunk.log_df(trace_label, "alt_tdd", alt_tdd) # FIXME - don't need this any more after slicing del alt_tdd[window_id_col] @@ -442,41 +542,57 @@ def run_alts_preprocessor(model_settings, alts, segment, locals_dict, trace_labe annotated copy of alts """ - preprocessor_settings = model_settings.get('ALTS_PREPROCESSOR', {}) + preprocessor_settings = model_settings.get("ALTS_PREPROCESSOR", {}) if segment in preprocessor_settings: # segmented by logsum_tour_purpose preprocessor_settings = preprocessor_settings.get(segment) - logger.debug(f"running ALTS_PREPROCESSOR with spec for {segment}: {preprocessor_settings.get('SPEC')}") - elif 'SPEC' in preprocessor_settings: + logger.debug( + f"running ALTS_PREPROCESSOR with spec for {segment}: {preprocessor_settings.get('SPEC')}" + ) + elif "SPEC" in preprocessor_settings: # unsegmented (either because no segmentation, or fallback if settings has generic preprocessor) - logger.debug(f"running ALTS_PREPROCESSOR with unsegmented spec {preprocessor_settings.get('SPEC')}") + logger.debug( + f"running ALTS_PREPROCESSOR with unsegmented spec {preprocessor_settings.get('SPEC')}" + ) else: - logger.debug(f"skipping alts preprocesser because no ALTS_PREPROCESSOR segment for {segment}") + logger.debug( + f"skipping alts preprocesser because no ALTS_PREPROCESSOR segment for {segment}" + ) preprocessor_settings = None if preprocessor_settings: - logger.debug(f"run_alts_preprocessor calling assign_columns for {segment} preprocessor_settings") + logger.debug( + f"run_alts_preprocessor calling assign_columns for {segment} preprocessor_settings" + ) alts = alts.copy() expressions.assign_columns( df=alts, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) return alts def _schedule_tours( - tours, persons_merged, alts, - spec, logsum_tour_purpose, - model_settings, skims, - timetable, window_id_col, - previous_tour, tour_owner_id_col, - estimator, - tour_trace_label): + tours, + persons_merged, + alts, + spec, + logsum_tour_purpose, + model_settings, + skims, + timetable, + window_id_col, + previous_tour, + tour_owner_id_col, + estimator, + tour_trace_label, +): """ previous_tour stores values used to add columns that can be used in the spec which have to do with the previous tours per person. Every column in the @@ -521,12 +637,19 @@ def _schedule_tours( """ - logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours))) + logger.info( + "%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)) + ) # merge persons into tours # avoid dual suffix for redundant columns names (e.g. household_id) that appear in both - tours = pd.merge(tours, persons_merged, left_on='person_id', right_index=True, - suffixes=('', '_y')) + tours = pd.merge( + tours, + persons_merged, + left_on="person_id", + right_index=True, + suffixes=("", "_y"), + ) chunk.log_df(tour_trace_label, "tours", tours) # - add explicit window_id_col for timetable owner if it is index @@ -543,29 +666,33 @@ def _schedule_tours( # dataframe columns start, end , duration, person_id, tdd # indexed (not unique) on tour_id choice_column = TDD_CHOICE_COLUMN - alt_tdd = tdd_interaction_dataset(tours, alts, timetable, choice_column, window_id_col, tour_trace_label) + alt_tdd = tdd_interaction_dataset( + tours, alts, timetable, choice_column, window_id_col, tour_trace_label + ) # print(f"tours {tours.shape} alts {alts.shape}") chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) # - add logsums if logsum_tour_purpose: - logsums = compute_logsums(alt_tdd, tours, logsum_tour_purpose, model_settings, skims, tour_trace_label) + logsums = compute_logsums( + alt_tdd, tours, logsum_tour_purpose, model_settings, skims, tour_trace_label + ) else: logsums = 0 - alt_tdd['mode_choice_logsum'] = logsums + alt_tdd["mode_choice_logsum"] = logsums del logsums chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) # - merge in previous tour columns # adds start_previous and end_previous, joins on index - tours = tours.join(get_previous_tour_by_tourid(tours[tour_owner_id_col], previous_tour, alts)) + tours = tours.join( + get_previous_tour_by_tourid(tours[tour_owner_id_col], previous_tour, alts) + ) chunk.log_df(tour_trace_label, "tours", tours) # - make choices - locals_d = { - 'tt': timetable - } + locals_d = {"tt": timetable} constants = config.get_model_constants(model_settings) if constants is not None: locals_d.update(constants) @@ -578,8 +705,12 @@ def _schedule_tours( # In any case, I don't see any benefit to doing this here - at least not for any existing implementations # but if we do, it will require passing spec_segment to schedule_tours and _schedule_tours # or redundently segmenting alts (yuck!) to conform to more granular tour_segmentation (e.g. univ do school) - spec_segment = logsum_tour_purpose # FIXME this is not always right - see note above - alt_tdd = run_alts_preprocessor(model_settings, alt_tdd, spec_segment, locals_d, tour_trace_label) + spec_segment = ( + logsum_tour_purpose # FIXME this is not always right - see note above + ) + alt_tdd = run_alts_preprocessor( + model_settings, alt_tdd, spec_segment, locals_d, tour_trace_label + ) chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) if estimator: @@ -588,7 +719,7 @@ def _schedule_tours( estimator.set_alt_id(choice_column) estimator.write_interaction_sample_alternatives(alt_tdd) - log_alt_losers = config.setting('log_alt_losers', False) + log_alt_losers = config.setting("log_alt_losers", False) choices = interaction_sample_simulate( tours, @@ -599,9 +730,9 @@ def _schedule_tours( locals_d=locals_d, chunk_size=0, trace_label=tour_trace_label, - estimator=estimator + estimator=estimator, ) - chunk.log_df(tour_trace_label, 'choices', choices) + chunk.log_df(tour_trace_label, "choices", choices) # - update previous_tour and timetable parameters @@ -615,13 +746,21 @@ def _schedule_tours( def schedule_tours( - tours, persons_merged, alts, - spec, logsum_tour_purpose, - model_settings, - timetable, timetable_window_id_col, - previous_tour, tour_owner_id_col, - estimator, - chunk_size, tour_trace_label, tour_chunk_tag): + tours, + persons_merged, + alts, + spec, + logsum_tour_purpose, + model_settings, + timetable, + timetable_window_id_col, + previous_tour, + tour_owner_id_col, + estimator, + chunk_size, + tour_trace_label, + tour_chunk_tag, +): """ chunking wrapper for _schedule_tours @@ -636,7 +775,9 @@ def schedule_tours( logger.info("schedule_tours %s tours not monotonic_increasing - sorting df") tours = tours.sort_index() - logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours))) + logger.info( + "%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)) + ) # no more than one tour per timetable_window per call if timetable_window_id_col is None: @@ -644,27 +785,36 @@ def schedule_tours( else: assert not tours[timetable_window_id_col].duplicated().any() - if 'LOGSUM_SETTINGS' in model_settings: + if "LOGSUM_SETTINGS" in model_settings: # we need skims to calculate tvpb skim overhead in 3_ZONE systems for use by calc_rows_per_chunk skims = skims_for_logsums(logsum_tour_purpose, model_settings, tour_trace_label) else: skims = None result_list = [] - for i, chooser_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers(tours, chunk_size, tour_trace_label, tour_chunk_tag): - - choices = _schedule_tours(chooser_chunk, persons_merged, - alts, spec, logsum_tour_purpose, - model_settings, skims, - timetable, timetable_window_id_col, - previous_tour, tour_owner_id_col, - estimator, - tour_trace_label=chunk_trace_label) + for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + tours, chunk_size, tour_trace_label, tour_chunk_tag + ): + + choices = _schedule_tours( + chooser_chunk, + persons_merged, + alts, + spec, + logsum_tour_purpose, + model_settings, + skims, + timetable, + timetable_window_id_col, + previous_tour, + tour_owner_id_col, + estimator, + tour_trace_label=chunk_trace_label, + ) result_list.append(choices) - chunk.log_df(tour_trace_label, f'result_list', result_list) + chunk.log_df(tour_trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -677,10 +827,17 @@ def schedule_tours( return choices -def vectorize_tour_scheduling(tours, persons_merged, alts, timetable, - tour_segments, tour_segment_col, - model_settings, - chunk_size=0, trace_label=None): +def vectorize_tour_scheduling( + tours, + persons_merged, + alts, + timetable, + tour_segments, + tour_segment_col, + model_settings, + chunk_size=0, + trace_label=None, +): """ The purpose of this method is fairly straightforward - it takes tours and schedules them into time slots. Alternatives should be specified so @@ -721,11 +878,11 @@ def vectorize_tour_scheduling(tours, persons_merged, alts, timetable, persons timetable updated with tours (caller should replace_table for it to persist) """ - trace_label = tracing.extend_trace_label(trace_label, 'vectorize_tour_scheduling') + trace_label = tracing.extend_trace_label(trace_label, "vectorize_tour_scheduling") assert len(tours.index) > 0 - assert 'tour_num' in tours.columns - assert 'tour_type' in tours.columns + assert "tour_num" in tours.columns + assert "tour_type" in tours.columns # tours must be scheduled in increasing trip_num order # second trip of type must be in group immediately following first @@ -736,9 +893,9 @@ def vectorize_tour_scheduling(tours, persons_merged, alts, timetable, # initialize with first trip from alts previous_tour_by_personid = pd.Series(alts.index[0], index=tours.person_id.unique()) - timetable_window_id_col = 'person_id' - tour_owner_id_col = 'person_id' - compute_logsums = ('LOGSUM_SETTINGS' in model_settings) + timetable_window_id_col = "person_id" + tour_owner_id_col = "person_id" + compute_logsums = "LOGSUM_SETTINGS" in model_settings assert isinstance(tour_segments, dict) @@ -747,44 +904,62 @@ def vectorize_tour_scheduling(tours, persons_merged, alts, timetable, # second trip of type must be in group immediately following first # segregate scheduling by tour_type if multiple specs passed in dict keyed by tour_type - for tour_num, nth_tours in tours.groupby('tour_num', sort=True): + for tour_num, nth_tours in tours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f'tour_{tour_num}') - tour_chunk_tag = tracing.extend_trace_label(trace_label, f"tour_{1 if tour_num == 1 else 'n'}") + tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") + tour_chunk_tag = tracing.extend_trace_label( + trace_label, f"tour_{1 if tour_num == 1 else 'n'}" + ) if tour_segment_col is not None: for tour_segment_name, tour_segment_info in tour_segments.items(): - segment_trace_label = tracing.extend_trace_label(tour_trace_label, tour_segment_name) - segment_chunk_tag = tracing.extend_trace_label(tour_chunk_tag, tour_segment_name) + segment_trace_label = tracing.extend_trace_label( + tour_trace_label, tour_segment_name + ) + segment_chunk_tag = tracing.extend_trace_label( + tour_chunk_tag, tour_segment_name + ) # assume segmentation of spec and coefficients are aligned - spec_segment_name = tour_segment_info.get('spec_segment_name') + spec_segment_name = tour_segment_info.get("spec_segment_name") # assume logsum segmentation is same as tours logsum_tour_purpose = tour_segment_name if compute_logsums else None - nth_tours_in_segment = nth_tours[nth_tours[tour_segment_col] == tour_segment_name] + nth_tours_in_segment = nth_tours[ + nth_tours[tour_segment_col] == tour_segment_name + ] if nth_tours_in_segment.empty: logger.info("skipping empty segment %s" % tour_segment_name) continue if RUN_ALTS_PREPROCESSOR_BEFORE_MERGE: locals_dict = {} - alts = run_alts_preprocessor(model_settings, alts, spec_segment_name, locals_dict, tour_trace_label) - - choices = \ - schedule_tours(nth_tours_in_segment, persons_merged, alts, - spec=tour_segment_info['spec'], - logsum_tour_purpose=logsum_tour_purpose, - model_settings=model_settings, - timetable=timetable, - timetable_window_id_col=timetable_window_id_col, - previous_tour=previous_tour_by_personid, - tour_owner_id_col=tour_owner_id_col, - estimator=tour_segment_info.get('estimator'), - chunk_size=chunk_size, - tour_trace_label=segment_trace_label, tour_chunk_tag=segment_chunk_tag) + alts = run_alts_preprocessor( + model_settings, + alts, + spec_segment_name, + locals_dict, + tour_trace_label, + ) + + choices = schedule_tours( + nth_tours_in_segment, + persons_merged, + alts, + spec=tour_segment_info["spec"], + logsum_tour_purpose=logsum_tour_purpose, + model_settings=model_settings, + timetable=timetable, + timetable_window_id_col=timetable_window_id_col, + previous_tour=previous_tour_by_personid, + tour_owner_id_col=tour_owner_id_col, + estimator=tour_segment_info.get("estimator"), + chunk_size=chunk_size, + tour_trace_label=segment_trace_label, + tour_chunk_tag=segment_chunk_tag, + ) choice_list.append(choices) @@ -793,21 +968,27 @@ def vectorize_tour_scheduling(tours, persons_merged, alts, timetable, # MTC non_mandatory_tours are not segmented by tour_purpose and do not require logsums # FIXME should support logsums? - assert not compute_logsums, "logsums for unsegmented spec not implemented because not currently needed" - assert tour_segments.get('spec_segment_name') is None - - choices = \ - schedule_tours(nth_tours, persons_merged, alts, - spec=tour_segments['spec'], - logsum_tour_purpose=None, - model_settings=model_settings, - timetable=timetable, - timetable_window_id_col=timetable_window_id_col, - previous_tour=previous_tour_by_personid, - tour_owner_id_col=tour_owner_id_col, - estimator=tour_segments.get('estimator'), - chunk_size=chunk_size, - tour_trace_label=tour_trace_label, tour_chunk_tag=tour_chunk_tag) + assert ( + not compute_logsums + ), "logsums for unsegmented spec not implemented because not currently needed" + assert tour_segments.get("spec_segment_name") is None + + choices = schedule_tours( + nth_tours, + persons_merged, + alts, + spec=tour_segments["spec"], + logsum_tour_purpose=None, + model_settings=model_settings, + timetable=timetable, + timetable_window_id_col=timetable_window_id_col, + previous_tour=previous_tour_by_personid, + tour_owner_id_col=tour_owner_id_col, + estimator=tour_segments.get("estimator"), + chunk_size=chunk_size, + tour_trace_label=tour_trace_label, + tour_chunk_tag=tour_chunk_tag, + ) choice_list.append(choices) @@ -815,10 +996,17 @@ def vectorize_tour_scheduling(tours, persons_merged, alts, timetable, return choices -def vectorize_subtour_scheduling(parent_tours, subtours, persons_merged, alts, spec, - model_settings, - estimator, - chunk_size=0, trace_label=None): +def vectorize_subtour_scheduling( + parent_tours, + subtours, + persons_merged, + alts, + spec, + model_settings, + estimator, + chunk_size=0, + trace_label=None, +): """ Like vectorize_tour_scheduling but specifically for atwork subtours @@ -854,14 +1042,14 @@ def vectorize_subtour_scheduling(parent_tours, subtours, persons_merged, alts, s DataFrame and the values are the index of the alts DataFrame. """ if not trace_label: - trace_label = 'vectorize_non_mandatory_tour_scheduling' + trace_label = "vectorize_non_mandatory_tour_scheduling" assert len(subtours.index) > 0 - assert 'tour_num' in subtours.columns - assert 'tour_type' in subtours.columns + assert "tour_num" in subtours.columns + assert "tour_type" in subtours.columns - timetable_window_id_col = 'parent_tour_id' - tour_owner_id_col = 'parent_tour_id' + timetable_window_id_col = "parent_tour_id" + tour_owner_id_col = "parent_tour_id" logsum_tour_purpose = None # FIXME logsums not currently supported # timetable with a window for each parent tour @@ -883,30 +1071,40 @@ def vectorize_subtour_scheduling(parent_tours, subtours, persons_merged, alts, s # keep a series of the the most recent tours for each person # initialize with first trip from alts - previous_tour_by_parent_tour_id = \ - pd.Series(alts.index[0], index=subtours['parent_tour_id'].unique()) + previous_tour_by_parent_tour_id = pd.Series( + alts.index[0], index=subtours["parent_tour_id"].unique() + ) # tours must be scheduled in increasing trip_num order # second trip of type must be in group immediately following first # this ought to have been ensured when tours are created (tour_frequency.process_tours) - for tour_num, nth_tours in subtours.groupby('tour_num', sort=True): + for tour_num, nth_tours in subtours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f'tour_{tour_num}') - tour_chunk_tag = tracing.extend_trace_label(trace_label, f"tour_{1 if tour_num == 1 else 'n'}") + tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") + tour_chunk_tag = tracing.extend_trace_label( + trace_label, f"tour_{1 if tour_num == 1 else 'n'}" + ) # no more than one tour per timetable window per call to schedule_tours assert not nth_tours.parent_tour_id.duplicated().any() - choices = \ - schedule_tours(nth_tours, - persons_merged, alts, - spec, logsum_tour_purpose, - model_settings, - timetable, timetable_window_id_col, - previous_tour_by_parent_tour_id, tour_owner_id_col, - estimator, - chunk_size, tour_trace_label, tour_chunk_tag) + choices = schedule_tours( + nth_tours, + persons_merged, + alts, + spec, + logsum_tour_purpose, + model_settings, + timetable, + timetable_window_id_col, + previous_tour_by_parent_tour_id, + tour_owner_id_col, + estimator, + chunk_size, + tour_trace_label, + tour_chunk_tag, + ) choice_list.append(choices) @@ -925,30 +1123,43 @@ def vectorize_subtour_scheduling(parent_tours, subtours, persons_merged, alts, s return choices -def build_joint_tour_timetables(joint_tours, joint_tour_participants, persons_timetable, alts): +def build_joint_tour_timetables( + joint_tours, joint_tour_participants, persons_timetable, alts +): # timetable with a window for each joint tour joint_tour_windows_df = tt.create_timetable_windows(joint_tours, alts) joint_tour_timetable = tt.TimeTable(joint_tour_windows_df, alts) - for participant_num, nth_participants in \ - joint_tour_participants.groupby('participant_num', sort=True): + for participant_num, nth_participants in joint_tour_participants.groupby( + "participant_num", sort=True + ): # nth_participant windows from persons_timetable - participant_windows = persons_timetable.slice_windows_by_row_id(nth_participants.person_id) + participant_windows = persons_timetable.slice_windows_by_row_id( + nth_participants.person_id + ) # assign them joint_tour_timetable - joint_tour_timetable.assign_footprints(nth_participants.tour_id, participant_windows) + joint_tour_timetable.assign_footprints( + nth_participants.tour_id, participant_windows + ) return joint_tour_timetable def vectorize_joint_tour_scheduling( - joint_tours, joint_tour_participants, - persons_merged, alts, persons_timetable, - spec, model_settings, - estimator, - chunk_size=0, trace_label=None): + joint_tours, + joint_tour_participants, + persons_merged, + alts, + persons_timetable, + spec, + model_settings, + estimator, + chunk_size=0, + trace_label=None, +): """ Like vectorize_tour_scheduling but specifically for joint tours @@ -980,21 +1191,25 @@ def vectorize_joint_tour_scheduling( timetable updated with joint tours (caller should replace_table for it to persist) """ - trace_label = tracing.extend_trace_label(trace_label, 'vectorize_joint_tour_scheduling') + trace_label = tracing.extend_trace_label( + trace_label, "vectorize_joint_tour_scheduling" + ) assert len(joint_tours.index) > 0 - assert 'tour_num' in joint_tours.columns - assert 'tour_type' in joint_tours.columns + assert "tour_num" in joint_tours.columns + assert "tour_type" in joint_tours.columns timetable_window_id_col = None - tour_owner_id_col = 'household_id' + tour_owner_id_col = "household_id" logsum_tour_purpose = None # FIXME logsums not currently supported choice_list = [] # keep a series of the the most recent tours for each person # initialize with first trip from alts - previous_tour_by_householdid = pd.Series(alts.index[0], index=joint_tours.household_id.unique()) + previous_tour_by_householdid = pd.Series( + alts.index[0], index=joint_tours.household_id.unique() + ) # tours must be scheduled in increasing trip_num order # second trip of type must be in group immediately following first @@ -1003,36 +1218,45 @@ def vectorize_joint_tour_scheduling( # print "participant windows before scheduling\n%s" % \ # persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id) - for tour_num, nth_tours in joint_tours.groupby('tour_num', sort=True): + for tour_num, nth_tours in joint_tours.groupby("tour_num", sort=True): - tour_trace_label = tracing.extend_trace_label(trace_label, f'tour_{tour_num}') - tour_chunk_tag = tracing.extend_trace_label(trace_label, f"tour_{1 if tour_num == 1 else 'n'}") + tour_trace_label = tracing.extend_trace_label(trace_label, f"tour_{tour_num}") + tour_chunk_tag = tracing.extend_trace_label( + trace_label, f"tour_{1 if tour_num == 1 else 'n'}" + ) # no more than one tour per household per call to schedule_tours assert not nth_tours.household_id.duplicated().any() - nth_participants = \ - joint_tour_participants[joint_tour_participants.tour_id.isin(nth_tours.index)] + nth_participants = joint_tour_participants[ + joint_tour_participants.tour_id.isin(nth_tours.index) + ] timetable = build_joint_tour_timetables( - nth_tours, nth_participants, - persons_timetable, alts) - - choices = \ - schedule_tours(nth_tours, - persons_merged, alts, - spec, - logsum_tour_purpose, - model_settings, - timetable, timetable_window_id_col, - previous_tour_by_householdid, tour_owner_id_col, - estimator, - chunk_size, tour_trace_label, tour_chunk_tag) + nth_tours, nth_participants, persons_timetable, alts + ) + + choices = schedule_tours( + nth_tours, + persons_merged, + alts, + spec, + logsum_tour_purpose, + model_settings, + timetable, + timetable_window_id_col, + previous_tour_by_householdid, + tour_owner_id_col, + estimator, + chunk_size, + tour_trace_label, + tour_chunk_tag, + ) # - update timetables of all joint tour participants persons_timetable.assign( - nth_participants.person_id, - reindex(choices, nth_participants.tour_id)) + nth_participants.person_id, reindex(choices, nth_participants.tour_id) + ) choice_list.append(choices) diff --git a/activitysim/abm/tables/__init__.py b/activitysim/abm/tables/__init__.py index 24dd5a13d6..97d56b0098 100644 --- a/activitysim/abm/tables/__init__.py +++ b/activitysim/abm/tables/__init__.py @@ -1,14 +1,15 @@ # ActivitySim # See full license in LICENSE.txt. -from . import households -from . import persons -from . import landuse -from . import accessibility -from . import skims -from . import tours -from . import size_terms -from . import trips -from . import time_windows -from . import shadow_pricing - -from . import table_dict +from . import ( + accessibility, + households, + landuse, + persons, + shadow_pricing, + size_terms, + skims, + table_dict, + time_windows, + tours, + trips, +) diff --git a/activitysim/abm/tables/accessibility.py b/activitysim/abm/tables/accessibility.py index 5be410f1f1..3fd0544216 100644 --- a/activitysim/abm/tables/accessibility.py +++ b/activitysim/abm/tables/accessibility.py @@ -27,13 +27,16 @@ def accessibility(land_use): if accessibility_df is None: accessibility_df = pd.DataFrame(index=land_use.index) - logger.debug("created placeholder accessibility table %s" % (accessibility_df.shape,)) + logger.debug( + "created placeholder accessibility table %s" % (accessibility_df.shape,) + ) else: - assert accessibility_df.sort_index().index.equals(land_use.to_frame().sort_index().index), \ - f"loaded accessibility table index does not match index of land_use table" + assert accessibility_df.sort_index().index.equals( + land_use.to_frame().sort_index().index + ), f"loaded accessibility table index does not match index of land_use table" logger.info("loaded land_use %s" % (accessibility_df.shape,)) # replace table function with dataframe - inject.add_table('accessibility', accessibility_df) + inject.add_table("accessibility", accessibility_df) return accessibility_df diff --git a/activitysim/abm/tables/households.py b/activitysim/abm/tables/households.py index 5d52ec004d..9c115f17ce 100644 --- a/activitysim/abm/tables/households.py +++ b/activitysim/abm/tables/households.py @@ -1,16 +1,11 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range - import logging +from builtins import range import pandas as pd -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import mem - +from activitysim.core import inject, mem, pipeline, tracing from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) @@ -30,17 +25,21 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): if override_hh_ids is not None: # trace_hh_id will not used if it is not in list of override_hh_ids - logger.info("override household list containing %s households" % len(override_hh_ids)) + logger.info( + "override household list containing %s households" % len(override_hh_ids) + ) df = df_full[df_full.index.isin(override_hh_ids)] households_sliced = True if df.shape[0] < len(override_hh_ids): - logger.info("found %s of %s households in override household list" % - (df.shape[0], len(override_hh_ids))) + logger.info( + "found %s of %s households in override household list" + % (df.shape[0], len(override_hh_ids)) + ) if df.shape[0] == 0: - raise RuntimeError('No override households found in store') + raise RuntimeError("No override households found in store") # if we are tracing hh exclusively elif trace_hh_id and households_sample_size == 1: @@ -52,7 +51,9 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): # if we need a subset of full store elif tot_households > households_sample_size > 0: - logger.info("sampling %s of %s households" % (households_sample_size, tot_households)) + logger.info( + "sampling %s of %s households" % (households_sample_size, tot_households) + ) """ Because random seed is set differently for each step, sampling of households using @@ -64,15 +65,19 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): if the pipeline rng's base_seed is changed """ - prng = pipeline.get_rn_generator().get_external_rng('sample_households') - df = df_full.take(prng.choice(len(df_full), size=households_sample_size, replace=False)) + prng = pipeline.get_rn_generator().get_external_rng("sample_households") + df = df_full.take( + prng.choice(len(df_full), size=households_sample_size, replace=False) + ) households_sliced = True # if tracing and we missed trace_hh in sample, but it is in full store if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index: # replace first hh in sample with trace_hh - logger.debug("replacing household %s with %s in household sample" % - (df.index[0], trace_hh_id)) + logger.debug( + "replacing household %s with %s in household sample" + % (df.index[0], trace_hh_id) + ) df_hh = df_full.loc[[trace_hh_id]] df = pd.concat([df_hh, df[1:]]) @@ -80,24 +85,24 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): df = df_full # persons table - inject.add_injectable('households_sliced', households_sliced) + inject.add_injectable("households_sliced", households_sliced) - if 'sample_rate' not in df.columns: + if "sample_rate" not in df.columns: if households_sample_size == 0: sample_rate = 1 else: sample_rate = round(households_sample_size / tot_households, 3) - df['sample_rate'] = sample_rate + df["sample_rate"] = sample_rate logger.info("loaded households %s" % (df.shape,)) # replace table function with dataframe - inject.add_table('households', df) + inject.add_table("households", df) - pipeline.get_rn_generator().add_channel('households', df) + pipeline.get_rn_generator().add_channel("households", df) - tracing.register_traceable_table('households', df) + tracing.register_traceable_table("households", df) if trace_hh_id: tracing.trace_df(df, "raw.households", warn_if_empty=True) @@ -107,11 +112,13 @@ def households(households_sample_size, override_hh_ids, trace_hh_id): # this is a common merge so might as well define it once here and use it @inject.table() def households_merged(households, land_use, accessibility): - return inject.merge_tables(households.name, tables=[households, land_use, accessibility]) + return inject.merge_tables( + households.name, tables=[households, land_use, accessibility] + ) -inject.broadcast('households', 'persons', cast_index=True, onto_on='household_id') +inject.broadcast("households", "persons", cast_index=True, onto_on="household_id") # this would be accessibility around the household location - be careful with # this one as accessibility at some other location can also matter -inject.broadcast('accessibility', 'households', cast_index=True, onto_on='home_zone_id') +inject.broadcast("accessibility", "households", cast_index=True, onto_on="home_zone_id") diff --git a/activitysim/abm/tables/landuse.py b/activitysim/abm/tables/landuse.py index da0a5f8637..cf4e72420e 100644 --- a/activitysim/abm/tables/landuse.py +++ b/activitysim/abm/tables/landuse.py @@ -24,9 +24,9 @@ def land_use(): logger.info("loaded land_use %s" % (df.shape,)) # replace table function with dataframe - inject.add_table('land_use', df) + inject.add_table("land_use", df) return df -inject.broadcast('land_use', 'households', cast_index=True, onto_on='home_zone_id') +inject.broadcast("land_use", "households", cast_index=True, onto_on="home_zone_id") diff --git a/activitysim/abm/tables/persons.py b/activitysim/abm/tables/persons.py index 6eb1683e76..a193b8f6dd 100644 --- a/activitysim/abm/tables/persons.py +++ b/activitysim/abm/tables/persons.py @@ -4,11 +4,7 @@ import pandas as pd -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import tracing -from activitysim.core import mem - +from activitysim.core import inject, mem, pipeline, tracing from activitysim.core.input import read_input_table logger = logging.getLogger(__name__) @@ -18,7 +14,7 @@ def read_raw_persons(households): df = read_input_table("persons") - if inject.get_injectable('households_sliced', False): + if inject.get_injectable("households_sliced", False): # keep only persons in the sampled households df = df[df.household_id.isin(households.index)] @@ -33,11 +29,11 @@ def persons(households, trace_hh_id): logger.info("loaded persons %s" % (df.shape,)) # replace table function with dataframe - inject.add_table('persons', df) + inject.add_table("persons", df) - pipeline.get_rn_generator().add_channel('persons', df) + pipeline.get_rn_generator().add_channel("persons", df) - tracing.register_traceable_table('persons', df) + tracing.register_traceable_table("persons", df) if trace_hh_id: tracing.trace_df(df, "raw.persons", warn_if_empty=True) @@ -48,15 +44,25 @@ def persons(households, trace_hh_id): persons_without_households = ~df.household_id.isin(households.index) if persons_without_households.any(): - logger.error(f"{persons_without_households.sum()} persons out of {len(persons)} without households\n" - f"{pd.Series({'person_id': persons_without_households.index.values})}") - raise RuntimeError(f"{persons_without_households.sum()} persons with bad household_id") - - households_without_persons = df.groupby('household_id').size().reindex(households.index).isnull() + logger.error( + f"{persons_without_households.sum()} persons out of {len(persons)} without households\n" + f"{pd.Series({'person_id': persons_without_households.index.values})}" + ) + raise RuntimeError( + f"{persons_without_households.sum()} persons with bad household_id" + ) + + households_without_persons = ( + df.groupby("household_id").size().reindex(households.index).isnull() + ) if households_without_persons.any(): - logger.error(f"{households_without_persons.sum()} households out of {len(households.index)} without persons\n" - f"{pd.Series({'household_id': households_without_persons.index.values})}") - raise RuntimeError(f"{households_without_persons.sum()} households with no persons") + logger.error( + f"{households_without_persons.sum()} households out of {len(households.index)} without persons\n" + f"{pd.Series({'household_id': households_without_persons.index.values})}" + ) + raise RuntimeError( + f"{households_without_persons.sum()} households with no persons" + ) return df @@ -65,4 +71,6 @@ def persons(households, trace_hh_id): @inject.table() def persons_merged(persons, households, land_use, accessibility): - return inject.merge_tables(persons.name, tables=[persons, households, land_use, accessibility]) + return inject.merge_tables( + persons.name, tables=[persons, households, land_use, accessibility] + ) diff --git a/activitysim/abm/tables/shadow_pricing.py b/activitysim/abm/tables/shadow_pricing.py index 823fd5515c..053b1dcbc5 100644 --- a/activitysim/abm/tables/shadow_pricing.py +++ b/activitysim/abm/tables/shadow_pricing.py @@ -1,22 +1,16 @@ # ActivitySim # See full license in LICENSE.txt. +import ctypes import logging -import time import multiprocessing -import ctypes - +import time from collections import OrderedDict import numpy as np import pandas as pd -from activitysim.core import inject -from activitysim.core import util -from activitysim.core import config -from activitysim.core import tracing - from activitysim.abm.tables.size_terms import tour_destination_size_terms - +from activitysim.core import config, inject, tracing, util logger = logging.getLogger(__name__) @@ -77,8 +71,9 @@ def size_table_name(model_selector): class ShadowPriceCalculator(object): - - def __init__(self, model_settings, num_processes, shared_data=None, shared_data_lock=None): + def __init__( + self, model_settings, num_processes, shared_data=None, shared_data_lock=None + ): """ Presence of shared_data is used as a flag for multiprocessing @@ -97,42 +92,57 @@ def __init__(self, model_settings, num_processes, shared_data=None, shared_data_ """ self.num_processes = num_processes - self.use_shadow_pricing = bool(config.setting('use_shadow_pricing')) - self.saved_shadow_price_file_path = None # set by read_saved_shadow_prices if loaded + self.use_shadow_pricing = bool(config.setting("use_shadow_pricing")) + self.saved_shadow_price_file_path = ( + None # set by read_saved_shadow_prices if loaded + ) - self.model_selector = model_settings['MODEL_SELECTOR'] + self.model_selector = model_settings["MODEL_SELECTOR"] - full_model_run = config.setting('households_sample_size') == 0 + full_model_run = config.setting("households_sample_size") == 0 if self.use_shadow_pricing and not full_model_run: - logger.warning("deprecated combination of use_shadow_pricing and not full_model_run") + logger.warning( + "deprecated combination of use_shadow_pricing and not full_model_run" + ) - if (self.num_processes > 1) and not config.setting('fail_fast'): + if (self.num_processes > 1) and not config.setting("fail_fast"): # if we are multiprocessing, then fail_fast should be true or we will wait forever for failed processes - logger.warning("deprecated combination of multiprocessing and not fail_fast") - raise RuntimeError("Shadow pricing requires fail_fast setting in multiprocessing mode") + logger.warning( + "deprecated combination of multiprocessing and not fail_fast" + ) + raise RuntimeError( + "Shadow pricing requires fail_fast setting in multiprocessing mode" + ) - self.segment_ids = model_settings['SEGMENT_IDS'] + self.segment_ids = model_settings["SEGMENT_IDS"] # - modeled_size (set by call to set_choices/synchronize_choices) self.modeled_size = None if self.use_shadow_pricing: - self.shadow_settings = config.read_model_settings('shadow_pricing.yaml') + self.shadow_settings = config.read_model_settings("shadow_pricing.yaml") for k in self.shadow_settings: - logger.debug("shadow_settings %s: %s" % (k, self.shadow_settings.get(k))) + logger.debug( + "shadow_settings %s: %s" % (k, self.shadow_settings.get(k)) + ) # - destination_size_table (desired_size) - self.desired_size = inject.get_table(size_table_name(self.model_selector)).to_frame() + self.desired_size = inject.get_table( + size_table_name(self.model_selector) + ).to_frame() self.desired_size = self.desired_size.sort_index() - assert self.desired_size.index.is_monotonic_increasing, \ - f"{size_table_name(self.model_selector)} not is_monotonic_increasing" + assert ( + self.desired_size.index.is_monotonic_increasing + ), f"{size_table_name(self.model_selector)} not is_monotonic_increasing" # - shared_data if shared_data is not None: assert shared_data.shape[0] == self.desired_size.shape[0] - assert shared_data.shape[1] == self.desired_size.shape[1] + 1 # tally column + assert ( + shared_data.shape[1] == self.desired_size.shape[1] + 1 + ) # tally column assert shared_data_lock is not None self.shared_data = shared_data self.shared_data_lock = shared_data_lock @@ -140,26 +150,31 @@ def __init__(self, model_settings, num_processes, shared_data=None, shared_data_ # - load saved shadow_prices (if available) and set max_iterations accordingly if self.use_shadow_pricing: self.shadow_prices = None - self.shadow_price_method = self.shadow_settings['SHADOW_PRICE_METHOD'] - assert self.shadow_price_method in ['daysim', 'ctramp'] + self.shadow_price_method = self.shadow_settings["SHADOW_PRICE_METHOD"] + assert self.shadow_price_method in ["daysim", "ctramp"] - if self.shadow_settings['LOAD_SAVED_SHADOW_PRICES']: + if self.shadow_settings["LOAD_SAVED_SHADOW_PRICES"]: # read_saved_shadow_prices logs error and returns None if file not found self.shadow_prices = self.read_saved_shadow_prices(model_settings) if self.shadow_prices is None: - self.max_iterations = self.shadow_settings.get('MAX_ITERATIONS', 5) + self.max_iterations = self.shadow_settings.get("MAX_ITERATIONS", 5) else: - self.max_iterations = self.shadow_settings.get('MAX_ITERATIONS_SAVED', 1) + self.max_iterations = self.shadow_settings.get( + "MAX_ITERATIONS_SAVED", 1 + ) # initial_shadow_price if we did not load if self.shadow_prices is None: # initial value depends on method - initial_shadow_price = 1.0 if self.shadow_price_method == 'ctramp' else 0.0 - self.shadow_prices = \ - pd.DataFrame(data=initial_shadow_price, - columns=self.desired_size.columns, - index=self.desired_size.index) + initial_shadow_price = ( + 1.0 if self.shadow_price_method == "ctramp" else 0.0 + ) + self.shadow_prices = pd.DataFrame( + data=initial_shadow_price, + columns=self.desired_size.columns, + index=self.desired_size.index, + ) else: self.max_iterations = 1 @@ -184,10 +199,14 @@ def read_saved_shadow_prices(self, model_settings): shadow_prices = None # - load saved shadow_prices - saved_shadow_price_file_name = model_settings.get('SAVED_SHADOW_PRICE_TABLE_NAME') + saved_shadow_price_file_name = model_settings.get( + "SAVED_SHADOW_PRICE_TABLE_NAME" + ) if saved_shadow_price_file_name: # FIXME - where should we look for this file? - file_path = config.data_file_path(saved_shadow_price_file_name, mandatory=False) + file_path = config.data_file_path( + saved_shadow_price_file_name, mandatory=False + ) if file_path: shadow_prices = pd.read_csv(file_path, index_col=0) self.saved_shadow_price_file_path = file_path # informational @@ -276,10 +295,11 @@ def wait(tally, target): logger.info("first_in clearing shared_data") # convert summed numpy array data to conform to original dataframe - global_modeled_size_df = \ - pd.DataFrame(data=global_modeled_size_array, - index=local_modeled_size.index, - columns=local_modeled_size.columns) + global_modeled_size_df = pd.DataFrame( + data=global_modeled_size_array, + index=local_modeled_size.index, + columns=local_modeled_size.columns, + ) return global_modeled_size_df @@ -302,8 +322,7 @@ def set_choices(self, choices, segment_ids): modeled_size = pd.DataFrame(index=self.desired_size.index) for seg_name in self.desired_size: - segment_choices = \ - choices[(segment_ids == self.segment_ids[seg_name])] + segment_choices = choices[(segment_ids == self.segment_ids[seg_name])] modeled_size[seg_name] = segment_choices.value_counts() @@ -343,11 +362,11 @@ def check_fit(self, iteration): # - convergence criteria for check_fit # ignore convergence criteria for zones smaller than size_threshold - size_threshold = self.shadow_settings['SIZE_THRESHOLD'] + size_threshold = self.shadow_settings["SIZE_THRESHOLD"] # zone passes if modeled is within percent_tolerance of desired_size - percent_tolerance = self.shadow_settings['PERCENT_TOLERANCE'] + percent_tolerance = self.shadow_settings["PERCENT_TOLERANCE"] # max percentage of zones allowed to fail - fail_threshold = self.shadow_settings['FAIL_THRESHOLD'] + fail_threshold = self.shadow_settings["FAIL_THRESHOLD"] modeled_size = self.modeled_size desired_size = self.desired_size @@ -362,16 +381,16 @@ def check_fit(self, iteration): # ignore zones where rel_diff < percent_tolerance rel_diff.where(rel_diff > (percent_tolerance / 100.0), 0, inplace=True) - self.num_fail['iter%s' % iteration] = (rel_diff > 0).sum() - self.max_abs_diff['iter%s' % iteration] = abs_diff.max() - self.max_rel_diff['iter%s' % iteration] = rel_diff.max() + self.num_fail["iter%s" % iteration] = (rel_diff > 0).sum() + self.max_abs_diff["iter%s" % iteration] = abs_diff.max() + self.max_rel_diff["iter%s" % iteration] = rel_diff.max() total_fails = (rel_diff > 0).values.sum() # FIXME - should not count zones where desired_size < threshold? (could calc in init) max_fail = (fail_threshold / 100.0) * util.iprod(desired_size.shape) - converged = (total_fails <= max_fail) + converged = total_fails <= max_fail # for c in desired_size: # print("check_fit %s segment %s" % (self.model_selector, c)) @@ -380,8 +399,10 @@ def check_fit(self, iteration): # print(" max abs diff %s" % (abs_diff[c].max())) # print(" max rel diff %s" % (rel_diff[c].max())) - logger.info("check_fit %s iteration: %s converged: %s max_fail: %s total_fails: %s" % - (self.model_selector, iteration, converged, max_fail, total_fails)) + logger.info( + "check_fit %s iteration: %s converged: %s max_fail: %s total_fails: %s" + % (self.model_selector, iteration, converged, max_fail, total_fails) + ) # - convergence stats if converged or iteration == self.max_iterations: @@ -422,7 +443,7 @@ def update_shadow_prices(self): assert self.use_shadow_pricing - shadow_price_method = self.shadow_settings['SHADOW_PRICE_METHOD'] + shadow_price_method = self.shadow_settings["SHADOW_PRICE_METHOD"] # can't update_shadow_prices until after first iteration # modeled_size should have been set by set_choices at end of previous iteration @@ -430,7 +451,7 @@ def update_shadow_prices(self): assert self.desired_size is not None assert self.shadow_prices is not None - if shadow_price_method == 'ctramp': + if shadow_price_method == "ctramp": # - CTRAMP """ if ( modeledDestinationLocationsByDestZone > 0 ) @@ -438,7 +459,7 @@ def update_shadow_prices(self): // else // shadowPrice *= scaledSize; """ - damping_factor = self.shadow_settings['DAMPING_FACTOR'] + damping_factor = self.shadow_settings["DAMPING_FACTOR"] assert 0 < damping_factor <= 1 new_scale_factor = self.desired_size / self.modeled_size @@ -447,9 +468,11 @@ def update_shadow_prices(self): # following CTRAMP (revised version - with 0 dest zone case lines commented out) # avoid zero-divide for 0 modeled_size, by leaving shadow_prices unchanged - new_shadow_prices.where(self.modeled_size > 0, self.shadow_prices, inplace=True) + new_shadow_prices.where( + self.modeled_size > 0, self.shadow_prices, inplace=True + ) - elif shadow_price_method == 'daysim': + elif shadow_price_method == "daysim": # - Daysim """ if modeled > desired: # if modeled is too high, increase shadow price @@ -467,21 +490,32 @@ def update_shadow_prices(self): shadow_price = shadow_price + log(np.maximum(target, 0.01) / np.maximum(modeled, 0.01)) """ # FIXME should these be the same as PERCENT_TOLERANCE and FAIL_THRESHOLD above? - absolute_tolerance = self.shadow_settings['DAYSIM_ABSOLUTE_TOLERANCE'] - percent_tolerance = self.shadow_settings['DAYSIM_PERCENT_TOLERANCE'] / 100.0 + absolute_tolerance = self.shadow_settings["DAYSIM_ABSOLUTE_TOLERANCE"] + percent_tolerance = self.shadow_settings["DAYSIM_PERCENT_TOLERANCE"] / 100.0 assert 0 <= percent_tolerance <= 1 target = np.where( self.modeled_size > self.desired_size, - np.minimum(self.modeled_size, - np.minimum(self.desired_size * (1 + percent_tolerance), - self.desired_size + absolute_tolerance)), - np.maximum(self.modeled_size, - np.maximum(self.desired_size * (1 - percent_tolerance), - self.desired_size - absolute_tolerance))) + np.minimum( + self.modeled_size, + np.minimum( + self.desired_size * (1 + percent_tolerance), + self.desired_size + absolute_tolerance, + ), + ), + np.maximum( + self.modeled_size, + np.maximum( + self.desired_size * (1 - percent_tolerance), + self.desired_size - absolute_tolerance, + ), + ), + ) # adjustment = np.log(np.maximum(target, 0.01) / np.maximum(self.modeled_size, 0.01)) - adjustment = np.log(np.maximum(target, 0.01) / np.maximum(self.modeled_size, 1)) + adjustment = np.log( + np.maximum(target, 0.01) / np.maximum(self.modeled_size, 1) + ) new_shadow_prices = self.shadow_prices + adjustment @@ -504,20 +538,25 @@ def dest_size_terms(self, segment): if self.use_shadow_pricing: - shadow_price_method = self.shadow_settings['SHADOW_PRICE_METHOD'] + shadow_price_method = self.shadow_settings["SHADOW_PRICE_METHOD"] - if shadow_price_method == 'ctramp': + if shadow_price_method == "ctramp": size_term_adjustment = self.shadow_prices[segment] - elif shadow_price_method == 'daysim': + elif shadow_price_method == "daysim": utility_adjustment = self.shadow_prices[segment] else: - raise RuntimeError("unknown SHADOW_PRICE_METHOD %s" % shadow_price_method) - - size_terms = pd.DataFrame({ - 'size_term': self.desired_size[segment], - 'shadow_price_size_term_adjustment': size_term_adjustment, - 'shadow_price_utility_adjustment': utility_adjustment}, - index=self.desired_size.index) + raise RuntimeError( + "unknown SHADOW_PRICE_METHOD %s" % shadow_price_method + ) + + size_terms = pd.DataFrame( + { + "size_term": self.desired_size[segment], + "shadow_price_size_term_adjustment": size_term_adjustment, + "shadow_price_utility_adjustment": utility_adjustment, + }, + index=self.desired_size.index, + ) assert size_terms.index.is_monotonic_increasing @@ -539,18 +578,24 @@ def write_trace_files(self, iteration): logger.info("write_trace_files iteration %s" % iteration) if iteration == 1: # write desired_size only on first iteration, as it doesn't change - tracing.write_csv(self.desired_size, - 'shadow_price_%s_desired_size' % self.model_selector, - transpose=False) - - tracing.write_csv(self.modeled_size, - 'shadow_price_%s_modeled_size_%s' % (self.model_selector, iteration), - transpose=False) + tracing.write_csv( + self.desired_size, + "shadow_price_%s_desired_size" % self.model_selector, + transpose=False, + ) + + tracing.write_csv( + self.modeled_size, + "shadow_price_%s_modeled_size_%s" % (self.model_selector, iteration), + transpose=False, + ) if self.use_shadow_pricing: - tracing.write_csv(self.shadow_prices, - 'shadow_price_%s_shadow_prices_%s' % (self.model_selector, iteration), - transpose=False) + tracing.write_csv( + self.shadow_prices, + "shadow_price_%s_shadow_prices_%s" % (self.model_selector, iteration), + transpose=False, + ) def block_name(model_selector): @@ -597,8 +642,8 @@ def buffers_for_shadow_pricing(shadow_pricing_info): dict of multiprocessing.Array keyed by model_selector """ - dtype = shadow_pricing_info['dtype'] - block_shapes = shadow_pricing_info['block_shapes'] + dtype = shadow_pricing_info["dtype"] + block_shapes = shadow_pricing_info["block_shapes"] data_buffers = {} for block_key, block_shape in block_shapes.items(): @@ -607,13 +652,17 @@ def buffers_for_shadow_pricing(shadow_pricing_info): buffer_size = util.iprod(block_shape) csz = buffer_size * np.dtype(dtype).itemsize - logger.info("allocating shared shadow pricing buffer %s %s buffer_size %s bytes %s (%s)" % - (block_key, buffer_size, block_shape, csz, util.GB(csz))) + logger.info( + "allocating shared shadow pricing buffer %s %s buffer_size %s bytes %s (%s)" + % (block_key, buffer_size, block_shape, csz, util.GB(csz)) + ) if np.issubdtype(dtype, np.int64): typecode = ctypes.c_int64 else: - raise RuntimeError("buffer_for_shadow_pricing unrecognized dtype %s" % dtype) + raise RuntimeError( + "buffer_for_shadow_pricing unrecognized dtype %s" % dtype + ) shared_data_buffer = multiprocessing.Array(typecode, buffer_size) @@ -652,11 +701,13 @@ def shadow_price_data_from_buffers(data_buffers, shadow_pricing_info, model_sele assert type(data_buffers) == dict - dtype = shadow_pricing_info['dtype'] - block_shapes = shadow_pricing_info['block_shapes'] + dtype = shadow_pricing_info["dtype"] + block_shapes = shadow_pricing_info["block_shapes"] if model_selector not in block_shapes: - raise RuntimeError("Model selector %s not in shadow_pricing_info" % model_selector) + raise RuntimeError( + "Model selector %s not in shadow_pricing_info" % model_selector + ) if block_name(model_selector) not in data_buffers: raise RuntimeError("Block %s not in data_buffers" % block_name(model_selector)) @@ -683,31 +734,30 @@ def load_shadow_price_calculator(model_settings): spc : ShadowPriceCalculator """ - num_processes = inject.get_injectable('num_processes', 1) + num_processes = inject.get_injectable("num_processes", 1) - model_selector = model_settings['MODEL_SELECTOR'] + model_selector = model_settings["MODEL_SELECTOR"] # - get shared_data from data_buffers (if multiprocessing) - data_buffers = inject.get_injectable('data_buffers', None) + data_buffers = inject.get_injectable("data_buffers", None) if data_buffers is not None: - logger.info('Using existing data_buffers for shadow_price') + logger.info("Using existing data_buffers for shadow_price") # - shadow_pricing_info - shadow_pricing_info = inject.get_injectable('shadow_pricing_info', None) + shadow_pricing_info = inject.get_injectable("shadow_pricing_info", None) assert shadow_pricing_info is not None # - extract data buffer and reshape as numpy array - data, lock = \ - shadow_price_data_from_buffers(data_buffers, shadow_pricing_info, model_selector) + data, lock = shadow_price_data_from_buffers( + data_buffers, shadow_pricing_info, model_selector + ) else: assert num_processes == 1 data = None # ShadowPriceCalculator will allocate its own data lock = None # - ShadowPriceCalculator - spc = ShadowPriceCalculator( - model_settings, - num_processes, data, lock) + spc = ShadowPriceCalculator(model_settings, num_processes, data, lock) return spc @@ -733,18 +783,20 @@ def add_size_tables(): (size table) counts. """ - use_shadow_pricing = bool(config.setting('use_shadow_pricing')) + use_shadow_pricing = bool(config.setting("use_shadow_pricing")) - shadow_settings = config.read_model_settings('shadow_pricing.yaml') - shadow_pricing_models = shadow_settings.get('shadow_pricing_models') + shadow_settings = config.read_model_settings("shadow_pricing.yaml") + shadow_pricing_models = shadow_settings.get("shadow_pricing_models") if shadow_pricing_models is None: - logger.warning('shadow_pricing_models list not found in shadow_pricing settings') + logger.warning( + "shadow_pricing_models list not found in shadow_pricing settings" + ) return # probably ought not scale if not shadow_pricing (breaks partial sample replicability) # but this allows compatability with existing CTRAMP behavior... - scale_size_table = shadow_settings.get('SCALE_SIZE_TABLE', False) + scale_size_table = shadow_settings.get("SCALE_SIZE_TABLE", False) # shadow_pricing_models is dict of {: } # since these are scaled to model size, they have to be created while single-process @@ -753,21 +805,24 @@ def add_size_tables(): model_settings = config.read_model_settings(model_name) - assert model_selector == model_settings['MODEL_SELECTOR'] + assert model_selector == model_settings["MODEL_SELECTOR"] - assert 'SEGMENT_IDS' in model_settings, f"missing SEGMENT_IDS setting in {model_name} model_settings" - segment_ids = model_settings['SEGMENT_IDS'] - chooser_table_name = model_settings['CHOOSER_TABLE_NAME'] - chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME'] + assert ( + "SEGMENT_IDS" in model_settings + ), f"missing SEGMENT_IDS setting in {model_name} model_settings" + segment_ids = model_settings["SEGMENT_IDS"] + chooser_table_name = model_settings["CHOOSER_TABLE_NAME"] + chooser_segment_column = model_settings["CHOOSER_SEGMENT_COLUMN_NAME"] choosers_df = inject.get_table(chooser_table_name).to_frame() - if 'CHOOSER_FILTER_COLUMN_NAME' in model_settings: - choosers_df = \ - choosers_df[choosers_df[model_settings['CHOOSER_FILTER_COLUMN_NAME']] != 0] + if "CHOOSER_FILTER_COLUMN_NAME" in model_settings: + choosers_df = choosers_df[ + choosers_df[model_settings["CHOOSER_FILTER_COLUMN_NAME"]] != 0 + ] # - raw_desired_size - land_use = inject.get_table('land_use') - size_terms = inject.get_injectable('size_terms') + land_use = inject.get_table("land_use") + size_terms = inject.get_injectable("size_terms") raw_size = tour_destination_size_terms(land_use, size_terms, model_selector) assert set(raw_size.columns) == set(segment_ids.keys()) @@ -783,28 +838,38 @@ def add_size_tables(): segment_desired_size = raw_size[c].astype(np.float64).sum() # number of synthetic population choosers in segment - segment_chooser_count = \ - (choosers_df[chooser_segment_column] == segment_ids[c]).sum() - - segment_scale_factors[c] = \ - segment_chooser_count / np.maximum(segment_desired_size, 1) - - logger.info("add_desired_size_tables %s segment %s " - "desired %s modeled %s scale_factor %s" % - (chooser_table_name, c, - segment_desired_size, - segment_chooser_count, - segment_scale_factors[c])) + segment_chooser_count = ( + choosers_df[chooser_segment_column] == segment_ids[c] + ).sum() + + segment_scale_factors[c] = segment_chooser_count / np.maximum( + segment_desired_size, 1 + ) + + logger.info( + "add_desired_size_tables %s segment %s " + "desired %s modeled %s scale_factor %s" + % ( + chooser_table_name, + c, + segment_desired_size, + segment_chooser_count, + segment_scale_factors[c], + ) + ) # FIXME - should we be rounding? scaled_size = (raw_size * segment_scale_factors).round() else: scaled_size = raw_size - logger.debug(f"add_size_table {size_table_name(model_selector)} ({scaled_size.shape}) for {model_selector}") + logger.debug( + f"add_size_table {size_table_name(model_selector)} ({scaled_size.shape}) for {model_selector}" + ) - assert scaled_size.index.is_monotonic_increasing, \ - f"size table {size_table_name(model_selector)} not is_monotonic_increasing" + assert ( + scaled_size.index.is_monotonic_increasing + ), f"size table {size_table_name(model_selector)} not is_monotonic_increasing" inject.add_table(size_table_name(model_selector), scaled_size) @@ -823,13 +888,13 @@ def get_shadow_pricing_info(): block_shapes: dict {: } """ - land_use = inject.get_table('land_use') - size_terms = inject.get_injectable('size_terms') + land_use = inject.get_table("land_use") + size_terms = inject.get_injectable("size_terms") - shadow_settings = config.read_model_settings('shadow_pricing.yaml') + shadow_settings = config.read_model_settings("shadow_pricing.yaml") # shadow_pricing_models is dict of {: } - shadow_pricing_models = shadow_settings.get('shadow_pricing_models', {}) + shadow_pricing_models = shadow_settings.get("shadow_pricing_models", {}) blocks = OrderedDict() for model_selector in shadow_pricing_models: @@ -843,8 +908,8 @@ def get_shadow_pricing_info(): sp_dtype = np.int64 shadow_pricing_info = { - 'dtype': sp_dtype, - 'block_shapes': blocks, + "dtype": sp_dtype, + "block_shapes": blocks, } for k in shadow_pricing_info: diff --git a/activitysim/abm/tables/size_terms.py b/activitysim/abm/tables/size_terms.py index cdf9a480b9..8109b3a3bb 100644 --- a/activitysim/abm/tables/size_terms.py +++ b/activitysim/abm/tables/size_terms.py @@ -1,20 +1,19 @@ # ActivitySim # See full license in LICENSE.txt. import logging + import numpy as np import pandas as pd -from activitysim.core import inject -from activitysim.core import config - +from activitysim.core import config, inject logger = logging.getLogger(__name__) @inject.injectable(cache=True) def size_terms(): - f = config.config_file_path('destination_choice_size_terms.csv') - return pd.read_csv(f, comment='#', index_col='segment') + f = config.config_file_path("destination_choice_size_terms.csv") + return pd.read_csv(f, comment="#", index_col="segment") def size_term(land_use, destination_choice_coefficients): @@ -86,19 +85,23 @@ def tour_destination_size_terms(land_use, size_terms, model_selector): land_use = land_use.sort_index() size_terms = size_terms[size_terms.model_selector == model_selector].copy() - del size_terms['model_selector'] + del size_terms["model_selector"] - df = pd.DataFrame({key: size_term(land_use, row) for key, row in size_terms.iterrows()}, - index=land_use.index) + df = pd.DataFrame( + {key: size_term(land_use, row) for key, row in size_terms.iterrows()}, + index=land_use.index, + ) - assert land_use.index.name == 'zone_id' + assert land_use.index.name == "zone_id" df.index.name = land_use.index.name - if not (df.dtypes == 'float64').all(): - logger.warning('Surprised to find that not all size_terms were float64!') + if not (df.dtypes == "float64").all(): + logger.warning("Surprised to find that not all size_terms were float64!") if df.isna().any(axis=None): - logger.warning(f"tour_destination_size_terms with NAN values\n{df[df.isna().any(axis=1)]}") + logger.warning( + f"tour_destination_size_terms with NAN values\n{df[df.isna().any(axis=1)]}" + ) assert not df.isna().any(axis=None) return df diff --git a/activitysim/abm/tables/skims.py b/activitysim/abm/tables/skims.py index bdcaf7f3d1..f7a841d9ad 100644 --- a/activitysim/abm/tables/skims.py +++ b/activitysim/abm/tables/skims.py @@ -3,13 +3,9 @@ import logging -from activitysim.core import los -from activitysim.core import inject -from activitysim.core import config - +from activitysim.core import config, inject, los from activitysim.core.pathbuilder import TransitVirtualPathBuilder - logger = logging.getLogger(__name__) """ @@ -46,12 +42,12 @@ def log_settings(): # abm settings to log on startup return [ - 'households_sample_size', - 'chunk_size', - 'chunk_method', - 'chunk_training_mode', - 'multiprocess', - 'num_processes', - 'resume_after', - 'trace_hh_id', + "households_sample_size", + "chunk_size", + "chunk_method", + "chunk_training_mode", + "multiprocess", + "num_processes", + "resume_after", + "trace_hh_id", ] diff --git a/activitysim/abm/tables/table_dict.py b/activitysim/abm/tables/table_dict.py index cb4367ca2f..2b1cb9086c 100644 --- a/activitysim/abm/tables/table_dict.py +++ b/activitysim/abm/tables/table_dict.py @@ -3,9 +3,8 @@ import logging from collections import OrderedDict -from activitysim.core import inject from activitysim.abm.models.util import canonical_ids as cid - +from activitysim.core import inject logger = logging.getLogger(__name__) diff --git a/activitysim/abm/tables/time_windows.py b/activitysim/abm/tables/time_windows.py index ca6cf0d027..1a6279173e 100644 --- a/activitysim/abm/tables/time_windows.py +++ b/activitysim/abm/tables/time_windows.py @@ -6,8 +6,7 @@ import numpy as np import pandas as pd -from activitysim.core import inject -from activitysim.core import config +from activitysim.core import config, inject from activitysim.core import timetable as tt logger = logging.getLogger(__name__) @@ -16,10 +15,10 @@ @inject.injectable(cache=True) def tdd_alts(): # right now this file just contains the start and end hour - file_path = config.config_file_path('tour_departure_and_duration_alternatives.csv') + file_path = config.config_file_path("tour_departure_and_duration_alternatives.csv") df = pd.read_csv(file_path) - df['duration'] = df.end - df.start + df["duration"] = df.end - df.start # - NARROW df = df.astype(np.int8) @@ -37,15 +36,17 @@ def tdd_alt_segments(): # school,PM,15,17 # school,EV,18,22 - file_path = config.config_file_path('tour_departure_and_duration_segments.csv', mandatory=False) + file_path = config.config_file_path( + "tour_departure_and_duration_segments.csv", mandatory=False + ) if file_path: - df = pd.read_csv(file_path, comment='#') + df = pd.read_csv(file_path, comment="#") # - NARROW - df['start'] = df['start'].astype(np.int8) - df['end'] = df['end'].astype(np.int8) + df["start"] = df["start"].astype(np.int8) + df["end"] = df["end"].astype(np.int8) else: df = None @@ -58,7 +59,7 @@ def person_windows(persons, tdd_alts): df = tt.create_timetable_windows(persons, tdd_alts) - inject.add_table('person_windows', df) + inject.add_table("person_windows", df) return df @@ -66,5 +67,5 @@ def person_windows(persons, tdd_alts): @inject.injectable() def timetable(person_windows, tdd_alts): - logging.debug('@inject timetable') + logging.debug("@inject timetable") return tt.TimeTable(person_windows.to_frame(), tdd_alts, person_windows.name) diff --git a/activitysim/abm/tables/tours.py b/activitysim/abm/tables/tours.py index 3982015885..a3bd8a8112 100644 --- a/activitysim/abm/tables/tours.py +++ b/activitysim/abm/tables/tours.py @@ -12,4 +12,4 @@ def tours_merged(tours, persons_merged): return inject.merge_tables(tours.name, tables=[tours, persons_merged]) -inject.broadcast('persons_merged', 'tours', cast_index=True, onto_on='person_id') +inject.broadcast("persons_merged", "tours", cast_index=True, onto_on="person_id") diff --git a/activitysim/abm/tables/trips.py b/activitysim/abm/tables/trips.py index bd9f0a89f0..890cbd6f57 100644 --- a/activitysim/abm/tables/trips.py +++ b/activitysim/abm/tables/trips.py @@ -4,7 +4,6 @@ from activitysim.core import inject - logger = logging.getLogger(__name__) @@ -13,4 +12,4 @@ def trips_merged(trips, tours): return inject.merge_tables(trips.name, tables=[trips, tours]) -inject.broadcast('tours', 'trips', cast_index=True, onto_on='tour_id') +inject.broadcast("tours", "trips", cast_index=True, onto_on="tour_id") diff --git a/activitysim/abm/test/test_misc/setup_utils.py b/activitysim/abm/test/test_misc/setup_utils.py index bfee82afc0..32c6842b1e 100644 --- a/activitysim/abm/test/test_misc/setup_utils.py +++ b/activitysim/abm/test/test_misc/setup_utils.py @@ -1,23 +1,18 @@ # ActivitySim # See full license in LICENSE.txt. -import os import logging -import pkg_resources +import os -import openmatrix as omx import numpy as np import numpy.testing as npt - +import openmatrix as omx import pandas as pd import pandas.testing as pdt +import pkg_resources import pytest import yaml -from activitysim.core import random -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import config +from activitysim.core import config, inject, pipeline, random, tracing # set the max households for all tests (this is to limit memory use on travis) HOUSEHOLDS_SAMPLE_SIZE = 50 @@ -34,39 +29,39 @@ def example_path(dirname): - resource = os.path.join('examples', 'example_mtc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_mtc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def setup_dirs(ancillary_configs_dir=None, data_dir=None): # ancillary_configs_dir is used by run_mp to test multiprocess - test_pipeline_configs_dir = os.path.join(os.path.dirname(__file__), 'configs') - example_configs_dir = example_path('configs') + test_pipeline_configs_dir = os.path.join(os.path.dirname(__file__), "configs") + example_configs_dir = example_path("configs") configs_dir = [test_pipeline_configs_dir, example_configs_dir] if ancillary_configs_dir is not None: configs_dir = [ancillary_configs_dir] + configs_dir - inject.add_injectable('configs_dir', configs_dir) + inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), 'output') - inject.add_injectable('output_dir', output_dir) + output_dir = os.path.join(os.path.dirname(__file__), "output") + inject.add_injectable("output_dir", output_dir) if not data_dir: - data_dir = example_path('data') + data_dir = example_path("data") - inject.add_injectable('data_dir', data_dir) + inject.add_injectable("data_dir", data_dir) inject.clear_cache() tracing.config_logger() - tracing.delete_output_files('csv') - tracing.delete_output_files('txt') - tracing.delete_output_files('yaml') - tracing.delete_output_files('omx') + tracing.delete_output_files("csv") + tracing.delete_output_files("txt") + tracing.delete_output_files("yaml") + tracing.delete_output_files("omx") def teardown_function(func): @@ -86,7 +81,7 @@ def close_handlers(): def inject_settings(**kwargs): - settings = config.read_settings_file('settings.yaml', mandatory=True) + settings = config.read_settings_file("settings.yaml", mandatory=True) for k in kwargs: settings[k] = kwargs[k] diff --git a/activitysim/abm/test/test_misc/test_load_cached_accessibility.py b/activitysim/abm/test/test_misc/test_load_cached_accessibility.py index 762fdd705c..b73492f18b 100644 --- a/activitysim/abm/test/test_misc/test_load_cached_accessibility.py +++ b/activitysim/abm/test/test_misc/test_load_cached_accessibility.py @@ -1,26 +1,20 @@ # ActivitySim # See full license in LICENSE.txt. -import os import logging -import pkg_resources +import os -import openmatrix as omx import numpy as np import numpy.testing as npt - +import openmatrix as omx import pandas as pd import pandas.testing as pdt +import pkg_resources import pytest import yaml -from activitysim.core import random -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import config +from activitysim.core import config, inject, pipeline, random, tracing -from .setup_utils import setup_dirs -from .setup_utils import inject_settings +from .setup_utils import inject_settings, setup_dirs # set the max households for all tests (this is to limit memory use on travis) HOUSEHOLDS_SAMPLE_SIZE = 50 @@ -37,8 +31,8 @@ def example_path(dirname): - resource = os.path.join('examples', 'example_mtc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_mtc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def teardown_function(func): @@ -60,7 +54,7 @@ def test_load_cached_accessibility(): inject.clear_cache() inject.reinject_decorated_tables() - data_dir = [os.path.join(os.path.dirname(__file__), 'data'), example_path('data')] + data_dir = [os.path.join(os.path.dirname(__file__), "data"), example_path("data")] setup_dirs(data_dir=data_dir) # @@ -68,28 +62,30 @@ def test_load_cached_accessibility(): # activitysim.abm.tables.land_use.accessibility() will load this table if listed here # presumably independently calculated outside activitysim or a cached copy created during a previous run # - settings = config.read_settings_file('settings.yaml', mandatory=True) - input_table_list = settings.get('input_table_list') - input_table_list.append({ - 'tablename': 'accessibility', - 'filename': 'cached_accessibility.csv', - 'index_col': 'zone_id' - }) - inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, - input_table_list=input_table_list - ) + settings = config.read_settings_file("settings.yaml", mandatory=True) + input_table_list = settings.get("input_table_list") + input_table_list.append( + { + "tablename": "accessibility", + "filename": "cached_accessibility.csv", + "index_col": "zone_id", + } + ) + inject_settings( + households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, input_table_list=input_table_list + ) _MODELS = [ - 'initialize_landuse', + "initialize_landuse", # 'compute_accessibility', # we load accessibility table ordinarily created by compute_accessibility - 'initialize_households', + "initialize_households", ] pipeline.run(models=_MODELS, resume_after=None) accessibility_df = pipeline.get_table("accessibility") - assert 'auPkRetail' in accessibility_df + assert "auPkRetail" in accessibility_df pipeline.close_pipeline() inject.clear_cache() diff --git a/activitysim/abm/test/test_misc/test_misc.py b/activitysim/abm/test/test_misc/test_misc.py index af349bc4d0..f4daf7250c 100644 --- a/activitysim/abm/test/test_misc/test_misc.py +++ b/activitysim/abm/test/test_misc/test_misc.py @@ -1,11 +1,11 @@ # ActivitySim # See full license in LICENSE.txt. import os + import pytest from activitysim.core import inject - # The following import statement has the side-effect of registering injectables: from .. import __init__ @@ -26,13 +26,13 @@ def test_misc(): inject.get_injectable("output_dir") assert "directory does not exist" in str(excinfo.value) - configs_dir = os.path.join(os.path.dirname(__file__), 'configs_test_misc') + configs_dir = os.path.join(os.path.dirname(__file__), "configs_test_misc") inject.add_injectable("configs_dir", configs_dir) settings = inject.get_injectable("settings") assert isinstance(settings, dict) - data_dir = os.path.join(os.path.dirname(__file__), 'data') + data_dir = os.path.join(os.path.dirname(__file__), "data") inject.add_injectable("data_dir", data_dir) # default values if not specified in settings diff --git a/activitysim/abm/test/test_misc/test_trip_departure_choice.py b/activitysim/abm/test/test_misc/test_trip_departure_choice.py index 285655ac70..f36cf6df20 100644 --- a/activitysim/abm/test/test_misc/test_trip_departure_choice.py +++ b/activitysim/abm/test/test_misc/test_trip_departure_choice.py @@ -1,4 +1,3 @@ - import numpy as np import pandas as pd import pytest @@ -6,52 +5,93 @@ import activitysim.abm.models.trip_departure_choice as tdc from activitysim.abm.models.util.trip import get_time_windows from activitysim.core import los + from .setup_utils import setup_dirs -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def trips(): outbound_array = [True, True, False, False, False, True, True, False, False, True] - trips = pd.DataFrame(data={'tour_id': [1, 1, 2, 2, 2, 2, 2, 3, 3, 4], - 'trip_duration': [2, 2, 7, 7, 7, 12, 12, 4, 4, 5], - 'inbound_duration': [0, 0, 7, 7, 7, 0, 0, 4, 4, 5], - 'main_leg_duration': [4, 4, 2, 2, 2, 2, 2, 1, 1, 2], - 'outbound_duration': [2, 2, 0, 0, 0, 12, 12, 0, 0, 5], - 'trip_count': [2, 2, 3, 3, 3, 2, 2, 2, 2, 1], - 'trip_num': [1, 2, 1, 2, 3, 1, 2, 1, 2, 1], - 'outbound': outbound_array, - 'chunk_id': [1, 1, 2, 2, 2, 2, 2, 3, 3, 4], - 'is_work': [True, True, False, False, False, False, False, False, False, True], - 'is_school': [False, False, False, False, False, False, False, True, True, False], - 'is_eatout': [False, False, True, True, True, True, True, False, False, False], - 'start': [8, 8, 18, 18, 18, 18, 18, 24, 24, 19], - 'end': [14, 14, 39, 39, 39, 39, 39, 29, 29, 26], - 'origin': [3, 5, 15, 12, 24, 8, 17, 8, 9, 6], - 'destination': [5, 9, 12, 24, 20, 17, 18, 9, 11, 14], - }, index=range(10)) - - trips.index.name = 'trip_id' + trips = pd.DataFrame( + data={ + "tour_id": [1, 1, 2, 2, 2, 2, 2, 3, 3, 4], + "trip_duration": [2, 2, 7, 7, 7, 12, 12, 4, 4, 5], + "inbound_duration": [0, 0, 7, 7, 7, 0, 0, 4, 4, 5], + "main_leg_duration": [4, 4, 2, 2, 2, 2, 2, 1, 1, 2], + "outbound_duration": [2, 2, 0, 0, 0, 12, 12, 0, 0, 5], + "trip_count": [2, 2, 3, 3, 3, 2, 2, 2, 2, 1], + "trip_num": [1, 2, 1, 2, 3, 1, 2, 1, 2, 1], + "outbound": outbound_array, + "chunk_id": [1, 1, 2, 2, 2, 2, 2, 3, 3, 4], + "is_work": [ + True, + True, + False, + False, + False, + False, + False, + False, + False, + True, + ], + "is_school": [ + False, + False, + False, + False, + False, + False, + False, + True, + True, + False, + ], + "is_eatout": [ + False, + False, + True, + True, + True, + True, + True, + False, + False, + False, + ], + "start": [8, 8, 18, 18, 18, 18, 18, 24, 24, 19], + "end": [14, 14, 39, 39, 39, 39, 39, 29, 29, 26], + "origin": [3, 5, 15, 12, 24, 8, 17, 8, 9, 6], + "destination": [5, 9, 12, 24, 20, 17, 18, 9, 11, 14], + }, + index=range(10), + ) + + trips.index.name = "trip_id" return trips -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def settings(): - return {"skims_file": "skims.omx", - "skim_time_periods": { - "labels": ['EA', 'AM', 'MD', 'PM', 'NT']} - } + return { + "skims_file": "skims.omx", + "skim_time_periods": {"labels": ["EA", "AM", "MD", "PM", "NT"]}, + } -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def model_spec(): - index = ["@(df['stop_time_duration'] * df['is_work'].astype(int)).astype(int)", - "@(df['stop_time_duration'] * df['is_school'].astype(int)).astype(int)", - "@(df['stop_time_duration'] * df['is_eatout'].astype(int)).astype(int)"] + index = [ + "@(df['stop_time_duration'] * df['is_work'].astype(int)).astype(int)", + "@(df['stop_time_duration'] * df['is_school'].astype(int)).astype(int)", + "@(df['stop_time_duration'] * df['is_eatout'].astype(int)).astype(int)", + ] - values = {'inbound': [0.933020, 0.370260, 0.994840], - 'outbound': [0.933020, 0.370260, 0.994840] - } + values = { + "inbound": [0.933020, 0.370260, 0.994840], + "outbound": [0.933020, 0.370260, 0.994840], + } return pd.DataFrame(index=index, data=values) @@ -59,14 +99,20 @@ def model_spec(): def test_build_patterns(trips): time_windows = get_time_windows(48, 3) patterns = tdc.build_patterns(trips, time_windows) - patterns = patterns.sort_values(['tour_id', 'outbound', 'trip_num']) + patterns = patterns.sort_values(["tour_id", "outbound", "trip_num"]) assert patterns.shape[0] == 34 assert patterns.shape[1] == 6 assert patterns.index.name == tdc.TOUR_LEG_ID - output_columns = [tdc.TOUR_ID, tdc.PATTERN_ID, tdc.TRIP_NUM, - tdc.STOP_TIME_DURATION, tdc.TOUR_ID, tdc.OUTBOUND] + output_columns = [ + tdc.TOUR_ID, + tdc.PATTERN_ID, + tdc.TRIP_NUM, + tdc.STOP_TIME_DURATION, + tdc.TOUR_ID, + tdc.OUTBOUND, + ] assert set(output_columns).issubset(patterns.columns) @@ -74,7 +120,10 @@ def test_build_patterns(trips): def test_get_tour_legs(trips): tour_legs = tdc.get_tour_legs(trips) assert tour_legs.index.name == tdc.TOUR_LEG_ID - assert np.unique(tour_legs[tdc.TOUR_ID].values).shape[0] == np.unique(trips[tdc.TOUR_ID].values).shape[0] + assert ( + np.unique(tour_legs[tdc.TOUR_ID].values).shape[0] + == np.unique(trips[tdc.TOUR_ID].values).shape[0] + ) def test_generate_alternative(trips): @@ -85,14 +134,16 @@ def test_generate_alternative(trips): assert alts.index.name == tdc.TRIP_ID assert alts.columns[0] == tdc.STOP_TIME_DURATION - pd.testing.assert_series_equal(trips.groupby(trips.index)['trip_duration'].max(), - alts.groupby(alts.index)[tdc.STOP_TIME_DURATION].max(), - check_names=False) + pd.testing.assert_series_equal( + trips.groupby(trips.index)["trip_duration"].max(), + alts.groupby(alts.index)[tdc.STOP_TIME_DURATION].max(), + check_names=False, + ) def test_apply_stage_two_model(model_spec, trips): setup_dirs() - departures = tdc.apply_stage_two_model(model_spec, trips, 0, 'TEST Trip Departure') + departures = tdc.apply_stage_two_model(model_spec, trips, 0, "TEST Trip Departure") assert len(departures) == len(trips) pd.testing.assert_index_equal(departures.index, trips.index) diff --git a/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py b/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py index e03c018967..a3f6ffdd95 100644 --- a/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py +++ b/activitysim/abm/test/test_misc/test_trip_scheduling_choice.py @@ -1,4 +1,3 @@ - import numpy as np import pandas as pd import pytest @@ -6,22 +5,27 @@ from activitysim.abm.models import trip_scheduling_choice as tsc from activitysim.abm.tables.skims import skim_dict from activitysim.core import los + from .setup_utils import setup_dirs -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def tours(): - tours = pd.DataFrame(data={'duration': [2, 44, 32, 12, 11, 16], - 'num_outbound_stops': [2, 4, 0, 0, 1, 3], - 'num_inbound_stops': [1, 0, 0, 2, 1, 2], - 'tour_type': ['othdisc'] * 2 + ['eatout'] * 4, - 'origin': [3, 10, 15, 23, 5, 8], - 'destination': [5, 9, 12, 24, 20, 17], - tsc.LAST_OB_STOP: [1, 3, 0, 0, 12, 14], - tsc.FIRST_IB_STOP: [2, 0, 0, 4, 6, 20], - }, index=range(6)) - - tours.index.name = 'tour_id' + tours = pd.DataFrame( + data={ + "duration": [2, 44, 32, 12, 11, 16], + "num_outbound_stops": [2, 4, 0, 0, 1, 3], + "num_inbound_stops": [1, 0, 0, 2, 1, 2], + "tour_type": ["othdisc"] * 2 + ["eatout"] * 4, + "origin": [3, 10, 15, 23, 5, 8], + "destination": [5, 9, 12, 24, 20, 17], + tsc.LAST_OB_STOP: [1, 3, 0, 0, 12, 14], + tsc.FIRST_IB_STOP: [2, 0, 0, 4, 6, 20], + }, + index=range(6), + ) + + tours.index.name = "tour_id" tours[tsc.HAS_OB_STOPS] = tours[tsc.NUM_OB_STOPS] >= 1 tours[tsc.HAS_IB_STOPS] = tours[tsc.NUM_IB_STOPS] >= 1 @@ -29,41 +33,49 @@ def tours(): return tours -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def settings(): - return {"skims_file": "skims.omx", - "skim_time_periods": { - "labels": ['MD']} - } + return {"skims_file": "skims.omx", "skim_time_periods": {"labels": ["MD"]}} -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def model_spec(): - index = ["@(df['main_leg_duration']>df['duration']).astype(int)", - "@(df['main_leg_duration'] == 0)&(df['tour_type']=='othdiscr')", - "@(df['main_leg_duration'] == 1)&(df['tour_type']=='othdiscr')", - "@(df['main_leg_duration'] == 2)&(df['tour_type']=='othdiscr')", - "@(df['main_leg_duration'] == 3)&(df['tour_type']=='othdiscr')", - "@(df['main_leg_duration'] == 4)&(df['tour_type']=='othdiscr')", - "@df['tour_type']=='othdiscr'", - "@df['tour_type']=='eatout'", - "@df['tour_type']=='eatout'" - ] - - values = [-999, -6.5884, -5.0326, -2.0526, -1.0313, -0.46489, 0.060382, -0.7508, 0.53247] - - return pd.DataFrame(index=index, data=values, columns=['stage_one']) - - -@pytest.fixture(scope='module') + index = [ + "@(df['main_leg_duration']>df['duration']).astype(int)", + "@(df['main_leg_duration'] == 0)&(df['tour_type']=='othdiscr')", + "@(df['main_leg_duration'] == 1)&(df['tour_type']=='othdiscr')", + "@(df['main_leg_duration'] == 2)&(df['tour_type']=='othdiscr')", + "@(df['main_leg_duration'] == 3)&(df['tour_type']=='othdiscr')", + "@(df['main_leg_duration'] == 4)&(df['tour_type']=='othdiscr')", + "@df['tour_type']=='othdiscr'", + "@df['tour_type']=='eatout'", + "@df['tour_type']=='eatout'", + ] + + values = [ + -999, + -6.5884, + -5.0326, + -2.0526, + -1.0313, + -0.46489, + 0.060382, + -0.7508, + 0.53247, + ] + + return pd.DataFrame(index=index, data=values, columns=["stage_one"]) + + +@pytest.fixture(scope="module") def skims(settings): setup_dirs() nw_los = los.Network_LOS() nw_los.load_data() skim_d = skim_dict(nw_los) - od_skim_stack_wrapper = skim_d.wrap('origin', 'destination') - do_skim_stack_wrapper = skim_d.wrap('destination', 'origin') + od_skim_stack_wrapper = skim_d.wrap("origin", "destination") + do_skim_stack_wrapper = skim_d.wrap("destination", "origin") obib_skim_stack_wrapper = skim_d.wrap(tsc.LAST_OB_STOP, tsc.FIRST_IB_STOP) skims = [od_skim_stack_wrapper, do_skim_stack_wrapper, obib_skim_stack_wrapper] @@ -71,13 +83,9 @@ def skims(settings): return skims -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def locals_dict(skims): - return { - "od_skims": skims[0], - "do_skims": skims[1], - "obib_skims": skims[2] - } + return {"od_skims": skims[0], "do_skims": skims[1], "obib_skims": skims[2]} def test_generate_schedule_alternatives(tours): @@ -85,40 +93,53 @@ def test_generate_schedule_alternatives(tours): assert windows.shape[0] == 296 assert windows.shape[1] == 4 - output_columns = [tsc.SCHEDULE_ID, tsc.MAIN_LEG_DURATION, - tsc.OB_DURATION, tsc.IB_DURATION] + output_columns = [ + tsc.SCHEDULE_ID, + tsc.MAIN_LEG_DURATION, + tsc.OB_DURATION, + tsc.IB_DURATION, + ] assert set(output_columns).issubset(windows.columns) def test_no_stops_patterns(tours): - no_stops = tours[(tours['num_outbound_stops'] == 0) & (tours['num_inbound_stops'] == 0)].copy() + no_stops = tours[ + (tours["num_outbound_stops"] == 0) & (tours["num_inbound_stops"] == 0) + ].copy() windows = tsc.no_stops_patterns(no_stops) assert windows.shape[0] == 1 assert windows.shape[1] == 3 - output_columns = [tsc.MAIN_LEG_DURATION, - tsc.OB_DURATION, tsc.IB_DURATION] + output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] assert set(output_columns).issubset(windows.columns) - pd.testing.assert_series_equal(windows[tsc.MAIN_LEG_DURATION], no_stops['duration'], - check_names=False, check_dtype=False) + pd.testing.assert_series_equal( + windows[tsc.MAIN_LEG_DURATION], + no_stops["duration"], + check_names=False, + check_dtype=False, + ) assert windows[windows[tsc.IB_DURATION] > 0].empty assert windows[windows[tsc.OB_DURATION] > 0].empty def test_one_way_stop_patterns(tours): - one_way_stops = tours[((tours['num_outbound_stops'] > 0).astype(int) + - (tours['num_inbound_stops'] > 0).astype(int)) == 1].copy() + one_way_stops = tours[ + ( + (tours["num_outbound_stops"] > 0).astype(int) + + (tours["num_inbound_stops"] > 0).astype(int) + ) + == 1 + ].copy() windows = tsc.stop_one_way_only_patterns(one_way_stops) assert windows.shape[0] == 58 assert windows.shape[1] == 3 - output_columns = [tsc.MAIN_LEG_DURATION, - tsc.OB_DURATION, tsc.IB_DURATION] + output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] assert set(output_columns).issubset(windows.columns) @@ -129,15 +150,19 @@ def test_one_way_stop_patterns(tours): def test_two_way_stop_patterns(tours): - two_way_stops = tours[((tours['num_outbound_stops'] > 0).astype(int) + - (tours['num_inbound_stops'] > 0).astype(int)) == 2].copy() + two_way_stops = tours[ + ( + (tours["num_outbound_stops"] > 0).astype(int) + + (tours["num_inbound_stops"] > 0).astype(int) + ) + == 2 + ].copy() windows = tsc.stop_two_way_only_patterns(two_way_stops) assert windows.shape[0] == 237 assert windows.shape[1] == 3 - output_columns = [tsc.MAIN_LEG_DURATION, - tsc.OB_DURATION, tsc.IB_DURATION] + output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] assert set(output_columns).issubset(windows.columns) @@ -147,15 +172,21 @@ def test_run_trip_scheduling_choice(model_spec, tours, skims, locals_dict): Test run the model. """ - out_tours = tsc.run_trip_scheduling_choice(model_spec, tours, skims, locals_dict, - 2, None, "PyTest Trip Scheduling") + out_tours = tsc.run_trip_scheduling_choice( + model_spec, tours, skims, locals_dict, 2, None, "PyTest Trip Scheduling" + ) assert len(tours) == len(out_tours) - pd.testing.assert_index_equal(tours.sort_index().index, out_tours.sort_index().index) + pd.testing.assert_index_equal( + tours.sort_index().index, out_tours.sort_index().index + ) - output_columns = [tsc.MAIN_LEG_DURATION, - tsc.OB_DURATION, tsc.IB_DURATION] + output_columns = [tsc.MAIN_LEG_DURATION, tsc.OB_DURATION, tsc.IB_DURATION] assert set(output_columns).issubset(out_tours.columns) - assert len(out_tours[out_tours[output_columns].sum(axis=1) == out_tours[tsc.TOUR_DURATION_COLUMN]]) == len(tours) + assert len( + out_tours[ + out_tours[output_columns].sum(axis=1) == out_tours[tsc.TOUR_DURATION_COLUMN] + ] + ) == len(tours) diff --git a/activitysim/abm/test/test_misc/test_trip_utils.py b/activitysim/abm/test/test_misc/test_trip_utils.py index bf908986ea..ef03b90360 100644 --- a/activitysim/abm/test/test_misc/test_trip_utils.py +++ b/activitysim/abm/test/test_misc/test_trip_utils.py @@ -5,9 +5,17 @@ from activitysim.abm.models.util.trip import get_time_windows -@pytest.mark.parametrize("duration, levels, expected", - [(24, 3, 2925), (24, 2, 325), (24, 1, 25), - (48, 3, 20825), (48, 2, 1225), (48, 1, 49)]) +@pytest.mark.parametrize( + "duration, levels, expected", + [ + (24, 3, 2925), + (24, 2, 325), + (24, 1, 25), + (48, 3, 20825), + (48, 2, 1225), + (48, 1, 49), + ], +) def test_get_time_windows(duration, levels, expected): time_windows = get_time_windows(duration, levels) diff --git a/activitysim/abm/test/test_pipeline/test_pipeline.py b/activitysim/abm/test/test_pipeline/test_pipeline.py index 7f8986f8c5..c9635085e2 100644 --- a/activitysim/abm/test/test_pipeline/test_pipeline.py +++ b/activitysim/abm/test/test_pipeline/test_pipeline.py @@ -1,23 +1,18 @@ # ActivitySim # See full license in LICENSE.txt. -import os import logging -import pkg_resources +import os -import openmatrix as omx import numpy as np import numpy.testing as npt - +import openmatrix as omx import pandas as pd import pandas.testing as pdt +import pkg_resources import pytest import yaml -from activitysim.core import random -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import config +from activitysim.core import config, inject, pipeline, random, tracing # set the max households for all tests (this is to limit memory use on travis) HOUSEHOLDS_SAMPLE_SIZE = 50 @@ -34,39 +29,39 @@ def example_path(dirname): - resource = os.path.join('examples', 'example_mtc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_mtc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def setup_dirs(ancillary_configs_dir=None, data_dir=None): # ancillary_configs_dir is used by run_mp to test multiprocess - test_pipeline_configs_dir = os.path.join(os.path.dirname(__file__), 'configs') - example_configs_dir = example_path('configs') + test_pipeline_configs_dir = os.path.join(os.path.dirname(__file__), "configs") + example_configs_dir = example_path("configs") configs_dir = [test_pipeline_configs_dir, example_configs_dir] if ancillary_configs_dir is not None: configs_dir = [ancillary_configs_dir] + configs_dir - inject.add_injectable('configs_dir', configs_dir) + inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), 'output') - inject.add_injectable('output_dir', output_dir) + output_dir = os.path.join(os.path.dirname(__file__), "output") + inject.add_injectable("output_dir", output_dir) if not data_dir: - data_dir = example_path('data') + data_dir = example_path("data") - inject.add_injectable('data_dir', data_dir) + inject.add_injectable("data_dir", data_dir) inject.clear_cache() tracing.config_logger() - tracing.delete_output_files('csv') - tracing.delete_output_files('txt') - tracing.delete_output_files('yaml') - tracing.delete_output_files('omx') + tracing.delete_output_files("csv") + tracing.delete_output_files("txt") + tracing.delete_output_files("yaml") + tracing.delete_output_files("omx") def teardown_function(func): @@ -86,7 +81,7 @@ def close_handlers(): def inject_settings(**kwargs): - settings = config.read_settings_file('settings.yaml', mandatory=True) + settings = config.read_settings_file("settings.yaml", mandatory=True) for k in kwargs: settings[k] = kwargs[k] @@ -100,7 +95,7 @@ def test_rng_access(): setup_dirs() - inject.add_injectable('rng_base_seed', 0) + inject.add_injectable("rng_base_seed", 0) pipeline.open_pipeline() @@ -118,12 +113,15 @@ def regress_mini_auto(): # should be the same results as in run_mp (multiprocessing) test case hh_ids = [1099626, 1173905, 1196298, 1286259] choices = [1, 1, 0, 0] - expected_choice = pd.Series(choices, index=pd.Index(hh_ids, name="household_id"), - name='auto_ownership') + expected_choice = pd.Series( + choices, index=pd.Index(hh_ids, name="household_id"), name="auto_ownership" + ) auto_choice = pipeline.get_table("households").sort_index().auto_ownership - offset = HOUSEHOLDS_SAMPLE_SIZE // 2 # choose something midway as hh_id ordered by hh size + offset = ( + HOUSEHOLDS_SAMPLE_SIZE // 2 + ) # choose something midway as hh_id ordered by hh size print("auto_choice\n%s" % auto_choice.head(offset).tail(4)) auto_choice = auto_choice.reindex(hh_ids) @@ -146,11 +144,14 @@ def regress_mini_mtf(): # these choices are for pure regression - their appropriateness has not been checked per_ids = [2566701, 2566702, 3061895] - choices = ['school1', 'school1', 'work1'] - expected_choice = pd.Series(choices, index=pd.Index(per_ids, name='person_id'), - name='mandatory_tour_frequency') + choices = ["school1", "school1", "work1"] + expected_choice = pd.Series( + choices, + index=pd.Index(per_ids, name="person_id"), + name="mandatory_tour_frequency", + ) - mtf_choice = mtf_choice[mtf_choice != ''] # drop null (empty string) choices + mtf_choice = mtf_choice[mtf_choice != ""] # drop null (empty string) choices offset = len(mtf_choice) // 2 # choose something midway as hh_id ordered by hh size print("mtf_choice\n%s" % mtf_choice.head(offset).tail(3)) @@ -163,7 +164,9 @@ def regress_mini_mtf(): 3061895 work1 Name: mandatory_tour_frequency, dtype: object """ - pdt.assert_series_equal(mtf_choice.reindex(per_ids), expected_choice, check_dtype=False) + pdt.assert_series_equal( + mtf_choice.reindex(per_ids), expected_choice, check_dtype=False + ) def regress_mini_location_choice_logsums(): @@ -171,36 +174,36 @@ def regress_mini_location_choice_logsums(): persons = pipeline.get_table("persons") # DEST_CHOICE_LOGSUM_COLUMN_NAME is specified in school_location.yaml and should be assigned - assert 'school_location_logsum' in persons + assert "school_location_logsum" in persons assert not persons.school_location_logsum.isnull().all() # DEST_CHOICE_LOGSUM_COLUMN_NAME is NOT specified in workplace_location.yaml - assert 'workplace_location_logsum' not in persons + assert "workplace_location_logsum" not in persons def test_mini_pipeline_run(): setup_dirs() - inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, - write_skim_cache=True - ) + inject_settings( + households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, write_skim_cache=True + ) _MODELS = [ - 'initialize_landuse', - 'compute_accessibility', - 'initialize_households', - 'school_location', - 'workplace_location', - 'auto_ownership_simulate' + "initialize_landuse", + "compute_accessibility", + "initialize_households", + "school_location", + "workplace_location", + "auto_ownership_simulate", ] pipeline.run(models=_MODELS, resume_after=None) regress_mini_auto() - pipeline.run_model('cdap_simulate') - pipeline.run_model('mandatory_tour_frequency') + pipeline.run_model("cdap_simulate") + pipeline.run_model("mandatory_tour_frequency") regress_mini_mtf() regress_mini_location_choice_logsums() @@ -217,7 +220,7 @@ def test_mini_pipeline_run(): # should create optional workplace_location_sample table workplace_location_sample_df = pipeline.get_table("workplace_location_sample") - assert 'mode_choice_logsum' in workplace_location_sample_df + assert "mode_choice_logsum" in workplace_location_sample_df pipeline.close_pipeline() inject.clear_cache() @@ -232,8 +235,7 @@ def test_mini_pipeline_run2(): setup_dirs() - inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, - read_skim_cache=True) + inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, read_skim_cache=True) # should be able to get this BEFORE pipeline is opened checkpoints_df = pipeline.get_checkpoints() @@ -242,18 +244,18 @@ def test_mini_pipeline_run2(): # print "checkpoints_df\n%s" % checkpoints_df[['checkpoint_name']] assert prev_checkpoint_count == 9 - pipeline.open_pipeline('auto_ownership_simulate') + pipeline.open_pipeline("auto_ownership_simulate") regress_mini_auto() # try to run a model already in pipeline with pytest.raises(RuntimeError) as excinfo: - pipeline.run_model('auto_ownership_simulate') + pipeline.run_model("auto_ownership_simulate") assert "run model 'auto_ownership_simulate' more than once" in str(excinfo.value) # and these new ones - pipeline.run_model('cdap_simulate') - pipeline.run_model('mandatory_tour_frequency') + pipeline.run_model("cdap_simulate") + pipeline.run_model("mandatory_tour_frequency") regress_mini_mtf() @@ -264,9 +266,9 @@ def test_mini_pipeline_run2(): # - write list of override_hh_ids to override_hh_ids.csv in data for use in next test num_hh_ids = 10 hh_ids = pipeline.get_table("households").head(num_hh_ids).index.values - hh_ids = pd.DataFrame({'household_id': hh_ids}) + hh_ids = pd.DataFrame({"household_id": hh_ids}) - hh_ids_path = config.data_file_path('override_hh_ids.csv') + hh_ids_path = config.data_file_path("override_hh_ids.csv") hh_ids.to_csv(hh_ids_path, index=False, header=True) pipeline.close_pipeline() @@ -279,11 +281,11 @@ def test_mini_pipeline_run3(): # test that hh_ids setting overrides household sampling setup_dirs() - inject_settings(hh_ids='override_hh_ids.csv') + inject_settings(hh_ids="override_hh_ids.csv") - households = inject.get_table('households').to_frame() + households = inject.get_table("households").to_frame() - override_hh_ids = pd.read_csv(config.data_file_path('override_hh_ids.csv')) + override_hh_ids = pd.read_csv(config.data_file_path("override_hh_ids.csv")) print("\noverride_hh_ids\n%s" % override_hh_ids) @@ -296,9 +298,14 @@ def test_mini_pipeline_run3(): close_handlers() -def full_run(resume_after=None, chunk_size=0, - households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, - trace_hh_id=None, trace_od=None, check_for_variability=None): +def full_run( + resume_after=None, + chunk_size=0, + households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, + trace_hh_id=None, + trace_od=None, + check_for_variability=None, +): setup_dirs() @@ -310,15 +317,16 @@ def full_run(resume_after=None, chunk_size=0, testing_fail_trip_destination=False, check_for_variability=check_for_variability, want_dest_choice_sample_tables=False, - use_shadow_pricing=False) # shadow pricing breaks replicability when sample_size varies + use_shadow_pricing=False, + ) # shadow pricing breaks replicability when sample_size varies # FIXME should enable testing_fail_trip_destination? - MODELS = settings['models'] + MODELS = settings["models"] pipeline.run(models=MODELS, resume_after=resume_after) - tours = pipeline.get_table('tours') + tours = pipeline.get_table("tours") tour_count = len(tours.index) return tour_count @@ -349,11 +357,10 @@ def get_trace_csv(file_name): def regress_tour_modes(tours_df): - mode_cols = ['tour_mode', 'person_id', 'tour_type', - 'tour_num', 'tour_category'] + mode_cols = ["tour_mode", "person_id", "tour_type", "tour_num", "tour_category"] tours_df = tours_df[tours_df.household_id == HH_ID] - tours_df = tours_df.sort_values(by=['person_id', 'tour_category', 'tour_num']) + tours_df = tours_df.sort_values(by=["person_id", "tour_category", "tour_num"]) print("mode_df\n%s" % tours_df[mode_cols]) @@ -375,25 +382,18 @@ def regress_tour_modes(tours_df): 325052, 325052, 325052, - ] - - EXPECT_TOUR_TYPES = [ - 'othdiscr', - 'work', - 'work', - 'business', - 'work', - 'othmaint' ] + EXPECT_TOUR_TYPES = ["othdiscr", "work", "work", "business", "work", "othmaint"] + EXPECT_MODES = [ - 'WALK', - 'WALK', - 'SHARED3FREE', - 'WALK', - 'WALK_LOC', - 'WALK', - ] + "WALK", + "WALK", + "SHARED3FREE", + "WALK", + "WALK_LOC", + "WALK", + ] assert len(tours_df) == len(EXPECT_PERSON_IDS) assert (tours_df.person_id.values == EXPECT_PERSON_IDS).all() @@ -403,9 +403,9 @@ def regress_tour_modes(tours_df): def regress(): - persons_df = pipeline.get_table('persons') + persons_df = pipeline.get_table("persons") persons_df = persons_df[persons_df.household_id == HH_ID] - print("persons_df\n%s" % persons_df[['value_of_time', 'distance_to_work']]) + print("persons_df\n%s" % persons_df[["value_of_time", "distance_to_work"]]) """ persons_df @@ -415,7 +415,7 @@ def regress(): 3249923 23.349532 0.62 """ - tours_df = pipeline.get_table('tours') + tours_df = pipeline.get_table("tours") regress_tour_modes(tours_df) @@ -423,16 +423,27 @@ def regress(): assert not tours_df.tour_mode.isnull().any() # optional logsum column was added to all tours except mandatory - assert 'destination_logsum' in tours_df - if (tours_df.destination_logsum.isnull() != (tours_df.tour_category == 'mandatory')).any(): - print(tours_df[(tours_df.destination_logsum.isnull() != (tours_df.tour_category == 'mandatory'))]) - assert (tours_df.destination_logsum.isnull() == (tours_df.tour_category == 'mandatory')).all() + assert "destination_logsum" in tours_df + if ( + tours_df.destination_logsum.isnull() != (tours_df.tour_category == "mandatory") + ).any(): + print( + tours_df[ + ( + tours_df.destination_logsum.isnull() + != (tours_df.tour_category == "mandatory") + ) + ] + ) + assert ( + tours_df.destination_logsum.isnull() == (tours_df.tour_category == "mandatory") + ).all() # mode choice logsum calculated for all tours - assert 'mode_choice_logsum' in tours_df + assert "mode_choice_logsum" in tours_df assert not tours_df.mode_choice_logsum.isnull().any() - trips_df = pipeline.get_table('trips') + trips_df = pipeline.get_table("trips") assert trips_df.shape[0] > 0 assert not trips_df.purpose.isnull().any() assert not trips_df.depart.isnull().any() @@ -442,17 +453,17 @@ def regress(): assert not trips_df.mode_choice_logsum.isnull().any() # should be at least two tours per trip - assert trips_df.shape[0] >= 2*tours_df.shape[0] + assert trips_df.shape[0] >= 2 * tours_df.shape[0] # write_trip_matrices - trip_matrices_file = config.output_file_path('trips_md.omx') + trip_matrices_file = config.output_file_path("trips_md.omx") assert os.path.exists(trip_matrices_file) trip_matrices = omx.open_file(trip_matrices_file) assert trip_matrices.shape() == (25, 25) - assert 'WALK_MD' in trip_matrices.list_matrices() - walk_trips = np.array(trip_matrices['WALK_MD']) - assert walk_trips.dtype == np.dtype('float64') + assert "WALK_MD" in trip_matrices.list_matrices() + walk_trips = np.array(trip_matrices["WALK_MD"]) + assert walk_trips.dtype == np.dtype("float64") trip_matrices.close() @@ -462,13 +473,17 @@ def test_full_run1(): if SKIP_FULL_RUN: return - tour_count = full_run(trace_hh_id=HH_ID, check_for_variability=True, - households_sample_size=HOUSEHOLDS_SAMPLE_SIZE) + tour_count = full_run( + trace_hh_id=HH_ID, + check_for_variability=True, + households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, + ) print("tour_count", tour_count) - assert(tour_count == EXPECT_TOUR_COUNT), \ - "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) + assert ( + tour_count == EXPECT_TOUR_COUNT + ), "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) regress() @@ -482,10 +497,13 @@ def test_full_run2(): if SKIP_FULL_RUN: return - tour_count = full_run(resume_after='non_mandatory_tour_scheduling', trace_hh_id=HH_ID) + tour_count = full_run( + resume_after="non_mandatory_tour_scheduling", trace_hh_id=HH_ID + ) - assert(tour_count == EXPECT_TOUR_COUNT), \ - "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) + assert ( + tour_count == EXPECT_TOUR_COUNT + ), "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) regress() @@ -499,12 +517,15 @@ def test_full_run3_with_chunks(): if SKIP_FULL_RUN: return - tour_count = full_run(trace_hh_id=HH_ID, - households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, - chunk_size=500000) + tour_count = full_run( + trace_hh_id=HH_ID, + households_sample_size=HOUSEHOLDS_SAMPLE_SIZE, + chunk_size=500000, + ) - assert(tour_count == EXPECT_TOUR_COUNT), \ - "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) + assert ( + tour_count == EXPECT_TOUR_COUNT + ), "EXPECT_TOUR_COUNT %s but got tour_count %s" % (EXPECT_TOUR_COUNT, tour_count) regress() @@ -518,8 +539,9 @@ def test_full_run4_stability(): if SKIP_FULL_RUN: return - tour_count = full_run(trace_hh_id=HH_ID, - households_sample_size=HOUSEHOLDS_SAMPLE_SIZE-10) + tour_count = full_run( + trace_hh_id=HH_ID, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE - 10 + ) regress() @@ -535,9 +557,7 @@ def test_full_run5_singleton(): if SKIP_FULL_RUN: return - tour_count = full_run(trace_hh_id=HH_ID, - households_sample_size=1, - chunk_size=1) + tour_count = full_run(trace_hh_id=HH_ID, households_sample_size=1, chunk_size=1) regress() @@ -547,6 +567,7 @@ def test_full_run5_singleton(): if __name__ == "__main__": from activitysim import abm # register injectables + print("running test_full_run1") test_full_run1() # teardown_function(None) diff --git a/activitysim/cli/__init__.py b/activitysim/cli/__init__.py index 21c677dcc2..70203a8397 100644 --- a/activitysim/cli/__init__.py +++ b/activitysim/cli/__init__.py @@ -1,3 +1,2 @@ +from . import create, run from .cli import CLI -from . import create -from . import run diff --git a/activitysim/cli/cli.py b/activitysim/cli/cli.py index 1eff88406f..fa3a2afea8 100644 --- a/activitysim/cli/cli.py +++ b/activitysim/cli/cli.py @@ -7,15 +7,16 @@ def __init__(self, version, description): self.description = description self.parser = argparse.ArgumentParser(description=self.description) - self.parser.add_argument('--version', '-V', - action='version', - version=self.version) + self.parser.add_argument( + "--version", "-V", action="version", version=self.version + ) # print help if no subcommand is provided self.parser.set_defaults(func=lambda x: self.parser.print_help()) - self.subparsers = self.parser.add_subparsers(title='subcommands', - help='available subcommand options') + self.subparsers = self.parser.add_subparsers( + title="subcommands", help="available subcommand options" + ) def add_subcommand(self, name, args_func, exec_func, description): subparser = self.subparsers.add_parser(name, description=description) diff --git a/activitysim/cli/create.py b/activitysim/cli/create.py index a0e9f638d1..f83da6eec0 100644 --- a/activitysim/cli/create.py +++ b/activitysim/cli/create.py @@ -1,14 +1,15 @@ +import glob import os -import sys -import requests import shutil -import glob +import sys + import pkg_resources +import requests import yaml -PACKAGE = 'activitysim' -EXAMPLES_DIR = 'examples' -MANIFEST = 'example_manifest.yaml' +PACKAGE = "activitysim" +EXAMPLES_DIR = "examples" +MANIFEST = "example_manifest.yaml" def _example_path(resource): @@ -19,11 +20,11 @@ def _example_path(resource): def _load_manifest(): - with open(_example_path(MANIFEST), 'r') as f: + with open(_example_path(MANIFEST), "r") as f: manifest = yaml.safe_load(f.read()) - assert manifest, f'error: could not load {MANIFEST}' - return {example['name']: example for example in manifest} + assert manifest, f"error: could not load {MANIFEST}" + return {example["name"]: example for example in manifest} EXAMPLES = _load_manifest() @@ -33,19 +34,24 @@ def add_create_args(parser): """Create command args """ create_group = parser.add_mutually_exclusive_group(required=True) - create_group.add_argument('-l', '--list', - action='store_true', - help='list available example directories and exit') - create_group.add_argument('-e', '--example', - type=str, - metavar='PATH', - help='example directory to copy') - - parser.add_argument('-d', '--destination', - type=str, - metavar='PATH', - default=os.getcwd(), - help="path to new project directory (default: %(default)s)") + create_group.add_argument( + "-l", + "--list", + action="store_true", + help="list available example directories and exit", + ) + create_group.add_argument( + "-e", "--example", type=str, metavar="PATH", help="example directory to copy" + ) + + parser.add_argument( + "-d", + "--destination", + type=str, + metavar="PATH", + default=os.getcwd(), + help="path to new project directory (default: %(default)s)", + ) def create(args): @@ -69,11 +75,11 @@ def create(args): def list_examples(): - print('*** Available examples ***\n') + print("*** Available examples ***\n") ret = [] for example in list(EXAMPLES.values()): - del example['include'] + del example["include"] ret.append(example) print(yaml.dump(example)) @@ -111,7 +117,7 @@ def get_example(example_name, destination): example = EXAMPLES[example_name] - for item in example.get('include', []): + for item in example.get("include", []): # split include string into source/destination paths items = item.split() @@ -121,23 +127,23 @@ def get_example(example_name, destination): else: target_path = dest_path - if assets.startswith('http'): + if assets.startswith("http"): download_asset(assets, target_path) else: for asset_path in glob.glob(_example_path(assets)): copy_asset(asset_path, target_path) - print(f'copied! new project files are in {os.path.abspath(dest_path)}') + print(f"copied! new project files are in {os.path.abspath(dest_path)}") - instructions = example.get('instructions') + instructions = example.get("instructions") if instructions: print(instructions) def copy_asset(asset_path, target_path): - print(f'copying {os.path.basename(asset_path)} ...') + print(f"copying {os.path.basename(asset_path)} ...") if os.path.isdir(asset_path): target_path = os.path.join(target_path, os.path.basename(asset_path)) shutil.copytree(asset_path, target_path) @@ -148,9 +154,9 @@ def copy_asset(asset_path, target_path): def download_asset(url, target_path): - print(f'downloading {os.path.basename(target_path)} ...') + print(f"downloading {os.path.basename(target_path)} ...") with requests.get(url, stream=True) as r: r.raise_for_status() - with open(target_path, 'wb') as f: + with open(target_path, "wb") as f: for chunk in r.iter_content(chunk_size=None): f.write(chunk) diff --git a/activitysim/cli/main.py b/activitysim/cli/main.py index 113e8b9d68..0bae61a6b8 100644 --- a/activitysim/cli/main.py +++ b/activitysim/cli/main.py @@ -1,21 +1,21 @@ import sys -from activitysim.cli import CLI -from activitysim.cli import run -from activitysim.cli import create - -from activitysim import __version__, __doc__ +from activitysim import __doc__, __version__ +from activitysim.cli import CLI, create, run def main(): - asim = CLI(version=__version__, - description=__doc__) - asim.add_subcommand(name='run', - args_func=run.add_run_args, - exec_func=run.run, - description=run.run.__doc__) - asim.add_subcommand(name='create', - args_func=create.add_create_args, - exec_func=create.create, - description=create.create.__doc__) + asim = CLI(version=__version__, description=__doc__) + asim.add_subcommand( + name="run", + args_func=run.add_run_args, + exec_func=run.run, + description=run.run.__doc__, + ) + asim.add_subcommand( + name="create", + args_func=create.add_create_args, + exec_func=create.create, + description=create.create.__doc__, + ) sys.exit(asim.execute()) diff --git a/activitysim/cli/run.py b/activitysim/cli/run.py index a7fdd8fc4a..f667608f38 100644 --- a/activitysim/cli/run.py +++ b/activitysim/cli/run.py @@ -1,74 +1,76 @@ # ActivitySim # See full license in LICENSE.txt. -import sys -import os -import logging import argparse +import logging +import os +import sys import warnings import numpy as np -from activitysim.core import inject -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import mem -from activitysim.core import chunk +from activitysim.core import chunk, config, inject, mem, pipeline, tracing logger = logging.getLogger(__name__) -INJECTABLES = ['data_dir', 'configs_dir', 'output_dir', 'settings_file_name'] +INJECTABLES = ["data_dir", "configs_dir", "output_dir", "settings_file_name"] def add_run_args(parser, multiprocess=True): """Run command args """ - parser.add_argument('-w', '--working_dir', - type=str, - metavar='PATH', - help='path to example/project directory (default: %s)' % os.getcwd()) - parser.add_argument('-c', '--config', - type=str, - action='append', - metavar='PATH', - help='path to config dir') - parser.add_argument('-o', '--output', - type=str, - metavar='PATH', - help='path to output dir') - parser.add_argument('-d', '--data', - type=str, - action='append', - metavar='PATH', - help='path to data dir') - parser.add_argument('-r', '--resume', - type=str, - metavar='STEPNAME', - help='resume after step') - parser.add_argument('-p', '--pipeline', - type=str, - metavar='FILE', - help='pipeline file name') - parser.add_argument('-s', '--settings_file', - type=str, - metavar='FILE', - help='settings file name') - parser.add_argument('-g', '--chunk_size', - type=int, - metavar='BYTES', - help='chunk size') + parser.add_argument( + "-w", + "--working_dir", + type=str, + metavar="PATH", + help="path to example/project directory (default: %s)" % os.getcwd(), + ) + parser.add_argument( + "-c", + "--config", + type=str, + action="append", + metavar="PATH", + help="path to config dir", + ) + parser.add_argument( + "-o", "--output", type=str, metavar="PATH", help="path to output dir" + ) + parser.add_argument( + "-d", + "--data", + type=str, + action="append", + metavar="PATH", + help="path to data dir", + ) + parser.add_argument( + "-r", "--resume", type=str, metavar="STEPNAME", help="resume after step" + ) + parser.add_argument( + "-p", "--pipeline", type=str, metavar="FILE", help="pipeline file name" + ) + parser.add_argument( + "-s", "--settings_file", type=str, metavar="FILE", help="settings file name" + ) + parser.add_argument( + "-g", "--chunk_size", type=int, metavar="BYTES", help="chunk size" + ) if multiprocess: - parser.add_argument('-m', '--multiprocess', - default=False, - const=-1, - metavar='(N)', - nargs='?', - type=int, - help='run multiprocess. Adds configs_mp settings' - ' by default. Optionally give a number of processes,' - ' which will override the settings file.') + parser.add_argument( + "-m", + "--multiprocess", + default=False, + const=-1, + metavar="(N)", + nargs="?", + type=int, + help="run multiprocess. Adds configs_mp settings" + " by default. Optionally give a number of processes," + " which will override the settings file.", + ) def validate_injectable(name): @@ -77,9 +79,11 @@ def validate_injectable(name): except RuntimeError: # injectable is missing, meaning is hasn't been explicitly set # and defaults cannot be found. - sys.exit('Error: please specify either a --working_dir ' - "containing 'configs', 'data', and 'output' folders " - 'or all three of --config, --data, and --output') + sys.exit( + "Error: please specify either a --working_dir " + "containing 'configs', 'data', and 'output' folders " + "or all three of --config, --data, and --output" + ) dir_paths = [dir_paths] if isinstance(dir_paths, str) else dir_paths @@ -91,7 +95,6 @@ def validate_injectable(name): def handle_standard_args(args, multiprocess=True): - def inject_arg(name, value): assert name in INJECTABLES inject.add_injectable(name, value) @@ -102,54 +105,54 @@ def inject_arg(name, value): os.chdir(args.working_dir) if args.settings_file: - inject_arg('settings_file_name', args.settings_file) + inject_arg("settings_file_name", args.settings_file) if args.config: - inject_arg('configs_dir', args.config) + inject_arg("configs_dir", args.config) if args.data: - inject_arg('data_dir', args.data) + inject_arg("data_dir", args.data) if args.output: - inject_arg('output_dir', args.output) + inject_arg("output_dir", args.output) if multiprocess and args.multiprocess: - config_paths = validate_injectable('configs_dir') + config_paths = validate_injectable("configs_dir") - if not os.path.exists('configs_mp'): + if not os.path.exists("configs_mp"): logger.warning("could not find 'configs_mp'. skipping...") else: logger.info("adding 'configs_mp' to config_dir list...") - config_paths.insert(0, 'configs_mp') - inject_arg('configs_dir', config_paths) + config_paths.insert(0, "configs_mp") + inject_arg("configs_dir", config_paths) - config.override_setting('multiprocess', True) + config.override_setting("multiprocess", True) if args.multiprocess > 0: - config.override_setting('num_processes', args.multiprocess) + config.override_setting("num_processes", args.multiprocess) if args.chunk_size: - config.override_setting('chunk_size', int(args.chunk_size)) + config.override_setting("chunk_size", int(args.chunk_size)) - for injectable in ['configs_dir', 'data_dir', 'output_dir']: + for injectable in ["configs_dir", "data_dir", "output_dir"]: validate_injectable(injectable) if args.pipeline: - inject.add_injectable('pipeline_file_name', args.pipeline) + inject.add_injectable("pipeline_file_name", args.pipeline) if args.resume: - config.override_setting('resume_after', args.resume) + config.override_setting("resume_after", args.resume) def cleanup_output_files(): tracing.delete_trace_files() - tracing.delete_output_files('h5') - tracing.delete_output_files('csv') - tracing.delete_output_files('txt') - tracing.delete_output_files('yaml') - tracing.delete_output_files('prof') - tracing.delete_output_files('omx') + tracing.delete_output_files("h5") + tracing.delete_output_files("csv") + tracing.delete_output_files("txt") + tracing.delete_output_files("yaml") + tracing.delete_output_files("prof") + tracing.delete_output_files("omx") def run(args): @@ -167,38 +170,45 @@ def run(args): # by default, assume we are running activitysim.abm # other callers (e.g. populationsim) will have to arrange to register their own steps and injectables # (presumably) in a custom run_simulation.py instead of using the 'activitysim run' command - if not inject.is_injectable('preload_injectables'): - from activitysim import abm # register abm steps and other abm-specific injectables + if not inject.is_injectable("preload_injectables"): + from activitysim import ( + abm, # register abm steps and other abm-specific injectables + ) tracing.config_logger(basic=True) handle_standard_args(args) # possibly update injectables # legacy support for run_list setting nested 'models' and 'resume_after' settings - if config.setting('run_list'): - warnings.warn("Support for 'run_list' settings group will be removed.\n" - "The run_list.steps setting is renamed 'models'.\n" - "The run_list.resume_after setting is renamed 'resume_after'.\n" - "Specify both 'models' and 'resume_after' directly in settings config file.", FutureWarning) - run_list = config.setting('run_list') - if 'steps' in run_list: - assert not config.setting('models'), \ - f"Don't expect 'steps' in run_list and 'models' as stand-alone setting!" - config.override_setting('models', run_list['steps']) - - if 'resume_after' in run_list: - assert not config.setting('resume_after'), \ - f"Don't expect 'resume_after' both in run_list and as stand-alone setting!" - config.override_setting('resume_after', run_list['resume_after']) + if config.setting("run_list"): + warnings.warn( + "Support for 'run_list' settings group will be removed.\n" + "The run_list.steps setting is renamed 'models'.\n" + "The run_list.resume_after setting is renamed 'resume_after'.\n" + "Specify both 'models' and 'resume_after' directly in settings config file.", + FutureWarning, + ) + run_list = config.setting("run_list") + if "steps" in run_list: + assert not config.setting( + "models" + ), f"Don't expect 'steps' in run_list and 'models' as stand-alone setting!" + config.override_setting("models", run_list["steps"]) + + if "resume_after" in run_list: + assert not config.setting( + "resume_after" + ), f"Don't expect 'resume_after' both in run_list and as stand-alone setting!" + config.override_setting("resume_after", run_list["resume_after"]) # If you provide a resume_after argument to pipeline.run # the pipeline manager will attempt to load checkpointed tables from the checkpoint store # and resume pipeline processing on the next submodel step after the specified checkpoint - resume_after = config.setting('resume_after', None) + resume_after = config.setting("resume_after", None) # cleanup if not resuming if not resume_after: cleanup_output_files() - elif config.setting('cleanup_trace_files_on_resume', False): + elif config.setting("cleanup_trace_files_on_resume", False): tracing.delete_trace_files() tracing.config_logger(basic=False) # update using possibly new logging configs @@ -206,60 +216,62 @@ def run(args): logging.captureWarnings(capture=True) # directories - for k in ['configs_dir', 'settings_file_name', 'data_dir', 'output_dir']: - logger.info('SETTING %s: %s' % (k, inject.get_injectable(k, None))) + for k in ["configs_dir", "settings_file_name", "data_dir", "output_dir"]: + logger.info("SETTING %s: %s" % (k, inject.get_injectable(k, None))) - log_settings = inject.get_injectable('log_settings', {}) + log_settings = inject.get_injectable("log_settings", {}) for k in log_settings: - logger.info('SETTING %s: %s' % (k, config.setting(k))) + logger.info("SETTING %s: %s" % (k, config.setting(k))) # OMP_NUM_THREADS: openmp # OPENBLAS_NUM_THREADS: openblas # MKL_NUM_THREADS: mkl - for env in ['MKL_NUM_THREADS', 'OMP_NUM_THREADS', 'OPENBLAS_NUM_THREADS']: + for env in ["MKL_NUM_THREADS", "OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS"]: logger.info(f"ENV {env}: {os.getenv(env)}") np_info_keys = [ - 'atlas_blas_info', - 'atlas_blas_threads_info', - 'atlas_info', - 'atlas_threads_info', - 'blas_info', - 'blas_mkl_info', - 'blas_opt_info', - 'lapack_info', - 'lapack_mkl_info', - 'lapack_opt_info', - 'mkl_info'] + "atlas_blas_info", + "atlas_blas_threads_info", + "atlas_info", + "atlas_threads_info", + "blas_info", + "blas_mkl_info", + "blas_opt_info", + "lapack_info", + "lapack_mkl_info", + "lapack_opt_info", + "mkl_info", + ] for cfg_key in np_info_keys: info = np.__config__.get_info(cfg_key) if info: - for info_key in ['libraries']: + for info_key in ["libraries"]: if info_key in info: logger.info(f"NUMPY {cfg_key} {info_key}: {info[info_key]}") t0 = tracing.print_elapsed_time() try: - if config.setting('multiprocess', False): - logger.info('run multiprocess simulation') + if config.setting("multiprocess", False): + logger.info("run multiprocess simulation") from activitysim.core import mp_tasks + injectables = {k: inject.get_injectable(k) for k in INJECTABLES} mp_tasks.run_multiprocess(injectables) assert not pipeline.is_open() - if config.setting('cleanup_pipeline_after_run', False): + if config.setting("cleanup_pipeline_after_run", False): pipeline.cleanup_pipeline() else: - logger.info('run single process simulation') + logger.info("run single process simulation") - pipeline.run(models=config.setting('models'), resume_after=resume_after) + pipeline.run(models=config.setting("models"), resume_after=resume_after) - if config.setting('cleanup_pipeline_after_run', False): + if config.setting("cleanup_pipeline_after_run", False): pipeline.cleanup_pipeline() # has side effect of closing open pipeline else: pipeline.close_pipeline() @@ -267,19 +279,19 @@ def run(args): mem.log_global_hwm() # main process except Exception: # log time until error and the error traceback - tracing.print_elapsed_time('all models until this error', t0) - logger.exception('activitysim run encountered an unrecoverable error') + tracing.print_elapsed_time("all models until this error", t0) + logger.exception("activitysim run encountered an unrecoverable error") raise chunk.consolidate_logs() mem.consolidate_logs() - tracing.print_elapsed_time('all models', t0) + tracing.print_elapsed_time("all models", t0) return 0 -if __name__ == '__main__': +if __name__ == "__main__": from activitysim import abm # register injectables @@ -287,5 +299,5 @@ def run(args): add_run_args(parser) args = parser.parse_args() - parser.parse_args(['--sum', '7', '-1', '42']) + parser.parse_args(["--sum", "7", "-1", "42"]) sys.exit(run(args)) diff --git a/activitysim/cli/test/test_cli.py b/activitysim/cli/test/test_cli.py index b96044a3a0..3cc176b646 100644 --- a/activitysim/cli/test/test_cli.py +++ b/activitysim/cli/test/test_cli.py @@ -1,56 +1,58 @@ # ActivitySim # See full license in LICENSE.txt. import os -import subprocess import shutil +import subprocess import sys + import pytest if sys.version_info < (3, 7): - pytest.skip('capture_output introduced in Python 3.7', allow_module_level=True) + pytest.skip("capture_output introduced in Python 3.7", allow_module_level=True) def test_help(): # cp = completed process - cp = subprocess.run(['activitysim', '-h'], capture_output=True) + cp = subprocess.run(["activitysim", "-h"], capture_output=True) - assert 'usage: activitysim [-h] [--version]' in str(cp.stdout) + assert "usage: activitysim [-h] [--version]" in str(cp.stdout) def test_create_help(): - cp = subprocess.run(['activitysim', 'create', '-h'], capture_output=True) + cp = subprocess.run(["activitysim", "create", "-h"], capture_output=True) - assert 'usage: activitysim create [-h] (-l | -e PATH) [-d PATH]' in str(cp.stdout) + assert "usage: activitysim create [-h] (-l | -e PATH) [-d PATH]" in str(cp.stdout) def test_create_list(): - cp = subprocess.run(['activitysim', 'create', '--list'], capture_output=True) + cp = subprocess.run(["activitysim", "create", "--list"], capture_output=True) - assert 'Available examples' in str(cp.stdout) + assert "Available examples" in str(cp.stdout) assert "name: example_mtc" in str(cp.stdout) assert "name: example_test" in str(cp.stdout) def test_create_copy(): - target = os.path.join(os.path.dirname(__file__), 'test_example') - cp = subprocess.run(['activitysim', 'create', - '--example', 'example_test', - '--destination', target], capture_output=True) + target = os.path.join(os.path.dirname(__file__), "test_example") + cp = subprocess.run( + ["activitysim", "create", "--example", "example_test", "--destination", target], + capture_output=True, + ) - assert 'copying data ...' in str(cp.stdout) - assert 'copying configs ...' in str(cp.stdout) - assert 'copying configs_mp ...' in str(cp.stdout) - assert 'copying output ...' in str(cp.stdout) + assert "copying data ..." in str(cp.stdout) + assert "copying configs ..." in str(cp.stdout) + assert "copying configs_mp ..." in str(cp.stdout) + assert "copying output ..." in str(cp.stdout) # replace slashes on windows assert str(target).replace("\\\\", "\\") in str(cp.stdout).replace("\\\\", "\\") assert os.path.exists(target) - for folder in ['configs', 'configs_mp', 'data', 'output']: + for folder in ["configs", "configs_mp", "data", "output"]: assert os.path.isdir(os.path.join(target, folder)) # clean up @@ -60,19 +62,19 @@ def test_create_copy(): def test_run(): - cp = subprocess.run(['activitysim', 'run'], capture_output=True) + cp = subprocess.run(["activitysim", "run"], capture_output=True) msg = ( - 'Error: please specify either a --working_dir ' + "Error: please specify either a --working_dir " "containing 'configs', 'data', and 'output' " - 'folders or all three of --config, --data, and --output' + "folders or all three of --config, --data, and --output" ) # expect error assert msg in str(cp.stderr) -if __name__ == '__main__': +if __name__ == "__main__": test_help() test_create_help() diff --git a/activitysim/core/assign.py b/activitysim/core/assign.py index afa8cc46dd..7b76d326a5 100644 --- a/activitysim/core/assign.py +++ b/activitysim/core/assign.py @@ -1,18 +1,13 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import zip -from builtins import object - import logging +from builtins import object, zip from collections import OrderedDict import numpy as np import pandas as pd -from activitysim.core import util -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import chunk +from activitysim.core import chunk, config, pipeline, util logger = logging.getLogger(__name__) @@ -65,10 +60,12 @@ def evaluate_constants(expressions, constants): return d -def read_assignment_spec(file_name, - description_name="Description", - target_name="Target", - expression_name="Expression"): +def read_assignment_spec( + file_name, + description_name="Description", + target_name="Target", + expression_name="Expression", +): """ Read a CSV model specification into a Pandas DataFrame or Series. @@ -97,7 +94,7 @@ def read_assignment_spec(file_name, """ try: - cfg = pd.read_csv(file_name, comment='#') + cfg = pd.read_csv(file_name, comment="#") except Exception as e: logger.error(f"Error reading spec file: {file_name}") logger.error(str(e)) @@ -106,14 +103,18 @@ def read_assignment_spec(file_name, # drop null expressions # cfg = cfg.dropna(subset=[expression_name]) - cfg.rename(columns={target_name: 'target', - expression_name: 'expression', - description_name: 'description'}, - inplace=True) + cfg.rename( + columns={ + target_name: "target", + expression_name: "expression", + description_name: "description", + }, + inplace=True, + ) # backfill description - if 'description' not in cfg.columns: - cfg.description = '' + if "description" not in cfg.columns: + cfg.description = "" cfg.target = cfg.target.str.strip() cfg.expression = cfg.expression.str.strip() @@ -124,12 +125,14 @@ def read_assignment_spec(file_name, class NumpyLogger(object): def __init__(self, logger): self.logger = logger - self.target = '' - self.expression = '' + self.target = "" + self.expression = "" def write(self, msg): - self.logger.warning("numpy: %s expression: %s = %s" % - (msg.rstrip(), str(self.target), str(self.expression))) + self.logger.warning( + "numpy: %s expression: %s = %s" + % (msg.rstrip(), str(self.target), str(self.expression)) + ) def local_utilities(): @@ -143,13 +146,13 @@ def local_utilities(): """ utility_dict = { - 'pd': pd, - 'np': np, - 'reindex': util.reindex, - 'reindex_i': util.reindex_i, - 'setting': config.setting, - 'other_than': util.other_than, - 'rng': pipeline.get_rn_generator(), + "pd": pd, + "np": np, + "reindex": util.reindex, + "reindex_i": util.reindex_i, + "setting": config.setting, + "other_than": util.other_than, + "rng": pipeline.get_rn_generator(), } utility_dict.update(config.get_global_constants()) @@ -158,19 +161,26 @@ def local_utilities(): def is_throwaway(target): - return target == '_' + return target == "_" def is_temp_scalar(target): - return target.startswith('_') and target.isupper() + return target.startswith("_") and target.isupper() def is_temp(target): - return target.startswith('_') - - -def assign_variables(assignment_expressions, df, locals_dict, df_alias=None, - trace_rows=None, trace_label=None, chunk_log=None): + return target.startswith("_") + + +def assign_variables( + assignment_expressions, + df, + locals_dict, + df_alias=None, + trace_rows=None, + trace_label=None, + chunk_log=None, +): """ Evaluate a set of variable expressions from a spec in the context of a given data table. @@ -235,7 +245,7 @@ def to_series(x): if df_alias: _locals_dict[df_alias] = df else: - _locals_dict['df'] = df + _locals_dict["df"] = df local_keys = list(_locals_dict.keys()) # build a dataframe of eval results for non-temp targets @@ -248,12 +258,15 @@ def to_series(x): for e in zip(assignment_expressions.target, assignment_expressions.expression): target, expression = e - assert isinstance(target, str), \ - "expected target '%s' for expression '%s' to be string not %s" % \ - (target, expression, type(target)) + assert isinstance(target, str), ( + "expected target '%s' for expression '%s' to be string not %s" + % (target, expression, type(target)) + ) if target in local_keys: - logger.warning("assign_variables target obscures local_d name '%s'", str(target)) + logger.warning( + "assign_variables target obscures local_d name '%s'", str(target) + ) if trace_label: logger.debug(f"{trace_label}.assign_variables {target} = {expression}") @@ -262,14 +275,20 @@ def to_series(x): try: x = eval(expression, globals(), _locals_dict) except Exception as err: - logger.error("assign_variables error: %s: %s", type(err).__name__, str(err)) - logger.error("assign_variables expression: %s = %s", str(target), str(expression)) + logger.error( + "assign_variables error: %s: %s", type(err).__name__, str(err) + ) + logger.error( + "assign_variables expression: %s = %s", str(target), str(expression) + ) raise err if not is_throwaway(target): _locals_dict[target] = x if trace_assigned_locals is not None: - trace_assigned_locals[uniquify_key(trace_assigned_locals, target)] = x + trace_assigned_locals[ + uniquify_key(trace_assigned_locals, target) + ] = x continue @@ -279,7 +298,7 @@ def to_series(x): np_logger.target = str(target) np_logger.expression = str(expression) saved_handler = np.seterrcall(np_logger) - save_err = np.seterr(all='log') + save_err = np.seterr(all="log") # FIXME should whitelist globals for security? globals_dict = {} @@ -294,7 +313,9 @@ def to_series(x): # raise err except Exception as err: - logger.exception(f"assign_variables - {type(err).__name__} ({str(err)}) evaluating: {str(expression)}") + logger.exception( + f"assign_variables - {type(err).__name__} ({str(err)}) evaluating: {str(expression)}" + ) raise err if trace_results is not None: @@ -321,11 +342,11 @@ def to_series(x): assert variables, "No non-temp variables were assigned." if chunk_log: - chunk.log_df(trace_label, 'temps', temps) - chunk.log_df(trace_label, 'variables', variables) + chunk.log_df(trace_label, "temps", temps) + chunk.log_df(trace_label, "variables", variables) # these are going away - let caller log result df - chunk.log_df(trace_label, 'temps', None) - chunk.log_df(trace_label, 'variables', None) + chunk.log_df(trace_label, "temps", None) + chunk.log_df(trace_label, "variables", None) # we stored result in dict - convert to df variables = util.df_from_dict(variables, index=df.index) diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index 259a9d026b..effbbba1e7 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -13,10 +13,7 @@ import numpy as np import pandas as pd -from . import config -from . import mem -from . import tracing -from . import util +from . import config, mem, tracing, util from .util import GB logger = logging.getLogger(__name__) @@ -25,11 +22,11 @@ # CHUNK_METHODS and METRICS # -RSS = 'rss' -USS = 'uss' -BYTES = 'bytes' -HYBRID_RSS = 'hybrid_rss' -HYBRID_USS = 'hybrid_uss' +RSS = "rss" +USS = "uss" +BYTES = "bytes" +HYBRID_RSS = "hybrid_rss" +HYBRID_USS = "hybrid_uss" METRICS = [RSS, USS, BYTES] CHUNK_METHODS = [RSS, USS, BYTES, HYBRID_RSS, HYBRID_USS] @@ -75,10 +72,10 @@ assuming there is abundant RAM. """ -MODE_RETRAIN = 'training' -MODE_ADAPTIVE = 'adaptive' -MODE_PRODUCTION = 'production' -MODE_CHUNKLESS = 'disabled' +MODE_RETRAIN = "training" +MODE_ADAPTIVE = "adaptive" +MODE_PRODUCTION = "production" +MODE_CHUNKLESS = "disabled" TRAINING_MODES = [MODE_RETRAIN, MODE_ADAPTIVE, MODE_PRODUCTION, MODE_CHUNKLESS] # @@ -90,26 +87,31 @@ LOG_SUBCHUNK_HISTORY = False # only useful for debugging WRITE_SUBCHUNK_HISTORY = False # only useful for debugging -DEFAULT_INITIAL_ROWS_PER_CHUNK = 100 # fallback for default_initial_rows_per_chunk setting +DEFAULT_INITIAL_ROWS_PER_CHUNK = ( + 100 # fallback for default_initial_rows_per_chunk setting +) # # cache and history files # -CACHE_FILE_NAME = 'chunk_cache.csv' -LOG_FILE_NAME = 'chunk_history.csv' +CACHE_FILE_NAME = "chunk_cache.csv" +LOG_FILE_NAME = "chunk_history.csv" OMNIBUS_LOG_FILE_NAME = f"omnibus_{LOG_FILE_NAME}" -C_CHUNK_TAG = 'tag' -C_DEPTH = 'depth' -C_NUM_ROWS = 'num_rows' -C_TIME = 'time' +C_CHUNK_TAG = "tag" +C_DEPTH = "depth" +C_NUM_ROWS = "num_rows" +C_TIME = "time" # columns to write to LOG_FILE -CUM_OVERHEAD_COLUMNS = [f'cum_overhead_{m}' for m in METRICS] -CHUNK_HISTORY_COLUMNS = [C_TIME, C_CHUNK_TAG] + CUM_OVERHEAD_COLUMNS + \ - [C_NUM_ROWS, 'row_size', 'chunk_size', C_DEPTH, 'process', 'chunk'] +CUM_OVERHEAD_COLUMNS = [f"cum_overhead_{m}" for m in METRICS] +CHUNK_HISTORY_COLUMNS = ( + [C_TIME, C_CHUNK_TAG] + + CUM_OVERHEAD_COLUMNS + + [C_NUM_ROWS, "row_size", "chunk_size", C_DEPTH, "process", "chunk"] +) CHUNK_CACHE_COLUMNS = [C_CHUNK_TAG, C_NUM_ROWS] + METRICS @@ -125,24 +127,32 @@ def chunk_method(): - method = SETTINGS.get('chunk_method') + method = SETTINGS.get("chunk_method") if method is None: - method = SETTINGS.setdefault('chunk_method', config.setting('chunk_method', DEFAULT_CHUNK_METHOD)) - assert method in CHUNK_METHODS, \ - f"chunk_method setting '{method}' not recognized. Should be one of: {CHUNK_METHODS}" + method = SETTINGS.setdefault( + "chunk_method", config.setting("chunk_method", DEFAULT_CHUNK_METHOD) + ) + assert ( + method in CHUNK_METHODS + ), f"chunk_method setting '{method}' not recognized. Should be one of: {CHUNK_METHODS}" return method def chunk_metric(): - return SETTINGS.setdefault('chunk_metric', USS if chunk_method() in USS_CHUNK_METHODS else 'rss') + return SETTINGS.setdefault( + "chunk_metric", USS if chunk_method() in USS_CHUNK_METHODS else "rss" + ) def chunk_training_mode(): - training_mode = \ - SETTINGS.setdefault('chunk_training_mode', config.setting('chunk_training_mode', MODE_ADAPTIVE)) + training_mode = SETTINGS.setdefault( + "chunk_training_mode", config.setting("chunk_training_mode", MODE_ADAPTIVE) + ) if not training_mode: training_mode = MODE_CHUNKLESS - assert training_mode in TRAINING_MODES, f"chunk_training_mode '{training_mode}' not one of: {TRAINING_MODES}" + assert ( + training_mode in TRAINING_MODES + ), f"chunk_training_mode '{training_mode}' not one of: {TRAINING_MODES}" return training_mode @@ -151,20 +161,27 @@ def chunk_logging(): def default_initial_rows_per_chunk(): - return SETTINGS.setdefault('default_initial_rows_per_chunk', - config.setting('default_initial_rows_per_chunk', DEFAULT_INITIAL_ROWS_PER_CHUNK)) + return SETTINGS.setdefault( + "default_initial_rows_per_chunk", + config.setting( + "default_initial_rows_per_chunk", DEFAULT_INITIAL_ROWS_PER_CHUNK + ), + ) def min_available_chunk_ratio(): - return SETTINGS.setdefault('min_available_chunk_ratio', - config.setting('min_available_chunk_ratio', 0)) + return SETTINGS.setdefault( + "min_available_chunk_ratio", config.setting("min_available_chunk_ratio", 0) + ) def keep_chunk_logs(): # if we are overwriting MEM_LOG_FILE then presumably we want to delete any subprocess files - default = (LOG_FILE_NAME == OMNIBUS_LOG_FILE_NAME) + default = LOG_FILE_NAME == OMNIBUS_LOG_FILE_NAME - return SETTINGS.setdefault('keep_chunk_logs', config.setting('keep_chunk_logs', default)) + return SETTINGS.setdefault( + "keep_chunk_logs", config.setting("keep_chunk_logs", default) + ) def trace_label_for_chunk(trace_label, chunk_size, i): @@ -230,32 +247,38 @@ def consolidate_logs(): if not glob_files: return - assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS), \ - f"shouldn't be any chunk log files when chunk_training_mode" \ + assert chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS), ( + f"shouldn't be any chunk log files when chunk_training_mode" f" is {MODE_PRODUCTION} or {MODE_CHUNKLESS}" + ) # # OMNIBUS_LOG_FILE # logger.debug(f"chunk.consolidate_logs reading glob {glob_file_name}") - omnibus_df = pd.concat((pd.read_csv(f, comment='#') for f in glob_files)) + omnibus_df = pd.concat((pd.read_csv(f, comment="#") for f in glob_files)) omnibus_df = omnibus_df.sort_values(by=C_TIME) # shouldn't have different depths for the same chunk_tag multi_depth_chunk_tag = omnibus_df[[C_CHUNK_TAG, C_DEPTH]] - multi_depth_chunk_tag = multi_depth_chunk_tag[~multi_depth_chunk_tag.duplicated()][[C_CHUNK_TAG]] - multi_depth_chunk_tag = multi_depth_chunk_tag[multi_depth_chunk_tag[C_CHUNK_TAG].duplicated()] - assert len(multi_depth_chunk_tag) == 0,\ - f"consolidate_logs multi_depth_chunk_tags \n{multi_depth_chunk_tag.values}" + multi_depth_chunk_tag = multi_depth_chunk_tag[~multi_depth_chunk_tag.duplicated()][ + [C_CHUNK_TAG] + ] + multi_depth_chunk_tag = multi_depth_chunk_tag[ + multi_depth_chunk_tag[C_CHUNK_TAG].duplicated() + ] + assert ( + len(multi_depth_chunk_tag) == 0 + ), f"consolidate_logs multi_depth_chunk_tags \n{multi_depth_chunk_tag.values}" if not keep_chunk_logs(): - util.delete_files(glob_files, 'chunk.consolidate_logs') + util.delete_files(glob_files, "chunk.consolidate_logs") log_output_path = config.log_file_path(OMNIBUS_LOG_FILE_NAME, prefix=False) logger.debug(f"chunk.consolidate_logs writing omnibus log to {log_output_path}") - omnibus_df.to_csv(log_output_path, mode='w', index=False) + omnibus_df.to_csv(log_output_path, mode="w", index=False) # # CACHE_FILE @@ -269,7 +292,9 @@ def consolidate_logs(): if zero_rows.any(): # this should only happen when chunk_log() instantiates the base ChunkSizer. # Since chunk_log is not chunked (chunk_size is always 0) there is no need for its history record in the cache - logger.debug(f"consolidate_logs dropping {zero_rows.sum()} rows where {C_NUM_ROWS} == 0") + logger.debug( + f"consolidate_logs dropping {zero_rows.sum()} rows where {C_NUM_ROWS} == 0" + ) omnibus_df = omnibus_df[omnibus_df[C_NUM_ROWS] > 0] omnibus_df = omnibus_df[[C_CHUNK_TAG, C_NUM_ROWS] + CUM_OVERHEAD_COLUMNS] @@ -283,29 +308,40 @@ def consolidate_logs(): # compute row_size num_rows = omnibus_df[C_NUM_ROWS] for m in USS_CHUNK_METHODS: - omnibus_df[f'{m}_row_size'] = np.ceil(overhead_for_chunk_method(omnibus_df, m) / num_rows).astype(int) + omnibus_df[f"{m}_row_size"] = np.ceil( + overhead_for_chunk_method(omnibus_df, m) / num_rows + ).astype(int) omnibus_df = omnibus_df.sort_values(by=C_CHUNK_TAG) log_dir_output_path = config.log_file_path(CACHE_FILE_NAME, prefix=False) - logger.debug(f"chunk.consolidate_logs writing omnibus chunk cache to {log_dir_output_path}") - omnibus_df.to_csv(log_dir_output_path, mode='w', index=False) + logger.debug( + f"chunk.consolidate_logs writing omnibus chunk cache to {log_dir_output_path}" + ) + omnibus_df.to_csv(log_dir_output_path, mode="w", index=False) if (chunk_training_mode() == MODE_RETRAIN) or not _HISTORIAN.have_cached_history: - if config.setting('resume_after'): + if config.setting("resume_after"): # FIXME - logger.warning(f"Not updating chunk_log cache directory because resume_after") + logger.warning( + f"Not updating chunk_log cache directory because resume_after" + ) else: - cache_dir_output_path = os.path.join(config.get_cache_dir(), CACHE_FILE_NAME) - logger.debug(f"chunk.consolidate_logs writing chunk cache to {cache_dir_output_path}") - omnibus_df.to_csv(cache_dir_output_path, mode='w', index=False) + cache_dir_output_path = os.path.join( + config.get_cache_dir(), CACHE_FILE_NAME + ) + logger.debug( + f"chunk.consolidate_logs writing chunk cache to {cache_dir_output_path}" + ) + omnibus_df.to_csv(cache_dir_output_path, mode="w", index=False) class ChunkHistorian(object): """ Utility for estimating row_size """ + def __init__(self): self.chunk_log_path = None @@ -324,16 +360,22 @@ def load_cached_history(self): chunk_cache_path = os.path.join(config.get_cache_dir(), CACHE_FILE_NAME) - logger.debug(f"ChunkHistorian load_cached_history chunk_cache_path {chunk_cache_path}") + logger.debug( + f"ChunkHistorian load_cached_history chunk_cache_path {chunk_cache_path}" + ) if os.path.exists(chunk_cache_path): - logger.debug(f"ChunkHistorian load_cached_history reading cached chunk history from {CACHE_FILE_NAME}") - df = pd.read_csv(chunk_cache_path, comment='#') + logger.debug( + f"ChunkHistorian load_cached_history reading cached chunk history from {CACHE_FILE_NAME}" + ) + df = pd.read_csv(chunk_cache_path, comment="#") self.cached_history_df = df for c in CHUNK_CACHE_COLUMNS: - assert c in df, f"Expected column '{c}' not in chunk_cache: {chunk_cache_path}" + assert ( + c in df + ), f"Expected column '{c}' not in chunk_cache: {chunk_cache_path}" self.have_cached_history = True else: @@ -345,9 +387,13 @@ def load_cached_history(self): if chunk_training_mode() == MODE_PRODUCTION: # raise RuntimeError(f"chunk_training_mode is {MODE_PRODUCTION} but no chunk_cache: {chunk_cache_path}") - SETTINGS['chunk_training_mode'] = MODE_RETRAIN - logger.warning(f"chunk_training_mode is {MODE_PRODUCTION} but no chunk_cache: {chunk_cache_path}") - logger.warning(f"chunk_training_mode falling back to {chunk_training_mode()}") + SETTINGS["chunk_training_mode"] = MODE_RETRAIN + logger.warning( + f"chunk_training_mode is {MODE_PRODUCTION} but no chunk_cache: {chunk_cache_path}" + ) + logger.warning( + f"chunk_training_mode falling back to {chunk_training_mode()}" + ) def cached_history_for_chunk_tag(self, chunk_tag): @@ -357,19 +403,25 @@ def cached_history_for_chunk_tag(self, chunk_tag): if self.have_cached_history: try: - df = self.cached_history_df[self.cached_history_df[C_CHUNK_TAG] == chunk_tag] + df = self.cached_history_df[ + self.cached_history_df[C_CHUNK_TAG] == chunk_tag + ] if len(df) > 0: if len(df) > 1: # don't expect this, but not fatal - logger.warning(f"ChunkHistorian aggregating {len(df)} multiple rows for {chunk_tag}") + logger.warning( + f"ChunkHistorian aggregating {len(df)} multiple rows for {chunk_tag}" + ) # history for this chunk_tag as dict column sums ('num_rows' and cum_overhead for each metric) # {'num_rows: , 'rss': , 'uss': , 'bytes': } history = df.sum().to_dict() except Exception as e: - logger.warning(f"ChunkHistorian Error loading cached history for {chunk_tag}") + logger.warning( + f"ChunkHistorian Error loading cached history for {chunk_tag}" + ) raise e return history @@ -398,15 +450,21 @@ def write_history(self, history, chunk_tag): history_df = history_df.tail(1) history_df[C_CHUNK_TAG] = chunk_tag - history_df['process'] = multiprocessing.current_process().name + history_df["process"] = multiprocessing.current_process().name history_df = history_df[CHUNK_HISTORY_COLUMNS] if self.chunk_log_path is None: self.chunk_log_path = config.log_file_path(LOG_FILE_NAME) - tracing.write_df_csv(history_df, self.chunk_log_path, index_label=None, - columns=None, column_labels=None, transpose=False) + tracing.write_df_csv( + history_df, + self.chunk_log_path, + index_label=None, + columns=None, + column_labels=None, + transpose=False, + ) _HISTORIAN = ChunkHistorian() @@ -415,6 +473,7 @@ def write_history(self, history, chunk_tag): class ChunkLedger(object): """ """ + def __init__(self, trace_label, chunk_size, baseline_rss, baseline_uss, headroom): self.trace_label = trace_label self.chunk_size = chunk_size @@ -422,9 +481,9 @@ def __init__(self, trace_label, chunk_size, baseline_rss, baseline_uss, headroom self.base_chunk_size = get_base_chunk_size() self.tables = {} - self.hwm_bytes = {'value': 0, 'info': f'{trace_label}.init'} - self.hwm_rss = {'value': baseline_rss, 'info': f'{trace_label}.init'} - self.hwm_uss = {'value': baseline_uss, 'info': f'{trace_label}.init'} + self.hwm_bytes = {"value": 0, "info": f"{trace_label}.init"} + self.hwm_rss = {"value": baseline_rss, "info": f"{trace_label}.init"} + self.hwm_uss = {"value": baseline_uss, "info": f"{trace_label}.init"} self.total_bytes = 0 def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): @@ -440,29 +499,40 @@ def audit(self, msg, bytes=0, rss=0, uss=0, from_rss_monitor=False): bytes_panic_threshold = self.headroom + (self.base_chunk_size * MAX_OVERDRAFT) if bytes > bytes_panic_threshold: - logger.warning(f"out_of_chunk_memory: " - f"bytes: {bytes} headroom: {self.headroom} chunk_size: {self.base_chunk_size} {msg}") + logger.warning( + f"out_of_chunk_memory: " + f"bytes: {bytes} headroom: {self.headroom} chunk_size: {self.base_chunk_size} {msg}" + ) if chunk_metric() == RSS and rss > mem_panic_threshold: rss, _ = mem.get_rss(force_garbage_collect=True, uss=False) if rss > mem_panic_threshold: - logger.warning(f"out_of_chunk_memory: " - f"rss: {rss} chunk_size: {self.base_chunk_size} {msg}") + logger.warning( + f"out_of_chunk_memory: " + f"rss: {rss} chunk_size: {self.base_chunk_size} {msg}" + ) if chunk_metric() == USS and uss > mem_panic_threshold: _, uss = mem.get_rss(force_garbage_collect=True, uss=True) if uss > mem_panic_threshold: - logger.warning(f"out_of_chunk_memory: " - f"uss: {uss} chunk_size: {self.base_chunk_size} {msg}") + logger.warning( + f"out_of_chunk_memory: " + f"uss: {uss} chunk_size: {self.base_chunk_size} {msg}" + ) def close(self): logger.debug(f"ChunkLedger.close trace_label: {self.trace_label}") - logger.debug(f"ChunkLedger.close hwm_bytes: {self.hwm_bytes.get('value', 0)} {self.hwm_bytes['info']}") - logger.debug(f"ChunkLedger.close hwm_rss {self.hwm_rss['value']} {self.hwm_rss['info']}") - logger.debug(f"ChunkLedger.close hwm_uss {self.hwm_uss['value']} {self.hwm_uss['info']}") + logger.debug( + f"ChunkLedger.close hwm_bytes: {self.hwm_bytes.get('value', 0)} {self.hwm_bytes['info']}" + ) + logger.debug( + f"ChunkLedger.close hwm_rss {self.hwm_rss['value']} {self.hwm_rss['info']}" + ) + logger.debug( + f"ChunkLedger.close hwm_uss {self.hwm_uss['value']} {self.hwm_uss['info']}" + ) def log_df(self, table_name, df): - def size_it(df): if isinstance(df, pd.Series): elements = util.iprod(df.shape) @@ -515,7 +585,9 @@ def size_it(df): else: shape = df.shape - logger.debug(f"log_df delta_bytes: {util.INT(delta_bytes).rjust(12)} {table_name} {shape} {self.trace_label}") + logger.debug( + f"log_df delta_bytes: {util.INT(delta_bytes).rjust(12)} {table_name} {shape} {self.trace_label}" + ) # update current total_bytes count self.total_bytes = sum(self.tables.values()) @@ -526,30 +598,32 @@ def check_local_hwm(self, hwm_trace_label, rss, uss, total_bytes): from_rss_monitor = total_bytes is None - info = f"rss: {GB(rss)} " \ - f"uss: {GB(uss)} " \ - f"base_chunk_size: {GB(self.base_chunk_size)} " \ - f"op: {hwm_trace_label}" + info = ( + f"rss: {GB(rss)} " + f"uss: {GB(uss)} " + f"base_chunk_size: {GB(self.base_chunk_size)} " + f"op: {hwm_trace_label}" + ) if total_bytes: info = f"bytes: {GB(total_bytes)} " + info - if total_bytes > self.hwm_bytes['value']: + if total_bytes > self.hwm_bytes["value"]: # total_bytes high water mark - self.hwm_bytes['value'] = total_bytes - self.hwm_bytes['info'] = info + self.hwm_bytes["value"] = total_bytes + self.hwm_bytes["info"] = info self.audit(hwm_trace_label, bytes=total_bytes) - if rss > self.hwm_rss['value']: + if rss > self.hwm_rss["value"]: # rss high water mark - self.hwm_rss['value'] = rss - self.hwm_rss['info'] = info + self.hwm_rss["value"] = rss + self.hwm_rss["info"] = info self.audit(hwm_trace_label, rss=rss, from_rss_monitor=from_rss_monitor) - if uss > self.hwm_uss['value']: + if uss > self.hwm_uss["value"]: # uss high water mark - self.hwm_uss['value'] = uss - self.hwm_uss['info'] = info + self.hwm_uss["value"] = uss + self.hwm_uss["info"] = info self.audit(hwm_trace_label, uss=uss, from_rss_monitor=from_rss_monitor) # silently registers global high water mark @@ -560,16 +634,16 @@ def check_local_hwm(self, hwm_trace_label, rss, uss, total_bytes): def get_hwm_rss(self): with ledger_lock: - net_rss = self.hwm_rss['value'] + net_rss = self.hwm_rss["value"] return net_rss def get_hwm_uss(self): with ledger_lock: - net_uss = self.hwm_uss['value'] + net_uss = self.hwm_uss["value"] return net_uss def get_hwm_bytes(self): - return self.hwm_bytes['value'] + return self.hwm_bytes["value"] def log_rss(trace_label, force=False): @@ -603,7 +677,7 @@ def log_df(trace_label, table_name, df): assert len(CHUNK_LEDGERS) > 0, f"log_df called without current chunker." - op = 'del' if df is None else 'add' + op = "del" if df is None else "add" hwm_trace_label = f"{trace_label}.{op}.{table_name}" rss, uss = mem.trace_memory_info(hwm_trace_label) @@ -622,7 +696,6 @@ def log_df(trace_label, table_name, df): class MemMonitor(threading.Thread): - def __init__(self, trace_label, stop_snooping): self.trace_label = trace_label self.stop_snooping = stop_snooping @@ -637,6 +710,7 @@ def run(self): class ChunkSizer(object): """ """ + def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): self.depth = len(CHUNK_SIZERS) + 1 @@ -658,7 +732,9 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): parent = CHUNK_SIZERS[-1] assert parent.chunk_ledger is not None - log_rss(trace_label) # give parent a complementary log_rss reading entering sub context + log_rss( + trace_label + ) # give parent a complementary log_rss reading entering sub context else: self.rss, self.uss = 0, 0 @@ -670,8 +746,9 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): self.rows_processed = 0 min_chunk_ratio = min_available_chunk_ratio() - assert 0 <= min_chunk_ratio <= 1, \ - f"min_chunk_ratio setting {min_chunk_ratio} is not in range [0..1]" + assert ( + 0 <= min_chunk_ratio <= 1 + ), f"min_chunk_ratio setting {min_chunk_ratio} is not in range [0..1]" self.min_chunk_size = chunk_size * min_chunk_ratio self.initial_row_size = 0 @@ -689,9 +766,11 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): self.cum_overhead = {m: cached_history[m] for m in METRICS} self.cum_rows = cached_history[C_NUM_ROWS] - logger.debug(f"{self.trace_label}.ChunkSizer - cached history " - f"cum_rows: {self.cum_rows} " - f"cum_overhead: {self.cum_overhead} ") + logger.debug( + f"{self.trace_label}.ChunkSizer - cached history " + f"cum_rows: {self.cum_rows} " + f"cum_overhead: {self.cum_overhead} " + ) # add self to CHUNK_SIZERS list before setting base_chunk_size (since we might be base chunker) CHUNK_SIZERS.append(self) @@ -699,12 +778,15 @@ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): self.base_chunk_size = CHUNK_SIZERS[0].chunk_size # need base_chunk_size to calc headroom - self.headroom = self.available_headroom(self.uss if chunk_metric() == USS else self.rss) + self.headroom = self.available_headroom( + self.uss if chunk_metric() == USS else self.rss + ) def close(self): - if ((self.depth == 1) or WRITE_SUBCHUNK_HISTORY) and \ - (chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS)): + if ((self.depth == 1) or WRITE_SUBCHUNK_HISTORY) and ( + chunk_training_mode() not in (MODE_PRODUCTION, MODE_CHUNKLESS) + ): _HISTORIAN.write_history(self.history, self.chunk_tag) _chunk_sizer = CHUNK_SIZERS.pop() @@ -718,10 +800,12 @@ def available_headroom(self, xss): if headroom < self.min_chunk_size: if self.base_chunk_size > 0: - logger.warning(f"Not enough memory for minimum chunk_size without exceeding specified chunk_size. " - f"available_headroom: {util.INT(headroom)} " - f"min_chunk_size: {util.INT(self.min_chunk_size)} " - f"base_chunk_size: {util.INT(self.base_chunk_size)}") + logger.warning( + f"Not enough memory for minimum chunk_size without exceeding specified chunk_size. " + f"available_headroom: {util.INT(headroom)} " + f"min_chunk_size: {util.INT(self.min_chunk_size)} " + f"base_chunk_size: {util.INT(self.base_chunk_size)}" + ) headroom = self.min_chunk_size @@ -743,15 +827,23 @@ def initial_rows_per_chunk(self): assert len(CHUNK_LEDGERS) == 0, f"len(CHUNK_LEDGERS): {len(CHUNK_LEDGERS)}" if self.initial_row_size > 0: - max_rows_per_chunk = np.maximum(int(self.headroom / self.initial_row_size), 1) + max_rows_per_chunk = np.maximum( + int(self.headroom / self.initial_row_size), 1 + ) rows_per_chunk = np.clip(max_rows_per_chunk, 1, self.num_choosers) - estimated_number_of_chunks = math.ceil(self.num_choosers / rows_per_chunk) + estimated_number_of_chunks = math.ceil( + self.num_choosers / rows_per_chunk + ) - logger.debug(f"{self.trace_label}.initial_rows_per_chunk - initial_row_size: {self.initial_row_size}") + logger.debug( + f"{self.trace_label}.initial_rows_per_chunk - initial_row_size: {self.initial_row_size}" + ) else: # if no initial_row_size from cache, fall back to default_initial_rows_per_chunk self.initial_row_size = 0 - rows_per_chunk = min(self.num_choosers, default_initial_rows_per_chunk()) + rows_per_chunk = min( + self.num_choosers, default_initial_rows_per_chunk() + ) estimated_number_of_chunks = None assert chunk_training_mode() != MODE_PRODUCTION @@ -762,10 +854,12 @@ def initial_rows_per_chunk(self): self.rows_processed += rows_per_chunk self.cum_rows += rows_per_chunk - logger.debug(f"{self.trace_label}.initial_rows_per_chunk - " - f"rows_per_chunk: {self.rows_per_chunk} " - f"headroom: {self.headroom} " - f"initial_row_size: {self.initial_row_size} ") + logger.debug( + f"{self.trace_label}.initial_rows_per_chunk - " + f"rows_per_chunk: {self.rows_per_chunk} " + f"headroom: {self.headroom} " + f"initial_row_size: {self.initial_row_size} " + ) return rows_per_chunk, estimated_number_of_chunks @@ -789,7 +883,9 @@ def adaptive_rows_per_chunk(self, i): self.rss, _ = mem.get_rss(force_garbage_collect=True, uss=False) self.uss = 0 - self.headroom = self.available_headroom(self.uss if chunk_metric() == USS else self.rss) + self.headroom = self.available_headroom( + self.uss if chunk_metric() == USS else self.rss + ) rows_remaining = self.num_choosers - prev_rows_processed @@ -810,8 +906,9 @@ def adaptive_rows_per_chunk(self, i): for m in METRICS: self.cum_overhead[m] += overhead[m] - observed_row_size = \ - prev_cum_rows and math.ceil(overhead_for_chunk_method(self.cum_overhead) / prev_cum_rows) + observed_row_size = prev_cum_rows and math.ceil( + overhead_for_chunk_method(self.cum_overhead) / prev_cum_rows + ) # rows_per_chunk is closest number of chooser rows to achieve chunk_size without exceeding it if observed_row_size > 0: @@ -822,39 +919,49 @@ def adaptive_rows_per_chunk(self, i): self.rows_per_chunk = np.clip(self.rows_per_chunk, 1, rows_remaining) self.rows_processed += self.rows_per_chunk - estimated_number_of_chunks = i + math.ceil(rows_remaining / self.rows_per_chunk) if rows_remaining else i + estimated_number_of_chunks = ( + i + math.ceil(rows_remaining / self.rows_per_chunk) if rows_remaining else i + ) - self.history.setdefault(C_TIME, []).append(datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S.%f")) + self.history.setdefault(C_TIME, []).append( + datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S.%f") + ) self.history.setdefault(C_DEPTH, []).append(self.depth) for m in METRICS: - self.history.setdefault(f'cum_overhead_{m}', []).append(self.cum_overhead[m]) + self.history.setdefault(f"cum_overhead_{m}", []).append( + self.cum_overhead[m] + ) self.history.setdefault(C_NUM_ROWS, []).append(prev_cum_rows) - self.history.setdefault('chunk', []).append(i) - self.history.setdefault('chunk_size', []).append(self.chunk_size) - self.history.setdefault('row_size', []).append(observed_row_size) + self.history.setdefault("chunk", []).append(i) + self.history.setdefault("chunk_size", []).append(self.chunk_size) + self.history.setdefault("row_size", []).append(observed_row_size) # diagnostics not reported by ChunkHistorian if chunk_metric() == USS: - self.history.setdefault('prev_uss', []).append(prev_uss) - self.history.setdefault('cur_uss', []).append(self.uss) + self.history.setdefault("prev_uss", []).append(prev_uss) + self.history.setdefault("cur_uss", []).append(self.uss) else: - self.history.setdefault('prev_rss', []).append(prev_rss) - self.history.setdefault('cur_rss', []).append(self.rss) + self.history.setdefault("prev_rss", []).append(prev_rss) + self.history.setdefault("cur_rss", []).append(self.rss) - self.history.setdefault('prev_headroom', []).append(prev_headroom) - self.history.setdefault('cur_headroom', []).append(self.headroom) + self.history.setdefault("prev_headroom", []).append(prev_headroom) + self.history.setdefault("cur_headroom", []).append(self.headroom) for m in METRICS: - self.history.setdefault(f'overhead_{m}', []).append(overhead[m]) + self.history.setdefault(f"overhead_{m}", []).append(overhead[m]) - self.history.setdefault('new_rows_processed', []).append(self.rows_processed) - self.history.setdefault('new_rows_per_chunk', []).append(self.rows_per_chunk) - self.history.setdefault('estimated_num_chunks', []).append(estimated_number_of_chunks) + self.history.setdefault("new_rows_processed", []).append(self.rows_processed) + self.history.setdefault("new_rows_per_chunk", []).append(self.rows_per_chunk) + self.history.setdefault("estimated_num_chunks", []).append( + estimated_number_of_chunks + ) history_df = pd.DataFrame.from_dict(self.history) if LOG_SUBCHUNK_HISTORY: - logger.debug(f"ChunkSizer.adaptive_rows_per_chunk {self.chunk_tag}\n{history_df.transpose()}") + logger.debug( + f"ChunkSizer.adaptive_rows_per_chunk {self.chunk_tag}\n{history_df.transpose()}" + ) # input() @@ -878,7 +985,9 @@ def ledger(self): assert self.chunk_size == 0 with ledger_lock: - self.chunk_ledger = ChunkLedger(self.trace_label, self.chunk_size, self.rss, self.uss, self.headroom) + self.chunk_ledger = ChunkLedger( + self.trace_label, self.chunk_size, self.rss, self.uss, self.headroom + ) CHUNK_LEDGERS.append(self.chunk_ledger) # reality check - there should be one ledger per sizer @@ -894,9 +1003,13 @@ def ledger(self): mem_monitor = MemMonitor(self.trace_label, stop_snooping) mem_monitor.start() - log_rss(self.trace_label, force=True) # make sure we get at least one reading + log_rss( + self.trace_label, force=True + ) # make sure we get at least one reading yield - log_rss(self.trace_label, force=True) # make sure we get at least one reading + log_rss( + self.trace_label, force=True + ) # make sure we get at least one reading finally: @@ -908,7 +1021,9 @@ def ledger(self): stop_snooping.set() while mem_monitor.is_alive(): - logger.debug(f"{self.trace_label} waiting for mem_monitor thread to terminate") + logger.debug( + f"{self.trace_label} waiting for mem_monitor thread to terminate" + ) mem_monitor.join(timeout=MEM_MONITOR_TICK) with ledger_lock: @@ -960,7 +1075,9 @@ def adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag=None) assert num_choosers > 0 assert chunk_size >= 0 - logger.info(f"{trace_label} Running adaptive_chunked_choosers with {num_choosers} choosers") + logger.info( + f"{trace_label} Running adaptive_chunked_choosers with {num_choosers} choosers" + ) chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) @@ -977,22 +1094,29 @@ def adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag=None) with chunk_sizer.ledger(): # grab the next chunk based on current rows_per_chunk - chooser_chunk = choosers[offset: offset + rows_per_chunk] + chooser_chunk = choosers[offset : offset + rows_per_chunk] - logger.info(f"Running chunk {i} of {estimated_number_of_chunks or '?'} " - f"with {len(chooser_chunk)} of {num_choosers} choosers") + logger.info( + f"Running chunk {i} of {estimated_number_of_chunks or '?'} " + f"with {len(chooser_chunk)} of {num_choosers} choosers" + ) yield i, chooser_chunk, chunk_trace_label offset += rows_per_chunk if chunk_training_mode() != MODE_CHUNKLESS: - rows_per_chunk, estimated_number_of_chunks = chunk_sizer.adaptive_rows_per_chunk(i) + ( + rows_per_chunk, + estimated_number_of_chunks, + ) = chunk_sizer.adaptive_rows_per_chunk(i) chunk_sizer.close() -def adaptive_chunked_choosers_and_alts(choosers, alternatives, chunk_size, trace_label, chunk_tag=None): +def adaptive_chunked_choosers_and_alts( + choosers, alternatives, chunk_size, trace_label, chunk_tag=None +): """ generator to iterate over choosers and alternatives in chunk_size chunks @@ -1033,16 +1157,23 @@ def adaptive_chunked_choosers_and_alts(choosers, alternatives, chunk_size, trace assert num_choosers > 0 # alternatives index should match choosers (except with duplicate repeating alt rows) - assert choosers.index.equals(alternatives.index[~alternatives.index.duplicated(keep='first')]) + assert choosers.index.equals( + alternatives.index[~alternatives.index.duplicated(keep="first")] + ) last_repeat = alternatives.index != np.roll(alternatives.index, -1) assert (num_choosers == 1) or choosers.index.equals(alternatives.index[last_repeat]) - assert 'pick_count' in alternatives.columns or choosers.index.name == alternatives.index.name + assert ( + "pick_count" in alternatives.columns + or choosers.index.name == alternatives.index.name + ) assert choosers.index.name == alternatives.index.name - logger.info(f"{trace_label} Running adaptive_chunked_choosers_and_alts " - f"with {num_choosers} choosers and {num_alternatives} alternatives") + logger.info( + f"{trace_label} Running adaptive_chunked_choosers_and_alts " + f"with {num_choosers} choosers and {num_alternatives} alternatives" + ) chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) rows_per_chunk, estimated_number_of_chunks = chunk_sizer.initial_rows_per_chunk() @@ -1051,30 +1182,41 @@ def adaptive_chunked_choosers_and_alts(choosers, alternatives, chunk_size, trace # alt chunks boundaries are where index changes alt_ids = alternatives.index.values alt_chunk_ends = np.where(alt_ids[:-1] != alt_ids[1:])[0] + 1 - alt_chunk_ends = np.append([0], alt_chunk_ends) # including the first to simplify indexing - alt_chunk_ends = np.append(alt_chunk_ends, [len(alternatives.index)]) # end of final chunk + alt_chunk_ends = np.append( + [0], alt_chunk_ends + ) # including the first to simplify indexing + alt_chunk_ends = np.append( + alt_chunk_ends, [len(alternatives.index)] + ) # end of final chunk i = offset = alt_offset = 0 while offset < num_choosers: i += 1 - assert offset + rows_per_chunk <= num_choosers, \ - f"i {i} offset {offset} rows_per_chunk {rows_per_chunk} num_choosers {num_choosers}" + assert ( + offset + rows_per_chunk <= num_choosers + ), f"i {i} offset {offset} rows_per_chunk {rows_per_chunk} num_choosers {num_choosers}" chunk_trace_label = trace_label_for_chunk(trace_label, chunk_size, i) with chunk_sizer.ledger(): - chooser_chunk = choosers[offset: offset + rows_per_chunk] + chooser_chunk = choosers[offset : offset + rows_per_chunk] alt_end = alt_chunk_ends[offset + rows_per_chunk] - alternative_chunk = alternatives[alt_offset: alt_end] + alternative_chunk = alternatives[alt_offset:alt_end] - assert len(chooser_chunk.index) == len(np.unique(alternative_chunk.index.values)) - assert (chooser_chunk.index == np.unique(alternative_chunk.index.values)).all() + assert len(chooser_chunk.index) == len( + np.unique(alternative_chunk.index.values) + ) + assert ( + chooser_chunk.index == np.unique(alternative_chunk.index.values) + ).all() - logger.info(f"Running chunk {i} of {estimated_number_of_chunks or '?'} " - f"with {len(chooser_chunk)} of {num_choosers} choosers") + logger.info( + f"Running chunk {i} of {estimated_number_of_chunks or '?'} " + f"with {len(chooser_chunk)} of {num_choosers} choosers" + ) yield i, chooser_chunk, alternative_chunk, chunk_trace_label @@ -1082,12 +1224,17 @@ def adaptive_chunked_choosers_and_alts(choosers, alternatives, chunk_size, trace alt_offset = alt_end if chunk_training_mode() != MODE_CHUNKLESS: - rows_per_chunk, estimated_number_of_chunks = chunk_sizer.adaptive_rows_per_chunk(i) + ( + rows_per_chunk, + estimated_number_of_chunks, + ) = chunk_sizer.adaptive_rows_per_chunk(i) chunk_sizer.close() -def adaptive_chunked_choosers_by_chunk_id(choosers, chunk_size, trace_label, chunk_tag=None): +def adaptive_chunked_choosers_by_chunk_id( + choosers, chunk_size, trace_label, chunk_tag=None +): # generator to iterate over choosers in chunk_size chunks # like chunked_choosers but based on chunk_id field rather than dataframe length # (the presumption is that choosers has multiple rows with the same chunk_id that @@ -1096,7 +1243,7 @@ def adaptive_chunked_choosers_by_chunk_id(choosers, chunk_size, trace_label, chu chunk_tag = chunk_tag or trace_label - num_choosers = choosers['chunk_id'].max() + 1 + num_choosers = choosers["chunk_id"].max() + 1 assert num_choosers > 0 chunk_sizer = ChunkSizer(chunk_tag, trace_label, num_choosers, chunk_size) @@ -1113,16 +1260,23 @@ def adaptive_chunked_choosers_by_chunk_id(choosers, chunk_size, trace_label, chu with chunk_sizer.ledger(): - chooser_chunk = choosers[choosers['chunk_id'].between(offset, offset + rows_per_chunk - 1)] + chooser_chunk = choosers[ + choosers["chunk_id"].between(offset, offset + rows_per_chunk - 1) + ] - logger.info(f"{trace_label} Running chunk {i} of {estimated_number_of_chunks or '?'} " - f"with {rows_per_chunk} of {num_choosers} choosers") + logger.info( + f"{trace_label} Running chunk {i} of {estimated_number_of_chunks or '?'} " + f"with {rows_per_chunk} of {num_choosers} choosers" + ) yield i, chooser_chunk, chunk_trace_label offset += rows_per_chunk if chunk_training_mode() != MODE_CHUNKLESS: - rows_per_chunk, estimated_number_of_chunks = chunk_sizer.adaptive_rows_per_chunk(i) + ( + rows_per_chunk, + estimated_number_of_chunks, + ) = chunk_sizer.adaptive_rows_per_chunk(i) chunk_sizer.close() diff --git a/activitysim/core/config.py b/activitysim/core/config.py index b85a02e81a..617c63d2d6 100644 --- a/activitysim/core/config.py +++ b/activitysim/core/config.py @@ -1,13 +1,14 @@ # ActivitySim # See full license in LICENSE.txt. import argparse -import os import glob -import yaml +import logging +import os import sys import warnings -import logging +import yaml + from activitysim.core import inject logger = logging.getLogger(__name__) @@ -26,35 +27,37 @@ def locutor(): @inject.injectable(cache=True) def configs_dir(): - if not os.path.exists('configs'): + if not os.path.exists("configs"): raise RuntimeError("'configs' directory does not exist") - return 'configs' + return "configs" @inject.injectable(cache=True) def data_dir(): - if not os.path.exists('data'): + if not os.path.exists("data"): raise RuntimeError("'data' directory does not exist") - return 'data' + return "data" @inject.injectable(cache=True) def output_dir(): - if not os.path.exists('output'): - print(f"'output' directory does not exist - current working directory: {os.getcwd()}") + if not os.path.exists("output"): + print( + f"'output' directory does not exist - current working directory: {os.getcwd()}" + ) raise RuntimeError("'output' directory does not exist") - return 'output' + return "output" @inject.injectable() def output_file_prefix(): - return '' + return "" @inject.injectable(cache=True) def pipeline_file_name(settings): - pipeline_file_name = settings.get('pipeline_file_name', 'pipeline.h5') + pipeline_file_name = settings.get("pipeline_file_name", "pipeline.h5") return pipeline_file_name @@ -66,7 +69,7 @@ def rng_base_seed(): @inject.injectable(cache=True) def settings_file_name(): - return 'settings.yaml' + return "settings.yaml" @inject.injectable(cache=True) @@ -94,9 +97,11 @@ def get_cache_dir(): ------- str path """ - cache_dir = setting('cache_dir', default=None) + cache_dir = setting("cache_dir", default=None) if cache_dir is None: - cache_dir = setting('cache_dir', os.path.join(inject.get_injectable('output_dir'), 'cache')) + cache_dir = setting( + "cache_dir", os.path.join(inject.get_injectable("output_dir"), "cache") + ) if not os.path.isdir(cache_dir): os.mkdir(cache_dir) @@ -106,13 +111,13 @@ def get_cache_dir(): def setting(key, default=None): - return inject.get_injectable('settings').get(key, default) + return inject.get_injectable("settings").get(key, default) def override_setting(key, value): - new_settings = inject.get_injectable('settings') + new_settings = inject.get_injectable("settings") new_settings[key] = value - inject.add_injectable('settings', new_settings) + inject.add_injectable("settings", new_settings) def get_global_constants(): @@ -124,7 +129,7 @@ def get_global_constants(): constants : dict dictionary of constants to add to locals for use by expressions in model spec """ - return read_settings_file('constants.yaml', mandatory=False) + return read_settings_file("constants.yaml", mandatory=False) def read_model_settings(file_name, mandatory=False): @@ -168,9 +173,12 @@ def future_model_settings(model_name, model_settings, future_settings): model_settings = model_settings.copy() for key, setting in future_settings.items(): if key not in model_settings.keys(): - warnings.warn(f"Setting '{key}' not found in {model_name} model settings." - f"Replacing with default value: {setting}." - f"This setting will be required in future versions", FutureWarning) + warnings.warn( + f"Setting '{key}' not found in {model_name} model settings." + f"Replacing with default value: {setting}." + f"This setting will be required in future versions", + FutureWarning, + ) model_settings[key] = setting return model_settings @@ -185,7 +193,7 @@ def get_model_constants(model_settings): constants : dict dictionary of constants to add to locals for use by expressions in model spec """ - return model_settings.get('CONSTANTS', {}) + return model_settings.get("CONSTANTS", {}) def get_logit_model_settings(model_settings): @@ -205,14 +213,14 @@ def get_logit_model_settings(model_settings): if model_settings is not None: # default to MNL - logit_type = model_settings.get('LOGIT_TYPE', 'MNL') + logit_type = model_settings.get("LOGIT_TYPE", "MNL") - if logit_type not in ['NL', 'MNL']: + if logit_type not in ["NL", "MNL"]: logger.error("Unrecognized logit type '%s'" % logit_type) raise RuntimeError("Unrecognized logit type '%s'" % logit_type) - if logit_type == 'NL': - nests = model_settings.get('NESTS', None) + if logit_type == "NL": + nests = model_settings.get("NESTS", None) if nests is None: logger.error("No NEST found in model spec for NL model type") raise RuntimeError("No NEST found in model spec for NL model type") @@ -221,7 +229,7 @@ def get_logit_model_settings(model_settings): def build_output_file_path(file_name, use_prefix=None): - output_dir = inject.get_injectable('output_dir') + output_dir = inject.get_injectable("output_dir") if use_prefix: file_name = "%s-%s" % (use_prefix, file_name) @@ -231,7 +239,9 @@ def build_output_file_path(file_name, use_prefix=None): return file_path -def cascading_input_file_path(file_name, dir_list_injectable_name, mandatory=True, allow_glob=False): +def cascading_input_file_path( + file_name, dir_list_injectable_name, mandatory=True, allow_glob=False +): dir_paths = inject.get_injectable(dir_list_injectable_name) dir_paths = [dir_paths] if isinstance(dir_paths, str) else dir_paths @@ -248,15 +258,19 @@ def cascading_input_file_path(file_name, dir_list_injectable_name, mandatory=Tru break if mandatory and not file_path: - raise RuntimeError("file_path %s: file '%s' not in %s" % - (dir_list_injectable_name, file_name, dir_paths)) + raise RuntimeError( + "file_path %s: file '%s' not in %s" + % (dir_list_injectable_name, file_name, dir_paths) + ) return file_path def data_file_path(file_name, mandatory=True, allow_glob=False): - return cascading_input_file_path(file_name, 'data_dir', mandatory=mandatory, allow_glob=allow_glob) + return cascading_input_file_path( + file_name, "data_dir", mandatory=mandatory, allow_glob=allow_glob + ) def expand_input_file_list(input_files): @@ -280,8 +294,11 @@ def expand_input_file_list(input_files): continue if os.path.isdir(file_name): - logger.warning("WARNING: expand_input_file_list skipping directory: " - "(use glob instead): %s", file_name) + logger.warning( + "WARNING: expand_input_file_list skipping directory: " + "(use glob instead): %s", + file_name, + ) ungroked_files += 1 continue @@ -292,12 +309,16 @@ def expand_input_file_list(input_files): if os.path.isfile(globbed_file): expanded_files.append(globbed_file) else: - logger.warning("WARNING: expand_input_file_list skipping: " - "(does not grok) %s", file_name) + logger.warning( + "WARNING: expand_input_file_list skipping: " "(does not grok) %s", + file_name, + ) ungroked_files += 1 if len(globbed_files) == 0: - logger.warning("WARNING: expand_input_file_list file/glob not found: %s", file_name) + logger.warning( + "WARNING: expand_input_file_list file/glob not found: %s", file_name + ) assert ungroked_files == 0, f"{ungroked_files} ungroked file names" @@ -306,22 +327,22 @@ def expand_input_file_list(input_files): def config_file_path(file_name, mandatory=True): - return cascading_input_file_path(file_name, 'configs_dir', mandatory) + return cascading_input_file_path(file_name, "configs_dir", mandatory) def output_file_path(file_name): - prefix = inject.get_injectable('output_file_prefix', None) + prefix = inject.get_injectable("output_file_prefix", None) return build_output_file_path(file_name, use_prefix=prefix) def trace_file_path(file_name): - output_dir = inject.get_injectable('output_dir') + output_dir = inject.get_injectable("output_dir") # - check for optional trace subfolder - if os.path.exists(os.path.join(output_dir, 'trace')): - output_dir = os.path.join(output_dir, 'trace') + if os.path.exists(os.path.join(output_dir, "trace")): + output_dir = os.path.join(output_dir, "trace") else: file_name = "trace.%s" % (file_name,) @@ -331,14 +352,14 @@ def trace_file_path(file_name): def log_file_path(file_name, prefix=True): - output_dir = inject.get_injectable('output_dir') + output_dir = inject.get_injectable("output_dir") # - check for optional log subfolder - if os.path.exists(os.path.join(output_dir, 'log')): - output_dir = os.path.join(output_dir, 'log') + if os.path.exists(os.path.join(output_dir, "log")): + output_dir = os.path.join(output_dir, "log") # - check for optional process name prefix - prefix = prefix and inject.get_injectable('log_file_prefix', None) + prefix = prefix and inject.get_injectable("log_file_prefix", None) if prefix: file_name = "%s-%s" % (prefix, file_name) @@ -356,7 +377,10 @@ def open_log_file(file_name, mode, header=None, prefix=False): f = open(file_path, mode) if want_header: - assert mode in ['a', 'w'], f"open_log_file: header requested but mode was {mode}" + assert mode in [ + "a", + "w", + ], f"open_log_file: header requested but mode was {mode}" print(header, file=f) return f @@ -364,7 +388,7 @@ def open_log_file(file_name, mode, header=None, prefix=False): def pipeline_file_path(file_name): - prefix = inject.get_injectable('pipeline_file_prefix', None) + prefix = inject.get_injectable("pipeline_file_prefix", None) return build_output_file_path(file_name, use_prefix=prefix) @@ -377,7 +401,9 @@ def __str__(self): return repr(f"Settings file '{self.file_name}' not found in {self.configs_dir}") -def read_settings_file(file_name, mandatory=True, include_stack=[], configs_dir_list=None): +def read_settings_file( + file_name, mandatory=True, include_stack=[], configs_dir_list=None +): """ look for first occurence of yaml file named in directories in configs_dir list, @@ -411,14 +437,19 @@ def backfill_settings(settings, backfill): return new_settings if configs_dir_list is None: - configs_dir_list = inject.get_injectable('configs_dir') - configs_dir_list = [configs_dir_list] if isinstance(configs_dir_list, str) else configs_dir_list + configs_dir_list = inject.get_injectable("configs_dir") + configs_dir_list = ( + [configs_dir_list] + if isinstance(configs_dir_list, str) + else configs_dir_list + ) assert isinstance(configs_dir_list, list) - assert len(configs_dir_list) == len(set(configs_dir_list)), \ - f"repeating file names not allowed in config_dir list: {configs_dir_list}" + assert len(configs_dir_list) == len( + set(configs_dir_list) + ), f"repeating file names not allowed in config_dir list: {configs_dir_list}" - if not file_name.lower().endswith('.yaml'): - file_name = '%s.yaml' % (file_name,) + if not file_name.lower().endswith(".yaml"): + file_name = "%s.yaml" % (file_name,) inheriting = False settings = {} @@ -428,11 +459,15 @@ def backfill_settings(settings, backfill): if os.path.exists(file_path): if inheriting: # we must be inheriting - logger.debug("inheriting additional settings for %s from %s" % (file_name, file_path)) + logger.debug( + "inheriting additional settings for %s from %s" + % (file_name, file_path) + ) inheriting = True - assert file_path not in source_file_paths, \ - f"read_settings_file - recursion in reading 'file_path' after loading: {source_file_paths}" + assert ( + file_path not in source_file_paths + ), f"read_settings_file - recursion in reading 'file_path' after loading: {source_file_paths}" with open(file_path) as f: @@ -445,23 +480,34 @@ def backfill_settings(settings, backfill): # maintain a list of files we read from to improve error message when an expected setting is not found source_file_paths += [file_path] - include_file_name = s.get('include_settings', False) + include_file_name = s.get("include_settings", False) if include_file_name: # FIXME - prevent users from creating borgesian garden of branching paths? # There is a lot of opportunity for confusion if this feature were over-used # Maybe we insist that a file with an include directive is the 'end of the road' # essentially the current settings firle is an alias for the included file if len(s) > 1: - logger.error(f"'include_settings' must appear alone in settings file.") - additional_settings = list(set(s.keys()).difference({'include_settings'})) - logger.error(f"Unexpected additional settings: {additional_settings}") - raise RuntimeError(f"'include_settings' must appear alone in settings file.") - - logger.debug("including settings for %s from %s" % (file_name, include_file_name)) + logger.error( + f"'include_settings' must appear alone in settings file." + ) + additional_settings = list( + set(s.keys()).difference({"include_settings"}) + ) + logger.error( + f"Unexpected additional settings: {additional_settings}" + ) + raise RuntimeError( + f"'include_settings' must appear alone in settings file." + ) + + logger.debug( + "including settings for %s from %s" % (file_name, include_file_name) + ) # recursive call to read included file INSTEAD of the file with include_settings sepcified - s, source_file_paths = \ - read_settings_file(include_file_name, mandatory=True, include_stack=source_file_paths) + s, source_file_paths = read_settings_file( + include_file_name, mandatory=True, include_stack=source_file_paths + ) # FIXME backfill with the included file settings = backfill_settings(settings, s) @@ -469,31 +515,37 @@ def backfill_settings(settings, backfill): # we are done as soon as we read one file successfully # unless if inherit_settings is set to true in this file - if not s.get('inherit_settings', False): + if not s.get("inherit_settings", False): break # if inheriting, continue and backfill settings from the next existing settings file configs_dir_list - inherit_settings = s.get('inherit_settings') + inherit_settings = s.get("inherit_settings") if isinstance(inherit_settings, str): inherit_file_name = inherit_settings - assert os.path.join(dir, inherit_file_name) not in source_file_paths, \ - f"circular inheritance of {inherit_file_name}: {source_file_paths}: " + assert ( + os.path.join(dir, inherit_file_name) not in source_file_paths + ), f"circular inheritance of {inherit_file_name}: {source_file_paths}: " # make a recursive call to switch inheritance chain to specified file configs_dir_list = None - logger.debug("inheriting additional settings for %s from %s" % (file_name, inherit_file_name)) - s, source_file_paths = \ - read_settings_file(inherit_file_name, mandatory=True, - include_stack=source_file_paths, - configs_dir_list=configs_dir_list) + logger.debug( + "inheriting additional settings for %s from %s" + % (file_name, inherit_file_name) + ) + s, source_file_paths = read_settings_file( + inherit_file_name, + mandatory=True, + include_stack=source_file_paths, + configs_dir_list=configs_dir_list, + ) # backfill with the inherited file settings = backfill_settings(settings, s) break # break the current inheritance chain (not as bad luck as breaking a chain-letter chain?...) if len(source_file_paths) > 0: - settings['source_file_paths'] = source_file_paths + settings["source_file_paths"] = source_file_paths if mandatory and not settings: raise SettingsFileNotFound(file_name, configs_dir_list) @@ -518,10 +570,10 @@ def base_settings_file_path(file_name): path to base settings file or None if not found """ - if not file_name.lower().endswith('.yaml'): - file_name = '%s.yaml' % (file_name, ) + if not file_name.lower().endswith(".yaml"): + file_name = "%s.yaml" % (file_name,) - configs_dir = inject.get_injectable('configs_dir') + configs_dir = inject.get_injectable("configs_dir") configs_dir = [configs_dir] if isinstance(configs_dir, str) else configs_dir for dir in configs_dir: @@ -537,36 +589,47 @@ def filter_warnings(): set warning filter to 'strict' if specified in settings """ - if setting('strict', False): # noqa: E402 - warnings.filterwarnings('error', category=Warning) - warnings.filterwarnings('default', category=PendingDeprecationWarning, module='future') - warnings.filterwarnings('default', category=FutureWarning, module='pandas') - warnings.filterwarnings('default', category=RuntimeWarning, module='numpy') + if setting("strict", False): # noqa: E402 + warnings.filterwarnings("error", category=Warning) + warnings.filterwarnings( + "default", category=PendingDeprecationWarning, module="future" + ) + warnings.filterwarnings("default", category=FutureWarning, module="pandas") + warnings.filterwarnings("default", category=RuntimeWarning, module="numpy") # pandas pytables.py __getitem__ (e.g. df = store['any_string']) # indirectly raises tables DeprecationWarning: tostring() is deprecated. Use tobytes() instead. - warnings.filterwarnings('ignore', category=DeprecationWarning, module='tables', message='tostring') + warnings.filterwarnings( + "ignore", category=DeprecationWarning, module="tables", message="tostring" + ) # File "tables/hdf5extension.pyx", line 1450, in tables.hdf5extension.Array._open_array # DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. # Deprecated in NumPy 1.20; - warnings.filterwarnings('ignore', category=DeprecationWarning, module='tables', - message='`np.object` is a deprecated alias') + warnings.filterwarnings( + "ignore", + category=DeprecationWarning, + module="tables", + message="`np.object` is a deprecated alias", + ) # beginning pandas version 1.3, various places emit a PerformanceWarning that is # caught in the "strict" filter above, but which are currently unavoidable for complex models. # These warning are left as warnings as an invitation for future enhancement. from pandas.errors import PerformanceWarning - warnings.filterwarnings('default', category=PerformanceWarning) + + warnings.filterwarnings("default", category=PerformanceWarning) def handle_standard_args(parser=None): from activitysim.cli import run - warnings.warn('config.handle_standard_args() has been moved to the command line ' - 'module and will be removed in future versions.', - FutureWarning) + warnings.warn( + "config.handle_standard_args() has been moved to the command line " + "module and will be removed in future versions.", + FutureWarning, + ) if parser is None: parser = argparse.ArgumentParser() diff --git a/activitysim/core/expressions.py b/activitysim/core/expressions.py index 91dc1e7ac4..65cc9e8658 100644 --- a/activitysim/core/expressions.py +++ b/activitysim/core/expressions.py @@ -6,15 +6,9 @@ import numpy as np import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import assign -from activitysim.core import inject -from activitysim.core import simulate - +from activitysim.core import assign, config, inject, simulate, tracing from activitysim.core.util import assign_in_place - logger = logging.getLogger(__name__) @@ -46,31 +40,36 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): if isinstance(model_settings, str): model_settings_name = model_settings - model_settings = config.read_model_settings('%s.yaml' % model_settings) + model_settings = config.read_model_settings("%s.yaml" % model_settings) assert model_settings, "Found no model settings for %s" % model_settings_name else: - model_settings_name = 'dict' + model_settings_name = "dict" assert isinstance(model_settings, dict) - assert 'DF' in model_settings, \ - "Expected to find 'DF' in %s" % model_settings_name + assert "DF" in model_settings, "Expected to find 'DF' in %s" % model_settings_name - df_name = model_settings.get('DF') - helper_table_names = model_settings.get('TABLES', []) - expressions_spec_name = model_settings.get('SPEC', None) + df_name = model_settings.get("DF") + helper_table_names = model_settings.get("TABLES", []) + expressions_spec_name = model_settings.get("SPEC", None) - assert expressions_spec_name is not None, \ + assert expressions_spec_name is not None, ( "Expected to find 'SPEC' in %s" % model_settings_name + ) - trace_label = tracing.extend_trace_label(trace_label or '', expressions_spec_name) + trace_label = tracing.extend_trace_label(trace_label or "", expressions_spec_name) if not expressions_spec_name.endswith(".csv"): - expressions_spec_name = '%s.csv' % expressions_spec_name - logger.debug(f"{trace_label} compute_columns using expression spec file {expressions_spec_name}") - expressions_spec = assign.read_assignment_spec(config.config_file_path(expressions_spec_name)) - - assert expressions_spec.shape[0] > 0, \ + expressions_spec_name = "%s.csv" % expressions_spec_name + logger.debug( + f"{trace_label} compute_columns using expression spec file {expressions_spec_name}" + ) + expressions_spec = assign.read_assignment_spec( + config.config_file_path(expressions_spec_name) + ) + + assert expressions_spec.shape[0] > 0, ( "Expected to find some assignment expressions in %s" % expressions_spec_name + ) tables = {t: inject.get_table(t).to_frame() for t in helper_table_names} @@ -79,28 +78,26 @@ def compute_columns(df, model_settings, locals_dict={}, trace_label=None): tables[df_name] = df # be nice and also give it to them as df? - tables['df'] = df + tables["df"] = df _locals_dict = assign.local_utilities() _locals_dict.update(locals_dict) _locals_dict.update(tables) # FIXME a number of asim model preprocessors want skim_dict - should they request it in model_settings.TABLES? - _locals_dict.update({ - # 'los': inject.get_injectable('network_los', None), - 'skim_dict': inject.get_injectable('skim_dict', None), - }) + _locals_dict.update( + { + # 'los': inject.get_injectable('network_los', None), + "skim_dict": inject.get_injectable("skim_dict", None), + } + ) - results, trace_results, trace_assigned_locals \ - = assign.assign_variables(expressions_spec, - df, - _locals_dict, - trace_rows=tracing.trace_targets(df)) + results, trace_results, trace_assigned_locals = assign.assign_variables( + expressions_spec, df, _locals_dict, trace_rows=tracing.trace_targets(df) + ) if trace_results is not None: - tracing.trace_df(trace_results, - label=trace_label, - slicer='NONE') + tracing.trace_df(trace_results, label=trace_label, slicer="NONE") if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label) @@ -131,15 +128,13 @@ def assign_columns(df, model_settings, locals_dict={}, trace_label=None): # ################################################################################################## -def annotate_preprocessors( - tours_df, locals_dict, skims, - model_settings, trace_label): +def annotate_preprocessors(tours_df, locals_dict, skims, model_settings, trace_label): locals_d = {} locals_d.update(locals_dict) locals_d.update(skims) - preprocessor_settings = model_settings.get('preprocessor', []) + preprocessor_settings = model_settings.get("preprocessor", []) if not isinstance(preprocessor_settings, list): assert isinstance(preprocessor_settings, dict) preprocessor_settings = [preprocessor_settings] @@ -152,7 +147,8 @@ def annotate_preprocessors( df=tours_df, model_settings=model_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) assign_in_place(tours_df, results) diff --git a/activitysim/core/inject.py b/activitysim/core/inject.py index da221b284b..6e2e9d5fa1 100644 --- a/activitysim/core/inject.py +++ b/activitysim/core/inject.py @@ -11,37 +11,38 @@ # we want to allow None (any anyting else) as a default value, so just choose an improbable string -_NO_DEFAULT = 'throw error if missing' +_NO_DEFAULT = "throw error if missing" logger = logging.getLogger(__name__) def step(): - def decorator(func): name = func.__name__ logger.debug("inject step %s" % name) - assert not _DECORATED_STEPS.get(name, False), \ + assert not _DECORATED_STEPS.get(name, False), ( "step '%s' already decorated." % name + ) _DECORATED_STEPS[name] = func orca.add_step(name, func) return func + return decorator def table(): - def decorator(func): name = func.__name__ logger.debug("inject table %s" % name) - assert not _DECORATED_TABLES.get(name, False), \ + assert not _DECORATED_TABLES.get(name, False), ( "table '%s' already decorated." % name + ) _DECORATED_TABLES[name] = func orca.add_table(name, func) @@ -58,14 +59,16 @@ def decorator(func): logger.debug("inject injectable %s" % name) # insist on explicit override to ensure multiple definitions occur in correct order - assert override or not _DECORATED_INJECTABLES.get(name, False), \ + assert override or not _DECORATED_INJECTABLES.get(name, False), ( "injectable '%s' already defined. not overridden" % name + ) - _DECORATED_INJECTABLES[name] = {'func': func, 'cache': cache} + _DECORATED_INJECTABLES[name] = {"func": func, "cache": cache} orca.add_injectable(name, func, cache=cache) return func + return decorator @@ -82,7 +85,11 @@ def add_table(table_name, table, replace=False): Add new table and raise assertion error if the table already exists. Silently replace if replace=True. """ - if not replace and orca.is_table(table_name) and orca.table_type(table_name) == 'dataframe': + if ( + not replace + and orca.is_table(table_name) + and orca.table_type(table_name) == "dataframe" + ): logger.warning("inject add_table replacing existing table %s" % table_name) assert False @@ -99,10 +106,17 @@ def add_injectable(name, injectable, cache=False): return orca.add_injectable(name, injectable, cache=cache) -def broadcast(cast, onto, cast_on=None, onto_on=None, cast_index=False, onto_index=False): - return orca.broadcast(cast, onto, - cast_on=cast_on, onto_on=onto_on, - cast_index=cast_index, onto_index=onto_index) +def broadcast( + cast, onto, cast_on=None, onto_on=None, cast_index=False, onto_index=False +): + return orca.broadcast( + cast, + onto, + cast_on=cast_on, + onto_on=onto_on, + cast_index=cast_index, + onto_index=onto_index, + ) def get_table(name, default=_NO_DEFAULT): @@ -151,11 +165,11 @@ def reinject_decorated_tables(): for column_key, args in _DECORATED_COLUMNS.items(): table_name, column_name = column_key logger.debug("reinject decorated column %s.%s" % (table_name, column_name)) - orca.add_column(table_name, column_name, args['func'], cache=args['cache']) + orca.add_column(table_name, column_name, args["func"], cache=args["cache"]) for name, args in _DECORATED_INJECTABLES.items(): logger.debug("reinject decorated injectable %s" % name) - orca.add_injectable(name, args['func'], cache=args['cache']) + orca.add_injectable(name, args["func"], cache=args["cache"]) def clear_cache(): @@ -165,12 +179,12 @@ def clear_cache(): def set_step_args(args=None): assert isinstance(args, dict) or args is None - orca.add_injectable('step_args', args) + orca.add_injectable("step_args", args) def get_step_arg(arg_name, default=_NO_DEFAULT): - args = orca.get_injectable('step_args') + args = orca.get_injectable("step_args") assert isinstance(args, dict) if arg_name not in args and default == _NO_DEFAULT: diff --git a/activitysim/core/input.py b/activitysim/core/input.py index 3ae3fd806a..f542a00346 100644 --- a/activitysim/core/input.py +++ b/activitysim/core/input.py @@ -2,23 +2,18 @@ # See full license in LICENSE.txt. import logging -import warnings import os +import warnings import pandas as pd -from activitysim.core import ( - inject, - config, - util, -) -from activitysim.core import mem +from activitysim.core import config, inject, mem, util logger = logging.getLogger(__name__) def canonical_table_index_name(table_name): - table_index_names = inject.get_injectable('canonical_table_index_names', None) + table_index_names = inject.get_injectable("canonical_table_index_names", None) return table_index_names and table_index_names.get(table_name, None) @@ -35,19 +30,21 @@ def read_input_table(tablename, required=True): ------- pandas DataFrame """ - table_list = config.setting('input_table_list') - assert table_list is not None, 'no input_table_list found in settings' + table_list = config.setting("input_table_list") + assert table_list is not None, "no input_table_list found in settings" table_info = None for info in table_list: - if info['tablename'] == tablename: + if info["tablename"] == tablename: table_info = info if table_info is not None: df = read_from_table_info(table_info) else: if required: - raise RuntimeError(f"could not find info for for tablename {tablename} in settings file") + raise RuntimeError( + f"could not find info for for tablename {tablename} in settings file" + ) df = None return df @@ -78,17 +75,17 @@ def read_from_table_info(table_info): +--------------+----------------------------------------------------------+ """ - input_store = config.setting('input_store', None) - create_input_store = config.setting('create_input_store', default=False) - - tablename = table_info.get('tablename') - data_filename = table_info.get('filename', input_store) - h5_tablename = table_info.get('h5_tablename') or tablename - drop_columns = table_info.get('drop_columns', None) - column_map = table_info.get('column_map', None) - keep_columns = table_info.get('keep_columns', None) - rename_columns = table_info.get('rename_columns', None) - csv_dtypes = table_info.get('dtypes', {}) + input_store = config.setting("input_store", None) + create_input_store = config.setting("create_input_store", default=False) + + tablename = table_info.get("tablename") + data_filename = table_info.get("filename", input_store) + h5_tablename = table_info.get("h5_tablename") or tablename + drop_columns = table_info.get("drop_columns", None) + column_map = table_info.get("column_map", None) + keep_columns = table_info.get("keep_columns", None) + rename_columns = table_info.get("rename_columns", None) + csv_dtypes = table_info.get("dtypes", {}) # don't require a redundant index_col directive for canonical tables # but allow explicit disabling of assignment of index col for canonical tables, in which case, presumably, @@ -96,19 +93,22 @@ def read_from_table_info(table_info): canonical_index_col = canonical_table_index_name(tablename) # if there is an explicit index_col entry in table_info - if 'index_col' in table_info: + if "index_col" in table_info: # honor explicit index_col unless it conflicts with canonical name - index_col = table_info['index_col'] + index_col = table_info["index_col"] if canonical_index_col: if index_col: # if there is a non-empty index_col directive, it should be for canonical_table_index_name - assert index_col == canonical_index_col, \ - f"{tablename} index_col {table_info.get('index_col')} should be {index_col}" + assert ( + index_col == canonical_index_col + ), f"{tablename} index_col {table_info.get('index_col')} should be {index_col}" else: - logger.info(f"Not assigning canonical index_col {tablename}.{canonical_index_col} " - f"because settings file index_col directive is explicitly None.") + logger.info( + f"Not assigning canonical index_col {tablename}.{canonical_index_col} " + f"because settings file index_col directive is explicitly None." + ) # if there is an index_col directive for a canonical table, it should be for canonical_table_index_name @@ -116,34 +116,38 @@ def read_from_table_info(table_info): # otherwise default is to use canonical index name for known tables, and no index for unknown tables index_col = canonical_index_col - assert tablename is not None, 'no tablename provided' - assert data_filename is not None, 'no input file provided' + assert tablename is not None, "no tablename provided" + assert data_filename is not None, "no input file provided" data_file_path = config.data_file_path(data_filename) - df = _read_input_file(data_file_path, h5_tablename=h5_tablename, csv_dtypes=csv_dtypes) + df = _read_input_file( + data_file_path, h5_tablename=h5_tablename, csv_dtypes=csv_dtypes + ) # logger.debug('raw %s table columns: %s' % (tablename, df.columns.values)) - logger.debug('raw %s table size: %s' % (tablename, util.df_size(df))) + logger.debug("raw %s table size: %s" % (tablename, util.df_size(df))) if create_input_store: - h5_filepath = config.output_file_path('input_data.h5') - logger.info('writing %s to %s' % (h5_tablename, h5_filepath)) - df.to_hdf(h5_filepath, key=h5_tablename, mode='a') + h5_filepath = config.output_file_path("input_data.h5") + logger.info("writing %s to %s" % (h5_tablename, h5_filepath)) + df.to_hdf(h5_filepath, key=h5_tablename, mode="a") - csv_dir = config.output_file_path('input_data') + csv_dir = config.output_file_path("input_data") if not os.path.exists(csv_dir): os.makedirs(csv_dir) # make directory if needed - df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False) + df.to_csv(os.path.join(csv_dir, "%s.csv" % tablename), index=False) if drop_columns: logger.debug("dropping columns: %s" % drop_columns) - df.drop(columns=drop_columns, inplace=True, errors='ignore') + df.drop(columns=drop_columns, inplace=True, errors="ignore") if column_map: - warnings.warn("table_inf option 'column_map' renamed 'rename_columns'" - "Support for 'column_map' will be removed in future versions.", - FutureWarning) + warnings.warn( + "table_inf option 'column_map' renamed 'rename_columns'" + "Support for 'column_map' will be removed in future versions.", + FutureWarning, + ) logger.debug("renaming columns: %s" % column_map) df.rename(columns=column_map, inplace=True) @@ -158,52 +162,62 @@ def read_from_table_info(table_info): assert not df.duplicated(index_col).any() if canonical_index_col: # we expect canonical indexes to be integer-valued - assert (df[index_col] == df[index_col].astype(int)).all(), \ - f"Index col '{index_col}' has non-integer values" + assert ( + df[index_col] == df[index_col].astype(int) + ).all(), f"Index col '{index_col}' has non-integer values" df[index_col] = df[index_col].astype(int) df.set_index(index_col, inplace=True) else: # FIXME not sure we want to do this. More likely they omitted index col than that they want to name it? # df.index.names = [index_col] - logger.error(f"index_col '{index_col}' specified in configs but not in {tablename} table!") + logger.error( + f"index_col '{index_col}' specified in configs but not in {tablename} table!" + ) logger.error(f"{tablename} columns are: {list(df.columns)}") raise RuntimeError(f"index_col '{index_col}' not in {tablename} table!") if keep_columns: logger.debug("keeping columns: %s" % keep_columns) if not set(keep_columns).issubset(set(df.columns)): - logger.error(f"Required columns missing from {tablename} table: " - f"{list(set(keep_columns).difference(set(df.columns)))}") + logger.error( + f"Required columns missing from {tablename} table: " + f"{list(set(keep_columns).difference(set(df.columns)))}" + ) logger.error(f"{tablename} table has columns: {list(df.columns)}") raise RuntimeError(f"Required columns missing from {tablename} table") df = df[keep_columns] if df.columns.duplicated().any(): - duplicate_column_names = df.columns[df.columns.duplicated(keep=False)].unique().to_list() - assert not df.columns.duplicated().any(), f"duplicate columns names in {tablename}: {duplicate_column_names}" + duplicate_column_names = ( + df.columns[df.columns.duplicated(keep=False)].unique().to_list() + ) + assert ( + not df.columns.duplicated().any() + ), f"duplicate columns names in {tablename}: {duplicate_column_names}" - logger.debug('%s table columns: %s' % (tablename, df.columns.values)) - logger.debug('%s table size: %s' % (tablename, util.df_size(df))) - logger.debug('%s index name: %s' % (tablename, df.index.name)) + logger.debug("%s table columns: %s" % (tablename, df.columns.values)) + logger.debug("%s table size: %s" % (tablename, util.df_size(df))) + logger.debug("%s index name: %s" % (tablename, df.index.name)) return df def _read_input_file(filepath, h5_tablename=None, csv_dtypes=None): - assert os.path.exists(filepath), 'input file not found: %s' % filepath + assert os.path.exists(filepath), "input file not found: %s" % filepath - if filepath.endswith('.csv'): + if filepath.endswith(".csv"): return _read_csv_with_fallback_encoding(filepath, csv_dtypes) - if filepath.endswith('.h5'): - assert h5_tablename is not None, 'must provide a tablename to read HDF5 table' - logger.info('reading %s table from %s' % (h5_tablename, filepath)) + if filepath.endswith(".h5"): + assert h5_tablename is not None, "must provide a tablename to read HDF5 table" + logger.info("reading %s table from %s" % (h5_tablename, filepath)) return pd.read_hdf(filepath, h5_tablename) raise IOError( - 'Unsupported file type: %s. ' - 'ActivitySim supports CSV and HDF5 files only' % filepath) + "Unsupported file type: %s. " + "ActivitySim supports CSV and HDF5 files only" % filepath + ) def _read_csv_with_fallback_encoding(filepath, dtypes=None): @@ -213,12 +227,14 @@ def _read_csv_with_fallback_encoding(filepath, dtypes=None): """ try: - logger.info('Reading CSV file %s' % filepath) - df = pd.read_csv(filepath, comment='#', dtype=dtypes) + logger.info("Reading CSV file %s" % filepath) + df = pd.read_csv(filepath, comment="#", dtype=dtypes) except UnicodeDecodeError: logger.warning( - 'Reading %s with default utf-8 encoding failed, trying cp1252 instead', filepath) - df = pd.read_csv(filepath, comment='#', encoding='cp1252', dtype=dtypes) + "Reading %s with default utf-8 encoding failed, trying cp1252 instead", + filepath, + ) + df = pd.read_csv(filepath, comment="#", encoding="cp1252", dtype=dtypes) if dtypes: # although the dtype argument suppresses the DtypeWarning, it does not coerce recognized types (e.g. int) diff --git a/activitysim/core/interaction_sample.py b/activitysim/core/interaction_sample.py index 5548242b58..4d2edaf6de 100644 --- a/activitysim/core/interaction_sample.py +++ b/activitysim/core/interaction_sample.py @@ -1,33 +1,30 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range - import logging - +from builtins import range from math import ceil + import numpy as np import pandas as pd -from . import logit -from . import tracing -from . import chunk +from . import chunk, interaction_simulate, logit, pipeline, tracing from .simulate import set_skim_wrapper_targets - -from . import interaction_simulate -from . import pipeline - logger = logging.getLogger(__name__) DUMP = False def make_sample_choices( - choosers, probs, - alternatives, - sample_size, alternative_count, alt_col_name, - allow_zero_probs, - trace_label): + choosers, + probs, + alternatives, + sample_size, + alternative_count, + alt_col_name, + allow_zero_probs, + trace_label, +): """ Parameters @@ -55,21 +52,23 @@ def make_sample_choices( assert len(alternatives) == alternative_count if allow_zero_probs: - zero_probs = (probs.sum(axis=1) == 0) + zero_probs = probs.sum(axis=1) == 0 if zero_probs.all(): - return pd.DataFrame(columns=[alt_col_name, 'rand', 'prob', choosers.index.name]) + return pd.DataFrame( + columns=[alt_col_name, "rand", "prob", choosers.index.name] + ) if zero_probs.any(): # remove from sample probs = probs[~zero_probs] choosers = choosers[~zero_probs] cum_probs_array = probs.values.cumsum(axis=1) - chunk.log_df(trace_label, 'cum_probs_array', cum_probs_array) + chunk.log_df(trace_label, "cum_probs_array", cum_probs_array) # alt probs in convenient layout to return prob of chose alternative # (same layout as cum_probs_arr) alt_probs_array = probs.values.flatten() - chunk.log_df(trace_label, 'alt_probs_array', alt_probs_array) + chunk.log_df(trace_label, "alt_probs_array", alt_probs_array) # get sample_size rands for each chooser rands = pipeline.get_rn_generator().random_for_df(probs, n=sample_size) @@ -78,7 +77,7 @@ def make_sample_choices( # reshape so rands[i] is in broadcastable (2-D) shape for cum_probs_arr # i.e rands[i] is a 2-D array of one alt choice rand for each chooser rands = rands.T.reshape(sample_size, -1, 1) - chunk.log_df(trace_label, 'rands', rands) + chunk.log_df(trace_label, "rands", rands) # the alternative value chosen choices_array = np.empty([sample_size, len(choosers)]).astype(int) @@ -89,7 +88,7 @@ def make_sample_choices( # chunk log these later after we populate them... alts = np.tile(alternatives.index.values, len(choosers)) - chunk.log_df(trace_label, 'alts', alts) + chunk.log_df(trace_label, "alts", alts) # FIXME - do this all at once rather than iterate? for i in range(sample_size): @@ -121,47 +120,53 @@ def make_sample_choices( del positions del offsets - chunk.log_df(trace_label, 'choices_array', choices_array) - chunk.log_df(trace_label, 'choice_probs_array', choice_probs_array) + chunk.log_df(trace_label, "choices_array", choices_array) + chunk.log_df(trace_label, "choice_probs_array", choice_probs_array) del alts - chunk.log_df(trace_label, 'alts', None) + chunk.log_df(trace_label, "alts", None) del cum_probs_array - chunk.log_df(trace_label, 'cum_probs_array', None) + chunk.log_df(trace_label, "cum_probs_array", None) del alt_probs_array - chunk.log_df(trace_label, 'alt_probs_array', None) + chunk.log_df(trace_label, "alt_probs_array", None) # explode to one row per chooser.index, alt_zone_id choices_df = pd.DataFrame( - {alt_col_name: choices_array.flatten(order='F'), - 'rand': rands.flatten(order='F'), - 'prob': choice_probs_array.flatten(order='F'), - choosers.index.name: np.repeat(np.asanyarray(choosers.index), sample_size) - }) + { + alt_col_name: choices_array.flatten(order="F"), + "rand": rands.flatten(order="F"), + "prob": choice_probs_array.flatten(order="F"), + choosers.index.name: np.repeat(np.asanyarray(choosers.index), sample_size), + } + ) - chunk.log_df(trace_label, 'choices_df', choices_df) + chunk.log_df(trace_label, "choices_df", choices_df) del choices_array - chunk.log_df(trace_label, 'choices_array', None) + chunk.log_df(trace_label, "choices_array", None) del rands - chunk.log_df(trace_label, 'rands', None) + chunk.log_df(trace_label, "rands", None) del choice_probs_array - chunk.log_df(trace_label, 'choice_probs_array', None) + chunk.log_df(trace_label, "choice_probs_array", None) # handing this off to caller - chunk.log_df(trace_label, 'choices_df', None) + chunk.log_df(trace_label, "choices_df", None) return choices_df def _interaction_sample( - choosers, alternatives, - spec, sample_size, alt_col_name, - allow_zero_probs=False, - log_alt_losers=False, - skims=None, - locals_d=None, - trace_label=None): + choosers, + alternatives, + spec, + sample_size, + alt_col_name, + allow_zero_probs=False, + log_alt_losers=False, + skims=None, + locals_d=None, + trace_label=None, +): """ Run a MNL simulation in the situation in which alternatives must be merged with choosers because there are interaction terms or @@ -223,12 +228,16 @@ def _interaction_sample( assert num_choosers > 0 if have_trace_targets: - tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, 'choosers')) - tracing.trace_df(alternatives, tracing.extend_trace_label(trace_label, 'alternatives'), - slicer='NONE', transpose=False) + tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers")) + tracing.trace_df( + alternatives, + tracing.extend_trace_label(trace_label, "alternatives"), + slicer="NONE", + transpose=False, + ) if len(spec.columns) > 1: - raise RuntimeError('spec must have only one column') + raise RuntimeError("spec must have only one column") # if using skims, copy index into the dataframe, so it will be # available as the "destination" for set_skim_wrapper_targets @@ -242,11 +251,14 @@ def _interaction_sample( # for every chooser, there will be a row for each alternative # index values (non-unique) are from alternatives df alternative_count = alternatives.shape[0] - interaction_df = \ - logit.interaction_dataset(choosers, alternatives, sample_size=alternative_count, - chooser_index_id=chooser_index_id) + interaction_df = logit.interaction_dataset( + choosers, + alternatives, + sample_size=alternative_count, + chooser_index_id=chooser_index_id, + ) - chunk.log_df(trace_label, 'interaction_df', interaction_df) + chunk.log_df(trace_label, "interaction_df", interaction_df) assert alternative_count == len(interaction_df.index) / len(choosers.index) @@ -259,93 +271,132 @@ def _interaction_sample( # utilities has utility value for element in the cross product of choosers and alternatives # interaction_utilities is a df with one utility column and one row per row in interaction_df if have_trace_targets: - trace_rows, trace_ids \ - = tracing.interaction_trace_rows(interaction_df, choosers, alternative_count) - - tracing.trace_df(interaction_df[trace_rows], - tracing.extend_trace_label(trace_label, 'interaction_df'), - slicer='NONE', transpose=False) + trace_rows, trace_ids = tracing.interaction_trace_rows( + interaction_df, choosers, alternative_count + ) + + tracing.trace_df( + interaction_df[trace_rows], + tracing.extend_trace_label(trace_label, "interaction_df"), + slicer="NONE", + transpose=False, + ) else: trace_rows = trace_ids = None # interaction_utilities is a df with one utility column and one row per interaction_df row - interaction_utilities, trace_eval_results \ - = interaction_simulate.eval_interaction_utilities(spec, interaction_df, locals_d, trace_label, trace_rows, - estimator=None, - log_alt_losers=log_alt_losers) - chunk.log_df(trace_label, 'interaction_utilities', interaction_utilities) + ( + interaction_utilities, + trace_eval_results, + ) = interaction_simulate.eval_interaction_utilities( + spec, + interaction_df, + locals_d, + trace_label, + trace_rows, + estimator=None, + log_alt_losers=log_alt_losers, + ) + chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) # ########### HWM - high water mark (point of max observed memory usage) del interaction_df - chunk.log_df(trace_label, 'interaction_df', None) + chunk.log_df(trace_label, "interaction_df", None) if have_trace_targets: - tracing.trace_interaction_eval_results(trace_eval_results, trace_ids, - tracing.extend_trace_label(trace_label, 'eval')) + tracing.trace_interaction_eval_results( + trace_eval_results, + trace_ids, + tracing.extend_trace_label(trace_label, "eval"), + ) - tracing.trace_df(interaction_utilities[trace_rows], - tracing.extend_trace_label(trace_label, 'interaction_utilities'), - slicer='NONE', transpose=False) + tracing.trace_df( + interaction_utilities[trace_rows], + tracing.extend_trace_label(trace_label, "interaction_utilities"), + slicer="NONE", + transpose=False, + ) - tracing.dump_df(DUMP, interaction_utilities, trace_label, 'interaction_utilities') + tracing.dump_df(DUMP, interaction_utilities, trace_label, "interaction_utilities") # reshape utilities (one utility column and one row per row in interaction_utilities) # to a dataframe with one row per chooser and one column per alternative utilities = pd.DataFrame( interaction_utilities.values.reshape(len(choosers), alternative_count), - index=choosers.index) - chunk.log_df(trace_label, 'utilities', utilities) + index=choosers.index, + ) + chunk.log_df(trace_label, "utilities", utilities) del interaction_utilities - chunk.log_df(trace_label, 'interaction_utilities', None) + chunk.log_df(trace_label, "interaction_utilities", None) if have_trace_targets: - tracing.trace_df(utilities, tracing.extend_trace_label(trace_label, 'utils'), - column_labels=['alternative', 'utility']) + tracing.trace_df( + utilities, + tracing.extend_trace_label(trace_label, "utils"), + column_labels=["alternative", "utility"], + ) - tracing.dump_df(DUMP, utilities, trace_label, 'utilities') + tracing.dump_df(DUMP, utilities, trace_label, "utilities") # convert to probabilities (utilities exponentiated and normalized to probs) # probs is same shape as utilities, one row per chooser and one column for alternative - probs = logit.utils_to_probs(utilities, allow_zero_probs=allow_zero_probs, - trace_label=trace_label, trace_choosers=choosers) - chunk.log_df(trace_label, 'probs', probs) + probs = logit.utils_to_probs( + utilities, + allow_zero_probs=allow_zero_probs, + trace_label=trace_label, + trace_choosers=choosers, + ) + chunk.log_df(trace_label, "probs", probs) del utilities - chunk.log_df(trace_label, 'utilities', None) + chunk.log_df(trace_label, "utilities", None) if have_trace_targets: - tracing.trace_df(probs, tracing.extend_trace_label(trace_label, 'probs'), - column_labels=['alternative', 'probability']) + tracing.trace_df( + probs, + tracing.extend_trace_label(trace_label, "probs"), + column_labels=["alternative", "probability"], + ) if sample_size == 0: # FIXME return full alternative set rather than sample - logger.info("Estimation mode for %s using unsampled alternatives" % (trace_label, )) + logger.info( + "Estimation mode for %s using unsampled alternatives" % (trace_label,) + ) index_name = probs.index.name - choices_df = \ - pd.melt(probs.reset_index(), id_vars=[index_name])\ - .sort_values(by=index_name, kind='mergesort')\ - .set_index(index_name)\ - .rename(columns={'value': 'prob'})\ - .drop(columns='variable') - - choices_df['pick_count'] = 1 - choices_df.insert(0, alt_col_name, np.tile(alternatives.index.values, len(choosers.index))) + choices_df = ( + pd.melt(probs.reset_index(), id_vars=[index_name]) + .sort_values(by=index_name, kind="mergesort") + .set_index(index_name) + .rename(columns={"value": "prob"}) + .drop(columns="variable") + ) + + choices_df["pick_count"] = 1 + choices_df.insert( + 0, alt_col_name, np.tile(alternatives.index.values, len(choosers.index)) + ) return choices_df else: choices_df = make_sample_choices( - choosers, probs, alternatives, - sample_size, alternative_count, alt_col_name, + choosers, + probs, + alternatives, + sample_size, + alternative_count, + alt_col_name, allow_zero_probs=allow_zero_probs, - trace_label=trace_label) + trace_label=trace_label, + ) - chunk.log_df(trace_label, 'choices_df', choices_df) + chunk.log_df(trace_label, "choices_df", choices_df) del probs - chunk.log_df(trace_label, 'probs', None) + chunk.log_df(trace_label, "probs", None) # pick_count and pick_dup # pick_count is number of duplicate picks @@ -353,47 +404,56 @@ def _interaction_sample( pick_group = choices_df.groupby([choosers.index.name, alt_col_name]) # number each item in each group from 0 to the length of that group - 1. - choices_df['pick_count'] = pick_group.cumcount(ascending=True) + choices_df["pick_count"] = pick_group.cumcount(ascending=True) # flag duplicate rows after first - choices_df['pick_dup'] = choices_df['pick_count'] > 0 + choices_df["pick_dup"] = choices_df["pick_count"] > 0 # add reverse cumcount to get total pick_count (conveniently faster than groupby.count + merge) - choices_df['pick_count'] += pick_group.cumcount(ascending=False) + 1 + choices_df["pick_count"] += pick_group.cumcount(ascending=False) + 1 # drop the duplicates - choices_df = choices_df[~choices_df['pick_dup']] - del choices_df['pick_dup'] - chunk.log_df(trace_label, 'choices_df', choices_df) + choices_df = choices_df[~choices_df["pick_dup"]] + del choices_df["pick_dup"] + chunk.log_df(trace_label, "choices_df", choices_df) # set index after groupby so we can trace on it choices_df.set_index(choosers.index.name, inplace=True) - tracing.dump_df(DUMP, choices_df, trace_label, 'choices_df') + tracing.dump_df(DUMP, choices_df, trace_label, "choices_df") if have_trace_targets: - tracing.trace_df(choices_df, - tracing.extend_trace_label(trace_label, 'sampled_alternatives'), - transpose=False, - column_labels=['sample_alt', 'alternative']) + tracing.trace_df( + choices_df, + tracing.extend_trace_label(trace_label, "sampled_alternatives"), + transpose=False, + column_labels=["sample_alt", "alternative"], + ) # don't need this after tracing - del choices_df['rand'] - chunk.log_df(trace_label, 'choices_df', choices_df) + del choices_df["rand"] + chunk.log_df(trace_label, "choices_df", choices_df) # - NARROW - choices_df['prob'] = choices_df['prob'].astype(np.float32) - assert (choices_df['pick_count'].max() < 4294967295) or (choices_df.empty) - choices_df['pick_count'] = choices_df['pick_count'].astype(np.uint32) + choices_df["prob"] = choices_df["prob"].astype(np.float32) + assert (choices_df["pick_count"].max() < 4294967295) or (choices_df.empty) + choices_df["pick_count"] = choices_df["pick_count"].astype(np.uint32) return choices_df def interaction_sample( - choosers, alternatives, spec, sample_size, - alt_col_name, - allow_zero_probs=False, - log_alt_losers=False, - skims=None, locals_d=None, chunk_size=0, chunk_tag=None, - trace_label=None): + choosers, + alternatives, + spec, + sample_size, + alt_col_name, + allow_zero_probs=False, + log_alt_losers=False, + skims=None, + locals_d=None, + chunk_size=0, + chunk_tag=None, + trace_label=None, +): """ Run a simulation in the situation in which alternatives must @@ -451,7 +511,7 @@ def interaction_sample( number of duplicate picks for chooser, alt """ - trace_label = tracing.extend_trace_label(trace_label, 'interaction_sample') + trace_label = tracing.extend_trace_label(trace_label, "interaction_sample") chunk_tag = chunk_tag or trace_label # we return alternatives ordered in (index, alt_col_name) @@ -463,24 +523,28 @@ def interaction_sample( sample_size = min(sample_size, len(alternatives.index)) result_list = [] - for i, chooser_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag): - - choices = _interaction_sample(chooser_chunk, alternatives, - spec=spec, - sample_size=sample_size, - alt_col_name=alt_col_name, - allow_zero_probs=allow_zero_probs, - log_alt_losers=log_alt_losers, - skims=skims, - locals_d=locals_d, - trace_label=chunk_trace_label) + for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + choosers, chunk_size, trace_label, chunk_tag + ): + + choices = _interaction_sample( + chooser_chunk, + alternatives, + spec=spec, + sample_size=sample_size, + alt_col_name=alt_col_name, + allow_zero_probs=allow_zero_probs, + log_alt_losers=log_alt_losers, + skims=skims, + locals_d=locals_d, + trace_label=chunk_trace_label, + ) if choices.shape[0] > 0: # might not be any if allow_zero_probs result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: @@ -488,9 +552,11 @@ def interaction_sample( if len(result_list) > 1: choices = pd.concat(result_list) - assert allow_zero_probs or (len(choosers.index) == len(np.unique(choices.index.values))) + assert allow_zero_probs or ( + len(choosers.index) == len(np.unique(choices.index.values)) + ) # keep alts in canonical order so choices based on their probs are stable across runs - choices = choices.sort_values(by=alt_col_name).sort_index(kind='mergesort') + choices = choices.sort_values(by=alt_col_name).sort_index(kind="mergesort") return choices diff --git a/activitysim/core/interaction_sample_simulate.py b/activitysim/core/interaction_sample_simulate.py index 9f7a275ba5..776df91d47 100644 --- a/activitysim/core/interaction_sample_simulate.py +++ b/activitysim/core/interaction_sample_simulate.py @@ -5,24 +5,27 @@ import numpy as np import pandas as pd -from . import logit -from . import tracing -from . import chunk +from . import chunk, interaction_simulate, logit, tracing from .simulate import set_skim_wrapper_targets -from . import interaction_simulate - logger = logging.getLogger(__name__) def _interaction_sample_simulate( - choosers, alternatives, spec, - choice_column, - allow_zero_probs, zero_prob_choice_val, log_alt_losers, - want_logsums, - skims, locals_d, - trace_label, trace_choice_name, - estimator): + choosers, + alternatives, + spec, + choice_column, + allow_zero_probs, + zero_prob_choice_val, + log_alt_losers, + want_logsums, + skims, + locals_d, + trace_label, + trace_choice_name, + estimator, +): """ Run a MNL simulation in the situation in which alternatives must @@ -88,17 +91,22 @@ def _interaction_sample_simulate( # this is the more general check (not requiring is_monotonic_increasing) last_repeat = alternatives.index != np.roll(alternatives.index, -1) - assert (choosers.shape[0] == 1) or choosers.index.equals(alternatives.index[last_repeat]) + assert (choosers.shape[0] == 1) or choosers.index.equals( + alternatives.index[last_repeat] + ) have_trace_targets = tracing.has_trace_targets(choosers) if have_trace_targets: - tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, 'choosers')) - tracing.trace_df(alternatives, tracing.extend_trace_label(trace_label, 'alternatives'), - transpose=False) + tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers")) + tracing.trace_df( + alternatives, + tracing.extend_trace_label(trace_label, "alternatives"), + transpose=False, + ) if len(spec.columns) > 1: - raise RuntimeError('spec must have only one column') + raise RuntimeError("spec must have only one column") # if using skims, copy index into the dataframe, so it will be # available as the "destination" for the skims dereference below @@ -115,21 +123,25 @@ def _interaction_sample_simulate( # so we just need to left join alternatives with choosers assert alternatives.index.name == choosers.index.name - interaction_df = alternatives.join(choosers, how='left', rsuffix='_chooser') + interaction_df = alternatives.join(choosers, how="left", rsuffix="_chooser") if log_alt_losers: # logit.interaction_dataset adds ALT_CHOOSER_ID column if log_alt_losers is True # to enable detection of zero_prob-driving utils (e.g. -999 for all alts in a chooser) - interaction_df[interaction_simulate.ALT_CHOOSER_ID] = interaction_df.index.values + interaction_df[ + interaction_simulate.ALT_CHOOSER_ID + ] = interaction_df.index.values - chunk.log_df(trace_label, 'interaction_df', interaction_df) + chunk.log_df(trace_label, "interaction_df", interaction_df) if have_trace_targets: trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, choosers) - tracing.trace_df(interaction_df, - tracing.extend_trace_label(trace_label, 'interaction_df'), - transpose=False) + tracing.trace_df( + interaction_df, + tracing.extend_trace_label(trace_label, "interaction_df"), + transpose=False, + ) else: trace_rows = trace_ids = None @@ -141,21 +153,35 @@ def _interaction_sample_simulate( # column names of choosers match spec index values # utilities has utility value for element in the cross product of choosers and alternatives # interaction_utilities is a df with one utility column and one row per row in alternative - interaction_utilities, trace_eval_results \ - = interaction_simulate.eval_interaction_utilities(spec, interaction_df, locals_d, trace_label, trace_rows, - estimator=estimator, log_alt_losers=log_alt_losers) - chunk.log_df(trace_label, 'interaction_utilities', interaction_utilities) + ( + interaction_utilities, + trace_eval_results, + ) = interaction_simulate.eval_interaction_utilities( + spec, + interaction_df, + locals_d, + trace_label, + trace_rows, + estimator=estimator, + log_alt_losers=log_alt_losers, + ) + chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) del interaction_df - chunk.log_df(trace_label, 'interaction_df', None) + chunk.log_df(trace_label, "interaction_df", None) if have_trace_targets: - tracing.trace_interaction_eval_results(trace_eval_results, trace_ids, - tracing.extend_trace_label(trace_label, 'eval')) - - tracing.trace_df(interaction_utilities, - tracing.extend_trace_label(trace_label, 'interaction_utilities'), - transpose=False) + tracing.trace_interaction_eval_results( + trace_eval_results, + trace_ids, + tracing.extend_trace_label(trace_label, "eval"), + ) + + tracing.trace_df( + interaction_utilities, + tracing.extend_trace_label(trace_label, "interaction_utilities"), + transpose=False, + ) # reshape utilities (one utility column and one row per row in model_design) # to a dataframe with one row per chooser and one column per alternative @@ -163,8 +189,10 @@ def _interaction_sample_simulate( # so we need to pad with dummy utilities so low that they are never chosen # number of samples per chooser - sample_counts = interaction_utilities.groupby(interaction_utilities.index).size().values - chunk.log_df(trace_label, 'sample_counts', sample_counts) + sample_counts = ( + interaction_utilities.groupby(interaction_utilities.index).size().values + ) + chunk.log_df(trace_label, "sample_counts", sample_counts) # max number of alternatvies for any chooser max_sample_count = sample_counts.max() @@ -179,51 +207,61 @@ def _interaction_sample_simulate( inserts = np.repeat(last_row_offsets, max_sample_count - sample_counts) del sample_counts - chunk.log_df(trace_label, 'sample_counts', None) + chunk.log_df(trace_label, "sample_counts", None) # insert the zero-prob utilities to pad each alternative set to same size padded_utilities = np.insert(interaction_utilities.utility.values, inserts, -999) - chunk.log_df(trace_label, 'padded_utilities', padded_utilities) + chunk.log_df(trace_label, "padded_utilities", padded_utilities) del inserts del interaction_utilities - chunk.log_df(trace_label, 'interaction_utilities', None) + chunk.log_df(trace_label, "interaction_utilities", None) # reshape to array with one row per chooser, one column per alternative padded_utilities = padded_utilities.reshape(-1, max_sample_count) # convert to a dataframe with one row per chooser and one column per alternative - utilities_df = pd.DataFrame( - padded_utilities, - index=choosers.index) - chunk.log_df(trace_label, 'utilities_df', utilities_df) + utilities_df = pd.DataFrame(padded_utilities, index=choosers.index) + chunk.log_df(trace_label, "utilities_df", utilities_df) del padded_utilities - chunk.log_df(trace_label, 'padded_utilities', None) + chunk.log_df(trace_label, "padded_utilities", None) if have_trace_targets: - tracing.trace_df(utilities_df, tracing.extend_trace_label(trace_label, 'utilities'), - column_labels=['alternative', 'utility']) + tracing.trace_df( + utilities_df, + tracing.extend_trace_label(trace_label, "utilities"), + column_labels=["alternative", "utility"], + ) # convert to probabilities (utilities exponentiated and normalized to probs) # probs is same shape as utilities, one row per chooser and one column for alternative - probs = logit.utils_to_probs(utilities_df, allow_zero_probs=allow_zero_probs, - trace_label=trace_label, trace_choosers=choosers) - chunk.log_df(trace_label, 'probs', probs) + probs = logit.utils_to_probs( + utilities_df, + allow_zero_probs=allow_zero_probs, + trace_label=trace_label, + trace_choosers=choosers, + ) + chunk.log_df(trace_label, "probs", probs) if want_logsums: - logsums = logit.utils_to_logsums(utilities_df, allow_zero_probs=allow_zero_probs) - chunk.log_df(trace_label, 'logsums', logsums) + logsums = logit.utils_to_logsums( + utilities_df, allow_zero_probs=allow_zero_probs + ) + chunk.log_df(trace_label, "logsums", logsums) del utilities_df - chunk.log_df(trace_label, 'utilities_df', None) + chunk.log_df(trace_label, "utilities_df", None) if have_trace_targets: - tracing.trace_df(probs, tracing.extend_trace_label(trace_label, 'probs'), - column_labels=['alternative', 'probability']) + tracing.trace_df( + probs, + tracing.extend_trace_label(trace_label, "probs"), + column_labels=["alternative", "probability"], + ) if allow_zero_probs: - zero_probs = (probs.sum(axis=1) == 0) + zero_probs = probs.sum(axis=1) == 0 if zero_probs.any(): # FIXME this is kind of gnarly, but we force choice of first alt probs.loc[zero_probs, 0] = 1.0 @@ -231,14 +269,15 @@ def _interaction_sample_simulate( # make choices # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample - positions, rands = \ - logit.make_choices(probs, trace_label=trace_label, trace_choosers=choosers) + positions, rands = logit.make_choices( + probs, trace_label=trace_label, trace_choosers=choosers + ) - chunk.log_df(trace_label, 'positions', positions) - chunk.log_df(trace_label, 'rands', rands) + chunk.log_df(trace_label, "positions", positions) + chunk.log_df(trace_label, "rands", rands) del probs - chunk.log_df(trace_label, 'probs', None) + chunk.log_df(trace_label, "probs", None) # shouldn't have chosen any of the dummy pad utilities assert positions.max() < max_sample_count @@ -253,41 +292,59 @@ def _interaction_sample_simulate( # create a series with index from choosers and the index of the chosen alternative choices = pd.Series(choices, index=choosers.index) - chunk.log_df(trace_label, 'choices', choices) + chunk.log_df(trace_label, "choices", choices) if allow_zero_probs and zero_probs.any(): # FIXME this is kind of gnarly, patch choice for zero_probs choices.loc[zero_probs] = zero_prob_choice_val if have_trace_targets: - tracing.trace_df(choices, tracing.extend_trace_label(trace_label, 'choices'), - columns=[None, trace_choice_name]) - tracing.trace_df(rands, tracing.extend_trace_label(trace_label, 'rands'), - columns=[None, 'rand']) + tracing.trace_df( + choices, + tracing.extend_trace_label(trace_label, "choices"), + columns=[None, trace_choice_name], + ) + tracing.trace_df( + rands, + tracing.extend_trace_label(trace_label, "rands"), + columns=[None, "rand"], + ) if want_logsums: - tracing.trace_df(logsums, tracing.extend_trace_label(trace_label, 'logsum'), - columns=[None, 'logsum']) + tracing.trace_df( + logsums, + tracing.extend_trace_label(trace_label, "logsum"), + columns=[None, "logsum"], + ) if want_logsums: - choices = choices.to_frame('choice') - choices['logsum'] = logsums + choices = choices.to_frame("choice") + choices["logsum"] = logsums - chunk.log_df(trace_label, 'choices', choices) + chunk.log_df(trace_label, "choices", choices) # handing this off to our caller - chunk.log_df(trace_label, 'choices', None) + chunk.log_df(trace_label, "choices", None) return choices def interaction_sample_simulate( - choosers, alternatives, spec, choice_column, - allow_zero_probs=False, zero_prob_choice_val=None, - log_alt_losers=False, - want_logsums=False, - skims=None, locals_d=None, chunk_size=0, chunk_tag=None, - trace_label=None, trace_choice_name=None, - estimator=None): + choosers, + alternatives, + spec, + choice_column, + allow_zero_probs=False, + zero_prob_choice_val=None, + log_alt_losers=False, + want_logsums=False, + skims=None, + locals_d=None, + chunk_size=0, + chunk_tag=None, + trace_label=None, + trace_choice_name=None, + estimator=None, +): """ Run a simulation in the situation in which alternatives must @@ -344,24 +401,38 @@ def interaction_sample_simulate( """ - trace_label = tracing.extend_trace_label(trace_label, 'interaction_sample_simulate') + trace_label = tracing.extend_trace_label(trace_label, "interaction_sample_simulate") chunk_tag = chunk_tag or trace_label result_list = [] - for i, chooser_chunk, alternative_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers_and_alts(choosers, alternatives, chunk_size, trace_label, chunk_tag): + for ( + i, + chooser_chunk, + alternative_chunk, + chunk_trace_label, + ) in chunk.adaptive_chunked_choosers_and_alts( + choosers, alternatives, chunk_size, trace_label, chunk_tag + ): choices = _interaction_sample_simulate( - chooser_chunk, alternative_chunk, spec, choice_column, - allow_zero_probs, zero_prob_choice_val, log_alt_losers, + chooser_chunk, + alternative_chunk, + spec, + choice_column, + allow_zero_probs, + zero_prob_choice_val, + log_alt_losers, want_logsums, - skims, locals_d, - chunk_trace_label, trace_choice_name, - estimator) + skims, + locals_d, + chunk_trace_label, + trace_choice_name, + estimator, + ) result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py index 292c9ccaef..cc6453bb98 100644 --- a/activitysim/core/interaction_simulate.py +++ b/activitysim/core/interaction_simulate.py @@ -1,29 +1,24 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import zip - import logging +from builtins import zip +from collections import OrderedDict import numpy as np import pandas as pd -from collections import OrderedDict - -from . import logit -from . import tracing -from . import config -from . import simulate -from . import chunk -from . import simulate +from . import chunk, config, logit, simulate, tracing logger = logging.getLogger(__name__) DUMP = False -ALT_CHOOSER_ID = '_chooser_id' +ALT_CHOOSER_ID = "_chooser_id" -def eval_interaction_utilities(spec, df, locals_d, trace_label, trace_rows, estimator=None, log_alt_losers=False): +def eval_interaction_utilities( + spec, df, locals_d, trace_label, trace_rows, estimator=None, log_alt_losers=False +): """ Compute the utilities for a single-alternative spec evaluated in the context of df @@ -67,13 +62,13 @@ def eval_interaction_utilities(spec, df, locals_d, trace_label, trace_rows, esti with chunk.chunk_log(trace_label): - assert(len(spec.columns) == 1) + assert len(spec.columns) == 1 # avoid altering caller's passed-in locals_d parameter (they may be looping) locals_d = locals_d.copy() if locals_d is not None else {} # add df for startswith('@') eval expressions - locals_d['df'] = df + locals_d["df"] = df def to_series(x): if np.isscalar(x): @@ -90,14 +85,14 @@ def to_series(x): else: trace_eval_results = None - check_for_variability = config.setting('check_for_variability') + check_for_variability = config.setting("check_for_variability") # need to be able to identify which variables causes an error, which keeps # this from being expressed more parsimoniously - utilities = pd.DataFrame({'utility': 0.0}, index=df.index) + utilities = pd.DataFrame({"utility": 0.0}, index=df.index) - chunk.log_df(trace_label, 'eval.utilities', utilities) + chunk.log_df(trace_label, "eval.utilities", utilities) no_variability = has_missing_vals = 0 @@ -115,8 +110,10 @@ def to_series(x): # bug - location choice has df index_name zone_id but should be person_id???? if df.index.name is None: chooser_id = estimator.get_chooser_id() - assert chooser_id in df.columns, \ - "Expected to find choose_id column '%s' in interaction dataset" % (chooser_id, ) + assert chooser_id in df.columns, ( + "Expected to find choose_id column '%s' in interaction dataset" + % (chooser_id,) + ) assert df.index.name is None expression_values_df[chooser_id] = df[chooser_id] @@ -131,10 +128,10 @@ def to_series(x): try: # - allow temps of form _od_DIST@od_skim['DIST'] - if expr.startswith('_'): + if expr.startswith("_"): - target = expr[:expr.index('@')] - rhs = expr[expr.index('@') + 1:] + target = expr[: expr.index("@")] + rhs = expr[expr.index("@") + 1 :] v = to_series(eval(rhs, globals(), locals_d)) # update locals to allows us to ref previously assigned targets @@ -148,13 +145,16 @@ def to_series(x): # they have a non-zero dummy coefficient to avoid being removed from spec as NOPs continue - if expr.startswith('@'): + if expr.startswith("@"): v = to_series(eval(expr[1:], globals(), locals_d)) else: v = df.eval(expr) if check_for_variability and v.std() == 0: - logger.info("%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], expr)) + logger.info( + "%s: no variability (%s) in: %s" + % (trace_label, v.iloc[0], expr) + ) no_variability += 1 # FIXME - how likely is this to happen? Not sure it is really a problem? @@ -164,10 +164,13 @@ def to_series(x): if estimator: # in case we modified expression_values_df index - expression_values_df.insert(loc=len(expression_values_df.columns), column=label, - value=v.values if isinstance(v, pd.Series) else v) + expression_values_df.insert( + loc=len(expression_values_df.columns), + column=label, + value=v.values if isinstance(v, pd.Series) else v, + ) - utility = (v * coefficient).astype('float') + utility = (v * coefficient).astype("float") if log_alt_losers: @@ -176,9 +179,13 @@ def to_series(x): if (max_utils_by_chooser < simulate.ALT_LOSER_UTIL).any(): - losers = max_utils_by_chooser[max_utils_by_chooser < simulate.ALT_LOSER_UTIL] - logger.warning(f"{trace_label} - {len(losers)} choosers of {len(max_utils_by_chooser)} " - f"with prohibitive utilities for all alternatives for expression: {expr}") + losers = max_utils_by_chooser[ + max_utils_by_chooser < simulate.ALT_LOSER_UTIL + ] + logger.warning( + f"{trace_label} - {len(losers)} choosers of {len(max_utils_by_chooser)} " + f"with prohibitive utilities for all alternatives for expression: {expr}" + ) # loser_df = df[df[ALT_CHOOSER_ID].isin(losers.index)] # print(f"\nloser_df\n{loser_df}\n") @@ -197,50 +204,68 @@ def to_series(x): assert expr not in trace_eval_results trace_eval_results[expr] = v[trace_rows] - k = 'partial utility (coefficient = %s) for %s' % (coefficient, expr) + k = "partial utility (coefficient = %s) for %s" % ( + coefficient, + expr, + ) trace_eval_results[k] = v[trace_rows] * coefficient del v # chunk.log_df(trace_label, 'v', None) except Exception as err: - logger.exception(f"{trace_label} - {type(err).__name__} ({str(err)}) evaluating: {str(expr)}") + logger.exception( + f"{trace_label} - {type(err).__name__} ({str(err)}) evaluating: {str(expr)}" + ) raise err if estimator: - estimator.log("eval_interaction_utilities write_interaction_expression_values %s" % trace_label) + estimator.log( + "eval_interaction_utilities write_interaction_expression_values %s" + % trace_label + ) estimator.write_interaction_expression_values(expression_values_df) del expression_values_df if no_variability > 0: - logger.warning("%s: %s columns have no variability" % (trace_label, no_variability)) + logger.warning( + "%s: %s columns have no variability" % (trace_label, no_variability) + ) if has_missing_vals > 0: - logger.warning("%s: %s columns have missing values" % (trace_label, has_missing_vals)) + logger.warning( + "%s: %s columns have missing values" % (trace_label, has_missing_vals) + ) if trace_eval_results is not None: - trace_eval_results['total utility'] = utilities.utility[trace_rows] + trace_eval_results["total utility"] = utilities.utility[trace_rows] trace_eval_results = pd.DataFrame.from_dict(trace_eval_results) trace_eval_results.index = df[trace_rows].index # add df columns to trace_results trace_eval_results = pd.concat([df[trace_rows], trace_eval_results], axis=1) - chunk.log_df(trace_label, 'eval.trace_eval_results', trace_eval_results) + chunk.log_df(trace_label, "eval.trace_eval_results", trace_eval_results) - chunk.log_df(trace_label, 'v', None) - chunk.log_df(trace_label, 'eval.utilities', None) # out of out hands... - chunk.log_df(trace_label, 'eval.trace_eval_results', None) + chunk.log_df(trace_label, "v", None) + chunk.log_df(trace_label, "eval.utilities", None) # out of out hands... + chunk.log_df(trace_label, "eval.trace_eval_results", None) return utilities, trace_eval_results def _interaction_simulate( - choosers, alternatives, spec, - skims=None, locals_d=None, sample_size=None, - trace_label=None, trace_choice_name=None, - log_alt_losers=False, - estimator=None): + choosers, + alternatives, + spec, + skims=None, + locals_d=None, + sample_size=None, + trace_label=None, + trace_choice_name=None, + log_alt_losers=False, + estimator=None, +): """ Run a MNL simulation in the situation in which alternatives must be merged with choosers because there are interaction terms or @@ -289,22 +314,28 @@ def _interaction_simulate( choices are simulated in the standard Monte Carlo fashion """ - trace_label = tracing.extend_trace_label(trace_label, 'interaction_simulate') + trace_label = tracing.extend_trace_label(trace_label, "interaction_simulate") have_trace_targets = tracing.has_trace_targets(choosers) if have_trace_targets: - tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, 'choosers')) - tracing.trace_df(alternatives, tracing.extend_trace_label(trace_label, 'alternatives'), - slicer='NONE', transpose=False) + tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, "choosers")) + tracing.trace_df( + alternatives, + tracing.extend_trace_label(trace_label, "alternatives"), + slicer="NONE", + transpose=False, + ) if len(spec.columns) > 1: - raise RuntimeError('spec must have only one column') + raise RuntimeError("spec must have only one column") sample_size = sample_size or len(alternatives) if sample_size > len(alternatives): - logger.debug("clipping sample size %s to len(alternatives) %s" % - (sample_size, len(alternatives))) + logger.debug( + "clipping sample size %s to len(alternatives) %s" + % (sample_size, len(alternatives)) + ) sample_size = min(sample_size, len(alternatives)) # if using skims, copy index into the dataframe, so it will be @@ -319,9 +350,14 @@ def _interaction_simulate( alt_index_id = estimator.get_alt_id() if estimator else None chooser_index_id = ALT_CHOOSER_ID if log_alt_losers else None - interaction_df = logit.interaction_dataset(choosers, alternatives, sample_size, - alt_index_id=alt_index_id, chooser_index_id=chooser_index_id) - chunk.log_df(trace_label, 'interaction_df', interaction_df) + interaction_df = logit.interaction_dataset( + choosers, + alternatives, + sample_size, + alt_index_id=alt_index_id, + chooser_index_id=chooser_index_id, + ) + chunk.log_df(trace_label, "interaction_df", interaction_df) if skims is not None: simulate.set_skim_wrapper_targets(interaction_df, skims) @@ -332,67 +368,92 @@ def _interaction_simulate( # utilities has utility value for element in the cross product of choosers and alternatives # interaction_utilities is a df with one utility column and one row per row in model_design if have_trace_targets: - trace_rows, trace_ids \ - = tracing.interaction_trace_rows(interaction_df, choosers, sample_size) - - tracing.trace_df(interaction_df[trace_rows], - tracing.extend_trace_label(trace_label, 'interaction_df'), - slicer='NONE', transpose=False) + trace_rows, trace_ids = tracing.interaction_trace_rows( + interaction_df, choosers, sample_size + ) + + tracing.trace_df( + interaction_df[trace_rows], + tracing.extend_trace_label(trace_label, "interaction_df"), + slicer="NONE", + transpose=False, + ) else: trace_rows = trace_ids = None - interaction_utilities, trace_eval_results \ - = eval_interaction_utilities(spec, interaction_df, locals_d, trace_label, trace_rows, - estimator=estimator, - log_alt_losers=log_alt_losers) - chunk.log_df(trace_label, 'interaction_utilities', interaction_utilities) + interaction_utilities, trace_eval_results = eval_interaction_utilities( + spec, + interaction_df, + locals_d, + trace_label, + trace_rows, + estimator=estimator, + log_alt_losers=log_alt_losers, + ) + chunk.log_df(trace_label, "interaction_utilities", interaction_utilities) # print(f"interaction_df {interaction_df.shape}") # print(f"interaction_utilities {interaction_utilities.shape}") del interaction_df - chunk.log_df(trace_label, 'interaction_df', None) + chunk.log_df(trace_label, "interaction_df", None) if have_trace_targets: - tracing.trace_interaction_eval_results(trace_eval_results, trace_ids, - tracing.extend_trace_label(trace_label, 'eval')) - - tracing.trace_df(interaction_utilities[trace_rows], - tracing.extend_trace_label(trace_label, 'interaction_utils'), - slicer='NONE', transpose=False) + tracing.trace_interaction_eval_results( + trace_eval_results, + trace_ids, + tracing.extend_trace_label(trace_label, "eval"), + ) + + tracing.trace_df( + interaction_utilities[trace_rows], + tracing.extend_trace_label(trace_label, "interaction_utils"), + slicer="NONE", + transpose=False, + ) # reshape utilities (one utility column and one row per row in model_design) # to a dataframe with one row per chooser and one column per alternative utilities = pd.DataFrame( interaction_utilities.values.reshape(len(choosers), sample_size), - index=choosers.index) - chunk.log_df(trace_label, 'utilities', utilities) + index=choosers.index, + ) + chunk.log_df(trace_label, "utilities", utilities) if have_trace_targets: - tracing.trace_df(utilities, tracing.extend_trace_label(trace_label, 'utils'), - column_labels=['alternative', 'utility']) + tracing.trace_df( + utilities, + tracing.extend_trace_label(trace_label, "utils"), + column_labels=["alternative", "utility"], + ) - tracing.dump_df(DUMP, utilities, trace_label, 'utilities') + tracing.dump_df(DUMP, utilities, trace_label, "utilities") # convert to probabilities (utilities exponentiated and normalized to probs) # probs is same shape as utilities, one row per chooser and one column for alternative - probs = logit.utils_to_probs(utilities, trace_label=trace_label, trace_choosers=choosers) - chunk.log_df(trace_label, 'probs', probs) + probs = logit.utils_to_probs( + utilities, trace_label=trace_label, trace_choosers=choosers + ) + chunk.log_df(trace_label, "probs", probs) del utilities - chunk.log_df(trace_label, 'utilities', None) + chunk.log_df(trace_label, "utilities", None) if have_trace_targets: - tracing.trace_df(probs, tracing.extend_trace_label(trace_label, 'probs'), - column_labels=['alternative', 'probability']) + tracing.trace_df( + probs, + tracing.extend_trace_label(trace_label, "probs"), + column_labels=["alternative", "probability"], + ) # make choices # positions is series with the chosen alternative represented as a column index in probs # which is an integer between zero and num alternatives in the alternative sample - positions, rands = \ - logit.make_choices(probs, trace_label=trace_label, trace_choosers=choosers) - chunk.log_df(trace_label, 'positions', positions) - chunk.log_df(trace_label, 'rands', rands) + positions, rands = logit.make_choices( + probs, trace_label=trace_label, trace_choosers=choosers + ) + chunk.log_df(trace_label, "positions", positions) + chunk.log_df(trace_label, "rands", rands) # need to get from an integer offset into the alternative sample to the alternative index # that is, we want the index value of the row that is offset by rows into the @@ -404,23 +465,36 @@ def _interaction_simulate( # create a series with index from choosers and the index of the chosen alternative choices = pd.Series(choices, index=choosers.index) - chunk.log_df(trace_label, 'choices', choices) + chunk.log_df(trace_label, "choices", choices) if have_trace_targets: - tracing.trace_df(choices, tracing.extend_trace_label(trace_label, 'choices'), - columns=[None, trace_choice_name]) - tracing.trace_df(rands, tracing.extend_trace_label(trace_label, 'rands'), - columns=[None, 'rand']) + tracing.trace_df( + choices, + tracing.extend_trace_label(trace_label, "choices"), + columns=[None, trace_choice_name], + ) + tracing.trace_df( + rands, + tracing.extend_trace_label(trace_label, "rands"), + columns=[None, "rand"], + ) return choices def interaction_simulate( - choosers, alternatives, spec, - log_alt_losers=False, - skims=None, locals_d=None, sample_size=None, chunk_size=0, - trace_label=None, trace_choice_name=None, - estimator=None): + choosers, + alternatives, + spec, + log_alt_losers=False, + skims=None, + locals_d=None, + sample_size=None, + chunk_size=0, + trace_label=None, + trace_choice_name=None, + estimator=None, +): """ Run a simulation in the situation in which alternatives must @@ -471,26 +545,31 @@ def interaction_simulate( choices are simulated in the standard Monte Carlo fashion """ - trace_label = tracing.extend_trace_label(trace_label, 'interaction_simulate') + trace_label = tracing.extend_trace_label(trace_label, "interaction_simulate") assert len(choosers) > 0 result_list = [] - for i, chooser_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers(choosers, chunk_size, trace_label): - - choices = _interaction_simulate(chooser_chunk, alternatives, spec, - skims=skims, - locals_d=locals_d, - sample_size=sample_size, - trace_label=chunk_trace_label, - trace_choice_name=trace_choice_name, - log_alt_losers=log_alt_losers, - estimator=estimator) + for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + choosers, chunk_size, trace_label + ): + + choices = _interaction_simulate( + chooser_chunk, + alternatives, + spec, + skims=skims, + locals_d=locals_d, + sample_size=sample_size, + trace_label=chunk_trace_label, + trace_choice_name=trace_choice_name, + log_alt_losers=log_alt_losers, + estimator=estimator, + ) result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: diff --git a/activitysim/core/logit.py b/activitysim/core/logit.py index b3dace2e0b..9d831866a2 100644 --- a/activitysim/core/logit.py +++ b/activitysim/core/logit.py @@ -1,15 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import object - import logging +from builtins import object import numpy as np import pandas as pd -from . import tracing -from . import pipeline -from . import config +from . import config, pipeline, tracing logger = logging.getLogger(__name__) @@ -20,7 +17,9 @@ PROB_MAX = 1.0 -def report_bad_choices(bad_row_map, df, trace_label, msg, trace_choosers=None, raise_error=True): +def report_bad_choices( + bad_row_map, df, trace_label, msg, trace_choosers=None, raise_error=True +): """ Parameters @@ -42,7 +41,12 @@ def report_bad_choices(bad_row_map, df, trace_label, msg, trace_choosers=None, r MAX_DUMP = 1000 MAX_PRINT = 10 - msg_with_count = "%s %s for %s of %s rows" % (trace_label, msg, bad_row_map.sum(), len(df)) + msg_with_count = "%s %s for %s of %s rows" % ( + trace_label, + msg, + bad_row_map.sum(), + len(df), + ) logger.warning(msg_with_count) df = df[bad_row_map] @@ -54,15 +58,18 @@ def report_bad_choices(bad_row_map, df, trace_label, msg, trace_choosers=None, r if trace_label: logger.info("dumping %s" % trace_label) - tracing.write_csv(df[:MAX_DUMP], - file_name=trace_label, - transpose=False) + tracing.write_csv(df[:MAX_DUMP], file_name=trace_label, transpose=False) # log the indexes of the first MAX_DUMP offending rows for idx in df.index[:MAX_PRINT].values: - row_msg = "%s : %s in: %s = %s (hh_id = %s)" % \ - (trace_label, msg, df.index.name, idx, df[trace_col].loc[idx]) + row_msg = "%s : %s in: %s = %s (hh_id = %s)" % ( + trace_label, + msg, + df.index.name, + idx, + df[trace_col].loc[idx], + ) logger.warning(row_msg) @@ -99,7 +106,7 @@ def utils_to_logsums(utils, exponentiated=False, allow_zero_probs=False): utils_arr = np.where(utils_arr == EXP_UTIL_MIN, 0.0, utils_arr) - with np.errstate(divide='ignore' if allow_zero_probs else 'warn'): + with np.errstate(divide="ignore" if allow_zero_probs else "warn"): logsums = np.log(utils_arr.sum(axis=1)) logsums = pd.Series(logsums, index=utils.index) @@ -107,8 +114,13 @@ def utils_to_logsums(utils, exponentiated=False, allow_zero_probs=False): return logsums -def utils_to_probs(utils, trace_label=None, exponentiated=False, allow_zero_probs=False, - trace_choosers=None): +def utils_to_probs( + utils, + trace_label=None, + exponentiated=False, + allow_zero_probs=False, + trace_choosers=None, +): """ Convert a table of utilities to probabilities. @@ -139,7 +151,7 @@ def utils_to_probs(utils, trace_label=None, exponentiated=False, allow_zero_prob Will have the same index and columns as `utils`. """ - trace_label = tracing.extend_trace_label(trace_label, 'utils_to_probs') + trace_label = tracing.extend_trace_label(trace_label, "utils_to_probs") # fixme - conversion to float not needed in either case? # utils_arr = utils.values.astype('float') @@ -154,24 +166,32 @@ def utils_to_probs(utils, trace_label=None, exponentiated=False, allow_zero_prob arr_sum = utils_arr.sum(axis=1) - zero_probs = (arr_sum == 0.0) + zero_probs = arr_sum == 0.0 if zero_probs.any() and not allow_zero_probs: - report_bad_choices(zero_probs, utils, - trace_label=tracing.extend_trace_label(trace_label, 'zero_prob_utils'), - msg="all probabilities are zero", - trace_choosers=trace_choosers) + report_bad_choices( + zero_probs, + utils, + trace_label=tracing.extend_trace_label(trace_label, "zero_prob_utils"), + msg="all probabilities are zero", + trace_choosers=trace_choosers, + ) inf_utils = np.isinf(arr_sum) if inf_utils.any(): - report_bad_choices(inf_utils, utils, - trace_label=tracing.extend_trace_label(trace_label, 'inf_exp_utils'), - msg="infinite exponentiated utilities", - trace_choosers=trace_choosers) + report_bad_choices( + inf_utils, + utils, + trace_label=tracing.extend_trace_label(trace_label, "inf_exp_utils"), + msg="infinite exponentiated utilities", + trace_choosers=trace_choosers, + ) # if allow_zero_probs, this may cause a RuntimeWarning: invalid value encountered in divide - with np.errstate(invalid='ignore' if allow_zero_probs else 'warn', - divide='ignore' if allow_zero_probs else 'warn'): + with np.errstate( + invalid="ignore" if allow_zero_probs else "warn", + divide="ignore" if allow_zero_probs else "warn", + ): np.divide(utils_arr, arr_sum.reshape(len(utils_arr), 1), out=utils_arr) # if allow_zero_probs, this will cause EXP_UTIL_MIN util rows to have all zero probabilities @@ -210,21 +230,24 @@ def make_choices(probs, trace_label=None, trace_choosers=None, allow_bad_probs=F The random numbers used to make the choices (for debugging, tracing) """ - trace_label = tracing.extend_trace_label(trace_label, 'make_choices') + trace_label = tracing.extend_trace_label(trace_label, "make_choices") # probs should sum to 1 across each row BAD_PROB_THRESHOLD = 0.001 - bad_probs = \ - probs.sum(axis=1).sub(np.ones(len(probs.index))).abs() \ - > BAD_PROB_THRESHOLD * np.ones(len(probs.index)) + bad_probs = probs.sum(axis=1).sub( + np.ones(len(probs.index)) + ).abs() > BAD_PROB_THRESHOLD * np.ones(len(probs.index)) if bad_probs.any() and not allow_bad_probs: - report_bad_choices(bad_probs, probs, - trace_label=tracing.extend_trace_label(trace_label, 'bad_probs'), - msg="probabilities do not add up to 1", - trace_choosers=trace_choosers) + report_bad_choices( + bad_probs, + probs, + trace_label=tracing.extend_trace_label(trace_label, "bad_probs"), + msg="probabilities do not add up to 1", + trace_choosers=trace_choosers, + ) rands = pipeline.get_rn_generator().random_for_df(probs) @@ -241,7 +264,9 @@ def make_choices(probs, trace_label=None, trace_choosers=None, allow_bad_probs=F return choices, rands -def interaction_dataset(choosers, alternatives, sample_size=None, alt_index_id=None, chooser_index_id=None): +def interaction_dataset( + choosers, alternatives, sample_size=None, alt_index_id=None, chooser_index_id=None +): """ Combine choosers and alternatives into one table for the purposes of creating interaction variables and/or sampling alternatives. @@ -265,12 +290,12 @@ def interaction_dataset(choosers, alternatives, sample_size=None, alt_index_id=N """ if not choosers.index.is_unique: raise RuntimeError( - "ERROR: choosers index is not unique, " - "sample will not work correctly") + "ERROR: choosers index is not unique, " "sample will not work correctly" + ) if not alternatives.index.is_unique: raise RuntimeError( - "ERROR: alternatives index is not unique, " - "sample will not work correctly") + "ERROR: alternatives index is not unique, " "sample will not work correctly" + ) numchoosers = len(choosers) numalts = len(alternatives) @@ -280,8 +305,9 @@ def interaction_dataset(choosers, alternatives, sample_size=None, alt_index_id=N alts_idx = np.arange(numalts) if sample_size < numalts: - sample = pipeline.get_rn_generator().choice_for_df(choosers, - alts_idx, sample_size, replace=False) + sample = pipeline.get_rn_generator().choice_for_df( + choosers, alts_idx, sample_size, replace=False + ) else: sample = np.tile(alts_idx, numchoosers) @@ -292,13 +318,15 @@ def interaction_dataset(choosers, alternatives, sample_size=None, alt_index_id=N # permits identification of alternative row in the joined dataset alts_sample[alt_index_id] = alts_sample.index - logger.debug("interaction_dataset pre-merge choosers %s alternatives %s alts_sample %s" % - (choosers.shape, alternatives.shape, alts_sample.shape)) + logger.debug( + "interaction_dataset pre-merge choosers %s alternatives %s alts_sample %s" + % (choosers.shape, alternatives.shape, alts_sample.shape) + ) # no need to do an expensive merge of alts and choosers # we can simply assign repeated chooser values for c in choosers.columns: - c_chooser = (c + '_chooser') if c in alts_sample.columns else c + c_chooser = (c + "_chooser") if c in alts_sample.columns else c alts_sample[c_chooser] = np.repeat(choosers[c].values, sample_size) # caller may want this to detect utils that make all alts for a chooser unavailable (e.g. -999) @@ -306,7 +334,7 @@ def interaction_dataset(choosers, alternatives, sample_size=None, alt_index_id=N assert chooser_index_id not in alts_sample alts_sample[chooser_index_id] = np.repeat(choosers.index.values, sample_size) - logger.debug("interaction_dataset merged alts_sample %s" % (alts_sample.shape, )) + logger.debug("interaction_dataset merged alts_sample %s" % (alts_sample.shape,)) return alts_sample @@ -332,20 +360,28 @@ def __init__(self, name=None, level=0): self.coefficient = 0 def print(self): - print("Nest name: %s level: %s coefficient: %s product_of_coefficients: %s ancestors: %s" % - (self.name, self.level, self.coefficient, self.product_of_coefficients, self.ancestors)) + print( + "Nest name: %s level: %s coefficient: %s product_of_coefficients: %s ancestors: %s" + % ( + self.name, + self.level, + self.coefficient, + self.product_of_coefficients, + self.ancestors, + ) + ) @property def is_leaf(self): - return (self.alternatives is None) + return self.alternatives is None @property def type(self): - return 'leaf' if self.is_leaf else 'node' + return "leaf" if self.is_leaf else "node" @classmethod def nest_types(cls): - return ['leaf', 'node'] + return ["leaf", "node"] def validate_nest_spec(nest_spec, trace_label): @@ -354,14 +390,20 @@ def validate_nest_spec(nest_spec, trace_label): duplicates = [] for nest in each_nest(nest_spec): if nest.name in keys: - logger.error("validate_nest_spec:duplicate nest key '%s' in nest spec - %s" % (nest.name, trace_label)) + logger.error( + "validate_nest_spec:duplicate nest key '%s' in nest spec - %s" + % (nest.name, trace_label) + ) duplicates.append(nest.name) keys.append(nest.name) # nest.print() if duplicates: - raise RuntimeError("validate_nest_spec:duplicate nest key/s '%s' in nest spec - %s" % (duplicates, trace_label)) + raise RuntimeError( + "validate_nest_spec:duplicate nest key/s '%s' in nest spec - %s" + % (duplicates, trace_label) + ) def _each_nest(spec, parent_nest, post_order): @@ -392,11 +434,14 @@ def _each_nest(spec, parent_nest, post_order): level = parent_nest.level + 1 if isinstance(spec, dict): - name = spec['name'] - coefficient = spec['coefficient'] - assert isinstance(coefficient, (int, float)), \ - "Coefficient '%s' (%s) not a number" % (name, coefficient) # forgot to eval coefficient? - alternatives = [a['name'] if isinstance(a, dict) else a for a in spec['alternatives']] + name = spec["name"] + coefficient = spec["coefficient"] + assert isinstance(coefficient, (int, float)), ( + "Coefficient '%s' (%s) not a number" % (name, coefficient) + ) # forgot to eval coefficient? + alternatives = [ + a["name"] if isinstance(a, dict) else a for a in spec["alternatives"] + ] nest = Nest(name=name) nest.level = parent_nest.level + 1 @@ -409,7 +454,7 @@ def _each_nest(spec, parent_nest, post_order): yield spec, nest # recursively iterate the list of alternatives - for alternative in spec['alternatives']: + for alternative in spec["alternatives"]: for sub_node, sub_nest in _each_nest(alternative, nest, post_order): yield sub_node, sub_nest @@ -464,7 +509,11 @@ def count_nests(nest_spec): def count_each_nest(spec, count): if isinstance(spec, dict): - return count + 1 + sum([count_each_nest(alt, count) for alt in spec['alternatives']]) + return ( + count + + 1 + + sum([count_each_nest(alt, count) for alt in spec["alternatives"]]) + ) else: assert isinstance(spec, str) return 1 diff --git a/activitysim/core/los.py b/activitysim/core/los.py index 125317c7a6..ba5bf4712b 100644 --- a/activitysim/core/los.py +++ b/activitysim/core/los.py @@ -1,42 +1,42 @@ # ActivitySim # See full license in LICENSE.txt. -import os import logging +import os import warnings import numpy as np import pandas as pd -from activitysim.core import skim_dictionary -from activitysim.core import inject -from activitysim.core import util -from activitysim.core import config -from activitysim.core import pathbuilder -from activitysim.core import mem -from activitysim.core import tracing - +from activitysim.core import ( + config, + inject, + mem, + pathbuilder, + skim_dictionary, + tracing, + util, +) +from activitysim.core.skim_dict_factory import MemMapSkimFactory, NumpyArraySkimFactory from activitysim.core.skim_dictionary import NOT_IN_SKIM_ZONE_ID -from activitysim.core.skim_dict_factory import NumpyArraySkimFactory -from activitysim.core.skim_dict_factory import MemMapSkimFactory skim_factories = { - 'NumpyArraySkimFactory': NumpyArraySkimFactory, - 'MemMapSkimFactory': MemMapSkimFactory, + "NumpyArraySkimFactory": NumpyArraySkimFactory, + "MemMapSkimFactory": MemMapSkimFactory, } logger = logging.getLogger(__name__) -LOS_SETTINGS_FILE_NAME = 'network_los.yaml' +LOS_SETTINGS_FILE_NAME = "network_los.yaml" ONE_ZONE = 1 TWO_ZONE = 2 THREE_ZONE = 3 DEFAULT_SETTINGS = { - 'rebuild_tvpb_cache': True, - 'zone_system': ONE_ZONE, - 'skim_dict_factory': 'NumpyArraySkimFactory' + "rebuild_tvpb_cache": True, + "zone_system": ONE_ZONE, + "skim_dict_factory": "NumpyArraySkimFactory", } TRACE_TRIMMED_MAZ_TO_TAP_TABLES = True @@ -80,7 +80,7 @@ def __init__(self, los_settings_file_name=LOS_SETTINGS_FILE_NAME): # Note: we require all skims to be of same dtype so they can share buffer - is that ok? # fixme is it ok to require skims be all the same type? if so, is this the right choice? - self.skim_dtype_name = 'float32' + self.skim_dtype_name = "float32" self.zone_system = None self.skim_time_periods = None self.skims_info = {} @@ -101,11 +101,16 @@ def __init__(self, los_settings_file_name=LOS_SETTINGS_FILE_NAME): self.load_settings() # dependency injection of skim factory (of type specified in skim_dict_factory setting) - skim_dict_factory_name = self.setting('skim_dict_factory') - assert skim_dict_factory_name in skim_factories, \ - f"Unrecognized skim_dict_factory setting '{skim_dict_factory_name}" - self.skim_dict_factory = skim_factories[skim_dict_factory_name](network_los=self) - logger.info(f"Network_LOS using skim_dict_factory: {type(self.skim_dict_factory).__name__}") + skim_dict_factory_name = self.setting("skim_dict_factory") + assert ( + skim_dict_factory_name in skim_factories + ), f"Unrecognized skim_dict_factory setting '{skim_dict_factory_name}" + self.skim_dict_factory = skim_factories[skim_dict_factory_name]( + network_los=self + ) + logger.info( + f"Network_LOS using skim_dict_factory: {type(self.skim_dict_factory).__name__}" + ) # load SkimInfo for all skims for this zone_system (TAZ for ONE_ZONE and TWO_ZONE, TAZ and MAZ for THREE_ZONE) self.load_skim_info() @@ -113,23 +118,33 @@ def __init__(self, los_settings_file_name=LOS_SETTINGS_FILE_NAME): @property def rebuild_tvpb_cache(self): # setting as property here so others don't need to know default - assert self.zone_system == THREE_ZONE, f"Should not even be asking about rebuild_tvpb_cache if not THREE_ZONE" - return self.setting('rebuild_tvpb_cache') + assert ( + self.zone_system == THREE_ZONE + ), f"Should not even be asking about rebuild_tvpb_cache if not THREE_ZONE" + return self.setting("rebuild_tvpb_cache") - def setting(self, keys, default=''): + def setting(self, keys, default=""): # if they dont specify a default, check the default defaults - default = DEFAULT_SETTINGS.get(keys, '') if default == '' else default + default = ( + DEFAULT_SETTINGS.get(keys, "") + if default == "" + else default + ) # get setting value for single key or dot-delimited key path (e.g. 'maz_to_maz.tables') - key_list = keys.split('.') + key_list = keys.split(".") s = self.los_settings for key in key_list[:-1]: s = s.get(key) - assert isinstance(s, dict), f"expected key '{key}' not found in '{keys}' in {self.los_settings_file_name}" + assert isinstance( + s, dict + ), f"expected key '{key}' not found in '{keys}' in {self.los_settings_file_name}" key = key_list[-1] # last key - if default == '': - assert key in s, f"Expected setting {keys} not found in in {LOS_SETTINGS_FILE_NAME}" + if default == "": + assert ( + key in s + ), f"Expected setting {keys} not found in in {LOS_SETTINGS_FILE_NAME}" return s.get(key, default) def load_settings(self): @@ -138,60 +153,87 @@ def load_settings(self): """ try: - self.los_settings = config.read_settings_file(self.los_settings_file_name, mandatory=True) + self.los_settings = config.read_settings_file( + self.los_settings_file_name, mandatory=True + ) except config.SettingsFileNotFound as e: - print(f"los_settings_file_name {self.los_settings_file_name} not found - trying global settings") + print( + f"los_settings_file_name {self.los_settings_file_name} not found - trying global settings" + ) print(f"skims_file: {config.setting('skims_file')}") print(f"skim_time_periods: {config.setting('skim_time_periods')}") print(f"source_file_paths: {config.setting('source_file_paths')}") - print(f"inject.get_injectable('configs_dir') {inject.get_injectable('configs_dir')}") + print( + f"inject.get_injectable('configs_dir') {inject.get_injectable('configs_dir')}" + ) # look for legacy 'skims_file' setting in global settings file - if config.setting('skims_file'): + if config.setting("skims_file"): - warnings.warn("Support for 'skims_file' setting in global settings file will be removed." - "Use 'taz_skims' in network_los.yaml config file instead.", FutureWarning) + warnings.warn( + "Support for 'skims_file' setting in global settings file will be removed." + "Use 'taz_skims' in network_los.yaml config file instead.", + FutureWarning, + ) # in which case, we also expect to find skim_time_periods in settings file - skim_time_periods = config.setting('skim_time_periods') - assert skim_time_periods is not None, "'skim_time_periods' setting not found." - warnings.warn("Support for 'skim_time_periods' setting in global settings file will be removed." - "Put 'skim_time_periods' in network_los.yaml config file instead.", FutureWarning) + skim_time_periods = config.setting("skim_time_periods") + assert ( + skim_time_periods is not None + ), "'skim_time_periods' setting not found." + warnings.warn( + "Support for 'skim_time_periods' setting in global settings file will be removed." + "Put 'skim_time_periods' in network_los.yaml config file instead.", + FutureWarning, + ) self.los_settings = { - 'taz_skims': config.setting('skims_file'), - 'zone_system': ONE_ZONE, - 'skim_time_periods': skim_time_periods + "taz_skims": config.setting("skims_file"), + "zone_system": ONE_ZONE, + "skim_time_periods": skim_time_periods, } else: raise e # validate skim_time_periods - self.skim_time_periods = self.setting('skim_time_periods') - if 'hours' in self.skim_time_periods: - self.skim_time_periods['periods'] = self.skim_time_periods.pop('hours') - warnings.warn('support for `skim_time_periods` key `hours` will be removed in ' - 'future verions. Use `periods` instead', - FutureWarning) - assert 'periods' in self.skim_time_periods, "'periods' key not found in network_los.skim_time_periods" - assert 'labels' in self.skim_time_periods, "'labels' key not found in network_los.skim_time_periods" - - self.zone_system = self.setting('zone_system') - assert self.zone_system in [ONE_ZONE, TWO_ZONE, THREE_ZONE], \ - f"Network_LOS: unrecognized zone_system: {self.zone_system}" + self.skim_time_periods = self.setting("skim_time_periods") + if "hours" in self.skim_time_periods: + self.skim_time_periods["periods"] = self.skim_time_periods.pop("hours") + warnings.warn( + "support for `skim_time_periods` key `hours` will be removed in " + "future verions. Use `periods` instead", + FutureWarning, + ) + assert ( + "periods" in self.skim_time_periods + ), "'periods' key not found in network_los.skim_time_periods" + assert ( + "labels" in self.skim_time_periods + ), "'labels' key not found in network_los.skim_time_periods" + + self.zone_system = self.setting("zone_system") + assert self.zone_system in [ + ONE_ZONE, + TWO_ZONE, + THREE_ZONE, + ], f"Network_LOS: unrecognized zone_system: {self.zone_system}" if self.zone_system in [TWO_ZONE, THREE_ZONE]: # maz_to_maz_settings - self.max_blend_distance = self.setting('maz_to_maz.max_blend_distance', default={}) + self.max_blend_distance = self.setting( + "maz_to_maz.max_blend_distance", default={} + ) if isinstance(self.max_blend_distance, int): - self.max_blend_distance = {'DEFAULT': self.max_blend_distance} - self.blend_distance_skim_name = self.setting('maz_to_maz.blend_distance_skim_name', default=None) + self.max_blend_distance = {"DEFAULT": self.max_blend_distance} + self.blend_distance_skim_name = self.setting( + "maz_to_maz.blend_distance_skim_name", default=None + ) # validate skim_time_periods - self.skim_time_periods = self.setting('skim_time_periods') - assert {'periods', 'labels'}.issubset(set(self.skim_time_periods.keys())) + self.skim_time_periods = self.setting("skim_time_periods") + assert {"periods", "labels"}.issubset(set(self.skim_time_periods.keys())) def load_skim_info(self): """ @@ -202,16 +244,20 @@ def load_skim_info(self): """ assert self.skim_dict_factory is not None # load taz skim_info - self.skims_info['taz'] = self.skim_dict_factory.load_skim_info('taz') + self.skims_info["taz"] = self.skim_dict_factory.load_skim_info("taz") if self.zone_system == THREE_ZONE: # load tap skim_info - self.skims_info['tap'] = self.skim_dict_factory.load_skim_info('tap') + self.skims_info["tap"] = self.skim_dict_factory.load_skim_info("tap") if self.zone_system == THREE_ZONE: # load this here rather than in load_data as it is required during multiprocessing to size TVPBCache - self.tap_df = pd.read_csv(config.data_file_path(self.setting('tap'), mandatory=True)).sort_values('TAP') - self.tvpb = pathbuilder.TransitVirtualPathBuilder(self) # dependent on self.tap_df + self.tap_df = pd.read_csv( + config.data_file_path(self.setting("tap"), mandatory=True) + ).sort_values("TAP") + self.tvpb = pathbuilder.TransitVirtualPathBuilder( + self + ) # dependent on self.tap_df def load_data(self): """ @@ -222,26 +268,36 @@ def load_data(self): if self.zone_system in [TWO_ZONE, THREE_ZONE]: # maz - file_name = self.setting('maz') - self.maz_taz_df = pd.read_csv(config.data_file_path(file_name, mandatory=True)) - self.maz_taz_df = self.maz_taz_df[['MAZ', 'TAZ']].sort_values(by='MAZ') # only fields we need + file_name = self.setting("maz") + self.maz_taz_df = pd.read_csv( + config.data_file_path(file_name, mandatory=True) + ) + self.maz_taz_df = self.maz_taz_df[["MAZ", "TAZ"]].sort_values( + by="MAZ" + ) # only fields we need self.maz_ceiling = self.maz_taz_df.MAZ.max() + 1 # maz_to_maz_df - maz_to_maz_tables = self.setting('maz_to_maz.tables') - maz_to_maz_tables = [maz_to_maz_tables] if isinstance(maz_to_maz_tables, str) else maz_to_maz_tables + maz_to_maz_tables = self.setting("maz_to_maz.tables") + maz_to_maz_tables = ( + [maz_to_maz_tables] + if isinstance(maz_to_maz_tables, str) + else maz_to_maz_tables + ) for file_name in maz_to_maz_tables: df = pd.read_csv(config.data_file_path(file_name, mandatory=True)) - df['i'] = df.OMAZ * self.maz_ceiling + df.DMAZ - df.set_index('i', drop=True, inplace=True, verify_integrity=True) - logger.debug(f"loading maz_to_maz table {file_name} with {len(df)} rows") + df["i"] = df.OMAZ * self.maz_ceiling + df.DMAZ + df.set_index("i", drop=True, inplace=True, verify_integrity=True) + logger.debug( + f"loading maz_to_maz table {file_name} with {len(df)} rows" + ) # FIXME - don't really need these columns, but if we do want them, # we would need to merge them in since files may have different numbers of rows - df.drop(columns=['OMAZ', 'DMAZ'], inplace=True) + df.drop(columns=["OMAZ", "DMAZ"], inplace=True) # besides, we only want data columns so we can coerce to same type as skims df = df.astype(np.dtype(self.skim_dtype_name)) @@ -260,24 +316,27 @@ def load_data(self): assert self.tap_df is not None # maz_to_tap_dfs - different sized sparse arrays with different columns, so we keep them seperate - for mode, maz_to_tap_settings in self.setting('maz_to_tap').items(): + for mode, maz_to_tap_settings in self.setting("maz_to_tap").items(): - assert 'table' in maz_to_tap_settings, \ - f"Expected setting maz_to_tap.{mode}.table not found in in {LOS_SETTINGS_FILE_NAME}" + assert ( + "table" in maz_to_tap_settings + ), f"Expected setting maz_to_tap.{mode}.table not found in in {LOS_SETTINGS_FILE_NAME}" - file_name = maz_to_tap_settings['table'] + file_name = maz_to_tap_settings["table"] df = pd.read_csv(config.data_file_path(file_name, mandatory=True)) # trim tap set # if provided, use tap_line_distance_col together with tap_lines table to trim the near tap set # to only include the nearest tap to origin when more than one tap serves the same line - distance_col = maz_to_tap_settings.get('tap_line_distance_col') + distance_col = maz_to_tap_settings.get("tap_line_distance_col") if distance_col: if self.tap_lines_df is None: # load tap_lines on demand (required if they specify tap_line_distance_col) - tap_lines_file_name = self.setting('tap_lines', ) - self.tap_lines_df = pd.read_csv(config.data_file_path(tap_lines_file_name, mandatory=True)) + tap_lines_file_name = self.setting("tap_lines",) + self.tap_lines_df = pd.read_csv( + config.data_file_path(tap_lines_file_name, mandatory=True) + ) # csv file has one row per TAP with space-delimited list of lines served by that TAP # TAP LINES @@ -287,48 +346,74 @@ def load_data(self): # 6020 GG_024b_SB # 6020 GG_068_RT # 6020 GG_228_WB - self.tap_lines_df = \ - self.tap_lines_df.set_index('TAP').LINES.str.split(expand=True)\ - .stack().droplevel(1).to_frame('line') + self.tap_lines_df = ( + self.tap_lines_df.set_index("TAP") + .LINES.str.split(expand=True) + .stack() + .droplevel(1) + .to_frame("line") + ) old_len = len(df) # NOTE - merge will remove unused taps (not appearing in tap_lines) - df = pd.merge(df, self.tap_lines_df, left_on='TAP', right_index=True) + df = pd.merge( + df, self.tap_lines_df, left_on="TAP", right_index=True + ) # find nearest TAP to MAz that serves line - df = df.sort_values(by=distance_col).drop_duplicates(subset=['MAZ', 'line']) + df = df.sort_values(by=distance_col).drop_duplicates( + subset=["MAZ", "line"] + ) # we don't need to remember which lines are served by which TAPs - df = df.drop(columns='line').drop_duplicates(subset=['MAZ', 'TAP']).sort_values(['MAZ', 'TAP']) - - logger.debug(f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows " - f"based on tap_lines") - logger.debug(f"maz_to_tap table {file_name} max {distance_col} {df[distance_col].max()}") - - max_dist = maz_to_tap_settings.get('max_dist', None) + df = ( + df.drop(columns="line") + .drop_duplicates(subset=["MAZ", "TAP"]) + .sort_values(["MAZ", "TAP"]) + ) + + logger.debug( + f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows " + f"based on tap_lines" + ) + logger.debug( + f"maz_to_tap table {file_name} max {distance_col} {df[distance_col].max()}" + ) + + max_dist = maz_to_tap_settings.get("max_dist", None) if max_dist: old_len = len(df) df = df[df[distance_col] <= max_dist] - logger.debug(f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows " - f"based on max_dist {max_dist}") + logger.debug( + f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows " + f"based on max_dist {max_dist}" + ) if TRACE_TRIMMED_MAZ_TO_TAP_TABLES: - tracing.write_csv(df, file_name=f"trimmed_{maz_to_tap_settings['table']}", transpose=False) + tracing.write_csv( + df, + file_name=f"trimmed_{maz_to_tap_settings['table']}", + transpose=False, + ) else: - logger.warning(f"tap_line_distance_col not provided in {LOS_SETTINGS_FILE_NAME} so maz_to_tap " - f"pairs will not be trimmed which may result in high memory use and long runtimes") - - df.set_index(['MAZ', 'TAP'], drop=True, inplace=True, verify_integrity=True) + logger.warning( + f"tap_line_distance_col not provided in {LOS_SETTINGS_FILE_NAME} so maz_to_tap " + f"pairs will not be trimmed which may result in high memory use and long runtimes" + ) + + df.set_index( + ["MAZ", "TAP"], drop=True, inplace=True, verify_integrity=True + ) logger.debug(f"loaded maz_to_tap table {file_name} with {len(df)} rows") assert mode not in self.maz_to_tap_dfs self.maz_to_tap_dfs[mode] = df # create taz skim dict - assert 'taz' not in self.skim_dicts - self.skim_dicts['taz'] = self.create_skim_dict('taz') + assert "taz" not in self.skim_dicts + self.skim_dicts["taz"] = self.create_skim_dict("taz") # make sure skim has all taz_ids # FIXME - weird that there is no list of tazs? @@ -336,20 +421,26 @@ def load_data(self): if self.zone_system in [TWO_ZONE, THREE_ZONE]: # create MazSkimDict facade skim_dict # (must have already loaded dependencies: taz skim_dict, maz_to_maz_df, and maz_taz_df) - assert 'maz' not in self.skim_dicts - maz_skim_dict = self.create_skim_dict('maz') - self.skim_dicts['maz'] = maz_skim_dict + assert "maz" not in self.skim_dicts + maz_skim_dict = self.create_skim_dict("maz") + self.skim_dicts["maz"] = maz_skim_dict # make sure skim has all maz_ids - assert not (maz_skim_dict.offset_mapper.map(self.maz_taz_df['MAZ'].values) == NOT_IN_SKIM_ZONE_ID).any() + assert not ( + maz_skim_dict.offset_mapper.map(self.maz_taz_df["MAZ"].values) + == NOT_IN_SKIM_ZONE_ID + ).any() # create tap skim dict if self.zone_system == THREE_ZONE: - assert 'tap' not in self.skim_dicts - tap_skim_dict = self.create_skim_dict('tap') - self.skim_dicts['tap'] = tap_skim_dict + assert "tap" not in self.skim_dicts + tap_skim_dict = self.create_skim_dict("tap") + self.skim_dicts["tap"] = tap_skim_dict # make sure skim has all tap_ids - assert not (tap_skim_dict.offset_mapper.map(self.tap_df['TAP'].values) == NOT_IN_SKIM_ZONE_ID).any() + assert not ( + tap_skim_dict.offset_mapper.map(self.tap_df["TAP"].values) + == NOT_IN_SKIM_ZONE_ID + ).any() def create_skim_dict(self, skim_tag): """ @@ -363,16 +454,19 @@ def create_skim_dict(self, skim_tag): ------- SkimDict or subclass (e.g. MazSkimDict) """ - assert skim_tag not in self.skim_dicts # avoid inadvertently creating multiple copies + assert ( + skim_tag not in self.skim_dicts + ) # avoid inadvertently creating multiple copies - if skim_tag == 'maz': + if skim_tag == "maz": # MazSkimDict gets a reference to self here, because it has dependencies on self.load_data # (e.g. maz_to_maz_df, maz_taz_df...) We pass in taz_skim_dict as a parameter # to hilight the fact that we do not want two copies of its (very large) data array in memory - assert 'taz' in self.skim_dicts, \ - f"create_skim_dict 'maz': backing taz skim_dict not in skim_dicts" - taz_skim_dict = self.skim_dicts['taz'] - skim_dict = skim_dictionary.MazSkimDict('maz', self, taz_skim_dict) + assert ( + "taz" in self.skim_dicts + ), f"create_skim_dict 'maz': backing taz skim_dict not in skim_dicts" + taz_skim_dict = self.skim_dicts["taz"] + skim_dict = skim_dictionary.MazSkimDict("maz", self, taz_skim_dict) else: skim_info = self.skims_info[skim_tag] skim_data = self.skim_dict_factory.get_skim_data(skim_tag, skim_info) @@ -394,7 +488,7 @@ def omx_file_names(self, skim_tag): ------- list of str """ - file_names = self.setting(f'{skim_tag}_skims') + file_names = self.setting(f"{skim_tag}_skims") file_names = [file_names] if isinstance(file_names, str) else file_names return file_names @@ -406,7 +500,7 @@ def multiprocess(self): ------- bool """ - is_multiprocess = config.setting('multiprocess', False) + is_multiprocess = config.setting("multiprocess", False) return is_multiprocess def load_shared_data(self, shared_data_buffers): @@ -424,18 +518,24 @@ def load_shared_data(self, shared_data_buffers): if self.skim_dict_factory.supports_shared_data_for_multiprocessing: for skim_tag in self.skims_info.keys(): - assert skim_tag in shared_data_buffers, f"load_shared_data expected allocated shared_data_buffers" - self.skim_dict_factory.load_skims_to_buffer(self.skims_info[skim_tag], shared_data_buffers[skim_tag]) + assert ( + skim_tag in shared_data_buffers + ), f"load_shared_data expected allocated shared_data_buffers" + self.skim_dict_factory.load_skims_to_buffer( + self.skims_info[skim_tag], shared_data_buffers[skim_tag] + ) if self.zone_system == THREE_ZONE: assert self.tvpb is not None - if self.rebuild_tvpb_cache and not config.setting('resume_after', None): + if self.rebuild_tvpb_cache and not config.setting("resume_after", None): # delete old cache at start of new run so that stale cache is not loaded by load_data_to_buffer # when singleprocess, this call is made (later in program flow) in the initialize_los step self.tvpb.tap_cache.cleanup() - self.tvpb.tap_cache.load_data_to_buffer(shared_data_buffers[self.tvpb.tap_cache.cache_tag]) + self.tvpb.tap_cache.load_data_to_buffer( + shared_data_buffers[self.tvpb.tap_cache.cache_tag] + ) def allocate_shared_skim_buffers(self): """ @@ -453,19 +553,23 @@ def allocate_shared_skim_buffers(self): """ assert self.multiprocess() - assert not self.skim_dicts, f"allocate_shared_skim_buffers must be called BEFORE, not after, load_data" + assert ( + not self.skim_dicts + ), f"allocate_shared_skim_buffers must be called BEFORE, not after, load_data" skim_buffers = {} if self.skim_dict_factory.supports_shared_data_for_multiprocessing: for skim_tag in self.skims_info.keys(): - skim_buffers[skim_tag] = \ - self.skim_dict_factory.allocate_skim_buffer(self.skims_info[skim_tag], shared=True) + skim_buffers[skim_tag] = self.skim_dict_factory.allocate_skim_buffer( + self.skims_info[skim_tag], shared=True + ) if self.zone_system == THREE_ZONE: assert self.tvpb is not None - skim_buffers[self.tvpb.tap_cache.cache_tag] = \ - self.tvpb.tap_cache.allocate_data_buffer(shared=True) + skim_buffers[ + self.tvpb.tap_cache.cache_tag + ] = self.tvpb.tap_cache.allocate_data_buffer(shared=True) return skim_buffers @@ -478,8 +582,9 @@ def get_skim_dict(self, skim_tag): SkimDict or subclass (e.g. MazSkimDict) """ - assert skim_tag in self.skim_dicts, \ - f"network_los.get_skim_dict: skim tag '{skim_tag}' not in skim_dicts" + assert ( + skim_tag in self.skim_dicts + ), f"network_los.get_skim_dict: skim tag '{skim_tag}' not in skim_dicts" return self.skim_dicts[skim_tag] @@ -492,9 +597,9 @@ def get_default_skim_dict(self): TAZ SkimDict for ONE_ZONE, MazSkimDict for TWO_ZONE and THREE_ZONE """ if self.zone_system == ONE_ZONE: - return self.get_skim_dict('taz') + return self.get_skim_dict("taz") else: - return self.get_skim_dict('maz') + return self.get_skim_dict("maz") def get_mazpairs(self, omaz, dmaz, attribute): """ @@ -545,7 +650,7 @@ def get_tappairs3d(self, otap, dtap, dim3, key): Numpy.ndarray: list of tap skim values for odt tuples """ - s = self.get_skim_dict('tap').lookup_3d(otap, dtap, dim3, key) + s = self.get_skim_dict("tap").lookup_3d(otap, dtap, dim3, key) return s def skim_time_period_label(self, time_period): @@ -562,13 +667,15 @@ def skim_time_period_label(self, time_period): string time period labels """ - assert self.skim_time_periods is not None, "'skim_time_periods' setting not found." + assert ( + self.skim_time_periods is not None + ), "'skim_time_periods' setting not found." # Default to 60 minute time periods - period_minutes = self.skim_time_periods.get('period_minutes', 60) + period_minutes = self.skim_time_periods.get("period_minutes", 60) # Default to a day - model_time_window_min = self.skim_time_periods.get('time_window', 1440) + model_time_window_min = self.skim_time_periods.get("time_window", 1440) # Check to make sure the intervals result in no remainder time through 24 hour day assert 0 == model_time_window_min % period_minutes @@ -576,17 +683,27 @@ def skim_time_period_label(self, time_period): # FIXME - eventually test and use np version always? if np.isscalar(time_period): - bin = np.digitize([time_period % total_periods], - self.skim_time_periods['periods'], right=True)[0] - 1 - return self.skim_time_periods['labels'][bin] - - return pd.cut(time_period, self.skim_time_periods['periods'], - labels=self.skim_time_periods['labels'], ordered=False).astype(str) + bin = ( + np.digitize( + [time_period % total_periods], + self.skim_time_periods["periods"], + right=True, + )[0] + - 1 + ) + return self.skim_time_periods["labels"][bin] + + return pd.cut( + time_period, + self.skim_time_periods["periods"], + labels=self.skim_time_periods["labels"], + ordered=False, + ).astype(str) def get_tazs(self): # FIXME - should compute on init? if self.zone_system == ONE_ZONE: - tazs = inject.get_table('land_use').index.values + tazs = inject.get_table("land_use").index.values else: tazs = self.maz_taz_df.TAZ.unique() assert isinstance(tazs, np.ndarray) diff --git a/activitysim/core/mem.py b/activitysim/core/mem.py index 1066c34324..4575186fea 100644 --- a/activitysim/core/mem.py +++ b/activitysim/core/mem.py @@ -1,25 +1,21 @@ - # ActivitySim # See full license in LICENSE.txt. import datetime - import gc import glob import logging import multiprocessing import os import platform -import psutil import threading import time import numpy as np import pandas as pd +import psutil -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import util +from activitysim.core import config, inject, util logger = logging.getLogger(__name__) @@ -46,7 +42,7 @@ def time_bin(timestamps): seconds_since_epoch = (timestamps - epoch) // pd.Timedelta("1s") bin = seconds_since_epoch - (seconds_since_epoch % bins_size_in_seconds) - return pd.to_datetime(bin, unit='s', origin='unix') + return pd.to_datetime(bin, unit="s", origin="unix") def consolidate_logs(): @@ -54,80 +50,95 @@ def consolidate_logs(): Consolidate and aggregate subprocess mem logs """ - if not config.setting('multiprocess', False): + if not config.setting("multiprocess", False): return - delete_originals = not config.setting('keep_mem_logs', False) + delete_originals = not config.setting("keep_mem_logs", False) omnibus_df = [] # for each multiprocess step - multiprocess_steps = config.setting('multiprocess_steps', []) + multiprocess_steps = config.setting("multiprocess_steps", []) for step in multiprocess_steps: - step_name = step.get('name', None) + step_name = step.get("name", None) logger.debug(f"mem.consolidate_logs for step {step_name}") - glob_file_name = config.log_file_path(f"{step_name}*{MEM_LOG_FILE_NAME}", prefix=False) + glob_file_name = config.log_file_path( + f"{step_name}*{MEM_LOG_FILE_NAME}", prefix=False + ) glob_files = glob.glob(glob_file_name) if not glob_files: continue - logger.debug(f"mem.consolidate_logs consolidating {len(glob_files)} logs for {step_name}") + logger.debug( + f"mem.consolidate_logs consolidating {len(glob_files)} logs for {step_name}" + ) # for each individual log step_summary_df = [] for f in glob_files: - df = pd.read_csv(f, comment='#') + df = pd.read_csv(f, comment="#") - df = df[['rss', 'uss', 'event', 'time']] + df = df[["rss", "uss", "event", "time"]] df.rss = df.rss.astype(np.int64) df.uss = df.uss.astype(np.int64) - df['time'] = time_bin(pd.to_datetime(df.time, errors='coerce', format='%Y/%m/%d %H:%M:%S')) + df["time"] = time_bin( + pd.to_datetime(df.time, errors="coerce", format="%Y/%m/%d %H:%M:%S") + ) # consolidate events (duplicate rows should be idle steps (e.g. log_rss) - df = df.groupby('time')\ - .agg(rss=('rss', 'max'), uss=('uss', 'max'),)\ + df = ( + df.groupby("time") + .agg(rss=("rss", "max"), uss=("uss", "max"),) .reset_index(drop=False) + ) step_summary_df.append(df) # add step_df to step summary # aggregate the individual the logs into a single step log step_summary_df = pd.concat(step_summary_df) - step_summary_df = step_summary_df.groupby('time') \ - .agg(rss=('rss', 'sum'), uss=('uss', 'sum'), num_files=('rss', 'size')) \ + step_summary_df = ( + step_summary_df.groupby("time") + .agg(rss=("rss", "sum"), uss=("uss", "sum"), num_files=("rss", "size")) .reset_index(drop=False) - step_summary_df = step_summary_df.sort_values('time') + ) + step_summary_df = step_summary_df.sort_values("time") - step_summary_df['step'] = step_name + step_summary_df["step"] = step_name # scale missing values (might be missing idle steps for some chunk_tags) - scale = 1 + (len(glob_files) - step_summary_df.num_files) / step_summary_df.num_files - for c in ['rss', 'uss']: + scale = ( + 1 + + (len(glob_files) - step_summary_df.num_files) / step_summary_df.num_files + ) + for c in ["rss", "uss"]: step_summary_df[c] = (step_summary_df[c] * scale).astype(np.int64) - step_summary_df['scale'] = scale - del step_summary_df['num_files'] # do we want to keep track of scale factor? + step_summary_df["scale"] = scale + del step_summary_df["num_files"] # do we want to keep track of scale factor? if delete_originals: - util.delete_files(glob_files, f'mem.consolidate_logs.{step_name}') + util.delete_files(glob_files, f"mem.consolidate_logs.{step_name}") # write aggregate step log - output_path = config.log_file_path(f'mem_{step_name}.csv', prefix=False) - logger.debug(f"chunk.consolidate_logs writing step summary log for step {step_name} to {output_path}") - step_summary_df.to_csv(output_path, mode='w', index=False) + output_path = config.log_file_path(f"mem_{step_name}.csv", prefix=False) + logger.debug( + f"chunk.consolidate_logs writing step summary log for step {step_name} to {output_path}" + ) + step_summary_df.to_csv(output_path, mode="w", index=False) omnibus_df.append(step_summary_df) # add step summary to omnibus # aggregate the step logs into a single omnibus log ordered by timestamp omnibus_df = pd.concat(omnibus_df) - omnibus_df = omnibus_df.sort_values('time') + omnibus_df = omnibus_df.sort_values("time") output_path = config.log_file_path(OMNIBUS_LOG_FILE_NAME, prefix=False) logger.debug(f"chunk.consolidate_logs writing omnibus log to {output_path}") - omnibus_df.to_csv(output_path, mode='w', index=False) + omnibus_df.to_csv(output_path, mode="w", index=False) def check_global_hwm(tag, value, label): @@ -136,13 +147,13 @@ def check_global_hwm(tag, value, label): hwm = GLOBAL_HWM.setdefault(tag, {}) - is_new_hwm = value > hwm.get('mark', 0) or not hwm + is_new_hwm = value > hwm.get("mark", 0) or not hwm if is_new_hwm: timestamp = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") - hwm['mark'] = value - hwm['timestamp'] = timestamp - hwm['label'] = label + hwm["mark"] = value + hwm["timestamp"] = timestamp + hwm["label"] = label return is_new_hwm @@ -153,9 +164,11 @@ def log_global_hwm(): for tag in GLOBAL_HWM: hwm = GLOBAL_HWM[tag] - value = hwm.get('mark', 0) - logger.info(f"{process_name} high water mark {tag}: {util.INT(value)} ({util.GB(value)}) " - f"timestamp: {hwm.get('timestamp', '')} label:{hwm.get('label', '')}") + value = hwm.get("mark", 0) + logger.info( + f"{process_name} high water mark {tag}: {util.INT(value)} ({util.GB(value)}) " + f"timestamp: {hwm.get('timestamp', '')} label:{hwm.get('label', '')}" + ) def trace_memory_info(event, trace_ticks=0): @@ -190,11 +203,13 @@ def trace_memory_info(event, trace_ticks=0): except (psutil.NoSuchProcess, psutil.AccessDenied) as e: pass - noteworthy = True # any reason not to always log this if we are filtering idle ticks? + noteworthy = ( + True # any reason not to always log this if we are filtering idle ticks? + ) noteworthy = (num_children > 0) or noteworthy - noteworthy = check_global_hwm('rss', full_rss or rss, event) or noteworthy - noteworthy = check_global_hwm('uss', uss, event) or noteworthy + noteworthy = check_global_hwm("rss", full_rss or rss, event) or noteworthy + noteworthy = check_global_hwm("uss", uss, event) or noteworthy if noteworthy: @@ -206,16 +221,20 @@ def trace_memory_info(event, trace_ticks=0): with mem_log_lock: MEM_LOG_HEADER = "process,pid,rss,full_rss,uss,event,children,time" - with config.open_log_file(MEM_LOG_FILE_NAME, 'a', header=MEM_LOG_HEADER, prefix=True) as log_file: - print(f"{process_name}," - f"{pid}," - f"{util.INT(rss)}," # want these as ints so we can plot them... - f"{util.INT(full_rss)}," - f"{util.INT(uss)}," - f"{event}," - f"{num_children}," - f"{timestamp}", - file=log_file) + with config.open_log_file( + MEM_LOG_FILE_NAME, "a", header=MEM_LOG_HEADER, prefix=True + ) as log_file: + print( + f"{process_name}," + f"{pid}," + f"{util.INT(rss)}," # want these as ints so we can plot them... + f"{util.INT(full_rss)}," + f"{util.INT(uss)}," + f"{event}," + f"{num_children}," + f"{timestamp}", + file=log_file, + ) # return rss and uss for optional use by interested callers return full_rss or rss, uss @@ -251,7 +270,7 @@ def shared_memory_size(data_buffers=None): shared_size = 0 if data_buffers is None: - data_buffers = inject.get_injectable('data_buffers', {}) + data_buffers = inject.get_injectable("data_buffers", {}) for k, data_buffer in data_buffers.items(): try: @@ -273,11 +292,11 @@ def shared_memory_in_child_rss(): # Windows: Windows os_name = platform.system() - if os_name in ['Darwin']: + if os_name in ["Darwin"]: return False - elif os_name in ['Windows']: + elif os_name in ["Windows"]: return False - elif os_name in ['Linux']: + elif os_name in ["Linux"]: return True # ??? else: bug diff --git a/activitysim/core/mp_tasks.py b/activitysim/core/mp_tasks.py index 3b5d37396e..742a86a0ca 100644 --- a/activitysim/core/mp_tasks.py +++ b/activitysim/core/mp_tasks.py @@ -1,31 +1,23 @@ # ActivitySim # See full license in LICENSE.txt. -import sys -import os -import time import logging import multiprocessing +import os +import sys +import time import traceback - from collections import OrderedDict -import yaml import numpy as np import pandas as pd +import yaml -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import mem -from activitysim.core import pipeline -from activitysim.core import tracing -from activitysim.core import util - - +from activitysim.core import config, inject, mem, pipeline, tracing, util from activitysim.core.config import setting logger = logging.getLogger(__name__) -LAST_CHECKPOINT = '_' +LAST_CHECKPOINT = "_" MEM_TRACE_TICKS = 5 @@ -211,7 +203,7 @@ def log(msg, level, write_to_log_file=True): print(f"############ mp_tasks - {process_name} - {msg}") if write_to_log_file: - with config.open_log_file('mp_tasks_log.txt', 'a') as log_file: + with config.open_log_file("mp_tasks_log.txt", "a") as log_file: print(f"mp_tasks - {process_name} - {msg}", file=log_file) if write_to_log_file: @@ -243,7 +235,7 @@ def exception(msg, write_to_log_file=True): print(f"mp_tasks - {process_name} - {msg}") print(f"---\n{traceback.format_exc()}---") - with config.open_log_file('mp_tasks_log.txt', 'a') as log_file: + with config.open_log_file("mp_tasks_log.txt", "a") as log_file: print(f"---\nmp_tasks - {process_name} - {msg}", file=log_file) traceback.print_exc(limit=10, file=log_file) print("---", file=log_file) @@ -297,11 +289,13 @@ def pipeline_table_keys(pipeline_store): checkpoint_tables = checkpoint[~checkpoint.index.isin(pipeline.NON_TABLE_COLUMNS)] # omit dropped tables with empty checkpoint name - checkpoint_tables = checkpoint_tables[checkpoint_tables != ''] + checkpoint_tables = checkpoint_tables[checkpoint_tables != ""] # hdf5 key is / - checkpoint_tables = {table_name: pipeline.pipeline_table_key(table_name, checkpoint_name) - for table_name, checkpoint_name in checkpoint_tables.items()} + checkpoint_tables = { + table_name: pipeline.pipeline_table_key(table_name, checkpoint_name) + for table_name, checkpoint_name in checkpoint_tables.items() + } # checkpoint name and series mapping table name to hdf5 key for tables in that checkpoint return checkpoint_name, checkpoint_tables @@ -386,8 +380,8 @@ def build_slice_rules(slice_info, pipeline_tables): slice_rules : dict """ - slicer_table_names = slice_info['tables'] - slicer_table_exceptions = slice_info.get('except', []) + slicer_table_names = slice_info["tables"] + slicer_table_exceptions = slice_info.get("except", []) primary_slicer = slicer_table_names[0] # - ensure that tables listed in slice_info appear in correct order and before any others @@ -404,14 +398,16 @@ def build_slice_rules(slice_info, pipeline_tables): # followed by a slice.coalesce directive to explicitly list the omnibus tables created by the subprocesses. # So don't change this behavior withoyt testing populationsim multiprocess! if slicer_table_exceptions is True: - debug(f"slice.except wildcard (True): excluding all tables not explicitly listed in slice.tables") + debug( + f"slice.except wildcard (True): excluding all tables not explicitly listed in slice.tables" + ) slicer_table_exceptions = [t for t in tables if t not in slicer_table_names] # dict mapping slicer table_name to index name # (also presumed to be name of ref col name in referencing table) slicer_ref_cols = OrderedDict() - if slicer_table_exceptions == '*': + if slicer_table_exceptions == "*": slicer_table_exceptions = [t for t in tables if t not in slicer_table_names] # build slice rules for loaded tables @@ -421,36 +417,44 @@ def build_slice_rules(slice_info, pipeline_tables): rule = {} if table_name == primary_slicer: # slice primary apportion table - rule = {'slice_by': 'primary'} + rule = {"slice_by": "primary"} elif table_name in slicer_table_exceptions: - rule['slice_by'] = None + rule["slice_by"] = None else: for slicer_table_name in slicer_ref_cols: - if df.index.name is not None and (df.index.name == tables[slicer_table_name].index.name): + if df.index.name is not None and ( + df.index.name == tables[slicer_table_name].index.name + ): # slice df with same index name as a known slicer - rule = {'slice_by': 'index', 'source': slicer_table_name} + rule = {"slice_by": "index", "source": slicer_table_name} else: # if df has a column with same name as the ref_col (index) of a slicer? try: - source, ref_col = next((t, c) - for t, c in slicer_ref_cols.items() - if c in df.columns) + source, ref_col = next( + (t, c) + for t, c in slicer_ref_cols.items() + if c in df.columns + ) # then we can use that table to slice this df - rule = {'slice_by': 'column', - 'column': ref_col, - 'source': source} + rule = { + "slice_by": "column", + "column": ref_col, + "source": source, + } except StopIteration: - rule['slice_by'] = None + rule["slice_by"] = None - if rule['slice_by']: + if rule["slice_by"]: # cascade sliceability slicer_ref_cols[table_name] = df.index.name slice_rules[table_name] = rule for table_name, rule in slice_rules.items(): - if rule['slice_by'] is not None: - debug(f"### table_name: {table_name} slice_rules: {slice_rules[table_name]}") + if rule["slice_by"] is not None: + debug( + f"### table_name: {table_name} slice_rules: {slice_rules[table_name]}" + ) debug(f"### slicer_ref_cols: {slicer_ref_cols}") return slice_rules @@ -477,19 +481,21 @@ def apportion_pipeline(sub_proc_names, step_info): creates apportioned pipeline files for each sub job """ - slice_info = step_info.get('slice', None) - multiprocess_step_name = step_info.get('name', None) + slice_info = step_info.get("slice", None) + multiprocess_step_name = step_info.get("name", None) - pipeline_file_name = inject.get_injectable('pipeline_file_name') + pipeline_file_name = inject.get_injectable("pipeline_file_name") # ensure that if we are resuming, we don't apportion any tables from future model steps - last_checkpoint_in_previous_multiprocess_step = step_info.get('last_checkpoint_in_previous_multiprocess_step', None) + last_checkpoint_in_previous_multiprocess_step = step_info.get( + "last_checkpoint_in_previous_multiprocess_step", None + ) assert last_checkpoint_in_previous_multiprocess_step is not None pipeline.open_pipeline(resume_after=last_checkpoint_in_previous_multiprocess_step) # ensure all tables are in the pipeline checkpointed_tables = pipeline.checkpointed_tables() - for table_name in slice_info['tables']: + for table_name in slice_info["tables"]: if table_name not in checkpointed_tables: raise RuntimeError(f"slicer table {table_name} not found in pipeline") @@ -523,7 +529,9 @@ def apportion_pipeline(sub_proc_names, step_info): # use well-known pipeline file name process_name = sub_proc_names[i] - pipeline_path = config.build_output_file_path(pipeline_file_name, use_prefix=process_name) + pipeline_path = config.build_output_file_path( + pipeline_file_name, use_prefix=process_name + ) # remove existing file try: @@ -531,7 +539,7 @@ def apportion_pipeline(sub_proc_names, step_info): except OSError: pass - with pd.HDFStore(pipeline_path, mode='a') as pipeline_store: + with pd.HDFStore(pipeline_path, mode="a") as pipeline_store: # remember sliced_tables so we can cascade slicing to other tables sliced_tables = {} @@ -541,14 +549,16 @@ def apportion_pipeline(sub_proc_names, step_info): df = tables[table_name] - if rule['slice_by'] is not None and num_sub_procs > len(df): + if rule["slice_by"] is not None and num_sub_procs > len(df): # almost certainly a configuration error - raise RuntimeError(f"apportion_pipeline: multiprocess step {multiprocess_step_name} " - f"slice table {table_name} has fewer rows {df.shape} " - f"than num_processes ({num_sub_procs}).") + raise RuntimeError( + f"apportion_pipeline: multiprocess step {multiprocess_step_name} " + f"slice table {table_name} has fewer rows {df.shape} " + f"than num_processes ({num_sub_procs})." + ) - if rule['slice_by'] == 'primary': + if rule["slice_by"] == "primary": # slice primary apportion table by num_sub_procs strides # this hopefully yields a more random distribution # (e.g.) households are ordered by size in input store @@ -557,29 +567,37 @@ def apportion_pipeline(sub_proc_names, step_info): # we could easily work around this, but it seems likely this was an error on the user's part assert not df.index.duplicated().any() - primary_df = df[np.asanyarray(list(range(df.shape[0]))) % num_sub_procs == i] + primary_df = df[ + np.asanyarray(list(range(df.shape[0]))) % num_sub_procs == i + ] sliced_tables[table_name] = primary_df - elif rule['slice_by'] == 'index': + elif rule["slice_by"] == "index": # slice a table with same index name as a known slicer - source_df = sliced_tables[rule['source']] + source_df = sliced_tables[rule["source"]] sliced_tables[table_name] = df.loc[source_df.index] - elif rule['slice_by'] == 'column': + elif rule["slice_by"] == "column": # slice a table with a recognized slicer_column - source_df = sliced_tables[rule['source']] - sliced_tables[table_name] = df[df[rule['column']].isin(source_df.index)] - elif rule['slice_by'] is None: + source_df = sliced_tables[rule["source"]] + sliced_tables[table_name] = df[ + df[rule["column"]].isin(source_df.index) + ] + elif rule["slice_by"] is None: # don't slice mirrored tables sliced_tables[table_name] = df else: - raise RuntimeError("Unrecognized slice rule '%s' for table %s" % - (rule['slice_by'], table_name)) + raise RuntimeError( + "Unrecognized slice rule '%s' for table %s" + % (rule["slice_by"], table_name) + ) # - write table to pipeline hdf5_key = pipeline.pipeline_table_key(table_name, checkpoint_name) pipeline_store[hdf5_key] = sliced_tables[table_name] - debug(f"writing checkpoints ({checkpoints_df.shape}) " - f"to {pipeline.CHECKPOINT_TABLE_NAME} in {pipeline_path}") + debug( + f"writing checkpoints ({checkpoints_df.shape}) " + f"to {pipeline.CHECKPOINT_TABLE_NAME} in {pipeline_path}" + ) pipeline_store[pipeline.CHECKPOINT_TABLE_NAME] = checkpoints_df @@ -603,16 +621,18 @@ def coalesce_pipelines(sub_proc_names, slice_info): creates an omnibus pipeline with coalesced data from individual sub_proc pipelines """ - pipeline_file_name = inject.get_injectable('pipeline_file_name') + pipeline_file_name = inject.get_injectable("pipeline_file_name") debug(f"coalesce_pipelines to: {pipeline_file_name}") # - read all tables from first process pipeline # FIXME - note: assumes any new tables will be present in ALL subprocess pipelines tables = {} - pipeline_path = config.build_output_file_path(pipeline_file_name, use_prefix=sub_proc_names[0]) + pipeline_path = config.build_output_file_path( + pipeline_file_name, use_prefix=sub_proc_names[0] + ) - with pd.HDFStore(pipeline_path, mode='r') as pipeline_store: + with pd.HDFStore(pipeline_path, mode="r") as pipeline_store: # hdf5_keys is a dict mapping table_name to pipeline hdf5_key checkpoint_name, hdf5_keys = pipeline_table_keys(pipeline_store) @@ -630,21 +650,26 @@ def coalesce_pipelines(sub_proc_names, slice_info): # which new tables to coalesce. Populationsim uses this wildcard except directives to avoid having to list # many slice exceptions, and just lists weigh tables to coalesce. So don't change this behavior without testing # populationsim multiprocessing! - coalesce_tables = slice_info.get('coalesce', []) + coalesce_tables = slice_info.get("coalesce", []) # report absence of any slice_info.coalesce tables not in pipeline # we don't require their presence in case there are tracing tables that will only be present if tracing is enabled for table_name in coalesce_tables: if table_name not in tables: - logger.warning("slicer coalesce.table %s not found in pipeline" % table_name) + logger.warning( + "slicer coalesce.table %s not found in pipeline" % table_name + ) # - use slice rules followed by apportion_pipeline to identify mirrored tables # (tables that are identical in every pipeline and so don't need to be concatenated) slice_rules = build_slice_rules(slice_info, tables) # table is mirrored if no slice rule or explicitly listed in slice_info.coalesce setting - mirrored_table_names = \ - [t for t, rule in slice_rules.items() if rule['slice_by'] is None and t not in coalesce_tables] + mirrored_table_names = [ + t + for t, rule in slice_rules.items() + if rule["slice_by"] is None and t not in coalesce_tables + ] mirrored_tables = {t: tables[t] for t in mirrored_table_names} omnibus_keys = {t: k for t, k in hdf5_keys.items() if t not in mirrored_table_names} @@ -655,15 +680,17 @@ def coalesce_pipelines(sub_proc_names, slice_info): # assemble lists of omnibus tables from all sub_processes omnibus_tables = {table_name: [] for table_name in omnibus_keys} for process_name in sub_proc_names: - pipeline_path = config.build_output_file_path(pipeline_file_name, use_prefix=process_name) + pipeline_path = config.build_output_file_path( + pipeline_file_name, use_prefix=process_name + ) logger.info(f"coalesce pipeline {pipeline_path}") - with pd.HDFStore(pipeline_path, mode='r') as pipeline_store: + with pd.HDFStore(pipeline_path, mode="r") as pipeline_store: for table_name, hdf5_key in omnibus_keys.items(): omnibus_tables[table_name].append(pipeline_store[hdf5_key]) # open pipeline, preserving existing checkpoints (so resume_after will work for prior steps) - pipeline.open_pipeline('_') + pipeline.open_pipeline("_") # - add mirrored tables to pipeline for table_name in mirrored_tables: @@ -705,16 +732,18 @@ def setup_injectables_and_logging(injectables, locutor=True): # by default, assume we are running activitysim.abm # other callers (e.g. piopulationsim) will have to arrange to register their own steps and injectables # (presumably) in a custom run_simulation.py instead of using the 'activitysim run' command - if not inject.is_injectable('preload_injectables'): - from activitysim import abm # register abm steps and other abm-specific injectables + if not inject.is_injectable("preload_injectables"): + from activitysim import ( + abm, # register abm steps and other abm-specific injectables + ) try: for k, v in injectables.items(): inject.add_injectable(k, v) - inject.add_injectable('is_sub_task', True) - inject.add_injectable('locutor', locutor) + inject.add_injectable("is_sub_task", True) + inject.add_injectable("locutor", locutor) config.filter_warnings() @@ -722,7 +751,10 @@ def setup_injectables_and_logging(injectables, locutor=True): inject.add_injectable("log_file_prefix", process_name) except Exception as e: - exception(f"{type(e).__name__} exception while setting up injectables: {str(e)}", write_to_log_file=False) + exception( + f"{type(e).__name__} exception while setting up injectables: {str(e)}", + write_to_log_file=False, + ) raise e try: @@ -750,21 +782,27 @@ def adjust_chunk_size_for_shared_memory(chunk_size, data_buffers, num_processes) fair_share_of_shared_memory = int(shared_memory_size / num_processes) if shared_memory_in_child_rss: - adjusted_chunk_size = chunk_size + shared_memory_size - fair_share_of_shared_memory + adjusted_chunk_size = ( + chunk_size + shared_memory_size - fair_share_of_shared_memory + ) else: adjusted_chunk_size = chunk_size - fair_share_of_shared_memory - logger.info(f"adjust_chunk_size_for_shared_memory " - f"adjusted_chunk_size {util.INT(adjusted_chunk_size)} " - f"shared_memory_in_child_rss {shared_memory_in_child_rss} " - f"chunk_size {util.INT(chunk_size)} " - f"shared_memory_size {util.INT(shared_memory_size)} " - f"num_processes {num_processes} " - f"fair_share_of_shared_memory {util.INT(fair_share_of_shared_memory)} ") + logger.info( + f"adjust_chunk_size_for_shared_memory " + f"adjusted_chunk_size {util.INT(adjusted_chunk_size)} " + f"shared_memory_in_child_rss {shared_memory_in_child_rss} " + f"chunk_size {util.INT(chunk_size)} " + f"shared_memory_size {util.INT(shared_memory_size)} " + f"num_processes {num_processes} " + f"fair_share_of_shared_memory {util.INT(fair_share_of_shared_memory)} " + ) if adjusted_chunk_size <= 0: - raise RuntimeError(f"adjust_chunk_size_for_shared_memory: chunk_size too small for shared memory. " - f"adjusted_chunk_size: {adjusted_chunk_size}") + raise RuntimeError( + f"adjust_chunk_size_for_shared_memory: chunk_size too small for shared memory. " + f"adjusted_chunk_size: {adjusted_chunk_size}" + ) return adjusted_chunk_size @@ -790,13 +828,15 @@ def run_simulation(queue, step_info, resume_after, shared_data_buffer): # step_label = step_info['name'] - models = step_info['models'] - chunk_size = step_info['chunk_size'] - num_processes = step_info['num_processes'] + models = step_info["models"] + chunk_size = step_info["chunk_size"] + num_processes = step_info["num_processes"] - chunk_size = adjust_chunk_size_for_shared_memory(chunk_size, shared_data_buffer, num_processes) + chunk_size = adjust_chunk_size_for_shared_memory( + chunk_size, shared_data_buffer, num_processes + ) - inject.add_injectable('data_buffers', shared_data_buffer) + inject.add_injectable("data_buffers", shared_data_buffer) inject.add_injectable("chunk_size", chunk_size) inject.add_injectable("num_processes", num_processes) @@ -804,8 +844,11 @@ def run_simulation(queue, step_info, resume_after, shared_data_buffer): info(f"resume_after {resume_after}") # if they specified a resume_after model, check to make sure it is checkpointed - if resume_after != LAST_CHECKPOINT and \ - resume_after not in pipeline.get_checkpoints()[pipeline.CHECKPOINT_NAME].values: + if ( + resume_after != LAST_CHECKPOINT + and resume_after + not in pipeline.get_checkpoints()[pipeline.CHECKPOINT_NAME].values + ): # if not checkpointed, then fall back to last checkpoint info(f"resume_after checkpoint '{resume_after}' not in pipeline.") resume_after = LAST_CHECKPOINT @@ -815,9 +858,9 @@ def run_simulation(queue, step_info, resume_after, shared_data_buffer): if last_checkpoint in models: info(f"Resuming model run list after {last_checkpoint}") - models = models[models.index(last_checkpoint) + 1:] + models = models[models.index(last_checkpoint) + 1 :] - assert inject.get_injectable('preload_injectables', None) + assert inject.get_injectable("preload_injectables", None) t0 = tracing.print_elapsed_time() for model in models: @@ -831,12 +874,12 @@ def run_simulation(queue, step_info, resume_after, shared_data_buffer): raise e tracing.log_runtime(model_name=model, start_time=t1) - queue.put({'model': model, 'time': time.time()-t1}) + queue.put({"model": model, "time": time.time() - t1}) tracing.print_elapsed_time("run (%s models)" % len(models), t0) # add checkpoint with final tables even if not intermediate checkpointing - checkpoint_name = step_info['name'] + checkpoint_name = step_info["name"] pipeline.add_checkpoint(checkpoint_name) pipeline.close_pipeline() @@ -864,11 +907,13 @@ def mp_run_simulation(locutor, queue, injectables, step_info, resume_after, **kw setup_injectables_and_logging(injectables, locutor=locutor) - debug(f"mp_run_simulation {step_info['name']} locutor={inject.get_injectable('locutor', False)} ") + debug( + f"mp_run_simulation {step_info['name']} locutor={inject.get_injectable('locutor', False)} " + ) try: - if step_info['num_processes'] > 1: + if step_info["num_processes"] > 1: pipeline_prefix = multiprocessing.current_process().name logger.debug(f"injecting pipeline_file_prefix '{pipeline_prefix}'") inject.add_injectable("pipeline_file_prefix", pipeline_prefix) @@ -902,7 +947,9 @@ def mp_apportion_pipeline(injectables, sub_proc_names, step_info): try: apportion_pipeline(sub_proc_names, step_info) except Exception as e: - exception(f"{type(e).__name__} exception caught in mp_apportion_pipeline: {str(e)}") + exception( + f"{type(e).__name__} exception caught in mp_apportion_pipeline: {str(e)}" + ) raise e @@ -928,7 +975,7 @@ def mp_setup_skims(injectables, **kwargs): try: shared_data_buffer = kwargs - network_los_preload = inject.get_injectable('network_los_preload', None) + network_los_preload = inject.get_injectable("network_los_preload", None) if network_los_preload is not None: network_los_preload.load_shared_data(shared_data_buffer) @@ -957,7 +1004,9 @@ def mp_coalesce_pipelines(injectables, sub_proc_names, slice_info): try: coalesce_pipelines(sub_proc_names, slice_info) except Exception as e: - exception(f"{type(e).__name__} exception caught in coalesce_pipelines: {str(e)}") + exception( + f"{type(e).__name__} exception caught in coalesce_pipelines: {str(e)}" + ) raise e @@ -980,7 +1029,7 @@ def allocate_shared_skim_buffers(): info("allocate_shared_skim_buffer") - network_los = inject.get_injectable('network_los_preload', None) + network_los = inject.get_injectable("network_los_preload", None) if network_los is not None: skim_buffers = network_los.allocate_shared_skim_buffers() else: @@ -1000,11 +1049,14 @@ def allocate_shared_shadow_pricing_buffers(): info("allocate_shared_shadow_pricing_buffers") - shadow_pricing_info = inject.get_injectable('shadow_pricing_info', None) + shadow_pricing_info = inject.get_injectable("shadow_pricing_info", None) if shadow_pricing_info is not None: from activitysim.abm.tables import shadow_pricing - shadow_pricing_buffers = shadow_pricing.buffers_for_shadow_pricing(shadow_pricing_info) + + shadow_pricing_buffers = shadow_pricing.buffers_for_shadow_pricing( + shadow_pricing_info + ) else: shadow_pricing_buffers = {} @@ -1012,10 +1064,14 @@ def allocate_shared_shadow_pricing_buffers(): def run_sub_simulations( - injectables, - shared_data_buffers, - step_info, process_names, - resume_after, previously_completed, fail_fast): + injectables, + shared_data_buffers, + step_info, + process_names, + resume_after, + previously_completed, + fail_fast, +): """ Launch sub processes to run models in step according to specification in step_info. @@ -1051,12 +1107,15 @@ def run_sub_simulations( names of sub_processes that completed successfully """ + def log_queued_messages(): for process, queue in zip(procs, queues): while not queue.empty(): msg = queue.get(block=False) - model_name = msg['model'] - info(f"{process.name} {model_name} : {tracing.format_elapsed_time(msg['time'])}") + model_name = msg["model"] + info( + f"{process.name} {model_name} : {tracing.format_elapsed_time(msg['time'])}" + ) mem.trace_memory_info(f"{process.name}.{model_name}.completed") def check_proc_status(): @@ -1070,7 +1129,7 @@ def check_proc_status(): if p.name not in completed: info(f"process {p.name} completed") completed.add(p.name) - drop_breadcrumb(step_name, 'completed', list(completed)) + drop_breadcrumb(step_name, "completed", list(completed)) mem.trace_memory_info(f"{p.name}.completed") else: # process failed @@ -1089,10 +1148,10 @@ def check_proc_status(): info(f"error terminating process {op.name}: {e}") raise RuntimeError("Process %s failed" % (p.name,)) - step_name = step_info['name'] + step_name = step_info["name"] t0 = tracing.print_elapsed_time() - info(f'run_sub_simulations step {step_name} models resume_after {resume_after}') + info(f"run_sub_simulations step {step_name} models resume_after {resume_after}") # if resuming and some processes completed successfully in previous run if previously_completed: @@ -1102,15 +1161,19 @@ def check_proc_status(): if resume_after == LAST_CHECKPOINT: # if we are resuming where previous run left off, then we can skip running # any subprocudures that successfully complete the previous run - process_names = [name for name in process_names if name not in previously_completed] - info(f'step {step_name}: skipping {len(previously_completed)} previously completed subprocedures') + process_names = [ + name for name in process_names if name not in previously_completed + ] + info( + f"step {step_name}: skipping {len(previously_completed)} previously completed subprocedures" + ) else: # if we are resuming after a specific model, then force all subprocesses to run # (assuming if they specified a model, they really want everything after that to run) previously_completed = [] # if not the first step, resume_after the last checkpoint from the previous step - if resume_after is None and step_info['step_num'] > 0: + if resume_after is None and step_info["step_num"] > 0: resume_after = LAST_CHECKPOINT num_simulations = len(process_names) @@ -1119,17 +1182,19 @@ def check_proc_status(): completed = set(previously_completed) failed = set([]) # so we can log process failure first time it happens - drop_breadcrumb(step_name, 'completed', list(completed)) + drop_breadcrumb(step_name, "completed", list(completed)) for i, process_name in enumerate(process_names): q = multiprocessing.Queue() - locutor = (i == 0) - - args = OrderedDict(locutor=locutor, - queue=q, - injectables=injectables, - step_info=step_info, - resume_after=resume_after) + locutor = i == 0 + + args = OrderedDict( + locutor=locutor, + queue=q, + injectables=injectables, + step_info=step_info, + resume_after=resume_after, + ) # debug(f"create_process {process_name} target={mp_run_simulation}") # for k in args: @@ -1137,9 +1202,12 @@ def check_proc_status(): # for k in shared_data_buffers: # debug(f"create_process {process_name} shared_data_buffers {k}={shared_data_buffers[k]}") - p = multiprocessing.Process(target=mp_run_simulation, name=process_name, - args=(locutor, q, injectables, step_info, resume_after,), - kwargs=shared_data_buffers) + p = multiprocessing.Process( + target=mp_run_simulation, + name=process_name, + args=(locutor, q, injectables, step_info, resume_after,), + kwargs=shared_data_buffers, + ) procs.append(p) queues.append(q) @@ -1164,7 +1232,7 @@ def __setstate__(self, state): # XXX the correct long-term fix. See issue 23060 #assert _winapi.GetLastError() == _winapi.ERROR_ALREADY_EXISTS """ - if sys.platform == 'win32': + if sys.platform == "win32": time.sleep(1) mem.trace_memory_info(f"{p.name}.start") @@ -1175,7 +1243,9 @@ def __setstate__(self, state): # monitor sub process status and drop breadcrumbs or fail_fast as they terminate check_proc_status() # monitor memory usage - mem.trace_memory_info("run_sub_simulations.idle", trace_ticks=mem.MEM_PARENT_TRACE_TICK_LEN) + mem.trace_memory_info( + "run_sub_simulations.idle", trace_ticks=mem.MEM_PARENT_TRACE_TICK_LEN + ) time.sleep(1) # clean up any messages or breadcrumbs that occurred while we slept @@ -1193,7 +1263,7 @@ def __setstate__(self, state): info(f"Process {p.name} completed with exitcode {p.exitcode}") assert p.name in completed - t0 = tracing.print_elapsed_time('run_sub_simulations step %s' % step_name, t0) + t0 = tracing.print_elapsed_time("run_sub_simulations step %s" % step_name, t0) return list(completed) @@ -1216,13 +1286,15 @@ def run_sub_task(p): p.start() while multiprocessing.active_children(): - mem.trace_memory_info("run_sub_simulations.idle", trace_ticks=mem.MEM_PARENT_TRACE_TICK_LEN) + mem.trace_memory_info( + "run_sub_simulations.idle", trace_ticks=mem.MEM_PARENT_TRACE_TICK_LEN + ) time.sleep(1) # no need to join explicitly since multiprocessing.active_children joins completed procs # p.join() - t0 = tracing.print_elapsed_time('#run_model sub_process %s' % p.name, t0) + t0 = tracing.print_elapsed_time("#run_model sub_process %s" % p.name, t0) # info(f'{p.name}.exitcode = {p.exitcode}') mem.trace_memory_info(f"run_model {p.name} completed") @@ -1253,9 +1325,9 @@ def drop_breadcrumb(step_name, crumb, value=True): ------- """ - breadcrumbs = inject.get_injectable('breadcrumbs', OrderedDict()) - breadcrumbs.setdefault(step_name, {'name': step_name})[crumb] = value - inject.add_injectable('breadcrumbs', breadcrumbs) + breadcrumbs = inject.get_injectable("breadcrumbs", OrderedDict()) + breadcrumbs.setdefault(step_name, {"name": step_name})[crumb] = value + inject.add_injectable("breadcrumbs", breadcrumbs) write_breadcrumbs(breadcrumbs) @@ -1293,14 +1365,16 @@ def run_multiprocess(injectables): run_list = get_run_list() - if not run_list['multiprocess']: - raise RuntimeError("run_multiprocess called but multiprocess flag is %s" % - run_list['multiprocess']) + if not run_list["multiprocess"]: + raise RuntimeError( + "run_multiprocess called but multiprocess flag is %s" + % run_list["multiprocess"] + ) - old_breadcrumbs = run_list.get('breadcrumbs', {}) + old_breadcrumbs = run_list.get("breadcrumbs", {}) # raise error if any sub-process fails without waiting for others to complete - fail_fast = setting('fail_fast') + fail_fast = setting("fail_fast") info(f"run_multiprocess fail_fast: {fail_fast}") def skip_phase(phase): @@ -1319,32 +1393,35 @@ def find_breadcrumb(crumb, default=None): t0 = tracing.print_elapsed_time() shared_data_buffers.update(allocate_shared_skim_buffers()) - t0 = tracing.print_elapsed_time('allocate shared skim buffer', t0) + t0 = tracing.print_elapsed_time("allocate shared skim buffer", t0) mem.trace_memory_info("allocate_shared_skim_buffer.completed") # combine shared_skim_buffer and shared_shadow_pricing_buffer in shared_data_buffer t0 = tracing.print_elapsed_time() shared_data_buffers.update(allocate_shared_shadow_pricing_buffers()) - t0 = tracing.print_elapsed_time('allocate shared shadow_pricing buffer', t0) + t0 = tracing.print_elapsed_time("allocate shared shadow_pricing buffer", t0) mem.trace_memory_info("allocate_shared_shadow_pricing_buffers.completed") # - mp_setup_skims if len(shared_data_buffers) > 0: run_sub_task( multiprocessing.Process( - target=mp_setup_skims, name='mp_setup_skims', args=(injectables,), - kwargs=shared_data_buffers) + target=mp_setup_skims, + name="mp_setup_skims", + args=(injectables,), + kwargs=shared_data_buffers, + ) ) - t0 = tracing.print_elapsed_time('setup shared_data_buffers', t0) + t0 = tracing.print_elapsed_time("setup shared_data_buffers", t0) mem.trace_memory_info("mp_setup_skims.completed") # - for each step in run list - for step_info in run_list['multiprocess_steps']: + for step_info in run_list["multiprocess_steps"]: - step_name = step_info['name'] + step_name = step_info["name"] - num_processes = step_info['num_processes'] - slice_info = step_info.get('slice', None) + num_processes = step_info["num_processes"] + slice_info = step_info.get("slice", None) if num_processes == 1: sub_proc_names = [step_name] @@ -1352,43 +1429,53 @@ def find_breadcrumb(crumb, default=None): sub_proc_names = ["%s_%s" % (step_name, i) for i in range(num_processes)] # - mp_apportion_pipeline - if not skip_phase('apportion') and num_processes > 1: + if not skip_phase("apportion") and num_processes > 1: run_sub_task( multiprocessing.Process( - target=mp_apportion_pipeline, name='%s_apportion' % step_name, - args=(injectables, sub_proc_names, step_info)) + target=mp_apportion_pipeline, + name="%s_apportion" % step_name, + args=(injectables, sub_proc_names, step_info), + ) ) - drop_breadcrumb(step_name, 'apportion') + drop_breadcrumb(step_name, "apportion") # - run_sub_simulations - if not skip_phase('simulate'): - resume_after = step_info.get('resume_after', None) - - previously_completed = find_breadcrumb('completed', default=[]) - - completed = run_sub_simulations(injectables, - shared_data_buffers, - step_info, - sub_proc_names, - resume_after, previously_completed, fail_fast) + if not skip_phase("simulate"): + resume_after = step_info.get("resume_after", None) + + previously_completed = find_breadcrumb("completed", default=[]) + + completed = run_sub_simulations( + injectables, + shared_data_buffers, + step_info, + sub_proc_names, + resume_after, + previously_completed, + fail_fast, + ) if len(completed) != num_processes: - raise RuntimeError("%s processes failed in step %s" % - (num_processes - len(completed), step_name)) - drop_breadcrumb(step_name, 'simulate') + raise RuntimeError( + "%s processes failed in step %s" + % (num_processes - len(completed), step_name) + ) + drop_breadcrumb(step_name, "simulate") # - mp_coalesce_pipelines - if not skip_phase('coalesce') and num_processes > 1: + if not skip_phase("coalesce") and num_processes > 1: run_sub_task( multiprocessing.Process( - target=mp_coalesce_pipelines, name='%s_coalesce' % step_name, - args=(injectables, sub_proc_names, slice_info)) + target=mp_coalesce_pipelines, + name="%s_coalesce" % step_name, + args=(injectables, sub_proc_names, slice_info), + ) ) - drop_breadcrumb(step_name, 'coalesce') + drop_breadcrumb(step_name, "coalesce") # add checkpoint with final tables even if not intermediate checkpointing if not pipeline.intermediate_checkpoint(): - pipeline.open_pipeline('_') + pipeline.open_pipeline("_") pipeline.add_checkpoint(pipeline.FINAL_CHECKPOINT_NAME) pipeline.close_pipeline() @@ -1423,7 +1510,7 @@ def get_breadcrumbs(run_list): validated and annotated breadcrumbs file from previous run """ - resume_after = run_list['resume_after'] + resume_after = run_list["resume_after"] assert resume_after is not None # - read breadcrumbs file from previous run @@ -1441,31 +1528,40 @@ def get_breadcrumbs(run_list): previous_steps = list(breadcrumbs.keys()) # find the run_list step resume_after is in - resume_step = next((step for step in run_list['multiprocess_steps'] - if resume_after in step['models']), None) + resume_step = next( + ( + step + for step in run_list["multiprocess_steps"] + if resume_after in step["models"] + ), + None, + ) - resume_step_name = resume_step['name'] + resume_step_name = resume_step["name"] if resume_step_name not in previous_steps: error(f"resume_after model '{resume_after}' not in breadcrumbs") - raise RuntimeError("resume_after model '%s' not in breadcrumbs" % resume_after) + raise RuntimeError( + "resume_after model '%s' not in breadcrumbs" % resume_after + ) # drop any previous_breadcrumbs steps after resume_step - for step in previous_steps[previous_steps.index(resume_step_name) + 1:]: + for step in previous_steps[previous_steps.index(resume_step_name) + 1 :]: del breadcrumbs[step] # if resume_after is not the last model in the step # then we need to rerun the simulations in that step, even if they succeeded - if resume_after in resume_step['models'][:-1]: - if 'simulate' in breadcrumbs[resume_step_name]: - breadcrumbs[resume_step_name]['simulate'] = None - if 'coalesce' in breadcrumbs[resume_step_name]: - breadcrumbs[resume_step_name]['coalesce'] = None - - multiprocess_step_names = [step['name'] for step in run_list['multiprocess_steps']] - if list(breadcrumbs.keys()) != multiprocess_step_names[:len(breadcrumbs)]: - raise RuntimeError("last run steps don't match run list: %s" % - list(breadcrumbs.keys())) + if resume_after in resume_step["models"][:-1]: + if "simulate" in breadcrumbs[resume_step_name]: + breadcrumbs[resume_step_name]["simulate"] = None + if "coalesce" in breadcrumbs[resume_step_name]: + breadcrumbs[resume_step_name]["coalesce"] = None + + multiprocess_step_names = [step["name"] for step in run_list["multiprocess_steps"]] + if list(breadcrumbs.keys()) != multiprocess_step_names[: len(breadcrumbs)]: + raise RuntimeError( + "last run steps don't match run list: %s" % list(breadcrumbs.keys()) + ) return breadcrumbs @@ -1519,36 +1615,46 @@ def get_run_list(): validated and annotated run_list """ - models = setting('models', []) - multiprocess_steps = setting('multiprocess_steps', []) + models = setting("models", []) + multiprocess_steps = setting("multiprocess_steps", []) - resume_after = inject.get_injectable('resume_after', None) or setting('resume_after', None) - multiprocess = inject.get_injectable('multiprocess', False) or setting('multiprocess', False) + resume_after = inject.get_injectable("resume_after", None) or setting( + "resume_after", None + ) + multiprocess = inject.get_injectable("multiprocess", False) or setting( + "multiprocess", False + ) # default settings that can be overridden by settings in individual steps - global_chunk_size = setting('chunk_size', 0) or 0 - default_mp_processes = setting('num_processes', 0) or int(1 + multiprocessing.cpu_count() / 2.0) + global_chunk_size = setting("chunk_size", 0) or 0 + default_mp_processes = setting("num_processes", 0) or int( + 1 + multiprocessing.cpu_count() / 2.0 + ) if multiprocess and multiprocessing.cpu_count() == 1: warning("Can't multiprocess because there is only 1 cpu") run_list = { - 'models': models, - 'resume_after': resume_after, - 'multiprocess': multiprocess, + "models": models, + "resume_after": resume_after, + "multiprocess": multiprocess, # 'multiprocess_steps': multiprocess_steps # add this later if multiprocess } if not models or not isinstance(models, list): - raise RuntimeError('No models list in settings file') + raise RuntimeError("No models list in settings file") if resume_after == models[-1]: - raise RuntimeError("resume_after '%s' is last model in models list" % resume_after) + raise RuntimeError( + "resume_after '%s' is last model in models list" % resume_after + ) if multiprocess: if not multiprocess_steps: - raise RuntimeError("multiprocess setting is %s but no multiprocess_steps setting" % - multiprocess) + raise RuntimeError( + "multiprocess setting is %s but no multiprocess_steps setting" + % multiprocess + ) # check step name, num_processes, chunk_size and presence of slice info num_steps = len(multiprocess_steps) @@ -1556,46 +1662,56 @@ def get_run_list(): for istep in range(num_steps): step = multiprocess_steps[istep] - step['step_num'] = istep + step["step_num"] = istep # - validate step name - name = step.get('name', None) + name = step.get("name", None) if not name: - raise RuntimeError("missing name for step %s" - " in multiprocess_steps" % istep) + raise RuntimeError( + "missing name for step %s" " in multiprocess_steps" % istep + ) if name in step_names: - raise RuntimeError("duplicate step name %s" - " in multiprocess_steps" % name) + raise RuntimeError( + "duplicate step name %s" " in multiprocess_steps" % name + ) if name in models: - raise RuntimeError(f"multiprocess_steps step name '{name}' cannot also be a model name") + raise RuntimeError( + f"multiprocess_steps step name '{name}' cannot also be a model name" + ) step_names.add(name) # - validate num_processes and assign default - num_processes = step.get('num_processes', 0) + num_processes = step.get("num_processes", 0) if not isinstance(num_processes, int) or num_processes < 0: - raise RuntimeError("bad value (%s) for num_processes for step %s" - " in multiprocess_steps" % (num_processes, name)) + raise RuntimeError( + "bad value (%s) for num_processes for step %s" + " in multiprocess_steps" % (num_processes, name) + ) - if 'slice' in step: + if "slice" in step: if num_processes == 0: info(f"Setting num_processes = {num_processes} for step {name}") num_processes = default_mp_processes if num_processes > multiprocessing.cpu_count(): - warning(f"num_processes setting ({num_processes}) " - f"greater than cpu count ({ multiprocessing.cpu_count()})") + warning( + f"num_processes setting ({num_processes}) " + f"greater than cpu count ({ multiprocessing.cpu_count()})" + ) else: if num_processes == 0: num_processes = 1 if num_processes > 1: - raise RuntimeError("num_processes > 1 but no slice info for step %s" - " in multiprocess_steps" % name) + raise RuntimeError( + "num_processes > 1 but no slice info for step %s" + " in multiprocess_steps" % name + ) - multiprocess_steps[istep]['num_processes'] = num_processes + multiprocess_steps[istep]["num_processes"] = num_processes # - validate chunk_size and assign default - chunk_size = step.get('chunk_size', None) + chunk_size = step.get("chunk_size", None) if chunk_size is None: if global_chunk_size > 0 and num_processes > 1: chunk_size = int(round(global_chunk_size / num_processes)) @@ -1603,77 +1719,91 @@ def get_run_list(): else: chunk_size = global_chunk_size - multiprocess_steps[istep]['chunk_size'] = chunk_size + multiprocess_steps[istep]["chunk_size"] = chunk_size # - determine index in models list of step starts - start_tag = 'begin' + start_tag = "begin" starts = [0] * len(multiprocess_steps) for istep in range(num_steps): step = multiprocess_steps[istep] - name = step['name'] + name = step["name"] - slice = step.get('slice', None) + slice = step.get("slice", None) if slice: - if 'tables' not in slice: - raise RuntimeError("missing tables list for step %s" - " in multiprocess_steps" % istep) + if "tables" not in slice: + raise RuntimeError( + "missing tables list for step %s" + " in multiprocess_steps" % istep + ) start = step.get(start_tag, None) if not name: - raise RuntimeError("missing %s tag for step '%s' (%s)" - " in multiprocess_steps" % - (start_tag, name, istep)) + raise RuntimeError( + "missing %s tag for step '%s' (%s)" + " in multiprocess_steps" % (start_tag, name, istep) + ) if start not in models: - raise RuntimeError("%s tag '%s' for step '%s' (%s) not in models list" % - (start_tag, start, name, istep)) + raise RuntimeError( + "%s tag '%s' for step '%s' (%s) not in models list" + % (start_tag, start, name, istep) + ) starts[istep] = models.index(start) if istep == 0 and starts[istep] != 0: - raise RuntimeError("%s tag '%s' for first step '%s' (%s)" - " is not first model in models list" % - (start_tag, start, name, istep)) + raise RuntimeError( + "%s tag '%s' for first step '%s' (%s)" + " is not first model in models list" + % (start_tag, start, name, istep) + ) if istep > 0 and starts[istep] <= starts[istep - 1]: - raise RuntimeError("%s tag '%s' for step '%s' (%s)" - " falls before that of prior step in models list" % - (start_tag, start, name, istep)) + raise RuntimeError( + "%s tag '%s' for step '%s' (%s)" + " falls before that of prior step in models list" + % (start_tag, start, name, istep) + ) # remember there should always be a final checkpoint with same name as multiprocess_step name - multiprocess_steps[istep]['last_checkpoint_in_previous_multiprocess_step'] = \ - multiprocess_steps[istep - 1].get('name') if istep > 0 else None + multiprocess_steps[istep][ + "last_checkpoint_in_previous_multiprocess_step" + ] = (multiprocess_steps[istep - 1].get("name") if istep > 0 else None) # - build individual step model lists based on starts starts.append(len(models)) # so last step gets remaining models in list for istep in range(num_steps): - step_models = models[starts[istep]: starts[istep + 1]] + step_models = models[starts[istep] : starts[istep + 1]] if step_models[-1][0] == LAST_CHECKPOINT: - raise RuntimeError("Final model '%s' in step %s models list not checkpointed" % - (step_models[-1], name)) + raise RuntimeError( + "Final model '%s' in step %s models list not checkpointed" + % (step_models[-1], name) + ) - multiprocess_steps[istep]['models'] = step_models + multiprocess_steps[istep]["models"] = step_models - run_list['multiprocess_steps'] = multiprocess_steps + run_list["multiprocess_steps"] = multiprocess_steps # - add resume breadcrumbs if resume_after: breadcrumbs = get_breadcrumbs(run_list) if breadcrumbs: - run_list['breadcrumbs'] = breadcrumbs + run_list["breadcrumbs"] = breadcrumbs # - add resume_after to last step if resume_after is not None: # get_breadcrumbs should have deleted breadcrumbs for any subsequent steps istep = len(breadcrumbs) - 1 - assert resume_after == LAST_CHECKPOINT or \ - resume_after in multiprocess_steps[istep]['models'] - multiprocess_steps[istep]['resume_after'] = resume_after + assert ( + resume_after == LAST_CHECKPOINT + or resume_after in multiprocess_steps[istep]["models"] + ) + multiprocess_steps[istep]["resume_after"] = resume_after # - write run list to output dir # use log_file_path so we use (optional) log subdir and prefix process name - with config.open_log_file('run_list.txt', 'w') as f: + with config.open_log_file("run_list.txt", "w") as f: print_run_list(run_list, f) return run_list @@ -1692,18 +1822,18 @@ def print_run_list(run_list, output_file=None): if output_file is None: output_file = sys.stdout - print("resume_after:", run_list['resume_after'], file=output_file) - print("multiprocess:", run_list['multiprocess'], file=output_file) + print("resume_after:", run_list["resume_after"], file=output_file) + print("multiprocess:", run_list["multiprocess"], file=output_file) print("models:", file=output_file) - for m in run_list['models']: + for m in run_list["models"]: print(" - ", m, file=output_file) # - print multiprocess_steps - if run_list['multiprocess']: + if run_list["multiprocess"]: print("\nmultiprocess_steps:", file=output_file) - for step in run_list['multiprocess_steps']: - print(" step:", step['name'], file=output_file) + for step in run_list["multiprocess_steps"]: + print(" step:", step["name"], file=output_file) for k in step: if isinstance(step[k], list): print(" %s:" % k, file=output_file) @@ -1713,7 +1843,7 @@ def print_run_list(run_list, output_file=None): print(" %s: %s" % (k, step[k]), file=output_file) # - print breadcrumbs - breadcrumbs = run_list.get('breadcrumbs') + breadcrumbs = run_list.get("breadcrumbs") if breadcrumbs: print("\nbreadcrumbs:", file=output_file) for step_name in breadcrumbs: @@ -1730,7 +1860,7 @@ def print_run_list(run_list, output_file=None): def breadcrumbs_file_path(): # return path to breadcrumbs file in output_dir - return config.build_output_file_path('breadcrumbs.yaml') + return config.build_output_file_path("breadcrumbs.yaml") def read_breadcrumbs(): @@ -1747,10 +1877,10 @@ def read_breadcrumbs(): file_path = breadcrumbs_file_path() if not os.path.exists(file_path): raise IOError("Could not find saved breadcrumbs file '%s'" % file_path) - with open(file_path, 'r') as f: + with open(file_path, "r") as f: breadcrumbs = yaml.load(f, Loader=yaml.SafeLoader) # convert array to ordered dict keyed by step name - breadcrumbs = OrderedDict([(step['name'], step) for step in breadcrumbs]) + breadcrumbs = OrderedDict([(step["name"], step) for step in breadcrumbs]) return breadcrumbs @@ -1773,7 +1903,7 @@ def write_breadcrumbs(breadcrumbs): ---------- breadcrumbs : OrderedDict """ - with open(breadcrumbs_file_path(), 'w') as f: + with open(breadcrumbs_file_path(), "w") as f: # write ordered dict as array breadcrumbs = [step for step in list(breadcrumbs.values())] yaml.dump(breadcrumbs, f) @@ -1801,4 +1931,4 @@ def if_sub_task(if_is, if_isnt): (any type) (one of parameters if_is or if_isnt) """ - return if_is if inject.get_injectable('is_sub_task', False) else if_isnt + return if_is if inject.get_injectable("is_sub_task", False) else if_isnt diff --git a/activitysim/core/pathbuilder.py b/activitysim/core/pathbuilder.py index d4cee77504..91107e08e7 100644 --- a/activitysim/core/pathbuilder.py +++ b/activitysim/core/pathbuilder.py @@ -1,27 +1,26 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range - import logging import warnings +from builtins import range + import numpy as np import pandas as pd -from activitysim.core import tracing -from activitysim.core import inject -from activitysim.core import config -from activitysim.core import chunk -from activitysim.core import logit -from activitysim.core import simulate -from activitysim.core import los -from activitysim.core import pathbuilder_cache - -from activitysim.core.util import reindex - -from activitysim.core import expressions -from activitysim.core import assign - +from activitysim.core import ( + assign, + chunk, + config, + expressions, + inject, + logit, + los, + pathbuilder_cache, + simulate, + tracing, +) from activitysim.core.pathbuilder_cache import memo +from activitysim.core.util import reindex logger = logging.getLogger(__name__) @@ -33,31 +32,40 @@ UNAVAILABLE = -999 # used as base file name for cached files and as shared buffer tag -CACHE_TAG = 'tap_tap_utilities' - - -def compute_utilities(network_los, model_settings, choosers, model_constants, - trace_label, trace=False, trace_column_names=None): +CACHE_TAG = "tap_tap_utilities" + + +def compute_utilities( + network_los, + model_settings, + choosers, + model_constants, + trace_label, + trace=False, + trace_column_names=None, +): """ Compute utilities """ - trace_label = tracing.extend_trace_label(trace_label, 'compute_utils') + trace_label = tracing.extend_trace_label(trace_label, "compute_utils") with chunk.chunk_log(trace_label): - logger.debug(f"{trace_label} Running compute_utilities with {choosers.shape[0]} choosers") + logger.debug( + f"{trace_label} Running compute_utilities with {choosers.shape[0]} choosers" + ) - locals_dict = {'np': np, 'los': network_los} + locals_dict = {"np": np, "los": network_los} locals_dict.update(model_constants) # we don't grok coefficients, but allow them to use constants in spec alt columns - spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) for c in spec.columns: if c != simulate.SPEC_LABEL_NAME: spec[c] = spec[c].map(lambda s: model_constants.get(s, s)).astype(float) # - run preprocessor to annotate choosers - preprocessor_settings = model_settings.get('PREPROCESSOR') + preprocessor_settings = model_settings.get("PREPROCESSOR") if preprocessor_settings: # don't want to alter caller's dataframe @@ -67,7 +75,8 @@ def compute_utilities(network_los, model_settings, choosers, model_constants, df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, - trace_label=trace_label) + trace_label=trace_label, + ) utilities = simulate.eval_utilities( spec, @@ -75,7 +84,8 @@ def compute_utilities(network_los, model_settings, choosers, model_constants, locals_d=locals_dict, trace_all_rows=trace, trace_label=trace_label, - trace_column_names=trace_column_names) + trace_column_names=trace_column_names, + ) return utilities @@ -84,6 +94,7 @@ class TransitVirtualPathBuilder(object): """ Transit virtual path builder for three zone systems """ + def __init__(self, network_los): self.network_los = network_los @@ -91,130 +102,168 @@ def __init__(self, network_los): self.uid_calculator = pathbuilder_cache.TapTapUidCalculator(network_los) # note: pathbuilder_cache is lightweight until opened - self.tap_cache = pathbuilder_cache.TVPBCache(self.network_los, self.uid_calculator, CACHE_TAG) + self.tap_cache = pathbuilder_cache.TVPBCache( + self.network_los, self.uid_calculator, CACHE_TAG + ) - assert network_los.zone_system == los.THREE_ZONE, \ - f"TransitVirtualPathBuilder: network_los zone_system not THREE_ZONE" + assert ( + network_los.zone_system == los.THREE_ZONE + ), f"TransitVirtualPathBuilder: network_los zone_system not THREE_ZONE" def trace_df(self, df, trace_label, extension): assert len(df) > 0 - tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, extension), slicer='NONE', transpose=False) + tracing.trace_df( + df, + label=tracing.extend_trace_label(trace_label, extension), + slicer="NONE", + transpose=False, + ) def trace_maz_tap(self, maz_od_df, access_mode, egress_mode): - def maz_tap_stats(mode, name): maz_tap_df = self.network_los.maz_to_tap_dfs[mode].reset_index() logger.debug(f"TVPB access_maz_tap {maz_tap_df.shape}") MAZ_count = len(maz_tap_df.MAZ.unique()) TAP_count = len(maz_tap_df.TAP.unique()) MAZ_PER_TAP = MAZ_count / TAP_count - logger.debug(f"TVPB maz_tap_stats {name} {mode} MAZ {MAZ_count} TAP {TAP_count} ratio {MAZ_PER_TAP}") + logger.debug( + f"TVPB maz_tap_stats {name} {mode} MAZ {MAZ_count} TAP {TAP_count} ratio {MAZ_PER_TAP}" + ) logger.debug(f"TVPB maz_od_df {maz_od_df.shape}") - maz_tap_stats(access_mode, 'access') - maz_tap_stats(egress_mode, 'egress') + maz_tap_stats(access_mode, "access") + maz_tap_stats(egress_mode, "egress") def units_for_recipe(self, recipe): - units = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.units') - assert units in ['utility', 'time'], \ - f"unrecognized units: {units} for {recipe}. Expected either 'time' or 'utility'." + units = self.network_los.setting(f"TVPB_SETTINGS.{recipe}.units") + assert units in [ + "utility", + "time", + ], f"unrecognized units: {units} for {recipe}. Expected either 'time' or 'utility'." return units - def compute_maz_tap_utilities(self, recipe, maz_od_df, chooser_attributes, leg, mode, trace_label, trace): + def compute_maz_tap_utilities( + self, recipe, maz_od_df, chooser_attributes, leg, mode, trace_label, trace + ): - trace_label = tracing.extend_trace_label(trace_label, f'maz_tap_utils.{leg}') + trace_label = tracing.extend_trace_label(trace_label, f"maz_tap_utils.{leg}") with chunk.chunk_log(trace_label): - maz_tap_settings = \ - self.network_los.setting(f'TVPB_SETTINGS.{recipe}.maz_tap_settings.{mode}') - chooser_columns = maz_tap_settings['CHOOSER_COLUMNS'] - attribute_columns = list(chooser_attributes.columns) if chooser_attributes is not None else [] - model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS') - - if leg == 'access': - maz_col = 'omaz' - tap_col = 'btap' + maz_tap_settings = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.maz_tap_settings.{mode}" + ) + chooser_columns = maz_tap_settings["CHOOSER_COLUMNS"] + attribute_columns = ( + list(chooser_attributes.columns) + if chooser_attributes is not None + else [] + ) + model_constants = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.CONSTANTS" + ) + + if leg == "access": + maz_col = "omaz" + tap_col = "btap" else: - maz_col = 'dmaz' - tap_col = 'atap' + maz_col = "dmaz" + tap_col = "atap" # maz_to_tap access/egress utilities # deduped utilities_df - one row per chooser for each boarding tap (btap) accessible from omaz utilities_df = self.network_los.maz_to_tap_dfs[mode] - utilities_df = utilities_df[chooser_columns]. \ - reset_index(drop=False). \ - rename(columns={'MAZ': maz_col, 'TAP': tap_col}) + utilities_df = ( + utilities_df[chooser_columns] + .reset_index(drop=False) + .rename(columns={"MAZ": maz_col, "TAP": tap_col}) + ) utilities_df = pd.merge( - maz_od_df[['idx', maz_col]].drop_duplicates(), + maz_od_df[["idx", maz_col]].drop_duplicates(), utilities_df, - on=maz_col, how='inner') + on=maz_col, + how="inner", + ) if len(utilities_df) == 0: trace = False # add any supplemental chooser attributes (e.g. demographic_segment, tod) for c in attribute_columns: - utilities_df[c] = reindex(chooser_attributes[c], utilities_df['idx']) + utilities_df[c] = reindex(chooser_attributes[c], utilities_df["idx"]) chunk.log_df(trace_label, "utilities_df", utilities_df) - if self.units_for_recipe(recipe) == 'utility': + if self.units_for_recipe(recipe) == "utility": utilities_df[leg] = compute_utilities( self.network_los, maz_tap_settings, utilities_df, model_constants=model_constants, - trace_label=trace_label, trace=trace, - trace_column_names=['idx', maz_col, tap_col] if trace else None) + trace_label=trace_label, + trace=trace, + trace_column_names=["idx", maz_col, tap_col] if trace else None, + ) chunk.log_df(trace_label, "utilities_df", utilities_df) # annotated else: - assignment_spec = \ - assign.read_assignment_spec(file_name=config.config_file_path(maz_tap_settings['SPEC'])) + assignment_spec = assign.read_assignment_spec( + file_name=config.config_file_path(maz_tap_settings["SPEC"]) + ) - results, _, _ = assign.assign_variables(assignment_spec, utilities_df, model_constants) + results, _, _ = assign.assign_variables( + assignment_spec, utilities_df, model_constants + ) assert len(results.columns == 1) utilities_df[leg] = results chunk.log_df(trace_label, "utilities_df", utilities_df) if trace: - self.trace_df(utilities_df, trace_label, 'utilities_df') + self.trace_df(utilities_df, trace_label, "utilities_df") # drop utility computation columns ('tod', 'demographic_segment' and maz_to_tap_df time/distance columns) utilities_df.drop(columns=attribute_columns + chooser_columns, inplace=True) return utilities_df - def all_transit_paths(self, access_df, egress_df, chooser_attributes, trace_label, trace): + def all_transit_paths( + self, access_df, egress_df, chooser_attributes, trace_label, trace + ): - trace_label = tracing.extend_trace_label(trace_label, 'all_transit_paths') + trace_label = tracing.extend_trace_label(trace_label, "all_transit_paths") # deduped transit_df has one row per chooser for each boarding (btap) and alighting (atap) pair transit_df = pd.merge( - access_df[['idx', 'btap']], - egress_df[['idx', 'atap']], - on='idx').drop_duplicates() + access_df[["idx", "btap"]], egress_df[["idx", "atap"]], on="idx" + ).drop_duplicates() # don't want transit trips that start and stop in same tap transit_df = transit_df[transit_df.atap != transit_df.btap] for c in list(chooser_attributes.columns): - transit_df[c] = reindex(chooser_attributes[c], transit_df['idx']) + transit_df[c] = reindex(chooser_attributes[c], transit_df["idx"]) transit_df = transit_df.reset_index(drop=True) if trace: - self.trace_df(transit_df, trace_label, 'all_transit_df') + self.trace_df(transit_df, trace_label, "all_transit_df") return transit_df - def compute_tap_tap_utilities(self, recipe, access_df, egress_df, chooser_attributes, path_info, - trace_label, trace): + def compute_tap_tap_utilities( + self, + recipe, + access_df, + egress_df, + chooser_attributes, + path_info, + trace_label, + trace, + ): """ create transit_df and compute utilities for all atap-btap pairs between omaz in access and dmaz in egress_df compute the utilities using the tap_tap utility expressions file specified in tap_tap_settings @@ -244,15 +293,21 @@ def compute_tap_tap_utilities(self, recipe, access_df, egress_df, chooser_attrib assert trace - trace_label = tracing.extend_trace_label(trace_label, 'compute_tap_tap_utils') + trace_label = tracing.extend_trace_label(trace_label, "compute_tap_tap_utils") with chunk.chunk_log(trace_label): - model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS') - tap_tap_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings') + model_constants = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.CONSTANTS" + ) + tap_tap_settings = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.tap_tap_settings" + ) with memo("#TVPB CACHE compute_tap_tap_utilities all_transit_paths"): - transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes, trace_label, trace) + transit_df = self.all_transit_paths( + access_df, egress_df, chooser_attributes, trace_label, trace + ) # note: transit_df index is arbitrary chunk.log_df(trace_label, "transit_df", transit_df) @@ -261,31 +316,48 @@ def compute_tap_tap_utilities(self, recipe, access_df, egress_df, chooser_attrib locals_dict.update(model_constants) # columns needed for compute_utilities - chooser_columns = ['btap', 'atap'] + list(chooser_attributes.columns) + chooser_columns = ["btap", "atap"] + list(chooser_attributes.columns) # deduplicate transit_df to unique_transit_df with memo("#TVPB compute_tap_tap_utilities deduplicate transit_df"): - attribute_segments = \ - self.network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments') - scalar_attributes = {k: locals_dict[k] for k in attribute_segments.keys() if k not in transit_df} - - transit_df['uid'] = self.uid_calculator.get_unique_ids(transit_df, scalar_attributes) + attribute_segments = self.network_los.setting( + "TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments" + ) + scalar_attributes = { + k: locals_dict[k] + for k in attribute_segments.keys() + if k not in transit_df + } + + transit_df["uid"] = self.uid_calculator.get_unique_ids( + transit_df, scalar_attributes + ) - unique_transit_df = transit_df.loc[~transit_df.uid.duplicated(), chooser_columns + ['uid']] - logger.debug(f"#TVPB CACHE deduped transit_df from {len(transit_df)} to {len(unique_transit_df)}") + unique_transit_df = transit_df.loc[ + ~transit_df.uid.duplicated(), chooser_columns + ["uid"] + ] + logger.debug( + f"#TVPB CACHE deduped transit_df from {len(transit_df)} to {len(unique_transit_df)}" + ) - unique_transit_df.set_index('uid', inplace=True) + unique_transit_df.set_index("uid", inplace=True) chunk.log_df(trace_label, "unique_transit_df", unique_transit_df) - transit_df = transit_df[['idx', 'btap', 'atap', 'uid']] # don't need chooser columns + transit_df = transit_df[ + ["idx", "btap", "atap", "uid"] + ] # don't need chooser columns chunk.log_df(trace_label, "transit_df", transit_df) - logger.debug(f"#TVPB CACHE compute_tap_tap_utilities dedupe transit_df " - f"from {len(transit_df)} to {len(unique_transit_df)} rows") + logger.debug( + f"#TVPB CACHE compute_tap_tap_utilities dedupe transit_df " + f"from {len(transit_df)} to {len(unique_transit_df)} rows" + ) num_unique_transit_rows = len(unique_transit_df) # errcheck - logger.debug(f"#TVPB CACHE compute_tap_tap_utilities compute_utilities for {len(unique_transit_df)} rows") + logger.debug( + f"#TVPB CACHE compute_tap_tap_utilities compute_utilities for {len(unique_transit_df)} rows" + ) with memo("#TVPB compute_tap_tap_utilities compute_utilities"): unique_utilities_df = compute_utilities( @@ -295,16 +367,23 @@ def compute_tap_tap_utilities(self, recipe, access_df, egress_df, chooser_attrib model_constants=locals_dict, trace_label=trace_label, trace=trace, - trace_column_names=chooser_columns if trace else None + trace_column_names=chooser_columns if trace else None, ) chunk.log_df(trace_label, "unique_utilities_df", unique_utilities_df) - chunk.log_df(trace_label, "unique_transit_df", unique_transit_df) # annotated + chunk.log_df( + trace_label, "unique_transit_df", unique_transit_df + ) # annotated if trace: # combine unique_transit_df with unique_utilities_df for legibility - omnibus_df = pd.merge(unique_transit_df, unique_utilities_df, - left_index=True, right_index=True, how='left') - self.trace_df(omnibus_df, trace_label, 'unique_utilities_df') + omnibus_df = pd.merge( + unique_transit_df, + unique_utilities_df, + left_index=True, + right_index=True, + how="left", + ) + self.trace_df(omnibus_df, trace_label, "unique_utilities_df") chunk.log_df(trace_label, "omnibus_df", omnibus_df) del omnibus_df chunk.log_df(trace_label, "omnibus_df", None) @@ -315,8 +394,10 @@ def compute_tap_tap_utilities(self, recipe, access_df, egress_df, chooser_attrib with memo("#TVPB compute_tap_tap_utilities redupe transit_df"): # idx = transit_df.index - transit_df = pd.merge(transit_df, unique_utilities_df, left_on='uid', right_index=True) - del transit_df['uid'] + transit_df = pd.merge( + transit_df, unique_utilities_df, left_on="uid", right_index=True + ) + del transit_df["uid"] # transit_df.index = idx # note: left merge on columns does not preserve index, # but transit_df index is arbitrary so no need to restore @@ -334,12 +415,20 @@ def compute_tap_tap_utilities(self, recipe, access_df, egress_df, chooser_attrib chunk.log_df(trace_label, "transit_df", None) if trace: - self.trace_df(transit_df, trace_label, 'transit_df') + self.trace_df(transit_df, trace_label, "transit_df") return transit_df - def lookup_tap_tap_utilities(self, recipe, maz_od_df, access_df, egress_df, chooser_attributes, - path_info, trace_label): + def lookup_tap_tap_utilities( + self, + recipe, + maz_od_df, + access_df, + egress_df, + chooser_attributes, + path_info, + trace_label, + ): """ create transit_df and compute utilities for all atap-btap pairs between omaz in access and dmaz in egress_df look up the utilities in the precomputed tap_cache data (which is indexed by uid_calculator unique_ids) @@ -362,21 +451,29 @@ def lookup_tap_tap_utilities(self, recipe, maz_od_df, access_df, egress_df, choo """ - trace_label = tracing.extend_trace_label(trace_label, 'lookup_tap_tap_utils') + trace_label = tracing.extend_trace_label(trace_label, "lookup_tap_tap_utils") with chunk.chunk_log(trace_label): with memo("#TVPB CACHE lookup_tap_tap_utilities all_transit_paths"): - transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes, trace_label, trace=False) + transit_df = self.all_transit_paths( + access_df, egress_df, chooser_attributes, trace_label, trace=False + ) # note: transit_df index is arbitrary chunk.log_df(trace_label, "transit_df", transit_df) if TRACE_COMPLEXITY: # diagnostic: log the omaz,dmaz pairs with the greatest number of virtual tap-tap paths - num_paths = transit_df.groupby(['idx']).size().to_frame('n') - num_paths = pd.merge(maz_od_df, num_paths, left_on='idx', right_index=True) - num_paths = num_paths[['omaz', 'dmaz', 'n']].drop_duplicates(subset=['omaz', 'dmaz']) - num_paths = num_paths.sort_values('n', ascending=False).reset_index(drop=True) + num_paths = transit_df.groupby(["idx"]).size().to_frame("n") + num_paths = pd.merge( + maz_od_df, num_paths, left_on="idx", right_index=True + ) + num_paths = num_paths[["omaz", "dmaz", "n"]].drop_duplicates( + subset=["omaz", "dmaz"] + ) + num_paths = num_paths.sort_values("n", ascending=False).reset_index( + drop=True + ) logger.debug(f"num_paths\n{num_paths.head(10)}") # FIXME some expressions may want to know access mode - @@ -384,12 +481,21 @@ def lookup_tap_tap_utilities(self, recipe, maz_od_df, access_df, egress_df, choo # add uid column to transit_df with memo("#TVPB lookup_tap_tap_utilities assign uid"): - attribute_segments = \ - self.network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments') - scalar_attributes = {k: locals_dict[k] for k in attribute_segments.keys() if k not in transit_df} - - transit_df.index = self.uid_calculator.get_unique_ids(transit_df, scalar_attributes) - transit_df = transit_df[['idx', 'btap', 'atap']] # just needed chooser_columns for uid calculation + attribute_segments = self.network_los.setting( + "TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments" + ) + scalar_attributes = { + k: locals_dict[k] + for k in attribute_segments.keys() + if k not in transit_df + } + + transit_df.index = self.uid_calculator.get_unique_ids( + transit_df, scalar_attributes + ) + transit_df = transit_df[ + ["idx", "btap", "atap"] + ] # just needed chooser_columns for uid calculation chunk.log_df(trace_label, "transit_df add uid index", transit_df) with memo("#TVPB lookup_tap_tap_utilities reindex transit_df"): @@ -406,64 +512,93 @@ def lookup_tap_tap_utilities(self, recipe, maz_od_df, access_df, egress_df, choo return transit_df - def compute_tap_tap_time(self, recipe, access_df, egress_df, chooser_attributes, path_info, trace_label, trace): + def compute_tap_tap_time( + self, + recipe, + access_df, + egress_df, + chooser_attributes, + path_info, + trace_label, + trace, + ): - trace_label = tracing.extend_trace_label(trace_label, 'compute_tap_tap_time') + trace_label = tracing.extend_trace_label(trace_label, "compute_tap_tap_time") with chunk.chunk_log(trace_label): - model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS') - tap_tap_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings') + model_constants = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.CONSTANTS" + ) + tap_tap_settings = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.tap_tap_settings" + ) with memo("#TVPB CACHE compute_tap_tap_utilities all_transit_paths"): - transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes, trace_label, trace) + transit_df = self.all_transit_paths( + access_df, egress_df, chooser_attributes, trace_label, trace + ) # note: transit_df index is arbitrary chunk.log_df(trace_label, "transit_df", transit_df) # some expressions may want to know access mode - locals_dict = path_info.copy() - locals_dict['los'] = self.network_los + locals_dict["los"] = self.network_los locals_dict.update(model_constants) - assignment_spec = assign.read_assignment_spec(file_name=config.config_file_path(tap_tap_settings['SPEC'])) + assignment_spec = assign.read_assignment_spec( + file_name=config.config_file_path(tap_tap_settings["SPEC"]) + ) DEDUPE = True if DEDUPE: # assign uid for reduping max_atap = transit_df.atap.max() + 1 - transit_df['uid'] = transit_df.btap * max_atap + transit_df.atap + transit_df["uid"] = transit_df.btap * max_atap + transit_df.atap # dedupe chooser_attribute_columns = list(chooser_attributes.columns) - unique_transit_df = \ - transit_df.loc[~transit_df.uid.duplicated(), ['btap', 'atap', 'uid'] + chooser_attribute_columns] - unique_transit_df.set_index('uid', inplace=True) + unique_transit_df = transit_df.loc[ + ~transit_df.uid.duplicated(), + ["btap", "atap", "uid"] + chooser_attribute_columns, + ] + unique_transit_df.set_index("uid", inplace=True) chunk.log_df(trace_label, "unique_transit_df", unique_transit_df) - logger.debug(f"#TVPB CACHE deduped transit_df from {len(transit_df)} to {len(unique_transit_df)}") + logger.debug( + f"#TVPB CACHE deduped transit_df from {len(transit_df)} to {len(unique_transit_df)}" + ) # assign_variables - results, _, _ = assign.assign_variables(assignment_spec, unique_transit_df, locals_dict) + results, _, _ = assign.assign_variables( + assignment_spec, unique_transit_df, locals_dict + ) assert len(results.columns == 1) - unique_transit_df['transit'] = results + unique_transit_df["transit"] = results # redupe results back into transit_df with memo("#TVPB compute_tap_tap_time redupe transit_df"): - transit_df['transit'] = reindex(unique_transit_df.transit, transit_df.uid) + transit_df["transit"] = reindex( + unique_transit_df.transit, transit_df.uid + ) - del transit_df['uid'] + del transit_df["uid"] del unique_transit_df chunk.log_df(trace_label, "transit_df", transit_df) chunk.log_df(trace_label, "unique_transit_df", None) else: - results, _, _ = assign.assign_variables(assignment_spec, transit_df, locals_dict) + results, _, _ = assign.assign_variables( + assignment_spec, transit_df, locals_dict + ) assert len(results.columns == 1) - transit_df['transit'] = results + transit_df["transit"] = results # filter out unavailable btap_atap pairs - logger.debug(f"{(transit_df['transit'] <= 0).sum()} unavailable tap_tap pairs out of {len(transit_df)}") + logger.debug( + f"{(transit_df['transit'] <= 0).sum()} unavailable tap_tap pairs out of {len(transit_df)}" + ) transit_df = transit_df[transit_df.transit > 0] transit_df.drop(columns=chooser_attributes.columns, inplace=True) @@ -471,104 +606,173 @@ def compute_tap_tap_time(self, recipe, access_df, egress_df, chooser_attributes, chunk.log_df(trace_label, "transit_df", None) if trace: - self.trace_df(transit_df, trace_label, 'transit_df') + self.trace_df(transit_df, trace_label, "transit_df") return transit_df - def compute_tap_tap(self, recipe, maz_od_df, access_df, egress_df, chooser_attributes, path_info, - trace_label, trace): + def compute_tap_tap( + self, + recipe, + maz_od_df, + access_df, + egress_df, + chooser_attributes, + path_info, + trace_label, + trace, + ): - if self.units_for_recipe(recipe) == 'utility': + if self.units_for_recipe(recipe) == "utility": if not self.tap_cache.is_open: with memo("#TVPB compute_tap_tap tap_cache.open"): self.tap_cache.open() if trace: - result = \ - self.compute_tap_tap_utilities(recipe, access_df, egress_df, chooser_attributes, - path_info, trace_label, trace) + result = self.compute_tap_tap_utilities( + recipe, + access_df, + egress_df, + chooser_attributes, + path_info, + trace_label, + trace, + ) else: - result = \ - self.lookup_tap_tap_utilities(recipe, maz_od_df, access_df, egress_df, chooser_attributes, - path_info, trace_label) + result = self.lookup_tap_tap_utilities( + recipe, + maz_od_df, + access_df, + egress_df, + chooser_attributes, + path_info, + trace_label, + ) return result else: - assert self.units_for_recipe(recipe) == 'time' + assert self.units_for_recipe(recipe) == "time" with memo("#TVPB compute_tap_tap_time"): - result = self.compute_tap_tap_time(recipe, access_df, egress_df, chooser_attributes, - path_info, trace_label, trace) + result = self.compute_tap_tap_time( + recipe, + access_df, + egress_df, + chooser_attributes, + path_info, + trace_label, + trace, + ) return result - def best_paths(self, recipe, path_type, maz_od_df, access_df, egress_df, transit_df, trace_label, trace=False): + def best_paths( + self, + recipe, + path_type, + maz_od_df, + access_df, + egress_df, + transit_df, + trace_label, + trace=False, + ): - trace_label = tracing.extend_trace_label(trace_label, 'best_paths') + trace_label = tracing.extend_trace_label(trace_label, "best_paths") with chunk.chunk_log(trace_label): - path_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}') - max_paths_per_tap_set = path_settings.get('max_paths_per_tap_set', 1) - max_paths_across_tap_sets = path_settings.get('max_paths_across_tap_sets', 1) + path_settings = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.path_types.{path_type}" + ) + max_paths_per_tap_set = path_settings.get("max_paths_per_tap_set", 1) + max_paths_across_tap_sets = path_settings.get( + "max_paths_across_tap_sets", 1 + ) units = self.units_for_recipe(recipe) - smaller_is_better = (units in ['time']) + smaller_is_better = units in ["time"] - maz_od_df['seq'] = maz_od_df.index + maz_od_df["seq"] = maz_od_df.index # maz_od_df has one row per chooser # inner join to add rows for each access, egress, and transit segment combination - path_df = maz_od_df. \ - merge(access_df, on=['idx', 'omaz'], how='inner'). \ - merge(egress_df, on=['idx', 'dmaz'], how='inner'). \ - merge(transit_df, on=['idx', 'atap', 'btap'], how='inner') + path_df = ( + maz_od_df.merge(access_df, on=["idx", "omaz"], how="inner") + .merge(egress_df, on=["idx", "dmaz"], how="inner") + .merge(transit_df, on=["idx", "atap", "btap"], how="inner") + ) chunk.log_df(trace_label, "path_df", path_df) # transit sets are the transit_df non-join columns - transit_sets = [c for c in transit_df.columns if c not in ['idx', 'atap', 'btap']] + transit_sets = [ + c for c in transit_df.columns if c not in ["idx", "atap", "btap"] + ] if trace: # be nice and show both tap_tap set utility and total_set = access + set + egress for c in transit_sets: - path_df[f'total_{c}'] = path_df[c] + path_df['access'] + path_df['egress'] - self.trace_df(path_df, trace_label, 'best_paths.full') + path_df[f"total_{c}"] = ( + path_df[c] + path_df["access"] + path_df["egress"] + ) + self.trace_df(path_df, trace_label, "best_paths.full") for c in transit_sets: - del path_df[f'total_{c}'] + del path_df[f"total_{c}"] for c in transit_sets: - path_df[c] = path_df[c] + path_df['access'] + path_df['egress'] - path_df.drop(columns=['access', 'egress'], inplace=True) + path_df[c] = path_df[c] + path_df["access"] + path_df["egress"] + path_df.drop(columns=["access", "egress"], inplace=True) # choose best paths by tap set best_paths_list = [] for c in transit_sets: keep = path_df.index.isin( - path_df[['seq', c]].sort_values(by=c, ascending=smaller_is_better). - groupby(['seq']).head(max_paths_per_tap_set).index + path_df[["seq", c]] + .sort_values(by=c, ascending=smaller_is_better) + .groupby(["seq"]) + .head(max_paths_per_tap_set) + .index ) best_paths_for_set = path_df[keep] - best_paths_for_set['path_set'] = c # remember the path set + best_paths_for_set["path_set"] = c # remember the path set best_paths_for_set[units] = path_df[keep][c] best_paths_for_set.drop(columns=transit_sets, inplace=True) best_paths_list.append(best_paths_for_set) - path_df = pd.concat(best_paths_list).sort_values(by=['seq', units], ascending=[True, smaller_is_better]) + path_df = pd.concat(best_paths_list).sort_values( + by=["seq", units], ascending=[True, smaller_is_better] + ) # choose best paths overall by seq - path_df = path_df.sort_values(by=['seq', units], ascending=[True, smaller_is_better]) - path_df = path_df[path_df.index.isin(path_df.groupby(['seq']).head(max_paths_across_tap_sets).index)] + path_df = path_df.sort_values( + by=["seq", units], ascending=[True, smaller_is_better] + ) + path_df = path_df[ + path_df.index.isin( + path_df.groupby(["seq"]).head(max_paths_across_tap_sets).index + ) + ] if trace: - self.trace_df(path_df, trace_label, 'best_paths') + self.trace_df(path_df, trace_label, "best_paths") return path_df - def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_segment, - want_choices, trace_label, - filter_targets=None, trace=False, override_choices=None): - - trace_label = tracing.extend_trace_label(trace_label, 'build_virtual_path') + def build_virtual_path( + self, + recipe, + path_type, + orig, + dest, + tod, + demographic_segment, + want_choices, + trace_label, + filter_targets=None, + trace=False, + override_choices=None, + ): + + trace_label = tracing.extend_trace_label(trace_label, "build_virtual_path") # Tracing is implemented as a seperate, second call that operates ONLY on filter_targets assert not (trace and filter_targets is None) @@ -595,49 +799,68 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg override_choices = override_choices[filter_targets] units = self.units_for_recipe(recipe) - assert units == 'utility' or not want_choices, "'want_choices' only supported supported if units is utility" - - access_mode = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.access') - egress_mode = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.egress') - path_types_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}') - attributes_as_columns = \ - self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings.attributes_as_columns', []) - - path_info = {'path_type': path_type, 'access_mode': access_mode, 'egress_mode': egress_mode} + assert ( + units == "utility" or not want_choices + ), "'want_choices' only supported supported if units is utility" + + access_mode = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.path_types.{path_type}.access" + ) + egress_mode = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.path_types.{path_type}.egress" + ) + path_types_settings = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.path_types.{path_type}" + ) + attributes_as_columns = self.network_los.setting( + f"TVPB_SETTINGS.{recipe}.tap_tap_settings.attributes_as_columns", [] + ) + + path_info = { + "path_type": path_type, + "access_mode": access_mode, + "egress_mode": egress_mode, + } # maz od pairs requested with memo("#TVPB build_virtual_path maz_od_df"): - maz_od_df = pd.DataFrame({ - 'idx': orig.index.values, - 'omaz': orig.values, - 'dmaz': dest.values, - 'seq': range(len(orig)) - }) + maz_od_df = pd.DataFrame( + { + "idx": orig.index.values, + "omaz": orig.values, + "dmaz": dest.values, + "seq": range(len(orig)), + } + ) chunk.log_df(trace_label, "maz_od_df", maz_od_df) self.trace_maz_tap(maz_od_df, access_mode, egress_mode) # for location choice, there will be multiple alt dest rows per chooser and duplicate orig.index values # but tod and demographic_segment should be the same for all chooser rows (unique orig index values) # knowing this allows us to eliminate redundant computations (e.g. utilities of maz_tap pairs) - duplicated = orig.index.duplicated(keep='first') + duplicated = orig.index.duplicated(keep="first") chooser_attributes = pd.DataFrame(index=orig.index[~duplicated]) if not isinstance(tod, str): - chooser_attributes['tod'] = tod.loc[~duplicated] - elif 'tod' in attributes_as_columns: - chooser_attributes['tod'] = tod + chooser_attributes["tod"] = tod.loc[~duplicated] + elif "tod" in attributes_as_columns: + chooser_attributes["tod"] = tod else: - path_info['tod'] = tod + path_info["tod"] = tod if demographic_segment is not None: - chooser_attributes['demographic_segment'] = demographic_segment.loc[~duplicated] + chooser_attributes["demographic_segment"] = demographic_segment.loc[ + ~duplicated + ] with memo("#TVPB build_virtual_path access_df"): access_df = self.compute_maz_tap_utilities( recipe, maz_od_df, chooser_attributes, - leg='access', + leg="access", mode=access_mode, - trace_label=trace_label, trace=trace) + trace_label=trace_label, + trace=trace, + ) chunk.log_df(trace_label, "access_df", access_df) with memo("#TVPB build_virtual_path egress_df"): @@ -645,13 +868,15 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg recipe, maz_od_df, chooser_attributes, - leg='egress', + leg="egress", mode=egress_mode, - trace_label=trace_label, trace=trace) + trace_label=trace_label, + trace=trace, + ) chunk.log_df(trace_label, "egress_df", egress_df) # L200 will drop all rows if all trips are intra-tap. - if np.array_equal(access_df['btap'].values, egress_df['atap'].values): + if np.array_equal(access_df["btap"].values, egress_df["atap"].values): trace = False # path_info for use by expressions (e.g. penalty for drive access if no parking at access tap) @@ -665,7 +890,9 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg egress_df, chooser_attributes, path_info=path_info, - trace_label=trace_label, trace=trace) + trace_label=trace_label, + trace=trace, + ) chunk.log_df(trace_label, "transit_df", transit_df) # Cannot trace if df is empty. Prob happened at L200 @@ -674,9 +901,15 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg with memo("#TVPB build_virtual_path best_paths"): path_df = self.best_paths( - recipe, path_type, - maz_od_df, access_df, egress_df, transit_df, - trace_label, trace) + recipe, + path_type, + maz_od_df, + access_df, + egress_df, + transit_df, + trace_label, + trace, + ) chunk.log_df(trace_label, "path_df", path_df) # now that we have created path_df, we are done with the dataframes for the separate legs @@ -687,21 +920,31 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg del transit_df chunk.log_df(trace_label, "transit_df", None) - if units == 'utility': + if units == "utility": # logsums with memo("#TVPB build_virtual_path logsums"): # one row per seq with utilities in columns # path_num 0-based to aligh with logit.make_choices 0-based choice indexes - path_df['path_num'] = path_df.groupby('seq').cumcount() + path_df["path_num"] = path_df.groupby("seq").cumcount() chunk.log_df(trace_label, "path_df", path_df) - utilities_df = path_df[['seq', 'path_num', units]].set_index(['seq', 'path_num']).unstack() - utilities_df.columns = utilities_df.columns.droplevel() # for legibility + utilities_df = ( + path_df[["seq", "path_num", units]] + .set_index(["seq", "path_num"]) + .unstack() + ) + utilities_df.columns = ( + utilities_df.columns.droplevel() + ) # for legibility # add rows missing because no access or egress availability - utilities_df = pd.concat([pd.DataFrame(index=maz_od_df.seq), utilities_df], axis=1) - utilities_df = utilities_df.fillna(UNAVAILABLE) # set utilities for missing paths to UNAVAILABLE + utilities_df = pd.concat( + [pd.DataFrame(index=maz_od_df.seq), utilities_df], axis=1 + ) + utilities_df = utilities_df.fillna( + UNAVAILABLE + ) # set utilities for missing paths to UNAVAILABLE chunk.log_df(trace_label, "utilities_df", utilities_df) @@ -710,19 +953,31 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg # most likely "divide by zero encountered in log" caused by all transit sets non-viable warnings.simplefilter("always") - paths_nest_nesting_coefficient = path_types_settings.get('paths_nest_nesting_coefficient', 1) - exp_utilities = np.exp(utilities_df.values / paths_nest_nesting_coefficient) - logsums = np.maximum(np.log(np.nansum(exp_utilities, axis=1)), UNAVAILABLE) + paths_nest_nesting_coefficient = path_types_settings.get( + "paths_nest_nesting_coefficient", 1 + ) + exp_utilities = np.exp( + utilities_df.values / paths_nest_nesting_coefficient + ) + logsums = np.maximum( + np.log(np.nansum(exp_utilities, axis=1)), UNAVAILABLE + ) if len(w) > 0: for wrn in w: logger.warning( - f"{trace_label} - {type(wrn).__name__} ({wrn.message})") + f"{trace_label} - {type(wrn).__name__} ({wrn.message})" + ) DUMP = False if DUMP: - zero_utilities_df = utilities_df[np.nansum(np.exp(utilities_df.values), axis=1) == 0] - zero_utilities_df.to_csv(config.output_file_path('warning_utilities_df.csv'), index=True) + zero_utilities_df = utilities_df[ + np.nansum(np.exp(utilities_df.values), axis=1) == 0 + ] + zero_utilities_df.to_csv( + config.output_file_path("warning_utilities_df.csv"), + index=True, + ) if want_choices: @@ -731,20 +986,24 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg with memo("#TVPB build_virtual_path make_choices"): - probs = logit.utils_to_probs(utilities_df, allow_zero_probs=True, trace_label=trace_label) + probs = logit.utils_to_probs( + utilities_df, allow_zero_probs=True, trace_label=trace_label + ) chunk.log_df(trace_label, "probs", probs) if trace: choices = override_choices - utilities_df['choices'] = choices - self.trace_df(utilities_df, trace_label, 'utilities_df') + utilities_df["choices"] = choices + self.trace_df(utilities_df, trace_label, "utilities_df") - probs['choices'] = choices - self.trace_df(probs, trace_label, 'probs') + probs["choices"] = choices + self.trace_df(probs, trace_label, "probs") else: - choices, rands = logit.make_choices(probs, allow_bad_probs=True, trace_label=trace_label) + choices, rands = logit.make_choices( + probs, allow_bad_probs=True, trace_label=trace_label + ) chunk.log_df(trace_label, "rands", rands) del rands @@ -755,20 +1014,26 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg # we need to get path_set, btap, atap from path_df row with same seq and path_num # drop seq join column, but keep path_num of choice to override_choices when tracing - columns_to_cache = ['btap', 'atap', 'path_set', 'path_num'] - logsum_df = \ - pd.merge(pd.DataFrame({'seq': range(len(orig)), 'path_num': choices.values}), - path_df[['seq'] + columns_to_cache], - on=['seq', 'path_num'], how='left')\ - .drop(columns=['seq'])\ + columns_to_cache = ["btap", "atap", "path_set", "path_num"] + logsum_df = ( + pd.merge( + pd.DataFrame( + {"seq": range(len(orig)), "path_num": choices.values} + ), + path_df[["seq"] + columns_to_cache], + on=["seq", "path_num"], + how="left", + ) + .drop(columns=["seq"]) .set_index(orig.index) + ) - logsum_df['logsum'] = logsums + logsum_df["logsum"] = logsums else: assert len(logsums) == len(orig) - logsum_df = pd.DataFrame({'logsum': logsums}, index=orig.index) + logsum_df = pd.DataFrame({"logsum": logsums}, index=orig.index) chunk.log_df(trace_label, "logsum_df", logsum_df) @@ -776,16 +1041,16 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg chunk.log_df(trace_label, "utilities_df", None) if trace: - self.trace_df(logsum_df, trace_label, 'logsum_df') + self.trace_df(logsum_df, trace_label, "logsum_df") chunk.log_df(trace_label, "logsum_df", logsum_df) results = logsum_df else: - assert units == 'time' + assert units == "time" # return a series - results = pd.Series(path_df[units].values, index=path_df['idx']) + results = pd.Series(path_df[units].values, index=path_df["idx"]) # zero-fill rows for O-D pairs where no best path exists because there was no tap-tap transit availability results = reindex(results, maz_od_df.idx).fillna(0.0) @@ -805,31 +1070,56 @@ def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_seg return results def get_tvpb_logsum( - self, path_type, orig, dest, tod, demographic_segment, want_choices, - recipe='tour_mode_choice', trace_label=None): + self, + path_type, + orig, + dest, + tod, + demographic_segment, + want_choices, + recipe="tour_mode_choice", + trace_label=None, + ): # assume they have given us a more specific name (since there may be more than one active wrapper) - trace_label = trace_label or 'get_tvpb_logsum' + trace_label = trace_label or "get_tvpb_logsum" trace_label = tracing.extend_trace_label(trace_label, path_type) with chunk.chunk_log(trace_label): - logsum_df = \ - self.build_virtual_path(recipe, path_type, orig, dest, tod, demographic_segment, - want_choices=want_choices, trace_label=trace_label) + logsum_df = self.build_virtual_path( + recipe, + path_type, + orig, + dest, + tod, + demographic_segment, + want_choices=want_choices, + trace_label=trace_label, + ) trace_hh_id = inject.get_injectable("trace_hh_id", None) - if (all(logsum_df['logsum'] == UNAVAILABLE)) or (len(logsum_df) == 0): + if (all(logsum_df["logsum"] == UNAVAILABLE)) or (len(logsum_df) == 0): trace_hh_id = False if trace_hh_id: filter_targets = tracing.trace_targets(orig) # choices from preceding run (because random numbers) - override_choices = logsum_df['path_num'] if want_choices else None + override_choices = logsum_df["path_num"] if want_choices else None if filter_targets.any(): - self.build_virtual_path(recipe, path_type, orig, dest, tod, demographic_segment, - want_choices=want_choices, override_choices=override_choices, - trace_label=trace_label, filter_targets=filter_targets, trace=True) + self.build_virtual_path( + recipe, + path_type, + orig, + dest, + tod, + demographic_segment, + want_choices=want_choices, + override_choices=override_choices, + trace_label=trace_label, + filter_targets=filter_targets, + trace=True, + ) return logsum_df @@ -837,44 +1127,86 @@ def get_tvpb_best_transit_time(self, orig, dest, tod): # FIXME lots of pathological knowledge here as we are only called by accessibility directly from expressions - trace_label = tracing.extend_trace_label('accessibility.tvpb_best_time', tod) - recipe = 'accessibility' - path_type = 'WTW' + trace_label = tracing.extend_trace_label("accessibility.tvpb_best_time", tod) + recipe = "accessibility" + path_type = "WTW" with chunk.chunk_log(trace_label): - result = \ - self.build_virtual_path(recipe, path_type, orig, dest, tod, - demographic_segment=None, want_choices=False, - trace_label=trace_label) + result = self.build_virtual_path( + recipe, + path_type, + orig, + dest, + tod, + demographic_segment=None, + want_choices=False, + trace_label=trace_label, + ) trace_od = inject.get_injectable("trace_od", None) if trace_od: filter_targets = (orig == trace_od[0]) & (dest == trace_od[1]) if filter_targets.any(): - self.build_virtual_path(recipe, path_type, orig, dest, tod, - demographic_segment=None, want_choices=False, - trace_label=trace_label, filter_targets=filter_targets, trace=True) + self.build_virtual_path( + recipe, + path_type, + orig, + dest, + tod, + demographic_segment=None, + want_choices=False, + trace_label=trace_label, + filter_targets=filter_targets, + trace=True, + ) return result - def wrap_logsum(self, orig_key, dest_key, tod_key, segment_key, - recipe='tour_mode_choice', - cache_choices=False, trace_label=None, tag=None): + def wrap_logsum( + self, + orig_key, + dest_key, + tod_key, + segment_key, + recipe="tour_mode_choice", + cache_choices=False, + trace_label=None, + tag=None, + ): return TransitVirtualPathLogsumWrapper( - self, orig_key, dest_key, tod_key, segment_key, - recipe, cache_choices, trace_label, tag) + self, + orig_key, + dest_key, + tod_key, + segment_key, + recipe, + cache_choices, + trace_label, + tag, + ) class TransitVirtualPathLogsumWrapper(object): """ Transit virtual path builder logsum wrapper for three zone systems """ - def __init__(self, pathbuilder, orig_key, dest_key, tod_key, segment_key, - recipe, cache_choices, trace_label, tag): + + def __init__( + self, + pathbuilder, + orig_key, + dest_key, + tod_key, + segment_key, + recipe, + cache_choices, + trace_label, + tag, + ): self.tvpb = pathbuilder - assert hasattr(pathbuilder, 'get_tvpb_logsum') + assert hasattr(pathbuilder, "get_tvpb_logsum") self.orig_key = orig_key self.dest_key = dest_key @@ -886,7 +1218,9 @@ def __init__(self, pathbuilder, orig_key, dest_key, tod_key, segment_key, self.cache_choices = cache_choices self.cache = {} if cache_choices else None - self.base_trace_label = tracing.extend_trace_label(trace_label, tag) or f'tvpb_logsum.{tag}' + self.base_trace_label = ( + tracing.extend_trace_label(trace_label, tag) or f"tvpb_logsum.{tag}" + ) self.trace_label = self.base_trace_label self.tag = tag @@ -913,7 +1247,9 @@ def set_df(self, df): def extend_trace_label(self, extension=None): if extension: - self.trace_label = tracing.extend_trace_label(self.base_trace_label, extension) + self.trace_label = tracing.extend_trace_label( + self.base_trace_label, extension + ) else: self.trace_label = self.base_trace_label @@ -933,37 +1269,49 @@ def __getitem__(self, path_type): """ assert self.df is not None, "Call set_df first" - assert(self.orig_key in self.df), \ - f"TransitVirtualPathLogsumWrapper: orig_key '{self.orig_key}' not in df" - assert(self.dest_key in self.df), \ - f"TransitVirtualPathLogsumWrapper: dest_key '{self.dest_key}' not in df" - assert(self.tod_key in self.df), \ - f"TransitVirtualPathLogsumWrapper: tod_key '{self.tod_key}' not in df" - assert(self.segment_key in self.df), \ - f"TransitVirtualPathLogsumWrapper: segment_key '{self.segment_key}' not in df" - - orig = self.df[self.orig_key].astype('int') - dest = self.df[self.dest_key].astype('int') + assert ( + self.orig_key in self.df + ), f"TransitVirtualPathLogsumWrapper: orig_key '{self.orig_key}' not in df" + assert ( + self.dest_key in self.df + ), f"TransitVirtualPathLogsumWrapper: dest_key '{self.dest_key}' not in df" + assert ( + self.tod_key in self.df + ), f"TransitVirtualPathLogsumWrapper: tod_key '{self.tod_key}' not in df" + assert ( + self.segment_key in self.df + ), f"TransitVirtualPathLogsumWrapper: segment_key '{self.segment_key}' not in df" + + orig = self.df[self.orig_key].astype("int") + dest = self.df[self.dest_key].astype("int") tod = self.df[self.tod_key] segment = self.df[self.segment_key] - logsum_df = \ - self.tvpb.get_tvpb_logsum(path_type, orig, dest, tod, segment, - want_choices=self.cache_choices, - recipe=self.recipe, - trace_label=self.trace_label) + logsum_df = self.tvpb.get_tvpb_logsum( + path_type, + orig, + dest, + tod, + segment, + want_choices=self.cache_choices, + recipe=self.recipe, + trace_label=self.trace_label, + ) - if (self.cache_choices) and (not all(logsum_df['logsum'] == UNAVAILABLE)): + if (self.cache_choices) and (not all(logsum_df["logsum"] == UNAVAILABLE)): # not tested on duplicate index because not currently needed # caching strategy does not require unique indexes but care would need to be taken to maintain alignment assert not orig.index.duplicated().any() # we only need to cache taps and path_set - choices_df = logsum_df[['atap', 'btap', 'path_set']] + choices_df = logsum_df[["atap", "btap", "path_set"]] if path_type in self.cache: - assert len(self.cache.get(path_type).index.intersection(logsum_df.index)) == 0 + assert ( + len(self.cache.get(path_type).index.intersection(logsum_df.index)) + == 0 + ) choices_df = pd.concat([self.cache.get(path_type), choices_df]) self.cache[path_type] = choices_df diff --git a/activitysim/core/pathbuilder_cache.py b/activitysim/core/pathbuilder_cache.py index a47b8ab357..b5f1089138 100644 --- a/activitysim/core/pathbuilder_cache.py +++ b/activitysim/core/pathbuilder_cache.py @@ -1,35 +1,29 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range - -import logging -import os +import gc as _gc import itertools +import logging import multiprocessing -import gc as _gc -import psutil +import os import time - +from builtins import range from contextlib import contextmanager import numpy as np import pandas as pd +import psutil -from activitysim.core import util -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import simulate -from activitysim.core import tracing +from activitysim.core import config, inject, simulate, tracing, util logger = logging.getLogger(__name__) RAWARRAY = False -DTYPE_NAME = 'float32' +DTYPE_NAME = "float32" RESCALE = 1000 -DYNAMIC = 'dynamic' -STATIC = 'static' -TRACE = 'trace' +DYNAMIC = "dynamic" +STATIC = "static" +TRACE = "trace" MEMO_STACK = [] @@ -52,7 +46,7 @@ def memo(tag, console=False, disable_gc=True): finally: elapsed_time = time.time() - t0 - current_mem = (psutil.Process().memory_info().rss) + current_mem = psutil.Process().memory_info().rss marginal_mem = current_mem - previous_mem mem_str = f"net {util.GB(marginal_mem)} ({util.INT(marginal_mem)}) total {util.GB(current_mem)}" @@ -73,6 +67,7 @@ class TVPBCache(object): """ Transit virtual path builder cache for three zone systems """ + def __init__(self, network_los, uid_calculator, cache_tag): # lightweight until opened @@ -88,13 +83,13 @@ def __init__(self, network_los, uid_calculator, cache_tag): @property def cache_path(self): - file_type = 'mmap' - return os.path.join(config.get_cache_dir(), f'{self.cache_tag}.{file_type}') + file_type = "mmap" + return os.path.join(config.get_cache_dir(), f"{self.cache_tag}.{file_type}") @property def csv_trace_path(self): - file_type = 'csv' - return os.path.join(config.get_cache_dir(), f'{self.cache_tag}.{file_type}') + file_type = "csv" + return os.path.join(config.get_cache_dir(), f"{self.cache_tag}.{file_type}") def cleanup(self): """ @@ -116,16 +111,17 @@ def write_static_cache(self, data): logger.debug(f"#TVPB CACHE write_static_cache df {data.shape}") - mm_data = np.memmap(self.cache_path, - shape=data.shape, - dtype=DTYPE_NAME, - mode='w+') + mm_data = np.memmap( + self.cache_path, shape=data.shape, dtype=DTYPE_NAME, mode="w+" + ) np.copyto(mm_data, data) mm_data._mmap.close() del mm_data - logger.debug(f"#TVPB CACHE write_static_cache wrote static cache table " - f"({data.shape}) to {self.cache_path}") + logger.debug( + f"#TVPB CACHE write_static_cache wrote static cache table " + f"({data.shape}) to {self.cache_path}" + ) def open(self): """ @@ -144,10 +140,12 @@ def open(self): # multiprocessing usex preloaded fully_populated shared data buffer with memo("TVPBCache.open get_data_and_lock_from_buffers"): data, _ = self.get_data_and_lock_from_buffers() - logger.info(f"TVPBCache.open {self.cache_tag} STATIC cache using existing data_buffers") + logger.info( + f"TVPBCache.open {self.cache_tag} STATIC cache using existing data_buffers" + ) elif os.path.isfile(self.cache_path): # single process ought have created a precomputed fully_populated STATIC file - data = np.memmap(self.cache_path, dtype=DTYPE_NAME, mode='r') + data = np.memmap(self.cache_path, dtype=DTYPE_NAME, mode="r") # FIXME - why leave memmap open - maybe should copy since it will be read into memory when accessed anyway # mm_data = np.memmap(self.cache_path, dtype=DTYPE_NAME, mode='r') @@ -156,15 +154,21 @@ def open(self): # mm_data._mmap.close() # del mm_data - logger.info(f"TVPBCache.open {self.cache_tag} read fully_populated data array from mmap file") + logger.info( + f"TVPBCache.open {self.cache_tag} read fully_populated data array from mmap file" + ) else: - raise RuntimeError(f"Pathbuilder cache not found. Did you forget to run initialize tvpb?" - f"Expected cache file: {self.cache_path}") + raise RuntimeError( + f"Pathbuilder cache not found. Did you forget to run initialize tvpb?" + f"Expected cache file: {self.cache_path}" + ) # create no-copy pandas DataFrame from numpy wrapped RawArray or Memmap buffer column_names = self.uid_calculator.set_names with memo("TVPBCache.open data.reshape"): - data = data.reshape((-1, len(column_names))) # reshape so there is one column per set + data = data.reshape( + (-1, len(column_names)) + ) # reshape so there is one column per set # data should be fully_populated and in canonical order - so we can assign canonical uid index with memo("TVPBCache.open uid_calculator.fully_populated_uids"): @@ -219,31 +223,41 @@ def allocate_data_buffer(self, shared=False): buffer_size = util.iprod(self.uid_calculator.fully_populated_shape) csz = buffer_size * dtype.itemsize - logger.info(f"TVPBCache.allocate_data_buffer allocating data buffer " - f"shape {shape} buffer_size {util.INT(buffer_size)} total size: {util.INT(csz)} ({util.GB(csz)})") + logger.info( + f"TVPBCache.allocate_data_buffer allocating data buffer " + f"shape {shape} buffer_size {util.INT(buffer_size)} total size: {util.INT(csz)} ({util.GB(csz)})" + ) if shared: - if dtype_name == 'float64': - typecode = 'd' - elif dtype_name == 'float32': - typecode = 'f' + if dtype_name == "float64": + typecode = "d" + elif dtype_name == "float32": + typecode = "f" else: - raise RuntimeError("allocate_data_buffer unrecognized dtype %s" % dtype_name) + raise RuntimeError( + "allocate_data_buffer unrecognized dtype %s" % dtype_name + ) if RAWARRAY: with memo("TVPBCache.allocate_data_buffer allocate RawArray"): buffer = multiprocessing.RawArray(typecode, buffer_size) - logger.info(f"TVPBCache.allocate_data_buffer allocated shared multiprocessing.RawArray as buffer") + logger.info( + f"TVPBCache.allocate_data_buffer allocated shared multiprocessing.RawArray as buffer" + ) else: with memo("TVPBCache.allocate_data_buffer allocate Array"): buffer = multiprocessing.Array(typecode, buffer_size) - logger.info(f"TVPBCache.allocate_data_buffer allocated shared multiprocessing.Array as buffer") + logger.info( + f"TVPBCache.allocate_data_buffer allocated shared multiprocessing.Array as buffer" + ) else: buffer = np.empty(buffer_size, dtype=dtype) np.copyto(buffer, np.nan) # fill with np.nan - logger.info(f"TVPBCache.allocate_data_buffer allocating non-shared numpy array as buffer") + logger.info( + f"TVPBCache.allocate_data_buffer allocating non-shared numpy array as buffer" + ) return buffer @@ -263,11 +277,13 @@ def load_data_to_buffer(self, data_buffer): if os.path.isfile(self.cache_path): with memo("TVPBCache.load_data_to_buffer copy memmap"): - data = np.memmap(self.cache_path, dtype=DTYPE_NAME, mode='r') + data = np.memmap(self.cache_path, dtype=DTYPE_NAME, mode="r") np.copyto(np_wrapped_data_buffer, data) data._mmap.close() del data - logger.debug(f"TVPBCache.load_data_to_buffer loaded data from {self.cache_path}") + logger.debug( + f"TVPBCache.load_data_to_buffer loaded data from {self.cache_path}" + ) else: np.copyto(np_wrapped_data_buffer, np.nan) logger.debug(f"TVPBCache.load_data_to_buffer - saved cache file not found.") @@ -279,7 +295,7 @@ def get_data_and_lock_from_buffers(self): ------- either multiprocessing.Array and lock or multiprocessing.RawArray and None according to RAWARRAY """ - data_buffers = inject.get_injectable('data_buffers', None) + data_buffers = inject.get_injectable("data_buffers", None) assert self.cache_tag in data_buffers # internal error logger.debug(f"TVPBCache.get_data_and_lock_from_buffers") data_buffer = data_buffers[self.cache_tag] @@ -297,6 +313,7 @@ class TapTapUidCalculator(object): """ Transit virtual path builder TAP to TAP unique ID calculator for three zone systems """ + def __init__(self, network_los): self.network_los = network_los @@ -304,13 +321,16 @@ def __init__(self, network_los): # ensure that tap_df has been loaded # (during multiprocessing we are initialized before network_los.load_data is called) assert network_los.tap_df is not None - self.tap_ids = network_los.tap_df['TAP'].values + self.tap_ids = network_los.tap_df["TAP"].values - self.segmentation = \ - network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments') + self.segmentation = network_los.setting( + "TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments" + ) # e.g. [(0, 'AM', 'walk'), (0, 'AM', 'walk')...]) for attributes demographic_segment, tod, and access_mode - self.attribute_combination_tuples = list(itertools.product(*list(self.segmentation.values()))) + self.attribute_combination_tuples = list( + itertools.product(*list(self.segmentation.values())) + ) # ordinalizers - for mapping attribute values to canonical ordinal values for uid computation # (pandas series of ordinal position with attribute value index (e.g. map tod value 'AM' to 0, 'MD' to 1,...) @@ -319,13 +339,17 @@ def __init__(self, network_los): for k, v in self.segmentation.items(): self.ordinalizers[k] = pd.Series(range(len(v)), index=v) # orig/dest go last so all rows in same 'skim' end up with adjacent uids - self.ordinalizers['btap'] = pd.Series(range(len(self.tap_ids)), index=self.tap_ids) - self.ordinalizers['atap'] = self.ordinalizers['btap'] + self.ordinalizers["btap"] = pd.Series( + range(len(self.tap_ids)), index=self.tap_ids + ) + self.ordinalizers["atap"] = self.ordinalizers["btap"] # for k,v in self.ordinalizers.items(): # print(f"\ordinalizer {k}\n{v}") - spec_name = self.network_los.setting(f'TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.SPEC') + spec_name = self.network_los.setting( + f"TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.SPEC" + ) self.set_names = list(simulate.read_model_spec(file_name=spec_name).columns) @property @@ -379,7 +403,9 @@ def get_unique_ids(self, df, scalar_attributes): uid = uid * cardinality + np.asanyarray(df[name].map(ordinalizer)) else: # otherwise it should be in scalar_attributes - assert name in scalar_attributes, f"attribute '{name}' not found in df.columns or scalar_attributes." + assert ( + name in scalar_attributes + ), f"attribute '{name}' not found in df.columns or scalar_attributes." uid = uid * cardinality + ordinalizer.at[scalar_attributes[name]] return uid @@ -403,8 +429,8 @@ def get_od_dataframe(self, scalar_attributes): num_taps = len(self.tap_ids) od_choosers_df = pd.DataFrame( data={ - 'btap': np.repeat(self.tap_ids, num_taps), - 'atap': np.tile(self.tap_ids, num_taps) + "btap": np.repeat(self.tap_ids, num_taps), + "atap": np.tile(self.tap_ids, num_taps), } ) od_choosers_df.index = self.get_unique_ids(od_choosers_df, scalar_attributes) @@ -430,7 +456,10 @@ def each_scalar_attribute_combination(self): # attribute_value_tuple is an tuple of attribute values - e.g. (0, 'AM', 'walk') # build dict of attribute name:value pairs - e.g. {'demographic_segment': 0, 'tod': 'AM', }) - scalar_attributes = {name: value for name, value in zip(attribute_names, attribute_value_tuple)} + scalar_attributes = { + name: value + for name, value in zip(attribute_names, attribute_value_tuple) + } yield scalar_attributes @@ -439,5 +468,5 @@ def scalar_attribute_combinations(self): attribute_tuples = self.attribute_combination_tuples x = [list(t) for t in attribute_tuples] df = pd.DataFrame(data=x, columns=attribute_names) - df.index.name = 'offset' + df.index.name = "offset" return df diff --git a/activitysim/core/pipeline.py b/activitysim/core/pipeline.py index 1b2240e6d7..bf1ac7f5c3 100644 --- a/activitysim/core/pipeline.py +++ b/activitysim/core/pipeline.py @@ -1,48 +1,36 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import next -from builtins import map -from builtins import object - -import os -import logging import datetime as dt +import logging +import os +from builtins import map, next, object import pandas as pd - from orca import orca -from . import inject -from . import config -from . import random -from . import tracing -from . import mem - - -from . import util +from . import config, inject, mem, random, tracing, util from .tracing import print_elapsed_time - logger = logging.getLogger(__name__) # name of the checkpoint dict keys # (which are also columns in the checkpoints dataframe stored in hte pipeline store) -TIMESTAMP = 'timestamp' -CHECKPOINT_NAME = 'checkpoint_name' +TIMESTAMP = "timestamp" +CHECKPOINT_NAME = "checkpoint_name" NON_TABLE_COLUMNS = [CHECKPOINT_NAME, TIMESTAMP] # name used for storing the checkpoints dataframe to the pipeline store -CHECKPOINT_TABLE_NAME = 'checkpoints' +CHECKPOINT_TABLE_NAME = "checkpoints" # name of the first step/checkpoint created when the pipeline is started -INITIAL_CHECKPOINT_NAME = 'init' -FINAL_CHECKPOINT_NAME = 'final' +INITIAL_CHECKPOINT_NAME = "init" +FINAL_CHECKPOINT_NAME = "final" # special value for resume_after meaning last checkpoint -LAST_CHECKPOINT = '_' +LAST_CHECKPOINT = "_" # single character prefix for run_list model name to indicate that no checkpoint should be saved -NO_CHECKPOINT_PREFIX = '_' +NO_CHECKPOINT_PREFIX = "_" class Pipeline(object): @@ -114,7 +102,9 @@ def open_pipeline_store(overwrite=False): if _PIPELINE.pipeline_store is not None: raise RuntimeError("Pipeline store is already open!") - pipeline_file_path = config.pipeline_file_path(inject.get_injectable('pipeline_file_name')) + pipeline_file_path = config.pipeline_file_path( + inject.get_injectable("pipeline_file_name") + ) if overwrite: try: @@ -125,7 +115,7 @@ def open_pipeline_store(overwrite=False): print(e) logger.warning("Error removing %s: %s" % (pipeline_file_path, e)) - _PIPELINE.pipeline_store = pd.HDFStore(pipeline_file_path, mode='a') + _PIPELINE.pipeline_store = pd.HDFStore(pipeline_file_path, mode="a") logger.debug(f"opened pipeline_store {pipeline_file_path}") @@ -282,13 +272,18 @@ def add_checkpoint(checkpoint_name): if len(orca.list_columns_for_table(table_name)): # rewrap the changed orca table as a unitary DataFrame-backed DataFrameWrapper table df = rewrap(table_name) - elif table_name not in _PIPELINE.last_checkpoint or table_name in _PIPELINE.replaced_tables: + elif ( + table_name not in _PIPELINE.last_checkpoint + or table_name in _PIPELINE.replaced_tables + ): df = orca.get_table(table_name).to_frame() else: continue - logger.debug("add_checkpoint '%s' table '%s' %s" % - (checkpoint_name, table_name, util.df_size(df))) + logger.debug( + "add_checkpoint '%s' table '%s' %s" + % (checkpoint_name, table_name, util.df_size(df)) + ) write_df(df, table_name, checkpoint_name) # remember which checkpoint it was last written @@ -307,7 +302,7 @@ def add_checkpoint(checkpoint_name): # convert empty values to str so PyTables doesn't pickle object types for c in checkpoints.columns: - checkpoints[c] = checkpoints[c].fillna('') + checkpoints[c] = checkpoints[c].fillna("") # write it to the store, overwriting any previous version (no way to simply extend) write_df(checkpoints, CHECKPOINT_TABLE_NAME) @@ -317,7 +312,7 @@ def registered_tables(): """ Return a list of the names of all currently registered dataframe tables """ - return [name for name in orca.list_tables() if orca.table_type(name) == 'dataframe'] + return [name for name in orca.list_tables() if orca.table_type(name) == "dataframe"] def checkpointed_tables(): @@ -325,8 +320,11 @@ def checkpointed_tables(): Return a list of the names of all checkpointed tables """ - return [name for name, checkpoint_name in _PIPELINE.last_checkpoint.items() - if checkpoint_name and name not in NON_TABLE_COLUMNS] + return [ + name + for name, checkpoint_name in _PIPELINE.last_checkpoint.items() + if checkpoint_name and name not in NON_TABLE_COLUMNS + ] def load_checkpoint(checkpoint_name): @@ -364,7 +362,7 @@ def load_checkpoint(checkpoint_name): raise RuntimeError(msg) # convert pandas dataframe back to array of checkpoint dicts - checkpoints = checkpoints.to_dict(orient='records') + checkpoints = checkpoints.to_dict(orient="records") # drop tables with empty names for checkpoint in checkpoints: @@ -379,8 +377,10 @@ def load_checkpoint(checkpoint_name): _PIPELINE.last_checkpoint.clear() _PIPELINE.last_checkpoint.update(_PIPELINE.checkpoints[-1]) - logger.info("load_checkpoint %s timestamp %s" - % (checkpoint_name, _PIPELINE.last_checkpoint['timestamp'])) + logger.info( + "load_checkpoint %s timestamp %s" + % (checkpoint_name, _PIPELINE.last_checkpoint["timestamp"]) + ) tables = checkpointed_tables() @@ -394,14 +394,14 @@ def load_checkpoint(checkpoint_name): loaded_tables[table_name] = df # register for tracing in order that tracing.register_traceable_table wants us to register them - traceable_tables = inject.get_injectable('traceable_tables', []) + traceable_tables = inject.get_injectable("traceable_tables", []) for table_name in traceable_tables: if table_name in loaded_tables: tracing.register_traceable_table(table_name, loaded_tables[table_name]) # add tables of known rng channels - rng_channels = inject.get_injectable('rng_channels', []) + rng_channels = inject.get_injectable("rng_channels", []) if rng_channels: logger.debug("loading random channels %s" % rng_channels) for table_name in rng_channels: @@ -410,7 +410,7 @@ def load_checkpoint(checkpoint_name): _PIPELINE.rng().add_channel(table_name, loaded_tables[table_name]) -def split_arg(s, sep, default=''): +def split_arg(s, sep, default=""): """ split str s in two at first sep, returning empty string as second result if no sep """ @@ -423,7 +423,7 @@ def split_arg(s, sep, default=''): val = default else: val = r[1] - val = {'true': True, 'false': False}.get(val.lower(), val) + val = {"true": True, "false": False}.get(val.lower(), val) return arg, val @@ -444,17 +444,22 @@ def run_model(model_name): raise RuntimeError("Pipeline not initialized! Did you call open_pipeline?") # can't run same model more than once - if model_name in [checkpoint[CHECKPOINT_NAME] for checkpoint in _PIPELINE.checkpoints]: + if model_name in [ + checkpoint[CHECKPOINT_NAME] for checkpoint in _PIPELINE.checkpoints + ]: raise RuntimeError("Cannot run model '%s' more than once" % model_name) _PIPELINE.rng().begin_step(model_name) # check for args - if '.' in model_name: - step_name, arg_string = model_name.split('.', 1) - args = dict((k, v) - for k, v in (split_arg(item, "=", default=True) - for item in arg_string.split(";"))) + if "." in model_name: + step_name, arg_string = model_name.split(".", 1) + args = dict( + (k, v) + for k, v in ( + split_arg(item, "=", default=True) for item in arg_string.split(";") + ) + ) else: step_name = model_name args = {} @@ -475,7 +480,9 @@ def run_model(model_name): orca.run([step_name]) - t0 = print_elapsed_time("#run_model completed step '%s'" % model_name, t0, debug=True) + t0 = print_elapsed_time( + "#run_model completed step '%s'" % model_name, t0, debug=True + ) mem.trace_memory_info(f"pipeline.run_model {model_name} finished") inject.set_step_args(None) @@ -506,7 +513,7 @@ def open_pipeline(resume_after=None): _PIPELINE.init_state() _PIPELINE.is_open = True - get_rn_generator().set_base_seed(inject.get_injectable('rng_base_seed', 0)) + get_rn_generator().set_base_seed(inject.get_injectable("rng_base_seed", 0)) if resume_after: # open existing pipeline @@ -558,12 +565,14 @@ def close_pipeline(): def intermediate_checkpoint(checkpoint_name=None): - checkpoints = config.setting('checkpoints', True) + checkpoints = config.setting("checkpoints", True) if checkpoints is True or checkpoints is False: return checkpoints - assert isinstance(checkpoints, list), f"setting 'checkpoints'' should be True or False or a list" + assert isinstance( + checkpoints, list + ), f"setting 'checkpoints'' should be True or False or a list" return checkpoint_name in checkpoints @@ -593,7 +602,7 @@ def run(models, resume_after=None): t0 = print_elapsed_time() open_pipeline(resume_after) - t0 = print_elapsed_time('open_pipeline', t0) + t0 = print_elapsed_time("open_pipeline", t0) if resume_after == LAST_CHECKPOINT: resume_after = _PIPELINE.last_checkpoint[CHECKPOINT_NAME] @@ -601,15 +610,15 @@ def run(models, resume_after=None): if resume_after: logger.info("resume_after %s" % resume_after) if resume_after in models: - models = models[models.index(resume_after) + 1:] + models = models[models.index(resume_after) + 1 :] - mem.trace_memory_info('pipeline.run before preload_injectables') + mem.trace_memory_info("pipeline.run before preload_injectables") # preload any bulky injectables (e.g. skims) not in pipeline - if inject.get_injectable('preload_injectables', None): - t0 = print_elapsed_time('preload_injectables', t0) + if inject.get_injectable("preload_injectables", None): + t0 = print_elapsed_time("preload_injectables", t0) - mem.trace_memory_info('pipeline.run after preload_injectables') + mem.trace_memory_info("pipeline.run after preload_injectables") t0 = print_elapsed_time() for model in models: @@ -623,7 +632,7 @@ def run(models, resume_after=None): if not intermediate_checkpoint(): add_checkpoint(FINAL_CHECKPOINT_NAME) - mem.trace_memory_info('pipeline.run after run_models') + mem.trace_memory_info("pipeline.run after run_models") t0 = print_elapsed_time("run_model (%s models)" % len(models), t0) @@ -655,8 +664,10 @@ def get_table(table_name, checkpoint_name=None): # orca table not in checkpoints (e.g. a merged table) if table_name not in _PIPELINE.last_checkpoint and orca.is_table(table_name): if checkpoint_name is not None: - raise RuntimeError("get_table: checkpoint_name ('%s') not supported" - "for non-checkpointed table '%s'" % (checkpoint_name, table_name)) + raise RuntimeError( + "get_table: checkpoint_name ('%s') not supported" + "for non-checkpointed table '%s'" % (checkpoint_name, table_name) + ) return orca.get_table(table_name).to_frame() @@ -673,8 +684,10 @@ def get_table(table_name, checkpoint_name=None): return orca.get_table(table_name).to_frame() # find the requested checkpoint - checkpoint = \ - next((x for x in _PIPELINE.checkpoints if x['checkpoint_name'] == checkpoint_name), None) + checkpoint = next( + (x for x in _PIPELINE.checkpoints if x["checkpoint_name"] == checkpoint_name), + None, + ) if checkpoint is None: raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name) @@ -682,7 +695,9 @@ def get_table(table_name, checkpoint_name=None): last_checkpoint_name = checkpoint.get(table_name, None) if not last_checkpoint_name: - raise RuntimeError("table '%s' not in checkpoint '%s'." % (table_name, checkpoint_name)) + raise RuntimeError( + "table '%s' not in checkpoint '%s'." % (table_name, checkpoint_name) + ) # if this version of table is same as current if _PIPELINE.last_checkpoint.get(table_name, None) == last_checkpoint_name: @@ -708,7 +723,9 @@ def get_checkpoints(): if store is not None: df = store[CHECKPOINT_TABLE_NAME] else: - pipeline_file_path = config.pipeline_file_path(orca.get_injectable('pipeline_file_name')) + pipeline_file_path = config.pipeline_file_path( + orca.get_injectable("pipeline_file_name") + ) df = pd.read_hdf(pipeline_file_path, CHECKPOINT_TABLE_NAME) # non-table columns first (column order in df is random because created from a dict) @@ -742,11 +759,15 @@ def replace_table(table_name, df): assert is_open(), f"Pipeline is not open." if df.columns.duplicated().any(): - logger.error("replace_table: dataframe '%s' has duplicate columns: %s" % - (table_name, df.columns[df.columns.duplicated()])) + logger.error( + "replace_table: dataframe '%s' has duplicate columns: %s" + % (table_name, df.columns[df.columns.duplicated()]) + ) - raise RuntimeError("replace_table: dataframe '%s' has duplicate columns: %s" % - (table_name, df.columns[df.columns.duplicated()])) + raise RuntimeError( + "replace_table: dataframe '%s' has duplicate columns: %s" + % (table_name, df.columns[df.columns.duplicated()]) + ) rewrap(table_name, df) @@ -775,8 +796,11 @@ def extend_table(table_name, df, axis=0): if axis == 0: # don't expect indexes to overlap assert len(table_df.index.intersection(df.index)) == 0 - missing_df_str_columns = [c for c in table_df.columns - if c not in df.columns and table_df[c].dtype == 'O'] + missing_df_str_columns = [ + c + for c in table_df.columns + if c not in df.columns and table_df[c].dtype == "O" + ] else: # expect indexes be same assert table_df.index.equals(df.index) @@ -789,7 +813,7 @@ def extend_table(table_name, df, axis=0): # backfill missing df columns that were str (object) type in table_df if axis == 0: for c in missing_df_str_columns: - df[c] = df[c].fillna('') + df[c] = df[c].fillna("") replace_table(table_name, df) @@ -824,7 +848,7 @@ def drop_table(table_name): logger.debug("drop_table removing table %s from last_checkpoint" % table_name) - _PIPELINE.last_checkpoint[table_name] = '' + _PIPELINE.last_checkpoint[table_name] = "" def is_table(table_name): @@ -847,30 +871,34 @@ def cleanup_pipeline(): """ # we don't expect to be called unless cleanup_pipeline_after_run setting is True - assert config.setting('cleanup_pipeline_after_run', False) + assert config.setting("cleanup_pipeline_after_run", False) if not is_open(): - open_pipeline('_') + open_pipeline("_") assert is_open(), f"Pipeline is not open." - FINAL_PIPELINE_FILE_NAME = f"final_{inject.get_injectable('pipeline_file_name', 'pipeline')}" - FINAL_CHECKPOINT_NAME = 'final' + FINAL_PIPELINE_FILE_NAME = ( + f"final_{inject.get_injectable('pipeline_file_name', 'pipeline')}" + ) + FINAL_CHECKPOINT_NAME = "final" final_pipeline_file_path = config.build_output_file_path(FINAL_PIPELINE_FILE_NAME) # keep only the last row of checkpoints and patch the last checkpoint name checkpoints_df = get_checkpoints().tail(1).copy() - checkpoints_df['checkpoint_name'] = FINAL_CHECKPOINT_NAME + checkpoints_df["checkpoint_name"] = FINAL_CHECKPOINT_NAME - with pd.HDFStore(final_pipeline_file_path, mode='w') as final_pipeline_store: + with pd.HDFStore(final_pipeline_file_path, mode="w") as final_pipeline_store: for table_name in checkpointed_tables(): # patch last checkpoint name for all tables checkpoints_df[table_name] = FINAL_CHECKPOINT_NAME table_df = get_table(table_name) - logger.debug(f"cleanup_pipeline - adding table {table_name} {table_df.shape}") + logger.debug( + f"cleanup_pipeline - adding table {table_name} {table_df.shape}" + ) final_pipeline_store[table_name] = table_df @@ -879,4 +907,4 @@ def cleanup_pipeline(): close_pipeline() logger.debug(f"deleting all pipeline files except {final_pipeline_file_path}") - tracing.delete_output_files('h5', ignore=[final_pipeline_file_path]) + tracing.delete_output_files("h5", ignore=[final_pipeline_file_path]) diff --git a/activitysim/core/random.py b/activitysim/core/random.py index a9c9770340..12527840f4 100644 --- a/activitysim/core/random.py +++ b/activitysim/core/random.py @@ -1,24 +1,22 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range -from builtins import object - -import logging import hashlib +import logging +from builtins import object, range import numpy as np import pandas as pd + from activitysim.core.util import reindex from .tracing import print_elapsed_time - logger = logging.getLogger(__name__) # one more than 0xFFFFFFFF so we can wrap using: int64 % _MAX_SEED -_MAX_SEED = (1 << 32) -_SEED_MASK = 0xffffffff +_MAX_SEED = 1 << 32 +_SEED_MASK = 0xFFFFFFFF def hash32(s): @@ -32,7 +30,7 @@ def hash32(s): ------- 32 bit unsigned hash """ - s = s.encode('utf8') + s = s.encode("utf8") h = hashlib.md5(s).hexdigest() return int(h, base=16) & _SEED_MASK @@ -103,13 +101,12 @@ def init_row_states_for_step(self, row_states): if self.step_name and not row_states.empty: - row_states['row_seed'] = (self.base_seed + - self.channel_seed + - self.step_seed + - row_states.index) % _MAX_SEED + row_states["row_seed"] = ( + self.base_seed + self.channel_seed + self.step_seed + row_states.index + ) % _MAX_SEED # number of rands pulled this step - row_states['offset'] = 0 + row_states["offset"] = 0 return row_states @@ -128,10 +125,12 @@ def extend_domain(self, domain_df): """ if domain_df.empty: - logger.warning("extend_domain for channel %s for empty domain_df" % self.channel_name) + logger.warning( + "extend_domain for channel %s for empty domain_df" % self.channel_name + ) # dataframe to hold state for every df row - row_states = pd.DataFrame(columns=['row_seed', 'offset'], index=domain_df.index) + row_states = pd.DataFrame(columns=["row_seed", "offset"], index=domain_df.index) if self.step_name and not row_states.empty: self.init_row_states_for_step(row_states) @@ -170,8 +169,8 @@ def end_step(self, step_name): self.step_name = None self.step_seed = None - self.row_states['offset'] = 0 - self.row_states['row_seed'] = 0 + self.row_states["offset"] = 0 + self.row_states["row_seed"] = 0 def _generators_for_df(self, df): """ @@ -245,7 +244,7 @@ def random_for_df(self, df, step_name, n=1): rands = np.asanyarray([prng.rand(n) for prng in generators]) # update offset for rows we handled - self.row_states.loc[df.index, 'offset'] += n + self.row_states.loc[df.index, "offset"] += n return rands def normal_for_df(self, df, step_name, mu, sigma, lognormal=False): @@ -295,16 +294,22 @@ def to_series(x): sigma = to_series(sigma) if lognormal: - rands = \ - np.asanyarray([prng.lognormal(mean=mu[i], sigma=sigma[i]) - for i, prng in enumerate(generators)]) + rands = np.asanyarray( + [ + prng.lognormal(mean=mu[i], sigma=sigma[i]) + for i, prng in enumerate(generators) + ] + ) else: - rands = \ - np.asanyarray([prng.normal(loc=mu[i], scale=sigma[i]) - for i, prng in enumerate(generators)]) + rands = np.asanyarray( + [ + prng.normal(loc=mu[i], scale=sigma[i]) + for i, prng in enumerate(generators) + ] + ) # update offset for rows we handled - self.row_states.loc[df.index, 'offset'] += 1 + self.row_states.loc[df.index, "offset"] += 1 return rands @@ -350,20 +355,21 @@ def choice_for_df(self, df, step_name, a, size, replace): # initialize the generator iterator generators = self._generators_for_df(df) - sample = np.concatenate(tuple(prng.choice(a, size, replace) for prng in generators)) + sample = np.concatenate( + tuple(prng.choice(a, size, replace) for prng in generators) + ) if not self.multi_choice_offset: # FIXME - if replace, should we estimate rands_consumed? if replace: logger.warning("choice_for_df MULTI_CHOICE_FF with replace") # update offset for rows we handled - self.row_states.loc[df.index, 'offset'] += size + self.row_states.loc[df.index, "offset"] += size return sample class Random(object): - def __init__(self): self.channels = {} @@ -462,20 +468,22 @@ def add_channel(self, channel_name, domain_df): if channel_name in self.channels: assert channel_name == self.index_to_channel[domain_df.index.name] - logger.debug("Random: extending channel '%s' %s ids" % - (channel_name, len(domain_df.index))) + logger.debug( + "Random: extending channel '%s' %s ids" + % (channel_name, len(domain_df.index)) + ) channel = self.channels[channel_name] channel.extend_domain(domain_df) else: - logger.debug("Adding channel '%s' %s ids" % (channel_name, len(domain_df.index))) + logger.debug( + "Adding channel '%s' %s ids" % (channel_name, len(domain_df.index)) + ) - channel = SimpleChannel(channel_name, - self.base_seed, - domain_df, - self.step_name - ) + channel = SimpleChannel( + channel_name, self.base_seed, domain_df, self.step_name + ) self.channels[channel_name] = channel self.index_to_channel[domain_df.index.name] = channel_name @@ -490,10 +498,12 @@ def drop_channel(self, channel_name): """ if channel_name in self.channels: - logger.debug("Dropping channel '%s'" % (channel_name, )) + logger.debug("Dropping channel '%s'" % (channel_name,)) del self.channels[channel_name] else: - logger.error("drop_channel called with unknown channel '%s'" % (channel_name,)) + logger.error( + "drop_channel called with unknown channel '%s'" % (channel_name,) + ) def set_base_seed(self, seed=None): """ @@ -640,11 +650,15 @@ def normal_for_df(self, df, mu=0, sigma=1, broadcast=False): if broadcast: alts_df = df df = df.index.unique().to_series() - rands = channel.normal_for_df(df, self.step_name, mu=0, sigma=1, lognormal=False) + rands = channel.normal_for_df( + df, self.step_name, mu=0, sigma=1, lognormal=False + ) rands = reindex(pd.Series(rands, index=df.index), alts_df.index) - rands = rands*sigma + mu + rands = rands * sigma + mu else: - rands = channel.normal_for_df(df, self.step_name, mu, sigma, lognormal=False) + rands = channel.normal_for_df( + df, self.step_name, mu, sigma, lognormal=False + ) return rands @@ -703,7 +717,9 @@ def lognormal_for_df(self, df, mu, sigma, broadcast=False, scale=False): rands = np.exp(rands) else: channel = self.get_channel_for_df(df) - rands = channel.normal_for_df(df, self.step_name, mu=mu, sigma=sigma, lognormal=True) + rands = channel.normal_for_df( + df, self.step_name, mu=mu, sigma=sigma, lognormal=True + ) return rands @@ -746,11 +762,15 @@ def choice_for_df(self, df, a, size, replace): # FIXME - for tests if not self.channels: rng = np.random.RandomState(0) - choices = np.concatenate(tuple(rng.choice(a, size, replace) for _ in range(len(df)))) + choices = np.concatenate( + tuple(rng.choice(a, size, replace) for _ in range(len(df))) + ) return choices t0 = print_elapsed_time() channel = self.get_channel_for_df(df) choices = channel.choice_for_df(df, self.step_name, a, size, replace) - t0 = print_elapsed_time("choice_for_df for %s rows" % len(df.index), t0, debug=True) + t0 = print_elapsed_time( + "choice_for_df for %s rows" % len(df.index), t0, debug=True + ) return choices diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index 5d627f1ed6..4e138149a5 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -1,30 +1,21 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range - -import warnings import logging +import warnings +from builtins import range from collections import OrderedDict import numpy as np import pandas as pd -from . import logit -from . import tracing -from . import pipeline -from . import config -from . import util -from . import assign -from . import chunk - -from . import pathbuilder +from . import assign, chunk, config, logit, pathbuilder, pipeline, tracing, util logger = logging.getLogger(__name__) -SPEC_DESCRIPTION_NAME = 'Description' -SPEC_EXPRESSION_NAME = 'Expression' -SPEC_LABEL_NAME = 'Label' +SPEC_DESCRIPTION_NAME = "Description" +SPEC_EXPRESSION_NAME = "Expression" +SPEC_LABEL_NAME = "Label" ALT_LOSER_UTIL = -900 @@ -58,7 +49,7 @@ def uniquify_spec_index(spec): def read_model_alts(file_name, set_index=None): file_path = config.config_file_path(file_name) - df = pd.read_csv(file_path, comment='#') + df = pd.read_csv(file_path, comment="#") if set_index: df.set_index(set_index, inplace=True) return df @@ -97,17 +88,17 @@ def read_model_spec(file_name): """ assert isinstance(file_name, str) - if not file_name.lower().endswith('.csv'): - file_name = '%s.csv' % (file_name,) + if not file_name.lower().endswith(".csv"): + file_name = "%s.csv" % (file_name,) file_path = config.config_file_path(file_name) try: - spec = pd.read_csv(file_path, comment='#') + spec = pd.read_csv(file_path, comment="#") except Exception as err: logger.error(f"read_model_spec error reading {file_path}") logger.error(f"read_model_spec error {type(err).__name__}: {str(err)}") - raise(err) + raise (err) spec = spec.dropna(subset=[SPEC_EXPRESSION_NAME]) @@ -137,26 +128,31 @@ def read_model_coefficients(model_settings=None, file_name=None): assert file_name is not None else: assert file_name is None - assert 'COEFFICIENTS' in model_settings, \ - "'COEFFICIENTS' tag not in model_settings in %s" % model_settings.get('source_file_paths') - file_name = model_settings['COEFFICIENTS'] + assert "COEFFICIENTS" in model_settings, ( + "'COEFFICIENTS' tag not in model_settings in %s" + % model_settings.get("source_file_paths") + ) + file_name = model_settings["COEFFICIENTS"] logger.debug(f"read_model_coefficients file_name {file_name}") file_path = config.config_file_path(file_name) try: - coefficients = pd.read_csv(file_path, comment='#', index_col='coefficient_name') + coefficients = pd.read_csv(file_path, comment="#", index_col="coefficient_name") except ValueError: logger.exception("Coefficient File Invalid: %s" % str(file_path)) raise if coefficients.index.duplicated().any(): - logger.warning(f"duplicate coefficients in {file_path}\n" - f"{coefficients[coefficients.index.duplicated(keep=False)]}") + logger.warning( + f"duplicate coefficients in {file_path}\n" + f"{coefficients[coefficients.index.duplicated(keep=False)]}" + ) raise RuntimeError(f"duplicate coefficients in {file_path}") if coefficients.value.isnull().any(): logger.warning( - f"null coefficients in {file_path}\n{coefficients[coefficients.value.isnull()]}") + f"null coefficients in {file_path}\n{coefficients[coefficients.value.isnull()]}" + ) raise RuntimeError(f"null coefficients in {file_path}") return coefficients @@ -188,15 +184,19 @@ def spec_for_segment(model_settings, spec_id, segment_name, estimator): else: # otherwise we expect a single coefficient column # doesn't really matter what it is called, but this may catch errors - assert spec.columns[0] in ['coefficient', segment_name] + assert spec.columns[0] in ["coefficient", segment_name] - if 'COEFFICIENTS' not in model_settings: - logger.warning(f"no coefficient file specified in model_settings for {spec_file_name}") + if "COEFFICIENTS" not in model_settings: + logger.warning( + f"no coefficient file specified in model_settings for {spec_file_name}" + ) try: assert (spec.astype(float) == spec).all(axis=None) except (ValueError, AssertionError): - raise RuntimeError(f"No coefficient file specified for {spec_file_name} " - f"but not all spec column values are numeric") + raise RuntimeError( + f"No coefficient file specified for {spec_file_name} " + f"but not all spec column values are numeric" + ) return spec @@ -212,14 +212,16 @@ def read_model_coefficient_template(model_settings): Read the coefficient template specified by COEFFICIENT_TEMPLATE model setting """ - assert 'COEFFICIENT_TEMPLATE' in model_settings, \ - "'COEFFICIENT_TEMPLATE' not in model_settings in %s" % model_settings.get('source_file_paths') + assert "COEFFICIENT_TEMPLATE" in model_settings, ( + "'COEFFICIENT_TEMPLATE' not in model_settings in %s" + % model_settings.get("source_file_paths") + ) - coefficients_file_name = model_settings['COEFFICIENT_TEMPLATE'] + coefficients_file_name = model_settings["COEFFICIENT_TEMPLATE"] file_path = config.config_file_path(coefficients_file_name) try: - template = pd.read_csv(file_path, comment='#', index_col='coefficient_name') + template = pd.read_csv(file_path, comment="#", index_col="coefficient_name") except ValueError: logger.exception("Coefficient Template File Invalid: %s" % str(file_path)) raise @@ -236,7 +238,9 @@ def read_model_coefficient_template(model_settings): if template.index.duplicated().any(): dupes = template[template.index.duplicated(keep=False)].sort_index() - logger.warning(f"duplicate coefficient names in {coefficients_file_name}:\n{dupes}") + logger.warning( + f"duplicate coefficient names in {coefficients_file_name}:\n{dupes}" + ) assert not template.index.duplicated().any() return template @@ -253,12 +257,12 @@ def dump_mapped_coefficients(model_settings): for c in template_df.columns: template_df[c] = template_df[c].map(coefficients_df.value) - coefficients_template_file_name = model_settings['COEFFICIENT_TEMPLATE'] + coefficients_template_file_name = model_settings["COEFFICIENT_TEMPLATE"] file_path = config.output_file_path(coefficients_template_file_name) template_df.to_csv(file_path, index=True) logger.info(f"wrote mapped coefficient template to {file_path}") - coefficients_file_name = model_settings['COEFFICIENTS'] + coefficients_file_name = model_settings["COEFFICIENTS"] file_path = config.output_file_path(coefficients_file_name) coefficients_df.to_csv(file_path, index=True) logger.info(f"wrote raw coefficients to {file_path}") @@ -296,33 +300,49 @@ def get_segment_coefficients(model_settings, segment_name): """ - if 'COEFFICIENTS' in model_settings and 'COEFFICIENT_TEMPLATE' in model_settings: + if "COEFFICIENTS" in model_settings and "COEFFICIENT_TEMPLATE" in model_settings: legacy = False - elif 'COEFFICIENTS' in model_settings: - legacy = 'COEFFICIENTS' - warnings.warn("Support for COEFFICIENTS without COEFFICIENT_TEMPLATE in model settings file will be removed." - "Use COEFFICIENT and COEFFICIENT_TEMPLATE to support estimation.", FutureWarning) - elif 'LEGACY_COEFFICIENTS' in model_settings: - legacy = 'LEGACY_COEFFICIENTS' - warnings.warn("Support for 'LEGACY_COEFFICIENTS' setting in model settings file will be removed." - "Use COEFFICIENT and COEFFICIENT_TEMPLATE to support estimation.", FutureWarning) + elif "COEFFICIENTS" in model_settings: + legacy = "COEFFICIENTS" + warnings.warn( + "Support for COEFFICIENTS without COEFFICIENT_TEMPLATE in model settings file will be removed." + "Use COEFFICIENT and COEFFICIENT_TEMPLATE to support estimation.", + FutureWarning, + ) + elif "LEGACY_COEFFICIENTS" in model_settings: + legacy = "LEGACY_COEFFICIENTS" + warnings.warn( + "Support for 'LEGACY_COEFFICIENTS' setting in model settings file will be removed." + "Use COEFFICIENT and COEFFICIENT_TEMPLATE to support estimation.", + FutureWarning, + ) else: raise RuntimeError(f"No COEFFICIENTS setting in model_settings") if legacy: constants = config.get_model_constants(model_settings) - legacy_coeffs_file_path = config.config_file_path(model_settings['LEGACY_COEFFICIENTS']) - omnibus_coefficients = pd.read_csv(legacy_coeffs_file_path, comment='#', index_col='coefficient_name') - coefficients_dict = assign.evaluate_constants(omnibus_coefficients[segment_name], constants=constants) + legacy_coeffs_file_path = config.config_file_path( + model_settings["LEGACY_COEFFICIENTS"] + ) + omnibus_coefficients = pd.read_csv( + legacy_coeffs_file_path, comment="#", index_col="coefficient_name" + ) + coefficients_dict = assign.evaluate_constants( + omnibus_coefficients[segment_name], constants=constants + ) else: coefficients_df = read_model_coefficients(model_settings) template_df = read_model_coefficient_template(model_settings) - coefficients_col = template_df[segment_name].map(coefficients_df.value).astype(float) + coefficients_col = ( + template_df[segment_name].map(coefficients_df.value).astype(float) + ) if coefficients_col.isnull().any(): # show them the offending lines from interaction_coefficients_file - logger.warning(f"bad coefficients in COEFFICIENTS {model_settings['COEFFICIENTS']}\n" - f"{coefficients_col[coefficients_col.isnull()]}") + logger.warning( + f"bad coefficients in COEFFICIENTS {model_settings['COEFFICIENTS']}\n" + f"{coefficients_col[coefficients_col.isnull()]}" + ) assert not coefficients_col.isnull().any() coefficients_dict = coefficients_col.to_dict() @@ -331,24 +351,25 @@ def get_segment_coefficients(model_settings, segment_name): def eval_nest_coefficients(nest_spec, coefficients, trace_label): - def replace_coefficients(nest): if isinstance(nest, dict): - assert 'coefficient' in nest - coefficient_name = nest['coefficient'] + assert "coefficient" in nest + coefficient_name = nest["coefficient"] if isinstance(coefficient_name, str): - assert coefficient_name in coefficients, "%s not in nest coefficients" % (coefficient_name, ) - nest['coefficient'] = coefficients[coefficient_name] + assert ( + coefficient_name in coefficients + ), "%s not in nest coefficients" % (coefficient_name,) + nest["coefficient"] = coefficients[coefficient_name] - assert 'alternatives' in nest - for alternative in nest['alternatives']: + assert "alternatives" in nest + for alternative in nest["alternatives"]: if isinstance(alternative, dict): replace_coefficients(alternative) if isinstance(coefficients, pd.DataFrame): - assert ('value' in coefficients.columns) - coefficients = coefficients['value'].to_dict() + assert "value" in coefficients.columns + coefficients = coefficients["value"].to_dict() replace_coefficients(nest_spec) @@ -362,16 +383,19 @@ def eval_coefficients(spec, coefficients, estimator): spec = spec.copy() # don't clobber input spec if isinstance(coefficients, pd.DataFrame): - assert ('value' in coefficients.columns) - coefficients = coefficients['value'].to_dict() + assert "value" in coefficients.columns + coefficients = coefficients["value"].to_dict() - assert isinstance(coefficients, dict), \ - "eval_coefficients doesn't grok type of coefficients: %s" % (type(coefficients)) + assert isinstance( + coefficients, dict + ), "eval_coefficients doesn't grok type of coefficients: %s" % (type(coefficients)) for c in spec.columns: if c == SPEC_LABEL_NAME: continue - spec[c] = spec[c].apply(lambda x: eval(str(x), {}, coefficients)).astype(np.float32) + spec[c] = ( + spec[c].apply(lambda x: eval(str(x), {}, coefficients)).astype(np.float32) + ) # drop any rows with all zeros since they won't have any effect (0 marginal utility) # (do not drop rows in estimation mode as it may confuse the estimation package (e.g. larch) @@ -380,15 +404,23 @@ def eval_coefficients(spec, coefficients, estimator): if estimator: logger.debug("keeping %s all-zero rows in SPEC" % (zero_rows.sum(),)) else: - logger.debug("dropping %s all-zero rows from SPEC" % (zero_rows.sum(), )) + logger.debug("dropping %s all-zero rows from SPEC" % (zero_rows.sum(),)) spec = spec.loc[~zero_rows] return spec -def eval_utilities(spec, choosers, locals_d=None, trace_label=None, - have_trace_targets=False, trace_all_rows=False, - estimator=None, trace_column_names=None, log_alt_losers=False): +def eval_utilities( + spec, + choosers, + locals_d=None, + trace_label=None, + have_trace_targets=False, + trace_all_rows=False, + estimator=None, + trace_column_names=None, + log_alt_losers=False, +): """ Parameters @@ -416,7 +448,7 @@ def eval_utilities(spec, choosers, locals_d=None, trace_label=None, # fixme - restore tracing and _check_for_variability - trace_label = tracing.extend_trace_label(trace_label, 'eval_utils') + trace_label = tracing.extend_trace_label(trace_label, "eval_utils") # avoid altering caller's passed-in locals_d parameter (they may be looping) locals_dict = assign.local_utilities() @@ -425,7 +457,7 @@ def eval_utilities(spec, choosers, locals_d=None, trace_label=None, locals_dict.update(locals_d) globals_dict = {} - locals_dict['df'] = choosers + locals_dict["df"] = choosers # - eval spec expressions if isinstance(spec.index, pd.MultiIndex): @@ -444,17 +476,21 @@ def eval_utilities(spec, choosers, locals_d=None, trace_label=None, with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. warnings.simplefilter("always") - if expr.startswith('@'): + if expr.startswith("@"): expression_value = eval(expr[1:], globals_dict, locals_dict) else: expression_value = choosers.eval(expr) if len(w) > 0: for wrn in w: - logger.warning(f"{trace_label} - {type(wrn).__name__} ({wrn.message}) evaluating: {str(expr)}") + logger.warning( + f"{trace_label} - {type(wrn).__name__} ({wrn.message}) evaluating: {str(expr)}" + ) except Exception as err: - logger.exception(f"{trace_label} - {type(err).__name__} ({str(err)}) evaluating: {str(expr)}") + logger.exception( + f"{trace_label} - {type(err).__name__} ({str(err)}) evaluating: {str(expr)}" + ) raise err if log_alt_losers: @@ -464,8 +500,10 @@ def eval_utilities(spec, choosers, locals_d=None, trace_label=None, losers = np.amax(utils, axis=1) < ALT_LOSER_UTIL if losers.any(): - logger.warning(f"{trace_label} - {sum(losers)} choosers of {len(losers)} " - f"with prohibitive utilities for all alternatives for expression: {expr}") + logger.warning( + f"{trace_label} - {sum(losers)} choosers of {len(losers)} " + f"with prohibitive utilities for all alternatives for expression: {expr}" + ) expression_values[i] = expression_value i += 1 @@ -476,7 +514,8 @@ def eval_utilities(spec, choosers, locals_d=None, trace_label=None, df = pd.DataFrame( data=expression_values.transpose(), index=choosers.index, - columns=spec.index.get_level_values(SPEC_LABEL_NAME)) + columns=spec.index.get_level_values(SPEC_LABEL_NAME), + ) df.index.name = choosers.index.name estimator.write_expression_values(df) @@ -508,19 +547,28 @@ def eval_utilities(spec, choosers, locals_d=None, trace_label=None, if trace_column_names is not None: if isinstance(trace_column_names, str): trace_column_names = [trace_column_names] - expression_values_df.columns = pd.MultiIndex.from_frame(choosers.loc[trace_targets, trace_column_names]) + expression_values_df.columns = pd.MultiIndex.from_frame( + choosers.loc[trace_targets, trace_column_names] + ) - tracing.trace_df(expression_values_df, tracing.extend_trace_label(trace_label, 'expression_values'), - slicer=None, transpose=False) + tracing.trace_df( + expression_values_df, + tracing.extend_trace_label(trace_label, "expression_values"), + slicer=None, + transpose=False, + ) if len(spec.columns) > 1: for c in spec.columns: - name = f'expression_value_{c}' + name = f"expression_value_{c}" - tracing.trace_df(expression_values_df.multiply(spec[c].values, axis=0), - tracing.extend_trace_label(trace_label, name), - slicer=None, transpose=False) + tracing.trace_df( + expression_values_df.multiply(spec[c].values, axis=0), + tracing.extend_trace_label(trace_label, name), + slicer=None, + transpose=False, + ) del expression_values chunk.log_df(trace_label, "expression_values", None) @@ -571,7 +619,7 @@ def eval_variables(exprs, df, locals_d=None): locals_dict.update(locals_d) globals_dict = {} - locals_dict['df'] = df + locals_dict["df"] = df def to_array(x): @@ -596,7 +644,7 @@ def to_array(x): values = OrderedDict() for expr in exprs: try: - if expr.startswith('@'): + if expr.startswith("@"): expr_values = to_array(eval(expr[1:], globals_dict, locals_dict)) else: expr_values = to_array(df.eval(expr)) @@ -605,7 +653,9 @@ def to_array(x): values[expr] = expr_values except Exception as err: - logger.exception(f"Variable evaluation failed {type(err).__name__} ({str(err)}) evaluating: {str(expr)}") + logger.exception( + f"Variable evaluation failed {type(err).__name__} ({str(err)}) evaluating: {str(expr)}" + ) raise err values = util.df_from_dict(values, index=df.index) @@ -658,9 +708,13 @@ def set_skim_wrapper_targets(df, skims): the skims object is intended to be used. """ - skims = skims if isinstance(skims, list) \ - else skims.values() if isinstance(skims, dict) \ + skims = ( + skims + if isinstance(skims, list) + else skims.values() + if isinstance(skims, dict) else [skims] + ) # assume any object in skims can be treated as a skim for skim in skims: @@ -682,7 +736,7 @@ def _check_for_variability(expression_values, trace_label): """ if trace_label is None: - trace_label = '_check_for_variability' + trace_label = "_check_for_variability" sample = random_rows(expression_values, min(1000, len(expression_values))) @@ -691,7 +745,9 @@ def _check_for_variability(expression_values, trace_label): v = sample.iloc[:, i] if v.min() == v.max(): col_name = sample.columns[i] - logger.info("%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], col_name)) + logger.info( + "%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], col_name) + ) no_variability += 1 # FIXME - how could this happen? Not sure it is really a problem? if np.count_nonzero(v.isnull().values) > 0: @@ -700,10 +756,14 @@ def _check_for_variability(expression_values, trace_label): has_missing_vals += 1 if no_variability > 0: - logger.warning("%s: %s columns have no variability" % (trace_label, no_variability)) + logger.warning( + "%s: %s columns have no variability" % (trace_label, no_variability) + ) if has_missing_vals > 0: - logger.warning("%s: %s columns have missing values" % (trace_label, has_missing_vals)) + logger.warning( + "%s: %s columns have missing values" % (trace_label, has_missing_vals) + ) def compute_nested_exp_utilities(raw_utilities, nest_spec): @@ -736,8 +796,9 @@ def compute_nested_exp_utilities(raw_utilities, nest_spec): if nest.is_leaf: # leaf_utility = raw_utility / nest.product_of_coefficients - nested_utilities[name] = \ + nested_utilities[name] = ( raw_utilities[name].astype(float) / nest.product_of_coefficients + ) else: # nest node @@ -745,9 +806,10 @@ def compute_nested_exp_utilities(raw_utilities, nest_spec): # this will RuntimeWarning: divide by zero encountered in log # if all nest alternative utilities are zero # but the resulting inf will become 0 when exp is applied below - with np.errstate(divide='ignore'): - nested_utilities[name] = \ - nest.coefficient * np.log(nested_utilities[nest.alternatives].sum(axis=1)) + with np.errstate(divide="ignore"): + nested_utilities[name] = nest.coefficient * np.log( + nested_utilities[nest.alternatives].sum(axis=1) + ) # exponentiate the utility nested_utilities[name] = np.exp(nested_utilities[name]) @@ -777,12 +839,14 @@ def compute_nested_probabilities(nested_exp_utilities, nest_spec, trace_label): nested_probabilities = pd.DataFrame(index=nested_exp_utilities.index) - for nest in logit.each_nest(nest_spec, type='node', post_order=False): + for nest in logit.each_nest(nest_spec, type="node", post_order=False): - probs = logit.utils_to_probs(nested_exp_utilities[nest.alternatives], - trace_label=trace_label, - exponentiated=True, - allow_zero_probs=True) + probs = logit.utils_to_probs( + nested_exp_utilities[nest.alternatives], + trace_label=trace_label, + exponentiated=True, + allow_zero_probs=True, + ) nested_probabilities = pd.concat([nested_probabilities, probs], axis=1) @@ -812,7 +876,7 @@ def compute_base_probabilities(nested_probabilities, nests, spec): base_probabilities = pd.DataFrame(index=nested_probabilities.index) - for nest in logit.each_nest(nests, type='leaf', post_order=False): + for nest in logit.each_nest(nests, type="leaf", post_order=False): # skip root: it has a prob of 1 but we didn't compute a nested probability column for it ancestors = nest.ancestors[1:] @@ -821,16 +885,24 @@ def compute_base_probabilities(nested_probabilities, nests, spec): # reorder alternative columns to match spec # since these are alternatives chosen by column index, order of columns matters - assert(set(base_probabilities.columns) == set(spec.columns)) + assert set(base_probabilities.columns) == set(spec.columns) base_probabilities = base_probabilities[spec.columns] return base_probabilities -def eval_mnl(choosers, spec, locals_d, custom_chooser, estimator, - log_alt_losers=False, - want_logsums=False, trace_label=None, - trace_choice_name=None, trace_column_names=None): +def eval_mnl( + choosers, + spec, + locals_d, + custom_chooser, + estimator, + log_alt_losers=False, + want_logsums=False, + trace_label=None, + trace_choice_name=None, + trace_column_names=None, +): """ Run a simulation for when the model spec does not involve alternative specific data, e.g. there are no interactions with alternative @@ -876,55 +948,79 @@ def eval_mnl(choosers, spec, locals_d, custom_chooser, estimator, # FIXME - not implemented because not currently needed assert not want_logsums - trace_label = tracing.extend_trace_label(trace_label, 'eval_mnl') + trace_label = tracing.extend_trace_label(trace_label, "eval_mnl") have_trace_targets = tracing.has_trace_targets(choosers) if have_trace_targets: - tracing.trace_df(choosers, '%s.choosers' % trace_label) - - utilities = eval_utilities(spec, choosers, locals_d, - log_alt_losers=log_alt_losers, - trace_label=trace_label, have_trace_targets=have_trace_targets, - estimator=estimator, trace_column_names=trace_column_names) + tracing.trace_df(choosers, "%s.choosers" % trace_label) + + utilities = eval_utilities( + spec, + choosers, + locals_d, + log_alt_losers=log_alt_losers, + trace_label=trace_label, + have_trace_targets=have_trace_targets, + estimator=estimator, + trace_column_names=trace_column_names, + ) chunk.log_df(trace_label, "utilities", utilities) if have_trace_targets: - tracing.trace_df(utilities, '%s.utilities' % trace_label, - column_labels=['alternative', 'utility']) - - probs = logit.utils_to_probs(utilities, trace_label=trace_label, trace_choosers=choosers) + tracing.trace_df( + utilities, + "%s.utilities" % trace_label, + column_labels=["alternative", "utility"], + ) + + probs = logit.utils_to_probs( + utilities, trace_label=trace_label, trace_choosers=choosers + ) chunk.log_df(trace_label, "probs", probs) del utilities - chunk.log_df(trace_label, 'utilities', None) + chunk.log_df(trace_label, "utilities", None) if have_trace_targets: # report these now in case make_choices throws error on bad_choices - tracing.trace_df(probs, '%s.probs' % trace_label, - column_labels=['alternative', 'probability']) + tracing.trace_df( + probs, + "%s.probs" % trace_label, + column_labels=["alternative", "probability"], + ) if custom_chooser: - choices, rands = custom_chooser(probs=probs, choosers=choosers, spec=spec, - trace_label=trace_label) + choices, rands = custom_chooser( + probs=probs, choosers=choosers, spec=spec, trace_label=trace_label + ) else: choices, rands = logit.make_choices(probs, trace_label=trace_label) del probs - chunk.log_df(trace_label, 'probs', None) + chunk.log_df(trace_label, "probs", None) if have_trace_targets: - tracing.trace_df(choices, '%s.choices' % trace_label, - columns=[None, trace_choice_name]) - tracing.trace_df(rands, '%s.rands' % trace_label, - columns=[None, 'rand']) + tracing.trace_df( + choices, "%s.choices" % trace_label, columns=[None, trace_choice_name] + ) + tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) return choices -def eval_nl(choosers, spec, nest_spec, locals_d, custom_chooser, estimator, - log_alt_losers=False, - want_logsums=False, trace_label=None, - trace_choice_name=None, trace_column_names=None): +def eval_nl( + choosers, + spec, + nest_spec, + locals_d, + custom_chooser, + estimator, + log_alt_losers=False, + want_logsums=False, + trace_label=None, + trace_choice_name=None, + trace_column_names=None, +): """ Run a nested-logit simulation for when the model spec does not involve alternative specific data, e.g. there are no interactions with alternative @@ -962,39 +1058,52 @@ def eval_nl(choosers, spec, nest_spec, locals_d, custom_chooser, estimator, of `spec`. """ - trace_label = tracing.extend_trace_label(trace_label, 'eval_nl') + trace_label = tracing.extend_trace_label(trace_label, "eval_nl") assert trace_label have_trace_targets = tracing.has_trace_targets(choosers) logit.validate_nest_spec(nest_spec, trace_label) if have_trace_targets: - tracing.trace_df(choosers, '%s.choosers' % trace_label) - - raw_utilities = eval_utilities(spec, choosers, locals_d, - log_alt_losers=log_alt_losers, - trace_label=trace_label, have_trace_targets=have_trace_targets, - estimator=estimator, trace_column_names=trace_column_names) + tracing.trace_df(choosers, "%s.choosers" % trace_label) + + raw_utilities = eval_utilities( + spec, + choosers, + locals_d, + log_alt_losers=log_alt_losers, + trace_label=trace_label, + have_trace_targets=have_trace_targets, + estimator=estimator, + trace_column_names=trace_column_names, + ) chunk.log_df(trace_label, "raw_utilities", raw_utilities) if have_trace_targets: - tracing.trace_df(raw_utilities, '%s.raw_utilities' % trace_label, - column_labels=['alternative', 'utility']) + tracing.trace_df( + raw_utilities, + "%s.raw_utilities" % trace_label, + column_labels=["alternative", "utility"], + ) # exponentiated utilities of leaves and nests nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec) chunk.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) del raw_utilities - chunk.log_df(trace_label, 'raw_utilities', None) + chunk.log_df(trace_label, "raw_utilities", None) if have_trace_targets: - tracing.trace_df(nested_exp_utilities, '%s.nested_exp_utilities' % trace_label, - column_labels=['alternative', 'utility']) + tracing.trace_df( + nested_exp_utilities, + "%s.nested_exp_utilities" % trace_label, + column_labels=["alternative", "utility"], + ) # probabilities of alternatives relative to siblings sharing the same nest - nested_probabilities = \ - compute_nested_probabilities(nested_exp_utilities, nest_spec, trace_label=trace_label) + nested_probabilities = compute_nested_probabilities( + nested_exp_utilities, nest_spec, trace_label=trace_label + ) chunk.log_df(trace_label, "nested_probabilities", nested_probabilities) if want_logsums: @@ -1003,22 +1112,30 @@ def eval_nl(choosers, spec, nest_spec, locals_d, custom_chooser, estimator, chunk.log_df(trace_label, "logsums", logsums) del nested_exp_utilities - chunk.log_df(trace_label, 'nested_exp_utilities', None) + chunk.log_df(trace_label, "nested_exp_utilities", None) if have_trace_targets: - tracing.trace_df(nested_probabilities, '%s.nested_probabilities' % trace_label, - column_labels=['alternative', 'probability']) + tracing.trace_df( + nested_probabilities, + "%s.nested_probabilities" % trace_label, + column_labels=["alternative", "probability"], + ) # global (flattened) leaf probabilities based on relative nest coefficients (in spec order) - base_probabilities = compute_base_probabilities(nested_probabilities, nest_spec, spec) + base_probabilities = compute_base_probabilities( + nested_probabilities, nest_spec, spec + ) chunk.log_df(trace_label, "base_probabilities", base_probabilities) del nested_probabilities - chunk.log_df(trace_label, 'nested_probabilities', None) + chunk.log_df(trace_label, "nested_probabilities", None) if have_trace_targets: - tracing.trace_df(base_probabilities, '%s.base_probabilities' % trace_label, - column_labels=['alternative', 'probability']) + tracing.trace_df( + base_probabilities, + "%s.base_probabilities" % trace_label, + column_labels=["alternative", "probability"], + ) # note base_probabilities could all be zero since we allowed all probs for nests to be zero # check here to print a clear message but make_choices will raise error if probs don't sum to 1 @@ -1028,43 +1145,57 @@ def eval_nl(choosers, spec, nest_spec, locals_d, custom_chooser, estimator, if no_choices.any(): logit.report_bad_choices( - no_choices, base_probabilities, - trace_label=tracing.extend_trace_label(trace_label, 'bad_probs'), + no_choices, + base_probabilities, + trace_label=tracing.extend_trace_label(trace_label, "bad_probs"), trace_choosers=choosers, - msg="base_probabilities do not sum to one") + msg="base_probabilities do not sum to one", + ) if custom_chooser: - choices, rands = custom_chooser(probs=base_probabilities, choosers=choosers, spec=spec, - trace_label=trace_label) + choices, rands = custom_chooser( + probs=base_probabilities, + choosers=choosers, + spec=spec, + trace_label=trace_label, + ) else: choices, rands = logit.make_choices(base_probabilities, trace_label=trace_label) del base_probabilities - chunk.log_df(trace_label, 'base_probabilities', None) + chunk.log_df(trace_label, "base_probabilities", None) if have_trace_targets: - tracing.trace_df(choices, '%s.choices' % trace_label, - columns=[None, trace_choice_name]) - tracing.trace_df(rands, '%s.rands' % trace_label, - columns=[None, 'rand']) + tracing.trace_df( + choices, "%s.choices" % trace_label, columns=[None, trace_choice_name] + ) + tracing.trace_df(rands, "%s.rands" % trace_label, columns=[None, "rand"]) if want_logsums: - tracing.trace_df(logsums, '%s.logsums' % trace_label, - columns=[None, 'logsum']) + tracing.trace_df( + logsums, "%s.logsums" % trace_label, columns=[None, "logsum"] + ) if want_logsums: - choices = choices.to_frame('choice') - choices['logsum'] = logsums + choices = choices.to_frame("choice") + choices["logsum"] = logsums return choices -def _simple_simulate(choosers, spec, nest_spec, skims=None, locals_d=None, - custom_chooser=None, - log_alt_losers=False, - want_logsums=False, - estimator=None, - trace_label=None, trace_choice_name=None, trace_column_names=None, - ): +def _simple_simulate( + choosers, + spec, + nest_spec, + skims=None, + locals_d=None, + custom_chooser=None, + log_alt_losers=False, + want_logsums=False, + estimator=None, + trace_label=None, + trace_choice_name=None, + trace_column_names=None, +): """ Run an MNL or NL simulation for when the model spec does not involve alternative specific data, e.g. there are no interactions with alternative @@ -1114,59 +1245,90 @@ def _simple_simulate(choosers, spec, nest_spec, skims=None, locals_d=None, set_skim_wrapper_targets(choosers, skims) if nest_spec is None: - choices = eval_mnl(choosers, spec, locals_d, custom_chooser, - log_alt_losers=log_alt_losers, - want_logsums=want_logsums, - estimator=estimator, - trace_label=trace_label, - trace_choice_name=trace_choice_name, trace_column_names=trace_column_names) + choices = eval_mnl( + choosers, + spec, + locals_d, + custom_chooser, + log_alt_losers=log_alt_losers, + want_logsums=want_logsums, + estimator=estimator, + trace_label=trace_label, + trace_choice_name=trace_choice_name, + trace_column_names=trace_column_names, + ) else: - choices = eval_nl(choosers, spec, nest_spec, locals_d, custom_chooser, - log_alt_losers=log_alt_losers, - want_logsums=want_logsums, - estimator=estimator, - trace_label=trace_label, - trace_choice_name=trace_choice_name, trace_column_names=trace_column_names) + choices = eval_nl( + choosers, + spec, + nest_spec, + locals_d, + custom_chooser, + log_alt_losers=log_alt_losers, + want_logsums=want_logsums, + estimator=estimator, + trace_label=trace_label, + trace_choice_name=trace_choice_name, + trace_column_names=trace_column_names, + ) return choices def tvpb_skims(skims): - def list_of_skims(skims): - return \ - skims if isinstance(skims, list) \ - else skims.values() if isinstance(skims, dict) \ - else [skims] if skims is not None \ + return ( + skims + if isinstance(skims, list) + else skims.values() + if isinstance(skims, dict) + else [skims] + if skims is not None else [] - - return [skim for skim in list_of_skims(skims) if isinstance(skim, pathbuilder.TransitVirtualPathLogsumWrapper)] - - -def simple_simulate(choosers, spec, nest_spec, - skims=None, locals_d=None, - chunk_size=0, custom_chooser=None, - log_alt_losers=False, - want_logsums=False, - estimator=None, - trace_label=None, trace_choice_name=None, trace_column_names=None): + ) + + return [ + skim + for skim in list_of_skims(skims) + if isinstance(skim, pathbuilder.TransitVirtualPathLogsumWrapper) + ] + + +def simple_simulate( + choosers, + spec, + nest_spec, + skims=None, + locals_d=None, + chunk_size=0, + custom_chooser=None, + log_alt_losers=False, + want_logsums=False, + estimator=None, + trace_label=None, + trace_choice_name=None, + trace_column_names=None, +): """ Run an MNL or NL simulation for when the model spec does not involve alternative specific data, e.g. there are no interactions with alternative properties and no need to sample from alternatives. """ - trace_label = tracing.extend_trace_label(trace_label, 'simple_simulate') + trace_label = tracing.extend_trace_label(trace_label, "simple_simulate") assert len(choosers) > 0 result_list = [] # segment by person type and pick the right spec for each person type - for i, chooser_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers(choosers, chunk_size, trace_label): + for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + choosers, chunk_size, trace_label + ): choices = _simple_simulate( - chooser_chunk, spec, nest_spec, + chooser_chunk, + spec, + nest_spec, skims=skims, locals_d=locals_d, custom_chooser=custom_chooser, @@ -1175,11 +1337,12 @@ def simple_simulate(choosers, spec, nest_spec, estimator=estimator, trace_label=chunk_trace_label, trace_choice_name=trace_choice_name, - trace_column_names=trace_column_names) + trace_column_names=trace_column_names, + ) result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) if len(result_list) > 1: choices = pd.concat(result_list) @@ -1189,24 +1352,35 @@ def simple_simulate(choosers, spec, nest_spec, return choices -def simple_simulate_by_chunk_id(choosers, spec, nest_spec, - skims=None, locals_d=None, - chunk_size=0, custom_chooser=None, - log_alt_losers=False, - want_logsums=False, - estimator=None, - trace_label=None, - trace_choice_name=None): +def simple_simulate_by_chunk_id( + choosers, + spec, + nest_spec, + skims=None, + locals_d=None, + chunk_size=0, + custom_chooser=None, + log_alt_losers=False, + want_logsums=False, + estimator=None, + trace_label=None, + trace_choice_name=None, +): """ chunk_by_chunk_id wrapper for simple_simulate """ result_list = [] - for i, chooser_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers_by_chunk_id(choosers, chunk_size, trace_label): + for ( + i, + chooser_chunk, + chunk_trace_label, + ) in chunk.adaptive_chunked_choosers_by_chunk_id(choosers, chunk_size, trace_label): choices = _simple_simulate( - chooser_chunk, spec, nest_spec, + chooser_chunk, + spec, + nest_spec, skims=skims, locals_d=locals_d, custom_chooser=custom_chooser, @@ -1214,11 +1388,12 @@ def simple_simulate_by_chunk_id(choosers, spec, nest_spec, want_logsums=want_logsums, estimator=estimator, trace_label=chunk_trace_label, - trace_choice_name=trace_choice_name) + trace_choice_name=trace_choice_name, + ) result_list.append(choices) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) if len(result_list) > 1: choices = pd.concat(result_list) @@ -1238,21 +1413,26 @@ def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): # FIXME - untested and not currently used by any models... - trace_label = tracing.extend_trace_label(trace_label, 'eval_mnl_logsums') + trace_label = tracing.extend_trace_label(trace_label, "eval_mnl_logsums") have_trace_targets = tracing.has_trace_targets(choosers) logger.debug("running eval_mnl_logsums") # trace choosers if have_trace_targets: - tracing.trace_df(choosers, '%s.choosers' % trace_label) + tracing.trace_df(choosers, "%s.choosers" % trace_label) - utilities = eval_utilities(spec, choosers, locals_d, trace_label, have_trace_targets) + utilities = eval_utilities( + spec, choosers, locals_d, trace_label, have_trace_targets + ) chunk.log_df(trace_label, "utilities", utilities) if have_trace_targets: - tracing.trace_df(utilities, '%s.raw_utilities' % trace_label, - column_labels=['alternative', 'utility']) + tracing.trace_df( + utilities, + "%s.raw_utilities" % trace_label, + column_labels=["alternative", "utility"], + ) # - logsums # logsum is log of exponentiated utilities summed across columns of each chooser row @@ -1262,8 +1442,9 @@ def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None): # trace utilities if have_trace_targets: - tracing.trace_df(logsums, '%s.logsums' % trace_label, - column_labels=['alternative', 'logsum']) + tracing.trace_df( + logsums, "%s.logsums" % trace_label, column_labels=["alternative", "logsum"] + ) return logsums @@ -1278,29 +1459,37 @@ def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): Index will be that of `choosers`, values will be nest logsum based on spec column values """ - trace_label = tracing.extend_trace_label(trace_label, 'eval_nl_logsums') + trace_label = tracing.extend_trace_label(trace_label, "eval_nl_logsums") have_trace_targets = tracing.has_trace_targets(choosers) logit.validate_nest_spec(nest_spec, trace_label) # trace choosers if have_trace_targets: - tracing.trace_df(choosers, '%s.choosers' % trace_label) - - raw_utilities = eval_utilities(spec, choosers, locals_d, - trace_label=trace_label, have_trace_targets=have_trace_targets) + tracing.trace_df(choosers, "%s.choosers" % trace_label) + + raw_utilities = eval_utilities( + spec, + choosers, + locals_d, + trace_label=trace_label, + have_trace_targets=have_trace_targets, + ) chunk.log_df(trace_label, "raw_utilities", raw_utilities) if have_trace_targets: - tracing.trace_df(raw_utilities, '%s.raw_utilities' % trace_label, - column_labels=['alternative', 'utility']) + tracing.trace_df( + raw_utilities, + "%s.raw_utilities" % trace_label, + column_labels=["alternative", "utility"], + ) # - exponentiated utilities of leaves and nests nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec) chunk.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities) del raw_utilities # done with raw_utilities - chunk.log_df(trace_label, 'raw_utilities', None) + chunk.log_df(trace_label, "raw_utilities", None) # - logsums logsums = np.log(nested_exp_utilities.root) @@ -1309,20 +1498,25 @@ def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None): if have_trace_targets: # add logsum to nested_exp_utilities for tracing - nested_exp_utilities['logsum'] = logsums - tracing.trace_df(nested_exp_utilities, '%s.nested_exp_utilities' % trace_label, - column_labels=['alternative', 'utility']) - tracing.trace_df(logsums, '%s.logsums' % trace_label, - column_labels=['alternative', 'logsum']) + nested_exp_utilities["logsum"] = logsums + tracing.trace_df( + nested_exp_utilities, + "%s.nested_exp_utilities" % trace_label, + column_labels=["alternative", "utility"], + ) + tracing.trace_df( + logsums, "%s.logsums" % trace_label, column_labels=["alternative", "logsum"] + ) del nested_exp_utilities # done with nested_exp_utilities - chunk.log_df(trace_label, 'nested_exp_utilities', None) + chunk.log_df(trace_label, "nested_exp_utilities", None) return logsums -def _simple_simulate_logsums(choosers, spec, nest_spec, - skims=None, locals_d=None, trace_label=None): +def _simple_simulate_logsums( + choosers, spec, nest_spec, skims=None, locals_d=None, trace_label=None +): """ like simple_simulate except return logsums instead of making choices @@ -1336,18 +1530,25 @@ def _simple_simulate_logsums(choosers, spec, nest_spec, set_skim_wrapper_targets(choosers, skims) if nest_spec is None: - logsums = eval_mnl_logsums(choosers, spec, locals_d, - trace_label=trace_label) + logsums = eval_mnl_logsums(choosers, spec, locals_d, trace_label=trace_label) else: - logsums = eval_nl_logsums(choosers, spec, nest_spec, locals_d, - trace_label=trace_label) + logsums = eval_nl_logsums( + choosers, spec, nest_spec, locals_d, trace_label=trace_label + ) return logsums -def simple_simulate_logsums(choosers, spec, nest_spec, - skims=None, locals_d=None, chunk_size=0, - trace_label=None, chunk_tag=None): +def simple_simulate_logsums( + choosers, + spec, + nest_spec, + skims=None, + locals_d=None, + chunk_size=0, + trace_label=None, + chunk_tag=None, +): """ like simple_simulate except return logsums instead of making choices @@ -1362,17 +1563,17 @@ def simple_simulate_logsums(choosers, spec, nest_spec, result_list = [] # segment by person type and pick the right spec for each person type - for i, chooser_chunk, chunk_trace_label \ - in chunk.adaptive_chunked_choosers(choosers, chunk_size, trace_label, chunk_tag): + for i, chooser_chunk, chunk_trace_label in chunk.adaptive_chunked_choosers( + choosers, chunk_size, trace_label, chunk_tag + ): logsums = _simple_simulate_logsums( - chooser_chunk, spec, nest_spec, - skims, locals_d, - chunk_trace_label) + chooser_chunk, spec, nest_spec, skims, locals_d, chunk_trace_label + ) result_list.append(logsums) - chunk.log_df(trace_label, f'result_list', result_list) + chunk.log_df(trace_label, f"result_list", result_list) if len(result_list) > 1: logsums = pd.concat(result_list) diff --git a/activitysim/core/skim_dict_factory.py b/activitysim/core/skim_dict_factory.py index 51b1bc59ab..be3196628e 100644 --- a/activitysim/core/skim_dict_factory.py +++ b/activitysim/core/skim_dict_factory.py @@ -2,18 +2,15 @@ # See full license in LICENSE.txt. # from builtins import int -import os -import multiprocessing import logging +import multiprocessing +import os +from abc import ABC, abstractmethod + import numpy as np import openmatrix as omx -from abc import ABC, abstractmethod -from activitysim.core import util -from activitysim.core import config -from activitysim.core import inject -from activitysim.core import tracing -from activitysim.core import skim_dictionary +from activitysim.core import config, inject, skim_dictionary, tracing, util logger = logging.getLogger(__name__) @@ -27,6 +24,7 @@ class SkimData(object): For instance, to open/close memmapped files just in time, or to access backing data via an alternate api """ + def __init__(self, skim_data): """ skim_data is an np.ndarray or anything that implements the methods/properties of this class @@ -39,7 +37,7 @@ def __init__(self, skim_data): def __getitem__(self, indexes): if len(indexes) != 3: - raise ValueError(f'number of indexes ({len(indexes)}) should be 3') + raise ValueError(f"number of indexes ({len(indexes)}) should be 3") return self._skim_data[indexes] @property @@ -108,7 +106,7 @@ def load_skim_info(self, skim_tag): # ignore any 3D skims not in skim_time_periods # specifically, load all skims except those with key2 not in dim3_tags_to_load skim_time_periods = self.network_los.skim_time_periods - dim3_tags_to_load = skim_time_periods and skim_time_periods['labels'] + dim3_tags_to_load = skim_time_periods and skim_time_periods["labels"] self.omx_manifest = {} # dict mapping { omx_key: skim_name } @@ -120,13 +118,16 @@ def load_skim_info(self, skim_tag): # fixme call to omx_file.shape() failing in windows p3.5 if self.omx_shape is None: - self.omx_shape = tuple(int(i) for i in omx_file.shape()) # sometimes omx shape are floats! + self.omx_shape = tuple( + int(i) for i in omx_file.shape() + ) # sometimes omx shape are floats! else: - assert (self.omx_shape == tuple(int(i) for i in omx_file.shape())) + assert self.omx_shape == tuple(int(i) for i in omx_file.shape()) for skim_name in omx_file.listMatrices(): - assert skim_name not in self.omx_manifest, \ - f"duplicate skim '{skim_name}' found in {self.omx_manifest[skim_name]} and {omx_file}" + assert ( + skim_name not in self.omx_manifest + ), f"duplicate skim '{skim_name}' found in {self.omx_manifest[skim_name]} and {omx_file}" self.omx_manifest[skim_name] = omx_file_path for m in omx_file.listMappings(): @@ -137,14 +138,16 @@ def load_skim_info(self, skim_tag): else: # don't really expect more than one, but ok if they are all the same if not (self.offset_map == omx_file.mapentries(m)): - raise RuntimeError(f"Multiple mappings in omx file: {self.offset_map_name} != {m}") + raise RuntimeError( + f"Multiple mappings in omx file: {self.offset_map_name} != {m}" + ) # - omx_keys dict maps skim key to omx_key # DISTWALK: DISTWALK # ('DRV_COM_WLK_BOARDS', 'AM'): DRV_COM_WLK_BOARDS__AM, ... self.omx_keys = dict() for skim_name in self.omx_manifest.keys(): - key1, sep, key2 = skim_name.partition('__') + key1, sep, key2 = skim_name.partition("__") # - ignore composite tags not in dim3_tags_to_load if dim3_tags_to_load and sep and key2 not in dim3_tags_to_load: @@ -192,7 +195,11 @@ def load_skim_info(self, skim_tag): self.block_offsets[skim_key] = key1_offset + key2_relative_offset if skim_dictionary.ROW_MAJOR_LAYOUT: - self.skim_data_shape = (self.num_skims, self.omx_shape[0], self.omx_shape[1]) + self.skim_data_shape = ( + self.num_skims, + self.omx_shape[0], + self.omx_shape[1], + ) else: self.skim_data_shape = self.omx_shape + (self.num_skims,) @@ -278,8 +285,10 @@ def _read_skims_from_omx(self, skim_info, skim_data): if omx_manifest[omx_key] == omx_file_path: offset = skim_info.block_offsets[skim_key] - logger.debug(f"_read_skims_from_omx file {omx_file_path} omx_key {omx_key} " - f"skim_key {skim_key} to offset {offset}") + logger.debug( + f"_read_skims_from_omx file {omx_file_path} omx_key {omx_key} " + f"skim_key {skim_key} to offset {offset}" + ) if skim_dictionary.ROW_MAJOR_LAYOUT: a = skim_data[offset, :, :] @@ -292,7 +301,9 @@ def _read_skims_from_omx(self, skim_info, skim_data): num_skims_loaded += 1 - logger.info(f"_read_skims_from_omx loaded {num_skims_loaded} skims from {omx_file_path}") + logger.info( + f"_read_skims_from_omx loaded {num_skims_loaded} skims from {omx_file_path}" + ) def _open_existing_readonly_memmap_skim_cache(self, skim_info): """ @@ -308,13 +319,21 @@ def _open_existing_readonly_memmap_skim_cache(self, skim_info): logger.warning(f"read_skim_cache file not found: {skim_cache_path}") return None - logger.info(f"reading skim cache {skim_info.skim_tag} {skim_info.skim_data_shape} from {skim_cache_path}") + logger.info( + f"reading skim cache {skim_info.skim_tag} {skim_info.skim_data_shape} from {skim_cache_path}" + ) try: - data = np.memmap(skim_cache_path, shape=skim_info.skim_data_shape, dtype=dtype, mode='r') + data = np.memmap( + skim_cache_path, shape=skim_info.skim_data_shape, dtype=dtype, mode="r" + ) except Exception as e: - logger.warning(f"{type(e).__name__} reading {skim_info.skim_tag} skim_cache {skim_cache_path}: {str(e)}") - logger.warning(f"ignoring incompatible {skim_info.skim_tag} skim_cache {skim_cache_path}") + logger.warning( + f"{type(e).__name__} reading {skim_info.skim_tag} skim_cache {skim_cache_path}: {str(e)}" + ) + logger.warning( + f"ignoring incompatible {skim_info.skim_tag} skim_cache {skim_cache_path}" + ) return None return data @@ -328,9 +347,13 @@ def _create_empty_writable_memmap_skim_cache(self, skim_info): skim_cache_path = self._memmap_skim_data_path(skim_info.skim_tag) - logger.info(f"writing skim cache {skim_info.skim_tag} {skim_info.skim_data_shape} to {skim_cache_path}") + logger.info( + f"writing skim cache {skim_info.skim_tag} {skim_info.skim_data_shape} to {skim_cache_path}" + ) - data = np.memmap(skim_cache_path, shape=skim_info.skim_data_shape, dtype=dtype, mode='w+') + data = np.memmap( + skim_cache_path, shape=skim_info.skim_data_shape, dtype=dtype, mode="w+" + ) return data @@ -343,7 +366,6 @@ def copy_omx_to_mmap_file(self, skim_info): class NumpyArraySkimFactory(AbstractSkimFactory): - def __init__(self, network_los): super().__init__(network_los) @@ -366,8 +388,9 @@ def allocate_skim_buffer(self, skim_info, shared=False): multiprocessing.RawArray or numpy.ndarray """ - assert shared == self.network_los.multiprocess(), \ - f"NumpyArraySkimFactory.allocate_skim_buffer shared {shared} multiprocess {not shared}" + assert ( + shared == self.network_los.multiprocess() + ), f"NumpyArraySkimFactory.allocate_skim_buffer shared {shared} multiprocess {not shared}" dtype_name = skim_info.dtype_name dtype = np.dtype(dtype_name) @@ -376,16 +399,20 @@ def allocate_skim_buffer(self, skim_info, shared=False): buffer_size = util.iprod(skim_info.skim_data_shape) csz = buffer_size * dtype.itemsize - logger.info(f"allocate_skim_buffer shared {shared} {skim_info.skim_tag} shape {skim_info.skim_data_shape} " - f"total size: {util.INT(csz)} ({util.GB(csz)})") + logger.info( + f"allocate_skim_buffer shared {shared} {skim_info.skim_tag} shape {skim_info.skim_data_shape} " + f"total size: {util.INT(csz)} ({util.GB(csz)})" + ) if shared: - if dtype_name == 'float64': - typecode = 'd' - elif dtype_name == 'float32': - typecode = 'f' + if dtype_name == "float64": + typecode = "d" + elif dtype_name == "float32": + typecode = "f" else: - raise RuntimeError("allocate_skim_buffer unrecognized dtype %s" % dtype_name) + raise RuntimeError( + "allocate_skim_buffer unrecognized dtype %s" % dtype_name + ) buffer = multiprocessing.RawArray(typecode, buffer_size) else: @@ -409,7 +436,9 @@ def _skim_data_from_buffer(self, skim_info, skim_buffer): dtype = np.dtype(skim_info.dtype_name) assert len(skim_buffer) == util.iprod(skim_info.skim_data_shape) - skim_data = np.frombuffer(skim_buffer, dtype=dtype).reshape(skim_info.skim_data_shape) + skim_data = np.frombuffer(skim_buffer, dtype=dtype).reshape( + skim_info.skim_data_shape + ) return skim_data def load_skims_to_buffer(self, skim_info, skim_buffer): @@ -422,8 +451,8 @@ def load_skims_to_buffer(self, skim_info, skim_buffer): skim_buffer: 1D buffer sized to hold all skims (multiprocessing.RawArray or numpy.ndarray) """ - read_cache = self.network_los.setting('read_skim_cache', False) - write_cache = self.network_los.setting('write_skim_cache', False) + read_cache = self.network_los.setting("read_skim_cache", False) + write_cache = self.network_los.setting("write_skim_cache", False) skim_data = self._skim_data_from_buffer(skim_info, skim_buffer) assert skim_data.shape == skim_info.skim_data_shape @@ -451,7 +480,9 @@ def load_skims_to_buffer(self, skim_info, skim_buffer): # bug - do we need to close it? - logger.info(f"load_skims_to_buffer {skim_info.skim_tag} shape {skim_data.shape}") + logger.info( + f"load_skims_to_buffer {skim_info.skim_tag} shape {skim_data.shape}" + ) def get_skim_data(self, skim_tag, skim_info): """ @@ -467,10 +498,12 @@ def get_skim_data(self, skim_tag, skim_info): SkimData """ - data_buffers = inject.get_injectable('data_buffers', None) + data_buffers = inject.get_injectable("data_buffers", None) if data_buffers: # we assume any existing skim buffers will already have skim data loaded into them - logger.info(f"get_skim_data {skim_tag} using existing shared skim_buffers for skims") + logger.info( + f"get_skim_data {skim_tag} using existing shared skim_buffers for skims" + ) skim_buffer = data_buffers[skim_tag] else: skim_buffer = self.allocate_skim_buffer(skim_info, shared=False) @@ -478,7 +511,9 @@ def get_skim_data(self, skim_tag, skim_info): skim_data = SkimData(self._skim_data_from_buffer(skim_info, skim_buffer)) - logger.info(f"get_skim_data {skim_tag} {type(skim_data).__name__} shape {skim_data.shape}") + logger.info( + f"get_skim_data {skim_tag} {type(skim_data).__name__} shape {skim_data.shape}" + ) return skim_data @@ -499,9 +534,11 @@ def __init__(self, skim_cache_path, skim_info): self._shape = skim_info.skim_data_shape def __getitem__(self, indexes): - assert len(indexes) == 3, f'number of indexes ({len(indexes)}) should be 3' + assert len(indexes) == 3, f"number of indexes ({len(indexes)}) should be 3" # open memmap - data = np.memmap(self.skim_cache_path, shape=self._shape, dtype=self.dtype, mode='r') + data = np.memmap( + self.skim_cache_path, shape=self._shape, dtype=self.dtype, mode="r" + ) # dereference skim values result = data[indexes] # closing memmap's underlying mmap frees data read into (not really needed as we are exiting scope) @@ -552,7 +589,7 @@ def get_skim_data(self, skim_tag, skim_info): """ # don't expect legacy shared memory buffers - assert not inject.get_injectable('data_buffers', {}).get(skim_tag) + assert not inject.get_injectable("data_buffers", {}).get(skim_tag) skim_cache_path = self._memmap_skim_data_path(skim_tag) if not os.path.isfile(skim_cache_path): @@ -566,6 +603,8 @@ def get_skim_data(self, skim_tag, skim_info): skim_data = self._open_existing_readonly_memmap_skim_cache(skim_info) skim_data = SkimData(skim_data) - logger.info(f"get_skim_data {skim_tag} {type(skim_data).__name__} shape {skim_data.shape}") + logger.info( + f"get_skim_data {skim_tag} {type(skim_data).__name__} shape {skim_data.shape}" + ) return skim_data diff --git a/activitysim/core/skim_dictionary.py b/activitysim/core/skim_dictionary.py index ba23b85dce..db9b1d2b0e 100644 --- a/activitysim/core/skim_dictionary.py +++ b/activitysim/core/skim_dictionary.py @@ -1,10 +1,8 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range -from builtins import object - import logging +from builtins import object, range import numpy as np import pandas as pd @@ -40,7 +38,9 @@ def __init__(self, offset_int=None, offset_list=None, offset_series=None): self.offset_int = self.offset_series = None - assert (offset_int is not None) + (offset_list is not None) + (offset_series is not None) <= 1 + assert (offset_int is not None) + (offset_list is not None) + ( + offset_series is not None + ) <= 1 if offset_int is not None: self.set_offset_int(offset_int) @@ -49,7 +49,7 @@ def __init__(self, offset_int=None, offset_list=None, offset_series=None): elif offset_series is not None: self.set_offset_series(offset_series) - def print_offset(self, message=''): + def print_offset(self, message=""): assert (self.offset_int is not None) or (self.offset_series is not None) if self.offset_int is not None: @@ -86,11 +86,13 @@ def set_offset_list(self, offset_list): # - for performance, check if this is a simple range that can ber represented by an int offset first_offset = offset_list[0] - if (offset_list == list(range(first_offset, len(offset_list)+first_offset))): + if offset_list == list(range(first_offset, len(offset_list) + first_offset)): offset_int = -1 * first_offset self.set_offset_int(offset_int) else: - offset_series = pd.Series(data=list(range(len(offset_list))), index=offset_list) + offset_series = pd.Series( + data=list(range(len(offset_list))), index=offset_list + ) self.set_offset_series(offset_series) def set_offset_int(self, offset_int): @@ -121,7 +123,7 @@ def map(self, zone_ids): """ if self.offset_series is not None: - assert(self.offset_int is None) + assert self.offset_int is None assert isinstance(self.offset_series, pd.Series) # FIXME - turns out it is faster to use series.map if zone_ids is a series @@ -129,12 +131,20 @@ def map(self, zone_ids): if isinstance(zone_ids, np.ndarray): zone_ids = pd.Series(zone_ids) - offsets = zone_ids.map(self.offset_series, na_action='ignore').fillna(NOT_IN_SKIM_ZONE_ID).astype(int) + offsets = ( + zone_ids.map(self.offset_series, na_action="ignore") + .fillna(NOT_IN_SKIM_ZONE_ID) + .astype(int) + ) elif self.offset_int: - assert (self.offset_series is None) + assert self.offset_series is None # apply integer offset, but map NOT_IN_SKIM_ZONE_ID to self - offsets = np.where(zone_ids == NOT_IN_SKIM_ZONE_ID, NOT_IN_SKIM_ZONE_ID, zone_ids + self.offset_int) + offsets = np.where( + zone_ids == NOT_IN_SKIM_ZONE_ID, + NOT_IN_SKIM_ZONE_ID, + zone_ids + self.offset_int, + ) else: offsets = zone_ids @@ -157,11 +167,15 @@ def __init__(self, skim_tag, skim_info, skim_data): self.skim_info = skim_info self.usage = set() # track keys of skims looked up - self.offset_mapper = self._offset_mapper() # (in function so subclass can override) + self.offset_mapper = ( + self._offset_mapper() + ) # (in function so subclass can override) self.omx_shape = skim_info.omx_shape self.skim_data = skim_data - self.dtype = np.dtype(skim_info.dtype_name) # so we can coerce if we have missing values + self.dtype = np.dtype( + skim_info.dtype_name + ) # so we can coerce if we have missing values # - skim_dim3 dict maps key1 to dict of key2 absolute offsets into block # DRV_COM_WLK_BOARDS: {'MD': 4, 'AM': 3, 'PM': 5}, ... @@ -171,7 +185,9 @@ def __init__(self, skim_tag, skim_info, skim_data): if isinstance(skim_key, tuple): key1, key2 = skim_key self.skim_dim3.setdefault(key1, {})[key2] = offset - logger.info(f"SkimDict.build_3d_skim_block_offset_table registered {len(self.skim_dim3)} 3d keys") + logger.info( + f"SkimDict.build_3d_skim_block_offset_table registered {len(self.skim_dim3)} 3d keys" + ) def _offset_mapper(self): """ @@ -250,8 +266,12 @@ def _lookup(self, orig, dest, block_offsets): result = self.skim_data[mapped_orig, mapped_dest, block_offsets] # FIXME - should return nan if not in skim (negative indices wrap around) - in_skim = (mapped_orig >= 0) & (mapped_orig < self.omx_shape[0]) & \ - (mapped_dest >= 0) & (mapped_dest < self.omx_shape[1]) + in_skim = ( + (mapped_orig >= 0) + & (mapped_orig < self.omx_shape[0]) + & (mapped_dest >= 0) + & (mapped_dest < self.omx_shape[1]) + ) # if not ((in_skim | (orig == NOT_IN_SKIM_ZONE_ID) | (dest == NOT_IN_SKIM_ZONE_ID)).all()): # print(f"orig\n{orig}") @@ -259,8 +279,9 @@ def _lookup(self, orig, dest, block_offsets): # print(f"in_skim\n{in_skim}") # check for bad indexes (other than NOT_IN_SKIM_ZONE_ID) - assert (in_skim | (orig == NOT_IN_SKIM_ZONE_ID) | (dest == NOT_IN_SKIM_ZONE_ID)).all(), \ - f"{(~in_skim).sum()} od pairs not in skim" + assert ( + in_skim | (orig == NOT_IN_SKIM_ZONE_ID) | (dest == NOT_IN_SKIM_ZONE_ID) + ).all(), f"{(~in_skim).sum()} od pairs not in skim" if not in_skim.all(): result = np.where(in_skim, result, NOT_IN_SKIM_NAN).astype(self.dtype) @@ -331,10 +352,14 @@ def lookup_3d(self, orig, dest, dim3, key): # skim_indexes = dim3.map(skim_keys_to_indexes).astype('int') try: - block_offsets = np.vectorize(skim_keys_to_indexes.get)(dim3) # this should be faster than map + block_offsets = np.vectorize(skim_keys_to_indexes.get)( + dim3 + ) # this should be faster than map result = self._lookup(orig, dest, block_offsets) except Exception as err: - logger.error("SkimDict lookup_3d error: %s: %s", type(err).__name__, str(err)) + logger.error( + "SkimDict lookup_3d error: %s: %s", type(err).__name__, str(err) + ) logger.error(f"key {key}") logger.error(f"orig max {orig.max()} min {orig.min()}") logger.error(f"dest max {dest.max()} min {dest.min()}") @@ -411,8 +436,12 @@ def set_df(self, df): ------- self (to facilitiate chaining) """ - assert self.orig_key in df, f"orig_key '{self.orig_key}' not in df columns: {list(df.columns)}" - assert self.dest_key in df, f"dest_key '{self.dest_key}' not in df columns: {list(df.columns)}" + assert ( + self.orig_key in df + ), f"orig_key '{self.orig_key}' not in df columns: {list(df.columns)}" + assert ( + self.dest_key in df + ), f"dest_key '{self.dest_key}' not in df columns: {list(df.columns)}" self.df = df return self @@ -439,9 +468,13 @@ def lookup(self, key, reverse=False): assert self.df is not None, "Call set_df first" if reverse: - s = self.skim_dict.lookup(self.df[self.dest_key], self.df[self.orig_key], key) + s = self.skim_dict.lookup( + self.df[self.dest_key], self.df[self.orig_key], key + ) else: - s = self.skim_dict.lookup(self.df[self.orig_key], self.df[self.dest_key], key) + s = self.skim_dict.lookup( + self.df[self.orig_key], self.df[self.dest_key], key + ) return pd.Series(s, index=self.df.index) @@ -459,7 +492,7 @@ def max(self, key): s = np.maximum( self.skim_dict.lookup(self.df[self.dest_key], self.df[self.orig_key], key), - self.skim_dict.lookup(self.df[self.orig_key], self.df[self.dest_key], key) + self.skim_dict.lookup(self.df[self.orig_key], self.df[self.dest_key], key), ) return pd.Series(s, index=self.df.index) @@ -533,9 +566,15 @@ def set_df(self, df): ------- self (to facilitiate chaining) """ - assert self.orig_key in df, f"orig_key '{self.orig_key}' not in df columns: {list(df.columns)}" - assert self.dest_key in df, f"dest_key '{self.dest_key}' not in df columns: {list(df.columns)}" - assert self.dim3_key in df, f"dim3_key '{self.dim3_key}' not in df columns: {list(df.columns)}" + assert ( + self.orig_key in df + ), f"orig_key '{self.orig_key}' not in df columns: {list(df.columns)}" + assert ( + self.dest_key in df + ), f"dest_key '{self.dest_key}' not in df columns: {list(df.columns)}" + assert ( + self.dim3_key in df + ), f"dim3_key '{self.dim3_key}' not in df columns: {list(df.columns)}" self.df = df return self @@ -554,8 +593,8 @@ def __getitem__(self, key): A Series of impedances values from the set of skims with specified base key, indexed by orig/dest/dim3 """ assert self.df is not None, "Call set_df first" - orig = self.df[self.orig_key].astype('int') - dest = self.df[self.dest_key].astype('int') + orig = self.df[self.orig_key].astype("int") + dest = self.df[self.dest_key].astype("int") dim3 = self.df[self.dim3_key] skim_values = self.skim_dict.lookup_3d(orig, dest, dim3, key) @@ -596,11 +635,15 @@ def __init__(self, skim_tag, network_los, taz_skim_dict): self.network_los = network_los super().__init__(skim_tag, taz_skim_dict.skim_info, taz_skim_dict.skim_data) - assert self.offset_mapper is not None # should have been set with _init_offset_mapper + assert ( + self.offset_mapper is not None + ) # should have been set with _init_offset_mapper self.dtype = np.dtype(self.skim_info.dtype_name) self.base_keys = taz_skim_dict.skim_info.base_keys - self.sparse_keys = list(set(network_los.maz_to_maz_df.columns) - {'OMAZ', 'DMAZ'}) + self.sparse_keys = list( + set(network_los.maz_to_maz_df.columns) - {"OMAZ", "DMAZ"} + ) self.sparse_key_usage = set() def _offset_mapper(self): @@ -616,7 +659,12 @@ def _offset_mapper(self): """ # start with a series with MAZ zone_id index and TAZ zone id values - maz_to_taz = self.network_los.maz_taz_df[['MAZ', 'TAZ']].set_index('MAZ').sort_values(by='TAZ').TAZ + maz_to_taz = ( + self.network_los.maz_taz_df[["MAZ", "TAZ"]] + .set_index("MAZ") + .sort_values(by="TAZ") + .TAZ + ) # use taz offset_mapper to create series mapping directly from MAZ to TAZ skim index taz_offset_mapper = super()._offset_mapper() @@ -630,7 +678,9 @@ def _offset_mapper(self): # 8429 330 # 9859 331 - assert isinstance(maz_to_skim_offset, np.ndarray) or isinstance(maz_to_skim_offset, pd.Series) + assert isinstance(maz_to_skim_offset, np.ndarray) or isinstance( + maz_to_skim_offset, pd.Series + ) if isinstance(maz_to_skim_offset, pd.Series): offset_mapper = OffsetMapper(offset_series=maz_to_skim_offset) elif isinstance(maz_to_skim_offset, np.ndarray): @@ -682,8 +732,10 @@ def sparse_lookup(self, orig, dest, key): backstop_values = super().lookup(orig, dest, key) # get distance skim if a different key was specified by blend_distance_skim_name - if (blend_distance_skim_name != key): - distance = self.network_los.get_mazpairs(orig, dest, blend_distance_skim_name) + if blend_distance_skim_name != key: + distance = self.network_los.get_mazpairs( + orig, dest, blend_distance_skim_name + ) else: distance = values @@ -692,9 +744,12 @@ def sparse_lookup(self, orig, dest, key): # beyond max_blend_distance, just use the skim values backstop_fractions = np.minimum(distance / max_blend_distance, 1) - values = np.where(is_nan, - backstop_values, - backstop_fractions * backstop_values + (1 - backstop_fractions) * values) + values = np.where( + is_nan, + backstop_values, + backstop_fractions * backstop_values + + (1 - backstop_fractions) * values, + ) elif is_nan.any(): @@ -797,13 +852,17 @@ def get(self, row_ids, col_ids): row_indexes = self.offset_mapper.map(np.asanyarray(row_ids)) - not_in_skim = (row_indexes == NOT_IN_SKIM_ZONE_ID) + not_in_skim = row_indexes == NOT_IN_SKIM_ZONE_ID if not_in_skim.any(): - logger.warning(f"DataFrameMatrix: {not_in_skim.sum()} row_ids of {len(row_ids)} not in skim.") + logger.warning( + f"DataFrameMatrix: {not_in_skim.sum()} row_ids of {len(row_ids)} not in skim." + ) not_in_skim = not_in_skim.values logger.warning(f"row_ids: {row_ids[not_in_skim]}") logger.warning(f"col_ids: {col_ids[not_in_skim]}") - raise RuntimeError(f"DataFrameMatrix: {not_in_skim.sum()} row_ids of {len(row_ids)} not in skim.") + raise RuntimeError( + f"DataFrameMatrix: {not_in_skim.sum()} row_ids of {len(row_ids)} not in skim." + ) assert (row_indexes >= 0).all(), f"{row_indexes}" diff --git a/activitysim/core/steps/output.py b/activitysim/core/steps/output.py index bf0c439484..93397b914c 100644 --- a/activitysim/core/steps/output.py +++ b/activitysim/core/steps/output.py @@ -2,13 +2,11 @@ # See full license in LICENSE.txt. import logging import sys -import pandas as pd -import numpy as np -from activitysim.core import pipeline -from activitysim.core import inject -from activitysim.core import config +import numpy as np +import pandas as pd +from activitysim.core import config, inject, pipeline from activitysim.core.config import setting logger = logging.getLogger(__name__) @@ -30,16 +28,18 @@ def track_skim_usage(output_dir): pd.options.display.max_columns = 500 pd.options.display.max_rows = 100 - skim_dict = inject.get_injectable('skim_dict') + skim_dict = inject.get_injectable("skim_dict") - mode = 'wb' if sys.version_info < (3,) else 'w' - with open(config.output_file_path('skim_usage.txt'), mode) as output_file: + mode = "wb" if sys.version_info < (3,) else "w" + with open(config.output_file_path("skim_usage.txt"), mode) as output_file: print("\n### skim_dict usage", file=output_file) for key in skim_dict.get_skim_usage(): print(key, file=output_file) - unused = set(k for k in skim_dict.skim_info.base_keys) - set(k for k in skim_dict.get_skim_usage()) + unused = set(k for k in skim_dict.skim_info.base_keys) - set( + k for k in skim_dict.get_skim_usage() + ) for key in unused: print(key, file=output_file) @@ -55,9 +55,9 @@ def previous_write_data_dictionary(output_dir): """ - model_settings = config.read_model_settings('write_data_dictionary') - txt_format = model_settings.get('txt_format', 'data_dict.txt') - csv_format = model_settings.get('csv_format', 'data_dict.csv') + model_settings = config.read_model_settings("write_data_dictionary") + txt_format = model_settings.get("txt_format", "data_dict.txt") + csv_format = model_settings.get("csv_format", "data_dict.csv") if txt_format: @@ -70,12 +70,12 @@ def previous_write_data_dictionary(output_dir): # write data dictionary for all checkpointed_tables - with open(output_file_path, 'w') as output_file: + with open(output_file_path, "w") as output_file: for table_name in output_tables: df = inject.get_table(table_name, None).to_frame() print("\n### %s %s" % (table_name, df.shape), file=output_file) - print('index:', df.index.name, df.index.dtype, file=output_file) + print("index:", df.index.name, df.index.dtype, file=output_file) print(df.dtypes, file=output_file) @@ -100,18 +100,20 @@ def write_data_dictionary(output_dir): """ - model_settings = config.read_model_settings('write_data_dictionary') - txt_format = model_settings.get('txt_format', 'data_dict.txt') - csv_format = model_settings.get('csv_format', 'data_dict.csv') + model_settings = config.read_model_settings("write_data_dictionary") + txt_format = model_settings.get("txt_format", "data_dict.txt") + csv_format = model_settings.get("csv_format", "data_dict.csv") if not (csv_format or txt_format): - logger.warning(f"write_data_dictionary step invoked but neither 'txt_format' nor 'csv_format' specified") + logger.warning( + f"write_data_dictionary step invoked but neither 'txt_format' nor 'csv_format' specified" + ) return table_names = pipeline.registered_tables() # use table_names list from model_settings, if provided - schema_tables = model_settings.get('tables', None) + schema_tables = model_settings.get("tables", None) if schema_tables: table_names = [c for c in schema_tables if c in table_names] @@ -125,10 +127,15 @@ def write_data_dictionary(output_dir): if df.index.name and df.index.name not in df.columns: df = df.reset_index() - info = df.dtypes.astype(str).to_frame('dtype').reset_index().rename(columns={'index': 'column_name'}) - info['checkpoint'] = '' - - info.insert(loc=0, column='table_name', value=table_name) + info = ( + df.dtypes.astype(str) + .to_frame("dtype") + .reset_index() + .rename(columns={"index": "column_name"}) + ) + info["checkpoint"] = "" + + info.insert(loc=0, column="table_name", value=table_name) schema[table_name] = info # annotate schema.info with name of checkpoint columns were first seen @@ -151,10 +158,12 @@ def write_data_dictionary(output_dir): info = schema.get(table_name, None) # tag any new columns with checkpoint name - prev_columns = info[info.checkpoint != ''].column_name.values + prev_columns = info[info.checkpoint != ""].column_name.values new_cols = [c for c in df.columns.values if c not in prev_columns] is_new_column_this_checkpoont = info.column_name.isin(new_cols) - info.checkpoint = np.where(is_new_column_this_checkpoont, checkpoint_name, info.checkpoint) + info.checkpoint = np.where( + is_new_column_this_checkpoont, checkpoint_name, info.checkpoint + ) schema[table_name] = info @@ -164,7 +173,7 @@ def write_data_dictionary(output_dir): schema_df.to_csv(config.output_file_path(csv_format), header=True, index=False) if txt_format: - with open(config.output_file_path(txt_format), 'w') as output_file: + with open(config.output_file_path(txt_format), "w") as output_file: # get max schema column widths from omnibus table col_width = {c: schema_df[c].str.len().max() + 2 for c in schema_df} @@ -172,17 +181,20 @@ def write_data_dictionary(output_dir): for table_name in table_names: info = schema.get(table_name, None) - columns_to_print = ['column_name', 'dtype', 'checkpoint'] + columns_to_print = ["column_name", "dtype", "checkpoint"] info = info[columns_to_print].copy() # normalize schema columns widths across all table schemas for unified output formatting for c in info: - info[c] = info[c].str.pad(col_width[c], side='right') + info[c] = info[c].str.pad(col_width[c], side="right") info.columns = [c.ljust(col_width[c]) for c in info.columns] info = info.to_string(index=False) - print(f"###\n### {table_name} {final_shapes[table_name]}\n###\n", file=output_file) + print( + f"###\n### {table_name} {final_shapes[table_name]}\n###\n", + file=output_file, + ) print(f"{info}\n", file=output_file) @@ -229,7 +241,7 @@ def write_tables(output_dir): """ - output_tables_settings_name = 'output_tables' + output_tables_settings_name = "output_tables" output_tables_settings = setting(output_tables_settings_name) @@ -237,25 +249,27 @@ def write_tables(output_dir): logger.info("No output_tables specified in settings file. Nothing to write.") return - action = output_tables_settings.get('action') - tables = output_tables_settings.get('tables') - prefix = output_tables_settings.get('prefix', 'final_') - h5_store = output_tables_settings.get('h5_store', False) - sort = output_tables_settings.get('sort', False) + action = output_tables_settings.get("action") + tables = output_tables_settings.get("tables") + prefix = output_tables_settings.get("prefix", "final_") + h5_store = output_tables_settings.get("h5_store", False) + sort = output_tables_settings.get("sort", False) registered_tables = pipeline.registered_tables() - if action == 'include': + if action == "include": # interpret empty or missing tables setting to mean include all registered tables output_tables_list = tables if tables is not None else registered_tables - elif action == 'skip': + elif action == "skip": output_tables_list = [t for t in registered_tables if t not in tables] else: - raise "expected %s action '%s' to be either 'include' or 'skip'" % \ - (output_tables_settings_name, action) + raise "expected %s action '%s' to be either 'include' or 'skip'" % ( + output_tables_settings_name, + action, + ) for table_name in output_tables_list: - if table_name == 'checkpoints': + if table_name == "checkpoints": df = pipeline.get_checkpoints() else: if table_name not in registered_tables: @@ -264,30 +278,42 @@ def write_tables(output_dir): df = pipeline.get_table(table_name) if sort: - traceable_table_indexes = inject.get_injectable('traceable_table_indexes', {}) + traceable_table_indexes = inject.get_injectable( + "traceable_table_indexes", {} + ) if df.index.name in traceable_table_indexes: df = df.sort_index() - logger.debug(f"write_tables sorting {table_name} on index {df.index.name}") + logger.debug( + f"write_tables sorting {table_name} on index {df.index.name}" + ) else: # find all registered columns we can use to sort this table # (they are ordered appropriately in traceable_table_indexes) - sort_columns = [c for c in traceable_table_indexes if c in df.columns] + sort_columns = [ + c for c in traceable_table_indexes if c in df.columns + ] if len(sort_columns) > 0: df = df.sort_values(by=sort_columns) - logger.debug(f"write_tables sorting {table_name} on columns {sort_columns}") + logger.debug( + f"write_tables sorting {table_name} on columns {sort_columns}" + ) else: - logger.debug(f"write_tables sorting {table_name} on unrecognized index {df.index.name}") + logger.debug( + f"write_tables sorting {table_name} on unrecognized index {df.index.name}" + ) df = df.sort_index() if h5_store: - file_path = config.output_file_path('%soutput_tables.h5' % prefix) - df.to_hdf(file_path, key=table_name, mode='a', format='fixed') + file_path = config.output_file_path("%soutput_tables.h5" % prefix) + df.to_hdf(file_path, key=table_name, mode="a", format="fixed") else: file_name = "%s%s.csv" % (prefix, table_name) file_path = config.output_file_path(file_name) # include the index if it has a name or is a MultiIndex - write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex) + write_index = df.index.name is not None or isinstance( + df.index, pd.MultiIndex + ) df.to_csv(file_path, index=write_index) diff --git a/activitysim/core/test/extensions/__init__.py b/activitysim/core/test/extensions/__init__.py index 690edc137d..33c638303a 100644 --- a/activitysim/core/test/extensions/__init__.py +++ b/activitysim/core/test/extensions/__init__.py @@ -1,2 +1 @@ - from . import steps diff --git a/activitysim/core/test/extensions/steps.py b/activitysim/core/test/extensions/steps.py index b9e598b93c..baa894c692 100644 --- a/activitysim/core/test/extensions/steps.py +++ b/activitysim/core/test/extensions/steps.py @@ -1,37 +1,36 @@ import pandas as pd -from activitysim.core import inject -from activitysim.core import pipeline -from activitysim.core import tracing + +from activitysim.core import inject, pipeline, tracing @inject.step() def step1(): - table1 = pd.DataFrame({'c': [1, 2, 3]}) - inject.add_table('table1', table1) + table1 = pd.DataFrame({"c": [1, 2, 3]}) + inject.add_table("table1", table1) @inject.step() def step2(): - table1 = pd.DataFrame({'c': [2, 4, 6]}) - inject.add_table('table2', table1) + table1 = pd.DataFrame({"c": [2, 4, 6]}) + inject.add_table("table2", table1) @inject.step() def step3(): - table1 = pd.DataFrame({'c': [3, 6, 9]}) - inject.add_table('table3', table1) + table1 = pd.DataFrame({"c": [3, 6, 9]}) + inject.add_table("table3", table1) @inject.step() def step_add_col(): - table_name = inject.get_step_arg('table_name') + table_name = inject.get_step_arg("table_name") assert table_name is not None - col_name = inject.get_step_arg('column_name') + col_name = inject.get_step_arg("column_name") assert col_name is not None table = pipeline.get_table(table_name) @@ -46,7 +45,7 @@ def step_add_col(): @inject.step() def step_forget_tab(): - table_name = inject.get_step_arg('table_name') + table_name = inject.get_step_arg("table_name") assert table_name is not None table = pipeline.get_table(table_name) @@ -57,9 +56,9 @@ def step_forget_tab(): @inject.step() def create_households(trace_hh_id): - df = pd.DataFrame({'household_id': [1, 2, 3], 'home_zone_id': {100, 100, 101}}) - inject.add_table('households', df) + df = pd.DataFrame({"household_id": [1, 2, 3], "home_zone_id": {100, 100, 101}}) + inject.add_table("households", df) - pipeline.get_rn_generator().add_channel('households', df) + pipeline.get_rn_generator().add_channel("households", df) - tracing.register_traceable_table('households', df) + tracing.register_traceable_table("households", df) diff --git a/activitysim/core/test/test_assign.py b/activitysim/core/test/test_assign.py index 1efd4201f5..3818711067 100644 --- a/activitysim/core/test/test_assign.py +++ b/activitysim/core/test/test_assign.py @@ -1,21 +1,18 @@ # ActivitySim # See full license in LICENSE.txt. -import os.path import logging import logging.config +import os.path import numpy as np import pandas as pd import pytest -from .. import config -from .. import assign -from .. import tracing -from .. import inject +from .. import assign, config, inject, tracing def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) @@ -34,46 +31,47 @@ def teardown_function(func): inject.reinject_decorated_tables() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data_dir(): - return os.path.join(os.path.dirname(__file__), 'data') + return os.path.join(os.path.dirname(__file__), "data") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def spec_name(data_dir): - return os.path.join(data_dir, 'assignment_spec.csv') + return os.path.join(data_dir, "assignment_spec.csv") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data_name(data_dir): - return os.path.join(data_dir, 'data.csv') + return os.path.join(data_dir, "data.csv") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(data_name): return pd.read_csv(data_name) def test_read_model_spec(): - spec = assign.read_assignment_spec(config.config_file_path('assignment_spec.csv')) + spec = assign.read_assignment_spec(config.config_file_path("assignment_spec.csv")) assert len(spec) == 8 - assert list(spec.columns) == ['description', 'target', 'expression'] + assert list(spec.columns) == ["description", "target", "expression"] def test_assign_variables(capsys, data): - spec = assign.read_assignment_spec(config.config_file_path('assignment_spec.csv')) + spec = assign.read_assignment_spec(config.config_file_path("assignment_spec.csv")) - locals_d = {'CONSTANT': 7, '_shadow': 99} + locals_d = {"CONSTANT": 7, "_shadow": 99} - results, trace_results, trace_assigned_locals \ - = assign.assign_variables(spec, data, locals_d, trace_rows=None) + results, trace_results, trace_assigned_locals = assign.assign_variables( + spec, data, locals_d, trace_rows=None + ) print(results) - assert list(results.columns) == ['target1', 'target2', 'target3'] + assert list(results.columns) == ["target1", "target2", "target3"] assert list(results.target1) == [True, False, False] assert list(results.target2) == [53, 53, 55] assert list(results.target3) == [530, 530, 550] @@ -82,8 +80,9 @@ def test_assign_variables(capsys, data): trace_rows = [False, True, False] - results, trace_results, trace_assigned_locals \ - = assign.assign_variables(spec, data, locals_d, trace_rows=trace_rows) + results, trace_results, trace_assigned_locals = assign.assign_variables( + spec, data, locals_d, trace_rows=trace_rows + ) # should get same results as before assert list(results.target3) == [530, 530, 550] @@ -92,38 +91,40 @@ def test_assign_variables(capsys, data): print(trace_results) assert trace_results is not None - assert '_scalar' in trace_results.columns - assert list(trace_results['_scalar']) == [42] + assert "_scalar" in trace_results.columns + assert list(trace_results["_scalar"]) == [42] # shadow should have been assigned - assert list(trace_results['_shadow']) == [1] - assert list(trace_results['_temp']) == [9] - assert list(trace_results['target3']) == [530] + assert list(trace_results["_shadow"]) == [1] + assert list(trace_results["_temp"]) == [9] + assert list(trace_results["target3"]) == [530] print("trace_assigned_locals", trace_assigned_locals) - assert trace_assigned_locals['_DF_COL_NAME'] == 'thing2' + assert trace_assigned_locals["_DF_COL_NAME"] == "thing2" # shouldn't have been changed even though it was a target - assert locals_d['_shadow'] == 99 + assert locals_d["_shadow"] == 99 out, err = capsys.readouterr() def test_assign_variables_aliased(capsys, data): - spec = assign.read_assignment_spec(config.config_file_path('assignment_spec_alias_df.csv')) + spec = assign.read_assignment_spec( + config.config_file_path("assignment_spec_alias_df.csv") + ) - locals_d = {'CONSTANT': 7, '_shadow': 99} + locals_d = {"CONSTANT": 7, "_shadow": 99} trace_rows = [False, True, False] - results, trace_results, trace_assigned_locals \ - = assign.assign_variables(spec, data, locals_d, - df_alias='aliased_df', trace_rows=trace_rows) + results, trace_results, trace_assigned_locals = assign.assign_variables( + spec, data, locals_d, df_alias="aliased_df", trace_rows=trace_rows + ) print(results) - assert list(results.columns) == ['target1', 'target2', 'target3'] + assert list(results.columns) == ["target1", "target2", "target3"] assert list(results.target1) == [True, False, False] assert list(results.target2) == [53, 53, 55] assert list(results.target3) == [530, 530, 550] @@ -132,15 +133,15 @@ def test_assign_variables_aliased(capsys, data): print(trace_results) assert trace_results is not None - assert '_scalar' in trace_results.columns - assert list(trace_results['_scalar']) == [42] + assert "_scalar" in trace_results.columns + assert list(trace_results["_scalar"]) == [42] # shadow should have been assigned - assert list(trace_results['_shadow']) == [1] - assert list(trace_results['_temp']) == [9] - assert list(trace_results['target3']) == [530] + assert list(trace_results["_shadow"]) == [1] + assert list(trace_results["_temp"]) == [9] + assert list(trace_results["target3"]) == [530] - assert locals_d['_shadow'] == 99 + assert locals_d["_shadow"] == 99 out, err = capsys.readouterr() @@ -149,21 +150,25 @@ def test_assign_variables_failing(capsys, data): close_handlers() - output_dir = os.path.join(os.path.dirname(__file__), 'output') + output_dir = os.path.join(os.path.dirname(__file__), "output") inject.add_injectable("output_dir", output_dir) tracing.config_logger(basic=True) - spec = assign.read_assignment_spec(config.config_file_path('assignment_spec_failing.csv')) + spec = assign.read_assignment_spec( + config.config_file_path("assignment_spec_failing.csv") + ) locals_d = { - 'CONSTANT': 7, - '_shadow': 99, - 'log': np.log, + "CONSTANT": 7, + "_shadow": 99, + "log": np.log, } with pytest.raises(NameError) as excinfo: - results, trace_results = assign.assign_variables(spec, data, locals_d, trace_rows=None) + results, trace_results = assign.assign_variables( + spec, data, locals_d, trace_rows=None + ) out, err = capsys.readouterr() # don't consume output diff --git a/activitysim/core/test/test_inject_defaults.py b/activitysim/core/test/test_inject_defaults.py index d0b869f1c9..82e106984d 100644 --- a/activitysim/core/test/test_inject_defaults.py +++ b/activitysim/core/test/test_inject_defaults.py @@ -4,10 +4,8 @@ import pytest -from .. import inject - # Note that the following import statement has the side-effect of registering injectables: -from .. import config +from .. import config, inject def teardown_function(func): @@ -32,11 +30,11 @@ def test_defaults(): print("output_dir", output_dir) assert "directory does not exist" in str(excinfo.value) - configs_dir = os.path.join(os.path.dirname(__file__), 'configs_test_defaults') + configs_dir = os.path.join(os.path.dirname(__file__), "configs_test_defaults") inject.add_injectable("configs_dir", configs_dir) settings = inject.get_injectable("settings") assert isinstance(settings, dict) - data_dir = os.path.join(os.path.dirname(__file__), 'data') + data_dir = os.path.join(os.path.dirname(__file__), "data") inject.add_injectable("data_dir", data_dir) diff --git a/activitysim/core/test/test_input.py b/activitysim/core/test/test_input.py index fa94590246..f695020b63 100644 --- a/activitysim/core/test/test_input.py +++ b/activitysim/core/test/test_input.py @@ -1,33 +1,34 @@ # ActivitySim # See full license in LICENSE.txt. import os -import yaml -import pytest + import pandas as pd +import pytest +import yaml -from activitysim.core import inject # Note that the following import statement has the side-effect of registering injectables: -from activitysim.core import config -from activitysim.core import input +from activitysim.core import config, inject, input -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def seed_households(): - return pd.DataFrame({ - 'HHID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - 'home_zone_id': [8, 8, 8, 8, 12, 12, 15, 16, 16, 18], - }) + return pd.DataFrame( + { + "HHID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "home_zone_id": [8, 8, 8, 8, 12, 12, 15, 16, 16, 18], + } + ) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data_dir(): - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), 'output') + output_dir = os.path.join(os.path.dirname(__file__), "output") inject.add_injectable("output_dir", output_dir) - data_dir = os.path.join(os.path.dirname(__file__), 'temp_data') + data_dir = os.path.join(os.path.dirname(__file__), "temp_data") if not os.path.exists(data_dir): os.mkdir(data_dir) @@ -44,12 +45,12 @@ def data_dir(): def test_missing_table_list(data_dir): - settings = inject.get_injectable('settings') + settings = inject.get_injectable("settings") assert isinstance(settings, dict) with pytest.raises(AssertionError) as excinfo: - input.read_input_table('households') - assert 'no input_table_list found' in str(excinfo.value) + input.read_input_table("households") + assert "no input_table_list found" in str(excinfo.value) def test_csv_reader(seed_households, data_dir): @@ -64,16 +65,16 @@ def test_csv_reader(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable('settings', settings) + inject.add_injectable("settings", settings) - hh_file = os.path.join(data_dir, 'households.csv') + hh_file = os.path.join(data_dir, "households.csv") seed_households.to_csv(hh_file, index=False) assert os.path.isfile(hh_file) - df = input.read_input_table('households') + df = input.read_input_table("households") - assert df.index.name == 'household_id' + assert df.index.name == "household_id" def test_hdf_reader1(seed_households, data_dir): @@ -88,16 +89,16 @@ def test_hdf_reader1(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable('settings', settings) + inject.add_injectable("settings", settings) - hh_file = os.path.join(data_dir, 'households.h5') - seed_households.to_hdf(hh_file, key='households', mode='w') + hh_file = os.path.join(data_dir, "households.h5") + seed_households.to_hdf(hh_file, key="households", mode="w") assert os.path.isfile(hh_file) - df = input.read_input_table('households') + df = input.read_input_table("households") - assert df.index.name == 'household_id' + assert df.index.name == "household_id" def test_hdf_reader2(seed_households, data_dir): @@ -113,16 +114,16 @@ def test_hdf_reader2(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable('settings', settings) + inject.add_injectable("settings", settings) - hh_file = os.path.join(data_dir, 'households.h5') - seed_households.to_hdf(hh_file, key='seed_households', mode='w') + hh_file = os.path.join(data_dir, "households.h5") + seed_households.to_hdf(hh_file, key="seed_households", mode="w") assert os.path.isfile(hh_file) - df = input.read_input_table('households') + df = input.read_input_table("households") - assert df.index.name == 'household_id' + assert df.index.name == "household_id" def test_hdf_reader3(seed_households, data_dir): @@ -137,16 +138,16 @@ def test_hdf_reader3(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable('settings', settings) + inject.add_injectable("settings", settings) - hh_file = os.path.join(data_dir, 'input_data.h5') - seed_households.to_hdf(hh_file, key='households', mode='w') + hh_file = os.path.join(data_dir, "input_data.h5") + seed_households.to_hdf(hh_file, key="households", mode="w") assert os.path.isfile(hh_file) - df = input.read_input_table('households') + df = input.read_input_table("households") - assert df.index.name == 'household_id' + assert df.index.name == "household_id" def test_missing_filename(seed_households, data_dir): @@ -160,11 +161,11 @@ def test_missing_filename(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable('settings', settings) + inject.add_injectable("settings", settings) with pytest.raises(AssertionError) as excinfo: - input.read_input_table('households') - assert 'no input file provided' in str(excinfo.value) + input.read_input_table("households") + assert "no input file provided" in str(excinfo.value) def test_create_input_store(seed_households, data_dir): @@ -181,19 +182,19 @@ def test_create_input_store(seed_households, data_dir): """ settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader) - inject.add_injectable('settings', settings) + inject.add_injectable("settings", settings) - hh_file = os.path.join(data_dir, 'households.csv') + hh_file = os.path.join(data_dir, "households.csv") seed_households.to_csv(hh_file, index=False) assert os.path.isfile(hh_file) - df = input.read_input_table('households') + df = input.read_input_table("households") - assert df.index.name == 'household_id' + assert df.index.name == "household_id" - output_store = os.path.join(inject.get_injectable('output_dir'), 'input_data.h5') + output_store = os.path.join(inject.get_injectable("output_dir"), "input_data.h5") assert os.path.exists(output_store) - store_df = pd.read_hdf(output_store, 'seed_households') + store_df = pd.read_hdf(output_store, "seed_households") assert store_df.equals(seed_households) diff --git a/activitysim/core/test/test_logit.py b/activitysim/core/test/test_logit.py index 15db8a5ae0..225fdbb3c7 100644 --- a/activitysim/core/test/test_logit.py +++ b/activitysim/core/test/test_logit.py @@ -5,17 +5,15 @@ import numpy as np import pandas as pd - import pandas.testing as pdt import pytest +from .. import inject, logit from ..simulate import eval_variables -from .. import logit -from .. import inject def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) @@ -24,121 +22,130 @@ def teardown_function(func): inject.reinject_decorated_tables() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data_dir(): - return os.path.join(os.path.dirname(__file__), 'data') + return os.path.join(os.path.dirname(__file__), "data") def add_canonical_dirs(): - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), 'output') + output_dir = os.path.join(os.path.dirname(__file__), "output") inject.add_injectable("output_dir", output_dir) # this is lifted straight from urbansim's test_mnl.py -@pytest.fixture(scope='module', params=[ - ('fish.csv', - 'fish_choosers.csv', - pd.DataFrame( - [[-0.02047652], [0.95309824]], index=['price', 'catch'], - columns=['Alt']), - pd.DataFrame([ - [0.2849598, 0.2742482, 0.1605457, 0.2802463], - [0.1498991, 0.4542377, 0.2600969, 0.1357664]], - columns=['beach', 'boat', 'charter', 'pier']))]) +@pytest.fixture( + scope="module", + params=[ + ( + "fish.csv", + "fish_choosers.csv", + pd.DataFrame( + [[-0.02047652], [0.95309824]], index=["price", "catch"], columns=["Alt"] + ), + pd.DataFrame( + [ + [0.2849598, 0.2742482, 0.1605457, 0.2802463], + [0.1498991, 0.4542377, 0.2600969, 0.1357664], + ], + columns=["beach", "boat", "charter", "pier"], + ), + ) + ], +) def test_data(request): data, choosers, spec, probabilities = request.param return { - 'data': data, - 'choosers': choosers, - 'spec': spec, - 'probabilities': probabilities + "data": data, + "choosers": choosers, + "spec": spec, + "probabilities": probabilities, } @pytest.fixture def choosers(test_data, data_dir): - filen = os.path.join(data_dir, test_data['choosers']) + filen = os.path.join(data_dir, test_data["choosers"]) return pd.read_csv(filen) @pytest.fixture def spec(test_data): - return test_data['spec'] + return test_data["spec"] @pytest.fixture def utilities(choosers, spec, test_data): vars = eval_variables(spec.index, choosers) - utils = vars.dot(spec).astype('float') + utils = vars.dot(spec).astype("float") return pd.DataFrame( - utils.values.reshape(test_data['probabilities'].shape), - columns=test_data['probabilities'].columns) + utils.values.reshape(test_data["probabilities"].shape), + columns=test_data["probabilities"].columns, + ) def test_utils_to_probs(utilities, test_data): probs = logit.utils_to_probs(utilities, trace_label=None) - pdt.assert_frame_equal(probs, test_data['probabilities']) + pdt.assert_frame_equal(probs, test_data["probabilities"]) def test_utils_to_probs_raises(): add_canonical_dirs() - idx = pd.Index(name='household_id', data=[1]) + idx = pd.Index(name="household_id", data=[1]) with pytest.raises(RuntimeError) as excinfo: - logit.utils_to_probs(pd.DataFrame([[1, 2, np.inf, 3]], index=idx), trace_label=None) + logit.utils_to_probs( + pd.DataFrame([[1, 2, np.inf, 3]], index=idx), trace_label=None + ) assert "infinite exponentiated utilities" in str(excinfo.value) with pytest.raises(RuntimeError) as excinfo: - logit.utils_to_probs(pd.DataFrame([[-999, -999, -999, -999]], index=idx), trace_label=None) + logit.utils_to_probs( + pd.DataFrame([[-999, -999, -999, -999]], index=idx), trace_label=None + ) assert "all probabilities are zero" in str(excinfo.value) def test_make_choices_only_one(): probs = pd.DataFrame( - [[1, 0, 0], [0, 1, 0]], columns=['a', 'b', 'c'], index=['x', 'y']) + [[1, 0, 0], [0, 1, 0]], columns=["a", "b", "c"], index=["x", "y"] + ) choices, rands = logit.make_choices(probs) - pdt.assert_series_equal( - choices, - pd.Series([0, 1], index=['x', 'y'])) + pdt.assert_series_equal(choices, pd.Series([0, 1], index=["x", "y"])) def test_make_choices_real_probs(utilities): probs = logit.utils_to_probs(utilities, trace_label=None) choices, rands = logit.make_choices(probs) - pdt.assert_series_equal( - choices, - pd.Series([1, 2], index=[0, 1])) + pdt.assert_series_equal(choices, pd.Series([1, 2], index=[0, 1])) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def interaction_choosers(): - return pd.DataFrame({ - 'attr': ['a', 'b', 'c', 'b']}, - index=['w', 'x', 'y', 'z']) + return pd.DataFrame({"attr": ["a", "b", "c", "b"]}, index=["w", "x", "y", "z"]) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def interaction_alts(): - return pd.DataFrame({ - 'prop': [10, 20, 30, 40]}, - index=[1, 2, 3, 4]) + return pd.DataFrame({"prop": [10, 20, 30, 40]}, index=[1, 2, 3, 4]) def test_interaction_dataset_no_sample(interaction_choosers, interaction_alts): - expected = pd.DataFrame({ - 'attr': ['a'] * 4 + ['b'] * 4 + ['c'] * 4 + ['b'] * 4, - 'prop': [10, 20, 30, 40] * 4}, - index=[1, 2, 3, 4] * 4) + expected = pd.DataFrame( + { + "attr": ["a"] * 4 + ["b"] * 4 + ["c"] * 4 + ["b"] * 4, + "prop": [10, 20, 30, 40] * 4, + }, + index=[1, 2, 3, 4] * 4, + ) - interacted = logit.interaction_dataset( - interaction_choosers, interaction_alts) + interacted = logit.interaction_dataset(interaction_choosers, interaction_alts) interacted, expected = interacted.align(expected, axis=1) @@ -147,15 +154,18 @@ def test_interaction_dataset_no_sample(interaction_choosers, interaction_alts): pdt.assert_frame_equal(interacted, expected) -def test_interaction_dataset_sampled( - interaction_choosers, interaction_alts): - expected = pd.DataFrame({ - 'attr': ['a'] * 2 + ['b'] * 2 + ['c'] * 2 + ['b'] * 2, - 'prop': [30, 40, 10, 30, 40, 10, 20, 10]}, - index=[3, 4, 1, 3, 4, 1, 2, 1]) +def test_interaction_dataset_sampled(interaction_choosers, interaction_alts): + expected = pd.DataFrame( + { + "attr": ["a"] * 2 + ["b"] * 2 + ["c"] * 2 + ["b"] * 2, + "prop": [30, 40, 10, 30, 40, 10, 20, 10], + }, + index=[3, 4, 1, 3, 4, 1, 2, 1], + ) interacted = logit.interaction_dataset( - interaction_choosers, interaction_alts, sample_size=2) + interaction_choosers, interaction_alts, sample_size=2 + ) interacted, expected = interacted.align(expected, axis=1) pdt.assert_frame_equal(interacted, expected) diff --git a/activitysim/core/test/test_los.py b/activitysim/core/test/test_los.py index 22b465d331..62afbc0f3f 100644 --- a/activitysim/core/test/test_los.py +++ b/activitysim/core/test/test_los.py @@ -4,14 +4,12 @@ import os import numpy as np -import pandas as pd import numpy.testing as npt +import pandas as pd import pandas.testing as pdt import pytest - -from .. import inject -from .. import los +from .. import inject, los def teardown_function(func): @@ -21,37 +19,37 @@ def teardown_function(func): def add_canonical_dirs(configs_dir_name): - configs_dir = os.path.join(os.path.dirname(__file__), f'los/{configs_dir_name}') + configs_dir = os.path.join(os.path.dirname(__file__), f"los/{configs_dir_name}") inject.add_injectable("configs_dir", configs_dir) - data_dir = os.path.join(os.path.dirname(__file__), f'los/data') + data_dir = os.path.join(os.path.dirname(__file__), f"los/data") inject.add_injectable("data_dir", data_dir) - output_dir = os.path.join(os.path.dirname(__file__), f'output') + output_dir = os.path.join(os.path.dirname(__file__), f"output") inject.add_injectable("output_dir", output_dir) def test_legacy_configs(): - add_canonical_dirs('configs_legacy_settings') + add_canonical_dirs("configs_legacy_settings") with pytest.warns(FutureWarning): network_los = los.Network_LOS() - assert network_los.setting('zone_system') == los.ONE_ZONE + assert network_los.setting("zone_system") == los.ONE_ZONE - assert 'z1_taz_skims.omx' in network_los.omx_file_names('taz') + assert "z1_taz_skims.omx" in network_los.omx_file_names("taz") def test_one_zone(): - add_canonical_dirs('configs_1z') + add_canonical_dirs("configs_1z") network_los = los.Network_LOS() - assert network_los.setting('zone_system') == los.ONE_ZONE + assert network_los.setting("zone_system") == los.ONE_ZONE - assert 'z1_taz_skims.omx' in network_los.omx_file_names('taz') + assert "z1_taz_skims.omx" in network_los.omx_file_names("taz") network_los.load_data() @@ -60,10 +58,7 @@ def test_one_zone(): # 23000,22000,0.89,0.89 # 23000,23000,0.19,0.19 - od_df = pd.DataFrame({ - 'orig': [5, 23, 23, 23], - 'dest': [7, 20, 21, 22] - }) + od_df = pd.DataFrame({"orig": [5, 23, 23, 23], "dest": [7, 20, 21, 22]}) skim_dict = network_los.get_default_skim_dict() @@ -73,9 +68,11 @@ def test_one_zone(): # 23000,20000,2.55,2.55 # 23000,21000,1.9,1.9 # 23000,22000,0.62,0.62 - skims = skim_dict.wrap('orig', 'dest') + skims = skim_dict.wrap("orig", "dest") skims.set_df(od_df) - pdt.assert_series_equal(skims['DIST'], pd.Series([0.4, 2.55, 1.9, 0.62]).astype(np.float32)) + pdt.assert_series_equal( + skims["DIST"], pd.Series([0.4, 2.55, 1.9, 0.62]).astype(np.float32) + ) # OMAZ, DMAZ, DIST, DISTBIKE # 2000, 1000, 0.37, 0.37 @@ -83,43 +80,51 @@ def test_one_zone(): # 21000,23000,1.89,1.89 # 22000,23000,0.89,0.89 - skims = skim_dict.wrap('dest', 'orig') + skims = skim_dict.wrap("dest", "orig") skims.set_df(od_df) - pdt.assert_series_equal(skims['DIST'], pd.Series([0.46, 2.45, 1.89, 0.89]).astype(np.float32)) + pdt.assert_series_equal( + skims["DIST"], pd.Series([0.46, 2.45, 1.89, 0.89]).astype(np.float32) + ) def test_two_zone(): - add_canonical_dirs('configs_2z') + add_canonical_dirs("configs_2z") network_los = los.Network_LOS() - assert network_los.setting('zone_system') == los.TWO_ZONE + assert network_los.setting("zone_system") == los.TWO_ZONE - assert 'z2_taz_skims.omx' in network_los.omx_file_names('taz') + assert "z2_taz_skims.omx" in network_los.omx_file_names("taz") - assert network_los.blend_distance_skim_name == 'DIST' + assert network_los.blend_distance_skim_name == "DIST" network_los.load_data() skim_dict = network_los.get_default_skim_dict() # skims should be the same as maz_to_maz distances when no blending - od_df = pd.DataFrame({ - 'orig': [1000, 2000, 23000, 23000, 23000], - 'dest': [2000, 2000, 20000, 21000, 22000] - }) + od_df = pd.DataFrame( + { + "orig": [1000, 2000, 23000, 23000, 23000], + "dest": [2000, 2000, 20000, 21000, 22000], + } + ) # compare to distances from maz_to_maz table - dist = pd.Series(network_los.get_mazpairs(od_df.orig, od_df.dest, 'DIST')).astype(np.float32) + dist = pd.Series(network_los.get_mazpairs(od_df.orig, od_df.dest, "DIST")).astype( + np.float32 + ) # make sure we got the right values - pdt.assert_series_equal(dist, pd.Series([0.24, 0.14, 2.55, 1.9, 0.62]).astype(np.float32)) + pdt.assert_series_equal( + dist, pd.Series([0.24, 0.14, 2.55, 1.9, 0.62]).astype(np.float32) + ) - skims = skim_dict.wrap('orig', 'dest') + skims = skim_dict.wrap("orig", "dest") skims.set_df(od_df) # assert no blending for DISTBIKE - assert network_los.max_blend_distance.get('DISTBIKE', 0) == 0 + assert network_los.max_blend_distance.get("DISTBIKE", 0) == 0 - skim_dist = skims['DISTBIKE'] + skim_dist = skims["DISTBIKE"] print(type(skims), type(skim_dist.iloc[0])) print(type(dist.iloc[0])) @@ -127,89 +132,106 @@ def test_two_zone(): # but should be different where maz-maz distance differs from skim backstop and blending desired # blending enabled for DIST - assert network_los.max_blend_distance.get('DIST') > 0 + assert network_los.max_blend_distance.get("DIST") > 0 with pytest.raises(AssertionError) as excinfo: - pdt.assert_series_equal(skims['DIST'], dist) + pdt.assert_series_equal(skims["DIST"], dist) def test_three_zone(): - add_canonical_dirs('configs_3z') + add_canonical_dirs("configs_3z") network_los = los.Network_LOS() - assert network_los.setting('zone_system') == los.THREE_ZONE + assert network_los.setting("zone_system") == los.THREE_ZONE - assert 'z3_taz_skims.omx' in network_los.omx_file_names('taz') + assert "z3_taz_skims.omx" in network_los.omx_file_names("taz") - assert network_los.blend_distance_skim_name == 'DIST' + assert network_los.blend_distance_skim_name == "DIST" network_los.load_data() - od_df = pd.DataFrame({ - 'orig': [1000, 2000, 23000, 23000, 23000], - 'dest': [2000, 2000, 20000, 21000, 22000] - }) + od_df = pd.DataFrame( + { + "orig": [1000, 2000, 23000, 23000, 23000], + "dest": [2000, 2000, 20000, 21000, 22000], + } + ) - dist = network_los.get_mazpairs(od_df.orig, od_df.dest, 'DIST').astype(np.float32) + dist = network_los.get_mazpairs(od_df.orig, od_df.dest, "DIST").astype(np.float32) np.testing.assert_almost_equal(dist, [0.24, 0.14, 2.55, 1.9, 0.62]) def test_30_minute_windows(): - add_canonical_dirs('configs_test_misc') - network_los = los.Network_LOS(los_settings_file_name='settings_30_min.yaml') + add_canonical_dirs("configs_test_misc") + network_los = los.Network_LOS(los_settings_file_name="settings_30_min.yaml") - assert network_los.skim_time_period_label(1) == 'EA' - assert network_los.skim_time_period_label(16) == 'AM' - assert network_los.skim_time_period_label(24) == 'MD' - assert network_los.skim_time_period_label(36) == 'PM' - assert network_los.skim_time_period_label(46) == 'EV' + assert network_los.skim_time_period_label(1) == "EA" + assert network_los.skim_time_period_label(16) == "AM" + assert network_los.skim_time_period_label(24) == "MD" + assert network_los.skim_time_period_label(36) == "PM" + assert network_los.skim_time_period_label(46) == "EV" pd.testing.assert_series_equal( network_los.skim_time_period_label(pd.Series([1, 16, 24, 36, 46])), - pd.Series(['EA', 'AM', 'MD', 'PM', 'EV'])) + pd.Series(["EA", "AM", "MD", "PM", "EV"]), + ) def test_60_minute_windows(): - add_canonical_dirs('configs_test_misc') - network_los = los.Network_LOS(los_settings_file_name='settings_60_min.yaml') + add_canonical_dirs("configs_test_misc") + network_los = los.Network_LOS(los_settings_file_name="settings_60_min.yaml") - assert network_los.skim_time_period_label(1) == 'EA' - assert network_los.skim_time_period_label(8) == 'AM' - assert network_los.skim_time_period_label(12) == 'MD' - assert network_los.skim_time_period_label(18) == 'PM' - assert network_los.skim_time_period_label(23) == 'EV' + assert network_los.skim_time_period_label(1) == "EA" + assert network_los.skim_time_period_label(8) == "AM" + assert network_los.skim_time_period_label(12) == "MD" + assert network_los.skim_time_period_label(18) == "PM" + assert network_los.skim_time_period_label(23) == "EV" pd.testing.assert_series_equal( network_los.skim_time_period_label(pd.Series([1, 8, 12, 18, 23])), - pd.Series(['EA', 'AM', 'MD', 'PM', 'EV'])) + pd.Series(["EA", "AM", "MD", "PM", "EV"]), + ) def test_1_week_time_window(): - add_canonical_dirs('configs_test_misc') - network_los = los.Network_LOS(los_settings_file_name='settings_1_week.yaml') + add_canonical_dirs("configs_test_misc") + network_los = los.Network_LOS(los_settings_file_name="settings_1_week.yaml") - assert network_los.skim_time_period_label(1) == 'Sunday' - assert network_los.skim_time_period_label(2) == 'Monday' - assert network_los.skim_time_period_label(3) == 'Tuesday' - assert network_los.skim_time_period_label(4) == 'Wednesday' - assert network_los.skim_time_period_label(5) == 'Thursday' - assert network_los.skim_time_period_label(6) == 'Friday' - assert network_los.skim_time_period_label(7) == 'Saturday' + assert network_los.skim_time_period_label(1) == "Sunday" + assert network_los.skim_time_period_label(2) == "Monday" + assert network_los.skim_time_period_label(3) == "Tuesday" + assert network_los.skim_time_period_label(4) == "Wednesday" + assert network_los.skim_time_period_label(5) == "Thursday" + assert network_los.skim_time_period_label(6) == "Friday" + assert network_los.skim_time_period_label(7) == "Saturday" weekly_series = network_los.skim_time_period_label(pd.Series([1, 2, 3, 4, 5, 6, 7])) - pd.testing.assert_series_equal(weekly_series, - pd.Series(['Sunday', 'Monday', 'Tuesday', 'Wednesday', - 'Thursday', 'Friday', 'Saturday'])) + pd.testing.assert_series_equal( + weekly_series, + pd.Series( + [ + "Sunday", + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + ] + ), + ) def test_skim_time_periods_future_warning(): - add_canonical_dirs('configs_test_misc') + add_canonical_dirs("configs_test_misc") with pytest.warns(FutureWarning) as warning_test: - network_los = los.Network_LOS(los_settings_file_name='settings_legacy_hours_key.yaml') + network_los = los.Network_LOS( + los_settings_file_name="settings_legacy_hours_key.yaml" + ) diff --git a/activitysim/core/test/test_pipeline.py b/activitysim/core/test/test_pipeline.py index 24e8ec0b68..724aa18822 100644 --- a/activitysim/core/test/test_pipeline.py +++ b/activitysim/core/test/test_pipeline.py @@ -1,14 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -import os import logging -import pytest +import os +import pytest import tables -from activitysim.core import tracing -from activitysim.core import pipeline -from activitysim.core import inject +from activitysim.core import inject, pipeline, tracing from .extensions import steps @@ -21,16 +19,16 @@ def setup_function(): inject.reinject_decorated_tables() - inject.remove_injectable('skim_dict') - inject.remove_injectable('skim_stack') + inject.remove_injectable("skim_dict") + inject.remove_injectable("skim_stack") - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), 'output') + output_dir = os.path.join(os.path.dirname(__file__), "output") inject.add_injectable("output_dir", output_dir) - data_dir = os.path.join(os.path.dirname(__file__), 'data') + data_dir = os.path.join(os.path.dirname(__file__), "data") inject.add_injectable("data_dir", data_dir) inject.clear_cache() @@ -56,17 +54,17 @@ def close_handlers(): # @pytest.mark.filterwarnings('ignore::tables.NaturalNameWarning') def test_pipeline_run(): - inject.add_step('step1', steps.step1) - inject.add_step('step2', steps.step2) - inject.add_step('step3', steps.step3) - inject.add_step('step_add_col', steps.step_add_col) + inject.add_step("step1", steps.step1) + inject.add_step("step2", steps.step2) + inject.add_step("step3", steps.step3) + inject.add_step("step_add_col", steps.step_add_col) inject.dump_state() _MODELS = [ - 'step1', - 'step2', - 'step3', - 'step_add_col.table_name=table2;column_name=c2' + "step1", + "step2", + "step3", + "step_add_col.table_name=table2;column_name=c2", ] pipeline.run(models=_MODELS, resume_after=None) @@ -101,19 +99,19 @@ def test_pipeline_run(): def test_pipeline_checkpoint_drop(): - inject.add_step('step1', steps.step1) - inject.add_step('step2', steps.step2) - inject.add_step('step3', steps.step3) - inject.add_step('step_add_col', steps.step_add_col) - inject.add_step('step_forget_tab', steps.step_forget_tab) + inject.add_step("step1", steps.step1) + inject.add_step("step2", steps.step2) + inject.add_step("step3", steps.step3) + inject.add_step("step_add_col", steps.step_add_col) + inject.add_step("step_forget_tab", steps.step_forget_tab) _MODELS = [ - 'step1', - '_step2', - '_step_add_col.table_name=table2;column_name=c2', - '_step_forget_tab.table_name=table2', - 'step3', - 'step_forget_tab.table_name=table3', + "step1", + "_step2", + "_step_add_col.table_name=table2;column_name=c2", + "_step_forget_tab.table_name=table2", + "step3", + "step_forget_tab.table_name=table3", ] pipeline.run(models=_MODELS, resume_after=None) @@ -137,6 +135,7 @@ def test_pipeline_checkpoint_drop(): pipeline.close_pipeline() close_handlers() + # if __name__ == "__main__": # # print "\n\ntest_pipeline_run" diff --git a/activitysim/core/test/test_random.py b/activitysim/core/test/test_random.py index 85228f5941..a3aa4bbe13 100644 --- a/activitysim/core/test/test_random.py +++ b/activitysim/core/test/test_random.py @@ -1,8 +1,8 @@ # ActivitySim # See full license in LICENSE.txt. import numpy as np -import pandas as pd import numpy.testing as npt +import pandas as pd import pytest from activitysim.core import random @@ -14,7 +14,7 @@ def test_basic(): rng.set_base_seed(0) - rng.begin_step('test_step') + rng.begin_step("test_step") global_rng = rng.get_global_rng() @@ -34,25 +34,21 @@ def test_basic(): def test_channel(): channels = { - 'households': 'household_id', - 'persons': 'person_id', + "households": "household_id", + "persons": "person_id", } rng = random.Random() - persons = pd.DataFrame({ - "household_id": [1, 1, 2, 2, 2], - }, index=[1, 2, 3, 4, 5]) - persons.index.name = 'person_id' + persons = pd.DataFrame({"household_id": [1, 1, 2, 2, 2],}, index=[1, 2, 3, 4, 5]) + persons.index.name = "person_id" - households = pd.DataFrame({ - "data": [1, 1, 2, 2, 2], - }, index=[1, 2, 3, 4, 5]) - households.index.name = 'household_id' + households = pd.DataFrame({"data": [1, 1, 2, 2, 2],}, index=[1, 2, 3, 4, 5]) + households.index.name = "household_id" - rng.begin_step('test_step') + rng.begin_step("test_step") - rng.add_channel('persons', persons) - rng.add_channel('households', households) + rng.add_channel("persons", persons) + rng.add_channel("households", households) rands = rng.random_for_df(persons) @@ -67,9 +63,9 @@ def test_channel(): test1_expected_rands2 = [0.9105223, 0.5718418, 0.7222742, 0.9062284, 0.3929369] npt.assert_almost_equal(np.asanyarray(rands).flatten(), test1_expected_rands2) - rng.end_step('test_step') + rng.end_step("test_step") - rng.begin_step('test_step2') + rng.begin_step("test_step2") rands = rng.random_for_df(households) expected_rands = [0.417278, 0.2994774, 0.8653719, 0.4429748, 0.5101697] @@ -84,21 +80,31 @@ def test_channel(): expected_choices = [3, 1, 4, 3, 3, 2, 2, 1, 4, 2] npt.assert_almost_equal(choices, expected_choices) - rng.end_step('test_step2') + rng.end_step("test_step2") - rng.begin_step('test_step3') + rng.begin_step("test_step3") rands = rng.random_for_df(households, n=2) - expected_rands = [0.3157928, 0.3321823, 0.5194067, 0.9340083, 0.9002048, 0.8754209, - 0.3898816, 0.4101094, 0.7351484, 0.1741092] + expected_rands = [ + 0.3157928, + 0.3321823, + 0.5194067, + 0.9340083, + 0.9002048, + 0.8754209, + 0.3898816, + 0.4101094, + 0.7351484, + 0.1741092, + ] npt.assert_almost_equal(np.asanyarray(rands).flatten(), expected_rands) - rng.end_step('test_step3') + rng.end_step("test_step3") # if we use the same step name a second time, we should get the same results as before - rng.begin_step('test_step') + rng.begin_step("test_step") rands = rng.random_for_df(persons) @@ -108,4 +114,4 @@ def test_channel(): rands = rng.random_for_df(persons) npt.assert_almost_equal(np.asanyarray(rands).flatten(), test1_expected_rands2) - rng.end_step('test_step') + rng.end_step("test_step") diff --git a/activitysim/core/test/test_simulate.py b/activitysim/core/test/test_simulate.py index 3fbee00db1..9eaa66534a 100644 --- a/activitysim/core/test/test_simulate.py +++ b/activitysim/core/test/test_simulate.py @@ -3,42 +3,40 @@ import os.path -import numpy.testing as npt import numpy as np +import numpy.testing as npt import pandas as pd import pandas.testing as pdt import pytest -from .. import inject - -from .. import simulate +from .. import inject, simulate -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data_dir(): - return os.path.join(os.path.dirname(__file__), 'data') + return os.path.join(os.path.dirname(__file__), "data") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def spec_name(data_dir): - return 'sample_spec.csv' + return "sample_spec.csv" -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def spec(data_dir, spec_name): return simulate.read_model_spec(file_name=spec_name) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(data_dir): - return pd.read_csv(os.path.join(data_dir, 'data.csv')) + return pd.read_csv(os.path.join(data_dir, "data.csv")) def setup_function(): - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), f'output') + output_dir = os.path.join(os.path.dirname(__file__), f"output") inject.add_injectable("output_dir", output_dir) @@ -47,22 +45,18 @@ def test_read_model_spec(spec_name): spec = simulate.read_model_spec(file_name=spec_name) assert len(spec) == 4 - assert spec.index.name == 'Expression' - assert list(spec.columns) == ['alt0', 'alt1'] - npt.assert_array_equal( - spec.values, - [[1.1, 11], [2.2, 22], [3.3, 33], [4.4, 44]]) + assert spec.index.name == "Expression" + assert list(spec.columns) == ["alt0", "alt1"] + npt.assert_array_equal(spec.values, [[1.1, 11], [2.2, 22], [3.3, 33], [4.4, 44]]) def test_eval_variables(spec, data): result = simulate.eval_variables(spec.index, data) - expected = pd.DataFrame([ - [1, 0, 4, 1], - [0, 1, 4, 1], - [0, 1, 5, 1]], - index=data.index, columns=spec.index) + expected = pd.DataFrame( + [[1, 0, 4, 1], [0, 1, 4, 1], [0, 1, 5, 1]], index=data.index, columns=spec.index + ) expected[expected.columns[0]] = expected[expected.columns[0]].astype(np.int8) expected[expected.columns[1]] = expected[expected.columns[1]].astype(np.int8) @@ -77,7 +71,7 @@ def test_eval_variables(spec, data): def test_simple_simulate(data, spec): - inject.add_injectable("settings", {'check_for_variability': False}) + inject.add_injectable("settings", {"check_for_variability": False}) choices = simulate.simple_simulate(choosers=data, spec=spec, nest_spec=None) expected = pd.Series([1, 1, 1], index=data.index) @@ -86,8 +80,10 @@ def test_simple_simulate(data, spec): def test_simple_simulate_chunked(data, spec): - inject.add_injectable("settings", {'check_for_variability': False}) + inject.add_injectable("settings", {"check_for_variability": False}) - choices = simulate.simple_simulate(choosers=data, spec=spec, nest_spec=None, chunk_size=2) + choices = simulate.simple_simulate( + choosers=data, spec=spec, nest_spec=None, chunk_size=2 + ) expected = pd.Series([1, 1, 1], index=data.index) pdt.assert_series_equal(choices, expected) diff --git a/activitysim/core/test/test_skim.py b/activitysim/core/test/test_skim.py index 0605250378..a8ff8396c9 100644 --- a/activitysim/core/test/test_skim.py +++ b/activitysim/core/test/test_skim.py @@ -2,8 +2,8 @@ # See full license in LICENSE.txt. import numpy as np -import pandas as pd import numpy.testing as npt +import pandas as pd import pandas.testing as pdt import pytest @@ -12,7 +12,7 @@ @pytest.fixture def data(): - return np.arange(100, dtype='int').reshape((10, 10)) + return np.arange(100, dtype="int").reshape((10, 10)) class FakeSkimInfo(object): @@ -25,41 +25,30 @@ def test_skims(data): # ROW_MAJOR_LAYOUT omx_shape = (10, 10) num_skims = 2 - skim_data_shape = (num_skims, ) + omx_shape + skim_data_shape = (num_skims,) + omx_shape skim_data = np.zeros(skim_data_shape, dtype=int) skim_data[0, :, :] = data - skim_data[1, :, :] = data*10 + skim_data[1, :, :] = data * 10 skim_info = FakeSkimInfo() - skim_info.block_offsets = {'AM': 0, 'PM': 1} + skim_info.block_offsets = {"AM": 0, "PM": 1} skim_info.omx_shape = omx_shape - skim_info.dtype_name = 'int' + skim_info.dtype_name = "int" - skim_dict = skim_dictionary.SkimDict('taz', skim_info, skim_data) + skim_dict = skim_dictionary.SkimDict("taz", skim_info, skim_data) skim_dict.offset_mapper.set_offset_int(0) # default is -1 skims = skim_dict.wrap("taz_l", "taz_r") - df = pd.DataFrame({ - "taz_l": [1, 9, 4], - "taz_r": [2, 3, 7], - }) + df = pd.DataFrame({"taz_l": [1, 9, 4], "taz_r": [2, 3, 7],}) skims.set_df(df) pdt.assert_series_equal( - skims['AM'], - pd.Series( - [12, 93, 47], - index=[0, 1, 2] - ).astype(data.dtype) + skims["AM"], pd.Series([12, 93, 47], index=[0, 1, 2]).astype(data.dtype) ) pdt.assert_series_equal( - skims['PM'], - pd.Series( - [120, 930, 470], - index=[0, 1, 2] - ).astype(data.dtype) + skims["PM"], pd.Series([120, 930, 470], index=[0, 1, 2]).astype(data.dtype) ) @@ -68,34 +57,27 @@ def test_3dskims(data): # ROW_MAJOR_LAYOUT omx_shape = (10, 10) num_skims = 2 - skim_data_shape = (num_skims, ) + omx_shape + skim_data_shape = (num_skims,) + omx_shape skim_data = np.zeros(skim_data_shape, dtype=int) skim_data[0, :, :] = data - skim_data[1, :, :] = data*10 + skim_data[1, :, :] = data * 10 skim_info = FakeSkimInfo() - skim_info.block_offsets = {('SOV', 'AM'): 0, ('SOV', 'PM'): 1} + skim_info.block_offsets = {("SOV", "AM"): 0, ("SOV", "PM"): 1} skim_info.omx_shape = omx_shape - skim_info.dtype_name = 'int' - skim_info.key1_block_offsets = {'SOV': 0} + skim_info.dtype_name = "int" + skim_info.key1_block_offsets = {"SOV": 0} - skim_dict = skim_dictionary.SkimDict('taz', skim_info, skim_data) + skim_dict = skim_dictionary.SkimDict("taz", skim_info, skim_data) skim_dict.offset_mapper.set_offset_int(0) # default is -1 skims3d = skim_dict.wrap_3d(orig_key="taz_l", dest_key="taz_r", dim3_key="period") - df = pd.DataFrame({ - "taz_l": [1, 9, 4], - "taz_r": [2, 3, 7], - "period": ['AM', 'PM', 'AM'] - }) + df = pd.DataFrame( + {"taz_l": [1, 9, 4], "taz_r": [2, 3, 7], "period": ["AM", "PM", "AM"]} + ) skims3d.set_df(df) pdt.assert_series_equal( - skims3d["SOV"], - pd.Series( - [12, 930, 47], - index=[0, 1, 2] - ), - check_dtype=False + skims3d["SOV"], pd.Series([12, 930, 47], index=[0, 1, 2]), check_dtype=False ) diff --git a/activitysim/core/test/test_timetable.py b/activitysim/core/test/test_timetable.py index 9528cbfd77..17b871dcfc 100644 --- a/activitysim/core/test/test_timetable.py +++ b/activitysim/core/test/test_timetable.py @@ -2,21 +2,20 @@ # See full license in LICENSE.txt. from builtins import range + import numpy as np import pandas as pd import pandas.testing as pdt import pytest -from .. import timetable as tt from .. import chunk +from .. import timetable as tt @pytest.fixture def persons(): - df = pd.DataFrame( - index=list(range(6)) - ) + df = pd.DataFrame(index=list(range(6))) return df @@ -25,26 +24,42 @@ def persons(): def tdd_alts(): alts = pd.DataFrame( data=[ - [5, 5], [5, 6], [5, 7], [5, 8], [5, 9], [5, 10], - [6, 6], [6, 7], [6, 8], [6, 9], [6, 10], - [7, 7], [7, 8], [7, 9], [7, 10], - [8, 8], [8, 9], [8, 10], - [9, 9], [9, 10], + [5, 5], + [5, 6], + [5, 7], + [5, 8], + [5, 9], + [5, 10], + [6, 6], + [6, 7], + [6, 8], + [6, 9], + [6, 10], + [7, 7], + [7, 8], + [7, 9], + [7, 10], + [8, 8], + [8, 9], + [8, 10], + [9, 9], + [9, 10], [10, 10], ], - columns=['start', 'end']) - alts['duration'] = alts.end - alts.start + columns=["start", "end"], + ) + alts["duration"] = alts.end - alts.start return alts def test_basic(persons, tdd_alts): - with chunk.chunk_log('test_basic', base=True): + with chunk.chunk_log("test_basic", base=True): person_windows = tt.create_timetable_windows(persons, tdd_alts) - timetable = tt.TimeTable(person_windows, tdd_alts, 'person_windows') + timetable = tt.TimeTable(person_windows, tdd_alts, "person_windows") # print "\ntdd_footprints_df\n", timetable.tdd_footprints_df # 0 1 2 3 4 5 6 7 @@ -73,7 +88,7 @@ def test_basic(persons, tdd_alts): num_alts = len(tdd_alts.index) num_persons = len(persons.index) - person_ids = pd.Series(list(range(num_persons))*num_alts) + person_ids = pd.Series(list(range(num_persons)) * num_alts) tdds = pd.Series(np.repeat(list(range(num_alts)), num_persons)) assert timetable.tour_available(person_ids, tdds).all() @@ -92,15 +107,17 @@ def test_basic(persons, tdd_alts): # 5 0 0 0 0 2 7 4 0 person_ids = pd.Series([0, 1, 1, 0, 1, 3, 4]) - tdds = pd.Series([ - 0, # tdd START_END does not collide with START_END - 0, # tdd START_END does not collide with START - 6, # tdd START_END does not collide with END - 1, # tdd START does not collide with START_END - 7, # tdd START does not collide with END - 3, # tdd END does not collide with START_END - 3, # tdd END does not collide with START - ]) + tdds = pd.Series( + [ + 0, # tdd START_END does not collide with START_END + 0, # tdd START_END does not collide with START + 6, # tdd START_END does not collide with END + 1, # tdd START does not collide with START_END + 7, # tdd START does not collide with END + 3, # tdd END does not collide with START_END + 3, # tdd END does not collide with START + ] + ) assert timetable.tour_available(person_ids, tdds).all() # print "\nupdated_person_windows\n", timetable.get_person_windows_df() @@ -113,24 +130,30 @@ def test_basic(persons, tdd_alts): # 5 0 0 0 0 2 7 4 0 person_ids = pd.Series([1, 5, 2, 2]) - tdds = pd.Series([ - 1, # tdd START + END collides with START + END - 17, # START + MIDDLE + END collides with same - 6, # tdd START_END collides with MIDDLE - 1, # tdd START + END collides with START + MIDDLE - ]) + tdds = pd.Series( + [ + 1, # tdd START + END collides with START + END + 17, # START + MIDDLE + END collides with same + 6, # tdd START_END collides with MIDDLE + 1, # tdd START + END collides with START + MIDDLE + ] + ) assert not timetable.tour_available(person_ids, tdds).any() # ensure that tour_available handles heterogeneous results person_ids = pd.Series([0, 1, 1, 5]) - tdds = pd.Series([ - 0, # tdd START_END does not collide with START_END - 0, # tdd START_END does not collide with START - 1, # tdd START + END collides with START + END - 17, # START + MIDDLE + END collides with same - ]) - pdt.assert_series_equal(timetable.tour_available(person_ids, tdds), - pd.Series([True, True, False, False], index=person_ids.index)) + tdds = pd.Series( + [ + 0, # tdd START_END does not collide with START_END + 0, # tdd START_END does not collide with START + 1, # tdd START + END collides with START + END + 17, # START + MIDDLE + END collides with same + ] + ) + pdt.assert_series_equal( + timetable.tour_available(person_ids, tdds), + pd.Series([True, True, False, False], index=person_ids.index), + ) # assigning overlapping trip END,START should convert END to START_END person_ids = pd.Series([2]) @@ -185,5 +208,9 @@ def test_basic(persons, tdd_alts): person_ids = pd.Series([0, 1, 2, 3]) starts = pd.Series([9, 6, 9, 5]) ends = pd.Series([10, 10, 10, 9]) - periods_available = timetable.remaining_periods_available(person_ids, starts, ends) - pdt.assert_series_equal(periods_available, pd.Series([6, 3, 4, 3]), check_dtype=False) + periods_available = timetable.remaining_periods_available( + person_ids, starts, ends + ) + pdt.assert_series_equal( + periods_available, pd.Series([6, 3, 4, 3]), check_dtype=False + ) diff --git a/activitysim/core/test/test_tracing.py b/activitysim/core/test/test_tracing.py index 429b429ad8..df88e0d36c 100644 --- a/activitysim/core/test/test_tracing.py +++ b/activitysim/core/test/test_tracing.py @@ -1,13 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -import os.path import logging -import pytest +import os.path import pandas as pd +import pytest -from .. import tracing -from .. import inject +from .. import inject, tracing def close_handlers(): @@ -29,10 +28,10 @@ def add_canonical_dirs(): inject.clear_cache() - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), 'output') + output_dir = os.path.join(os.path.dirname(__file__), "output") inject.add_injectable("output_dir", output_dir) @@ -42,7 +41,7 @@ def test_config_logger(capsys): tracing.config_logger() - logger = logging.getLogger('activitysim') + logger = logging.getLogger("activitysim") file_handlers = [h for h in logger.handlers if type(h) is logging.FileHandler] assert len(file_handlers) == 1 @@ -50,9 +49,9 @@ def test_config_logger(capsys): print("handlers:", logger.handlers) - logger.info('test_config_logger') - logger.info('log_info') - logger.warning('log_warn1') + logger.info("test_config_logger") + logger.info("log_info") + logger.warning("log_warn1") out, err = capsys.readouterr() @@ -60,19 +59,19 @@ def test_config_logger(capsys): print(out) assert "could not find conf file" not in out - assert 'log_warn1' in out - assert 'log_info' not in out + assert "log_warn1" in out + assert "log_info" not in out close_handlers() logger = logging.getLogger(__name__) - logger.warning('log_warn2') + logger.warning("log_warn2") - with open(asim_logger_baseFilename, 'r') as content_file: + with open(asim_logger_baseFilename, "r") as content_file: content = content_file.read() print(content) - assert 'log_warn1' in content - assert 'log_warn2' not in content + assert "log_warn1" in content + assert "log_warn2" not in content def test_print_summary(capsys): @@ -81,14 +80,16 @@ def test_print_summary(capsys): tracing.config_logger() - tracing.print_summary('label', df=pd.DataFrame(), describe=False, value_counts=False) + tracing.print_summary( + "label", df=pd.DataFrame(), describe=False, value_counts=False + ) out, err = capsys.readouterr() # don't consume output print(out) - assert 'print_summary neither value_counts nor describe' in out + assert "print_summary neither value_counts nor describe" in out close_handlers() @@ -99,24 +100,24 @@ def test_register_households(capsys): tracing.config_logger() - df = pd.DataFrame({'zort': ['a', 'b', 'c']}, index=[1, 2, 3]) + df = pd.DataFrame({"zort": ["a", "b", "c"]}, index=[1, 2, 3]) - inject.add_injectable('traceable_tables', ['households']) + inject.add_injectable("traceable_tables", ["households"]) inject.add_injectable("trace_hh_id", 5) - tracing.register_traceable_table('households', df) + tracing.register_traceable_table("households", df) out, err = capsys.readouterr() # print out # don't consume output assert "Can't register table 'households' without index name" in out - df.index.name = 'household_id' - tracing.register_traceable_table('households', df) + df.index.name = "household_id" + tracing.register_traceable_table("households", df) out, err = capsys.readouterr() # print out # don't consume output # should warn that household id not in index - assert 'trace_hh_id 5 not in dataframe' in out + assert "trace_hh_id 5 not in dataframe" in out close_handlers() @@ -127,41 +128,46 @@ def test_register_tours(capsys): tracing.config_logger() - inject.add_injectable('traceable_tables', ['households', 'tours']) + inject.add_injectable("traceable_tables", ["households", "tours"]) # in case another test injected this inject.add_injectable("trace_tours", []) - inject.add_injectable("trace_hh_id", 3) # need this or register_traceable_table is a nop + inject.add_injectable( + "trace_hh_id", 3 + ) # need this or register_traceable_table is a nop - tours_df = pd.DataFrame({'zort': ['a', 'b', 'c']}, index=[10, 11, 12]) - tours_df.index.name = 'tour_id' + tours_df = pd.DataFrame({"zort": ["a", "b", "c"]}, index=[10, 11, 12]) + tours_df.index.name = "tour_id" - tracing.register_traceable_table('tours', tours_df) + tracing.register_traceable_table("tours", tours_df) out, err = capsys.readouterr() - assert "can't find a registered table to slice table 'tours' index name 'tour_id'" in out + assert ( + "can't find a registered table to slice table 'tours' index name 'tour_id'" + in out + ) inject.add_injectable("trace_hh_id", 3) - households_df = pd.DataFrame({'dzing': ['a', 'b', 'c']}, index=[1, 2, 3]) - households_df.index.name = 'household_id' - tracing.register_traceable_table('households', households_df) + households_df = pd.DataFrame({"dzing": ["a", "b", "c"]}, index=[1, 2, 3]) + households_df.index.name = "household_id" + tracing.register_traceable_table("households", households_df) - tracing.register_traceable_table('tours', tours_df) + tracing.register_traceable_table("tours", tours_df) out, err = capsys.readouterr() # print out # don't consume output assert "can't find a registered table to slice table 'tours'" in out - tours_df['household_id'] = [1, 5, 3] + tours_df["household_id"] = [1, 5, 3] - tracing.register_traceable_table('tours', tours_df) + tracing.register_traceable_table("tours", tours_df) out, err = capsys.readouterr() print(out) # don't consume output # should be tracing tour with tour_id 3 - traceable_table_ids = inject.get_injectable('traceable_table_ids') - assert traceable_table_ids['tours'] == [12] + traceable_table_ids = inject.get_injectable("traceable_table_ids") + assert traceable_table_ids["tours"] == [12] close_handlers() @@ -173,7 +179,7 @@ def test_write_csv(capsys): tracing.config_logger() # should complain if df not a DataFrame or Series - tracing.write_csv(df='not a df or series', file_name='baddie') + tracing.write_csv(df="not a df or series", file_name="baddie") out, err = capsys.readouterr() @@ -186,10 +192,10 @@ def test_write_csv(capsys): def test_slice_ids(): - df = pd.DataFrame({'household_id': [1, 2, 3]}, index=[11, 12, 13]) + df = pd.DataFrame({"household_id": [1, 2, 3]}, index=[11, 12, 13]) # slice by named column - sliced_df = tracing.slice_ids(df, [1, 3, 6], column='household_id') + sliced_df = tracing.slice_ids(df, [1, 3, 6], column="household_id") assert len(sliced_df.index) == 2 # slice by index @@ -198,7 +204,7 @@ def test_slice_ids(): # attempt to slice by non-existent column with pytest.raises(RuntimeError) as excinfo: - sliced_df = tracing.slice_ids(df, [5, 6], column='baddie') + sliced_df = tracing.slice_ids(df, [5, 6], column="baddie") assert "slice_ids slicer column 'baddie' not in dataframe" in str(excinfo.value) @@ -206,10 +212,10 @@ def test_basic(capsys): close_handlers() - configs_dir = os.path.join(os.path.dirname(__file__), 'configs') + configs_dir = os.path.join(os.path.dirname(__file__), "configs") inject.add_injectable("configs_dir", configs_dir) - output_dir = os.path.join(os.path.dirname(__file__), 'output') + output_dir = os.path.join(os.path.dirname(__file__), "output") inject.add_injectable("output_dir", output_dir) # remove existing handlers or basicConfig is a NOP @@ -221,20 +227,20 @@ def test_basic(capsys): file_handlers = [h for h in logger.handlers if type(h) is logging.FileHandler] assert len(file_handlers) == 0 - logger = logging.getLogger('activitysim') + logger = logging.getLogger("activitysim") - logger.info('test_basic') - logger.debug('log_debug') - logger.info('log_info') - logger.warning('log_warn') + logger.info("test_basic") + logger.debug("log_debug") + logger.info("log_info") + logger.warning("log_warn") out, err = capsys.readouterr() # don't consume output print(out) - assert 'log_warn' in out - assert 'log_info' in out - assert 'log_debug' not in out + assert "log_warn" in out + assert "log_info" in out + assert "log_debug" not in out close_handlers() diff --git a/activitysim/core/test/test_util.py b/activitysim/core/test/test_util.py index 3e7017dd66..086e8a0326 100644 --- a/activitysim/core/test/test_util.py +++ b/activitysim/core/test/test_util.py @@ -6,51 +6,55 @@ import pandas.testing as pdt import pytest -from ..util import reindex -from ..util import other_than -from ..util import quick_loc_series -from ..util import quick_loc_df +from ..util import other_than, quick_loc_df, quick_loc_series, reindex -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def people(): - return pd.DataFrame({ - 'household': [1, 2, 2, 3, 3, 3, 4, 4, 4, 4], - 'ptype': [1, 2, 1, 3, 1, 2, 3, 2, 2, 1]}, - index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']) + return pd.DataFrame( + { + "household": [1, 2, 2, 3, 3, 3, 4, 4, 4, 4], + "ptype": [1, 2, 1, 3, 1, 2, 3, 2, 2, 1], + }, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ) def test_other_than(people): expected = pd.Series( [False, False, True, True, True, False, True, True, True, True], - index=people.index, name='left') + index=people.index, + name="left", + ) - bools = people['ptype'] == 2 - others = other_than(people['household'], bools) + bools = people["ptype"] == 2 + others = other_than(people["household"], bools) pdt.assert_series_equal(others, expected) def test_reindex(): - s = pd.Series([.5, 1.0, 1.5], index=[2, 1, 3]) - s2 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - assert list(reindex(s, s2).values) == [1.0, .5, 1.5] + s = pd.Series([0.5, 1.0, 1.5], index=[2, 1, 3]) + s2 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + assert list(reindex(s, s2).values) == [1.0, 0.5, 1.5] def test_quick_loc_df(): - df = pd.DataFrame({'attrib': ['1', '2', '3', '4', '5']}, index=[1, 2, 3, 4, 5]) + df = pd.DataFrame({"attrib": ["1", "2", "3", "4", "5"]}, index=[1, 2, 3, 4, 5]) loc_list = np.asanyarray([2, 1, 3, 4, 4, 5, 1]) attrib_list = [str(i) for i in loc_list] - assert list(quick_loc_df(loc_list, df, 'attrib')) == attrib_list - assert list(quick_loc_df(loc_list, df, 'attrib')) == list(df.loc[loc_list]['attrib']) + assert list(quick_loc_df(loc_list, df, "attrib")) == attrib_list + assert list(quick_loc_df(loc_list, df, "attrib")) == list( + df.loc[loc_list]["attrib"] + ) def test_quick_loc_series(): - series = pd.Series(['1', '2', '3', '4', '5'], index=[1, 2, 3, 4, 5]) + series = pd.Series(["1", "2", "3", "4", "5"], index=[1, 2, 3, 4, 5]) loc_list = np.asanyarray([2, 1, 3, 4, 4, 5, 1]) attrib_list = [str(i) for i in loc_list] diff --git a/activitysim/core/test/utils_testing.py b/activitysim/core/test/utils_testing.py index d95a848621..a8a74fd3b4 100644 --- a/activitysim/core/test/utils_testing.py +++ b/activitysim/core/test/utils_testing.py @@ -34,18 +34,17 @@ def assert_frames_equal(actual, expected, use_close=False): else: comp = npt.assert_equal - assert (isinstance(actual, pd.DataFrame) and - isinstance(expected, pd.DataFrame)), \ - 'Inputs must both be pandas DataFrames.' + assert isinstance(actual, pd.DataFrame) and isinstance( + expected, pd.DataFrame + ), "Inputs must both be pandas DataFrames." for i, exp_row in expected.iterrows(): - assert i in actual.index, 'Expected row {!r} not found.'.format(i) + assert i in actual.index, "Expected row {!r} not found.".format(i) act_row = actual.loc[i] for j, exp_item in exp_row.items(): - assert j in act_row.index, \ - 'Expected column {!r} not found.'.format(j) + assert j in act_row.index, "Expected column {!r} not found.".format(j) act_item = act_row[j] @@ -53,7 +52,8 @@ def assert_frames_equal(actual, expected, use_close=False): comp(act_item, exp_item) except AssertionError as e: raise AssertionError( - str(e) + '\n\nColumn: {!r}\nRow: {!r}'.format(j, i)) + str(e) + "\n\nColumn: {!r}\nRow: {!r}".format(j, i) + ) def assert_index_equal(left, right): @@ -70,5 +70,8 @@ def assert_index_equal(left, right): left_diff = left.difference(right) right_diff = right.difference(left) if len(left_diff) > 0 or len(right_diff) > 0: - raise AssertionError("keys not in left [{0}], keys not in right [{1}]".format( - left_diff, right_diff)) + raise AssertionError( + "keys not in left [{0}], keys not in right [{1}]".format( + left_diff, right_diff + ) + ) diff --git a/activitysim/core/timetable.py b/activitysim/core/timetable.py index a793738344..fe2aeee3cf 100644 --- a/activitysim/core/timetable.py +++ b/activitysim/core/timetable.py @@ -1,16 +1,13 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import range -from builtins import object - import logging +from builtins import object, range import numpy as np import pandas as pd -from activitysim.core import pipeline -from activitysim.core import chunk +from activitysim.core import chunk, pipeline logger = logging.getLogger(__name__) @@ -31,9 +28,12 @@ [I_START, I_START], [I_END, I_END], [I_MIDDLE, I_MIDDLE], - [I_START, I_MIDDLE], [I_MIDDLE, I_START], - [I_END, I_MIDDLE], [I_MIDDLE, I_END], - [I_START_END, I_MIDDLE], [I_MIDDLE, I_START_END], + [I_START, I_MIDDLE], + [I_MIDDLE, I_START], + [I_END, I_MIDDLE], + [I_MIDDLE, I_END], + [I_START_END, I_MIDDLE], + [I_MIDDLE, I_START_END], ] COLLISION_LIST = [a + (b << I_BIT_SHIFT) for a, b in COLLISIONS] @@ -47,25 +47,25 @@ C_START_END = str(I_START_END) -def tour_map(persons, tours, tdd_alts, persons_id_col='person_id'): +def tour_map(persons, tours, tdd_alts, persons_id_col="person_id"): sigil = { - 'empty': ' ', - 'overlap': '+++', - 'work': 'WWW', - 'school': 'SSS', - 'escort': 'esc', - 'shopping': 'shp', - 'othmaint': 'mnt', - 'othdiscr': 'dsc', - 'eatout': 'eat', - 'social': 'soc', - 'eat': 'eat', - 'business': 'bus', - 'maint': 'mnt' + "empty": " ", + "overlap": "+++", + "work": "WWW", + "school": "SSS", + "escort": "esc", + "shopping": "shp", + "othmaint": "mnt", + "othdiscr": "dsc", + "eatout": "eat", + "social": "soc", + "eat": "eat", + "business": "bus", + "maint": "mnt", } - sigil_type = 'S3' + sigil_type = "S3" # we can only map scheduled tours tours = tours[tours.tdd.notnull()] @@ -76,7 +76,7 @@ def tour_map(persons, tours, tdd_alts, persons_id_col='person_id'): n_periods = max_period - min_period + 1 n_persons = len(persons.index) - agenda = np.array([sigil['empty']]*(n_periods*n_persons), dtype=sigil_type) + agenda = np.array([sigil["empty"]] * (n_periods * n_persons), dtype=sigil_type) agenda = agenda.reshape(n_persons, n_periods) scheduled = np.zeros_like(agenda, dtype=int) @@ -84,15 +84,16 @@ def tour_map(persons, tours, tdd_alts, persons_id_col='person_id'): # construct with strings so we can create runs of strings using char * int w_strings = [ - '0' * (row.start - min_period) + - '1' * (row.duration + 1) + - '0' * (max_period - row.end) - for idx, row in tdd_alts.iterrows()] + "0" * (row.start - min_period) + + "1" * (row.duration + 1) + + "0" * (max_period - row.end) + for idx, row in tdd_alts.iterrows() + ] window_periods = np.asanyarray([list(r) for r in w_strings]).astype(int) window_periods_df = pd.DataFrame(data=window_periods, index=tdd_alts.index) - for keys, nth_tours in tours.groupby(['tour_type', 'tour_type_num'], sort=True): + for keys, nth_tours in tours.groupby(["tour_type", "tour_type_num"], sort=True): tour_type = keys[0] tour_sigil = sigil[tour_type] @@ -110,10 +111,12 @@ def tour_map(persons, tours, tdd_alts, persons_id_col='person_id'): agenda[row_ixs] = np.where(tour_windows, tour_sigil, agenda[row_ixs]) # show tour overlaps - agenda = np.where(scheduled > 1, sigil['overlap'], agenda) + agenda = np.where(scheduled > 1, sigil["overlap"], agenda) # a = pd.Series([' '.join(a) for a in agenda], index=persons.index) - a = pd.DataFrame(data=agenda, columns=[str(w) for w in range(min_period, max_period+1)]) + a = pd.DataFrame( + data=agenda, columns=[str(w) for w in range(min_period, max_period + 1)] + ) a.index = persons.index a.index.name = persons_id_col @@ -157,10 +160,9 @@ def create_timetable_windows(rows, tdd_alts): UNSCHEDULED = 0 - df = pd.DataFrame(data=UNSCHEDULED, - index=rows.index, - columns=window_cols, - dtype=np.int8) + df = pd.DataFrame( + data=UNSCHEDULED, index=rows.index, columns=window_cols, dtype=np.int8 + ) return df @@ -194,21 +196,26 @@ def __init__(self, windows_df, tdd_alts_df, table_name=None): self.checkpoint_df = None # series to map window row index value to window row's ordinal index - self.window_row_ix = pd.Series(list(range(len(windows_df.index))), index=windows_df.index) + self.window_row_ix = pd.Series( + list(range(len(windows_df.index))), index=windows_df.index + ) int_time_periods = [int(c) for c in windows_df.columns.values] - self.time_ix = pd.Series(list(range(len(windows_df.columns))), index=int_time_periods) + self.time_ix = pd.Series( + list(range(len(windows_df.columns))), index=int_time_periods + ) # - pre-compute window state footprints for every tdd_alt min_period = min(int_time_periods) max_period = max(int_time_periods) # construct with strings so we can create runs of strings using char * int w_strings = [ - C_EMPTY * (row.start - min_period) + - (C_START + C_MIDDLE * (row.duration - 1) if row.duration > 0 else '') + - (C_END if row.duration > 0 else C_START_END) + - (C_EMPTY * (max_period - row.end)) - for idx, row in tdd_alts_df.iterrows()] + C_EMPTY * (row.start - min_period) + + (C_START + C_MIDDLE * (row.duration - 1) if row.duration > 0 else "") + + (C_END if row.duration > 0 else C_START_END) + + (C_EMPTY * (max_period - row.end)) + for idx, row in tdd_alts_df.iterrows() + ] # we want range index so we can use raw numpy assert (tdd_alts_df.index == list(range(tdd_alts_df.shape[0]))).all() @@ -223,7 +230,9 @@ def begin_transaction(self, transaction_loggers): if not isinstance(transaction_loggers, list): transaction_loggers = [transaction_loggers] for transaction_logger in transaction_loggers: - transaction_logger.log("timetable.begin_transaction %s" % self.windows_table_name) + transaction_logger.log( + "timetable.begin_transaction %s" % self.windows_table_name + ) self.checkpoint_df = self.windows_df.copy() self.transaction_loggers = transaction_loggers pass @@ -281,8 +290,11 @@ def replace_table(self): assert self.windows_table_name is not None if self.checkpoint_df is not None: for logger in self.transaction_loggers.values(): - logger.log("Attempt to replace_table while in transaction: %s" % - self.windows_table_name, level=logging.ERROR) + logger.log( + "Attempt to replace_table while in transaction: %s" + % self.windows_table_name, + level=logging.ERROR, + ) raise RuntimeError("Attempt to replace_table while in transaction") # get windows_df from bottleneck function in case updates to self.person_window @@ -421,7 +433,7 @@ def pairwise_available(self, window1_row_ids, window2_row_ids): available1 = (self.slice_windows_by_row_id(window1_row_ids) != I_MIDDLE) * 1 available2 = (self.slice_windows_by_row_id(window2_row_ids) != I_MIDDLE) * 1 - return (available1 * available2) + return available1 * available2 def individually_available(self, window_row_ids): @@ -445,15 +457,15 @@ def adjacent_window_run_length(self, window_row_ids, periods, before): """ assert len(window_row_ids) == len(periods) - trace_label = 'tt.adjacent_window_run_length' + trace_label = "tt.adjacent_window_run_length" with chunk.chunk_log(trace_label): time_col_ixs = periods.map(self.time_ix).values - chunk.log_df(trace_label, 'time_col_ixs', time_col_ixs) + chunk.log_df(trace_label, "time_col_ixs", time_col_ixs) # sliced windows with 1s where windows state is I_MIDDLE and 0s elsewhere available = (self.slice_windows_by_row_id(window_row_ids) != I_MIDDLE) * 1 - chunk.log_df(trace_label, 'available', available) + chunk.log_df(trace_label, "available", available) # padding periods not available available[:, 0] = 0 @@ -461,28 +473,34 @@ def adjacent_window_run_length(self, window_row_ids, periods, before): # column idxs of windows num_rows, num_cols = available.shape - time_col_ix_map = np.tile(np.arange(0, num_cols), num_rows).reshape(num_rows, num_cols) + time_col_ix_map = np.tile(np.arange(0, num_cols), num_rows).reshape( + num_rows, num_cols + ) # 0 1 2 3 4 5... # 0 1 2 3 4 5... # 0 1 2 3 4 5... - chunk.log_df(trace_label, 'time_col_ix_map', time_col_ix_map) + chunk.log_df(trace_label, "time_col_ix_map", time_col_ix_map) if before: # ones after specified time, zeroes before mask = (time_col_ix_map < time_col_ixs.reshape(num_rows, 1)) * 1 # index of first unavailable window after time - first_unavailable = np.where((1-available)*mask, time_col_ix_map, 0).max(axis=1) + first_unavailable = np.where( + (1 - available) * mask, time_col_ix_map, 0 + ).max(axis=1) available_run_length = time_col_ixs - first_unavailable - 1 else: # ones after specified time, zeroes before mask = (time_col_ix_map > time_col_ixs.reshape(num_rows, 1)) * 1 # index of first unavailable window after time - first_unavailable = np.where((1 - available) * mask, time_col_ix_map, num_cols).min(axis=1) + first_unavailable = np.where( + (1 - available) * mask, time_col_ix_map, num_cols + ).min(axis=1) available_run_length = first_unavailable - time_col_ixs - 1 - chunk.log_df(trace_label, 'mask', mask) - chunk.log_df(trace_label, 'first_unavailable', first_unavailable) - chunk.log_df(trace_label, 'available_run_length', available_run_length) + chunk.log_df(trace_label, "mask", mask) + chunk.log_df(trace_label, "first_unavailable", first_unavailable) + chunk.log_df(trace_label, "available_run_length", available_run_length) return pd.Series(available_run_length, index=window_row_ids.index) @@ -575,7 +593,9 @@ def previous_tour_ends(self, window_row_ids, periods): pandas Series boolean indexed by window_row_ids.index """ - return self.window_periods_in_states(window_row_ids, periods, [I_END, I_START_END]) + return self.window_periods_in_states( + window_row_ids, periods, [I_END, I_START_END] + ) def previous_tour_begins(self, window_row_ids, periods): """ @@ -596,7 +616,9 @@ def previous_tour_begins(self, window_row_ids, periods): indexed by window_row_ids.index """ - return self.window_periods_in_states(window_row_ids, periods, [I_START, I_START_END]) + return self.window_periods_in_states( + window_row_ids, periods, [I_START, I_START_END] + ) def remaining_periods_available(self, window_row_ids, starts, ends): """ @@ -627,7 +649,9 @@ def remaining_periods_available(self, window_row_ids, starts, ends): assert len(window_row_ids) == len(starts) assert len(window_row_ids) == len(ends) - available = (self.slice_windows_by_row_id(window_row_ids) != I_MIDDLE).sum(axis=1) + available = (self.slice_windows_by_row_id(window_row_ids) != I_MIDDLE).sum( + axis=1 + ) # don't count time window padding at both ends of day available -= 2 @@ -663,15 +687,21 @@ def max_time_block_available(self, window_row_ids): available[:, 0] = 0 available[:, -1] = 0 - diffs = np.diff(available) # 1 at start of run of availables, -1 at end, 0 everywhere else - start_row_index, starts = np.asarray(diffs > 0).nonzero() # indices of run starts + diffs = np.diff( + available + ) # 1 at start of run of availables, -1 at end, 0 everywhere else + start_row_index, starts = np.asarray( + diffs > 0 + ).nonzero() # indices of run starts end_row_index, ends = np.asarray(diffs < 0).nonzero() # indices of run ends - assert (start_row_index == end_row_index).all() # because bounded, expect same number of starts and ends + assert ( + start_row_index == end_row_index + ).all() # because bounded, expect same number of starts and ends # run_lengths like availability but with run length at start of every run and zeros elsewhere # (row_indices of starts and ends are aligned, so end - start is run_length) run_lengths = np.zeros_like(available) - run_lengths[start_row_index, starts] = (ends - starts) + run_lengths[start_row_index, starts] = ends - starts # we just want to know the the longest one for each window_row_id max_run_lengths = run_lengths.max(axis=1) diff --git a/activitysim/core/tracing.py b/activitysim/core/tracing.py index 4f6a5da36f..56c1e0812a 100644 --- a/activitysim/core/tracing.py +++ b/activitysim/core/tracing.py @@ -1,29 +1,26 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import next -from builtins import range - -import multiprocessing # for process name -import os import logging import logging.config +import multiprocessing # for process name +import os import sys import time -import yaml +from builtins import next, range import numpy as np import pandas as pd +import yaml from activitysim.core import inject from . import config - # Configurations -ASIM_LOGGER = 'activitysim' -CSV_FILE_TYPE = 'csv' -LOGGING_CONF_FILE_NAME = 'logging.yaml' +ASIM_LOGGER = "activitysim" +CSV_FILE_TYPE = "csv" +LOGGING_CONF_FILE_NAME = "logging.yaml" logger = logging.getLogger(__name__) @@ -63,11 +60,13 @@ def log_runtime(model_name, start_time=None, timing=None): process_name = multiprocessing.current_process().name # only log runtime for locutor - if config.setting('multiprocess', False) and not inject.get_injectable('locutor', False): + if config.setting("multiprocess", False) and not inject.get_injectable( + "locutor", False + ): return header = "process_name,model_name,seconds,minutes" - with config.open_log_file('timing_log.csv', 'a', header) as log_file: + with config.open_log_file("timing_log.csv", "a", header) as log_file: print(f"{process_name},{model_name},{seconds},{minutes}", file=log_file) @@ -85,10 +84,10 @@ def delete_output_files(file_type, ignore=None, subdir=None): Nothing """ - output_dir = inject.get_injectable('output_dir') + output_dir = inject.get_injectable("output_dir") subdir = [subdir] if subdir else None - directories = subdir or ['', 'log', 'trace'] + directories = subdir or ["", "log", "trace"] for subdir in directories: @@ -125,12 +124,16 @@ def delete_trace_files(): ------- Nothing """ - delete_output_files(CSV_FILE_TYPE, subdir='trace') - delete_output_files(CSV_FILE_TYPE, subdir='log') + delete_output_files(CSV_FILE_TYPE, subdir="trace") + delete_output_files(CSV_FILE_TYPE, subdir="log") - active_log_files = [h.baseFilename for h in logger.root.handlers if isinstance(h, logging.FileHandler)] + active_log_files = [ + h.baseFilename + for h in logger.root.handlers + if isinstance(h, logging.FileHandler) + ] - delete_output_files('log', ignore=active_log_files) + delete_output_files("log", ignore=active_log_files) def config_logger(basic=False): @@ -148,7 +151,9 @@ def config_logger(basic=False): if basic: log_config_file = None else: - log_config_file = config.config_file_path(LOGGING_CONF_FILE_NAME, mandatory=False) + log_config_file = config.config_file_path( + LOGGING_CONF_FILE_NAME, mandatory=False + ) if log_config_file: try: @@ -159,8 +164,8 @@ def config_logger(basic=False): raise e try: - config_dict = config_dict['logging'] - config_dict.setdefault('version', 1) + config_dict = config_dict["logging"] + config_dict.setdefault("version", 1) logging.config.dictConfig(config_dict) except Exception as e: print(f"Unable to config logging as specified in {log_config_file}") @@ -203,7 +208,9 @@ def print_summary(label, df, describe=False, value_counts=False): if value_counts: n = 10 - logger.info("%s top %s value counts:\n%s" % (label, n, df.value_counts().nlargest(n))) + logger.info( + "%s top %s value counts:\n%s" % (label, n, df.value_counts().nlargest(n)) + ) if describe: logger.info("%s summary:\n%s" % (label, df.describe())) @@ -211,10 +218,12 @@ def print_summary(label, df, describe=False, value_counts=False): def initialize_traceable_tables(): - traceable_table_ids = inject.get_injectable('traceable_table_ids', {}) + traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) if len(traceable_table_ids) > 0: - logger.debug(f"initialize_traceable_tables resetting table_ids for {list(traceable_table_ids.keys())}") - inject.add_injectable('traceable_table_ids', {}) + logger.debug( + f"initialize_traceable_tables resetting table_ids for {list(traceable_table_ids.keys())}" + ) + inject.add_injectable("traceable_table_ids", {}) def register_traceable_table(table_name, df): @@ -235,7 +244,7 @@ def register_traceable_table(table_name, df): logger.debug(f"register_traceable_table {table_name}") - traceable_tables = inject.get_injectable('traceable_tables', []) + traceable_tables = inject.get_injectable("traceable_tables", []) if table_name not in traceable_tables: logger.error("table '%s' not in traceable_tables" % table_name) return @@ -245,19 +254,26 @@ def register_traceable_table(table_name, df): logger.error("Can't register table '%s' without index name" % table_name) return - traceable_table_ids = inject.get_injectable('traceable_table_ids', {}) - traceable_table_indexes = inject.get_injectable('traceable_table_indexes', {}) - - if idx_name in traceable_table_indexes and traceable_table_indexes[idx_name] != table_name: - logger.error("table '%s' index name '%s' already registered for table '%s'" % - (table_name, idx_name, traceable_table_indexes[idx_name])) + traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) + traceable_table_indexes = inject.get_injectable("traceable_table_indexes", {}) + + if ( + idx_name in traceable_table_indexes + and traceable_table_indexes[idx_name] != table_name + ): + logger.error( + "table '%s' index name '%s' already registered for table '%s'" + % (table_name, idx_name, traceable_table_indexes[idx_name]) + ) return # update traceable_table_indexes with this traceable_table's idx_name if idx_name not in traceable_table_indexes: traceable_table_indexes[idx_name] = table_name - logger.debug("adding table %s.%s to traceable_table_indexes" % (table_name, idx_name)) - inject.add_injectable('traceable_table_indexes', traceable_table_indexes) + logger.debug( + "adding table %s.%s to traceable_table_indexes" % (table_name, idx_name) + ) + inject.add_injectable("traceable_table_indexes", traceable_table_indexes) # add any new indexes associated with trace_hh_id to traceable_table_ids @@ -266,12 +282,15 @@ def register_traceable_table(table_name, df): return new_traced_ids = [] - if table_name == 'households': + if table_name == "households": if trace_hh_id not in df.index: logger.warning("trace_hh_id %s not in dataframe" % trace_hh_id) new_traced_ids = [] else: - logger.info("tracing household id %s in %s households" % (trace_hh_id, len(df.index))) + logger.info( + "tracing household id %s in %s households" + % (trace_hh_id, len(df.index)) + ) new_traced_ids = [trace_hh_id] else: @@ -279,9 +298,11 @@ def register_traceable_table(table_name, df): ref_col = next((c for c in traceable_table_indexes if c in df.columns), None) if ref_col is None: - logger.error("can't find a registered table to slice table '%s' index name '%s'" - " in traceable_table_indexes: %s" % - (table_name, idx_name, traceable_table_indexes)) + logger.error( + "can't find a registered table to slice table '%s' index name '%s'" + " in traceable_table_indexes: %s" + % (table_name, idx_name, traceable_table_indexes) + ) return # get traceable_ids for ref_col table @@ -293,8 +314,10 @@ def register_traceable_table(table_name, df): traced_df = df[df[ref_col].isin(ref_col_traced_ids)] new_traced_ids = traced_df.index.tolist() if len(new_traced_ids) == 0: - logger.warning("register %s: no rows with %s in %s." % - (table_name, ref_col, ref_col_traced_ids)) + logger.warning( + "register %s: no rows with %s in %s." + % (table_name, ref_col, ref_col_traced_ids) + ) # update the list of trace_ids for this table prior_traced_ids = traceable_table_ids.get(table_name, []) @@ -302,15 +325,21 @@ def register_traceable_table(table_name, df): if new_traced_ids: assert not set(prior_traced_ids) & set(new_traced_ids) traceable_table_ids[table_name] = prior_traced_ids + new_traced_ids - inject.add_injectable('traceable_table_ids', traceable_table_ids) + inject.add_injectable("traceable_table_ids", traceable_table_ids) - logger.debug("register %s: added %s new ids to %s existing trace ids" % - (table_name, len(new_traced_ids), len(prior_traced_ids))) - logger.debug("register %s: tracing new ids %s in %s" % - (table_name, new_traced_ids, table_name)) + logger.debug( + "register %s: added %s new ids to %s existing trace ids" + % (table_name, len(new_traced_ids), len(prior_traced_ids)) + ) + logger.debug( + "register %s: tracing new ids %s in %s" + % (table_name, new_traced_ids, table_name) + ) -def write_df_csv(df, file_path, index_label=None, columns=None, column_labels=None, transpose=True): +def write_df_csv( + df, file_path, index_label=None, columns=None, column_labels=None, transpose=True +): need_header = not os.path.isfile(file_path) @@ -319,7 +348,7 @@ def write_df_csv(df, file_path, index_label=None, columns=None, column_labels=No if not transpose: want_index = isinstance(df.index, pd.MultiIndex) or df.index.name is not None - df.to_csv(file_path, mode='a', index=want_index, header=need_header) + df.to_csv(file_path, mode="a", index=want_index, header=need_header) return df_t = df.transpose() if df.index.name in df else df.reset_index().transpose() @@ -332,24 +361,33 @@ def write_df_csv(df, file_path, index_label=None, columns=None, column_labels=No if column_labels is None: column_labels = [None, None] if column_labels[0] is None: - column_labels[0] = 'label' + column_labels[0] = "label" if column_labels[1] is None: - column_labels[1] = 'value' + column_labels[1] = "value" if len(df_t.columns) == len(column_labels) - 1: - column_label_row = ','.join(column_labels) + column_label_row = ",".join(column_labels) else: - column_label_row = \ - column_labels[0] + ',' \ - + ','.join([column_labels[1] + '_' + str(i+1) for i in range(len(df_t.columns))]) + column_label_row = ( + column_labels[0] + + "," + + ",".join( + [ + column_labels[1] + "_" + str(i + 1) + for i in range(len(df_t.columns)) + ] + ) + ) - with open(file_path, mode='a') as f: - f.write(column_label_row + '\n') + with open(file_path, mode="a") as f: + f.write(column_label_row + "\n") - df_t.to_csv(file_path, mode='a', index=True, header=False) + df_t.to_csv(file_path, mode="a", index=True, header=False) -def write_series_csv(series, file_path, index_label=None, columns=None, column_labels=None): +def write_series_csv( + series, file_path, index_label=None, columns=None, column_labels=None +): if isinstance(columns, str): series = series.rename(columns) @@ -361,10 +399,12 @@ def write_series_csv(series, file_path, index_label=None, columns=None, column_l series.index.name = index_label need_header = not os.path.isfile(file_path) - series.to_csv(file_path, mode='a', index=True, header=need_header) + series.to_csv(file_path, mode="a", index=True, header=need_header) -def write_csv(df, file_name, index_label=None, columns=None, column_labels=None, transpose=True): +def write_csv( + df, file_name, index_label=None, columns=None, column_labels=None, transpose=True +): """ Print write_csv @@ -387,12 +427,12 @@ def write_csv(df, file_name, index_label=None, columns=None, column_labels=None, assert len(file_name) > 0 - if not file_name.endswith('.%s' % CSV_FILE_TYPE): - file_name = '%s.%s' % (file_name, CSV_FILE_TYPE) + if not file_name.endswith(".%s" % CSV_FILE_TYPE): + file_name = "%s.%s" % (file_name, CSV_FILE_TYPE) file_path = config.trace_file_path(file_name) - if os.name == 'nt': + if os.name == "nt": abs_path = os.path.abspath(file_path) if len(abs_path) > 255: msg = f"path length ({len(abs_path)}) may exceed Windows maximum length unless LongPathsEnabled: {abs_path}" @@ -403,7 +443,9 @@ def write_csv(df, file_name, index_label=None, columns=None, column_labels=None, if isinstance(df, pd.DataFrame): # logger.debug("dumping %s dataframe to %s" % (df.shape, file_name)) - write_df_csv(df, file_path, index_label, columns, column_labels, transpose=transpose) + write_df_csv( + df, file_path, index_label, columns, column_labels, transpose=transpose + ) elif isinstance(df, pd.Series): # logger.debug("dumping %s element series to %s" % (df.shape[0], file_name)) write_series_csv(df, file_path, index_label, columns, column_labels) @@ -412,8 +454,10 @@ def write_csv(df, file_name, index_label=None, columns=None, column_labels=None, # logger.debug("dumping %s element dict to %s" % (df.shape[0], file_name)) write_series_csv(df, file_path, index_label, columns, column_labels) else: - logger.error("write_csv object for file_name '%s' of unexpected type: %s" % - (file_name, type(df))) + logger.error( + "write_csv object for file_name '%s' of unexpected type: %s" + % (file_name, type(df)) + ) def slice_ids(df, ids, column=None): @@ -475,7 +519,7 @@ def get_trace_target(df, slicer, column=None): target_ids = None # id or ids to slice by (e.g. hh_id or person_ids or tour_ids) # special do-not-slice code for dumping entire df - if slicer == 'NONE': + if slicer == "NONE": return target_ids, column if slicer is None: @@ -483,16 +527,18 @@ def get_trace_target(df, slicer, column=None): if isinstance(df, pd.DataFrame): # always slice by household id if we can - if 'household_id' in df.columns: - slicer = 'household_id' + if "household_id" in df.columns: + slicer = "household_id" if slicer in df.columns: column = slicer if column is None and df.index.name != slicer: - raise RuntimeError("bad slicer '%s' for df with index '%s'" % (slicer, df.index.name)) + raise RuntimeError( + "bad slicer '%s' for df with index '%s'" % (slicer, df.index.name) + ) - traceable_table_indexes = inject.get_injectable('traceable_table_indexes', {}) - traceable_table_ids = inject.get_injectable('traceable_table_ids', {}) + traceable_table_indexes = inject.get_injectable("traceable_table_indexes", {}) + traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) if df.empty: target_ids = None @@ -500,8 +546,8 @@ def get_trace_target(df, slicer, column=None): # maps 'person_id' to 'persons', etc table_name = traceable_table_indexes[slicer] target_ids = traceable_table_ids.get(table_name, []) - elif slicer == 'zone_id': - target_ids = inject.get_injectable('trace_od', []) + elif slicer == "zone_id": + target_ids = inject.get_injectable("trace_od", []) return target_ids, column @@ -552,10 +598,10 @@ def hh_id_for_chooser(id, choosers): scalar household_id or series of household_ids """ - if choosers.index.name == 'household_id': + if choosers.index.name == "household_id": hh_id = id - elif 'household_id' in choosers.columns: - hh_id = choosers.loc[id]['household_id'] + elif "household_id" in choosers.columns: + hh_id = choosers.loc[id]["household_id"] else: print(": hh_id_for_chooser: nada:\n%s" % choosers.columns) hh_id = None @@ -577,7 +623,7 @@ def trace_id_for_chooser(id, choosers): """ hh_id = None - for column_name in ['household_id', 'person_id']: + for column_name in ["household_id", "person_id"]: if choosers.index.name == column_name: hh_id = id break @@ -593,12 +639,22 @@ def trace_id_for_chooser(id, choosers): def dump_df(dump_switch, df, trace_label, fname): if dump_switch: - trace_label = extend_trace_label(trace_label, 'DUMP.%s' % fname) - trace_df(df, trace_label, index_label=df.index.name, slicer='NONE', transpose=False) - - -def trace_df(df, label, slicer=None, columns=None, - index_label=None, column_labels=None, transpose=True, warn_if_empty=False): + trace_label = extend_trace_label(trace_label, "DUMP.%s" % fname) + trace_df( + df, trace_label, index_label=df.index.name, slicer="NONE", transpose=False + ) + + +def trace_df( + df, + label, + slicer=None, + columns=None, + index_label=None, + column_labels=None, + transpose=True, + warn_if_empty=False, +): """ Slice dataframe by traced household or person id dataframe and write to CSV @@ -633,12 +689,20 @@ def trace_df(df, label, slicer=None, columns=None, if warn_if_empty and df.shape[0] == 0 and target_ids != []: column_name = column or slicer - logger.warning("slice_canonically: no rows in %s with %s == %s" - % (label, column_name, target_ids)) + logger.warning( + "slice_canonically: no rows in %s with %s == %s" + % (label, column_name, target_ids) + ) if df.shape[0] > 0: - write_csv(df, file_name=label, index_label=(index_label or slicer), columns=columns, - column_labels=column_labels, transpose=transpose) + write_csv( + df, + file_name=label, + index_label=(index_label or slicer), + columns=columns, + column_labels=column_labels, + transpose=transpose, + ) def interaction_trace_rows(interaction_df, choosers, sample_size=None): @@ -668,21 +732,23 @@ def interaction_trace_rows(interaction_df, choosers, sample_size=None): # slicer column name and id targets to use for chooser id added to model_design dataframe # currently we only ever slice by person_id, but that could change, so we check here... - traceable_table_ids = inject.get_injectable('traceable_table_ids', {}) + traceable_table_ids = inject.get_injectable("traceable_table_ids", {}) - if choosers.index.name == 'person_id' and 'persons' in traceable_table_ids: + if choosers.index.name == "person_id" and "persons" in traceable_table_ids: slicer_column_name = choosers.index.name - targets = traceable_table_ids['persons'] - elif 'household_id' in choosers.columns and 'households' in traceable_table_ids: - slicer_column_name = 'household_id' - targets = traceable_table_ids['households'] - elif 'person_id' in choosers.columns and 'persons' in traceable_table_ids: - slicer_column_name = 'person_id' - targets = traceable_table_ids['persons'] + targets = traceable_table_ids["persons"] + elif "household_id" in choosers.columns and "households" in traceable_table_ids: + slicer_column_name = "household_id" + targets = traceable_table_ids["households"] + elif "person_id" in choosers.columns and "persons" in traceable_table_ids: + slicer_column_name = "person_id" + targets = traceable_table_ids["persons"] else: print(choosers.columns) - raise RuntimeError("interaction_trace_rows don't know how to slice index '%s'" - % choosers.index.name) + raise RuntimeError( + "interaction_trace_rows don't know how to slice index '%s'" + % choosers.index.name + ) if sample_size is None: # if sample size not constant, we count on either @@ -701,11 +767,11 @@ def interaction_trace_rows(interaction_df, choosers, sample_size=None): if slicer_column_name == choosers.index.name: trace_rows = np.in1d(choosers.index, targets) trace_ids = np.asanyarray(choosers[trace_rows].index) - elif slicer_column_name == 'person_id': - trace_rows = np.in1d(choosers['person_id'], targets) + elif slicer_column_name == "person_id": + trace_rows = np.in1d(choosers["person_id"], targets) trace_ids = np.asanyarray(choosers[trace_rows].person_id) - elif slicer_column_name == 'household_id': - trace_rows = np.in1d(choosers['household_id'], targets) + elif slicer_column_name == "household_id": + trace_rows = np.in1d(choosers["household_id"], targets) trace_ids = np.asanyarray(choosers[trace_rows].household_id) else: assert False @@ -754,7 +820,7 @@ def trace_interaction_eval_results(trace_results, trace_ids, label): return # write out the raw dataframe - file_path = config.trace_file_path('%s.raw.csv' % label) + file_path = config.trace_file_path("%s.raw.csv" % label) trace_results.to_csv(file_path, mode="a", index=True, header=True) # if there are multiple targets, we want them in separate tables for readability @@ -768,14 +834,16 @@ def trace_interaction_eval_results(trace_results, trace_ids, label): # # remove the slicer (person_id or hh_id) column? # del df_target[slicer_column_name] - target_label = '%s.%s.%s' % (label, slicer_column_name, target) + target_label = "%s.%s.%s" % (label, slicer_column_name, target) - trace_df(df_target, - label=target_label, - slicer="NONE", - transpose=True, - column_labels=['expression', None], - warn_if_empty=False) + trace_df( + df_target, + label=target_label, + slicer="NONE", + transpose=True, + column_labels=["expression", None], + warn_if_empty=False, + ) def no_results(trace_label): diff --git a/activitysim/core/util.py b/activitysim/core/util.py index 1351023f37..9a2f5c18d5 100644 --- a/activitysim/core/util.py +++ b/activitysim/core/util.py @@ -1,28 +1,26 @@ # ActivitySim # See full license in LICENSE.txt. -from builtins import zip import logging import os - +from builtins import zip from operator import itemgetter -import numpy as np -import pandas as pd - import cytoolz as tz import cytoolz.curried +import numpy as np +import pandas as pd logger = logging.getLogger(__name__) -def si_units(x, kind='B', digits=3, shift=1000): +def si_units(x, kind="B", digits=3, shift=1000): # nano micro milli kilo mega giga tera peta exa zeta yotta - tiers = ['n', 'µ', 'm', '', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] + tiers = ["n", "µ", "m", "", "K", "M", "G", "T", "P", "E", "Z", "Y"] tier = 3 - sign = '-' if x < 0 else '' + sign = "-" if x < 0 else "" x = abs(x) if x > 0: while x > shift and tier < len(tiers): @@ -35,18 +33,18 @@ def si_units(x, kind='B', digits=3, shift=1000): def GB(bytes): - return si_units(bytes, kind='B', digits=1) + return si_units(bytes, kind="B", digits=1) def SEC(seconds): - return si_units(seconds, kind='s', digits=2) + return si_units(seconds, kind="s", digits=2) def INT(x): # format int as camel case (e.g. 1000000 vecomes '1_000_000') negative = x < 0 x = abs(int(x)) - result = '' + result = "" while x >= 1000: x, r = divmod(x, 1000) result = "_%03d%s" % (r, result) @@ -123,12 +121,12 @@ def left_merge_on_index_and_col(left_df, right_df, join_col, target_col): idx_col = right_df.index.name # SELECT target_col FROM full_sample LEFT JOIN unique_sample on idx_col, join_col - merged = \ - pd.merge( - left_df[[join_col]].reset_index(), - right_df[[join_col, target_col]].reset_index(), - on=[idx_col, join_col], - how="left") + merged = pd.merge( + left_df[[join_col]].reset_index(), + right_df[[join_col, target_col]].reset_index(), + on=[idx_col, join_col], + how="left", + ) merged.set_index(idx_col, inplace=True) @@ -171,11 +169,13 @@ def reindex(series1, series2): """ # turns out the merge is much faster than the .loc below - df = pd.merge(series2.to_frame(name='left'), - series1.to_frame(name='right'), - left_on="left", - right_index=True, - how="left") + df = pd.merge( + series2.to_frame(name="left"), + series1.to_frame(name="right"), + left_on="left", + right_index=True, + how="left", + ) return df.right # return pd.Series(series1.loc[series2.values].values, index=series2.index) @@ -213,14 +213,19 @@ def other_than(groups, bools): """ counts = groups[bools].value_counts() - merge_col = groups.to_frame(name='right') + merge_col = groups.to_frame(name="right") pipeline = tz.compose( tz.curry(pd.Series.fillna, value=False), - itemgetter('left'), + itemgetter("left"), tz.curry( - pd.DataFrame.merge, right=merge_col, how='right', left_index=True, - right_on='right'), - tz.curry(pd.Series.to_frame, name='left')) + pd.DataFrame.merge, + right=merge_col, + how="right", + left_index=True, + right_on="right", + ), + tz.curry(pd.Series.to_frame, name="left"), + ) gt0 = pipeline(counts > 0) gt1 = pipeline(counts > 1) @@ -283,13 +288,17 @@ def quick_loc_series(loc_list, target_series): elif isinstance(loc_list, np.ndarray) or isinstance(loc_list, list): left_df = pd.DataFrame({left_on: loc_list}) else: - raise RuntimeError("quick_loc_series loc_list of unexpected type %s" % type(loc_list)) - - df = pd.merge(left_df, - target_series.to_frame(name='right'), - left_on=left_on, - right_index=True, - how="left") + raise RuntimeError( + "quick_loc_series loc_list of unexpected type %s" % type(loc_list) + ) + + df = pd.merge( + left_df, + target_series.to_frame(name="right"), + left_on=left_on, + right_index=True, + how="left", + ) # regression test # assert list(df.right) == list(target_series.loc[loc_list]) @@ -313,7 +322,7 @@ def assign_in_place(df, df2): """ # expect no rows in df2 that are not in df - assert (len(df2.index.difference(df.index)) == 0) + assert len(df2.index.difference(df.index)) == 0 # update common columns in place common_columns = df2.columns.intersection(df.columns) @@ -331,18 +340,24 @@ def assign_in_place(df, df2): try: df[c] = df[c].astype(old_dtype) except ValueError: - logger.warning("assign_in_place changed dtype %s of column %s to %s" % - (old_dtype, c, df[c].dtype)) + logger.warning( + "assign_in_place changed dtype %s of column %s to %s" + % (old_dtype, c, df[c].dtype) + ) # if both df and df2 column were ints, but result is not - if np.issubdtype(old_dtype, np.integer) \ - and np.issubdtype(df2[c].dtype, np.integer) \ - and not np.issubdtype(df[c].dtype, np.integer): + if ( + np.issubdtype(old_dtype, np.integer) + and np.issubdtype(df2[c].dtype, np.integer) + and not np.issubdtype(df[c].dtype, np.integer) + ): try: df[c] = df[c].astype(old_dtype) except ValueError: - logger.warning("assign_in_place changed dtype %s of column %s to %s" % - (old_dtype, c, df[c].dtype)) + logger.warning( + "assign_in_place changed dtype %s of column %s to %s" + % (old_dtype, c, df[c].dtype) + ) # add new columns (in order they appear in df2) new_columns = [c for c in df2.columns if c not in df.columns] diff --git a/activitysim/estimation/larch/auto_ownership.py b/activitysim/estimation/larch/auto_ownership.py index cec449db4b..06cd7f34a2 100644 --- a/activitysim/estimation/larch/auto_ownership.py +++ b/activitysim/estimation/larch/auto_ownership.py @@ -1,18 +1,18 @@ import os +from typing import Collection + import numpy as np import pandas as pd import yaml -from typing import Collection +from larch import DataFrames, Model, P, X from larch.util import Dict -from .simple_simulate import simple_simulate_data - from .general import ( - remove_apostrophes, apply_coefficients, dict_of_linear_utility_from_spec, + remove_apostrophes, ) -from larch import Model, DataFrames, P, X +from .simple_simulate import simple_simulate_data def auto_ownership_model( diff --git a/activitysim/estimation/larch/cdap.py b/activitysim/estimation/larch/cdap.py index c3b4a22186..4c82e87b6e 100644 --- a/activitysim/estimation/larch/cdap.py +++ b/activitysim/estimation/larch/cdap.py @@ -1,22 +1,22 @@ -import numpy as np -import pandas as pd -import re +import importlib import itertools -from larch import P, X, DataFrames, Model -from larch.model.model_group import ModelGroup -from larch.util import Dict -import larch +import logging import os +import re +from pathlib import Path + +import larch +import numpy as np +import pandas as pd import yaml -import importlib -import logging +from larch import DataFrames, Model, P, X from larch.log import logger_name -from pathlib import Path +from larch.model.model_group import ModelGroup +from larch.util import Dict from ...abm.models.util import cdap from .general import apply_coefficients, explicit_value_parameters - _logger = logging.getLogger(logger_name) @@ -322,7 +322,7 @@ def read_yaml(filename, **kwargs): except FileNotFoundError: persons = pd.read_csv(persons_file) - person_type_map = settings.get('PERSON_TYPE_MAP') + person_type_map = settings.get("PERSON_TYPE_MAP") if person_type_map is None: raise KeyError("PERSON_TYPE_MAP missing from cdap_settings.yaml") diff --git a/activitysim/estimation/larch/general.py b/activitysim/estimation/larch/general.py index 5730e96faa..7c0f498624 100644 --- a/activitysim/estimation/larch/general.py +++ b/activitysim/estimation/larch/general.py @@ -1,18 +1,18 @@ +import itertools +import logging +import os +import re +from pathlib import Path +from typing import Mapping + import numpy as np import pandas as pd -import re -import os import yaml -import itertools -from typing import Mapping -from larch import P, X, DataFrames, Model +from larch import DataFrames, Model, P, X +from larch.log import logger_name from larch.model.abstract_model import AbstractChoiceModel from larch.model.tree import NestingTree from larch.util import Dict -from pathlib import Path - -import logging -from larch.log import logger_name _logger = logging.getLogger(logger_name) @@ -318,7 +318,10 @@ def apply_coefficients(coefficients, model, minimum=None, maximum=None): assert "value" in coefficients.columns if "constrain" not in coefficients.columns: import warnings - warnings.warn("coefficient dataframe missing 'constrain' column, setting all to 'F'") + + warnings.warn( + "coefficient dataframe missing 'constrain' column, setting all to 'F'" + ) coefficients["constrain"] = "F" assert coefficients.index.name == "coefficient_name" assert isinstance(model, AbstractChoiceModel) @@ -475,7 +478,7 @@ def clean_values( return values -def update_coefficients(model, data, result_dir=Path('.'), output_file=None): +def update_coefficients(model, data, result_dir=Path("."), output_file=None): if isinstance(data, pd.DataFrame): coefficients = data.copy() else: @@ -485,7 +488,6 @@ def update_coefficients(model, data, result_dir=Path('.'), output_file=None): if output_file is not None: os.makedirs(result_dir, exist_ok=True) coefficients.reset_index().to_csv( - result_dir/output_file, - index=False, + result_dir / output_file, index=False, ) return coefficients diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py index 6ee8c32053..0a5009b3b0 100644 --- a/activitysim/estimation/larch/location_choice.py +++ b/activitysim/estimation/larch/location_choice.py @@ -1,21 +1,22 @@ import os +from pathlib import Path +from typing import Collection + import numpy as np import pandas as pd import yaml -from typing import Collection +from larch import DataFrames, Model, P, X from larch.util import Dict -from pathlib import Path from .general import ( - remove_apostrophes, - construct_nesting_tree, - linear_utility_from_spec, - explicit_value_parameters, apply_coefficients, + construct_nesting_tree, cv_to_ca, + explicit_value_parameters, + linear_utility_from_spec, + remove_apostrophes, str_repr, ) -from larch import Model, DataFrames, P, X def size_coefficients_from_spec(size_spec): @@ -48,8 +49,8 @@ def location_choice_model( model_selector = model_selector.replace("_destination", "") model_selector = model_selector.replace("_subtour", "") model_selector = model_selector.replace("_tour", "") - if model_selector == 'joint': - model_selector = 'non_mandatory' + if model_selector == "joint": + model_selector = "non_mandatory" edb_directory = edb_directory.format(name=name) def _read_csv(filename, **kwargs): @@ -77,7 +78,7 @@ def _read_csv(filename, **kwargs): include_settings = os.path.join(edb_directory, include_settings) if include_settings and os.path.exists(include_settings): with open(include_settings, "r") as yf: - more_settings = yaml.load(yf, Loader=yaml.SafeLoader, ) + more_settings = yaml.load(yf, Loader=yaml.SafeLoader,) settings.update(more_settings) CHOOSER_SEGMENT_COLUMN_NAME = settings.get("CHOOSER_SEGMENT_COLUMN_NAME") @@ -87,7 +88,7 @@ def _read_csv(filename, **kwargs): if SEGMENTS is not None: SEGMENT_IDS = {i: i for i in SEGMENTS} - SIZE_TERM_SELECTOR = settings.get('SIZE_TERM_SELECTOR', model_selector) + SIZE_TERM_SELECTOR = settings.get("SIZE_TERM_SELECTOR", model_selector) # filter size spec for this location choice only size_spec = ( @@ -100,41 +101,39 @@ def _read_csv(filename, **kwargs): size_coef = size_coefficients_from_spec(size_spec) indexes_to_drop = [ - "util_size_variable", # pre-computed size (will be re-estimated) - "util_size_variable_atwork", # pre-computed size (will be re-estimated) - "util_utility_adjustment", # shadow pricing (ignored in estimation) - "@df['size_term'].apply(np.log1p)", # pre-computed size (will be re-estimated) + "util_size_variable", # pre-computed size (will be re-estimated) + "util_size_variable_atwork", # pre-computed size (will be re-estimated) + "util_utility_adjustment", # shadow pricing (ignored in estimation) + "@df['size_term'].apply(np.log1p)", # pre-computed size (will be re-estimated) ] - if 'Label' in spec.columns: + if "Label" in spec.columns: indexes_to_drop = [i for i in indexes_to_drop if i in spec.Label.to_numpy()] - label_column_name = 'Label' - elif 'Expression' in spec.columns: - indexes_to_drop = [i for i in indexes_to_drop if i in spec.Expression.to_numpy()] - label_column_name = 'Expression' + label_column_name = "Label" + elif "Expression" in spec.columns: + indexes_to_drop = [ + i for i in indexes_to_drop if i in spec.Expression.to_numpy() + ] + label_column_name = "Expression" else: raise ValueError("cannot find Label or Expression in spec file") expression_labels = None - if label_column_name == 'Expression': + if label_column_name == "Expression": expression_labels = { expr: f"variable_label{n:04d}" for n, expr in enumerate(spec.Expression.to_numpy()) } # Remove shadow pricing and pre-existing size expression for re-estimation - spec = ( - spec.set_index(label_column_name) - .drop(index=indexes_to_drop) - .reset_index() - ) + spec = spec.set_index(label_column_name).drop(index=indexes_to_drop).reset_index() - if label_column_name == 'Expression': - spec.insert(0, "Label", spec['Expression'].map(expression_labels)) - alt_values['variable'] = alt_values['variable'].map(expression_labels) + if label_column_name == "Expression": + spec.insert(0, "Label", spec["Expression"].map(expression_labels)) + alt_values["variable"] = alt_values["variable"].map(expression_labels) label_column_name = "Label" - if name == 'trip_destination': - CHOOSER_SEGMENT_COLUMN_NAME = 'primary_purpose' + if name == "trip_destination": + CHOOSER_SEGMENT_COLUMN_NAME = "primary_purpose" primary_purposes = spec.columns[3:] SEGMENT_IDS = {pp: pp for pp in primary_purposes} @@ -181,28 +180,46 @@ def _read_csv(filename, **kwargs): except KeyError: # Missing the zone_id variable? # Use the alternative id's instead, which assumes no sampling of alternatives - x_ca_1 = pd.merge(x_ca, landuse, left_on=x_ca.index.get_level_values(1), right_index=True, how="left") + x_ca_1 = pd.merge( + x_ca, + landuse, + left_on=x_ca.index.get_level_values(1), + right_index=True, + how="left", + ) x_ca_1.index = x_ca.index # Availability of choice zones if "util_no_attractions" in x_ca_1: - av = x_ca_1["util_no_attractions"].apply(lambda x: False if x == 1 else True).astype(np.int8) + av = ( + x_ca_1["util_no_attractions"] + .apply(lambda x: False if x == 1 else True) + .astype(np.int8) + ) elif "@df['size_term']==0" in x_ca_1: - av = x_ca_1["@df['size_term']==0"].apply(lambda x: False if x == 1 else True).astype(np.int8) + av = ( + x_ca_1["@df['size_term']==0"] + .apply(lambda x: False if x == 1 else True) + .astype(np.int8) + ) else: av = 1 d = DataFrames(co=x_co, ca=x_ca_1, av=av) m = Model(dataservice=d) - if len(spec.columns) == 4 and all(spec.columns == ['Label', 'Description', 'Expression', 'coefficient']): + if len(spec.columns) == 4 and all( + spec.columns == ["Label", "Description", "Expression", "coefficient"] + ): m.utility_ca = linear_utility_from_spec( spec, x_col="Label", p_col=spec.columns[-1], ignore_x=("local_dist",), ) - elif len(spec.columns) == 4 \ - and all(spec.columns[:3] == ['Label', 'Description', 'Expression']) \ - and len(SEGMENT_IDS) == 1 \ - and spec.columns[3] == list(SEGMENT_IDS.values())[0]: + elif ( + len(spec.columns) == 4 + and all(spec.columns[:3] == ["Label", "Description", "Expression"]) + and len(SEGMENT_IDS) == 1 + and spec.columns[3] == list(SEGMENT_IDS.values())[0] + ): m.utility_ca = linear_utility_from_spec( spec, x_col="Label", p_col=spec.columns[-1], ignore_x=("local_dist",), ) @@ -225,7 +242,9 @@ def _read_csv(filename, **kwargs): ) else: m.quantity_ca = sum( - P(f"{i}_{q}") * X(q) * X(f"{CHOOSER_SEGMENT_COLUMN_NAME}=={str_repr(SEGMENT_IDS[i])}") + P(f"{i}_{q}") + * X(q) + * X(f"{CHOOSER_SEGMENT_COLUMN_NAME}=={str_repr(SEGMENT_IDS[i])}") for i in size_spec.index for q in size_spec.columns if size_spec.loc[i, q] != 0 @@ -256,7 +275,7 @@ def _read_csv(filename, **kwargs): return m -def update_size_spec(model, data, result_dir=Path('.'), output_file=None): +def update_size_spec(model, data, result_dir=Path("."), output_file=None): master_size_spec = data.master_size_spec size_spec = data.size_spec model_selector = data.model_selector @@ -265,7 +284,9 @@ def update_size_spec(model, data, result_dir=Path('.'), output_file=None): for c in size_spec.columns: for i in size_spec.index: param_name = f"{i}_{c}" - j = (master_size_spec['segment'] == i) & (master_size_spec['model_selector'] == model_selector) + j = (master_size_spec["segment"] == i) & ( + master_size_spec["model_selector"] == model_selector + ) try: master_size_spec.loc[j, c] = np.exp(model.get_value(param_name)) except KeyError: @@ -273,38 +294,30 @@ def update_size_spec(model, data, result_dir=Path('.'), output_file=None): # Rescale each row to total 1, not mathematically needed # but to maintain a consistent approach from existing ASim - master_size_spec.iloc[:, 2:] = ( - master_size_spec.iloc[:, 2:].div(master_size_spec.iloc[:, 2:].sum(1), axis=0) + master_size_spec.iloc[:, 2:] = master_size_spec.iloc[:, 2:].div( + master_size_spec.iloc[:, 2:].sum(1), axis=0 ) if output_file is not None: os.makedirs(result_dir, exist_ok=True) master_size_spec.reset_index().to_csv( - result_dir/output_file, - index=False, + result_dir / output_file, index=False, ) return master_size_spec def workplace_location_model(return_data=False): - return location_choice_model( - name="workplace_location", - return_data=return_data, - ) + return location_choice_model(name="workplace_location", return_data=return_data,) def school_location_model(return_data=False): - return location_choice_model( - name="school_location", - return_data=return_data, - ) + return location_choice_model(name="school_location", return_data=return_data,) def atwork_subtour_destination_model(return_data=False): return location_choice_model( - name="atwork_subtour_destination", - return_data=return_data, + name="atwork_subtour_destination", return_data=return_data, ) @@ -320,13 +333,9 @@ def joint_tour_destination_model(return_data=False): def non_mandatory_tour_destination_model(return_data=False): # goes with joint_tour_destination return location_choice_model( - name="non_mandatory_tour_destination", - return_data=return_data, + name="non_mandatory_tour_destination", return_data=return_data, ) def trip_destination_model(return_data=False): - return location_choice_model( - name="trip_destination", - return_data=return_data, - ) + return location_choice_model(name="trip_destination", return_data=return_data,) diff --git a/activitysim/estimation/larch/mode_choice.py b/activitysim/estimation/larch/mode_choice.py index 80761308c1..d2b319fc19 100644 --- a/activitysim/estimation/larch/mode_choice.py +++ b/activitysim/estimation/larch/mode_choice.py @@ -1,36 +1,35 @@ import os +from pathlib import Path +from typing import Collection + import numpy as np import pandas as pd import yaml -from typing import Collection +from larch import DataFrames, Model, P, X from larch.util import Dict -from pathlib import Path from .general import ( - remove_apostrophes, - construct_nesting_tree, - linear_utility_from_spec, - explicit_value_parameters, apply_coefficients, clean_values, + construct_nesting_tree, + explicit_value_parameters, + linear_utility_from_spec, + remove_apostrophes, ) -from .simple_simulate import simple_simulate_data, construct_availability -from larch import Model, DataFrames, P, X +from .simple_simulate import construct_availability, simple_simulate_data def mode_choice_model( - name, - edb_directory="output/estimation_data_bundle/{name}/", - return_data=False, - override_filenames=None, + name, + edb_directory="output/estimation_data_bundle/{name}/", + return_data=False, + override_filenames=None, ): if override_filenames is None: override_filenames = {} edb_directory = edb_directory.format(name=name) data = simple_simulate_data( - name=name, - edb_directory=edb_directory, - **override_filenames, + name=name, edb_directory=edb_directory, **override_filenames, ) coefficients = data.coefficients coef_template = data.coef_template @@ -48,9 +47,9 @@ def mode_choice_model( purposes = list(coef_template.columns) if "atwork" in name: - purposes = ['atwork'] - elif 'atwork' in purposes: - purposes.remove('atwork') + purposes = ["atwork"] + elif "atwork" in purposes: + purposes.remove("atwork") # Setup purpose specific models m = {purpose: Model(graph=tree, title=purpose) for purpose in purposes} @@ -71,13 +70,15 @@ def mode_choice_model( explicit_value_parameters(model) apply_coefficients(coefficients, m) - avail = construct_availability(m[purposes[0]], chooser_data, data.alt_codes_to_names) + avail = construct_availability( + m[purposes[0]], chooser_data, data.alt_codes_to_names + ) d = DataFrames( co=chooser_data, av=avail, alt_codes=data.alt_codes, alt_names=data.alt_names, ) - if 'atwork' not in name: + if "atwork" not in name: for purpose, model in m.items(): model.dataservice = d.selector_co(f"tour_type=='{purpose}'") model.choice_co_code = "override_choice_code" @@ -113,9 +114,7 @@ def tour_mode_choice_model( return_data=False, ): return mode_choice_model( - name=name, - edb_directory=edb_directory, - return_data=return_data, + name=name, edb_directory=edb_directory, return_data=return_data, ) @@ -125,9 +124,7 @@ def trip_mode_choice_model( return_data=False, ): return mode_choice_model( - name=name, - edb_directory=edb_directory, - return_data=return_data, + name=name, edb_directory=edb_directory, return_data=return_data, ) @@ -140,7 +137,5 @@ def atwork_subtour_mode_choice_model( name=name, edb_directory=edb_directory, return_data=return_data, - override_filenames=dict( - coefficients_file="tour_mode_choice_coefficients.csv", - ) + override_filenames=dict(coefficients_file="tour_mode_choice_coefficients.csv",), ) diff --git a/activitysim/estimation/larch/nonmand_tour_freq.py b/activitysim/estimation/larch/nonmand_tour_freq.py index f042be9463..ecbb1408c6 100644 --- a/activitysim/estimation/larch/nonmand_tour_freq.py +++ b/activitysim/estimation/larch/nonmand_tour_freq.py @@ -1,21 +1,22 @@ +import itertools +import logging +import os +import re +from pathlib import Path +from typing import Mapping + import numpy as np import pandas as pd -import re -import os import yaml -import itertools -from typing import Mapping -from larch import P, X, DataFrames, Model +from larch import DataFrames, Model, P, X +from larch.log import logger_name from larch.util import Dict -from pathlib import Path -import logging -from larch.log import logger_name from .general import ( - remove_apostrophes, - linear_utility_from_spec, apply_coefficients, cv_to_ca, + linear_utility_from_spec, + remove_apostrophes, ) _logger = logging.getLogger(logger_name) @@ -125,7 +126,9 @@ def nonmand_tour_freq_model( settings = data.settings segment_names = [s["NAME"] for s in settings["SPEC_SEGMENTS"]] - data.relabel_coef = link_same_value_coefficients(segment_names, data.coefficients, data.spec) + data.relabel_coef = link_same_value_coefficients( + segment_names, data.coefficients, data.spec + ) spec = data.spec coefficients = data.coefficients chooser_data = data.chooser_data diff --git a/activitysim/estimation/larch/scheduling.py b/activitysim/estimation/larch/scheduling.py index 649df79115..d9e595a7e4 100644 --- a/activitysim/estimation/larch/scheduling.py +++ b/activitysim/estimation/larch/scheduling.py @@ -1,21 +1,22 @@ import os +from pathlib import Path +from typing import Collection + import numpy as np import pandas as pd import yaml -from typing import Collection +from larch import DataFrames, Model, P, X from larch.util import Dict -from pathlib import Path from .general import ( - remove_apostrophes, - construct_nesting_tree, - linear_utility_from_spec, - explicit_value_parameters, apply_coefficients, + construct_nesting_tree, cv_to_ca, + explicit_value_parameters, + linear_utility_from_spec, + remove_apostrophes, str_repr, ) -from larch import Model, DataFrames, P, X def schedule_choice_model( @@ -52,10 +53,10 @@ def _read_csv(filename, optional=False, **kwargs): coefficients = _read_csv(coefficients_file, index_col="coefficient_name",) except FileNotFoundError: # possibly mis-named file is shown in settings - coefficients_file = settings.get('COEFFICIENTS', coefficients_file) + coefficients_file = settings.get("COEFFICIENTS", coefficients_file) coefficients = _read_csv(coefficients_file, index_col="coefficient_name",) - spec = _read_csv(spec_file, comment='#') + spec = _read_csv(spec_file, comment="#") alt_values = _read_csv(alt_values_file) chooser_data = _read_csv(chooser_file) @@ -66,7 +67,7 @@ def _read_csv(filename, optional=False, **kwargs): include_settings = settings.get("include_settings") if include_settings: with open(os.path.join(edb_directory, include_settings), "r") as yf: - more_settings = yaml.load(yf, Loader=yaml.SafeLoader, ) + more_settings = yaml.load(yf, Loader=yaml.SafeLoader,) settings.update(more_settings) CHOOSER_SEGMENT_COLUMN_NAME = settings.get("CHOOSER_SEGMENT_COLUMN_NAME") @@ -76,26 +77,27 @@ def _read_csv(filename, optional=False, **kwargs): if SEGMENTS is not None: SEGMENT_IDS = {i: i for i in SEGMENTS} - if 'Label' in spec.columns: - label_column_name = 'Label' - elif 'Expression' in spec.columns: - label_column_name = 'Expression' + if "Label" in spec.columns: + label_column_name = "Label" + elif "Expression" in spec.columns: + label_column_name = "Expression" else: raise ValueError("cannot find Label or Expression in spec file") m = Model() if len(spec.columns) == 4 and ( - [c.lower() for c in spec.columns] == [ - 'label', 'description', 'expression', 'coefficient' - ] + [c.lower() for c in spec.columns] + == ["label", "description", "expression", "coefficient"] ): m.utility_ca = linear_utility_from_spec( spec, x_col="Label", p_col=spec.columns[-1], ignore_x=("local_dist",), ) - elif len(spec.columns) == 4 \ - and all(spec.columns[:3] == ['Label', 'Description', 'Expression']) \ - and len(SEGMENT_IDS) == 1 \ - and spec.columns[3] == list(SEGMENT_IDS.values())[0]: + elif ( + len(spec.columns) == 4 + and all(spec.columns[:3] == ["Label", "Description", "Expression"]) + and len(SEGMENT_IDS) == 1 + and spec.columns[3] == list(SEGMENT_IDS.values())[0] + ): m.utility_ca = linear_utility_from_spec( spec, x_col="Label", p_col=spec.columns[-1], ignore_x=("local_dist",), ) @@ -127,10 +129,10 @@ def _read_csv(filename, optional=False, **kwargs): # else: # x_co["_segment_label"] = size_spec.index[0] - alt_codes = np.arange(len(x_ca.index.levels[1]))+1 + alt_codes = np.arange(len(x_ca.index.levels[1])) + 1 x_ca.index = x_ca.index.set_levels(alt_codes, 1) - x_co["override_choice_plus1"] = x_co["override_choice"]+1 - x_co["model_choice_plus1"] = x_co["model_choice"]+1 + x_co["override_choice_plus1"] = x_co["override_choice"] + 1 + x_co["model_choice_plus1"] = x_co["model_choice"] + 1 unavail_coefs = coefficients.query("(constrain == 'T') & (value < -900)").index unavail_data = [i.data for i in m.utility_ca if i.param in unavail_coefs] @@ -214,20 +216,15 @@ def mandatory_tour_scheduling_school_model(return_data=False): def non_mandatory_tour_scheduling_model(return_data=False): return schedule_choice_model( - name="non_mandatory_tour_scheduling", - return_data=return_data, + name="non_mandatory_tour_scheduling", return_data=return_data, ) def joint_tour_scheduling_model(return_data=False): - return schedule_choice_model( - name="joint_tour_scheduling", - return_data=return_data, - ) + return schedule_choice_model(name="joint_tour_scheduling", return_data=return_data,) def atwork_subtour_scheduling_model(return_data=False): return schedule_choice_model( - name="atwork_subtour_scheduling", - return_data=return_data, + name="atwork_subtour_scheduling", return_data=return_data, ) diff --git a/activitysim/estimation/larch/simple_simulate.py b/activitysim/estimation/larch/simple_simulate.py index 7fc85d7dee..31f1015fd9 100644 --- a/activitysim/estimation/larch/simple_simulate.py +++ b/activitysim/estimation/larch/simple_simulate.py @@ -1,16 +1,17 @@ import os from pathlib import Path + import numpy as np import pandas as pd import yaml +from larch import DataFrames, Model from larch.util import Dict -from larch import Model, DataFrames from .general import ( - remove_apostrophes, - dict_of_linear_utility_from_spec, apply_coefficients, construct_nesting_tree, + dict_of_linear_utility_from_spec, + remove_apostrophes, ) @@ -72,7 +73,9 @@ def _read_csv(filename, **kwargs): coefficients = _read_csv(coefficients_file, index_col="coefficient_name",) try: - coef_template = _read_csv(coefficients_template, index_col="coefficient_name",) + coef_template = _read_csv( + coefficients_template, index_col="coefficient_name", + ) except FileNotFoundError: coef_template = None @@ -94,6 +97,7 @@ def _read_csv(filename, **kwargs): except Exception: # when an error happens in reading anything other than settings, print settings from pprint import pprint + pprint(settings) raise @@ -112,12 +116,12 @@ def _read_csv(filename, **kwargs): def simple_simulate_model( - name, - edb_directory="output/estimation_data_bundle/{name}/", - return_data=False, - choices=None, - construct_avail=False, - values_index_col="household_id", + name, + edb_directory="output/estimation_data_bundle/{name}/", + return_data=False, + choices=None, + construct_avail=False, + values_index_col="household_id", ): data = simple_simulate_data( name=name, edb_directory=edb_directory, values_index_col=values_index_col, @@ -132,13 +136,14 @@ def simple_simulate_model( alt_codes = data.alt_codes from .general import clean_values + chooser_data = clean_values( chooser_data, alt_names_to_codes=choices or data.alt_names_to_codes, choice_code="override_choice_code", ) - if settings.get('LOGIT_TYPE') == 'NL': + if settings.get("LOGIT_TYPE") == "NL": tree = construct_nesting_tree(data.alt_names, settings["NESTS"]) m = Model(graph=tree) else: @@ -155,7 +160,7 @@ def simple_simulate_model( else: avail = True - d = DataFrames(co=chooser_data, av=avail, alt_codes=alt_codes, alt_names=alt_names, ) + d = DataFrames(co=chooser_data, av=avail, alt_codes=alt_codes, alt_names=alt_names,) m.dataservice = d m.choice_co_code = "override_choice_code" @@ -186,8 +191,8 @@ def auto_ownership_model( name=name, edb_directory=edb_directory, return_data=return_data, - choices={i: i+1 for i in range(5)}, # choices are coded in data as integers, - # not 'cars0' etc as appears in the spec + choices={i: i + 1 for i in range(5)}, # choices are coded in data as integers, + # not 'cars0' etc as appears in the spec ) @@ -200,7 +205,10 @@ def free_parking_model( name=name, edb_directory=edb_directory, return_data=return_data, - choices={True: 1, False: 2}, # True is free parking, False is paid parking, names match spec positions + choices={ + True: 1, + False: 2, + }, # True is free parking, False is paid parking, names match spec positions ) @@ -210,9 +218,7 @@ def mandatory_tour_frequency_model( return_data=False, ): return simple_simulate_model( - name=name, - edb_directory=edb_directory, - return_data=return_data, + name=name, edb_directory=edb_directory, return_data=return_data, ) @@ -222,9 +228,7 @@ def joint_tour_frequency_model( return_data=False, ): return simple_simulate_model( - name=name, - edb_directory=edb_directory, - return_data=return_data, + name=name, edb_directory=edb_directory, return_data=return_data, ) @@ -247,9 +251,7 @@ def joint_tour_composition_model( return_data=False, ): return simple_simulate_model( - name=name, - edb_directory=edb_directory, - return_data=return_data, + name=name, edb_directory=edb_directory, return_data=return_data, ) diff --git a/activitysim/estimation/larch/stop_frequency.py b/activitysim/estimation/larch/stop_frequency.py index c13cd0d5ec..8fdd02b967 100644 --- a/activitysim/estimation/larch/stop_frequency.py +++ b/activitysim/estimation/larch/stop_frequency.py @@ -1,33 +1,34 @@ import os from pathlib import Path + import numpy as np import pandas as pd import yaml +from larch import DataFrames, Model from larch.util import Dict -from larch import Model, DataFrames from .general import ( - remove_apostrophes, - dict_of_linear_utility_from_spec, apply_coefficients, construct_nesting_tree, + dict_of_linear_utility_from_spec, + remove_apostrophes, ) def stop_frequency_data( - edb_directory="output/estimation_data_bundle/{name}/", - settings_file="{name}_model_settings.yaml", - chooser_data_file="{name}_values_combined.csv", - values_index_col="tour_id", + edb_directory="output/estimation_data_bundle/{name}/", + settings_file="{name}_model_settings.yaml", + chooser_data_file="{name}_values_combined.csv", + values_index_col="tour_id", ): - name = 'stop_frequency' + name = "stop_frequency" edb_directory = edb_directory.format(name=name) settings_file = settings_file.format(name=name) with open(os.path.join(edb_directory, settings_file), "r") as yf: settings = yaml.load(yf, Loader=yaml.SafeLoader,) - segments = [i['primary_purpose'] for i in settings['SPEC_SEGMENTS']] + segments = [i["primary_purpose"] for i in settings["SPEC_SEGMENTS"]] master_coef = {} prior_segs = [] @@ -35,11 +36,10 @@ def stop_frequency_data( segment_coef = {} for seg_ in settings["SPEC_SEGMENTS"]: - seg_purpose = seg_['primary_purpose'] + seg_purpose = seg_["primary_purpose"] seg_subdir = Path(os.path.join(edb_directory, seg_purpose)) - segment_coef[seg_['primary_purpose']] = pd.read_csv( - seg_subdir/seg_['COEFFICIENTS'], - index_col="coefficient_name", + segment_coef[seg_["primary_purpose"]] = pd.read_csv( + seg_subdir / seg_["COEFFICIENTS"], index_col="coefficient_name", ) for seg in segments: @@ -63,14 +63,14 @@ def stop_frequency_data( # rewrite revised spec files with common segment_coef names for seg in segments: seg_subdir = Path(os.path.join(edb_directory, seg)) - with open(seg_subdir/f"stop_frequency_SPEC.csv", 'rt') as f: + with open(seg_subdir / f"stop_frequency_SPEC.csv", "rt") as f: spec = f.read() for kcoef, v in coef_map[seg].items(): spec = spec.replace(kcoef, v) - with open(seg_subdir/f"stop_frequency_SPEC_.csv", 'wt') as f: + with open(seg_subdir / f"stop_frequency_SPEC_.csv", "wt") as f: f.write(spec) - master_coef_df = pd.DataFrame(data=master_coef, index=['value']).T + master_coef_df = pd.DataFrame(data=master_coef, index=["value"]).T master_coef_df.index.name = "coefficient_name" seg_coefficients = [] @@ -82,12 +82,16 @@ def stop_frequency_data( seg_chooser_data = [] for seg in settings["SPEC_SEGMENTS"]: - seg_purpose = seg['primary_purpose'] + seg_purpose = seg["primary_purpose"] seg_subdir = Path(os.path.join(edb_directory, seg_purpose)) - coeffs_ = pd.read_csv(seg_subdir/seg['COEFFICIENTS'], index_col="coefficient_name") - coeffs_.index = pd.Index([f"{i}_{seg_purpose}" for i in coeffs_.index], name="coefficient_name") + coeffs_ = pd.read_csv( + seg_subdir / seg["COEFFICIENTS"], index_col="coefficient_name" + ) + coeffs_.index = pd.Index( + [f"{i}_{seg_purpose}" for i in coeffs_.index], name="coefficient_name" + ) seg_coefficients.append(coeffs_) - spec = pd.read_csv(seg_subdir/"stop_frequency_SPEC_.csv") + spec = pd.read_csv(seg_subdir / "stop_frequency_SPEC_.csv") spec = remove_apostrophes(spec, ["Label"]) # spec.iloc[:, 3:] = spec.iloc[:, 3:].applymap(lambda x: f"{x}_{seg_purpose}" if not pd.isna(x) else x) seg_spec.append(spec) @@ -103,7 +107,7 @@ def stop_frequency_data( seg_alt_codes_to_names.append(alt_codes_to_names) chooser_data = pd.read_csv( - seg_subdir/chooser_data_file.format(name=name), + seg_subdir / chooser_data_file.format(name=name), index_col=values_index_col, ) seg_chooser_data.append(chooser_data) @@ -125,12 +129,9 @@ def stop_frequency_data( def stop_frequency_model( - edb_directory="output/estimation_data_bundle/{name}/", - return_data=False, + edb_directory="output/estimation_data_bundle/{name}/", return_data=False, ): - data = stop_frequency_data( - edb_directory=edb_directory, values_index_col="tour_id", - ) + data = stop_frequency_data(edb_directory=edb_directory, values_index_col="tour_id",) models = [] @@ -146,13 +147,14 @@ def stop_frequency_model( alt_codes = data.alt_codes[n] from .general import clean_values + chooser_data = clean_values( chooser_data, alt_names_to_codes=data.alt_names_to_codes[n], choice_code="override_choice_code", ) - if settings.get('LOGIT_TYPE') == 'NL': + if settings.get("LOGIT_TYPE") == "NL": tree = construct_nesting_tree(data.alt_names[n], settings["NESTS"]) m = Model(graph=tree) else: @@ -166,13 +168,16 @@ def stop_frequency_model( avail = True - d = DataFrames(co=chooser_data, av=avail, alt_codes=alt_codes, alt_names=alt_names, ) + d = DataFrames( + co=chooser_data, av=avail, alt_codes=alt_codes, alt_names=alt_names, + ) m.dataservice = d m.choice_co_code = "override_choice_code" models.append(m) from larch.model.model_group import ModelGroup + models = ModelGroup(models) if return_data: @@ -184,7 +189,7 @@ def stop_frequency_model( return models -def update_segment_coefficients(model, data, result_dir=Path('.'), output_file=None): +def update_segment_coefficients(model, data, result_dir=Path("."), output_file=None): for m, segment_name in zip(model, data.segments): coefficient_map = data.coefficient_map[segment_name] segment_c = [] @@ -194,10 +199,11 @@ def update_segment_coefficients(model, data, result_dir=Path('.'), output_file=N segment_c.append(c_local) master_c.append(c_master) coefficients = data.segment_coefficients[segment_name].copy() - coefficients.loc[segment_c, "value"] = model.pf.loc[master_c, "value"].to_numpy() + coefficients.loc[segment_c, "value"] = model.pf.loc[ + master_c, "value" + ].to_numpy() if output_file is not None: os.makedirs(result_dir, exist_ok=True) coefficients.reset_index().to_csv( - result_dir/output_file.format(segment_name=segment_name), - index=False, + result_dir / output_file.format(segment_name=segment_name), index=False, ) diff --git a/activitysim/estimation/test/test_larch_estimation.py b/activitysim/estimation/test/test_larch_estimation.py index af124a9fdc..bad64770dd 100644 --- a/activitysim/estimation/test/test_larch_estimation.py +++ b/activitysim/estimation/test/test_larch_estimation.py @@ -1,7 +1,8 @@ import os +import subprocess + import pandas as pd import pytest -import subprocess from activitysim.cli.create import get_example @@ -50,7 +51,9 @@ def est_data(): def _regression_check(dataframe_regression, df, basename=None): dataframe_regression.check( - df.select_dtypes("number").drop(columns=["holdfast"], errors='ignore').clip(-9e9, 9e9), + df.select_dtypes("number") + .drop(columns=["holdfast"], errors="ignore") + .clip(-9e9, 9e9), # pandas 1.3 handles int8 dtypes as actual numbers, so holdfast needs to be dropped manually # we're dropping it not adding to the regression check so older pandas will also work. basename=basename, @@ -60,23 +63,26 @@ def _regression_check(dataframe_regression, df, basename=None): ) -@pytest.mark.parametrize("name,method", [ - ("free_parking", "BHHH"), - ("mandatory_tour_frequency", "SLSQP"), - ("joint_tour_frequency", "SLSQP"), - ("joint_tour_composition", "SLSQP"), - ("joint_tour_participation", "SLSQP"), - ("mandatory_tour_frequency", "BHHH"), - ("atwork_subtour_frequency", "SLSQP"), - ("auto_ownership", "BHHH"), - ("trip_mode_choice", "SLSQP"), -]) +@pytest.mark.parametrize( + "name,method", + [ + ("free_parking", "BHHH"), + ("mandatory_tour_frequency", "SLSQP"), + ("joint_tour_frequency", "SLSQP"), + ("joint_tour_composition", "SLSQP"), + ("joint_tour_participation", "SLSQP"), + ("mandatory_tour_frequency", "BHHH"), + ("atwork_subtour_frequency", "SLSQP"), + ("auto_ownership", "BHHH"), + ("trip_mode_choice", "SLSQP"), + ], +) def test_simple_simulate(est_data, num_regression, dataframe_regression, name, method): from activitysim.estimation.larch import component_model m = component_model(name) m.load_data() - m.doctor(repair_ch_av='-') + m.doctor(repair_ch_av="-") loglike_prior = m.loglike() r = m.maximize_loglike(method=method, options={"maxiter": 1000}) num_regression.check( @@ -87,13 +93,16 @@ def test_simple_simulate(est_data, num_regression, dataframe_regression, name, m _regression_check(dataframe_regression, m.pf) -@pytest.mark.parametrize("name,method", [ - ("workplace_location", "SLSQP"), - ("school_location", "SLSQP"), - ("non_mandatory_tour_destination", "SLSQP"), - ("atwork_subtour_destination", "BHHH"), - ("trip_destination", "SLSQP"), -]) +@pytest.mark.parametrize( + "name,method", + [ + ("workplace_location", "SLSQP"), + ("school_location", "SLSQP"), + ("non_mandatory_tour_destination", "SLSQP"), + ("atwork_subtour_destination", "BHHH"), + ("trip_destination", "SLSQP"), + ], +) def test_location_model(est_data, num_regression, dataframe_regression, name, method): from activitysim.estimation.larch import component_model, update_size_spec @@ -106,9 +115,7 @@ def test_location_model(est_data, num_regression, dataframe_regression, name, me basename=f"test_loc_{name}_loglike", ) _regression_check(dataframe_regression, m.pf) - size_spec = update_size_spec( - m, data, result_dir=None, output_file=None, - ) + size_spec = update_size_spec(m, data, result_dir=None, output_file=None,) dataframe_regression.check( size_spec, basename=f"test_loc_{name}_size_spec", @@ -118,19 +125,22 @@ def test_location_model(est_data, num_regression, dataframe_regression, name, me ) -@pytest.mark.parametrize("name,method", [ - ("non_mandatory_tour_scheduling", "SLSQP"), - ("joint_tour_scheduling", "SLSQP"), - ("atwork_subtour_scheduling", "SLSQP"), - ("mandatory_tour_scheduling_work", "SLSQP"), - ("mandatory_tour_scheduling_school", "SLSQP"), -]) +@pytest.mark.parametrize( + "name,method", + [ + ("non_mandatory_tour_scheduling", "SLSQP"), + ("joint_tour_scheduling", "SLSQP"), + ("atwork_subtour_scheduling", "SLSQP"), + ("mandatory_tour_scheduling_work", "SLSQP"), + ("mandatory_tour_scheduling_school", "SLSQP"), + ], +) def test_scheduling_model(est_data, num_regression, dataframe_regression, name, method): from activitysim.estimation.larch import component_model, update_size_spec m, data = component_model(name, return_data=True) m.load_data() - m.doctor(repair_ch_av='-') + m.doctor(repair_ch_av="-") loglike_prior = m.loglike() r = m.maximize_loglike(method=method) num_regression.check( @@ -167,9 +177,7 @@ def test_workplace_location(est_data, num_regression, dataframe_regression): basename="test_workplace_location_loglike", ) _regression_check(dataframe_regression, m.pf) - size_spec = update_size_spec( - m, data, result_dir=None, output_file=None, - ) + size_spec = update_size_spec(m, data, result_dir=None, output_file=None,) dataframe_regression.check( size_spec, basename="test_workplace_location_size_spec", @@ -189,9 +197,7 @@ def test_school_location(est_data, num_regression, dataframe_regression): basename="test_school_location_loglike", ) _regression_check(dataframe_regression, m.pf) - size_spec = update_size_spec( - m, data, result_dir=None, output_file=None, - ) + size_spec = update_size_spec(m, data, result_dir=None, output_file=None,) dataframe_regression.check( size_spec, basename="test_school_location_size_spec", @@ -213,7 +219,9 @@ def test_cdap_model(est_data, num_regression, dataframe_regression): _regression_check(dataframe_regression, m.pf) -def test_nonmand_and_joint_tour_dest_choice(est_data, num_regression, dataframe_regression): +def test_nonmand_and_joint_tour_dest_choice( + est_data, num_regression, dataframe_regression +): from activitysim.estimation.larch import component_model modelname = ("non_mandatory_tour_destination", "joint_tour_destination") @@ -230,8 +238,10 @@ def test_nonmand_and_joint_tour_dest_choice(est_data, num_regression, dataframe_ def test_tour_and_subtour_mode_choice(est_data, num_regression, dataframe_regression): - from activitysim.estimation.larch.mode_choice import tour_mode_choice_model, \ - atwork_subtour_mode_choice_model + from activitysim.estimation.larch.mode_choice import ( + atwork_subtour_mode_choice_model, + tour_mode_choice_model, + ) m = tour_mode_choice_model() s = atwork_subtour_mode_choice_model() diff --git a/activitysim/examples/create_run_all_examples.py b/activitysim/examples/create_run_all_examples.py index 40cec55ca9..aecc2e744b 100644 --- a/activitysim/examples/create_run_all_examples.py +++ b/activitysim/examples/create_run_all_examples.py @@ -6,7 +6,7 @@ runnable_line_signature = " # " # yes, hacky for now examples_file_name = "example_manifest.yaml" -example_file = open(examples_file_name, 'r') +example_file = open(examples_file_name, "r") lines = example_file.readlines() for line in lines: if runnable_line_signature in line: diff --git a/activitysim/examples/example_arc/scripts/arc_crop.py b/activitysim/examples/example_arc/scripts/arc_crop.py index 25bda2edcc..32cfc092da 100644 --- a/activitysim/examples/example_arc/scripts/arc_crop.py +++ b/activitysim/examples/example_arc/scripts/arc_crop.py @@ -1,26 +1,34 @@ +import argparse import os -import pandas as pd -import openmatrix as omx -import numpy as np -import argparse +import numpy as np +import openmatrix as omx +import pandas as pd MAZ_OFFSET = 0 segments = { - 'test': (100, 135), # arbitrary but has univ - 'fulton': (0, 1296), - 'full': (0, 5922), + "test": (100, 135), # arbitrary but has univ + "fulton": (0, 1296), + "full": (0, 5922), } -parser = argparse.ArgumentParser(description='crop PSRC raw_data') -parser.add_argument('segment_name', metavar='segment_name', type=str, nargs=1, - help=f"geography segmentation (e.g. full)") - -parser.add_argument('-c', '--check_geography', - default=False, - action='store_true', - help='check consistency of MAZ, TAZ zone_ids and foreign keys & write orphan_households file') +parser = argparse.ArgumentParser(description="crop PSRC raw_data") +parser.add_argument( + "segment_name", + metavar="segment_name", + type=str, + nargs=1, + help=f"geography segmentation (e.g. full)", +) + +parser.add_argument( + "-c", + "--check_geography", + default=False, + action="store_true", + help="check consistency of MAZ, TAZ zone_ids and foreign keys & write orphan_households file", +) args = parser.parse_args() @@ -31,8 +39,8 @@ assert segment_name in segments.keys(), f"Unknown seg: {segment_name}" zone_min, zone_max = segments[segment_name] -input_dir = './data_raw' -output_dir = f'./data_{segment_name}' +input_dir = "./data_raw" +output_dir = f"./data_{segment_name}" print(f"check_geography {check_geography}") @@ -51,7 +59,17 @@ def output_path(file_name): def integerize_id_columns(df, table_name): - columns = ['MAZ', 'OMAZ', 'DMAZ', 'TAZ', 'zone_id', 'household_id', 'HHID', 'maz', 'taz'] + columns = [ + "MAZ", + "OMAZ", + "DMAZ", + "TAZ", + "zone_id", + "household_id", + "HHID", + "maz", + "taz", + ] for c in df.columns: if c in columns: print(f"converting {table_name}.{c} to int") @@ -92,9 +110,11 @@ def to_csv(df, file_name): # land_use # land_use = read_csv("land_use.csv") -land_use = land_use[(land_use["zone_id"] >= zone_min) & (land_use["zone_id"] <= zone_max)] -integerize_id_columns(land_use, 'land_use') -land_use = land_use.sort_values('zone_id') +land_use = land_use[ + (land_use["zone_id"] >= zone_min) & (land_use["zone_id"] <= zone_max) +] +integerize_id_columns(land_use, "land_use") +land_use = land_use.sort_values("zone_id") # move index col to front land_use.insert(0, "zone_id", land_use.pop("zone_id")) @@ -106,7 +126,7 @@ def to_csv(df, file_name): # households = read_csv("households.csv") households = households[households["maz"].isin(land_use.zone_id)] -integerize_id_columns(households, 'households') +integerize_id_columns(households, "households") to_csv(households, "households.csv") @@ -115,14 +135,14 @@ def to_csv(df, file_name): # persons = read_csv("persons.csv") persons = persons[persons["household_id"].isin(households.household_id)] -integerize_id_columns(persons, 'persons') +integerize_id_columns(persons, "persons") to_csv(persons, "persons.csv") # # skims # -omx_infile_name = 'skims.omx' +omx_infile_name = "skims.omx" skim_data_type = np.float32 omx_in = omx.open_file(input_path(omx_infile_name)) @@ -130,21 +150,23 @@ def to_csv(df, file_name): assert not omx_in.listMappings() -zone = land_use.sort_values('zone_id')[['zone_id']] +zone = land_use.sort_values("zone_id")[["zone_id"]] zone.index = zone.zone_id - 1 zone_indexes = zone.index.tolist() # index of TAZ in skim (zero-based, no mapping) zone_labels = zone.zone_id.tolist() # TAZ zone_ids in omx index order # create -num_outfiles = 4 if segment_name == 'full' else 1 +num_outfiles = 4 if segment_name == "full" else 1 if num_outfiles == 1: - omx_out = [omx.open_file(output_path(f"skims.omx"), 'w')] + omx_out = [omx.open_file(output_path(f"skims.omx"), "w")] else: - omx_out = [omx.open_file(output_path(f"skims{i+1}.omx"), 'w') for i in range(num_outfiles)] + omx_out = [ + omx.open_file(output_path(f"skims{i+1}.omx"), "w") for i in range(num_outfiles) + ] for omx_file in omx_out: - omx_file.create_mapping('ZONE', zone_labels) + omx_file.create_mapping("ZONE", zone_labels) iskim = 0 for mat_name in omx_in.list_matrices(): diff --git a/activitysim/examples/example_arc/simulation.py b/activitysim/examples/example_arc/simulation.py index 93b430001a..e328406328 100644 --- a/activitysim/examples/example_arc/simulation.py +++ b/activitysim/examples/example_arc/simulation.py @@ -4,12 +4,12 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse +import sys from activitysim.cli.run import add_run_args, run -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_arc/test/simulation.py b/activitysim/examples/example_arc/test/simulation.py index ec6a1181b1..0fc64d5390 100755 --- a/activitysim/examples/example_arc/test/simulation.py +++ b/activitysim/examples/example_arc/test/simulation.py @@ -1,12 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse +import sys from activitysim.cli.run import add_run_args, run -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_arc/test/test_arc.py b/activitysim/examples/example_arc/test/test_arc.py index e747f632a1..1cd86e58bf 100644 --- a/activitysim/examples/example_arc/test/test_arc.py +++ b/activitysim/examples/example_arc/test/test_arc.py @@ -2,10 +2,10 @@ # See full license in LICENSE.txt. import os import subprocess -import pkg_resources import pandas as pd import pandas.testing as pdt +import pkg_resources from activitysim.core import inject @@ -16,33 +16,45 @@ def teardown_function(func): def test_arc(): - def example_path(dirname): - resource = os.path.join('examples', 'example_arc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_arc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) def regress(): - regress_trips_df = pd.read_csv(test_path('regress/final_trips.csv')) - final_trips_df = pd.read_csv(test_path('output/final_trips.csv')) + regress_trips_df = pd.read_csv(test_path("regress/final_trips.csv")) + final_trips_df = pd.read_csv(test_path("output/final_trips.csv")) # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] pdt.assert_frame_equal(final_trips_df, regress_trips_df) - file_path = os.path.join(os.path.dirname(__file__), 'simulation.py') - - subprocess.run(['coverage', 'run', '-a', file_path, - '-c', test_path('configs'), '-c', example_path('configs'), - '-d', example_path('data'), - '-o', test_path('output')], check=True) + file_path = os.path.join(os.path.dirname(__file__), "simulation.py") + + subprocess.run( + [ + "coverage", + "run", + "-a", + file_path, + "-c", + test_path("configs"), + "-c", + example_path("configs"), + "-d", + example_path("data"), + "-o", + test_path("output"), + ], + check=True, + ) regress() -if __name__ == '__main__': +if __name__ == "__main__": test_arc() diff --git a/activitysim/examples/example_estimation/build_example_data/build_stop_coeffs.py b/activitysim/examples/example_estimation/build_example_data/build_stop_coeffs.py index 9594958648..69a3f38fbc 100644 --- a/activitysim/examples/example_estimation/build_example_data/build_stop_coeffs.py +++ b/activitysim/examples/example_estimation/build_example_data/build_stop_coeffs.py @@ -1,58 +1,72 @@ - # python ~/work/activitysim/activitysim/examples/example_estimation/build_example_data/build_stop_coeffs.py -import pandas as pd import numpy as np - +import pandas as pd FIRST_RUN = True # work, school, univ, social, shopping, eatout, escort,atwork,othmaint,othdiscr -for what in ['work', 'school', 'univ', 'social', 'shopping', 'eatout', 'escort', 'atwork', 'othmaint', 'othdiscr']: +for what in [ + "work", + "school", + "univ", + "social", + "shopping", + "eatout", + "escort", + "atwork", + "othmaint", + "othdiscr", +]: if FIRST_RUN: - df = pd.read_csv(f'stop_frequency_{what}.csv', comment='#') - df.to_csv(f'stop_frequency_backup_{what}.csv', index=False) + df = pd.read_csv(f"stop_frequency_{what}.csv", comment="#") + df.to_csv(f"stop_frequency_backup_{what}.csv", index=False) else: - df = pd.read_csv(f'stop_frequency_backup_{what}.csv', comment='#') + df = pd.read_csv(f"stop_frequency_backup_{what}.csv", comment="#") - del df['Expression'] + del df["Expression"] - df = df.set_index('Description').unstack() + df = df.set_index("Description").unstack() # drop empty coefficients df = df[~df.isnull()] # want index as columns - df = df.reset_index().rename(columns={'level_0': 'alt', 0: 'value'}) + df = df.reset_index().rename(columns={"level_0": "alt", 0: "value"}) # drop duplicate coefficients on same spec row - df = df[~df[['Description', 'value']].duplicated(keep='first')] + df = df[~df[["Description", "value"]].duplicated(keep="first")] - dupes = df[['Description']].duplicated(keep=False) - df['coefficient_name'] = \ - np.where(dupes, 'coef_' + df.Description + '_' + df['alt'], 'coef_' + df.Description) - df['coefficient_name'] = df['coefficient_name'].str.lower() - df['coefficient_name'] = df['coefficient_name'].str.replace('[^a-zZ-Z0-9]+', '_', regex=True) - del df['alt'] + dupes = df[["Description"]].duplicated(keep=False) + df["coefficient_name"] = np.where( + dupes, "coef_" + df.Description + "_" + df["alt"], "coef_" + df.Description + ) + df["coefficient_name"] = df["coefficient_name"].str.lower() + df["coefficient_name"] = df["coefficient_name"].str.replace( + "[^a-zZ-Z0-9]+", "_", regex=True + ) + del df["alt"] - df.to_csv(f'stop_frequency_coefficients_{what}.csv', index=False) + df.to_csv(f"stop_frequency_coefficients_{what}.csv", index=False) - spec = pd.read_csv(f'stop_frequency_backup_{what}.csv', comment='#') + spec = pd.read_csv(f"stop_frequency_backup_{what}.csv", comment="#") alt_cols = spec.columns[2:].values for index, row in df.iterrows(): - m = {row['value']: row['coefficient_name']} - alts = spec.loc[spec.Description == row['Description'], alt_cols].values[0] + m = {row["value"]: row["coefficient_name"]} + alts = spec.loc[spec.Description == row["Description"], alt_cols].values[0] alts = [m.get(a, a) for a in alts] - spec.loc[spec.Description == row['Description'], alt_cols] = [m.get(a, a) for a in alts] + spec.loc[spec.Description == row["Description"], alt_cols] = [ + m.get(a, a) for a in alts + ] - spec.insert(loc=0, column='Label', value='util_' + spec.Description) + spec.insert(loc=0, column="Label", value="util_" + spec.Description) - spec['Label'] = spec['Label'].str.lower() - spec['Label'] = spec['Label'].str.replace('[^a-zZ-Z0-9]+', '_', regex=True) + spec["Label"] = spec["Label"].str.lower() + spec["Label"] = spec["Label"].str.replace("[^a-zZ-Z0-9]+", "_", regex=True) - df.to_csv(f'stop_frequency_coefficients_{what}.csv', index=False) - spec.to_csv(f'stop_frequency_{what}.csv', index=False) + df.to_csv(f"stop_frequency_coefficients_{what}.csv", index=False) + spec.to_csv(f"stop_frequency_{what}.csv", index=False) diff --git a/activitysim/examples/example_estimation/build_example_data/mode_choice_wrangle.py b/activitysim/examples/example_estimation/build_example_data/mode_choice_wrangle.py index 58e33b6d93..8cfe399940 100644 --- a/activitysim/examples/example_estimation/build_example_data/mode_choice_wrangle.py +++ b/activitysim/examples/example_estimation/build_example_data/mode_choice_wrangle.py @@ -1,34 +1,38 @@ -import pandas as pd import numpy as np +import pandas as pd -df = pd.read_csv(f'trip_mode_coefficients_p.csv', comment='#') +df = pd.read_csv(f"trip_mode_coefficients_p.csv", comment="#") -alts = list(df.drop(columns='Expression').columns.values.astype(str)) -alts_str = '_'.join(alts) +alts = list(df.drop(columns="Expression").columns.values.astype(str)) +alts_str = "_".join(alts) -df = df.set_index('Expression').unstack() +df = df.set_index("Expression").unstack() -df = df.reset_index().rename(columns={'level_0': 'alts', 0: 'value'}) +df = df.reset_index().rename(columns={"level_0": "alts", 0: "value"}) -df = df.groupby(['Expression', 'value']).agg(lambda col: '_'.join(col)).reset_index() +df = df.groupby(["Expression", "value"]).agg(lambda col: "_".join(col)).reset_index() -df['coefficient_name'] = 'coef_' + np.where(df.alts == alts_str, df['Expression'], df['Expression'] + '_' + df.alts) +df["coefficient_name"] = "coef_" + np.where( + df.alts == alts_str, df["Expression"], df["Expression"] + "_" + df.alts +) coefficients_df = df -df = pd.read_csv(f'trip_mode_coefficients_p.csv', comment='#') +df = pd.read_csv(f"trip_mode_coefficients_p.csv", comment="#") for alt in alts: - alt_df = pd.merge(df[['Expression', alt]].rename(columns={alt: 'value'}), - coefficients_df[['Expression', 'value', 'coefficient_name']], - left_on=['Expression', 'value'], - right_on=['Expression', 'value'], - how='left') - df[alt] = alt_df['coefficient_name'] + alt_df = pd.merge( + df[["Expression", alt]].rename(columns={alt: "value"}), + coefficients_df[["Expression", "value", "coefficient_name"]], + left_on=["Expression", "value"], + right_on=["Expression", "value"], + how="left", + ) + df[alt] = alt_df["coefficient_name"] -coefficients_df = coefficients_df[['coefficient_name', 'value']] -coefficients_df.to_csv(f'trip_mode_choice_coefficients.csv', index=False) +coefficients_df = coefficients_df[["coefficient_name", "value"]] +coefficients_df.to_csv(f"trip_mode_choice_coefficients.csv", index=False) -df.to_csv(f'trip_mode_choice_coefficients_template.csv', index=False) +df.to_csv(f"trip_mode_choice_coefficients_template.csv", index=False) diff --git a/activitysim/examples/example_estimation/scripts/extract_survey_data.py b/activitysim/examples/example_estimation/scripts/extract_survey_data.py index e82505e052..c4bc4a770f 100644 --- a/activitysim/examples/example_estimation/scripts/extract_survey_data.py +++ b/activitysim/examples/example_estimation/scripts/extract_survey_data.py @@ -1,9 +1,9 @@ # ActivitySim # See full license in LICENSE.txt. -import sys -import os import logging +import os +import sys import numpy as np import pandas as pd @@ -13,24 +13,24 @@ # create console handler with a higher log level ch = logging.StreamHandler() -ch.setFormatter(logging.Formatter('%(levelname)s - %(message)s')) +ch.setFormatter(logging.Formatter("%(levelname)s - %(message)s")) logger.addHandler(ch) inputs = { - 'households': 'final_households.csv', - 'persons': 'final_persons.csv', - 'tours': 'final_tours.csv', - 'joint_tour_participants': 'final_joint_tour_participants.csv', - 'trips': 'final_trips.csv', + "households": "final_households.csv", + "persons": "final_persons.csv", + "tours": "final_tours.csv", + "joint_tour_participants": "final_joint_tour_participants.csv", + "trips": "final_trips.csv", } surveys = { - 'households': 'survey_households.csv', - 'persons': 'survey_persons.csv', - 'tours': 'survey_tours.csv', - 'joint_tour_participants': 'survey_joint_tour_participants.csv', - 'trips': 'survey_trips.csv', + "households": "survey_households.csv", + "persons": "survey_persons.csv", + "tours": "survey_tours.csv", + "joint_tour_participants": "survey_joint_tour_participants.csv", + "trips": "survey_trips.csv", } @@ -39,30 +39,62 @@ data_dir = args[0] -input_dir = os.path.join(data_dir, 'survey_data/') -output_dir = os.path.join(data_dir, 'survey_data/') +input_dir = os.path.join(data_dir, "survey_data/") +output_dir = os.path.join(data_dir, "survey_data/") -configs_dir = os.path.dirname('../example/configs/') +configs_dir = os.path.dirname("../example/configs/") -households = pd.read_csv(os.path.join(input_dir, inputs['households'])) -persons = pd.read_csv(os.path.join(input_dir, inputs['persons'])) -tours = pd.read_csv(os.path.join(input_dir, inputs['tours'])) -joint_tour_participants = pd.read_csv(os.path.join(input_dir, inputs['joint_tour_participants'])) -trips = pd.read_csv(os.path.join(input_dir, inputs['trips'])) +households = pd.read_csv(os.path.join(input_dir, inputs["households"])) +persons = pd.read_csv(os.path.join(input_dir, inputs["persons"])) +tours = pd.read_csv(os.path.join(input_dir, inputs["tours"])) +joint_tour_participants = pd.read_csv( + os.path.join(input_dir, inputs["joint_tour_participants"]) +) +trips = pd.read_csv(os.path.join(input_dir, inputs["trips"])) households = households[ - ['household_id', 'home_zone_id', 'income', 'hhsize', 'HHT', 'auto_ownership', 'num_workers'] + [ + "household_id", + "home_zone_id", + "income", + "hhsize", + "HHT", + "auto_ownership", + "num_workers", + ] ] persons = persons[ - ['person_id', 'household_id', 'age', 'PNUM', 'sex', - 'pemploy', 'pstudent', 'ptype', 'school_zone_id', 'workplace_zone_id', 'free_parking_at_work'] + [ + "person_id", + "household_id", + "age", + "PNUM", + "sex", + "pemploy", + "pstudent", + "ptype", + "school_zone_id", + "workplace_zone_id", + "free_parking_at_work", + ] ] tours = tours[ - ['tour_id', 'person_id', 'household_id', 'tour_type', 'tour_category', - 'destination', 'origin', 'start', 'end', 'tour_mode', 'parent_tour_id'] + [ + "tour_id", + "person_id", + "household_id", + "tour_type", + "tour_category", + "destination", + "origin", + "start", + "end", + "tour_mode", + "parent_tour_id", + ] ] joint_tour_participants = joint_tour_participants[ - ['participant_id', 'tour_id', 'household_id', 'person_id', 'participant_num'] + ["participant_id", "tour_id", "household_id", "person_id", "participant_num"] ] OPTIONAL_TRIP_COLUMNS = [] @@ -70,12 +102,25 @@ # OPTIONAL_TRIP_COLUMNS = ['trip_num'] trips = trips[ - ['trip_id', 'person_id', 'household_id', 'tour_id', 'outbound', 'purpose', - 'destination', 'origin', 'depart', 'trip_mode'] + OPTIONAL_TRIP_COLUMNS + [ + "trip_id", + "person_id", + "household_id", + "tour_id", + "outbound", + "purpose", + "destination", + "origin", + "depart", + "trip_mode", + ] + + OPTIONAL_TRIP_COLUMNS ] -households.to_csv(os.path.join(output_dir, surveys['households']), index=False) -persons.to_csv(os.path.join(output_dir, surveys['persons']), index=False) -tours.to_csv(os.path.join(output_dir, surveys['tours']), index=False) -joint_tour_participants.to_csv(os.path.join(output_dir, surveys['joint_tour_participants']), index=False) -trips.to_csv(os.path.join(output_dir, surveys['trips']), index=False) +households.to_csv(os.path.join(output_dir, surveys["households"]), index=False) +persons.to_csv(os.path.join(output_dir, surveys["persons"]), index=False) +tours.to_csv(os.path.join(output_dir, surveys["tours"]), index=False) +joint_tour_participants.to_csv( + os.path.join(output_dir, surveys["joint_tour_participants"]), index=False +) +trips.to_csv(os.path.join(output_dir, surveys["trips"]), index=False) diff --git a/activitysim/examples/example_estimation/scripts/infer.py b/activitysim/examples/example_estimation/scripts/infer.py index 9bbf4ab275..f60b94900f 100644 --- a/activitysim/examples/example_estimation/scripts/infer.py +++ b/activitysim/examples/example_estimation/scripts/infer.py @@ -1,85 +1,60 @@ # ActivitySim # See full license in LICENSE.txt. -import sys -import os import logging -import yaml +import os +import sys import numpy as np import pandas as pd +import yaml +from activitysim.abm.models.util import canonical_ids as cid from activitysim.abm.models.util import tour_frequency as tf from activitysim.core.util import reindex -from activitysim.abm.models.util import canonical_ids as cid - logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # create console handler with a higher log level ch = logging.StreamHandler() -ch.setFormatter(logging.Formatter('%(levelname)s - %(message)s')) +ch.setFormatter(logging.Formatter("%(levelname)s - %(message)s")) logger.addHandler(ch) CONSTANTS = {} -SURVEY_TOUR_ID = 'survey_tour_id' -SURVEY_PARENT_TOUR_ID = 'survey_parent_tour_id' -SURVEY_PARTICIPANT_ID = 'survey_participant_id' -SURVEY_TRIP_ID = 'survey_trip_id' -ASIM_TOUR_ID = 'tour_id' -ASIM_PARENT_TOUR_ID = 'parent_tour_id' -ASIM_TRIP_ID = 'trip_id' +SURVEY_TOUR_ID = "survey_tour_id" +SURVEY_PARENT_TOUR_ID = "survey_parent_tour_id" +SURVEY_PARTICIPANT_ID = "survey_participant_id" +SURVEY_TRIP_ID = "survey_trip_id" +ASIM_TOUR_ID = "tour_id" +ASIM_PARENT_TOUR_ID = "parent_tour_id" +ASIM_TRIP_ID = "trip_id" -ASIM_PARTICIPANT_ID = 'participant_id' +ASIM_PARTICIPANT_ID = "participant_id" survey_tables = { - 'households': { - 'file_name': 'survey_households.csv', - 'index': 'household_id' - }, - 'persons': { - 'file_name': 'survey_persons.csv', - 'index': 'person_id' - }, - 'tours': { - 'file_name': 'survey_tours.csv' - }, - 'joint_tour_participants': { - 'file_name': 'survey_joint_tour_participants.csv' - }, - 'trips': { - 'file_name': 'survey_trips.csv' - }, + "households": {"file_name": "survey_households.csv", "index": "household_id"}, + "persons": {"file_name": "survey_persons.csv", "index": "person_id"}, + "tours": {"file_name": "survey_tours.csv"}, + "joint_tour_participants": {"file_name": "survey_joint_tour_participants.csv"}, + "trips": {"file_name": "survey_trips.csv"}, } outputs = { - 'households': 'override_households.csv', - 'persons': 'override_persons.csv', - 'tours': 'override_tours.csv', - 'joint_tour_participants': 'override_joint_tour_participants.csv', - 'trips': 'override_trips.csv', + "households": "override_households.csv", + "persons": "override_persons.csv", + "tours": "override_tours.csv", + "joint_tour_participants": "override_joint_tour_participants.csv", + "trips": "override_trips.csv", } control_tables = { - 'households': { - 'file_name': 'final_households.csv', - 'index': 'household_id' - }, - 'persons': { - 'file_name': 'final_persons.csv', - 'index': 'person_id' - }, - 'tours': { - 'file_name': 'final_tours.csv' - }, - 'joint_tour_participants': { - 'file_name': 'final_joint_tour_participants.csv' - }, - 'trips': { - 'file_name': 'final_trips.csv' - }, + "households": {"file_name": "final_households.csv", "index": "household_id"}, + "persons": {"file_name": "final_persons.csv", "index": "person_id"}, + "tours": {"file_name": "final_tours.csv"}, + "joint_tour_participants": {"file_name": "final_joint_tour_participants.csv"}, + "trips": {"file_name": "final_trips.csv"}, } apply_controls = True skip_controls = not apply_controls @@ -95,83 +70,118 @@ def unmangle_ids(ids): def infer_cdap_activity(persons, tours, joint_tour_participants): - mandatory_tour_types = ['work', 'school'] - non_mandatory_tour_types = ['escort', 'shopping', 'othmaint', 'othdiscr', 'eatout', 'social'] - - num_mandatory_tours = \ - tours[tours.tour_type.isin(mandatory_tour_types)].\ - groupby('person_id').size().\ - reindex(persons.index).fillna(0).astype(np.int8) - - num_non_mandatory_tours = \ - tours[tours.tour_type.isin(non_mandatory_tour_types)].\ - groupby('person_id').size().\ - reindex(persons.index).fillna(0).astype(np.int8) - - num_joint_tours = \ - joint_tour_participants.\ - groupby('person_id').size().\ - reindex(persons.index).fillna(0).astype(np.int8) + mandatory_tour_types = ["work", "school"] + non_mandatory_tour_types = [ + "escort", + "shopping", + "othmaint", + "othdiscr", + "eatout", + "social", + ] + + num_mandatory_tours = ( + tours[tours.tour_type.isin(mandatory_tour_types)] + .groupby("person_id") + .size() + .reindex(persons.index) + .fillna(0) + .astype(np.int8) + ) + + num_non_mandatory_tours = ( + tours[tours.tour_type.isin(non_mandatory_tour_types)] + .groupby("person_id") + .size() + .reindex(persons.index) + .fillna(0) + .astype(np.int8) + ) + + num_joint_tours = ( + joint_tour_participants.groupby("person_id") + .size() + .reindex(persons.index) + .fillna(0) + .astype(np.int8) + ) num_non_mandatory_tours += num_joint_tours - cdap_activity = pd.Series('H', index=persons.index) - cdap_activity = cdap_activity.where(num_mandatory_tours == 0, 'M') - cdap_activity = cdap_activity.where((cdap_activity == 'M') | (num_non_mandatory_tours == 0), 'N') + cdap_activity = pd.Series("H", index=persons.index) + cdap_activity = cdap_activity.where(num_mandatory_tours == 0, "M") + cdap_activity = cdap_activity.where( + (cdap_activity == "M") | (num_non_mandatory_tours == 0), "N" + ) return cdap_activity def infer_mandatory_tour_frequency(persons, tours): - num_work_tours = \ - tours[tours.tour_type == 'work'].\ - groupby('person_id').size().reindex(persons.index).fillna(0).astype(np.int8) - - num_school_tours = \ - tours[tours.tour_type == 'school'].\ - groupby('person_id').size().reindex(persons.index).fillna(0).astype(np.int8) + num_work_tours = ( + tours[tours.tour_type == "work"] + .groupby("person_id") + .size() + .reindex(persons.index) + .fillna(0) + .astype(np.int8) + ) + + num_school_tours = ( + tours[tours.tour_type == "school"] + .groupby("person_id") + .size() + .reindex(persons.index) + .fillna(0) + .astype(np.int8) + ) mtf = { - 0: '', - 1: 'work1', - 2: 'work2', - 10: 'school1', - 20: 'school2', - 11: 'work_and_school' + 0: "", + 1: "work1", + 2: "work2", + 10: "school1", + 20: "school2", + 11: "work_and_school", } - mandatory_tour_frequency = (num_work_tours + num_school_tours*10).map(mtf) + mandatory_tour_frequency = (num_work_tours + num_school_tours * 10).map(mtf) return mandatory_tour_frequency def infer_non_mandatory_tour_frequency(configs_dir, persons, tours): - def read_alts(): # escort,shopping,othmaint,othdiscr,eatout,social # 0,0,0,0,0,0 # 0,0,0,1,0,0, ... - alts = \ - pd.read_csv(os.path.join(configs_dir, 'non_mandatory_tour_frequency_alternatives.csv'), - comment='#') + alts = pd.read_csv( + os.path.join(configs_dir, "non_mandatory_tour_frequency_alternatives.csv"), + comment="#", + ) alts = alts.astype(np.int8) # - NARROW return alts - tours = tours[tours.tour_category == 'non_mandatory'] + tours = tours[tours.tour_category == "non_mandatory"] alts = read_alts() tour_types = list(alts.columns.values) # tour_frequency is index in alts table - alts['alt_id'] = alts.index + alts["alt_id"] = alts.index # actual tour counts (may exceed counts envisioned by alts) unconstrained_tour_counts = pd.DataFrame(index=persons.index) for tour_type in tour_types: - unconstrained_tour_counts[tour_type] = \ - tours[tours.tour_type == tour_type].\ - groupby('person_id').size().reindex(persons.index).fillna(0).astype(np.int8) + unconstrained_tour_counts[tour_type] = ( + tours[tours.tour_type == tour_type] + .groupby("person_id") + .size() + .reindex(persons.index) + .fillna(0) + .astype(np.int8) + ) # unextend tour counts # activitysim extend tours counts based on a probability table @@ -180,10 +190,14 @@ def read_alts(): max_tour_counts = alts[tour_types].max(axis=0) constrained_tour_counts = pd.DataFrame(index=persons.index) for tour_type in tour_types: - constrained_tour_counts[tour_type] = unconstrained_tour_counts[tour_type].clip(upper=max_tour_counts[tour_type]) + constrained_tour_counts[tour_type] = unconstrained_tour_counts[tour_type].clip( + upper=max_tour_counts[tour_type] + ) # persons whose tours were constrained who aren't eligible for extension becuase they have > 4 constrained tours - has_constrained_tours = (unconstrained_tour_counts != constrained_tour_counts).any(axis=1) + has_constrained_tours = (unconstrained_tour_counts != constrained_tour_counts).any( + axis=1 + ) print("%s persons with constrained tours" % (has_constrained_tours.sum())) too_many_tours = has_constrained_tours & constrained_tour_counts.sum(axis=1) > 4 if too_many_tours.any(): @@ -195,72 +209,107 @@ def read_alts(): # determine alt id corresponding to constrained_tour_counts # need to do index waltz because pd.merge doesn't preserve index in this case - alt_id = \ - pd.merge(constrained_tour_counts.reset_index(), alts, - left_on=tour_types, right_on=tour_types, how='left').set_index(persons.index.name).alt_id + alt_id = ( + pd.merge( + constrained_tour_counts.reset_index(), + alts, + left_on=tour_types, + right_on=tour_types, + how="left", + ) + .set_index(persons.index.name) + .alt_id + ) # did we end up with any tour frequencies not in alts? if alt_id.isna().any(): bad_tour_frequencies = alt_id.isna() logger.warning("WARNING Bad joint tour frequencies\n\n") - logger.warning("\nWARNING Bad non_mandatory tour frequencies: num_tours\n%s" % - constrained_tour_counts[bad_tour_frequencies]) - logger.warning("\nWARNING Bad non_mandatory tour frequencies: num_tours\n%s" % - tours[tours.person_id.isin(persons.index[bad_tour_frequencies])].sort_values('person_id')) + logger.warning( + "\nWARNING Bad non_mandatory tour frequencies: num_tours\n%s" + % constrained_tour_counts[bad_tour_frequencies] + ) + logger.warning( + "\nWARNING Bad non_mandatory tour frequencies: num_tours\n%s" + % tours[ + tours.person_id.isin(persons.index[bad_tour_frequencies]) + ].sort_values("person_id") + ) bug - tf = unconstrained_tour_counts.rename(columns={tour_type: '_%s' % tour_type for tour_type in tour_types}) - tf['non_mandatory_tour_frequency'] = alt_id + tf = unconstrained_tour_counts.rename( + columns={tour_type: "_%s" % tour_type for tour_type in tour_types} + ) + tf["non_mandatory_tour_frequency"] = alt_id return tf def infer_joint_tour_frequency(configs_dir, households, tours): - def read_alts(): # right now this file just contains the start and end hour - alts = \ - pd.read_csv(os.path.join(configs_dir, 'joint_tour_frequency_alternatives.csv'), - comment='#', index_col='alt') + alts = pd.read_csv( + os.path.join(configs_dir, "joint_tour_frequency_alternatives.csv"), + comment="#", + index_col="alt", + ) alts = alts.astype(np.int8) # - NARROW return alts alts = read_alts() tour_types = list(alts.columns.values) - assert(len(alts.index[(alts == 0).all(axis=1)]) == 1) # should be one zero_tours alt + assert len(alts.index[(alts == 0).all(axis=1)]) == 1 # should be one zero_tours alt zero_tours_alt = alts.index[(alts == 0).all(axis=1)].values[0] - alts['joint_tour_frequency'] = alts.index - joint_tours = tours[tours.tour_category == 'joint'] + alts["joint_tour_frequency"] = alts.index + joint_tours = tours[tours.tour_category == "joint"] num_tours = pd.DataFrame(index=households.index) for tour_type in tour_types: - joint_tour_is_tour_type = (joint_tours.tour_type == tour_type) + joint_tour_is_tour_type = joint_tours.tour_type == tour_type if joint_tour_is_tour_type.any(): - num_tours[tour_type] = \ - joint_tours[joint_tour_is_tour_type].\ - groupby('household_id').size().\ - reindex(households.index).fillna(0) + num_tours[tour_type] = ( + joint_tours[joint_tour_is_tour_type] + .groupby("household_id") + .size() + .reindex(households.index) + .fillna(0) + ) else: - logger.warning("WARNING infer_joint_tour_frequency - no tours of type '%s'" % tour_type) + logger.warning( + "WARNING infer_joint_tour_frequency - no tours of type '%s'" % tour_type + ) num_tours[tour_type] = 0 num_tours = num_tours.fillna(0).astype(np.int64) # need to do index waltz because pd.merge doesn't preserve index in this case - jtf = pd.merge(num_tours.reset_index(), alts, left_on=tour_types, right_on=tour_types, how='left').\ - set_index(households.index.name) + jtf = pd.merge( + num_tours.reset_index(), + alts, + left_on=tour_types, + right_on=tour_types, + how="left", + ).set_index(households.index.name) if jtf.joint_tour_frequency.isna().any(): bad_tour_frequencies = jtf.joint_tour_frequency.isna() logger.warning("WARNING Bad joint tour frequencies\n\n") - logger.warning("\nWARNING Bad joint tour frequencies: num_tours\n%s" % - num_tours[bad_tour_frequencies]) - logger.warning("\nWARNING Bad joint tour frequencies: num_tours\n%s" % - joint_tours[joint_tours.household_id.isin(households.index[bad_tour_frequencies])]) + logger.warning( + "\nWARNING Bad joint tour frequencies: num_tours\n%s" + % num_tours[bad_tour_frequencies] + ) + logger.warning( + "\nWARNING Bad joint tour frequencies: num_tours\n%s" + % joint_tours[ + joint_tours.household_id.isin(households.index[bad_tour_frequencies]) + ] + ) bug - logger.info("infer_joint_tour_frequency: %s households with joint tours", - (jtf.joint_tour_frequency != zero_tours_alt).sum()) + logger.info( + "infer_joint_tour_frequency: %s households with joint tours", + (jtf.joint_tour_frequency != zero_tours_alt).sum(), + ) return jtf.joint_tour_frequency @@ -270,31 +319,45 @@ def infer_joint_tour_composition(persons, tours, joint_tour_participants): assign joint_tours a 'composition' column ('adults', 'children', or 'mixed') depending on the composition of the joint_tour_participants """ - joint_tours = tours[tours.tour_category == 'joint'].copy() + joint_tours = tours[tours.tour_category == "joint"].copy() - joint_tour_participants = \ - pd.merge(joint_tour_participants, persons, - left_on='person_id', right_index=True, how='left') + joint_tour_participants = pd.merge( + joint_tour_participants, + persons, + left_on="person_id", + right_index=True, + how="left", + ) # FIXME - computed by asim annotate persons - not needed if embeded in asim and called just-in-time - if 'adult' not in joint_tour_participants: - joint_tour_participants['adult'] = (joint_tour_participants.age >= 18) - - tour_has_adults = \ - joint_tour_participants[joint_tour_participants.adult]\ - .groupby(SURVEY_TOUR_ID).size()\ - .reindex(joint_tours[SURVEY_TOUR_ID]).fillna(0) > 0 - - tour_has_children = \ - joint_tour_participants[~joint_tour_participants.adult]\ - .groupby([SURVEY_TOUR_ID]).size()\ - .reindex(joint_tours[SURVEY_TOUR_ID]).fillna(0) > 0 + if "adult" not in joint_tour_participants: + joint_tour_participants["adult"] = joint_tour_participants.age >= 18 + + tour_has_adults = ( + joint_tour_participants[joint_tour_participants.adult] + .groupby(SURVEY_TOUR_ID) + .size() + .reindex(joint_tours[SURVEY_TOUR_ID]) + .fillna(0) + > 0 + ) + + tour_has_children = ( + joint_tour_participants[~joint_tour_participants.adult] + .groupby([SURVEY_TOUR_ID]) + .size() + .reindex(joint_tours[SURVEY_TOUR_ID]) + .fillna(0) + > 0 + ) assert (tour_has_adults | tour_has_children).all() - joint_tours['composition'] = np.where(tour_has_adults, np.where(tour_has_children, 'mixed', 'adults'), 'children') + joint_tours["composition"] = np.where( + tour_has_adults, np.where(tour_has_children, "mixed", "adults"), "children" + ) - return joint_tours.composition.reindex(tours.index).fillna('').astype(str) + return joint_tours.composition.reindex(tours.index).fillna("").astype(str) def infer_tour_scheduling(configs_dir, tours): @@ -302,11 +365,13 @@ def infer_tour_scheduling(configs_dir, tours): def read_tdd_alts(): # right now this file just contains the start and end hour - tdd_alts = pd.read_csv(os.path.join(configs_dir, 'tour_departure_and_duration_alternatives.csv')) - tdd_alts['duration'] = tdd_alts.end - tdd_alts.start + tdd_alts = pd.read_csv( + os.path.join(configs_dir, "tour_departure_and_duration_alternatives.csv") + ) + tdd_alts["duration"] = tdd_alts.end - tdd_alts.start tdd_alts = tdd_alts.astype(np.int8) # - NARROW - tdd_alts['tdd'] = tdd_alts.index + tdd_alts["tdd"] = tdd_alts.index return tdd_alts tdd_alts = read_tdd_alts() @@ -317,7 +382,13 @@ def read_tdd_alts(): assert tours.end.isin(tdd_alts.end).all(), "not all tour starts in tdd_alts" - tdds = pd.merge(tours[['start', 'end']], tdd_alts, left_on=['start', 'end'], right_on=['start', 'end'], how='left') + tdds = pd.merge( + tours[["start", "end"]], + tdd_alts, + left_on=["start", "end"], + right_on=["start", "end"], + how="left", + ) if tdds.tdd.isna().any(): bad_tdds = tours[tdds.tdd.isna()] @@ -332,30 +403,35 @@ def read_tdd_alts(): def patch_tour_ids(persons, tours, joint_tour_participants): - def set_tour_index(tours, parent_tour_num_col, is_joint): - group_cols = ['person_id', 'tour_category', 'tour_type'] + group_cols = ["person_id", "tour_category", "tour_type"] - if 'parent_tour_num' in tours: - group_cols += ['parent_tour_num'] + if "parent_tour_num" in tours: + group_cols += ["parent_tour_num"] - tours['tour_type_num'] = \ + tours["tour_type_num"] = ( tours.sort_values(by=group_cols).groupby(group_cols).cumcount() + 1 + ) - return cid.set_tour_index(tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint) + return cid.set_tour_index( + tours, parent_tour_num_col=parent_tour_num_col, is_joint=is_joint + ) - assert 'mandatory_tour_frequency' in persons + assert "mandatory_tour_frequency" in persons # replace survey_tour ids with asim standard tour_ids (which are based on person_id and tour_type) ##################### # mandatory tours ##################### - mandatory_tours = \ - set_tour_index(tours[tours.tour_category == 'mandatory'], parent_tour_num_col=None, is_joint=False) + mandatory_tours = set_tour_index( + tours[tours.tour_category == "mandatory"], + parent_tour_num_col=None, + is_joint=False, + ) - assert mandatory_tours.index.name == 'tour_id' + assert mandatory_tours.index.name == "tour_id" ##################### # joint tours @@ -363,43 +439,52 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): # joint tours tour_id was assigned based on person_id of the first person in household (PNUM == 1) # because the actual point person forthe tour is only identified later in joint_tour_participants) - temp_point_persons = persons.loc[persons.PNUM == 1, ['household_id']] - temp_point_persons['person_id'] = temp_point_persons.index - temp_point_persons.set_index('household_id', inplace=True) + temp_point_persons = persons.loc[persons.PNUM == 1, ["household_id"]] + temp_point_persons["person_id"] = temp_point_persons.index + temp_point_persons.set_index("household_id", inplace=True) # patch person_id with value of temp_point_person_id and use it to set_tour_index - joint_tours = tours[tours.tour_category == 'joint'] - joint_tours['cache_point_person_id'] = joint_tours['person_id'] - joint_tours['person_id'] = reindex(temp_point_persons.person_id, joint_tours.household_id) + joint_tours = tours[tours.tour_category == "joint"] + joint_tours["cache_point_person_id"] = joint_tours["person_id"] + joint_tours["person_id"] = reindex( + temp_point_persons.person_id, joint_tours.household_id + ) joint_tours = set_tour_index(joint_tours, parent_tour_num_col=None, is_joint=True) - joint_tours['person_id'] = joint_tours['cache_point_person_id'] - del joint_tours['cache_point_person_id'] + joint_tours["person_id"] = joint_tours["cache_point_person_id"] + del joint_tours["cache_point_person_id"] # patch tour_id column in patched_joint_tour_participants patched_joint_tour_participants = joint_tour_participants.copy() asim_tour_id = pd.Series(joint_tours.index, index=joint_tours[SURVEY_TOUR_ID]) - patched_joint_tour_participants[ASIM_TOUR_ID] = \ - reindex(asim_tour_id, patched_joint_tour_participants[SURVEY_TOUR_ID]) + patched_joint_tour_participants[ASIM_TOUR_ID] = reindex( + asim_tour_id, patched_joint_tour_participants[SURVEY_TOUR_ID] + ) # participant_id is formed by combining tour_id and participant pern.PNUM # pathological knowledge, but awkward to conflate with joint_tour_participation.py logic - participant_pnum = reindex(persons.PNUM, patched_joint_tour_participants['person_id']) - patched_joint_tour_participants[ASIM_PARTICIPANT_ID] = \ - (patched_joint_tour_participants[ASIM_TOUR_ID] * cid.MAX_PARTICIPANT_PNUM) + participant_pnum + participant_pnum = reindex( + persons.PNUM, patched_joint_tour_participants["person_id"] + ) + patched_joint_tour_participants[ASIM_PARTICIPANT_ID] = ( + patched_joint_tour_participants[ASIM_TOUR_ID] * cid.MAX_PARTICIPANT_PNUM + ) + participant_pnum ##################### # non_mandatory tours ##################### - non_mandatory_tours = \ - set_tour_index(tours[tours.tour_category == 'non_mandatory'], parent_tour_num_col=None, is_joint=False) + non_mandatory_tours = set_tour_index( + tours[tours.tour_category == "non_mandatory"], + parent_tour_num_col=None, + is_joint=False, + ) ##################### # atwork tours ##################### - atwork_tours = tours[tours.tour_category == 'atwork'] + atwork_tours = tours[tours.tour_category == "atwork"] # patch atwork tours parent_tour_id before assigning their tour_id @@ -407,50 +492,71 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): # tours for students with both work and school trips should have lower tour_num for school # tours are already sorted, but schools comes before work (which is alphabetical, not the alternative id order), # so work_and_school tour_nums are correct for students (school=1, work=2) but workers need to be flipped - mandatory_tour_frequency = \ - reindex(persons.mandatory_tour_frequency, mandatory_tours.person_id) - is_worker = \ - reindex(persons.pemploy, mandatory_tours.person_id).\ - isin([CONSTANTS['PEMPLOY_FULL'], CONSTANTS['PEMPLOY_PART']]) - work_and_school_and_worker = (mandatory_tour_frequency == 'work_and_school') & is_worker + mandatory_tour_frequency = reindex( + persons.mandatory_tour_frequency, mandatory_tours.person_id + ) + is_worker = reindex(persons.pemploy, mandatory_tours.person_id).isin( + [CONSTANTS["PEMPLOY_FULL"], CONSTANTS["PEMPLOY_PART"]] + ) + work_and_school_and_worker = ( + mandatory_tour_frequency == "work_and_school" + ) & is_worker # calculate tour_num for work tours (required to set_tour_index for atwork subtours) parent_tours = mandatory_tours[[SURVEY_TOUR_ID]] - parent_tours['tour_num'] = \ - mandatory_tours.\ - sort_values(by=['person_id', 'tour_category', 'tour_type']).\ - groupby(['person_id', 'tour_category']).cumcount() + 1 - - parent_tours.tour_num = parent_tours.tour_num.where(~work_and_school_and_worker, 3 - parent_tours.tour_num) + parent_tours["tour_num"] = ( + mandatory_tours.sort_values(by=["person_id", "tour_category", "tour_type"]) + .groupby(["person_id", "tour_category"]) + .cumcount() + + 1 + ) + + parent_tours.tour_num = parent_tours.tour_num.where( + ~work_and_school_and_worker, 3 - parent_tours.tour_num + ) parent_tours = parent_tours.set_index(SURVEY_TOUR_ID, drop=True) # temporarily add parent_tour_num column to atwork tours, call set_tour_index, and then delete it - atwork_tours['parent_tour_num'] = reindex(parent_tours.tour_num, atwork_tours[SURVEY_PARENT_TOUR_ID]) + atwork_tours["parent_tour_num"] = reindex( + parent_tours.tour_num, atwork_tours[SURVEY_PARENT_TOUR_ID] + ) - atwork_tours = set_tour_index(atwork_tours, parent_tour_num_col='parent_tour_num', is_joint=False) + atwork_tours = set_tour_index( + atwork_tours, parent_tour_num_col="parent_tour_num", is_joint=False + ) - del atwork_tours['parent_tour_num'] + del atwork_tours["parent_tour_num"] # tours['household_id'] = reindex(persons.household_id, tours.person_id) - asim_tour_id = pd.Series(mandatory_tours.index, index=mandatory_tours[SURVEY_TOUR_ID]) - atwork_tours[ASIM_PARENT_TOUR_ID] = reindex(asim_tour_id, atwork_tours[SURVEY_PARENT_TOUR_ID]) + asim_tour_id = pd.Series( + mandatory_tours.index, index=mandatory_tours[SURVEY_TOUR_ID] + ) + atwork_tours[ASIM_PARENT_TOUR_ID] = reindex( + asim_tour_id, atwork_tours[SURVEY_PARENT_TOUR_ID] + ) ##################### # concat tours ##################### # only true for fake data - assert (mandatory_tours.index == unmangle_ids(mandatory_tours[SURVEY_TOUR_ID])).all() + assert ( + mandatory_tours.index == unmangle_ids(mandatory_tours[SURVEY_TOUR_ID]) + ).all() assert (joint_tours.index == unmangle_ids(joint_tours[SURVEY_TOUR_ID])).all() - assert (non_mandatory_tours.index == unmangle_ids(non_mandatory_tours[SURVEY_TOUR_ID])).all() + assert ( + non_mandatory_tours.index == unmangle_ids(non_mandatory_tours[SURVEY_TOUR_ID]) + ).all() - patched_tours = pd.concat([mandatory_tours, joint_tours, non_mandatory_tours, atwork_tours]) + patched_tours = pd.concat( + [mandatory_tours, joint_tours, non_mandatory_tours, atwork_tours] + ) assert patched_tours.index.name == ASIM_TOUR_ID patched_tours = patched_tours.reset_index() - del patched_tours['tour_type_num'] + del patched_tours["tour_type_num"] assert ASIM_TOUR_ID in patched_tours assert ASIM_PARENT_TOUR_ID in patched_tours @@ -461,9 +567,14 @@ def set_tour_index(tours, parent_tour_num_col, is_joint): def infer_atwork_subtour_frequency(configs_dir, tours): # first column is 'atwork_subtour_frequency' nickname, remaining columns are trip type counts - alts = pd.read_csv(os.path.join(configs_dir, 'atwork_subtour_frequency_alternatives.csv'), comment='#') - tour_types = list(alts.drop(columns=alts.columns[0]).columns) # get trip_types, ignoring first column - alts['alt_id'] = alts.index + alts = pd.read_csv( + os.path.join(configs_dir, "atwork_subtour_frequency_alternatives.csv"), + comment="#", + ) + tour_types = list( + alts.drop(columns=alts.columns[0]).columns + ) # get trip_types, ignoring first column + alts["alt_id"] = alts.index # alt eat business maint alt_id # 0 no_subtours 0 0 0 0 @@ -473,40 +584,58 @@ def infer_atwork_subtour_frequency(configs_dir, tours): # 4 business2 0 2 0 4 # 5 eat_business 1 1 0 5 - work_tours = tours[tours.tour_type == 'work'] + work_tours = tours[tours.tour_type == "work"] work_tours = work_tours[[ASIM_TOUR_ID]] - subtours = tours[tours.tour_category == 'atwork'] - subtours = subtours[['tour_id', 'tour_type', 'parent_tour_id']] + subtours = tours[tours.tour_category == "atwork"] + subtours = subtours[["tour_id", "tour_type", "parent_tour_id"]] # actual tour counts (may exceed counts envisioned by alts) tour_counts = pd.DataFrame(index=work_tours[ASIM_TOUR_ID]) for tour_type in tour_types: # count subtours of this type by parent_tour_id - tour_type_count = subtours[subtours.tour_type == tour_type].groupby('parent_tour_id').size() + tour_type_count = ( + subtours[subtours.tour_type == tour_type].groupby("parent_tour_id").size() + ) # backfill with 0 count - tour_counts[tour_type] = tour_type_count.reindex(tour_counts.index).fillna(0).astype(np.int8) + tour_counts[tour_type] = ( + tour_type_count.reindex(tour_counts.index).fillna(0).astype(np.int8) + ) # determine alt id corresponding to constrained_tour_counts # need to do index waltz because pd.merge doesn't preserve index in this case - tour_counts = \ - pd.merge(tour_counts.reset_index(), alts, - left_on=tour_types, right_on=tour_types, how='left').set_index(tour_counts.index.name) + tour_counts = pd.merge( + tour_counts.reset_index(), + alts, + left_on=tour_types, + right_on=tour_types, + how="left", + ).set_index(tour_counts.index.name) atwork_subtour_frequency = tour_counts.alt # did we end up with any tour frequencies not in alts? if atwork_subtour_frequency.isna().any(): bad_tour_frequencies = atwork_subtour_frequency.isna() - logger.warning("WARNING Bad atwork subtour frequencies for %s work tours" % bad_tour_frequencies.sum()) - logger.warning("WARNING Bad atwork subtour frequencies: num_tours\n%s" % - tour_counts[bad_tour_frequencies]) - logger.warning("WARNING Bad atwork subtour frequencies: num_tours\n%s" % - subtours[subtours.parent_tour_id.isin(tour_counts[bad_tour_frequencies].index)]. - sort_values('parent_tour_id')) + logger.warning( + "WARNING Bad atwork subtour frequencies for %s work tours" + % bad_tour_frequencies.sum() + ) + logger.warning( + "WARNING Bad atwork subtour frequencies: num_tours\n%s" + % tour_counts[bad_tour_frequencies] + ) + logger.warning( + "WARNING Bad atwork subtour frequencies: num_tours\n%s" + % subtours[ + subtours.parent_tour_id.isin(tour_counts[bad_tour_frequencies].index) + ].sort_values("parent_tour_id") + ) bug - atwork_subtour_frequency = reindex(atwork_subtour_frequency, tours[ASIM_TOUR_ID]).fillna('') + atwork_subtour_frequency = reindex( + atwork_subtour_frequency, tours[ASIM_TOUR_ID] + ).fillna("") return atwork_subtour_frequency @@ -522,7 +651,9 @@ def patch_trip_ids(tours, trips): # patch tour_id foreign key # tours['household_id'] = reindex(persons.household_id, tours.person_id) - asim_tour_id = pd.Series(tours[ASIM_TOUR_ID].values, index=tours[SURVEY_TOUR_ID].values) + asim_tour_id = pd.Series( + tours[ASIM_TOUR_ID].values, index=tours[SURVEY_TOUR_ID].values + ) trips[ASIM_TOUR_ID] = reindex(asim_tour_id, trips[SURVEY_TOUR_ID]) # person_is_university = persons.pstudent == constants.PSTUDENT_UNIVERSITY @@ -533,16 +664,18 @@ def patch_trip_ids(tours, trips): # trips['primary_purpose'] = reindex(tour_primary_purpose, trips.tour_id) # if order is ambiguous if trips depart in same time slot - order by SURVEY_TRIP_ID hoping that increases with time - if 'trip_num' not in trips: - trips['trip_num'] = \ - trips.sort_values(by=['tour_id', 'outbound', 'depart', SURVEY_TRIP_ID]).\ - groupby(['tour_id', 'outbound']).\ - cumcount() + 1 + if "trip_num" not in trips: + trips["trip_num"] = ( + trips.sort_values(by=["tour_id", "outbound", "depart", SURVEY_TRIP_ID]) + .groupby(["tour_id", "outbound"]) + .cumcount() + + 1 + ) cid.set_trip_index(trips) assert trips.index.name == ASIM_TRIP_ID - trips = trips.reset_index().rename(columns={'trip_id': ASIM_TRIP_ID}) + trips = trips.reset_index().rename(columns={"trip_id": ASIM_TRIP_ID}) return trips @@ -553,18 +686,20 @@ def infer_stop_frequency(configs_dir, tours, trips): # 0out_0in,0,0 # 0out_1in,0,1 # ... - alts = pd.read_csv(os.path.join(configs_dir, 'stop_frequency_alternatives.csv'), comment='#') - assert 'alt' in alts - assert 'in' in alts - assert 'out' in alts + alts = pd.read_csv( + os.path.join(configs_dir, "stop_frequency_alternatives.csv"), comment="#" + ) + assert "alt" in alts + assert "in" in alts + assert "out" in alts freq = pd.DataFrame(index=tours[SURVEY_TOUR_ID]) # number of trips is one less than number of stops - freq['out'] = trips[trips.outbound].groupby(SURVEY_TOUR_ID).trip_num.max() - 1 - freq['in'] = trips[~trips.outbound].groupby(SURVEY_TOUR_ID).trip_num.max() - 1 + freq["out"] = trips[trips.outbound].groupby(SURVEY_TOUR_ID).trip_num.max() - 1 + freq["in"] = trips[~trips.outbound].groupby(SURVEY_TOUR_ID).trip_num.max() - 1 - freq = pd.merge(freq.reset_index(), alts, on=['out', 'in'], how='left') + freq = pd.merge(freq.reset_index(), alts, on=["out", "in"], how="left") assert (freq[SURVEY_TOUR_ID] == tours[SURVEY_TOUR_ID]).all() @@ -574,37 +709,41 @@ def infer_stop_frequency(configs_dir, tours, trips): def read_tables(input_dir, tables): for table, info in tables.items(): - table = pd.read_csv(os.path.join(input_dir, info['file_name']), index_col=info.get('index')) + table = pd.read_csv( + os.path.join(input_dir, info["file_name"]), index_col=info.get("index") + ) # coerce missing data in string columns to empty strings, not NaNs for c in table.columns: # read_csv converts empty string to NaN, even if all non-empty values are strings - if table[c].dtype == 'object': + if table[c].dtype == "object": print("##### converting", c, table[c].dtype) - table[c] = table[c].fillna('').astype(str) - info['table'] = table + table[c] = table[c].fillna("").astype(str) + info["table"] = table - households = tables['households'].get('table') - persons = tables['persons'].get('table') - tours = tables['tours'].get('table') - joint_tour_participants = tables['joint_tour_participants'].get('table') - trips = tables['trips'].get('table') + households = tables["households"].get("table") + persons = tables["persons"].get("table") + tours = tables["tours"].get("table") + joint_tour_participants = tables["joint_tour_participants"].get("table") + trips = tables["trips"].get("table") return households, persons, tours, joint_tour_participants, trips def check_controls(table_name, column_name): - table = survey_tables[table_name].get('table') - c_table = control_tables[table_name].get('table') + table = survey_tables[table_name].get("table") + c_table = control_tables[table_name].get("table") - if column_name == 'index': - dont_match = (table.index != c_table.index) + if column_name == "index": + dont_match = table.index != c_table.index else: - dont_match = (table[column_name] != c_table[column_name]) + dont_match = table[column_name] != c_table[column_name] if dont_match.any(): - print("check_controls %s.%s: %s out of %s do not match" % - (table_name, column_name, dont_match.sum(), len(table))) + print( + "check_controls %s.%s: %s out of %s do not match" + % (table_name, column_name, dont_match.sum(), len(table)) + ) print("control\n%s" % c_table[dont_match][[column_name]]) print("survey\n%s" % table[dont_match][[column_name]]) @@ -617,75 +756,96 @@ def check_controls(table_name, column_name): def infer(configs_dir, input_dir, output_dir): - households, persons, tours, joint_tour_participants, trips = read_tables(input_dir, survey_tables) + households, persons, tours, joint_tour_participants, trips = read_tables( + input_dir, survey_tables + ) # be explicit about all tour_ids to avoid confusion between asim and survey ids - tours = tours.rename(columns={'tour_id': SURVEY_TOUR_ID, 'parent_tour_id': SURVEY_PARENT_TOUR_ID}) - joint_tour_participants = \ - joint_tour_participants.rename(columns={'tour_id': SURVEY_TOUR_ID, 'participant_id': SURVEY_PARTICIPANT_ID}) - trips = trips.rename(columns={'trip_id': SURVEY_TRIP_ID, 'tour_id': SURVEY_TOUR_ID}) + tours = tours.rename( + columns={"tour_id": SURVEY_TOUR_ID, "parent_tour_id": SURVEY_PARENT_TOUR_ID} + ) + joint_tour_participants = joint_tour_participants.rename( + columns={"tour_id": SURVEY_TOUR_ID, "participant_id": SURVEY_PARTICIPANT_ID} + ) + trips = trips.rename(columns={"trip_id": SURVEY_TRIP_ID, "tour_id": SURVEY_TOUR_ID}) # mangle survey tour ids to keep us honest tours[SURVEY_TOUR_ID] = mangle_ids(tours[SURVEY_TOUR_ID]) tours[SURVEY_PARENT_TOUR_ID] = mangle_ids(tours[SURVEY_PARENT_TOUR_ID]) - joint_tour_participants[SURVEY_TOUR_ID] = mangle_ids(joint_tour_participants[SURVEY_TOUR_ID]) - joint_tour_participants[SURVEY_PARTICIPANT_ID] = mangle_ids(joint_tour_participants[SURVEY_PARTICIPANT_ID]) + joint_tour_participants[SURVEY_TOUR_ID] = mangle_ids( + joint_tour_participants[SURVEY_TOUR_ID] + ) + joint_tour_participants[SURVEY_PARTICIPANT_ID] = mangle_ids( + joint_tour_participants[SURVEY_PARTICIPANT_ID] + ) trips[SURVEY_TRIP_ID] = mangle_ids(trips[SURVEY_TRIP_ID]) trips[SURVEY_TOUR_ID] = mangle_ids(trips[SURVEY_TOUR_ID]) # persons.cdap_activity - persons['cdap_activity'] = infer_cdap_activity(persons, tours, joint_tour_participants) + persons["cdap_activity"] = infer_cdap_activity( + persons, tours, joint_tour_participants + ) # check but don't assert as this is not deterministic - skip_controls or check_controls('persons', 'cdap_activity') + skip_controls or check_controls("persons", "cdap_activity") # persons.mandatory_tour_frequency - persons['mandatory_tour_frequency'] = infer_mandatory_tour_frequency(persons, tours) - assert skip_controls or check_controls('persons', 'mandatory_tour_frequency') + persons["mandatory_tour_frequency"] = infer_mandatory_tour_frequency(persons, tours) + assert skip_controls or check_controls("persons", "mandatory_tour_frequency") # persons.non_mandatory_tour_frequency tour_frequency = infer_non_mandatory_tour_frequency(configs_dir, persons, tours) for c in tour_frequency.columns: print("assigning persons", c) persons[c] = tour_frequency[c] - assert skip_controls or check_controls('persons', 'non_mandatory_tour_frequency') + assert skip_controls or check_controls("persons", "non_mandatory_tour_frequency") # patch_tour_ids - tours, joint_tour_participants = patch_tour_ids(persons, tours, joint_tour_participants) - survey_tables['tours']['table'] = tours - survey_tables['joint_tour_participants']['table'] = joint_tour_participants + tours, joint_tour_participants = patch_tour_ids( + persons, tours, joint_tour_participants + ) + survey_tables["tours"]["table"] = tours + survey_tables["joint_tour_participants"]["table"] = joint_tour_participants - assert skip_controls or check_controls('tours', 'index') - assert skip_controls or check_controls('joint_tour_participants', 'index') + assert skip_controls or check_controls("tours", "index") + assert skip_controls or check_controls("joint_tour_participants", "index") # patch_tour_ids trips = patch_trip_ids(tours, trips) - survey_tables['trips']['table'] = trips # so we can check_controls - assert skip_controls or check_controls('trips', 'index') + survey_tables["trips"]["table"] = trips # so we can check_controls + assert skip_controls or check_controls("trips", "index") # households.joint_tour_frequency - households['joint_tour_frequency'] = infer_joint_tour_frequency(configs_dir, households, tours) - assert skip_controls or check_controls('households', 'joint_tour_frequency') + households["joint_tour_frequency"] = infer_joint_tour_frequency( + configs_dir, households, tours + ) + assert skip_controls or check_controls("households", "joint_tour_frequency") # tours.composition - tours['composition'] = infer_joint_tour_composition(persons, tours, joint_tour_participants) - assert skip_controls or check_controls('tours', 'composition') + tours["composition"] = infer_joint_tour_composition( + persons, tours, joint_tour_participants + ) + assert skip_controls or check_controls("tours", "composition") # tours.tdd - tours['tdd'] = infer_tour_scheduling(configs_dir, tours) - assert skip_controls or check_controls('tours', 'tdd') + tours["tdd"] = infer_tour_scheduling(configs_dir, tours) + assert skip_controls or check_controls("tours", "tdd") - tours['atwork_subtour_frequency'] = infer_atwork_subtour_frequency(configs_dir, tours) - assert skip_controls or check_controls('tours', 'atwork_subtour_frequency') + tours["atwork_subtour_frequency"] = infer_atwork_subtour_frequency( + configs_dir, tours + ) + assert skip_controls or check_controls("tours", "atwork_subtour_frequency") - tours['stop_frequency'] = infer_stop_frequency(configs_dir, tours, trips) - assert skip_controls or check_controls('tours', 'stop_frequency') + tours["stop_frequency"] = infer_stop_frequency(configs_dir, tours, trips) + assert skip_controls or check_controls("tours", "stop_frequency") # write output files - households.to_csv(os.path.join(output_dir, outputs['households']), index=True) - persons.to_csv(os.path.join(output_dir, outputs['persons']), index=True) - tours.to_csv(os.path.join(output_dir, outputs['tours']), index=False) - joint_tour_participants.to_csv(os.path.join(output_dir, outputs['joint_tour_participants']), index=False) - trips.to_csv(os.path.join(output_dir, outputs['trips']), index=False) + households.to_csv(os.path.join(output_dir, outputs["households"]), index=True) + persons.to_csv(os.path.join(output_dir, outputs["persons"]), index=True) + tours.to_csv(os.path.join(output_dir, outputs["tours"]), index=False) + joint_tour_participants.to_csv( + os.path.join(output_dir, outputs["joint_tour_participants"]), index=False + ) + trips.to_csv(os.path.join(output_dir, outputs["trips"]), index=False) # python infer.py data @@ -695,10 +855,10 @@ def infer(configs_dir, input_dir, output_dir): data_dir = args[0] configs_dir = args[1] -with open(os.path.join(configs_dir, 'constants.yaml')) as stream: +with open(os.path.join(configs_dir, "constants.yaml")) as stream: CONSTANTS = yaml.load(stream, Loader=yaml.SafeLoader) -input_dir = os.path.join(data_dir, 'survey_data/') +input_dir = os.path.join(data_dir, "survey_data/") output_dir = input_dir if apply_controls: diff --git a/activitysim/examples/example_marin/scripts/marin_crop.py b/activitysim/examples/example_marin/scripts/marin_crop.py index f3a92cc0eb..68dcbc8d5f 100644 --- a/activitysim/examples/example_marin/scripts/marin_crop.py +++ b/activitysim/examples/example_marin/scripts/marin_crop.py @@ -1,28 +1,37 @@ # crop marin tvpb example data processing to one county # Ben Stabler, ben.stabler@rsginc.com, 09/17/20 -import os -import pandas as pd -import openmatrix as omx import argparse +import os + import numpy as np +import openmatrix as omx +import pandas as pd MAZ_OFFSET = 100000 segments = { - 'test': {'DistName': ["Downtown SF"]}, - 'marin_sf': {'CountyName': ["Marin", "San Francisco"]}, - 'full': {}, + "test": {"DistName": ["Downtown SF"]}, + "marin_sf": {"CountyName": ["Marin", "San Francisco"]}, + "full": {}, } -parser = argparse.ArgumentParser(description='crop Marin raw_data') -parser.add_argument('segment_name', metavar='segment_name', type=str, nargs=1, - help=f"geography segmentation (e.g. full)") - -parser.add_argument('-c', '--check_geography', - default=False, - action='store_true', - help='check consistency of MAZ, TAZ, TAP zone_ids and foreign keys & write orphan_households file') +parser = argparse.ArgumentParser(description="crop Marin raw_data") +parser.add_argument( + "segment_name", + metavar="segment_name", + type=str, + nargs=1, + help=f"geography segmentation (e.g. full)", +) + +parser.add_argument( + "-c", + "--check_geography", + default=False, + action="store_true", + help="check consistency of MAZ, TAZ, TAP zone_ids and foreign keys & write orphan_households file", +) args = parser.parse_args() @@ -32,8 +41,8 @@ assert segment_name in segments.keys(), f"Unknown seg: {segment_name}" -input_dir = './data_raw' -output_dir = f'./data_{segment_name}' +input_dir = "./data_raw" +output_dir = f"./data_{segment_name}" print(f"segment_name {segment_name}") @@ -58,7 +67,7 @@ def output_path(file_name): def patch_maz(df, maz_offset): for c in df.columns: - if c in ['MAZ', 'OMAZ', 'DMAZ', 'mgra', 'orig_mgra', 'dest_mgra']: + if c in ["MAZ", "OMAZ", "DMAZ", "mgra", "orig_mgra", "dest_mgra"]: df[c] += maz_offset return df @@ -89,8 +98,8 @@ def to_csv(df, file_name): # ######## check for orphan_households not in any maz in land_use land_use = read_csv(LAND_USE) - land_use = land_use[['MAZ', 'TAZ']] - land_use = land_use.sort_values(['TAZ', 'MAZ']) + land_use = land_use[["MAZ", "TAZ"]] + land_use = land_use.sort_values(["TAZ", "MAZ"]) households = read_csv(HOUSEHOLDS) orphan_households = households[~households.MAZ.isin(land_use.MAZ)] @@ -106,12 +115,14 @@ def to_csv(df, file_name): # could just build maz and taz files, but want to make sure PSRC data is right land_use = read_csv(LAND_USE) - land_use = land_use.sort_values('MAZ') - maz = read_csv(MAZ_TAZ).sort_values('MAZ') + land_use = land_use.sort_values("MAZ") + maz = read_csv(MAZ_TAZ).sort_values("MAZ") # ### FATAL ### if not land_use.MAZ.isin(maz.MAZ).all(): - print(f"land_use.MAZ not in maz.MAZ\n{land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)]}") + print( + f"land_use.MAZ not in maz.MAZ\n{land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)]}" + ) raise RuntimeError(f"land_use.MAZ not in maz.MAZ") if not maz.MAZ.isin(land_use.MAZ).all(): @@ -119,7 +130,9 @@ def to_csv(df, file_name): # ### FATAL ### if not land_use.TAZ.isin(maz.TAZ).all(): - print(f"land_use.TAZ not in maz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)]}") + print( + f"land_use.TAZ not in maz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)]}" + ) raise RuntimeError(f"land_use.TAZ not in maz.TAZ") if not maz.TAZ.isin(land_use.TAZ).all(): @@ -137,18 +150,18 @@ def to_csv(df, file_name): land_use = land_use[land_use[slice_col].isin(slice_values)] print(f"land_use shape after slicing {land_use.shape}") -to_csv(land_use, 'land_use.csv') +to_csv(land_use, "land_use.csv") # maz_taz, tazs, taps -maz_taz = land_use[['MAZ', 'TAZ']] +maz_taz = land_use[["MAZ", "TAZ"]] to_csv(maz_taz, "maz_taz.csv") tazs = land_use["TAZ"].unique() tazs.sort() taps = read_csv(TAP_MAZ) -taps = taps[['TAP', 'TAZ']].sort_values(by='TAP') +taps = taps[["TAP", "TAZ"]].sort_values(by="TAP") taps = taps[taps["TAZ"].isin(tazs)] to_csv(taps, "tap.csv") @@ -158,22 +171,32 @@ def to_csv(df, file_name): maz_maz_walk = read_csv("maz_maz_walk.csv") maz_maz_bike = read_csv("maz_maz_bike.csv") -maz_tap_walk = maz_tap_walk[maz_tap_walk["MAZ"].isin(land_use["MAZ"]) & maz_tap_walk["TAP"].isin(taps["TAP"])] -maz_maz_walk = maz_maz_walk[maz_maz_walk["OMAZ"].isin(land_use["MAZ"]) & maz_maz_walk["DMAZ"].isin(land_use["MAZ"])] -maz_maz_bike = maz_maz_bike[maz_maz_bike["OMAZ"].isin(land_use["MAZ"]) & maz_maz_bike["DMAZ"].isin(land_use["MAZ"])] +maz_tap_walk = maz_tap_walk[ + maz_tap_walk["MAZ"].isin(land_use["MAZ"]) & maz_tap_walk["TAP"].isin(taps["TAP"]) +] +maz_maz_walk = maz_maz_walk[ + maz_maz_walk["OMAZ"].isin(land_use["MAZ"]) + & maz_maz_walk["DMAZ"].isin(land_use["MAZ"]) +] +maz_maz_bike = maz_maz_bike[ + maz_maz_bike["OMAZ"].isin(land_use["MAZ"]) + & maz_maz_bike["DMAZ"].isin(land_use["MAZ"]) +] to_csv(maz_tap_walk, "maz_tap_walk.csv") to_csv(maz_maz_walk, "maz_maz_walk.csv") to_csv(maz_maz_bike, "maz_maz_bike.csv") tap_lines = read_csv("tap_lines.csv") -tap_lines = tap_lines[tap_lines['TAP'].isin(taps["TAP"])] +tap_lines = tap_lines[tap_lines["TAP"].isin(taps["TAP"])] to_csv(tap_lines, "tap_lines.csv") # taz to tap drive data taz_tap_drive = read_csv("maz_taz_tap_drive.csv") -taz_tap_drive = taz_tap_drive[taz_tap_drive["MAZ"].isin(land_use["MAZ"]) & taz_tap_drive["TAP"].isin(taps["TAP"])] +taz_tap_drive = taz_tap_drive[ + taz_tap_drive["MAZ"].isin(land_use["MAZ"]) & taz_tap_drive["TAP"].isin(taps["TAP"]) +] to_csv(taz_tap_drive, "maz_taz_tap_drive.csv") @@ -200,7 +223,10 @@ def to_csv(df, file_name): work_tours = read_csv(WORK_TOURS) work_tours = work_tours[work_tours["hh_id"].isin(households["HHID"])] -work_tours = work_tours[work_tours["orig_mgra"].isin(land_use["MAZ"]) & work_tours["dest_mgra"].isin(land_use["MAZ"])] +work_tours = work_tours[ + work_tours["orig_mgra"].isin(land_use["MAZ"]) + & work_tours["dest_mgra"].isin(land_use["MAZ"]) +] to_csv(work_tours, "work_tours.csv") # skims @@ -213,11 +239,11 @@ def to_csv(df, file_name): # taz skims with skim_data_type np.float32 are under 2GB - otherwise we would need to further segment them for tp in time_periods: - in_file_name = f'HWYSKM{tp}_taz_rename.omx' + in_file_name = f"HWYSKM{tp}_taz_rename.omx" taz_file_in = omx.open_file(input_path(in_file_name)) - out_file_name = f'highway_skims_{tp}.omx' - taz_file_out = omx.open_file(output_path(out_file_name), 'w') - taz_file_out.create_mapping('ZONE', tazs.tolist()) + out_file_name = f"highway_skims_{tp}.omx" + taz_file_out = omx.open_file(output_path(out_file_name), "w") + taz_file_out.create_mapping("ZONE", tazs.tolist()) for mat_name in taz_file_in.list_matrices(): # make sure we have a vanilla numpy array, not a CArray m = np.asanyarray(taz_file_in[mat_name]).astype(skim_data_type) @@ -228,11 +254,11 @@ def to_csv(df, file_name): taz_file_out.close() for skim_set in ["SET1", "SET2", "SET3"]: - out_file_name = f'transit_skims_{skim_set}.omx' - tap_file_out = omx.open_file(output_path(out_file_name), 'w') - tap_file_out.create_mapping('TAP', taps["TAP"].tolist()) + out_file_name = f"transit_skims_{skim_set}.omx" + tap_file_out = omx.open_file(output_path(out_file_name), "w") + tap_file_out.create_mapping("TAP", taps["TAP"].tolist()) for tp in time_periods: - in_file_name = f'transit_skims_{tp}_{skim_set}_rename.omx' + in_file_name = f"transit_skims_{tp}_{skim_set}_rename.omx" tap_file_in = omx.open_file(input_path(in_file_name)) for mat_name in tap_file_in.list_matrices(): # make sure we have a vanilla numpy array, not a CArray diff --git a/activitysim/examples/example_marin/scripts/marin_fix.py b/activitysim/examples/example_marin/scripts/marin_fix.py index 6cc1b08d9e..55ad498a72 100644 --- a/activitysim/examples/example_marin/scripts/marin_fix.py +++ b/activitysim/examples/example_marin/scripts/marin_fix.py @@ -2,11 +2,14 @@ # so data input files look 'realistic' - and that work is done instaed by 'import_tours' annotation expression files import os -import pandas as pd + import openmatrix as omx +import pandas as pd -input_dir = './data_3_marin' -output_dir = './data_3_marin/fix' # don't overwrite - but these files shold replace 'oritinals' +input_dir = "./data_3_marin" +output_dir = ( + "./data_3_marin/fix" # don't overwrite - but these files shold replace 'oritinals' +) def input_path(filenane): @@ -20,8 +23,8 @@ def output_path(filenane): # 0 - get county zones mazs = pd.read_csv(input_path("maz_data_asim.csv")) -del mazs['zone_id'] -del mazs['county_id'] +del mazs["zone_id"] +del mazs["county_id"] mazs.to_csv(output_path("maz_data_asim.csv"), index=False) tazs = mazs["TAZ"].unique() @@ -44,9 +47,9 @@ def output_path(filenane): maz_maz_walk = pd.read_csv(input_path("maz_maz_walk.csv")) maz_maz_bike = pd.read_csv(input_path("maz_maz_bike.csv")) -del maz_tap_walk['TAP.1'] -del maz_maz_walk['DMAZ.1'] -del maz_maz_bike['DMAZ.1'] +del maz_tap_walk["TAP.1"] +del maz_maz_walk["DMAZ.1"] +del maz_maz_bike["DMAZ.1"] maz_tap_walk.to_csv(output_path("maz_tap_walk.csv"), index=False) maz_maz_walk.to_csv(output_path("maz_maz_walk.csv"), index=False) @@ -55,7 +58,7 @@ def output_path(filenane): # 3 - accessibility data access = pd.read_csv(input_path("access.csv")) -del access['zone_id'] +del access["zone_id"] access.to_csv(output_path("access.csv"), index=False) # 4 - maz to tap drive data @@ -67,17 +70,17 @@ def output_path(filenane): # 5 - households households = pd.read_csv(input_path("households_asim.csv")) -del households['home_zone_id'] -del households['household_id'] +del households["home_zone_id"] +del households["household_id"] households.to_csv(output_path("households_asim.csv"), index=False) # 6 - persons persons = pd.read_csv(input_path("persons_asim.csv")) -del persons['person_id'] -del persons['household_id'] -del persons['is_university'] +del persons["person_id"] +del persons["household_id"] +del persons["is_university"] persons.to_csv(output_path("persons_asim.csv"), index=False) # 7 - tours file diff --git a/activitysim/examples/example_marin/scripts/marin_work_tour_mode_choice_data.py b/activitysim/examples/example_marin/scripts/marin_work_tour_mode_choice_data.py index bfbaefddd3..e31212549c 100644 --- a/activitysim/examples/example_marin/scripts/marin_work_tour_mode_choice_data.py +++ b/activitysim/examples/example_marin/scripts/marin_work_tour_mode_choice_data.py @@ -1,9 +1,8 @@ - # marin tvpb example data processing # Ben Stabler, ben.stabler@rsginc.com, 09/17/20 -import pandas as pd import openmatrix as omx +import pandas as pd # command to run the underdevelopment example # python simulation.py -c configs_3_zone_marin -d data_3_marin -o output_3_marin @@ -12,10 +11,10 @@ # 1 - fix skim names, put time periods at end and make all names unique -time_periods = ["AM", "EA", "EV", "MD", "PM"] +time_periods = ["AM", "EA", "EV", "MD", "PM"] for tp in time_periods: - taz_file = omx.open_file('HWYSKM' + tp + '_taz.omx') - taz_file_rename = omx.open_file('HWYSKM' + tp + '_taz_rename.omx', 'w') + taz_file = omx.open_file("HWYSKM" + tp + "_taz.omx") + taz_file_rename = omx.open_file("HWYSKM" + tp + "_taz_rename.omx", "w") for mat_name in taz_file.list_matrices(): taz_file_rename[mat_name + "__" + tp] = taz_file[mat_name][:] print(mat_name + "__" + tp) @@ -23,39 +22,57 @@ taz_file_rename.close() for tp in time_periods: - for skim_set in ["SET1", "SET2", "SET3"]: - tap_file = omx.open_file('transit_skims_' + tp + '_' + skim_set + '.omx') - tap_file_rename = omx.open_file('transit_skims_' + tp + '_' + skim_set + '_rename.omx', 'w') + for skim_set in ["SET1", "SET2", "SET3"]: + tap_file = omx.open_file("transit_skims_" + tp + "_" + skim_set + ".omx") + tap_file_rename = omx.open_file( + "transit_skims_" + tp + "_" + skim_set + "_rename.omx", "w" + ) for mat_name in tap_file.list_matrices(): - tap_file_rename[mat_name + "_" + skim_set + "__" + tp] = tap_file[mat_name][:] - print(mat_name + '_' + skim_set + "__" + tp) + tap_file_rename[mat_name + "_" + skim_set + "__" + tp] = tap_file[mat_name][ + : + ] + print(mat_name + "_" + skim_set + "__" + tp) tap_file.close() tap_file_rename.close() # 2 - nearby skims need headers -maz_tap_walk = pd.read_csv("2015_test_2019_02_13_Part3/skims/ped_distance_maz_tap.txt", header=None) -maz_maz_walk = pd.read_csv("2015_test_2019_02_13_Part3/skims/ped_distance_maz_maz.txt", header=None) -maz_maz_bike = pd.read_csv("2015_test_2019_02_13_Part3/skims/bike_distance_maz_maz.txt", header=None) - -maz_tap_walk.columns = ["MAZ", "TAP", "TAP", "WALK_TRANSIT_GEN_COST", "WALK_TRANSIT_DIST"] -maz_maz_walk.columns = ["OMAZ", "DMAZ", "DMAZ", "WALK_GEN_COST", "WALK_DIST"] -maz_maz_bike.columns = ["OMAZ", "DMAZ", "DMAZ", "BIKE_GEN_COST", "BIKE_DIST"] +maz_tap_walk = pd.read_csv( + "2015_test_2019_02_13_Part3/skims/ped_distance_maz_tap.txt", header=None +) +maz_maz_walk = pd.read_csv( + "2015_test_2019_02_13_Part3/skims/ped_distance_maz_maz.txt", header=None +) +maz_maz_bike = pd.read_csv( + "2015_test_2019_02_13_Part3/skims/bike_distance_maz_maz.txt", header=None +) + +maz_tap_walk.columns = [ + "MAZ", + "TAP", + "TAP", + "WALK_TRANSIT_GEN_COST", + "WALK_TRANSIT_DIST", +] +maz_maz_walk.columns = ["OMAZ", "DMAZ", "DMAZ", "WALK_GEN_COST", "WALK_DIST"] +maz_maz_bike.columns = ["OMAZ", "DMAZ", "DMAZ", "BIKE_GEN_COST", "BIKE_DIST"] maz_tap_walk["WALK_TRANSIT_DIST"] = maz_tap_walk["WALK_TRANSIT_DIST"] / 5280 # miles maz_maz_walk["WALK_DIST"] = maz_maz_walk["WALK_DIST"] / 5280 # miles maz_maz_bike["BIKE_DIST"] = maz_maz_bike["BIKE_DIST"] / 5280 # miles -maz_tap_walk[["MAZ", "TAP", "WALK_TRANSIT_DIST"]].to_csv("maz_tap_walk.csv", index=False) -maz_maz_walk[["OMAZ", "DMAZ", "WALK_DIST"]].to_csv("maz_maz_walk.csv", index=False) -maz_maz_bike[["OMAZ", "DMAZ", "BIKE_DIST"]].to_csv("maz_maz_bike.csv", index=False) +maz_tap_walk[["MAZ", "TAP", "WALK_TRANSIT_DIST"]].to_csv( + "maz_tap_walk.csv", index=False +) +maz_maz_walk[["OMAZ", "DMAZ", "WALK_DIST"]].to_csv("maz_maz_walk.csv", index=False) +maz_maz_bike[["OMAZ", "DMAZ", "BIKE_DIST"]].to_csv("maz_maz_bike.csv", index=False) # 3 - maz data mazs = pd.read_csv("2015_test_2019_02_13_Part2/landuse/maz_data_withDensity.csv") pcost = pd.read_csv("2015_test_2019_02_13/ctramp_output/mgraParkingCost.csv") -mazs = pd.concat([mazs, pcost], axis=1) +mazs = pd.concat([mazs, pcost], axis=1) mazs = mazs.fillna(0) tazs = pd.read_csv("2015_test_2019_02_13_Part2/landuse/taz_data.csv") @@ -81,22 +98,28 @@ taz_tap_drive = pd.read_csv("2015_test_2019_02_13_Part3/skims/drive_maz_taz_tap.csv") -taz_tap_drive = taz_tap_drive.pivot_table(index=["FTAZ", "TTAP"], values=['DTIME', 'DDIST', "WDIST"], fill_value=0) +taz_tap_drive = taz_tap_drive.pivot_table( + index=["FTAZ", "TTAP"], values=["DTIME", "DDIST", "WDIST"], fill_value=0 +) taz_tap_drive.columns = list(map("".join, taz_tap_drive.columns)) taz_tap_drive = taz_tap_drive.reset_index() taz_tap_drive = taz_tap_drive.set_index("FTAZ") taz_tap_drive["TAP"] = taz_tap_drive["TTAP"] -taz_tap_drive = pd.merge(mazs[["MAZ", "TAZ"]], taz_tap_drive, left_on=['TAZ'], right_on=['FTAZ']) -taz_tap_drive[["MAZ", "TAP", "DDIST", "DTIME", "WDIST"]].to_csv("maz_taz_tap_drive.csv", index=False) +taz_tap_drive = pd.merge( + mazs[["MAZ", "TAZ"]], taz_tap_drive, left_on=["TAZ"], right_on=["FTAZ"] +) +taz_tap_drive[["MAZ", "TAP", "DDIST", "DTIME", "WDIST"]].to_csv( + "maz_taz_tap_drive.csv", index=False +) # 6 - tours file, we just need work tours itour = pd.read_csv("2015_test_2019_02_13/ctramp_output/indivTourData_3.csv") work_tours = itour[itour["tour_purpose"] == "Work"] -work_tours["tour_id"] = range(1, len(work_tours)+1) +work_tours["tour_id"] = range(1, len(work_tours) + 1) work_tours["household_id"] = work_tours["hh_id"] work_tours = work_tours.set_index("tour_id", drop=False) @@ -136,13 +159,13 @@ # 9 - replace existing pipeline tables for restart for now # run simple three zone example and get output pipeline and then replace tables before tour mode choice -pipeline = pd.io.pytables.HDFStore('pipeline.h5') +pipeline = pd.io.pytables.HDFStore("pipeline.h5") pipeline.keys() -pipeline['/accessibility/compute_accessibility'] = access # index zone_id -pipeline['/households/joint_tour_frequency'] = households # index household_id -pipeline['/persons/non_mandatory_tour_frequency'] = persons # index person_id -pipeline['/land_use/initialize_landuse'] = mazs # index zone_id -pipeline['/tours/non_mandatory_tour_scheduling'] = work_tours # index tour_id +pipeline["/accessibility/compute_accessibility"] = access # index zone_id +pipeline["/households/joint_tour_frequency"] = households # index household_id +pipeline["/persons/non_mandatory_tour_frequency"] = persons # index person_id +pipeline["/land_use/initialize_landuse"] = mazs # index zone_id +pipeline["/tours/non_mandatory_tour_scheduling"] = work_tours # index tour_id pipeline.close() diff --git a/activitysim/examples/example_marin/test/simulation.py b/activitysim/examples/example_marin/test/simulation.py index ec6a1181b1..0fc64d5390 100755 --- a/activitysim/examples/example_marin/test/simulation.py +++ b/activitysim/examples/example_marin/test/simulation.py @@ -1,12 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse +import sys from activitysim.cli.run import add_run_args, run -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_marin/test/test_marin.py b/activitysim/examples/example_marin/test/test_marin.py index 6217e8b690..c40cc0adb9 100644 --- a/activitysim/examples/example_marin/test/test_marin.py +++ b/activitysim/examples/example_marin/test/test_marin.py @@ -2,10 +2,10 @@ # See full license in LICENSE.txt. import os import subprocess -import pkg_resources import pandas as pd import pandas.testing as pdt +import pkg_resources from activitysim.core import inject @@ -16,33 +16,45 @@ def teardown_function(func): def test_marin(): - def example_path(dirname): - resource = os.path.join('examples', 'example_marin', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_marin", dirname) + return pkg_resources.resource_filename("activitysim", resource) def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) def regress(): - regress_trips_df = pd.read_csv(test_path('regress/final_tours.csv')) - final_trips_df = pd.read_csv(test_path('output/final_tours.csv')) + regress_trips_df = pd.read_csv(test_path("regress/final_tours.csv")) + final_trips_df = pd.read_csv(test_path("output/final_tours.csv")) # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] pdt.assert_frame_equal(final_trips_df, regress_trips_df) - file_path = os.path.join(os.path.dirname(__file__), 'simulation.py') - - subprocess.run(['coverage', 'run', '-a', file_path, - '-c', test_path('configs'), '-c', example_path('configs'), - '-d', example_path('data'), - '-o', test_path('output')], check=True) + file_path = os.path.join(os.path.dirname(__file__), "simulation.py") + + subprocess.run( + [ + "coverage", + "run", + "-a", + file_path, + "-c", + test_path("configs"), + "-c", + example_path("configs"), + "-d", + example_path("data"), + "-o", + test_path("output"), + ], + check=True, + ) regress() -if __name__ == '__main__': +if __name__ == "__main__": test_marin() diff --git a/activitysim/examples/example_mtc/simulation.py b/activitysim/examples/example_mtc/simulation.py index f68bee0dbd..e89ab18e33 100644 --- a/activitysim/examples/example_mtc/simulation.py +++ b/activitysim/examples/example_mtc/simulation.py @@ -1,14 +1,13 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse +import sys from activitysim import abm # register injectables - from activitysim.cli.run import add_run_args, run -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_mtc/test/simulation.py b/activitysim/examples/example_mtc/test/simulation.py index ec6a1181b1..0fc64d5390 100755 --- a/activitysim/examples/example_mtc/test/simulation.py +++ b/activitysim/examples/example_mtc/test/simulation.py @@ -1,12 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse +import sys from activitysim.cli.run import add_run_args, run -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_mtc/test/test_mtc.py b/activitysim/examples/example_mtc/test/test_mtc.py index a81c439cbd..61bf81cec2 100644 --- a/activitysim/examples/example_mtc/test/test_mtc.py +++ b/activitysim/examples/example_mtc/test/test_mtc.py @@ -2,10 +2,10 @@ # See full license in LICENSE.txt. import os import subprocess -import pkg_resources import pandas as pd import pandas.testing as pdt +import pkg_resources from activitysim.core import inject @@ -16,36 +16,61 @@ def teardown_function(func): def run_test_mtc(multiprocess=False, chunkless=False): - def example_path(dirname): - resource = os.path.join('examples', 'example_mtc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_mtc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) def regress(): - regress_trips_df = pd.read_csv(test_path('regress/final_trips.csv')) - final_trips_df = pd.read_csv(test_path('output/final_trips.csv')) + regress_trips_df = pd.read_csv(test_path("regress/final_trips.csv")) + final_trips_df = pd.read_csv(test_path("output/final_trips.csv")) # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] pdt.assert_frame_equal(final_trips_df, regress_trips_df) - file_path = os.path.join(os.path.dirname(__file__), 'simulation.py') + file_path = os.path.join(os.path.dirname(__file__), "simulation.py") if multiprocess: - run_args = ['-c', test_path('configs_mp'), '-c', example_path('configs_mp'), '-c', example_path('configs'), - '-d', example_path('data'), '-o', test_path('output')] + run_args = [ + "-c", + test_path("configs_mp"), + "-c", + example_path("configs_mp"), + "-c", + example_path("configs"), + "-d", + example_path("data"), + "-o", + test_path("output"), + ] elif chunkless: - run_args = ['-c', test_path('configs_chunkless'), '-c', example_path('configs'), - '-d', example_path('data'), '-o', test_path('output')] + run_args = [ + "-c", + test_path("configs_chunkless"), + "-c", + example_path("configs"), + "-d", + example_path("data"), + "-o", + test_path("output"), + ] else: - run_args = ['-c', test_path('configs'), '-c', example_path('configs'), - '-d', example_path('data'), '-o', test_path('output')] - - subprocess.run(['coverage', 'run', '-a', file_path] + run_args, check=True) + run_args = [ + "-c", + test_path("configs"), + "-c", + example_path("configs"), + "-d", + example_path("data"), + "-o", + test_path("output"), + ] + + subprocess.run(["coverage", "run", "-a", file_path] + run_args, check=True) regress() @@ -62,7 +87,7 @@ def test_mtc_mp(): run_test_mtc(multiprocess=True) -if __name__ == '__main__': +if __name__ == "__main__": run_test_mtc(multiprocess=False) run_test_mtc(multiprocess=True) diff --git a/activitysim/examples/example_multiple_zone/scripts/three_zone_example_data.py b/activitysim/examples/example_multiple_zone/scripts/three_zone_example_data.py index bc49a2a0ef..248f9e2f45 100644 --- a/activitysim/examples/example_multiple_zone/scripts/three_zone_example_data.py +++ b/activitysim/examples/example_multiple_zone/scripts/three_zone_example_data.py @@ -12,16 +12,15 @@ import os import sys -import pandas as pd import numpy as np import openmatrix as omx - +import pandas as pd # Create example directory -input_data = os.path.join(os.path.dirname(__file__), '../data_1') -output_data = os.path.join(os.path.dirname(__file__), '../data_3') +input_data = os.path.join(os.path.dirname(__file__), "../data_1") +output_data = os.path.join(os.path.dirname(__file__), "../data_3") MAZ_MULTIPLIER = 1000 TAP_OFFSET = 90000 @@ -31,7 +30,7 @@ if os.path.exists(output_data): # shutil.rmtree(output_data) # os.makedirs(output_data) - file_type = ('csv', 'omx') + file_type = ("csv", "omx") for file_name in os.listdir(output_data): if file_name.endswith(file_type): os.unlink(os.path.join(output_data, file_name)) @@ -40,14 +39,14 @@ # ### Convert tazs to mazs and add transit access distance by mode -land_use = pd.read_csv(os.path.join(input_data, 'land_use.csv')) +land_use = pd.read_csv(os.path.join(input_data, "land_use.csv")) -if 'ZONE' in land_use.columns: - land_use.insert(loc=0, column='MAZ', value=land_use.ZONE) - land_use.insert(loc=1, column='TAZ', value=land_use.ZONE) - land_use.drop(columns=['ZONE'], inplace=True) +if "ZONE" in land_use.columns: + land_use.insert(loc=0, column="MAZ", value=land_use.ZONE) + land_use.insert(loc=1, column="TAZ", value=land_use.ZONE) + land_use.drop(columns=["ZONE"], inplace=True) else: - land_use.insert(loc=0, column='MAZ', value=land_use.TAZ) + land_use.insert(loc=0, column="MAZ", value=land_use.TAZ) land_use.TAZ = land_use.TAZ.replace([1, 2, 3, 4], 2) land_use.TAZ = land_use.TAZ.replace([13, 14, 15], 14) @@ -57,32 +56,32 @@ shortWalk = 0.333 # the tm1 example assumes this distance for transit access longWalk = 0.667 -land_use['access_dist_transit'] = shortWalk +land_use["access_dist_transit"] = shortWalk # FIXME - could assign longWalk where maz != taz, but then results wodl differe from one-zone # land_use['access_dist_transit'] =\ # np.where(land_use.TAZ*MAZ_MULTIPLIER==land_use.MAZ, shortWalk, longWalk) -land_use.to_csv(os.path.join(output_data, 'land_use.csv'), index=False) +land_use.to_csv(os.path.join(output_data, "land_use.csv"), index=False) # ### Put households in mazs instead of tazs -households = pd.read_csv(os.path.join(input_data, 'households.csv')) -households.rename(columns={'TAZ': 'MAZ'}, inplace=True) +households = pd.read_csv(os.path.join(input_data, "households.csv")) +households.rename(columns={"TAZ": "MAZ"}, inplace=True) households.MAZ *= MAZ_MULTIPLIER -households.to_csv(os.path.join(output_data, 'households.csv'), index=False) +households.to_csv(os.path.join(output_data, "households.csv"), index=False) -persons = pd.read_csv(os.path.join(input_data, 'persons.csv')) -persons.to_csv(os.path.join(output_data, 'persons.csv'), index=False) +persons = pd.read_csv(os.path.join(input_data, "persons.csv")) +persons.to_csv(os.path.join(output_data, "persons.csv"), index=False) # ### Create maz file # one row per maz, currentlyt he only attribute it its containing TAZ # FIXME - not clear we need this -maz_df = land_use[['MAZ', 'TAZ']] -maz_df.to_csv(os.path.join(output_data, 'maz.csv'), index=False) -print("maz.csv\n%s" % (maz_df.head(6), )) +maz_df = land_use[["MAZ", "TAZ"]] +maz_df.to_csv(os.path.join(output_data, "maz.csv"), index=False) +print("maz.csv\n%s" % (maz_df.head(6),)) # ### Create taz file @@ -93,10 +92,10 @@ # 7 taz_zone_ids = np.unique(land_use.TAZ) -taz_zone_indexes = (taz_zone_ids-1) -taz_df = pd.DataFrame({'TAZ': taz_zone_ids}, index=taz_zone_indexes) -taz_df.to_csv(os.path.join(output_data, 'taz.csv'), index=False) -print("taz.csv\n%s" % (taz_df.head(6), )) +taz_zone_indexes = taz_zone_ids - 1 +taz_df = pd.DataFrame({"TAZ": taz_zone_ids}, index=taz_zone_indexes) +taz_df.to_csv(os.path.join(output_data, "taz.csv"), index=False) +print("taz.csv\n%s" % (taz_df.head(6),)) # currently this has only the one TAZ column, but the legacy table had: # index TAZ @@ -111,23 +110,26 @@ max_distance_for_bike = 5.0 -with omx.open_file(os.path.join(input_data, 'skims.omx')) as ur_skims: +with omx.open_file(os.path.join(input_data, "skims.omx")) as ur_skims: # create df with DIST column - maz_to_maz = pd.DataFrame(ur_skims['DIST']).unstack().reset_index() - maz_to_maz.columns = ['OMAZ', 'DMAZ', 'DIST'] - maz_to_maz['OMAZ'] = (maz_to_maz['OMAZ'] + 1) * MAZ_MULTIPLIER - maz_to_maz['DMAZ'] = (maz_to_maz['DMAZ'] + 1) * MAZ_MULTIPLIER + maz_to_maz = pd.DataFrame(ur_skims["DIST"]).unstack().reset_index() + maz_to_maz.columns = ["OMAZ", "DMAZ", "DIST"] + maz_to_maz["OMAZ"] = (maz_to_maz["OMAZ"] + 1) * MAZ_MULTIPLIER + maz_to_maz["DMAZ"] = (maz_to_maz["DMAZ"] + 1) * MAZ_MULTIPLIER # additional columns - for c in ['DISTBIKE', 'DISTWALK']: + for c in ["DISTBIKE", "DISTWALK"]: maz_to_maz[c] = pd.DataFrame(ur_skims[c]).unstack().values - maz_to_maz.loc[maz_to_maz['DIST'] <= max_distance_for_walk, ['OMAZ', 'DMAZ', 'DISTWALK']].\ - to_csv(os.path.join(output_data, 'maz_to_maz_walk.csv'), index=False) + maz_to_maz.loc[ + maz_to_maz["DIST"] <= max_distance_for_walk, ["OMAZ", "DMAZ", "DISTWALK"] + ].to_csv(os.path.join(output_data, "maz_to_maz_walk.csv"), index=False) - maz_to_maz.loc[maz_to_maz['DIST'] <= max_distance_for_bike, ['OMAZ', 'DMAZ', 'DIST', 'DISTBIKE']].\ - to_csv(os.path.join(output_data, 'maz_to_maz_bike.csv'), index=False) + maz_to_maz.loc[ + maz_to_maz["DIST"] <= max_distance_for_bike, + ["OMAZ", "DMAZ", "DIST", "DISTBIKE"], + ].to_csv(os.path.join(output_data, "maz_to_maz_bike.csv"), index=False) ######## @@ -143,13 +145,16 @@ tap_zone_labels = taz_zone_labels + TAP_OFFSET maz_zone_labels = taz_zone_labels * MAZ_MULTIPLIER tap_df = pd.DataFrame({"TAP": tap_zone_labels, "MAZ": maz_zone_labels}) -tap_df.to_csv(os.path.join(output_data, 'tap.csv'), index=False) +tap_df.to_csv(os.path.join(output_data, "tap.csv"), index=False) # create taz_z3 and tap skims -with \ - omx.open_file(os.path.join(input_data, 'skims.omx'), "r") as ur_skims, \ - omx.open_file(os.path.join(output_data, 'taz_skims.omx'), "w") as output_taz_skims_file, \ - omx.open_file(os.path.join(output_data, 'tap_skims.omx'), "w") as output_tap_skims_file: +with omx.open_file( + os.path.join(input_data, "skims.omx"), "r" +) as ur_skims, omx.open_file( + os.path.join(output_data, "taz_skims.omx"), "w" +) as output_taz_skims_file, omx.open_file( + os.path.join(output_data, "tap_skims.omx"), "w" +) as output_tap_skims_file: for skim_name in ur_skims.list_matrices(): @@ -158,7 +163,7 @@ # print("skim:", skim_name, ": shape", str(new_skim.shape)) mode_code = skim_name[0:3] - is_tap_mode = (mode_code == "DRV" or mode_code == "WLK") + is_tap_mode = mode_code == "DRV" or mode_code == "WLK" is_taz_mode = not is_tap_mode if is_tap_mode: @@ -170,16 +175,16 @@ egress_mode = skim_name[8:11] datum_name = skim_name[12:-4] tod = skim_name[-2:] - if access_mode == 'WLK' and egress_mode == 'WLK': - for suffix in ['FAST', 'SHORT', 'CHEAP']: - if (suffix == 'FAST') and (datum_name == 'TOTIVT'): - random_variation = np.random.rand(*new_skim.shape)*-0.1 + 1.0 - elif (suffix == 'CHEAP') and (datum_name == 'FAR'): + if access_mode == "WLK" and egress_mode == "WLK": + for suffix in ["FAST", "SHORT", "CHEAP"]: + if (suffix == "FAST") and (datum_name == "TOTIVT"): + random_variation = np.random.rand(*new_skim.shape) * -0.1 + 1.0 + elif (suffix == "CHEAP") and (datum_name == "FAR"): random_variation = np.random.rand(*new_skim.shape) * -0.5 + 1.0 else: random_variation = np.ones_like(new_skim) - tap_skim_name = f'{transit_mode}_{datum_name}_{suffix}__{tod}' + tap_skim_name = f"{transit_mode}_{datum_name}_{suffix}__{tod}" output_tap_skims_file[tap_skim_name] = new_skim * random_variation # print(f"tap skim: {skim_name} tap_skim_name: {tap_skim_name}, " # f"shape: {str(output_tap_skims_file.shape())}") @@ -191,19 +196,21 @@ output_taz_skims_file.create_mapping("taz", taz_zone_labels) output_tap_skims_file.create_mapping("tap", tap_zone_labels) -print("taz skims created: " + os.path.join(output_data, 'taz_skims.omx')) -print("tap skims created: " + os.path.join(output_data, 'tap_skims.omx')) +print("taz skims created: " + os.path.join(output_data, "taz_skims.omx")) +print("tap skims created: " + os.path.join(output_data, "tap_skims.omx")) # Create maz to tap distance file by mode -with omx.open_file(os.path.join(input_data, 'skims.omx')) as ur_skims: - distance_table = pd.DataFrame(np.transpose(ur_skims['DIST'])).unstack() +with omx.open_file(os.path.join(input_data, "skims.omx")) as ur_skims: + distance_table = pd.DataFrame(np.transpose(ur_skims["DIST"])).unstack() distance_table = distance_table.reset_index() distance_table.columns = ["MAZ", "TAP", "DIST"] - distance_table['drive_time'] = pd.DataFrame(np.transpose(ur_skims['SOV_TIME__MD'])).unstack().values + distance_table["drive_time"] = ( + pd.DataFrame(np.transpose(ur_skims["SOV_TIME__MD"])).unstack().values + ) - for c in ['DISTBIKE', 'DISTWALK']: + for c in ["DISTBIKE", "DISTWALK"]: distance_table[c] = pd.DataFrame(np.transpose(ur_skims[c])).unstack().values walk_speed = 3 @@ -225,16 +232,19 @@ distance_table = distance_table[distance_table["TAP"].isin(tap_zone_labels)] -distance_table.loc[distance_table['DIST'] <= max_distance_for_nearby_taps_walk, - ['MAZ', 'TAP', 'DISTWALK', 'walk_time']]. \ - to_csv(os.path.join(output_data, 'maz_to_tap_walk.csv'), index=False) +distance_table.loc[ + distance_table["DIST"] <= max_distance_for_nearby_taps_walk, + ["MAZ", "TAP", "DISTWALK", "walk_time"], +].to_csv(os.path.join(output_data, "maz_to_tap_walk.csv"), index=False) -distance_table.loc[distance_table['DIST'] <= max_distance_for_nearby_taps_bike, - ['MAZ', 'TAP', 'DISTBIKE', 'bike_time']]. \ - to_csv(os.path.join(output_data, 'maz_to_tap_bike.csv'), index=False) +distance_table.loc[ + distance_table["DIST"] <= max_distance_for_nearby_taps_bike, + ["MAZ", "TAP", "DISTBIKE", "bike_time"], +].to_csv(os.path.join(output_data, "maz_to_tap_bike.csv"), index=False) -distance_table.loc[distance_table['DIST'] <= max_distance_for_nearby_taps_drive, - ['MAZ', 'TAP', 'DIST', 'drive_time']]. \ - to_csv(os.path.join(output_data, 'maz_to_tap_drive.csv'), index=False) +distance_table.loc[ + distance_table["DIST"] <= max_distance_for_nearby_taps_drive, + ["MAZ", "TAP", "DIST", "drive_time"], +].to_csv(os.path.join(output_data, "maz_to_tap_drive.csv"), index=False) sys.exit(0) diff --git a/activitysim/examples/example_multiple_zone/scripts/two_zone_example_data.py b/activitysim/examples/example_multiple_zone/scripts/two_zone_example_data.py index 3cd7b51a40..0b247905f5 100644 --- a/activitysim/examples/example_multiple_zone/scripts/two_zone_example_data.py +++ b/activitysim/examples/example_multiple_zone/scripts/two_zone_example_data.py @@ -12,15 +12,14 @@ import os import sys -import pandas as pd import numpy as np import openmatrix as omx - +import pandas as pd # Create example directory -input_data = os.path.join(os.path.dirname(__file__), '../data_1') -output_data = os.path.join(os.path.dirname(__file__), '../data_2') +input_data = os.path.join(os.path.dirname(__file__), "../data_1") +output_data = os.path.join(os.path.dirname(__file__), "../data_2") MAZ_MULTIPLIER = 1000 # ### initialize output data directory @@ -29,7 +28,7 @@ if os.path.exists(output_data): # shutil.rmtree(output_data) # os.makedirs(output_data) - file_type = ('csv', 'omx') + file_type = ("csv", "omx") for file_name in os.listdir(output_data): if file_name.endswith(file_type): os.unlink(os.path.join(output_data, file_name)) @@ -38,14 +37,14 @@ # ### Convert tazs to mazs and add transit access distance by mode -land_use = pd.read_csv(os.path.join(input_data, 'land_use.csv')) +land_use = pd.read_csv(os.path.join(input_data, "land_use.csv")) -if 'ZONE' in land_use.columns: - land_use.insert(loc=0, column='MAZ', value=land_use.ZONE) - land_use.insert(loc=1, column='TAZ', value=land_use.ZONE) - land_use.drop(columns=['ZONE'], inplace=True) +if "ZONE" in land_use.columns: + land_use.insert(loc=0, column="MAZ", value=land_use.ZONE) + land_use.insert(loc=1, column="TAZ", value=land_use.ZONE) + land_use.drop(columns=["ZONE"], inplace=True) else: - land_use.insert(loc=0, column='MAZ', value=land_use.TAZ) + land_use.insert(loc=0, column="MAZ", value=land_use.TAZ) land_use.TAZ = land_use.TAZ.replace([1, 2, 3, 4], 2) land_use.TAZ = land_use.TAZ.replace([13, 14, 15], 14) @@ -55,31 +54,31 @@ shortWalk = 0.333 # the tm1 example assumes this distance for transit access longWalk = 0.667 -land_use['access_dist_transit'] = shortWalk +land_use["access_dist_transit"] = shortWalk # FIXME - could assign longWalk where maz != taz, but then results wodl differe from one-zone # land_use['access_dist_transit'] =\ # np.where(land_use.TAZ*MAZ_MULTIPLIER==land_use.MAZ, shortWalk, longWalk) -land_use.to_csv(os.path.join(output_data, 'land_use.csv'), index=False) +land_use.to_csv(os.path.join(output_data, "land_use.csv"), index=False) # ### Put households in mazs instead of tazs -households = pd.read_csv(os.path.join(input_data, 'households.csv')) -households.rename(columns={'TAZ': 'MAZ'}, inplace=True) +households = pd.read_csv(os.path.join(input_data, "households.csv")) +households.rename(columns={"TAZ": "MAZ"}, inplace=True) households.MAZ *= MAZ_MULTIPLIER -households.to_csv(os.path.join(output_data, 'households.csv'), index=False) +households.to_csv(os.path.join(output_data, "households.csv"), index=False) -persons = pd.read_csv(os.path.join(input_data, 'persons.csv')) -persons.to_csv(os.path.join(output_data, 'persons.csv'), index=False) +persons = pd.read_csv(os.path.join(input_data, "persons.csv")) +persons.to_csv(os.path.join(output_data, "persons.csv"), index=False) # ### Create maz correspondence file # FIXME - not clear we need this -maz_df = land_use[['MAZ', 'TAZ']] -maz_df.to_csv(os.path.join(output_data, 'maz.csv'), index=False) -print("maz.csv\n%s" % (maz_df.head(6), )) +maz_df = land_use[["MAZ", "TAZ"]] +maz_df.to_csv(os.path.join(output_data, "maz.csv"), index=False) +print("maz.csv\n%s" % (maz_df.head(6),)) # ### Create taz file @@ -90,10 +89,10 @@ # 7 new_zone_labels = np.unique(land_use.TAZ) -new_zone_indexes = (new_zone_labels-1) -taz_df = pd.DataFrame({'TAZ': new_zone_labels}, index=new_zone_indexes) -taz_df.to_csv(os.path.join(output_data, 'taz.csv'), index=False) -print("taz.csv\n%s" % (taz_df.head(6), )) +new_zone_indexes = new_zone_labels - 1 +taz_df = pd.DataFrame({"TAZ": new_zone_labels}, index=new_zone_indexes) +taz_df.to_csv(os.path.join(output_data, "taz.csv"), index=False) +print("taz.csv\n%s" % (taz_df.head(6),)) # currently this has only the one TAZ column, but the legacy table had: # index TAZ @@ -103,8 +102,11 @@ # ### Create taz skims -with omx.open_file(os.path.join(input_data, 'skims.omx'), 'r') as skims_file, \ - omx.open_file(os.path.join(output_data, 'taz_skims.omx'), "w") as output_skims_file: +with omx.open_file( + os.path.join(input_data, "skims.omx"), "r" +) as skims_file, omx.open_file( + os.path.join(output_data, "taz_skims.omx"), "w" +) as output_skims_file: skims = skims_file.list_matrices() num_zones = skims_file.shape()[0] @@ -122,7 +124,7 @@ output_skims_file.create_mapping("taz", new_zone_labels) -print("taz skims created: " + os.path.join(output_data, 'taz_skims.omx')) +print("taz skims created: " + os.path.join(output_data, "taz_skims.omx")) # ### Create maz to maz time/distance @@ -130,22 +132,25 @@ max_distance_for_bike = 5.0 -with omx.open_file(os.path.join(input_data, 'skims.omx')) as skims_file: +with omx.open_file(os.path.join(input_data, "skims.omx")) as skims_file: # create df with DIST column - maz_to_maz = pd.DataFrame(np.transpose(skims_file['DIST'])).unstack().reset_index() - maz_to_maz.columns = ['OMAZ', 'DMAZ', 'DIST'] - maz_to_maz['OMAZ'] = (maz_to_maz['OMAZ'] + 1) * MAZ_MULTIPLIER - maz_to_maz['DMAZ'] = (maz_to_maz['DMAZ'] + 1) * MAZ_MULTIPLIER + maz_to_maz = pd.DataFrame(np.transpose(skims_file["DIST"])).unstack().reset_index() + maz_to_maz.columns = ["OMAZ", "DMAZ", "DIST"] + maz_to_maz["OMAZ"] = (maz_to_maz["OMAZ"] + 1) * MAZ_MULTIPLIER + maz_to_maz["DMAZ"] = (maz_to_maz["DMAZ"] + 1) * MAZ_MULTIPLIER # additional columns - for c in ['DISTBIKE', 'DISTWALK']: + for c in ["DISTBIKE", "DISTWALK"]: maz_to_maz[c] = pd.DataFrame(np.transpose(skims_file[c])).unstack().values - maz_to_maz.loc[maz_to_maz['DIST'] <= max_distance_for_walk, ['OMAZ', 'DMAZ', 'DISTWALK']].\ - to_csv(os.path.join(output_data, 'maz_to_maz_walk.csv'), index=False) + maz_to_maz.loc[ + maz_to_maz["DIST"] <= max_distance_for_walk, ["OMAZ", "DMAZ", "DISTWALK"] + ].to_csv(os.path.join(output_data, "maz_to_maz_walk.csv"), index=False) - maz_to_maz.loc[maz_to_maz['DIST'] <= max_distance_for_bike, ['OMAZ', 'DMAZ', 'DIST', 'DISTBIKE']].\ - to_csv(os.path.join(output_data, 'maz_to_maz_bike.csv'), index=False) + maz_to_maz.loc[ + maz_to_maz["DIST"] <= max_distance_for_bike, + ["OMAZ", "DMAZ", "DIST", "DISTBIKE"], + ].to_csv(os.path.join(output_data, "maz_to_maz_bike.csv"), index=False) sys.exit(0) diff --git a/activitysim/examples/example_multiple_zone/test/simulation.py b/activitysim/examples/example_multiple_zone/test/simulation.py index ec6a1181b1..0fc64d5390 100755 --- a/activitysim/examples/example_multiple_zone/test/simulation.py +++ b/activitysim/examples/example_multiple_zone/test/simulation.py @@ -1,12 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse +import sys from activitysim.cli.run import add_run_args, run -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_multiple_zone/test/test_multiple_zone.py b/activitysim/examples/example_multiple_zone/test/test_multiple_zone.py index 9788526f07..1c56229296 100644 --- a/activitysim/examples/example_multiple_zone/test/test_multiple_zone.py +++ b/activitysim/examples/example_multiple_zone/test/test_multiple_zone.py @@ -2,11 +2,11 @@ # See full license in LICENSE.txt. import os import subprocess -import pkg_resources -import pytest import pandas as pd import pandas.testing as pdt +import pkg_resources +import pytest from activitysim.core import inject @@ -17,13 +17,13 @@ def teardown_function(func): def example_path(dirname): - resource = os.path.join('examples', 'example_multiple_zone', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_multiple_zone", dirname) + return pkg_resources.resource_filename("activitysim", resource) def mtc_example_path(dirname): - resource = os.path.join('examples', 'example_mtc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_mtc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def build_data(): @@ -33,71 +33,81 @@ def build_data(): pass -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def data(): build_data() def run_test(zone, multiprocess=False): - def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) def regress(zone): # ## regress tours - regress_tours_df = pd.read_csv(test_path(f'regress/final_tours_{zone}_zone.csv')) - tours_df = pd.read_csv(test_path('output/final_tours.csv')) + regress_tours_df = pd.read_csv( + test_path(f"regress/final_tours_{zone}_zone.csv") + ) + tours_df = pd.read_csv(test_path("output/final_tours.csv")) print(f"regress tours") pdt.assert_frame_equal(tours_df, regress_tours_df) # ## regress trips - regress_trips_df = pd.read_csv(test_path(f'regress/final_trips_{zone}_zone.csv')) - trips_df = pd.read_csv(test_path('output/final_trips.csv')) + regress_trips_df = pd.read_csv( + test_path(f"regress/final_trips_{zone}_zone.csv") + ) + trips_df = pd.read_csv(test_path("output/final_trips.csv")) print(f"regress trips") pdt.assert_frame_equal(trips_df, regress_trips_df), "regress trips" - file_path = os.path.join(os.path.dirname(__file__), 'simulation.py') - - run_args = ['-c', test_path(f'configs_{zone}_zone'), - '-c', example_path(f'configs_{zone}_zone'), - '-c', mtc_example_path(f'configs'), - '-d', example_path(f'data_{zone}'), - '-o', test_path('output')] + file_path = os.path.join(os.path.dirname(__file__), "simulation.py") + + run_args = [ + "-c", + test_path(f"configs_{zone}_zone"), + "-c", + example_path(f"configs_{zone}_zone"), + "-c", + mtc_example_path(f"configs"), + "-d", + example_path(f"data_{zone}"), + "-o", + test_path("output"), + ] if multiprocess: - run_args = run_args + ['-s', 'settings_mp'] - elif zone == '3': - run_args = run_args + ['-s', 'settings_static'] + run_args = run_args + ["-s", "settings_mp"] + elif zone == "3": + run_args = run_args + ["-s", "settings_static"] - subprocess.run(['coverage', 'run', '-a', file_path] + run_args, check=True) + subprocess.run(["coverage", "run", "-a", file_path] + run_args, check=True) regress(zone) def test_2_zone(data): - run_test(zone='2', multiprocess=False) + run_test(zone="2", multiprocess=False) def test_2_zone_mp(data): - run_test(zone='2', multiprocess=True) + run_test(zone="2", multiprocess=True) def test_3_zone(data): # python simulation.py -c configs_3_zone -c ../configs_3_zone -c \ # ../../example_mtc/configs -d ../data_3 -o output -s settings_mp - run_test(zone='3', multiprocess=False) + run_test(zone="3", multiprocess=False) def test_3_zone_mp(data): - run_test(zone='3', multiprocess=True) + run_test(zone="3", multiprocess=True) -if __name__ == '__main__': +if __name__ == "__main__": build_data() - run_test(zone='2', multiprocess=False) - run_test(zone='2', multiprocess=True) + run_test(zone="2", multiprocess=False) + run_test(zone="2", multiprocess=True) - run_test(zone='3', multiprocess=False) - run_test(zone='3', multiprocess=True) + run_test(zone="3", multiprocess=False) + run_test(zone="3", multiprocess=True) diff --git a/activitysim/examples/example_psrc/scripts/integrity.py b/activitysim/examples/example_psrc/scripts/integrity.py index ae2c5fbb0f..b2e81656d8 100644 --- a/activitysim/examples/example_psrc/scripts/integrity.py +++ b/activitysim/examples/example_psrc/scripts/integrity.py @@ -1,19 +1,22 @@ -import os -import pandas as pd -import openmatrix as omx -import numpy as np - import argparse +import os +import numpy as np +import openmatrix as omx +import pandas as pd -parser = argparse.ArgumentParser(description='check activitysim raw_data') -parser.add_argument('raw_data_directory', metavar='raw_data_directory', type=str, nargs=1, - help=f"path to raw data directory") +parser = argparse.ArgumentParser(description="check activitysim raw_data") +parser.add_argument( + "raw_data_directory", + metavar="raw_data_directory", + type=str, + nargs=1, + help=f"path to raw data directory", +) -parser.add_argument('-o', '--output', - type=str, - metavar='PATH', - help='path to output dir') +parser.add_argument( + "-o", "--output", type=str, metavar="PATH", help="path to output dir" +) args = parser.parse_args() @@ -34,7 +37,7 @@ def output_path(file_name): def integerize_id_columns(df, table_name): - columns = ['MAZ', 'OMAZ', 'DMAZ', 'TAZ', 'zone_id', 'household_id', 'HHID'] + columns = ["MAZ", "OMAZ", "DMAZ", "TAZ", "zone_id", "household_id", "HHID"] for c in df.columns: if c in columns: bad = ~(df[c] == df[c].astype(int)) @@ -85,59 +88,59 @@ def report_baddies(df, tag, fatal=False): # ### check maz.csv against land_use -land_use = land_use.sort_values('MAZ') -maz = read_csv("maz.csv").sort_values('MAZ') +land_use = land_use.sort_values("MAZ") +maz = read_csv("maz.csv").sort_values("MAZ") # fatal missing = land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)] -report_baddies(missing, 'land_use_MAZ_not_in_maz_MAZ', fatal=True) +report_baddies(missing, "land_use_MAZ_not_in_maz_MAZ", fatal=True) missing = maz.MAZ[~maz.MAZ.isin(land_use.MAZ)] -report_baddies(missing, 'maz_MAZ_not_in_land_use_MAZ') +report_baddies(missing, "maz_MAZ_not_in_land_use_MAZ") # fatal missing = land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)] -report_baddies(missing, 'land_use_TAZ_not_in_maz_TAZ', fatal=True) +report_baddies(missing, "land_use_TAZ_not_in_maz_TAZ", fatal=True) missing = maz.TAZ[~maz.TAZ.isin(land_use.TAZ)] -report_baddies(missing, 'maz_TAZ_not_in_land_use_TAZ') +report_baddies(missing, "maz_TAZ_not_in_land_use_TAZ") # ### check taz.csv against land_use -land_use = land_use.sort_values('TAZ') -taz = read_csv("taz.csv").sort_values('TAZ') +land_use = land_use.sort_values("TAZ") +taz = read_csv("taz.csv").sort_values("TAZ") if output_dir: - taz.to_csv(output_path('taz.csv'), index=False) + taz.to_csv(output_path("taz.csv"), index=False) # fatal missing = land_use.TAZ[~land_use.TAZ.isin(taz.TAZ)] -report_baddies(missing, 'land_use_TAZ_not_in_taz_TAZ', fatal=True) +report_baddies(missing, "land_use_TAZ_not_in_taz_TAZ", fatal=True) missing = taz.TAZ[~taz.TAZ.isin(land_use.TAZ)] -report_baddies(missing, 'taz_TAZ_not_in_land_use_TAZ') +report_baddies(missing, "taz_TAZ_not_in_land_use_TAZ") # #########s # # maz # -maz = read_csv("maz.csv").sort_values(['MAZ', 'TAZ']) +maz = read_csv("maz.csv").sort_values(["MAZ", "TAZ"]) maz = maz[maz["MAZ"].isin(land_use.MAZ)] -integerize_id_columns(maz, 'maz') +integerize_id_columns(maz, "maz") -assert (land_use.MAZ.isin(maz.MAZ).all()) -assert (land_use.TAZ.isin(maz.TAZ).all()) -assert (maz.TAZ.isin(land_use.TAZ).all()) +assert land_use.MAZ.isin(maz.MAZ).all() +assert land_use.TAZ.isin(maz.TAZ).all() +assert maz.TAZ.isin(land_use.TAZ).all() # # taz # -taz = read_csv("taz.csv").sort_values(['TAZ']) +taz = read_csv("taz.csv").sort_values(["TAZ"]) taz = taz[taz["TAZ"].isin(land_use.TAZ)] -integerize_id_columns(taz, 'taz') +integerize_id_columns(taz, "taz") -assert (land_use.TAZ.isin(taz.TAZ).all()) +assert land_use.TAZ.isin(taz.TAZ).all() # print(maz.shape) # print(f"MAZ {len(maz.MAZ.unique())}") @@ -148,22 +151,22 @@ def report_baddies(df, tag, fatal=False): # households = read_csv("households.csv") missing = households[~households["MAZ"].isin(maz.MAZ)] -report_baddies(missing, 'household_MAZ_not_in_maz_MAZ') +report_baddies(missing, "household_MAZ_not_in_maz_MAZ") -integerize_id_columns(households, 'households') +integerize_id_columns(households, "households") # # persons # persons = read_csv("persons.csv") orphans = persons[~persons["household_id"].isin(households.HHID)] -report_baddies(orphans, 'persons_not_in_households') +report_baddies(orphans, "persons_not_in_households") households = households[households["MAZ"].isin(maz.MAZ)] orphans = persons[~persons["household_id"].isin(households.HHID)] -report_baddies(orphans, 'persons_not_in_households_in_maz_MAZ') +report_baddies(orphans, "persons_not_in_households_in_maz_MAZ") -integerize_id_columns(persons, 'persons') +integerize_id_columns(persons, "persons") # # maz_to_maz_walk and maz_to_maz_bike @@ -171,22 +174,22 @@ def report_baddies(df, tag, fatal=False): m2m = read_csv("maz_to_maz_walk.csv") missing = m2m[~(m2m.OMAZ.isin(maz.MAZ) & m2m.DMAZ.isin(maz.MAZ))] -report_baddies(missing, 'maz_to_maz_walk_OMAZ_or_DMAZ_not_in_maz_MAZ') +report_baddies(missing, "maz_to_maz_walk_OMAZ_or_DMAZ_not_in_maz_MAZ") integerize_id_columns(m2m, "maz_to_maz_walk") m2m = read_csv("maz_to_maz_bike.csv") missing = m2m[~(m2m.OMAZ.isin(maz.MAZ) & m2m.DMAZ.isin(maz.MAZ))] -report_baddies(missing, 'maz_to_maz_bike_OMAZ_or_DMAZ_not_in_maz_MAZ') +report_baddies(missing, "maz_to_maz_bike_OMAZ_or_DMAZ_not_in_maz_MAZ") integerize_id_columns(m2m, "maz_to_maz_bike") # # skims # -omx_infile_name = 'skims.omx' +omx_infile_name = "skims.omx" skim_data_type = np.float32 -omx_in = omx.open_file(input_path(omx_infile_name), 'r') +omx_in = omx.open_file(input_path(omx_infile_name), "r") print(f"omx_in shape {omx_in.shape()}") print(f"{len(omx_in.listMappings())} mappings in skims") diff --git a/activitysim/examples/example_psrc/scripts/psrc_crop.py b/activitysim/examples/example_psrc/scripts/psrc_crop.py index 50850fa3da..304963b580 100644 --- a/activitysim/examples/example_psrc/scripts/psrc_crop.py +++ b/activitysim/examples/example_psrc/scripts/psrc_crop.py @@ -1,27 +1,41 @@ +import argparse import os -import pandas as pd -import openmatrix as omx -import numpy as np -import argparse +import numpy as np +import openmatrix as omx +import pandas as pd MAZ_OFFSET = 0 segments = { - 'test': (331, 358), # north part of peninsul a including university (no HSENROLL but nice MAZ-TAZ distrib) - 'downtown': (339, 630), # downtown seattle tazs (339 instead of 400 because need university) - 'seattle': (0, 857), # seattle tazs - 'full': (0, 100000), + "test": ( + 331, + 358, + ), # north part of peninsul a including university (no HSENROLL but nice MAZ-TAZ distrib) + "downtown": ( + 339, + 630, + ), # downtown seattle tazs (339 instead of 400 because need university) + "seattle": (0, 857), # seattle tazs + "full": (0, 100000), } -parser = argparse.ArgumentParser(description='crop PSRC raw_data') -parser.add_argument('segment_name', metavar='segment_name', type=str, nargs=1, - help=f"geography segmentation (e.g. full)") - -parser.add_argument('-c', '--check_geography', - default=False, - action='store_true', - help='check consistency of MAZ, TAZ zone_ids and foreign keys & write orphan_households file') +parser = argparse.ArgumentParser(description="crop PSRC raw_data") +parser.add_argument( + "segment_name", + metavar="segment_name", + type=str, + nargs=1, + help=f"geography segmentation (e.g. full)", +) + +parser.add_argument( + "-c", + "--check_geography", + default=False, + action="store_true", + help="check consistency of MAZ, TAZ zone_ids and foreign keys & write orphan_households file", +) args = parser.parse_args() @@ -32,8 +46,8 @@ assert segment_name in segments.keys(), f"Unknown seg: {segment_name}" taz_min, taz_max = segments[segment_name] -input_dir = './data_raw' -output_dir = f'./data_{segment_name}' +input_dir = "./data_raw" +output_dir = f"./data_{segment_name}" print(f"segment_name {segment_name}") @@ -59,7 +73,7 @@ def output_path(file_name): def integerize_id_columns(df, table_name): - columns = ['MAZ', 'OMAZ', 'DMAZ', 'TAZ', 'zone_id', 'household_id', 'HHID'] + columns = ["MAZ", "OMAZ", "DMAZ", "TAZ", "zone_id", "household_id", "HHID"] for c in df.columns: if c in columns: print(f"converting {table_name}.{c} to int") @@ -88,8 +102,8 @@ def to_csv(df, file_name): # ######## check for orphan_households not in any maz in land_use land_use = read_csv("land_use.csv") - land_use = land_use[['MAZ', 'TAZ']] # King County - land_use = land_use.sort_values(['TAZ', 'MAZ']) + land_use = land_use[["MAZ", "TAZ"]] # King County + land_use = land_use.sort_values(["TAZ", "MAZ"]) households = read_csv("households.csv") orphan_households = households[~households.MAZ.isin(land_use.MAZ)] @@ -105,12 +119,14 @@ def to_csv(df, file_name): # could just build maz and taz files, but want to make sure PSRC data is right land_use = read_csv("land_use.csv") - land_use = land_use.sort_values('MAZ') - maz = read_csv("maz.csv").sort_values('MAZ') + land_use = land_use.sort_values("MAZ") + maz = read_csv("maz.csv").sort_values("MAZ") # ### FATAL ### if not land_use.MAZ.isin(maz.MAZ).all(): - print(f"land_use.MAZ not in maz.MAZ\n{land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)]}") + print( + f"land_use.MAZ not in maz.MAZ\n{land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)]}" + ) raise RuntimeError(f"land_use.MAZ not in maz.MAZ") if not maz.MAZ.isin(land_use.MAZ).all(): @@ -118,18 +134,22 @@ def to_csv(df, file_name): # ### FATAL ### if not land_use.TAZ.isin(maz.TAZ).all(): - print(f"land_use.TAZ not in maz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)]}") + print( + f"land_use.TAZ not in maz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)]}" + ) raise RuntimeError(f"land_use.TAZ not in maz.TAZ") if not maz.TAZ.isin(land_use.TAZ).all(): print(f"maz.TAZ not in land_use.TAZ\n{maz.TAZ[~maz.TAZ.isin(land_use.TAZ)]}") - land_use = land_use.sort_values('TAZ') - taz = read_csv("taz.csv").sort_values('TAZ') + land_use = land_use.sort_values("TAZ") + taz = read_csv("taz.csv").sort_values("TAZ") # ### FATAL ### if not land_use.TAZ.isin(taz.TAZ).all(): - print(f"land_use.TAZ not in taz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(taz.MAZ)]}") + print( + f"land_use.TAZ not in taz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(taz.MAZ)]}" + ) raise RuntimeError(f"land_use.TAZ not in taz.TAZ") if not taz.TAZ.isin(land_use.TAZ).all(): @@ -142,44 +162,46 @@ def to_csv(df, file_name): # land_use = read_csv("land_use.csv") land_use = land_use[(land_use["TAZ"] >= taz_min) & (land_use["TAZ"] <= taz_max)] -integerize_id_columns(land_use, 'land_use') -land_use = land_use.sort_values('MAZ') +integerize_id_columns(land_use, "land_use") +land_use = land_use.sort_values("MAZ") # make sure we have some HSENROLL and COLLFTE, even for very for small samples -if land_use['HSENROLL'].sum() == 0: - assert segment_name != 'full', f"land_use['HSENROLL'] is 0 for full sample!" - land_use['HSENROLL'] = land_use['AGE0519'] +if land_use["HSENROLL"].sum() == 0: + assert segment_name != "full", f"land_use['HSENROLL'] is 0 for full sample!" + land_use["HSENROLL"] = land_use["AGE0519"] print(f"\nWARNING: land_use.HSENROLL is 0, so backfilled with AGE0519\n") -if land_use['COLLFTE'].sum() == 0: - assert segment_name != 'full', f"land_use['COLLFTE'] is 0 for full sample!" - land_use['COLLFTE'] = land_use['HSENROLL'] +if land_use["COLLFTE"].sum() == 0: + assert segment_name != "full", f"land_use['COLLFTE'] is 0 for full sample!" + land_use["COLLFTE"] = land_use["HSENROLL"] print(f"\nWARNING: land_use.COLLFTE is 0, so backfilled with HSENROLL\n") # move MAZ and TAZ columns to front -land_use = land_use[['MAZ', 'TAZ'] + [c for c in land_use.columns if c not in ['MAZ', 'TAZ']]] +land_use = land_use[ + ["MAZ", "TAZ"] + [c for c in land_use.columns if c not in ["MAZ", "TAZ"]] +] to_csv(land_use, "land_use.csv") # # maz # -maz = read_csv("maz.csv").sort_values(['MAZ', 'TAZ']) +maz = read_csv("maz.csv").sort_values(["MAZ", "TAZ"]) maz = maz[maz["MAZ"].isin(land_use.MAZ)] -integerize_id_columns(maz, 'maz') +integerize_id_columns(maz, "maz") -assert (land_use.MAZ.isin(maz.MAZ).all()) -assert (land_use.TAZ.isin(maz.TAZ).all()) -assert (maz.TAZ.isin(land_use.TAZ).all()) +assert land_use.MAZ.isin(maz.MAZ).all() +assert land_use.TAZ.isin(maz.TAZ).all() +assert maz.TAZ.isin(land_use.TAZ).all() to_csv(maz, "maz.csv") # # taz # -taz = read_csv("taz.csv").sort_values(['TAZ']) +taz = read_csv("taz.csv").sort_values(["TAZ"]) taz = taz[taz["TAZ"].isin(land_use.TAZ)] -integerize_id_columns(taz, 'taz') +integerize_id_columns(taz, "taz") -assert (land_use.TAZ.isin(taz.TAZ).all()) +assert land_use.TAZ.isin(taz.TAZ).all() to_csv(taz, "taz.csv") # print(maz.shape) @@ -191,7 +213,7 @@ def to_csv(df, file_name): # households = read_csv("households.csv") households = households[households["MAZ"].isin(maz.MAZ)] -integerize_id_columns(households, 'households') +integerize_id_columns(households, "households") to_csv(households, "households.csv") @@ -200,7 +222,7 @@ def to_csv(df, file_name): # persons = read_csv("persons.csv") persons = persons[persons["household_id"].isin(households.HHID)] -integerize_id_columns(persons, 'persons') +integerize_id_columns(persons, "persons") to_csv(persons, "persons.csv") @@ -216,27 +238,29 @@ def to_csv(df, file_name): # # skims # -omx_infile_name = 'skims.omx' +omx_infile_name = "skims.omx" skim_data_type = np.float32 omx_in = omx.open_file(input_path(omx_infile_name)) print(f"omx_in shape {omx_in.shape()}") assert not omx_in.listMappings() -taz = taz.sort_values('TAZ') +taz = taz.sort_values("TAZ") taz.index = taz.TAZ - 1 tazs_indexes = taz.index.tolist() # index of TAZ in skim (zero-based, no mapping) taz_labels = taz.TAZ.tolist() # TAZ zone_ids in omx index order # create -num_outfiles = 4 if segment_name == 'full' else 1 +num_outfiles = 4 if segment_name == "full" else 1 if num_outfiles == 1: - omx_out = [omx.open_file(output_path(f"skims.omx"), 'w')] + omx_out = [omx.open_file(output_path(f"skims.omx"), "w")] else: - omx_out = [omx.open_file(output_path(f"skims{i+1}.omx"), 'w') for i in range(num_outfiles)] + omx_out = [ + omx.open_file(output_path(f"skims{i+1}.omx"), "w") for i in range(num_outfiles) + ] for omx_file in omx_out: - omx_file.create_mapping('ZONE', taz_labels) + omx_file.create_mapping("ZONE", taz_labels) iskim = 0 for mat_name in omx_in.list_matrices(): diff --git a/activitysim/examples/example_psrc/test/simulation.py b/activitysim/examples/example_psrc/test/simulation.py index ec6a1181b1..0fc64d5390 100755 --- a/activitysim/examples/example_psrc/test/simulation.py +++ b/activitysim/examples/example_psrc/test/simulation.py @@ -1,12 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse +import sys from activitysim.cli.run import add_run_args, run -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_psrc/test/test_psrc.py b/activitysim/examples/example_psrc/test/test_psrc.py index 1aa9a2c7dd..2f9c0f7c86 100644 --- a/activitysim/examples/example_psrc/test/test_psrc.py +++ b/activitysim/examples/example_psrc/test/test_psrc.py @@ -2,10 +2,10 @@ # See full license in LICENSE.txt. import os import subprocess -import pkg_resources import pandas as pd import pandas.testing as pdt +import pkg_resources from activitysim.core import inject @@ -16,33 +16,45 @@ def teardown_function(func): def test_psrc(): - def example_path(dirname): - resource = os.path.join('examples', 'example_psrc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_psrc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) def regress(): - regress_trips_df = pd.read_csv(test_path('regress/final_trips.csv')) - final_trips_df = pd.read_csv(test_path('output/final_trips.csv')) + regress_trips_df = pd.read_csv(test_path("regress/final_trips.csv")) + final_trips_df = pd.read_csv(test_path("output/final_trips.csv")) # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] pdt.assert_frame_equal(final_trips_df, regress_trips_df) - file_path = os.path.join(os.path.dirname(__file__), 'simulation.py') - - subprocess.run(['coverage', 'run', '-a', file_path, - '-c', test_path('configs'), '-c', example_path('configs'), - '-d', example_path('data'), - '-o', test_path('output')], check=True) + file_path = os.path.join(os.path.dirname(__file__), "simulation.py") + + subprocess.run( + [ + "coverage", + "run", + "-a", + file_path, + "-c", + test_path("configs"), + "-c", + example_path("configs"), + "-d", + example_path("data"), + "-o", + test_path("output"), + ], + check=True, + ) regress() -if __name__ == '__main__': +if __name__ == "__main__": test_psrc() diff --git a/activitysim/examples/example_sandag/scripts/sandag_crop_1_zone.py b/activitysim/examples/example_sandag/scripts/sandag_crop_1_zone.py index a9ca656bf3..02f10936be 100644 --- a/activitysim/examples/example_sandag/scripts/sandag_crop_1_zone.py +++ b/activitysim/examples/example_sandag/scripts/sandag_crop_1_zone.py @@ -1,23 +1,31 @@ +import argparse import os -import pandas as pd -import openmatrix as omx -import numpy as np -import argparse +import numpy as np +import openmatrix as omx +import pandas as pd segments = { - 'test': (2185, 2240), # arbitrary - 'full': (0, 100000), + "test": (2185, 2240), # arbitrary + "full": (0, 100000), } -parser = argparse.ArgumentParser(description='crop SANDAG 1 zone raw_data') -parser.add_argument('segment_name', metavar='segment_name', type=str, nargs=1, - help=f"geography segmentation (e.g. full)") - -parser.add_argument('-c', '--check_geography', - default=False, - action='store_true', - help='check consistency of TAZ zone_ids and foreign keys & write orphan_households file') +parser = argparse.ArgumentParser(description="crop SANDAG 1 zone raw_data") +parser.add_argument( + "segment_name", + metavar="segment_name", + type=str, + nargs=1, + help=f"geography segmentation (e.g. full)", +) + +parser.add_argument( + "-c", + "--check_geography", + default=False, + action="store_true", + help="check consistency of TAZ zone_ids and foreign keys & write orphan_households file", +) args = parser.parse_args() @@ -28,8 +36,8 @@ assert segment_name in segments.keys(), f"Unknown seg: {segment_name}" zone_min, zone_max = segments[segment_name] -input_dir = './data_raw' -output_dir = f'./data_{segment_name}_1' +input_dir = "./data_raw" +output_dir = f"./data_{segment_name}_1" print(f"check_geography {check_geography}") @@ -48,7 +56,7 @@ def output_path(file_name): def integerize_id_columns(df, table_name): - columns = ['TAZ', 'household_id', 'HHID', 'taz'] + columns = ["TAZ", "household_id", "HHID", "taz"] for c in df.columns: if c in columns: print(f"converting {table_name}.{c} to int") @@ -90,8 +98,8 @@ def to_csv(df, file_name): # land_use = read_csv("land_use.csv") land_use = land_use[(land_use["TAZ"] >= zone_min) & (land_use["TAZ"] <= zone_max)] -integerize_id_columns(land_use, 'land_use') -land_use = land_use.sort_values('TAZ') +integerize_id_columns(land_use, "land_use") +land_use = land_use.sort_values("TAZ") # move index col to front land_use.insert(0, "TAZ", land_use.pop("TAZ")) @@ -103,7 +111,7 @@ def to_csv(df, file_name): # households = read_csv("households.csv") households = households[households["TAZ"].isin(land_use.TAZ)] -integerize_id_columns(households, 'households') +integerize_id_columns(households, "households") to_csv(households, "households.csv") @@ -112,35 +120,37 @@ def to_csv(df, file_name): # persons = read_csv("persons.csv") persons = persons[persons["household_id"].isin(households.HHID)] -integerize_id_columns(persons, 'persons') +integerize_id_columns(persons, "persons") to_csv(persons, "persons.csv") # # skims # -omx_infile_name = 'skims.omx' +omx_infile_name = "skims.omx" skim_data_type = np.float32 omx_in = omx.open_file(input_path(omx_infile_name)) print(f"omx_in shape {omx_in.shape()}") -zone = land_use.sort_values('TAZ')[['TAZ']] +zone = land_use.sort_values("TAZ")[["TAZ"]] zone.index = zone.TAZ - 1 zone_indexes = zone.index.tolist() # index of TAZ in skim (zero-based, no mapping) zone_labels = zone.TAZ.tolist() # TAZ in omx index order # create -num_outfiles = 6 if segment_name == 'full' else 1 +num_outfiles = 6 if segment_name == "full" else 1 if num_outfiles == 1: - omx_out = [omx.open_file(output_path(f"skims.omx"), 'w')] + omx_out = [omx.open_file(output_path(f"skims.omx"), "w")] else: - omx_out = [omx.open_file(output_path(f"skims{i+1}.omx"), 'w') for i in range(num_outfiles)] + omx_out = [ + omx.open_file(output_path(f"skims{i+1}.omx"), "w") for i in range(num_outfiles) + ] for omx_file in omx_out: - omx_file.create_mapping('ZONE', zone_labels) + omx_file.create_mapping("ZONE", zone_labels) iskim = 0 for mat_name in omx_in.list_matrices(): diff --git a/activitysim/examples/example_sandag/scripts/sandag_crop_2_zone.py b/activitysim/examples/example_sandag/scripts/sandag_crop_2_zone.py index 9626e7a373..5a80c1fe8f 100644 --- a/activitysim/examples/example_sandag/scripts/sandag_crop_2_zone.py +++ b/activitysim/examples/example_sandag/scripts/sandag_crop_2_zone.py @@ -1,25 +1,33 @@ +import argparse import os -import pandas as pd -import openmatrix as omx -import numpy as np -import argparse +import numpy as np +import openmatrix as omx +import pandas as pd MAZ_OFFSET = 0 segments = { - 'test': (331, 358), # arbitrary - 'full': (0, 100000), + "test": (331, 358), # arbitrary + "full": (0, 100000), } -parser = argparse.ArgumentParser(description='crop SANDAG 2 zone raw_data') -parser.add_argument('segment_name', metavar='segment_name', type=str, nargs=1, - help=f"geography segmentation (e.g. full)") - -parser.add_argument('-c', '--check_geography', - default=False, - action='store_true', - help='check consistency of MAZ, TAZ zone_ids and foreign keys & write orphan_households file') +parser = argparse.ArgumentParser(description="crop SANDAG 2 zone raw_data") +parser.add_argument( + "segment_name", + metavar="segment_name", + type=str, + nargs=1, + help=f"geography segmentation (e.g. full)", +) + +parser.add_argument( + "-c", + "--check_geography", + default=False, + action="store_true", + help="check consistency of MAZ, TAZ zone_ids and foreign keys & write orphan_households file", +) args = parser.parse_args() @@ -30,8 +38,8 @@ assert segment_name in segments.keys(), f"Unknown seg: {segment_name}" taz_min, taz_max = segments[segment_name] -input_dir = './data_raw' -output_dir = f'./data_{segment_name}_2' +input_dir = "./data_raw" +output_dir = f"./data_{segment_name}_2" print(f"segment_name {segment_name}") @@ -57,7 +65,7 @@ def output_path(file_name): def integerize_id_columns(df, table_name): - columns = ['MAZ', 'OMAZ', 'DMAZ', 'TAZ', 'zone_id', 'household_id', 'HHID'] + columns = ["MAZ", "OMAZ", "DMAZ", "TAZ", "zone_id", "household_id", "HHID"] for c in df.columns: if c in columns: print(f"converting {table_name}.{c} to int") @@ -86,8 +94,8 @@ def to_csv(df, file_name): # ######## check for orphan_households not in any maz in land_use land_use = read_csv("land_use.csv") - land_use = land_use[['MAZ', 'TAZ']] # King County - land_use = land_use.sort_values(['TAZ', 'MAZ']) + land_use = land_use[["MAZ", "TAZ"]] # King County + land_use = land_use.sort_values(["TAZ", "MAZ"]) households = read_csv("households.csv") orphan_households = households[~households.MAZ.isin(land_use.MAZ)] @@ -103,12 +111,14 @@ def to_csv(df, file_name): # could just build maz and taz files, but want to make sure PSRC data is right land_use = read_csv("land_use.csv") - land_use = land_use.sort_values('MAZ') - maz = read_csv("maz.csv").sort_values('MAZ') + land_use = land_use.sort_values("MAZ") + maz = read_csv("maz.csv").sort_values("MAZ") # ### FATAL ### if not land_use.MAZ.isin(maz.MAZ).all(): - print(f"land_use.MAZ not in maz.MAZ\n{land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)]}") + print( + f"land_use.MAZ not in maz.MAZ\n{land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)]}" + ) raise RuntimeError(f"land_use.MAZ not in maz.MAZ") if not maz.MAZ.isin(land_use.MAZ).all(): @@ -116,18 +126,22 @@ def to_csv(df, file_name): # ### FATAL ### if not land_use.TAZ.isin(maz.TAZ).all(): - print(f"land_use.TAZ not in maz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)]}") + print( + f"land_use.TAZ not in maz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)]}" + ) raise RuntimeError(f"land_use.TAZ not in maz.TAZ") if not maz.TAZ.isin(land_use.TAZ).all(): print(f"maz.TAZ not in land_use.TAZ\n{maz.TAZ[~maz.TAZ.isin(land_use.TAZ)]}") - land_use = land_use.sort_values('TAZ') - taz = read_csv("taz.csv").sort_values('TAZ') + land_use = land_use.sort_values("TAZ") + taz = read_csv("taz.csv").sort_values("TAZ") # ### FATAL ### if not land_use.TAZ.isin(taz.TAZ).all(): - print(f"land_use.TAZ not in taz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(taz.MAZ)]}") + print( + f"land_use.TAZ not in taz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(taz.MAZ)]}" + ) raise RuntimeError(f"land_use.TAZ not in taz.TAZ") if not taz.TAZ.isin(land_use.TAZ).all(): @@ -140,44 +154,46 @@ def to_csv(df, file_name): # land_use = read_csv("land_use.csv") land_use = land_use[(land_use["TAZ"] >= taz_min) & (land_use["TAZ"] <= taz_max)] -integerize_id_columns(land_use, 'land_use') -land_use = land_use.sort_values('MAZ') +integerize_id_columns(land_use, "land_use") +land_use = land_use.sort_values("MAZ") # make sure we have some HSENROLL and COLLFTE, even for very for small samples -if land_use['HSENROLL'].sum() == 0: - assert segment_name != 'full', f"land_use['HSENROLL'] is 0 for full sample!" - land_use['HSENROLL'] = land_use['AGE0519'] +if land_use["HSENROLL"].sum() == 0: + assert segment_name != "full", f"land_use['HSENROLL'] is 0 for full sample!" + land_use["HSENROLL"] = land_use["AGE0519"] print(f"\nWARNING: land_use.HSENROLL is 0, so backfilled with AGE0519\n") -if land_use['COLLFTE'].sum() == 0: - assert segment_name != 'full', f"land_use['COLLFTE'] is 0 for full sample!" - land_use['COLLFTE'] = land_use['HSENROLL'] +if land_use["COLLFTE"].sum() == 0: + assert segment_name != "full", f"land_use['COLLFTE'] is 0 for full sample!" + land_use["COLLFTE"] = land_use["HSENROLL"] print(f"\nWARNING: land_use.COLLFTE is 0, so backfilled with HSENROLL\n") # move MAZ and TAZ columns to front -land_use = land_use[['MAZ', 'TAZ'] + [c for c in land_use.columns if c not in ['MAZ', 'TAZ']]] +land_use = land_use[ + ["MAZ", "TAZ"] + [c for c in land_use.columns if c not in ["MAZ", "TAZ"]] +] to_csv(land_use, "land_use.csv") # # maz # -maz = read_csv("maz.csv").sort_values(['MAZ', 'TAZ']) +maz = read_csv("maz.csv").sort_values(["MAZ", "TAZ"]) maz = maz[maz["MAZ"].isin(land_use.MAZ)] -integerize_id_columns(maz, 'maz') +integerize_id_columns(maz, "maz") -assert (land_use.MAZ.isin(maz.MAZ).all()) -assert (land_use.TAZ.isin(maz.TAZ).all()) -assert (maz.TAZ.isin(land_use.TAZ).all()) +assert land_use.MAZ.isin(maz.MAZ).all() +assert land_use.TAZ.isin(maz.TAZ).all() +assert maz.TAZ.isin(land_use.TAZ).all() to_csv(maz, "maz.csv") # # taz # -taz = read_csv("taz.csv").sort_values(['TAZ']) +taz = read_csv("taz.csv").sort_values(["TAZ"]) taz = taz[taz["TAZ"].isin(land_use.TAZ)] -integerize_id_columns(taz, 'taz') +integerize_id_columns(taz, "taz") -assert (land_use.TAZ.isin(taz.TAZ).all()) +assert land_use.TAZ.isin(taz.TAZ).all() to_csv(taz, "taz.csv") # print(maz.shape) @@ -189,7 +205,7 @@ def to_csv(df, file_name): # households = read_csv("households.csv") households = households[households["MAZ"].isin(maz.MAZ)] -integerize_id_columns(households, 'households') +integerize_id_columns(households, "households") to_csv(households, "households.csv") @@ -198,7 +214,7 @@ def to_csv(df, file_name): # persons = read_csv("persons.csv") persons = persons[persons["household_id"].isin(households.HHID)] -integerize_id_columns(persons, 'persons') +integerize_id_columns(persons, "persons") to_csv(persons, "persons.csv") @@ -214,26 +230,28 @@ def to_csv(df, file_name): # # skims # -omx_infile_name = 'skims.omx' +omx_infile_name = "skims.omx" skim_data_type = np.float32 omx_in = omx.open_file(input_path(omx_infile_name)) print(f"omx_in shape {omx_in.shape()}") -taz = taz.sort_values('TAZ') +taz = taz.sort_values("TAZ") taz.index = taz.TAZ - 1 tazs_indexes = taz.index.tolist() # index of TAZ in skim (zero-based, no mapping) taz_labels = taz.TAZ.tolist() # TAZ zone_ids in omx index order # create -num_outfiles = 6 if segment_name == 'full' else 1 +num_outfiles = 6 if segment_name == "full" else 1 if num_outfiles == 1: - omx_out = [omx.open_file(output_path(f"skims.omx"), 'w')] + omx_out = [omx.open_file(output_path(f"skims.omx"), "w")] else: - omx_out = [omx.open_file(output_path(f"skims{i+1}.omx"), 'w') for i in range(num_outfiles)] + omx_out = [ + omx.open_file(output_path(f"skims{i+1}.omx"), "w") for i in range(num_outfiles) + ] for omx_file in omx_out: - omx_file.create_mapping('ZONE', taz_labels) + omx_file.create_mapping("ZONE", taz_labels) iskim = 0 for mat_name in omx_in.list_matrices(): diff --git a/activitysim/examples/example_sandag/scripts/sandag_crop_3_zone.py b/activitysim/examples/example_sandag/scripts/sandag_crop_3_zone.py index 0a54a0a321..ec2ee949fc 100644 --- a/activitysim/examples/example_sandag/scripts/sandag_crop_3_zone.py +++ b/activitysim/examples/example_sandag/scripts/sandag_crop_3_zone.py @@ -1,28 +1,37 @@ # crop marin tvpb example data processing to one county # Ben Stabler, ben.stabler@rsginc.com, 09/17/20 -import os -import pandas as pd -import openmatrix as omx import argparse +import os + import numpy as np +import openmatrix as omx +import pandas as pd MAZ_OFFSET = 100000 segments = { - 'test': {'MAZ': np.arange(MAZ_OFFSET + 500, MAZ_OFFSET + 1080)}, # includes univ - 'univ_east': {'MAZ': np.arange(MAZ_OFFSET, MAZ_OFFSET + 1080)}, - 'full': {}, + "test": {"MAZ": np.arange(MAZ_OFFSET + 500, MAZ_OFFSET + 1080)}, # includes univ + "univ_east": {"MAZ": np.arange(MAZ_OFFSET, MAZ_OFFSET + 1080)}, + "full": {}, } -parser = argparse.ArgumentParser(description='crop SANDAG 3 zone raw_data') -parser.add_argument('segment_name', metavar='segment_name', type=str, nargs=1, - help=f"geography segmentation (e.g. full)") - -parser.add_argument('-c', '--check_geography', - default=False, - action='store_true', - help='check consistency of MAZ, TAZ, TAP zone_ids and foreign keys & write orphan_households file') +parser = argparse.ArgumentParser(description="crop SANDAG 3 zone raw_data") +parser.add_argument( + "segment_name", + metavar="segment_name", + type=str, + nargs=1, + help=f"geography segmentation (e.g. full)", +) + +parser.add_argument( + "-c", + "--check_geography", + default=False, + action="store_true", + help="check consistency of MAZ, TAZ, TAP zone_ids and foreign keys & write orphan_households file", +) args = parser.parse_args() @@ -32,8 +41,8 @@ assert segment_name in segments.keys(), f"Unknown seg: {segment_name}" -input_dir = './data_raw' -output_dir = f'./data_{segment_name}_3' +input_dir = "./data_raw" +output_dir = f"./data_{segment_name}_3" print(f"segment_name {segment_name}") @@ -58,7 +67,7 @@ def output_path(file_name): def patch_maz(df, maz_offset): for c in df.columns: - if c in ['MAZ', 'OMAZ', 'DMAZ', 'mgra', 'orig_mgra', 'dest_mgra']: + if c in ["MAZ", "OMAZ", "DMAZ", "mgra", "orig_mgra", "dest_mgra"]: df[c] += maz_offset return df @@ -96,12 +105,15 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): # create if num_outfiles == 1: - omx_out = [omx.open_file(output_path(f"{omx_file_name}.omx"), 'w')] + omx_out = [omx.open_file(output_path(f"{omx_file_name}.omx"), "w")] else: - omx_out = [omx.open_file(output_path(f"{omx_file_name}{i + 1}.omx"), 'w') for i in range(num_outfiles)] + omx_out = [ + omx.open_file(output_path(f"{omx_file_name}{i + 1}.omx"), "w") + for i in range(num_outfiles) + ] for omx_file in omx_out: - omx_file.create_mapping('ZONE', labels) + omx_file.create_mapping("ZONE", labels) iskim = 0 for mat_name in omx_in.list_matrices(): @@ -133,8 +145,8 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): # ######## check for orphan_households not in any maz in land_use land_use = read_csv(LAND_USE) - land_use = land_use[['MAZ', 'TAZ']] - land_use = land_use.sort_values(['TAZ', 'MAZ']) + land_use = land_use[["MAZ", "TAZ"]] + land_use = land_use.sort_values(["TAZ", "MAZ"]) households = read_csv(HOUSEHOLDS) orphan_households = households[~households.MAZ.isin(land_use.MAZ)] @@ -143,7 +155,9 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): # write orphan_households to INPUT directory (since it doesn't belong in output) if len(orphan_households) > 0: file_name = "orphan_households.csv" - print(f"writing {file_name} {orphan_households.shape} to {input_path(file_name)}") + print( + f"writing {file_name} {orphan_households.shape} to {input_path(file_name)}" + ) orphan_households.to_csv(input_path(file_name), index=False) # ######## check that land_use and maz and taz tables have same MAZs and TAZs @@ -153,12 +167,14 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): land_use = read_csv(LAND_USE) # assert land_use.set_index('MAZ').index.is_monotonic_increasing - land_use = land_use.sort_values('MAZ') - maz = read_csv(MAZ_TAZ).sort_values('MAZ') + land_use = land_use.sort_values("MAZ") + maz = read_csv(MAZ_TAZ).sort_values("MAZ") # ### FATAL ### if not land_use.MAZ.isin(maz.MAZ).all(): - print(f"land_use.MAZ not in maz.MAZ\n{land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)]}") + print( + f"land_use.MAZ not in maz.MAZ\n{land_use.MAZ[~land_use.MAZ.isin(maz.MAZ)]}" + ) raise RuntimeError(f"land_use.MAZ not in maz.MAZ") if not maz.MAZ.isin(land_use.MAZ).all(): @@ -166,7 +182,9 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): # ### FATAL ### if not land_use.TAZ.isin(maz.TAZ).all(): - print(f"land_use.TAZ not in maz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)]}") + print( + f"land_use.TAZ not in maz.TAZ\n{land_use.TAZ[~land_use.TAZ.isin(maz.TAZ)]}" + ) raise RuntimeError(f"land_use.TAZ not in maz.TAZ") if not maz.TAZ.isin(land_use.TAZ).all(): @@ -187,12 +205,12 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): land_use = land_use[land_use[slice_col].isin(slice_values)] print(f"land_use shape after slicing {land_use.shape}") -to_csv(land_use, 'land_use.csv') +to_csv(land_use, "land_use.csv") # TAZ -taz = pd.DataFrame({'TAZ': sorted(ur_land_use.TAZ.unique())}) +taz = pd.DataFrame({"TAZ": sorted(ur_land_use.TAZ.unique())}) taz = taz[taz.TAZ.isin(land_use["TAZ"])] to_csv(taz, TAZ) @@ -200,35 +218,45 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): # maz_taz -maz_taz = read_csv(MAZ_TAZ).sort_values('MAZ') +maz_taz = read_csv(MAZ_TAZ).sort_values("MAZ") maz_taz = maz_taz[maz_taz.MAZ.isin(land_use.MAZ)] to_csv(maz_taz, MAZ_TAZ) # tap taps = read_csv(TAP_MAZ) -taps = taps[['TAP', 'MAZ']].sort_values(by='TAP').reset_index(drop=True) +taps = taps[["TAP", "MAZ"]].sort_values(by="TAP").reset_index(drop=True) taps = taps[taps["MAZ"].isin(land_use["MAZ"])] to_csv(taps, "tap.csv") # maz to tap -maz_tap_walk = read_csv("maz_to_tap_walk.csv").sort_values(['MAZ', 'TAP']) -taz_tap_drive = read_csv("maz_to_tap_drive.csv").sort_values(['MAZ', 'TAP']) +maz_tap_walk = read_csv("maz_to_tap_walk.csv").sort_values(["MAZ", "TAP"]) +taz_tap_drive = read_csv("maz_to_tap_drive.csv").sort_values(["MAZ", "TAP"]) -maz_tap_walk = maz_tap_walk[maz_tap_walk["MAZ"].isin(land_use["MAZ"]) & maz_tap_walk["TAP"].isin(taps["TAP"])] -taz_tap_drive = taz_tap_drive[taz_tap_drive["MAZ"].isin(land_use["MAZ"]) & taz_tap_drive["TAP"].isin(taps["TAP"])] +maz_tap_walk = maz_tap_walk[ + maz_tap_walk["MAZ"].isin(land_use["MAZ"]) & maz_tap_walk["TAP"].isin(taps["TAP"]) +] +taz_tap_drive = taz_tap_drive[ + taz_tap_drive["MAZ"].isin(land_use["MAZ"]) & taz_tap_drive["TAP"].isin(taps["TAP"]) +] to_csv(maz_tap_walk, "maz_to_tap_walk.csv") to_csv(taz_tap_drive, "maz_to_tap_drive.csv") # maz to mz -maz_maz_walk = read_csv("maz_to_maz_walk.csv").sort_values(['OMAZ', 'DMAZ']) -maz_maz_bike = read_csv("maz_to_maz_bike.csv").sort_values(['OMAZ', 'DMAZ']) +maz_maz_walk = read_csv("maz_to_maz_walk.csv").sort_values(["OMAZ", "DMAZ"]) +maz_maz_bike = read_csv("maz_to_maz_bike.csv").sort_values(["OMAZ", "DMAZ"]) -maz_maz_walk = maz_maz_walk[maz_maz_walk["OMAZ"].isin(land_use["MAZ"]) & maz_maz_walk["DMAZ"].isin(land_use["MAZ"])] -maz_maz_bike = maz_maz_bike[maz_maz_bike["OMAZ"].isin(land_use["MAZ"]) & maz_maz_bike["DMAZ"].isin(land_use["MAZ"])] +maz_maz_walk = maz_maz_walk[ + maz_maz_walk["OMAZ"].isin(land_use["MAZ"]) + & maz_maz_walk["DMAZ"].isin(land_use["MAZ"]) +] +maz_maz_bike = maz_maz_bike[ + maz_maz_bike["OMAZ"].isin(land_use["MAZ"]) + & maz_maz_bike["DMAZ"].isin(land_use["MAZ"]) +] to_csv(maz_maz_walk, "maz_to_maz_walk.csv") to_csv(maz_maz_bike, "maz_to_maz_bike.csv") @@ -236,7 +264,7 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): # tap_lines tap_lines = read_csv("tap_lines.csv") -tap_lines = tap_lines[tap_lines['TAP'].isin(taps["TAP"])] +tap_lines = tap_lines[tap_lines["TAP"].isin(taps["TAP"])] to_csv(tap_lines, "tap_lines.csv") # households @@ -253,5 +281,5 @@ def crop_omx(omx_file_name, zones, num_outfiles=1): # skims -crop_omx('taz_skims', taz.TAZ, num_outfiles=(4 if segment_name == 'full' else 1)) -crop_omx('tap_skims', taps.TAP, num_outfiles=(4 if segment_name == 'full' else 1)) +crop_omx("taz_skims", taz.TAZ, num_outfiles=(4 if segment_name == "full" else 1)) +crop_omx("tap_skims", taps.TAP, num_outfiles=(4 if segment_name == "full" else 1)) diff --git a/activitysim/examples/example_sandag/test/simulation.py b/activitysim/examples/example_sandag/test/simulation.py index ec6a1181b1..0fc64d5390 100755 --- a/activitysim/examples/example_sandag/test/simulation.py +++ b/activitysim/examples/example_sandag/test/simulation.py @@ -1,12 +1,12 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse +import sys from activitysim.cli.run import add_run_args, run -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_sandag/test/test_sandag.py b/activitysim/examples/example_sandag/test/test_sandag.py index 71313bec88..1f607c6a32 100644 --- a/activitysim/examples/example_sandag/test/test_sandag.py +++ b/activitysim/examples/example_sandag/test/test_sandag.py @@ -2,10 +2,10 @@ # See full license in LICENSE.txt. import os import subprocess -import pkg_resources import pandas as pd import pandas.testing as pdt +import pkg_resources from activitysim.core import inject @@ -16,65 +16,75 @@ def teardown_function(func): def example_path(dirname): - resource = os.path.join('examples', 'example_sandag', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_sandag", dirname) + return pkg_resources.resource_filename("activitysim", resource) def mtc_example_path(dirname): - resource = os.path.join('examples', 'example_mtc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_mtc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def psrc_example_path(dirname): - resource = os.path.join('examples', 'example_psrc', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_psrc", dirname) + return pkg_resources.resource_filename("activitysim", resource) def test_sandag(): - def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) def regress(zone): # ## regress tours - regress_tours_df = pd.read_csv(test_path(f'regress/final_{zone}_zone_tours.csv')) - tours_df = pd.read_csv(test_path(f'output/final_{zone}_zone_tours.csv')) + regress_tours_df = pd.read_csv( + test_path(f"regress/final_{zone}_zone_tours.csv") + ) + tours_df = pd.read_csv(test_path(f"output/final_{zone}_zone_tours.csv")) print(f"regress tours") pdt.assert_frame_equal(tours_df, regress_tours_df), "regress tours" # ## regress trips - regress_trips_df = pd.read_csv(test_path(f'regress/final_{zone}_zone_trips.csv')) - trips_df = pd.read_csv(test_path(f'output/final_{zone}_zone_trips.csv')) + regress_trips_df = pd.read_csv( + test_path(f"regress/final_{zone}_zone_trips.csv") + ) + trips_df = pd.read_csv(test_path(f"output/final_{zone}_zone_trips.csv")) print(f"regress trips") pdt.assert_frame_equal(trips_df, regress_trips_df), "regress trips" # run tests with and without multi processing - zones = ['1', '2', '3'] + zones = ["1", "2", "3"] test_combos = [(z, mp) for z in zones for mp in [False, True]] for test_combo in test_combos: zone, multiprocess = test_combo - file_path = os.path.join(os.path.dirname(__file__), 'simulation.py') + file_path = os.path.join(os.path.dirname(__file__), "simulation.py") - if zone == '2': - base_configs = psrc_example_path(f'configs') + if zone == "2": + base_configs = psrc_example_path(f"configs") else: - base_configs = mtc_example_path(f'configs') - - run_args = ['-c', test_path(f'configs_{zone}_zone'), - '-c', example_path(f'configs_{zone}_zone'), - '-c', base_configs, - '-d', example_path(f'data_{zone}'), - '-o', test_path('output')] + base_configs = mtc_example_path(f"configs") + + run_args = [ + "-c", + test_path(f"configs_{zone}_zone"), + "-c", + example_path(f"configs_{zone}_zone"), + "-c", + base_configs, + "-d", + example_path(f"data_{zone}"), + "-o", + test_path("output"), + ] if multiprocess: - run_args = run_args + ['-s', 'settings_mp.yaml'] + run_args = run_args + ["-s", "settings_mp.yaml"] - subprocess.run(['coverage', 'run', '-a', file_path] + run_args, check=True) + subprocess.run(["coverage", "run", "-a", file_path] + run_args, check=True) regress(zone) -if __name__ == '__main__': +if __name__ == "__main__": test_sandag() diff --git a/activitysim/examples/example_semcog/extensions/__init__.py b/activitysim/examples/example_semcog/extensions/__init__.py index 9c468e1ebe..0b9ada0334 100644 --- a/activitysim/examples/example_semcog/extensions/__init__.py +++ b/activitysim/examples/example_semcog/extensions/__init__.py @@ -1,4 +1,6 @@ -from . import work_from_home -from . import telecommute_frequency -from . import transit_pass_subsidy -from . import transit_pass_ownership +from . import ( + telecommute_frequency, + transit_pass_ownership, + transit_pass_subsidy, + work_from_home, +) diff --git a/activitysim/examples/example_semcog/extensions/telecommute_frequency.py b/activitysim/examples/example_semcog/extensions/telecommute_frequency.py index bbe72e11f1..8dacf223f1 100755 --- a/activitysim/examples/example_semcog/extensions/telecommute_frequency.py +++ b/activitysim/examples/example_semcog/extensions/telecommute_frequency.py @@ -4,22 +4,14 @@ import pandas as pd -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import expressions - from activitysim.abm.models.util import estimation +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing logger = logging.getLogger("activitysim") @inject.step() -def telecommute_frequency( - persons_merged, persons, - chunk_size, trace_hh_id): +def telecommute_frequency(persons_merged, persons, chunk_size, trace_hh_id): """ This model predicts the frequency of telecommute for a person (worker) who does not works from home. The alternatives of this model are 'No Telecommute', @@ -28,8 +20,8 @@ def telecommute_frequency( office during a week. """ - trace_label = 'telecommute_frequency' - model_settings_file_name = 'telecommute_frequency.yaml' + trace_label = "telecommute_frequency" + model_settings_file_name = "telecommute_frequency.yaml" choosers = persons_merged.to_frame() choosers = choosers[choosers.workplace_zone_id > -1] @@ -37,12 +29,12 @@ def telecommute_frequency( logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation('telecommute_frequency') + estimator = estimation.manager.begin_estimation("telecommute_frequency") constants = config.get_model_constants(model_settings) # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_d = {} @@ -53,9 +45,10 @@ def telecommute_frequency( df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -74,25 +67,30 @@ def telecommute_frequency( locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='telecommute_frequency', - estimator=estimator) + trace_choice_name="telecommute_frequency", + estimator=estimator, + ) choices = pd.Series(model_spec.columns[choices.values], index=choices.index) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'persons', 'telecommute_frequency') + choices = estimator.get_survey_values( + choices, "persons", "telecommute_frequency" + ) estimator.write_override_choices(choices) estimator.end_estimation() persons = persons.to_frame() - persons['telecommute_frequency'] = choices.reindex(persons.index).fillna('').astype(str) + persons["telecommute_frequency"] = ( + choices.reindex(persons.index).fillna("").astype(str) + ) pipeline.replace_table("persons", persons) - tracing.print_summary('telecommute_frequency', persons.telecommute_frequency, value_counts=True) + tracing.print_summary( + "telecommute_frequency", persons.telecommute_frequency, value_counts=True + ) if trace_hh_id: - tracing.trace_df(persons, - label=trace_label, - warn_if_empty=True) + tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/examples/example_semcog/extensions/transit_pass_ownership.py b/activitysim/examples/example_semcog/extensions/transit_pass_ownership.py index 1b9d3c1fae..6507ab8256 100644 --- a/activitysim/examples/example_semcog/extensions/transit_pass_ownership.py +++ b/activitysim/examples/example_semcog/extensions/transit_pass_ownership.py @@ -4,39 +4,31 @@ import numpy as np -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import expressions - from activitysim.abm.models.util import estimation +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing logger = logging.getLogger("activitysim") @inject.step() -def transit_pass_ownership( - persons_merged, persons, - chunk_size, trace_hh_id): +def transit_pass_ownership(persons_merged, persons, chunk_size, trace_hh_id): """ Transit pass ownership model. """ - trace_label = 'transit_pass_ownership' - model_settings_file_name = 'transit_pass_ownership.yaml' + trace_label = "transit_pass_ownership" + model_settings_file_name = "transit_pass_ownership.yaml" choosers = persons_merged.to_frame() logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation('transit_pass_ownership') + estimator = estimation.manager.begin_estimation("transit_pass_ownership") constants = config.get_model_constants(model_settings) # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_d = {} @@ -47,9 +39,10 @@ def transit_pass_ownership( df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -68,23 +61,26 @@ def transit_pass_ownership( locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='transit_pass_ownership', - estimator=estimator) + trace_choice_name="transit_pass_ownership", + estimator=estimator, + ) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'persons', 'transit_pass_ownership') + choices = estimator.get_survey_values( + choices, "persons", "transit_pass_ownership" + ) estimator.write_override_choices(choices) estimator.end_estimation() persons = persons.to_frame() - persons['transit_pass_ownership'] = choices.reindex(persons.index) + persons["transit_pass_ownership"] = choices.reindex(persons.index) pipeline.replace_table("persons", persons) - tracing.print_summary('transit_pass_ownership', persons.transit_pass_ownership, value_counts=True) + tracing.print_summary( + "transit_pass_ownership", persons.transit_pass_ownership, value_counts=True + ) if trace_hh_id: - tracing.trace_df(persons, - label=trace_label, - warn_if_empty=True) + tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/examples/example_semcog/extensions/transit_pass_subsidy.py b/activitysim/examples/example_semcog/extensions/transit_pass_subsidy.py index ddbcfbef32..4e513a6611 100644 --- a/activitysim/examples/example_semcog/extensions/transit_pass_subsidy.py +++ b/activitysim/examples/example_semcog/extensions/transit_pass_subsidy.py @@ -4,39 +4,31 @@ import numpy as np -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import expressions - from activitysim.abm.models.util import estimation +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing logger = logging.getLogger("activitysim") @inject.step() -def transit_pass_subsidy( - persons_merged, persons, - chunk_size, trace_hh_id): +def transit_pass_subsidy(persons_merged, persons, chunk_size, trace_hh_id): """ Transit pass subsidy model. """ - trace_label = 'transit_pass_subsidy' - model_settings_file_name = 'transit_pass_subsidy.yaml' + trace_label = "transit_pass_subsidy" + model_settings_file_name = "transit_pass_subsidy.yaml" choosers = persons_merged.to_frame() logger.info("Running %s with %d persons", trace_label, len(choosers)) model_settings = config.read_model_settings(model_settings_file_name) - estimator = estimation.manager.begin_estimation('transit_pass_subsidy') + estimator = estimation.manager.begin_estimation("transit_pass_subsidy") constants = config.get_model_constants(model_settings) # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_d = {} @@ -47,9 +39,10 @@ def transit_pass_subsidy( df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) @@ -68,23 +61,26 @@ def transit_pass_subsidy( locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='transit_pass_subsidy', - estimator=estimator) + trace_choice_name="transit_pass_subsidy", + estimator=estimator, + ) if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'persons', 'transit_pass_subsidy') + choices = estimator.get_survey_values( + choices, "persons", "transit_pass_subsidy" + ) estimator.write_override_choices(choices) estimator.end_estimation() persons = persons.to_frame() - persons['transit_pass_subsidy'] = choices.reindex(persons.index) + persons["transit_pass_subsidy"] = choices.reindex(persons.index) pipeline.replace_table("persons", persons) - tracing.print_summary('transit_pass_subsidy', persons.transit_pass_subsidy, value_counts=True) + tracing.print_summary( + "transit_pass_subsidy", persons.transit_pass_subsidy, value_counts=True + ) if trace_hh_id: - tracing.trace_df(persons, - label=trace_label, - warn_if_empty=True) + tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/examples/example_semcog/extensions/work_from_home.py b/activitysim/examples/example_semcog/extensions/work_from_home.py index f6e2728b23..37a8cc887d 100755 --- a/activitysim/examples/example_semcog/extensions/work_from_home.py +++ b/activitysim/examples/example_semcog/extensions/work_from_home.py @@ -4,43 +4,35 @@ import numpy as np -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import simulate -from activitysim.core import inject -from activitysim.core import expressions - from activitysim.abm.models.util import estimation +from activitysim.core import config, expressions, inject, pipeline, simulate, tracing logger = logging.getLogger("activitysim") @inject.step() -def work_from_home( - persons_merged, persons, - chunk_size, trace_hh_id): +def work_from_home(persons_merged, persons, chunk_size, trace_hh_id): """ This model predicts whether a person (worker) works from home. The output from this model is TRUE (if works from home) or FALSE (works away from home). """ - trace_label = 'work_from_home' - model_settings_file_name = 'work_from_home.yaml' + trace_label = "work_from_home" + model_settings_file_name = "work_from_home.yaml" choosers = persons_merged.to_frame() model_settings = config.read_model_settings(model_settings_file_name) - chooser_filter_column_name = model_settings.get('CHOOSER_FILTER_COLUMN_NAME') + chooser_filter_column_name = model_settings.get("CHOOSER_FILTER_COLUMN_NAME") choosers = choosers[choosers[chooser_filter_column_name]] logger.info("Running %s with %d persons", trace_label, len(choosers)) - estimator = estimation.manager.begin_estimation('work_from_home') + estimator = estimation.manager.begin_estimation("work_from_home") constants = config.get_model_constants(model_settings) - work_from_home_alt = model_settings['WORK_FROM_HOME_ALT'] + work_from_home_alt = model_settings["WORK_FROM_HOME_ALT"] # - preprocessor - preprocessor_settings = model_settings.get('preprocessor', None) + preprocessor_settings = model_settings.get("preprocessor", None) if preprocessor_settings: locals_d = {} @@ -51,9 +43,10 @@ def work_from_home( df=choosers, model_settings=preprocessor_settings, locals_dict=locals_d, - trace_label=trace_label) + trace_label=trace_label, + ) - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) coefficients_df = simulate.read_model_coefficients(model_settings) nest_spec = config.get_logit_model_settings(model_settings) @@ -65,18 +58,31 @@ def work_from_home( estimator.write_choosers(choosers) # - iterative single process what-if adjustment if specified - iterations = model_settings.get('WORK_FROM_HOME_ITERATIONS', 1) - iterations_chooser_filter = model_settings.get('WORK_FROM_HOME_CHOOSER_FILTER', None) - iterations_coefficient_constant = model_settings.get('WORK_FROM_HOME_COEFFICIENT_CONSTANT', None) - iterations_target_percent = model_settings.get('WORK_FROM_HOME_TARGET_PERCENT', None) - iterations_target_percent_tolerance = model_settings.get('WORK_FROM_HOME_TARGET_PERCENT_TOLERANCE', None) + iterations = model_settings.get("WORK_FROM_HOME_ITERATIONS", 1) + iterations_chooser_filter = model_settings.get( + "WORK_FROM_HOME_CHOOSER_FILTER", None + ) + iterations_coefficient_constant = model_settings.get( + "WORK_FROM_HOME_COEFFICIENT_CONSTANT", None + ) + iterations_target_percent = model_settings.get( + "WORK_FROM_HOME_TARGET_PERCENT", None + ) + iterations_target_percent_tolerance = model_settings.get( + "WORK_FROM_HOME_TARGET_PERCENT_TOLERANCE", None + ) for iteration in range(iterations): - logger.info("Running %s with %d persons iteration %d", trace_label, len(choosers), iteration) + logger.info( + "Running %s with %d persons iteration %d", + trace_label, + len(choosers), + iteration, + ) # re-read spec to reset substitution - model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) + model_spec = simulate.read_model_spec(file_name=model_settings["SPEC"]) model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator) choices = simulate.simple_simulate( @@ -86,50 +92,71 @@ def work_from_home( locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, - trace_choice_name='work_from_home', - estimator=estimator) + trace_choice_name="work_from_home", + estimator=estimator, + ) if iterations_target_percent is not None: choices_for_filter = choices[choosers[iterations_chooser_filter]] - current_percent = ((choices_for_filter == work_from_home_alt).sum() / len(choices_for_filter)) - logger.info("Running %s iteration %i choosers %i current percent %f target percent %f", - trace_label, iteration, len(choices_for_filter), current_percent, iterations_target_percent) - - if current_percent <= (iterations_target_percent + - iterations_target_percent_tolerance - ) and current_percent >= (iterations_target_percent - - iterations_target_percent_tolerance): - logger.info("Running %s iteration %i converged with coefficient %f", trace_label, iteration, - coefficients_df.value[iterations_coefficient_constant]) + current_percent = (choices_for_filter == work_from_home_alt).sum() / len( + choices_for_filter + ) + logger.info( + "Running %s iteration %i choosers %i current percent %f target percent %f", + trace_label, + iteration, + len(choices_for_filter), + current_percent, + iterations_target_percent, + ) + + if current_percent <= ( + iterations_target_percent + iterations_target_percent_tolerance + ) and current_percent >= ( + iterations_target_percent - iterations_target_percent_tolerance + ): + logger.info( + "Running %s iteration %i converged with coefficient %f", + trace_label, + iteration, + coefficients_df.value[iterations_coefficient_constant], + ) break else: - new_value = np.log(iterations_target_percent / - np.maximum(current_percent, 0.0001) - ) + coefficients_df.value[iterations_coefficient_constant] + new_value = ( + np.log( + iterations_target_percent / np.maximum(current_percent, 0.0001) + ) + + coefficients_df.value[iterations_coefficient_constant] + ) coefficients_df.value[iterations_coefficient_constant] = new_value - logger.info("Running %s iteration %i new coefficient for next iteration %f", - trace_label, iteration, new_value) + logger.info( + "Running %s iteration %i new coefficient for next iteration %f", + trace_label, + iteration, + new_value, + ) iteration = iteration + 1 - choices = (choices == work_from_home_alt) + choices = choices == work_from_home_alt if estimator: estimator.write_choices(choices) - choices = estimator.get_survey_values(choices, 'persons', 'work_from_home') + choices = estimator.get_survey_values(choices, "persons", "work_from_home") estimator.write_override_choices(choices) estimator.end_estimation() persons = persons.to_frame() - persons['work_from_home'] = choices.reindex(persons.index).fillna(0).astype(bool) - persons['is_out_of_home_worker'] = persons[chooser_filter_column_name] & ~persons['work_from_home'] + persons["work_from_home"] = choices.reindex(persons.index).fillna(0).astype(bool) + persons["is_out_of_home_worker"] = ( + persons[chooser_filter_column_name] & ~persons["work_from_home"] + ) pipeline.replace_table("persons", persons) - tracing.print_summary('work_from_home', persons.work_from_home, value_counts=True) + tracing.print_summary("work_from_home", persons.work_from_home, value_counts=True) if trace_hh_id: - tracing.trace_df(persons, - label=trace_label, - warn_if_empty=True) + tracing.trace_df(persons, label=trace_label, warn_if_empty=True) diff --git a/activitysim/examples/example_semcog/scripts/reindex_household_ids.py b/activitysim/examples/example_semcog/scripts/reindex_household_ids.py index 65731e1e3a..538f9d3f9d 100644 --- a/activitysim/examples/example_semcog/scripts/reindex_household_ids.py +++ b/activitysim/examples/example_semcog/scripts/reindex_household_ids.py @@ -9,31 +9,32 @@ """ import os +import sys import numpy as np import pandas as pd -import sys - if sys.version_info[0] < 3: raise Exception("Must be using Python 3") file_names = { - 'households': 'households.csv', - 'persons': 'persons.csv', - 'land_use': 'land_use.csv', + "households": "households.csv", + "persons": "persons.csv", + "land_use": "land_use.csv", } -land_use_zone_col = 'ZONE' -hh_zone_col = 'zone_id' +land_use_zone_col = "ZONE" +hh_zone_col = "zone_id" def drop_and_dump(df, drop, msg, tag, output_dir): print("Checking for %s" % msg) if drop.any(): - print("WARNING: dropping %s out of %s %s (%s)" % (drop.sum(), len(df), msg, tag)) - df[drop].to_csv(os.path.join(output_dir, '%s.csv' % tag), index=False) + print( + "WARNING: dropping %s out of %s %s (%s)" % (drop.sum(), len(df), msg, tag) + ) + df[drop].to_csv(os.path.join(output_dir, "%s.csv" % tag), index=False) df = df[~drop] return df @@ -44,53 +45,65 @@ def create_subset(input_dir, output_dir, drop_dir): ### # land_use ### - land_use_df = pd.read_csv(os.path.join(input_dir, file_names['land_use'])) + land_use_df = pd.read_csv(os.path.join(input_dir, file_names["land_use"])) land_use_df = land_use_df.sort_values(by=land_use_zone_col) - land_use_df.to_csv(os.path.join(output_dir, file_names['land_use']), index=False) + land_use_df.to_csv(os.path.join(output_dir, file_names["land_use"]), index=False) - print('zones: %s' % len(land_use_df)) + print("zones: %s" % len(land_use_df)) ### # households ### - households = \ - pd.read_csv(os.path.join(input_dir, file_names['households']), - dtype={'household_id': np.int64}) - households = households.sort_values(by='household_id') - households.rename(columns={'household_id': 'legacy_household_id'}, inplace=True) + households = pd.read_csv( + os.path.join(input_dir, file_names["households"]), + dtype={"household_id": np.int64}, + ) + households = households.sort_values(by="household_id") + households.rename(columns={"household_id": "legacy_household_id"}, inplace=True) raw_household_count = len(households) # all households must have a zone_id null_zones = households[hh_zone_col].isnull() - households = \ - drop_and_dump(households, null_zones, - msg="households with null zones", - tag='households_with_null_zones', - output_dir=drop_dir) + households = drop_and_dump( + households, + null_zones, + msg="households with null zones", + tag="households_with_null_zones", + output_dir=drop_dir, + ) households[hh_zone_col] = households[hh_zone_col].astype(np.int64) # all households zone_ids must be in land_use orphan_zones = ~households[hh_zone_col].isin(land_use_df[land_use_zone_col]) - households = \ - drop_and_dump(households, orphan_zones, - msg="households with unknown zones", - tag='households_with_unknown_zones', - output_dir=drop_dir) + households = drop_and_dump( + households, + orphan_zones, + msg="households with unknown zones", + tag="households_with_unknown_zones", + output_dir=drop_dir, + ) # reindexed household_id as both index and column households.index = np.arange(1, len(households) + 1) - households['household_id'] = households.index + households["household_id"] = households.index ### # persons ### - persons = \ - pd.read_csv(os.path.join(input_dir, file_names['persons']), - dtype={'household_id': np.int64, 'person_id': np.int64}) - persons = persons.sort_values(by=['household_id', 'member_id']) - persons.rename(columns={'person_id': 'legacy_person_id', 'household_id': 'legacy_household_id'}, inplace=True) + persons = pd.read_csv( + os.path.join(input_dir, file_names["persons"]), + dtype={"household_id": np.int64, "person_id": np.int64}, + ) + persons = persons.sort_values(by=["household_id", "member_id"]) + persons.rename( + columns={ + "person_id": "legacy_person_id", + "household_id": "legacy_household_id", + }, + inplace=True, + ) persons.legacy_household_id = persons.legacy_household_id.astype(np.int64) raw_person_count = len(persons) @@ -98,59 +111,82 @@ def create_subset(input_dir, output_dir, drop_dir): assert not persons.legacy_household_id.isnull().any() orphan_persons = ~persons.legacy_household_id.isin(households.legacy_household_id) - persons = \ - drop_and_dump(persons, orphan_persons, - msg="persons without households", - tag='persons_without_households', - output_dir=drop_dir) - - persons = \ - pd.merge(persons, - households[['legacy_household_id', 'household_id']], - left_on="legacy_household_id", - right_on="legacy_household_id", - how="left") + persons = drop_and_dump( + persons, + orphan_persons, + msg="persons without households", + tag="persons_without_households", + output_dir=drop_dir, + ) + + persons = pd.merge( + persons, + households[["legacy_household_id", "household_id"]], + left_on="legacy_household_id", + right_on="legacy_household_id", + how="left", + ) assert not persons.household_id.isnull().any() persons.household_id = persons.household_id.astype(np.int64) # reindexed person_id as both index and column persons.index = np.arange(1, len(persons) + 1) - persons['person_id'] = persons.index + persons["person_id"] = persons.index # check that we have the right number of persons in every household" - assert (persons.groupby('household_id').size() == households.persons).all() + assert (persons.groupby("household_id").size() == households.persons).all() # check that all persons in household have different member_id" - persons_with_dupe_member_id = persons.duplicated(['household_id', 'member_id'], keep='first') - household_ids_with_dupe_member_id = persons.household_id[persons_with_dupe_member_id].unique() - households_with_dupe_members = households.household_id.isin(household_ids_with_dupe_member_id) - persons_in_households_with_dupe_members = persons.household_id.isin(household_ids_with_dupe_member_id) - - print("%s of %s persons_with_dupe_member_id" % (persons_with_dupe_member_id.sum(), len(persons))) - persons = \ - drop_and_dump(persons, persons_in_households_with_dupe_members, - msg="persons in households with duplicate (household_id, member_id)", - tag='persons_in_households_with_dupe_member_id', - output_dir=drop_dir) - - households = \ - drop_and_dump(households, households_with_dupe_members, - msg="households with duplicate persons.member_id", - tag='households_with_dupe_member_id', - output_dir=drop_dir) - - missing_member1 = ~households.household_id.isin(persons.household_id[persons.member_id == 1]) + persons_with_dupe_member_id = persons.duplicated( + ["household_id", "member_id"], keep="first" + ) + household_ids_with_dupe_member_id = persons.household_id[ + persons_with_dupe_member_id + ].unique() + households_with_dupe_members = households.household_id.isin( + household_ids_with_dupe_member_id + ) + persons_in_households_with_dupe_members = persons.household_id.isin( + household_ids_with_dupe_member_id + ) + + print( + "%s of %s persons_with_dupe_member_id" + % (persons_with_dupe_member_id.sum(), len(persons)) + ) + persons = drop_and_dump( + persons, + persons_in_households_with_dupe_members, + msg="persons in households with duplicate (household_id, member_id)", + tag="persons_in_households_with_dupe_member_id", + output_dir=drop_dir, + ) + + households = drop_and_dump( + households, + households_with_dupe_members, + msg="households with duplicate persons.member_id", + tag="households_with_dupe_member_id", + output_dir=drop_dir, + ) + + missing_member1 = ~households.household_id.isin( + persons.household_id[persons.member_id == 1] + ) # print("%s of %s households missing member_id 1" % (missing_member1.sum(), len(households))) assert not missing_member1.any() - print('Writing %s households. Dropped %s' % (len(households), raw_household_count-len(households))) - households.to_csv(os.path.join(output_dir, file_names['households']), index=False) + print( + "Writing %s households. Dropped %s" + % (len(households), raw_household_count - len(households)) + ) + households.to_csv(os.path.join(output_dir, file_names["households"]), index=False) - print('Writing %s persons. Dropped %s' % (len(persons), raw_person_count-len(persons))) - persons.to_csv(os.path.join(output_dir, file_names['persons']), index=False) + print( + "Writing %s persons. Dropped %s" + % (len(persons), raw_person_count - len(persons)) + ) + persons.to_csv(os.path.join(output_dir, file_names["persons"]), index=False) -create_subset(input_dir="data_raw/", - output_dir="data/", - drop_dir="data_raw/dropped" - ) +create_subset(input_dir="data_raw/", output_dir="data/", drop_dir="data_raw/dropped") diff --git a/activitysim/examples/example_semcog/scripts/semcog_crop.py b/activitysim/examples/example_semcog/scripts/semcog_crop.py index c41d0ae7ec..f20ed4e7ea 100644 --- a/activitysim/examples/example_semcog/scripts/semcog_crop.py +++ b/activitysim/examples/example_semcog/scripts/semcog_crop.py @@ -1,30 +1,38 @@ +import argparse import os -import pandas as pd -import openmatrix as omx -import numpy as np -import argparse +import numpy as np +import openmatrix as omx +import pandas as pd MAZ_OFFSET = 0 segments = { - 'test': (149, 215), # SUPER_DIST_25==1, has univ - 'z500': (0, 500), - 'full': (0, 10000), + "test": (149, 215), # SUPER_DIST_25==1, has univ + "z500": (0, 500), + "full": (0, 10000), } -land_use_zone_col = 'ZONE' -hh_zone_col = 'zone_id' +land_use_zone_col = "ZONE" +hh_zone_col = "zone_id" num_full_skim_files = 2 -parser = argparse.ArgumentParser(description='crop PSRC raw_data') -parser.add_argument('segment_name', metavar='segment_name', type=str, nargs=1, - help=f"geography segmentation (e.g. full)") - -parser.add_argument('-c', '--check_geography', - default=False, - action='store_true', - help='check consistency of MAZ, TAZ zone_ids and foreign keys & write orphan_households file') +parser = argparse.ArgumentParser(description="crop PSRC raw_data") +parser.add_argument( + "segment_name", + metavar="segment_name", + type=str, + nargs=1, + help=f"geography segmentation (e.g. full)", +) + +parser.add_argument( + "-c", + "--check_geography", + default=False, + action="store_true", + help="check consistency of MAZ, TAZ zone_ids and foreign keys & write orphan_households file", +) args = parser.parse_args() @@ -35,8 +43,8 @@ assert segment_name in segments.keys(), f"Unknown seg: {segment_name}" zone_min, zone_max = segments[segment_name] -input_dir = './data_raw' -output_dir = f'./data_{segment_name}' +input_dir = "./data_raw" +output_dir = f"./data_{segment_name}" print(f"check_geography {check_geography}") @@ -55,7 +63,15 @@ def output_path(file_name): def integerize_id_columns(df, table_name): - columns = ['ZONE', 'SUPER_DIST_25', 'zone_id', 'household_id', 'person_id', 'MAZ', 'TAZ'] + columns = [ + "ZONE", + "SUPER_DIST_25", + "zone_id", + "household_id", + "person_id", + "MAZ", + "TAZ", + ] for c in df.columns: if c in columns: print(f"converting {table_name}.{c} to int") @@ -83,13 +99,17 @@ def to_csv(df, file_name): land_use = read_csv("land_use.csv") households = read_csv("households.csv") - orphan_households = households[~households[hh_zone_col].isin(land_use[land_use_zone_col])] + orphan_households = households[ + ~households[hh_zone_col].isin(land_use[land_use_zone_col]) + ] print(f"{len(orphan_households)} orphan_households") if len(orphan_households) > 0: # write orphan_households to INPUT directory (since it doesn't belong in output) file_name = "orphan_households.csv" - print(f"writing {file_name} {orphan_households.shape} to {input_path(file_name)}") + print( + f"writing {file_name} {orphan_households.shape} to {input_path(file_name)}" + ) orphan_households.to_csv(input_path(file_name), index=False) @@ -97,8 +117,11 @@ def to_csv(df, file_name): # land_use # land_use = read_csv("land_use.csv") -land_use = land_use[(land_use[land_use_zone_col] >= zone_min) & (land_use[land_use_zone_col] <= zone_max)] -integerize_id_columns(land_use, 'land_use') +land_use = land_use[ + (land_use[land_use_zone_col] >= zone_min) + & (land_use[land_use_zone_col] <= zone_max) +] +integerize_id_columns(land_use, "land_use") land_use = land_use.sort_values(land_use_zone_col) # move index col to front @@ -122,10 +145,10 @@ def to_csv(df, file_name): # households = read_csv("households.csv") households = households[households[hh_zone_col].isin(land_use[land_use_zone_col])] -integerize_id_columns(households, 'households') +integerize_id_columns(households, "households") # move index col to front -households.insert(0, 'household_id', households.pop('household_id')) +households.insert(0, "household_id", households.pop("household_id")) to_csv(households, "households.csv") @@ -134,10 +157,10 @@ def to_csv(df, file_name): # persons = read_csv("persons.csv") persons = persons[persons["household_id"].isin(households.household_id)] -integerize_id_columns(persons, 'persons') +integerize_id_columns(persons, "persons") # move index col to front -persons.insert(0, 'person_id', persons.pop('person_id')) +persons.insert(0, "person_id", persons.pop("person_id")) to_csv(persons, "persons.csv") @@ -145,7 +168,7 @@ def to_csv(df, file_name): # # skims # -omx_infile_name = 'skims.omx' +omx_infile_name = "skims.omx" skim_data_type = np.float32 omx_in = omx.open_file(input_path(omx_infile_name)) @@ -163,11 +186,13 @@ def to_csv(df, file_name): zone_labels = zone[land_use_zone_col].tolist() # TAZ zone_ids in omx index order # create -num_outfiles = num_full_skim_files if segment_name == 'full' else 1 +num_outfiles = num_full_skim_files if segment_name == "full" else 1 if num_outfiles == 1: - omx_out = [omx.open_file(output_path(f"skims.omx"), 'w')] + omx_out = [omx.open_file(output_path(f"skims.omx"), "w")] else: - omx_out = [omx.open_file(output_path(f"skims{i+1}.omx"), 'w') for i in range(num_outfiles)] + omx_out = [ + omx.open_file(output_path(f"skims{i+1}.omx"), "w") for i in range(num_outfiles) + ] for omx_file in omx_out: omx_file.create_mapping(land_use_zone_col, zone_labels) diff --git a/activitysim/examples/example_semcog/simulation.py b/activitysim/examples/example_semcog/simulation.py index 480b298cec..e191e3ded7 100755 --- a/activitysim/examples/example_semcog/simulation.py +++ b/activitysim/examples/example_semcog/simulation.py @@ -1,14 +1,14 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import argparse - -from activitysim.cli.run import add_run_args, run +import sys import extensions -if __name__ == '__main__': +from activitysim.cli.run import add_run_args, run + +if __name__ == "__main__": parser = argparse.ArgumentParser() add_run_args(parser) diff --git a/activitysim/examples/example_semcog/test/test_semcog.py b/activitysim/examples/example_semcog/test/test_semcog.py index df63c872be..032c2d6607 100644 --- a/activitysim/examples/example_semcog/test/test_semcog.py +++ b/activitysim/examples/example_semcog/test/test_semcog.py @@ -2,10 +2,10 @@ # See full license in LICENSE.txt. import os import subprocess -import pkg_resources import pandas as pd import pandas.testing as pdt +import pkg_resources from activitysim.core import inject @@ -16,33 +16,45 @@ def teardown_function(func): def test_semcog(): - def example_path(dirname): - resource = os.path.join('examples', 'example_semcog', dirname) - return pkg_resources.resource_filename('activitysim', resource) + resource = os.path.join("examples", "example_semcog", dirname) + return pkg_resources.resource_filename("activitysim", resource) def test_path(dirname): return os.path.join(os.path.dirname(__file__), dirname) def regress(): - regress_trips_df = pd.read_csv(test_path('regress/final_trips.csv')) - final_trips_df = pd.read_csv(test_path('output/final_trips.csv')) + regress_trips_df = pd.read_csv(test_path("regress/final_trips.csv")) + final_trips_df = pd.read_csv(test_path("output/final_trips.csv")) # person_id,household_id,tour_id,primary_purpose,trip_num,outbound,trip_count,purpose, # destination,origin,destination_logsum,depart,trip_mode,mode_choice_logsum # compare_cols = [] pdt.assert_frame_equal(final_trips_df, regress_trips_df) - file_path = os.path.join(os.path.dirname(__file__), '../simulation.py') - - subprocess.run(['coverage', 'run', '-a', file_path, - '-c', test_path('configs'), '-c', example_path('configs'), - '-d', example_path('data'), - '-o', test_path('output')], check=True) + file_path = os.path.join(os.path.dirname(__file__), "../simulation.py") + + subprocess.run( + [ + "coverage", + "run", + "-a", + file_path, + "-c", + test_path("configs"), + "-c", + example_path("configs"), + "-d", + example_path("data"), + "-o", + test_path("output"), + ], + check=True, + ) regress() -if __name__ == '__main__': +if __name__ == "__main__": test_semcog() diff --git a/activitysim/examples/scan_examples_for_errors.py b/activitysim/examples/scan_examples_for_errors.py index 2405a00ead..570ef861aa 100644 --- a/activitysim/examples/scan_examples_for_errors.py +++ b/activitysim/examples/scan_examples_for_errors.py @@ -5,10 +5,7 @@ parser = argparse.ArgumentParser() parser.add_argument( - 'working_dir', - type=str, - metavar='PATH', - help='path to examples working directory', + "working_dir", type=str, metavar="PATH", help="path to examples working directory", ) args = parser.parse_args() @@ -16,7 +13,7 @@ files_with_errors = [] for logfile in glob.glob(f"{args.working_dir}/*/output/log/activitysim.log"): - with open(logfile, 'rt') as f: + with open(logfile, "rt") as f: printing_traceback = False found_traceback = False for n, line in enumerate(f.readlines(), start=1): diff --git a/conda-environments/activitysim-dev.yml b/conda-environments/activitysim-dev.yml index 8cd394996a..31e4c43035 100644 --- a/conda-environments/activitysim-dev.yml +++ b/conda-environments/activitysim-dev.yml @@ -23,7 +23,8 @@ dependencies: - pytest - pytest-cov - coveralls -- pycodestyle +- black +- isort - pytest-regressions - git - gh diff --git a/conda-environments/activitysim-test.yml b/conda-environments/activitysim-test.yml index 6ca5270bd0..25e9cf53d8 100644 --- a/conda-environments/activitysim-test.yml +++ b/conda-environments/activitysim-test.yml @@ -20,5 +20,6 @@ dependencies: - pytest - pytest-cov - coveralls -- pycodestyle +- black +- isort - pytest-regressions \ No newline at end of file diff --git a/docs/add_image_map.py b/docs/add_image_map.py index 834f4cdb1c..c34d8d4fdb 100644 --- a/docs/add_image_map.py +++ b/docs/add_image_map.py @@ -1,4 +1,3 @@ - # rst doesn't support image maps, so we'll add one after the html has been built fileName = "_build/html/examples.html" @@ -35,10 +34,10 @@ print("add image map to " + fileName) -with open(fileName, encoding='utf-8') as file: +with open(fileName, encoding="utf-8") as file: lines = file.readlines() -with open(fileName, 'w') as file: +with open(fileName, "w") as file: for aLine in lines: if line in aLine: print("updated " + fileName) diff --git a/docs/conf.py b/docs/conf.py index bcbaac3607..da6570fb92 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,13 +12,14 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os +import sys import sphinx_rtd_theme # -- Get Package Version -------------------------------------------------- import activitysim + print("package version: " + activitysim.__version__) # If extensions (or modules to document with autodoc) are in another directory, @@ -35,29 +36,29 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.mathjax', - 'numpydoc', - 'sphinx.ext.autosummary' + "sphinx.ext.autodoc", + "sphinx.ext.mathjax", + "numpydoc", + "sphinx.ext.autosummary", ] numpydoc_show_class_members = False # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'ActivitySim' -copyright = u'contributing authors' +project = u"ActivitySim" +copyright = u"contributing authors" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -80,7 +81,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -98,7 +99,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -189,28 +190,31 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'ActivitySimdoc' +htmlhelp_basename = "ActivitySimdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # 'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'ActivitySim.tex', u'ActivitySim Documentation', - u'contributing authors', 'manual'), + ( + "index", + "ActivitySim.tex", + u"ActivitySim Documentation", + u"contributing authors", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of @@ -239,8 +243,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'activitysim', u'ActivitySim Documentation', - [u'contributing authors'], 1) + ("index", "activitysim", u"ActivitySim Documentation", [u"contributing authors"], 1) ] # If true, show URL addresses after external links. @@ -253,9 +256,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'ActivitySim', u'ActivitySim Documentation', - u'contributing authors', 'ActivitySim', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "ActivitySim", + u"ActivitySim Documentation", + u"contributing authors", + "ActivitySim", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. @@ -271,10 +280,8 @@ # texinfo_no_detailmenu = False # -- Table width fix for Read the Docs Sphinx theme ----------------------- -html_static_path = ['_static'] +html_static_path = ["_static"] html_context = { - 'css_files': [ - '_static/theme_overrides.css', # override wide tables in RTD theme - ], - } + "css_files": ["_static/theme_overrides.css",], # override wide tables in RTD theme +} diff --git a/ez_setup.py b/ez_setup.py index 435c30fe5c..1de2837266 100644 --- a/ez_setup.py +++ b/ez_setup.py @@ -13,17 +13,16 @@ This file can also be run as a script to install or upgrade setuptools. """ +import contextlib +import optparse import os +import platform import shutil +import subprocess import sys import tempfile -import zipfile -import optparse -import subprocess -import platform import textwrap -import contextlib - +import zipfile from distutils import log try: @@ -51,10 +50,10 @@ def _python_cmd(*args): def _install(archive_filename, install_args=()): with archive_context(archive_filename): # installing - log.warn('Installing Setuptools') - if not _python_cmd('setup.py', 'install', *install_args): - log.warn('Something went wrong during the installation.') - log.warn('See the error message above.') + log.warn("Installing Setuptools") + if not _python_cmd("setup.py", "install", *install_args): + log.warn("Something went wrong during the installation.") + log.warn("See the error message above.") # exitcode will be 2 return 2 @@ -62,12 +61,12 @@ def _install(archive_filename, install_args=()): def _build_egg(egg, archive_filename, to_dir): with archive_context(archive_filename): # building an egg - log.warn('Building a Setuptools egg in %s', to_dir) - _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir) + log.warn("Building a Setuptools egg in %s", to_dir) + _python_cmd("setup.py", "-q", "bdist_egg", "--dist-dir", to_dir) # returning the result log.warn(egg) if not os.path.exists(egg): - raise IOError('Could not build the egg.') + raise IOError("Could not build the egg.") class ContextualZipFile(zipfile.ZipFile): @@ -85,7 +84,7 @@ def __new__(cls, *args, **kwargs): """ Construct a ZipFile or ContextualZipFile as appropriate """ - if hasattr(zipfile.ZipFile, '__exit__'): + if hasattr(zipfile.ZipFile, "__exit__"): return zipfile.ZipFile(*args, **kwargs) return super(ContextualZipFile, cls).__new__(cls) @@ -94,7 +93,7 @@ def __new__(cls, *args, **kwargs): def archive_context(filename): # extracting the archive tmpdir = tempfile.mkdtemp() - log.warn('Extracting in %s', tmpdir) + log.warn("Extracting in %s", tmpdir) old_wd = os.getcwd() try: os.chdir(tmpdir) @@ -104,7 +103,7 @@ def archive_context(filename): # going in the directory subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0]) os.chdir(subdir) - log.warn('Now working in %s', subdir) + log.warn("Now working in %s", subdir) yield finally: @@ -113,27 +112,34 @@ def archive_context(filename): def _do_download(version, download_base, to_dir, download_delay): - egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg' - % (version, sys.version_info[0], sys.version_info[1])) + egg = os.path.join( + to_dir, + "setuptools-%s-py%d.%d.egg" + % (version, sys.version_info[0], sys.version_info[1]), + ) if not os.path.exists(egg): - archive = download_setuptools(version, download_base, - to_dir, download_delay) + archive = download_setuptools(version, download_base, to_dir, download_delay) _build_egg(egg, archive, to_dir) sys.path.insert(0, egg) # Remove previously-imported pkg_resources if present (see # https://bitbucket.org/pypa/setuptools/pull-request/7/ for details). - if 'pkg_resources' in sys.modules: - del sys.modules['pkg_resources'] + if "pkg_resources" in sys.modules: + del sys.modules["pkg_resources"] import setuptools + setuptools.bootstrap_install_from = egg -def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, - to_dir=os.curdir, download_delay=15): +def use_setuptools( + version=DEFAULT_VERSION, + download_base=DEFAULT_URL, + to_dir=os.curdir, + download_delay=15, +): to_dir = os.path.abspath(to_dir) - rep_modules = 'pkg_resources', 'setuptools' + rep_modules = "pkg_resources", "setuptools" imported = set(sys.modules).intersection(rep_modules) try: import pkg_resources @@ -146,19 +152,21 @@ def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, return _do_download(version, download_base, to_dir, download_delay) except pkg_resources.VersionConflict as VC_err: if imported: - msg = textwrap.dedent(""" + msg = textwrap.dedent( + """ The required version of setuptools (>={version}) is not available, and can't be installed while this script is running. Please install a more recent version first, using 'easy_install -U setuptools'. (Currently using {VC_err.args[0]!r}) - """).format(VC_err=VC_err, version=version) + """ + ).format(VC_err=VC_err, version=version) sys.stderr.write(msg) sys.exit(2) # otherwise, reload ok - del pkg_resources, sys.modules['pkg_resources'] + del pkg_resources, sys.modules["pkg_resources"] return _do_download(version, download_base, to_dir, download_delay) @@ -184,22 +192,21 @@ def download_file_powershell(url, target): ps_cmd = ( "[System.Net.WebRequest]::DefaultWebProxy.Credentials = " "[System.Net.CredentialCache]::DefaultCredentials; " - "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" - % vars() + "(new-object System.Net.WebClient).DownloadFile(%(url)r, %(target)r)" % vars() ) cmd = [ - 'powershell', - '-Command', + "powershell", + "-Command", ps_cmd, ] _clean_check(cmd, target) def has_powershell(): - if platform.system() != 'Windows': + if platform.system() != "Windows": return False - cmd = ['powershell', '-Command', 'echo test'] - with open(os.path.devnull, 'wb') as devnull: + cmd = ["powershell", "-Command", "echo test"] + with open(os.path.devnull, "wb") as devnull: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) except Exception: @@ -211,13 +218,13 @@ def has_powershell(): def download_file_curl(url, target): - cmd = ['curl', url, '--silent', '--output', target] + cmd = ["curl", url, "--silent", "--output", target] _clean_check(cmd, target) def has_curl(): - cmd = ['curl', '--version'] - with open(os.path.devnull, 'wb') as devnull: + cmd = ["curl", "--version"] + with open(os.path.devnull, "wb") as devnull: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) except Exception: @@ -229,13 +236,13 @@ def has_curl(): def download_file_wget(url, target): - cmd = ['wget', url, '--quiet', '--output-document', target] + cmd = ["wget", url, "--quiet", "--output-document", target] _clean_check(cmd, target) def has_wget(): - cmd = ['wget', '--version'] - with open(os.path.devnull, 'wb') as devnull: + cmd = ["wget", "--version"] + with open(os.path.devnull, "wb") as devnull: try: subprocess.check_call(cmd, stdout=devnull, stderr=devnull) except Exception: @@ -277,8 +284,13 @@ def get_best_downloader(): return next(viable_downloaders, None) -def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL, - to_dir=os.curdir, delay=15, downloader_factory=get_best_downloader): +def download_setuptools( + version=DEFAULT_VERSION, + download_base=DEFAULT_URL, + to_dir=os.curdir, + delay=15, + downloader_factory=get_best_downloader, +): """ Download setuptools from a specified location and return its filename @@ -307,7 +319,7 @@ def _build_install_args(options): """ Build the arguments to 'python setup.py install' on the setuptools package """ - return ['--user'] if options.user_install else [] + return ["--user"] if options.user_install else [] def _parse_args(): @@ -316,20 +328,29 @@ def _parse_args(): """ parser = optparse.OptionParser() parser.add_option( - '--user', dest='user_install', action='store_true', default=False, - help='install in user site package (requires Python 2.6 or later)') + "--user", + dest="user_install", + action="store_true", + default=False, + help="install in user site package (requires Python 2.6 or later)", + ) parser.add_option( - '--download-base', dest='download_base', metavar="URL", + "--download-base", + dest="download_base", + metavar="URL", default=DEFAULT_URL, - help='alternative URL from where to download the setuptools package') + help="alternative URL from where to download the setuptools package", + ) parser.add_option( - '--insecure', dest='downloader_factory', action='store_const', - const=lambda: download_file_insecure, default=get_best_downloader, - help='Use internal, non-validating downloader' + "--insecure", + dest="downloader_factory", + action="store_const", + const=lambda: download_file_insecure, + default=get_best_downloader, + help="Use internal, non-validating downloader", ) parser.add_option( - '--version', help="Specify which version to download", - default=DEFAULT_VERSION, + "--version", help="Specify which version to download", default=DEFAULT_VERSION, ) options, args = parser.parse_args() # positional arguments are ignored @@ -347,5 +368,5 @@ def main(): return _install(archive, _build_install_args(options)) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/other_resources/scripts/build_omx.py b/other_resources/scripts/build_omx.py index 221393bf12..3312ad6f5a 100644 --- a/other_resources/scripts/build_omx.py +++ b/other_resources/scripts/build_omx.py @@ -5,23 +5,23 @@ import os -import pandas as pd import openmatrix as omx +import pandas as pd def read_manifest(manifest_file_name): column_map = { - 'Token': 'skim_key1', - 'TimePeriod': 'skim_key2', - 'File': 'source_file_name', - 'Matrix': 'source_key', - } - converters = { - col: str for col in column_map.keys() + "Token": "skim_key1", + "TimePeriod": "skim_key2", + "File": "source_file_name", + "Matrix": "source_key", } + converters = {col: str for col in column_map.keys()} - manifest = pd.read_csv(manifest_file_name, header=0, comment='#', converters=converters) + manifest = pd.read_csv( + manifest_file_name, header=0, comment="#", converters=converters + ) manifest.rename(columns=column_map, inplace=True) @@ -30,27 +30,32 @@ def read_manifest(manifest_file_name): def omx_getMatrix(omx_file_name, omx_key): - with omx.open_file(omx_file_name, 'r') as omx_file: + with omx.open_file(omx_file_name, "r") as omx_file: if omx_key not in omx_file.list_matrices(): - print "Source matrix with key '%s' not found in file '%s" % (omx_key, omx_file,) + print "Source matrix with key '%s' not found in file '%s" % ( + omx_key, + omx_file, + ) print omx_file.list_matrices() - raise RuntimeError("Source matrix with key '%s' not found in file '%s" - % (omx_key, omx_file,)) + raise RuntimeError( + "Source matrix with key '%s' not found in file '%s" + % (omx_key, omx_file,) + ) data = omx_file[omx_key] return data -manifest_dir = '.' -source_data_dir = '.' -dest_data_dir = '.' +manifest_dir = "." +source_data_dir = "." +dest_data_dir = "." -manifest_file_name = os.path.join(manifest_dir, 'skim_manifest.csv') -dest_file_name = os.path.join(dest_data_dir, 'skims.omx') +manifest_file_name = os.path.join(manifest_dir, "skim_manifest.csv") +dest_file_name = os.path.join(dest_data_dir, "skims.omx") -with omx.open_file(dest_file_name, 'a') as dest_omx: +with omx.open_file(dest_file_name, "a") as dest_omx: manifest = read_manifest(manifest_file_name) @@ -59,19 +64,27 @@ def omx_getMatrix(omx_file_name, omx_key): source_file_name = os.path.join(source_data_dir, row.source_file_name) if row.skim_key2: - dest_key = row.skim_key1 + '__' + row.skim_key2 + dest_key = row.skim_key1 + "__" + row.skim_key2 else: dest_key = row.skim_key1 - print "Reading '%s' from '%s' in %s" % (dest_key, row.source_key, source_file_name) - with omx.open_file(source_file_name, 'r') as source_omx: + print "Reading '%s' from '%s' in %s" % ( + dest_key, + row.source_key, + source_file_name, + ) + with omx.open_file(source_file_name, "r") as source_omx: if row.source_key not in source_omx.list_matrices(): - print "Source matrix with key '%s' not found in file '%s" \ - % (row.source_key, source_file_name,) + print "Source matrix with key '%s' not found in file '%s" % ( + row.source_key, + source_file_name, + ) print source_omx.list_matrices() - raise RuntimeError("Source matrix with key '%s' not found in file '%s" - % (row.source_key, dest_omx,)) + raise RuntimeError( + "Source matrix with key '%s' not found in file '%s" + % (row.source_key, dest_omx,) + ) data = source_omx[row.source_key] diff --git a/other_resources/scripts/create_sf_example.py b/other_resources/scripts/create_sf_example.py index c052317fb3..de6c7758d1 100644 --- a/other_resources/scripts/create_sf_example.py +++ b/other_resources/scripts/create_sf_example.py @@ -2,12 +2,11 @@ # See full license in LICENSE.txt. import os +import sys import numpy as np -import pandas as pd import openmatrix as omx - -import sys +import pandas as pd # currently hdf5 written with python3 works with both p2.7 and p3, # but reading hdf5 built with p2.7 (tables==3.4.4) p3 throws a ValueError reading land_use_taz: @@ -18,7 +17,7 @@ # input files, SF county is zones 1 to 190, output files source_store = "/Users/jeff.doyle/work/activitysim-data/mtc_tm1/data/mtc_asim.h5" -source_skims = '/Users/jeff.doyle/work/activitysim-data/mtc_tm1/data/skims.omx' +source_skims = "/Users/jeff.doyle/work/activitysim-data/mtc_tm1/data/skims.omx" dest_data_dir = "/Users/jeff.doyle/work/activitysim-data" @@ -28,42 +27,46 @@ def create_subset(dest_store, dest_skims, maxZone, households_sample_size=0): dest_store_path = os.path.join(dest_data_dir, dest_store) dest_skims_path = os.path.join(dest_data_dir, dest_skims) - print('land_use_taz') - df = pd.read_hdf(source_store, 'land_use_taz') + print("land_use_taz") + df = pd.read_hdf(source_store, "land_use_taz") df = df[df.index <= maxZone] - df.to_hdf(dest_store_path, 'land_use_taz') + df.to_hdf(dest_store_path, "land_use_taz") del df - print('households') - hh_df = pd.read_hdf(source_store, 'households') + print("households") + hh_df = pd.read_hdf(source_store, "households") hh_df = hh_df[hh_df.TAZ <= maxZone] if households_sample_size: - hh_df = hh_df.take(np.random.choice(len(hh_df), size=households_sample_size, replace=False)) - hh_df.to_hdf(dest_store_path, 'households') + hh_df = hh_df.take( + np.random.choice(len(hh_df), size=households_sample_size, replace=False) + ) + hh_df.to_hdf(dest_store_path, "households") - print('persons') - per_df = pd.read_hdf(source_store, 'persons') + print("persons") + per_df = pd.read_hdf(source_store, "persons") per_df = per_df[per_df.household_id.isin(hh_df.index)] - per_df.to_hdf(dest_store_path, 'persons') + per_df.to_hdf(dest_store_path, "persons") # process all skims skims = omx.open_file(source_skims) - skims_out = omx.open_file(dest_skims_path, 'w') + skims_out = omx.open_file(dest_skims_path, "w") skimsToProcess = skims.list_matrices() for skimName in skimsToProcess: print(skimName) skims_out[skimName] = skims[skimName][0:maxZone, 0:maxZone] - skims_out[skimName].attrs.TITLE = '' # remove funny character for OMX viewer + skims_out[skimName].attrs.TITLE = "" # remove funny character for OMX viewer -create_subset(dest_store='mtc_tm1_sf/data/mtc_asim.h5', - dest_skims='mtc_tm1_sf/data/skims.omx', - maxZone=190 - ) +create_subset( + dest_store="mtc_tm1_sf/data/mtc_asim.h5", + dest_skims="mtc_tm1_sf/data/skims.omx", + maxZone=190, +) -create_subset(dest_store='mtc_tm1_test/data/mtc_asim.h5', - dest_skims='mtc_tm1_test/data/skims.omx', - maxZone=25, - households_sample_size=5000 - ) +create_subset( + dest_store="mtc_tm1_test/data/mtc_asim.h5", + dest_skims="mtc_tm1_test/data/skims.omx", + maxZone=25, + households_sample_size=5000, +) diff --git a/other_resources/scripts/make_pipeline_output.py b/other_resources/scripts/make_pipeline_output.py index c511b78cd3..4dd69d4a8b 100644 --- a/other_resources/scripts/make_pipeline_output.py +++ b/other_resources/scripts/make_pipeline_output.py @@ -1,48 +1,56 @@ - # create table of pipeline table fields by creator # Ben Stabler, ben.stabler@rsginc.com, 06/06/18 # C:\projects\development\activitysim\example>python ../scripts/make_pipeline_output.py import pandas as pd -pipeline_filename = 'output\\pipeline.h5' -out_fields_filename = 'output\\pipeline_fields.csv' +pipeline_filename = "output\\pipeline.h5" +out_fields_filename = "output\\pipeline_fields.csv" # get pipeline tables pipeline = pd.io.pytables.HDFStore(pipeline_filename) -checkpoints = pipeline['/checkpoints'] +checkpoints = pipeline["/checkpoints"] p_tables = pipeline.keys() -p_tables.remove('/checkpoints') -p_tables_tables = [i.split('/')[1] for i in p_tables] -p_tables_cps = [i.split('/')[2] for i in p_tables] +p_tables.remove("/checkpoints") +p_tables_tables = [i.split("/")[1] for i in p_tables] +p_tables_cps = [i.split("/")[2] for i in p_tables] p_tables = pd.DataFrame({"table": p_tables_tables, "cp": p_tables_cps}) # join timestamps and sort -cp_times = checkpoints[['checkpoint_name', 'timestamp']] -cp_times = cp_times.set_index('checkpoint_name') -p_tables = p_tables.join(cp_times, on='cp') -p_tables = p_tables.sort_values(by=['table', 'timestamp']) +cp_times = checkpoints[["checkpoint_name", "timestamp"]] +cp_times = cp_times.set_index("checkpoint_name") +p_tables = p_tables.join(cp_times, on="cp") +p_tables = p_tables.sort_values(by=["table", "timestamp"]) # build table of fields for each table by creator # TABLE, FIELD, DTYPE, CREATOR, NCOL, NROW tables_fields = dict() for i in range(len(p_tables)): - cur_table = p_tables['table'].iloc[i] - cur_cp = p_tables['cp'].iloc[i] + cur_table = p_tables["table"].iloc[i] + cur_cp = p_tables["cp"].iloc[i] - print('process ' + cur_table + '/' + cur_cp) + print("process " + cur_table + "/" + cur_cp) - cur_table_data = pipeline['/' + cur_table + '/' + cur_cp] + cur_table_data = pipeline["/" + cur_table + "/" + cur_cp] cur_table_data_fields = cur_table_data.dtypes.index cur_table_data_dtypes = cur_table_data.dtypes.values cur_table_data_nrow = len(cur_table_data) cur_table_data_ncol = len(cur_table_data.columns) - table_data = pd.DataFrame({'TABLE': cur_table, 'FIELD': cur_table_data_fields, - 'DTYPE': cur_table_data_dtypes, 'CREATOR': cur_cp, - 'NCOL': cur_table_data_ncol, 'NROW': cur_table_data_nrow}) - table_data = table_data[["TABLE", "FIELD", "DTYPE", "CREATOR", "NCOL", "NROW"]] # reorder + table_data = pd.DataFrame( + { + "TABLE": cur_table, + "FIELD": cur_table_data_fields, + "DTYPE": cur_table_data_dtypes, + "CREATOR": cur_cp, + "NCOL": cur_table_data_ncol, + "NROW": cur_table_data_nrow, + } + ) + table_data = table_data[ + ["TABLE", "FIELD", "DTYPE", "CREATOR", "NCOL", "NROW"] + ] # reorder if cur_table not in tables_fields.keys(): tables_fields[cur_table] = table_data diff --git a/other_resources/scripts/omx32.py b/other_resources/scripts/omx32.py index 12b897e056..53be977ec5 100644 --- a/other_resources/scripts/omx32.py +++ b/other_resources/scripts/omx32.py @@ -1,17 +1,18 @@ -import os -import pandas as pd -import openmatrix as omx -import numpy as np - import argparse +import os +import numpy as np +import openmatrix as omx +import pandas as pd -parser = argparse.ArgumentParser(description='crop PSRC raw_data') -parser.add_argument('input', metavar='input_file_name', type=str, nargs=1, - help=f"input omx file") +parser = argparse.ArgumentParser(description="crop PSRC raw_data") +parser.add_argument( + "input", metavar="input_file_name", type=str, nargs=1, help=f"input omx file" +) -parser.add_argument('output', metavar='output_file_name', type=str, nargs=1, - help=f"output omx file") +parser.add_argument( + "output", metavar="output_file_name", type=str, nargs=1, help=f"output omx file" +) args = parser.parse_args() @@ -25,10 +26,10 @@ # skim_data_type = np.float32 -omx_in = omx.open_file(input_file_name, 'r') +omx_in = omx.open_file(input_file_name, "r") print(f"omx_in shape {omx_in.shape()}") -omx_out = omx.open_file(output_file_name, 'w') +omx_out = omx.open_file(output_file_name, "w") for mapping_name in omx_in.listMappings(): offset_map = omx_in.mapentries(mapping_name) diff --git a/other_resources/scripts/simulation.py b/other_resources/scripts/simulation.py index 3fc082790e..b00ee62f88 100644 --- a/other_resources/scripts/simulation.py +++ b/other_resources/scripts/simulation.py @@ -1,45 +1,42 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import logging +import sys -from activitysim.core import mem -from activitysim.core import inject -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import mp_tasks -from activitysim.core import chunk +from activitysim.core import chunk, config, inject, mem, mp_tasks, pipeline, tracing # from activitysim import abm -logger = logging.getLogger('activitysim') +logger = logging.getLogger("activitysim") def cleanup_output_files(): - active_log_files = \ - [h.baseFilename for h in logger.root.handlers if isinstance(h, logging.FileHandler)] - tracing.delete_output_files('log', ignore=active_log_files) + active_log_files = [ + h.baseFilename + for h in logger.root.handlers + if isinstance(h, logging.FileHandler) + ] + tracing.delete_output_files("log", ignore=active_log_files) - tracing.delete_output_files('h5') - tracing.delete_output_files('csv') - tracing.delete_output_files('txt') - tracing.delete_output_files('yaml') - tracing.delete_output_files('prof') - tracing.delete_output_files('omx') + tracing.delete_output_files("h5") + tracing.delete_output_files("csv") + tracing.delete_output_files("txt") + tracing.delete_output_files("yaml") + tracing.delete_output_files("prof") + tracing.delete_output_files("omx") def run(run_list, injectables=None): - if run_list['multiprocess']: + if run_list["multiprocess"]: logger.info("run multiprocess simulation") mp_tasks.run_multiprocess(run_list, injectables) else: logger.info("run single process simulation") - pipeline.run(models=run_list['models'], resume_after=run_list['resume_after']) + pipeline.run(models=run_list["models"], resume_after=run_list["resume_after"]) pipeline.close_pipeline() mem.log_global_hwm() @@ -47,11 +44,11 @@ def run(run_list, injectables=None): def log_settings(injectables): settings = [ - 'households_sample_size', - 'chunk_size', - 'multiprocess', - 'num_processes', - 'resume_after', + "households_sample_size", + "chunk_size", + "multiprocess", + "num_processes", + "resume_after", ] for k in settings: @@ -61,10 +58,10 @@ def log_settings(injectables): logger.info("injectable %s: %s" % (k, inject.get_injectable(k))) -if __name__ == '__main__': +if __name__ == "__main__": - inject.add_injectable('data_dir', 'data') - inject.add_injectable('configs_dir', 'configs') + inject.add_injectable("data_dir", "data") + inject.add_injectable("configs_dir", "configs") config.handle_standard_args() @@ -76,14 +73,16 @@ def log_settings(injectables): t0 = tracing.print_elapsed_time() # cleanup if not resuming - if not config.setting('resume_after', False): + if not config.setting("resume_after", False): cleanup_output_files() run_list = mp_tasks.get_run_list() - if run_list['multiprocess']: + if run_list["multiprocess"]: # do this after config.handle_standard_args, as command line args may override injectables - injectables = list(set(injectables) | set(['data_dir', 'configs_dir', 'output_dir'])) + injectables = list( + set(injectables) | set(["data_dir", "configs_dir", "output_dir"]) + ) injectables = {k: inject.get_injectable(k) for k in injectables} else: injectables = None diff --git a/other_resources/scripts/verify_results.py b/other_resources/scripts/verify_results.py index aa288e3695..9c8fb8e326 100644 --- a/other_resources/scripts/verify_results.py +++ b/other_resources/scripts/verify_results.py @@ -1,22 +1,21 @@ - ############################################################# # ActivitySim verification against TM1 # Ben Stabler, ben.stabler@rsginc.com, 02/22/19 # C:\projects\activitysim\verification>python compare_results.py ############################################################# -import pandas as pd import openmatrix as omx +import pandas as pd ############################################################# # INPUTS ############################################################# -pipeline_filename = 'asim/pipeline.h5' +pipeline_filename = "asim/pipeline.h5" distance_matrix_filename = "asim/skims.omx" asim_nmtf_alts_filename = "asim/non_mandatory_tour_frequency_alternatives.csv" -process_sp = True # False skip work/sch shadow pricing comparisons, True do them +process_sp = True # False skip work/sch shadow pricing comparisons, True do them process_tm1 = True # False only processes asim, True processes tm1 as well asim_sp_work_filename = "asim/shadow_price_workplace_modeled_size_10.csv" @@ -50,13 +49,39 @@ # COMMON LABELS ############################################################# -ptypes = ["", "Full-time worker", "Part-time worker", "University student", "Non-worker", - "Retired", "Student of driving age", "Student of non-driving age", - "Child too young for school"] - -mode_labels = ["", "DRIVEALONEFREE", "DRIVEALONEPAY", "SHARED2FREE", "SHARED2PAY", "SHARED3FREE", - "SHARED3PAY", "WALK", "BIKE", "WALK_LOC", "WALK_LRF", "WALK_EXP", "WALK_HVY", - "WALK_COM", "DRIVE_LOC", "DRIVE_LRF", "DRIVE_EXP", "DRIVE_HVY", "DRIVE_COM"] +ptypes = [ + "", + "Full-time worker", + "Part-time worker", + "University student", + "Non-worker", + "Retired", + "Student of driving age", + "Student of non-driving age", + "Child too young for school", +] + +mode_labels = [ + "", + "DRIVEALONEFREE", + "DRIVEALONEPAY", + "SHARED2FREE", + "SHARED2PAY", + "SHARED3FREE", + "SHARED3PAY", + "WALK", + "BIKE", + "WALK_LOC", + "WALK_LRF", + "WALK_EXP", + "WALK_HVY", + "WALK_COM", + "DRIVE_LOC", + "DRIVE_LRF", + "DRIVE_EXP", + "DRIVE_HVY", + "DRIVE_COM", +] ############################################################# # DISTANCE SKIM @@ -115,23 +140,43 @@ if process_sp: if process_tm1: - tm1_markets = ["work_low", "work_med", "work_high", "work_high", "work_very high", "university", - "school_high", "school_grade"] + tm1_markets = [ + "work_low", + "work_med", + "work_high", + "work_high", + "work_very high", + "university", + "school_high", + "school_grade", + ] tm1 = pd.read_csv(tm1_sp_filename) tm1 = tm1.groupby(tm1["zone"]).sum() tm1["zone"] = tm1.index tm1 = tm1.loc[tm1["zone"] > 0] ws_size = tm1[["zone"]] for i in range(len(tm1_markets)): - ws_size[tm1_markets[i] + "_modeledDests"] = tm1[tm1_markets[i] + "_modeledDests"] + ws_size[tm1_markets[i] + "_modeledDests"] = tm1[ + tm1_markets[i] + "_modeledDests" + ] ws_size.to_csv("outputs/tm1_work_school_location.csv", na_rep=0) - asim_markets = ["work_low", "work_med", "work_high", "work_high", "work_veryhigh", "university", - "highschool", "gradeschool"] + asim_markets = [ + "work_low", + "work_med", + "work_high", + "work_high", + "work_veryhigh", + "university", + "highschool", + "gradeschool", + ] asim = pd.read_csv(asim_sp_work_filename) asim_sch = pd.read_csv(asim_sp_school_filename) asim_sch_no_sp = pd.read_csv(asim_sp_school_no_sp_filename) - asim_sch["gradeschool"] = asim_sch_no_sp["gradeschool"] # grade school not shadow priced + asim_sch["gradeschool"] = asim_sch_no_sp[ + "gradeschool" + ] # grade school not shadow priced asim = asim.set_index("TAZ", drop=False) asim_sch = asim_sch.set_index("TAZ", drop=False) @@ -153,18 +198,30 @@ if process_tm1: tm1_work = pd.read_csv(tm1_work_filename) tm1_work["HomeCounty"] = tazs["COUNTYNAME"].loc[tm1_work["HomeTAZ"]].tolist() - tm1_work["WorkCounty"] = tazs["COUNTYNAME"].loc[tm1_work["WorkLocation"]].tolist() - tm1_work_counties = tm1_work.groupby(["HomeCounty", "WorkCounty"]).count()["HHID"] + tm1_work["WorkCounty"] = ( + tazs["COUNTYNAME"].loc[tm1_work["WorkLocation"]].tolist() + ) + tm1_work_counties = tm1_work.groupby(["HomeCounty", "WorkCounty"]).count()[ + "HHID" + ] tm1_work_counties = tm1_work_counties.reset_index() - tm1_work_counties = tm1_work_counties.pivot(index="HomeCounty", columns="WorkCounty") + tm1_work_counties = tm1_work_counties.pivot( + index="HomeCounty", columns="WorkCounty" + ) tm1_work_counties.to_csv("outputs/tm1_work_counties.csv", na_rep=0) asim_cdap = pd.read_csv(asim_per_filename) asim_cdap["HomeCounty"] = tazs["COUNTYNAME"].loc[asim_cdap["home_taz"]].tolist() - asim_cdap["WorkCounty"] = tazs["COUNTYNAME"].loc[asim_cdap["workplace_zone_id"]].tolist() - asim_work_counties = asim_cdap.groupby(["HomeCounty", "WorkCounty"]).count()["household_id"] + asim_cdap["WorkCounty"] = ( + tazs["COUNTYNAME"].loc[asim_cdap["workplace_zone_id"]].tolist() + ) + asim_work_counties = asim_cdap.groupby(["HomeCounty", "WorkCounty"]).count()[ + "household_id" + ] asim_work_counties = asim_work_counties.reset_index() - asim_work_counties = asim_work_counties.pivot(index="HomeCounty", columns="WorkCounty") + asim_work_counties = asim_work_counties.pivot( + index="HomeCounty", columns="WorkCounty" + ) asim_work_counties.to_csv("outputs/asim_work_counties.csv", na_rep=0) # auto ownership - count of hhs by num autos by taz @@ -204,10 +261,12 @@ if process_tm1: tm1_per = pd.read_csv(tm1_per_filename) - tm1_per["fp_choice"] = (tm1_per["fp_choice"] == 1) # 1=free, 2==pay + tm1_per["fp_choice"] = tm1_per["fp_choice"] == 1 # 1=free, 2==pay tm1_work = pd.read_csv(tm1_work_filename) tm1_work = tm1_work.set_index("PersonID", drop=False) - tm1_per["WorkLocation"] = tm1_work["WorkLocation"].loc[tm1_per["person_id"]].tolist() + tm1_per["WorkLocation"] = ( + tm1_work["WorkLocation"].loc[tm1_per["person_id"]].tolist() + ) tm1_fp = tm1_per[tm1_per["WorkLocation"] > 0] tm1_fp = tm1_fp.groupby(["type", "fp_choice"]).count()["hh_id"] tm1_fp = tm1_fp.reset_index() @@ -215,7 +274,9 @@ tm1_fp.to_csv("outputs/tm1_fp.csv", na_rep=0) asim_cdap["ptypename"] = pd.Series(ptypes)[asim_cdap["ptype"].tolist()].tolist() -asim_fp = asim_cdap.groupby(["ptypename", "free_parking_at_work"]).count()["household_id"] +asim_fp = asim_cdap.groupby(["ptypename", "free_parking_at_work"]).count()[ + "household_id" +] asim_fp = asim_fp.reset_index() asim_fp = asim_fp.pivot(index="ptypename", columns="free_parking_at_work") asim_fp.to_csv("outputs/asim_fp.csv", na_rep=0) @@ -225,11 +286,15 @@ if process_tm1: tm1_per = pd.read_csv(tm1_per_filename) tm1_per["vot_bin"] = pd.cut(tm1_per["value_of_time"], range(51)) - tm1_per.groupby(["vot_bin"]).count()["hh_id"].to_csv("outputs/tm1_vot.csv", na_rep=0) + tm1_per.groupby(["vot_bin"]).count()["hh_id"].to_csv( + "outputs/tm1_vot.csv", na_rep=0 + ) asim_per = pd.read_csv(asim_per_filename) asim_per["vot_bin"] = pd.cut(asim_per["value_of_time"], range(51)) -asim_per.groupby(["vot_bin"]).count()["household_id"].to_csv("outputs/asim_vot.csv", na_rep=0) +asim_per.groupby(["vot_bin"]).count()["household_id"].to_csv( + "outputs/asim_vot.csv", na_rep=0 +) ############################################################# # TOUR @@ -245,7 +310,9 @@ tm1_hh = tm1_hh.set_index("hh_id", drop=False) tm1_per["hhsize"] = tm1_hh["size"].loc[tm1_per["hh_id"]].tolist() # indexing starts at 1 - tm1_per["imf_choice_name"] = pd.Series(tm1_imf_codes)[(tm1_per["imf_choice"]+1).tolist()].tolist() + tm1_per["imf_choice_name"] = pd.Series(tm1_imf_codes)[ + (tm1_per["imf_choice"] + 1).tolist() + ].tolist() tm1_imf = tm1_per.groupby(["type", "imf_choice_name"]).count()["hh_id"] tm1_imf = tm1_imf.reset_index() tm1_imf = tm1_imf.pivot(index="type", columns="imf_choice_name") @@ -257,10 +324,15 @@ asim_cdap["ptypename"] = pd.Series(ptypes)[asim_cdap["ptype"].tolist()].tolist() asim_imf = pd.read_csv(asim_per_filename) asim_imf["ptypename"] = pd.Series(ptypes)[asim_imf["ptype"].tolist()].tolist() -asim_imf["mandatory_tour_frequency"] = pd.Categorical(asim_imf["mandatory_tour_frequency"], - categories=tm1_imf_codes) -asim_imf["mandatory_tour_frequency"][asim_imf["mandatory_tour_frequency"].isnull()] = "0" -asim_imf = asim_imf.groupby(["ptypename", "mandatory_tour_frequency"]).count()["household_id"] +asim_imf["mandatory_tour_frequency"] = pd.Categorical( + asim_imf["mandatory_tour_frequency"], categories=tm1_imf_codes +) +asim_imf["mandatory_tour_frequency"][ + asim_imf["mandatory_tour_frequency"].isnull() +] = "0" +asim_imf = asim_imf.groupby(["ptypename", "mandatory_tour_frequency"]).count()[ + "household_id" +] asim_imf = asim_imf.reset_index() asim_imf = asim_imf.pivot(index="ptypename", columns="mandatory_tour_frequency") asim_imf.to_csv("outputs/asim_imtf.csv", na_rep=0) @@ -272,44 +344,79 @@ tm1_tours = tm1_tours[tm1_tours["tour_category"] == "MANDATORY"] tm1_tours["tour_purpose"][tm1_tours["tour_purpose"].str.contains("work")] = "work" tm1_tours["tour_purpose"][tm1_tours["tour_purpose"].str.contains("s")] = "school" - tm1_mtdd = tm1_tours.groupby(["start_hour", "end_hour", "tour_purpose"]).count()["hh_id"] + tm1_mtdd = tm1_tours.groupby(["start_hour", "end_hour", "tour_purpose"]).count()[ + "hh_id" + ] tm1_mtdd = tm1_mtdd.reset_index() - tm1_mtdd_sch = tm1_mtdd[tm1_mtdd["tour_purpose"] == "school"][[ - "start_hour", "end_hour", "hh_id"]].pivot(index="start_hour", columns="end_hour") - tm1_mtdd_work = tm1_mtdd[tm1_mtdd["tour_purpose"] == "work"][[ - "start_hour", "end_hour", "hh_id"]].pivot(index="start_hour", columns="end_hour") + tm1_mtdd_sch = tm1_mtdd[tm1_mtdd["tour_purpose"] == "school"][ + ["start_hour", "end_hour", "hh_id"] + ].pivot(index="start_hour", columns="end_hour") + tm1_mtdd_work = tm1_mtdd[tm1_mtdd["tour_purpose"] == "work"][ + ["start_hour", "end_hour", "hh_id"] + ].pivot(index="start_hour", columns="end_hour") tm1_mtdd_sch.to_csv("outputs/tm1_mtdd_school.csv", na_rep=0) tm1_mtdd_work.to_csv("outputs/tm1_mtdd_work.csv", na_rep=0) asim_tours = pd.read_csv(asim_tour_filename) asim_tours_man = asim_tours[asim_tours["tour_category"] == "mandatory"] -asim_mtdd = asim_tours_man.groupby(["start", "end", "tour_type"]).count()["household_id"] +asim_mtdd = asim_tours_man.groupby(["start", "end", "tour_type"]).count()[ + "household_id" +] asim_mtdd = asim_mtdd.reset_index() -asim_mtdd_sch = asim_mtdd[asim_mtdd["tour_type"] == "school"][[ - "start", "end", "household_id"]].pivot(index="start", columns="end") -asim_mtdd_work = asim_mtdd[asim_mtdd["tour_type"] == "work"][[ - "start", "end", "household_id"]].pivot(index="start", columns="end") +asim_mtdd_sch = asim_mtdd[asim_mtdd["tour_type"] == "school"][ + ["start", "end", "household_id"] +].pivot(index="start", columns="end") +asim_mtdd_work = asim_mtdd[asim_mtdd["tour_type"] == "work"][ + ["start", "end", "household_id"] +].pivot(index="start", columns="end") asim_mtdd_sch.to_csv("outputs/asim_mtdd_school.csv", na_rep=0) asim_mtdd_work.to_csv("outputs/asim_mtdd_work.csv", na_rep=0) # joint tour frequency -jtf_labels = ["", "0_tours", "1_Shop", "1_Main", "1_Eat", "1_Visit", "1_Disc", - "2_SS", "2_SM", "2_SE", "2_SV", "2_SD", "2_MM", "2_ME", "2_MV", "2_MD", "2_EE", - "2_EV", "2_ED", "2_VV", "2_VD", "2_DD"] +jtf_labels = [ + "", + "0_tours", + "1_Shop", + "1_Main", + "1_Eat", + "1_Visit", + "1_Disc", + "2_SS", + "2_SM", + "2_SE", + "2_SV", + "2_SD", + "2_MM", + "2_ME", + "2_MV", + "2_MD", + "2_EE", + "2_EV", + "2_ED", + "2_VV", + "2_VD", + "2_DD", +] if process_tm1: tm1_jtf = tm1_hh tm1_jtf = tm1_jtf[tm1_jtf["jtf_choice"] > 0] - tm1_jtf["jtf_choice_label"] = pd.Series(jtf_labels)[tm1_jtf["jtf_choice"].tolist()].tolist() - tm1_jtf.groupby("jtf_choice_label").count()["hh_id"].to_csv("outputs/tm1_jtf.csv", na_rep=0) + tm1_jtf["jtf_choice_label"] = pd.Series(jtf_labels)[ + tm1_jtf["jtf_choice"].tolist() + ].tolist() + tm1_jtf.groupby("jtf_choice_label").count()["hh_id"].to_csv( + "outputs/tm1_jtf.csv", na_rep=0 + ) asim_jtf = pd.read_csv(asim_hh_filename) asim_jtf = asim_jtf[asim_jtf["joint_tour_frequency"] != ""] -asim_jtf.groupby("joint_tour_frequency").count()["household_id"].to_csv("outputs/asim_jtf.csv", na_rep=0) +asim_jtf.groupby("joint_tour_frequency").count()["household_id"].to_csv( + "outputs/asim_jtf.csv", na_rep=0 +) # joint tour comp @@ -317,15 +424,22 @@ tm1_jtours = pd.read_csv(tm1_jtour_filename) comp_labels = ["", "adult", "children", "mixed"] tm1_jtours["tour_composition_labels"] = pd.Series(comp_labels)[ - tm1_jtours["tour_composition"].tolist()].tolist() - tm1_jtour_comp = tm1_jtours.groupby(["tour_purpose", "tour_composition_labels"]).count()["hh_id"] + tm1_jtours["tour_composition"].tolist() + ].tolist() + tm1_jtour_comp = tm1_jtours.groupby( + ["tour_purpose", "tour_composition_labels"] + ).count()["hh_id"] tm1_jtour_comp = tm1_jtour_comp.reset_index() - tm1_jtour_comp = tm1_jtour_comp.pivot(index="tour_purpose", columns="tour_composition_labels") + tm1_jtour_comp = tm1_jtour_comp.pivot( + index="tour_purpose", columns="tour_composition_labels" + ) tm1_jtour_comp.to_csv("outputs/tm1_jtour_comp.csv", na_rep=0) asim_jtours = pd.read_csv(asim_tour_filename) asim_jtours = asim_jtours[asim_jtours["tour_category"] == "joint"] -asim_jtour_comp = asim_jtours.groupby(["tour_type", "composition"]).count()["household_id"] +asim_jtour_comp = asim_jtours.groupby(["tour_type", "composition"]).count()[ + "household_id" +] asim_jtour_comp = asim_jtour_comp.reset_index() asim_jtour_comp = asim_jtour_comp.pivot(index="tour_type", columns="composition") asim_jtour_comp.to_csv("outputs/asim_jtour_comp.csv", na_rep=0) @@ -333,14 +447,21 @@ # joint tour destination if process_tm1: - tm1_jtours["distance"] = distmat[tm1_jtours["orig_taz"]-1, tm1_jtours["dest_taz"]-1] + tm1_jtours["distance"] = distmat[ + tm1_jtours["orig_taz"] - 1, tm1_jtours["dest_taz"] - 1 + ] tm1_jtours["dist_bin"] = pd.cut(tm1_jtours["distance"], range(51)) - tm1_jtours.groupby(["dist_bin"]).count()["hh_id"].to_csv("outputs/tm1_jtour_dist.csv", na_rep=0) + tm1_jtours.groupby(["dist_bin"]).count()["hh_id"].to_csv( + "outputs/tm1_jtour_dist.csv", na_rep=0 + ) -asim_jtours["distance"] = distmat[asim_jtours["origin"].astype(int)-1, - asim_jtours["destination"].astype(int)-1] +asim_jtours["distance"] = distmat[ + asim_jtours["origin"].astype(int) - 1, asim_jtours["destination"].astype(int) - 1 +] asim_jtours["dist_bin"] = pd.cut(asim_jtours["distance"], range(51)) -asim_jtours.groupby(["dist_bin"]).count()["household_id"].to_csv("outputs/asim_jtour_dist.csv", na_rep=0) +asim_jtours.groupby(["dist_bin"]).count()["household_id"].to_csv( + "outputs/asim_jtour_dist.csv", na_rep=0 +) # joint tour tdd @@ -371,7 +492,9 @@ asim_per_nmtf = pd.read_csv(asim_per_filename) asim_per_nmtf["ptypename"] = pd.Series(ptypes)[asim_per_nmtf["ptype"].tolist()].tolist() -asim_nmtf_sum = asim_per_nmtf.groupby(["non_mandatory_tour_frequency"]).count()["household_id"] +asim_nmtf_sum = asim_per_nmtf.groupby(["non_mandatory_tour_frequency"]).count()[ + "household_id" +] asim_alts = pd.concat([alts, asim_nmtf_sum], axis=1) asim_alts.to_csv("outputs/asim_nmtf.csv", na_rep=0) @@ -379,17 +502,25 @@ if process_tm1: tm1_tours = pd.read_csv(tm1_tour_filename) - tm1_tours["distance"] = distmat[tm1_tours["orig_taz"]-1, tm1_tours["dest_taz"]-1] + tm1_tours["distance"] = distmat[ + tm1_tours["orig_taz"] - 1, tm1_tours["dest_taz"] - 1 + ] tm1_tours["dist_bin"] = pd.cut(tm1_tours["distance"], range(51)) tm1_tours_nm = tm1_tours[tm1_tours["tour_category"] == "INDIVIDUAL_NON_MANDATORY"] - tm1_tours_nm.groupby(["dist_bin"]).count()["hh_id"].to_csv("outputs/tm1_nmtd_dist.csv", na_rep=0) + tm1_tours_nm.groupby(["dist_bin"]).count()["hh_id"].to_csv( + "outputs/tm1_nmtd_dist.csv", na_rep=0 + ) asim_nm_tours = pd.read_csv(asim_tour_filename) asim_nm_tours = asim_nm_tours[asim_nm_tours["tour_category"] == "non_mandatory"] -asim_nm_tours["distance"] = distmat[asim_nm_tours["origin"].astype(int)-1, - asim_nm_tours["destination"].astype(int)-1] +asim_nm_tours["distance"] = distmat[ + asim_nm_tours["origin"].astype(int) - 1, + asim_nm_tours["destination"].astype(int) - 1, +] asim_nm_tours["dist_bin"] = pd.cut(asim_nm_tours["distance"], range(51)) -asim_nm_tours.groupby(["dist_bin"]).count()["household_id"].to_csv("outputs/asim_nmtd_dist.csv", na_rep=0) +asim_nm_tours.groupby(["dist_bin"]).count()["household_id"].to_csv( + "outputs/asim_nmtd_dist.csv", na_rep=0 +) # non_mandatory_tour_scheduling @@ -409,19 +540,33 @@ if process_tm1: tm1_tours = pd.read_csv(tm1_tour_filename) tm1_jtours = pd.read_csv(tm1_jtour_filename) - tm1_tours["tour_mode_labels"] = pd.Series(mode_labels)[tm1_tours["tour_mode"].tolist()].tolist() - tm1_tours["tour_mode_labels"] = pd.Categorical(tm1_tours["tour_mode_labels"], - categories=mode_labels) - tm1_jtours["tour_mode_labels"] = pd.Series(mode_labels)[tm1_jtours["tour_mode"].tolist()].tolist() - tm1_jtours["tour_mode_labels"] = pd.Categorical(tm1_jtours["tour_mode_labels"], - categories=mode_labels) - tm1_nmn_tour_mode = tm1_tours.groupby(["tour_mode_labels", "tour_category"]).count()["hh_id"] + tm1_tours["tour_mode_labels"] = pd.Series(mode_labels)[ + tm1_tours["tour_mode"].tolist() + ].tolist() + tm1_tours["tour_mode_labels"] = pd.Categorical( + tm1_tours["tour_mode_labels"], categories=mode_labels + ) + tm1_jtours["tour_mode_labels"] = pd.Series(mode_labels)[ + tm1_jtours["tour_mode"].tolist() + ].tolist() + tm1_jtours["tour_mode_labels"] = pd.Categorical( + tm1_jtours["tour_mode_labels"], categories=mode_labels + ) + tm1_nmn_tour_mode = tm1_tours.groupby( + ["tour_mode_labels", "tour_category"] + ).count()["hh_id"] tm1_nmn_tour_mode = tm1_nmn_tour_mode.reset_index() - tm1_nmn_tour_mode = tm1_nmn_tour_mode.pivot(index="tour_mode_labels", columns="tour_category") + tm1_nmn_tour_mode = tm1_nmn_tour_mode.pivot( + index="tour_mode_labels", columns="tour_category" + ) - tm1_jtour_mode = tm1_jtours.groupby(["tour_mode_labels", "tour_category"]).count()["hh_id"] + tm1_jtour_mode = tm1_jtours.groupby(["tour_mode_labels", "tour_category"]).count()[ + "hh_id" + ] tm1_jtour_mode = tm1_jtour_mode.reset_index() - tm1_jtour_mode = tm1_jtour_mode.pivot(index="tour_mode_labels", columns="tour_category") + tm1_jtour_mode = tm1_jtour_mode.pivot( + index="tour_mode_labels", columns="tour_category" + ) tm1_tour_mode = pd.concat([tm1_nmn_tour_mode, tm1_jtour_mode], axis=1) tm1_tour_mode.columns = ["atwork", "non_mandatory", "mandatory", "joint"] @@ -429,8 +574,12 @@ tm1_tour_mode.to_csv("outputs/tm1_tour_mode.csv", na_rep=0) asim_tours = pd.read_csv(asim_tour_filename) -asim_tours["tour_mode"] = pd.Categorical(asim_tours["tour_mode"], categories=mode_labels) -asim_tour_mode = asim_tours.groupby(["tour_mode", "tour_category"]).count()["household_id"] +asim_tours["tour_mode"] = pd.Categorical( + asim_tours["tour_mode"], categories=mode_labels +) +asim_tour_mode = asim_tours.groupby(["tour_mode", "tour_category"]).count()[ + "household_id" +] asim_tour_mode = asim_tour_mode.reset_index() asim_tour_mode = asim_tour_mode.pivot(index="tour_mode", columns="tour_category") asim_tour_mode.to_csv("outputs/asim_tour_mode.csv", na_rep=0) @@ -439,36 +588,57 @@ if process_tm1: tm1_work_tours = tm1_tours[tm1_tours["tour_purpose"].str.startswith("work")] - tm1_atwork_freq_strs = ["", "no_subtours", "eat", "business1", - "maint", "business2", "eat_business"] + tm1_atwork_freq_strs = [ + "", + "no_subtours", + "eat", + "business1", + "maint", + "business2", + "eat_business", + ] tm1_work_tours["atWork_freq_str"] = pd.Series(tm1_atwork_freq_strs)[ - tm1_work_tours["atWork_freq"].tolist()].tolist() - tm1_work_tours.groupby(["atWork_freq_str"]).count()["hh_id"].to_csv("outputs/tm1_atwork_tf.csv", na_rep=0) + tm1_work_tours["atWork_freq"].tolist() + ].tolist() + tm1_work_tours.groupby(["atWork_freq_str"]).count()["hh_id"].to_csv( + "outputs/tm1_atwork_tf.csv", na_rep=0 + ) asim_work_tours = asim_tours[asim_tours["primary_purpose"] == "work"] -asim_work_tours.groupby(["atwork_subtour_frequency"]).count()["household_id"].to_csv("outputs/asim_atwork_tf.csv", - na_rep=0) +asim_work_tours.groupby(["atwork_subtour_frequency"]).count()["household_id"].to_csv( + "outputs/asim_atwork_tf.csv", na_rep=0 +) # atwork_subtour_destination if process_tm1: tm1_tours = pd.read_csv(tm1_tour_filename) - tm1_tours["distance"] = distmat[tm1_tours["orig_taz"]-1, tm1_tours["dest_taz"]-1] + tm1_tours["distance"] = distmat[ + tm1_tours["orig_taz"] - 1, tm1_tours["dest_taz"] - 1 + ] tm1_tours_atw = tm1_tours[tm1_tours["tour_category"] == "AT_WORK"] tm1_tours_atw["dist_bin"] = pd.cut(tm1_tours_atw["distance"], range(51)) - tm1_tours_atw.groupby(["dist_bin"]).count()["hh_id"].to_csv("outputs/tm1_atwork_dist.csv", na_rep=0) + tm1_tours_atw.groupby(["dist_bin"]).count()["hh_id"].to_csv( + "outputs/tm1_atwork_dist.csv", na_rep=0 + ) asim_atw_tours = pd.read_csv(asim_tour_filename) asim_atw_tours = asim_atw_tours[asim_atw_tours["tour_category"] == "atwork"] -asim_atw_tours["distance"] = distmat[asim_atw_tours["origin"].astype(int)-1, - asim_atw_tours["destination"].astype(int)-1] +asim_atw_tours["distance"] = distmat[ + asim_atw_tours["origin"].astype(int) - 1, + asim_atw_tours["destination"].astype(int) - 1, +] asim_atw_tours["dist_bin"] = pd.cut(asim_atw_tours["distance"], range(51)) -asim_atw_tours.groupby(["dist_bin"]).count()["household_id"].to_csv("outputs/asim_atwork_dist.csv", na_rep=0) +asim_atw_tours.groupby(["dist_bin"]).count()["household_id"].to_csv( + "outputs/asim_atwork_dist.csv", na_rep=0 +) # atwork_subtour_scheduling if process_tm1: - tm1_tours_atw_tdd = tm1_tours_atw.groupby(["start_hour", "end_hour"]).count()["hh_id"] + tm1_tours_atw_tdd = tm1_tours_atw.groupby(["start_hour", "end_hour"]).count()[ + "hh_id" + ] tm1_tours_atw_tdd = tm1_tours_atw_tdd.reset_index() tm1_tours_atw_tdd = tm1_tours_atw_tdd.pivot(index="start_hour", columns="end_hour") tm1_tours_atw_tdd.to_csv("outputs/tm1_atwork_tours_tdd.csv", na_rep=0) @@ -487,46 +657,75 @@ tm1_jtours = pd.read_csv(tm1_jtour_filename) tm1_tours["tour_purpose_simple"] = tm1_tours["tour_purpose"] - tm1_tours["tour_purpose_simple"] = tm1_tours["tour_purpose_simple"].str.replace("atwork_", "") - tm1_tours["tour_purpose_simple"][tm1_tours["tour_purpose_simple"]. - str.contains("work_")] = "work" - tm1_tours["tour_purpose_simple"][tm1_tours["tour_purpose_simple"]. - str.contains("school_")] = "school" - tm1_tours["tour_purpose_simple"][tm1_tours["tour_purpose_simple"]. - str.contains("university")] = "school" - tm1_tours["tour_purpose_simple"][tm1_tours["tour_purpose_simple"]. - str.contains("escort_")] = "escort" + tm1_tours["tour_purpose_simple"] = tm1_tours["tour_purpose_simple"].str.replace( + "atwork_", "" + ) + tm1_tours["tour_purpose_simple"][ + tm1_tours["tour_purpose_simple"].str.contains("work_") + ] = "work" + tm1_tours["tour_purpose_simple"][ + tm1_tours["tour_purpose_simple"].str.contains("school_") + ] = "school" + tm1_tours["tour_purpose_simple"][ + tm1_tours["tour_purpose_simple"].str.contains("university") + ] = "school" + tm1_tours["tour_purpose_simple"][ + tm1_tours["tour_purpose_simple"].str.contains("escort_") + ] = "escort" tm1_tours_atw = tm1_tours[tm1_tours["tour_category"] == "AT_WORK"] tm1_tours_nmn = tm1_tours[tm1_tours["tour_category"] != "AT_WORK"] - tm1_tours_nmn["tsf"] = tm1_tours_nmn[ - "num_ob_stops"].astype(str) + "-" + tm1_tours_nmn["num_ib_stops"].astype(str) - tm1_stop_freq = tm1_tours_nmn.groupby(["tsf", "tour_purpose_simple"]).count()["hh_id"] + tm1_tours_nmn["tsf"] = ( + tm1_tours_nmn["num_ob_stops"].astype(str) + + "-" + + tm1_tours_nmn["num_ib_stops"].astype(str) + ) + tm1_stop_freq = tm1_tours_nmn.groupby(["tsf", "tour_purpose_simple"]).count()[ + "hh_id" + ] tm1_stop_freq = tm1_stop_freq.reset_index() tm1_stop_freq = tm1_stop_freq.pivot(index="tsf", columns="tour_purpose_simple") - tm1_jtours["tsf"] = tm1_jtours[ - "num_ob_stops"].astype(str) + "-" + tm1_jtours["num_ib_stops"].astype(str) - tm1_tours_atw["tsf"] = tm1_tours_atw[ - "num_ob_stops"].astype(str) + "-" + tm1_tours_atw["num_ib_stops"].astype(str) + tm1_jtours["tsf"] = ( + tm1_jtours["num_ob_stops"].astype(str) + + "-" + + tm1_jtours["num_ib_stops"].astype(str) + ) + tm1_tours_atw["tsf"] = ( + tm1_tours_atw["num_ob_stops"].astype(str) + + "-" + + tm1_tours_atw["num_ib_stops"].astype(str) + ) tm1_stop_freq_joint = tm1_jtours.groupby(["tsf"]).count()["hh_id"] tm1_stop_freq_atwork = tm1_tours_atw.groupby(["tsf"]).count()["hh_id"] - tm1_stop_freq = pd.concat([tm1_stop_freq, tm1_stop_freq_joint, tm1_stop_freq_atwork], axis=1) + tm1_stop_freq = pd.concat( + [tm1_stop_freq, tm1_stop_freq_joint, tm1_stop_freq_atwork], axis=1 + ) tm1_stop_freq.to_csv("outputs/tm1_stop_freq.csv", na_rep=0) asim_tours = pd.read_csv(asim_tour_filename) -asim_nmn_tours = asim_tours[(asim_tours["tour_category"] == "mandatory") | - (asim_tours["tour_category"] == "non_mandatory")] +asim_nmn_tours = asim_tours[ + (asim_tours["tour_category"] == "mandatory") + | (asim_tours["tour_category"] == "non_mandatory") +] asim_joint_tours = asim_tours[asim_tours["tour_category"] == "joint"] asim_atw_tours = asim_tours[asim_tours["tour_category"] == "atwork"] -asim_stop_freq = asim_nmn_tours.groupby(["stop_frequency", "tour_type"]).count()["household_id"] +asim_stop_freq = asim_nmn_tours.groupby(["stop_frequency", "tour_type"]).count()[ + "household_id" +] asim_stop_freq = asim_stop_freq.reset_index() asim_stop_freq = asim_stop_freq.pivot(index="stop_frequency", columns="tour_type") -asim_stop_freq_joint = asim_joint_tours.groupby(["stop_frequency"]).count()["household_id"] -asim_stop_freq_atwork = asim_atw_tours.groupby(["stop_frequency"]).count()["household_id"] -asim_stop_freq = pd.concat([asim_stop_freq, asim_stop_freq_joint, asim_stop_freq_atwork], axis=1) +asim_stop_freq_joint = asim_joint_tours.groupby(["stop_frequency"]).count()[ + "household_id" +] +asim_stop_freq_atwork = asim_atw_tours.groupby(["stop_frequency"]).count()[ + "household_id" +] +asim_stop_freq = pd.concat( + [asim_stop_freq, asim_stop_freq_joint, asim_stop_freq_atwork], axis=1 +) asim_stop_freq.to_csv("outputs/asim_stop_freq.csv", na_rep=0) ############################################################# @@ -540,9 +739,12 @@ tm1_jtrips = pd.read_csv(tm1_jtrips_filename) tm1_trips["orig_purpose"][tm1_trips["orig_purpose"] == "university"] = "univ" tm1_trips["orig_purpose"] = pd.Categorical(tm1_trips["orig_purpose"]) - tm1_jtrips["orig_purpose"] = pd.Categorical(tm1_jtrips["orig_purpose"], - categories=tm1_trips["orig_purpose"].cat.categories) - tm1_trip_purp = tm1_trips.groupby(["orig_purpose", "tour_category"]).count()["hh_id"] + tm1_jtrips["orig_purpose"] = pd.Categorical( + tm1_jtrips["orig_purpose"], categories=tm1_trips["orig_purpose"].cat.categories + ) + tm1_trip_purp = tm1_trips.groupby(["orig_purpose", "tour_category"]).count()[ + "hh_id" + ] tm1_trip_purp = tm1_trip_purp.reset_index() tm1_trip_purp = tm1_trip_purp.pivot(index="orig_purpose", columns="tour_category") tm1_jtrip_purp = tm1_jtrips.groupby(["orig_purpose"]).count()["hh_id"] @@ -554,8 +756,12 @@ asim_trips = pd.read_csv(asim_trips_filename) asim_tours = pd.read_csv(asim_tour_filename) asim_tours = asim_tours.set_index("tour_id", drop=False) -asim_trips["tour_category"] = asim_tours["tour_category"].loc[asim_trips["tour_id"]].tolist() -asim_trip_purp = asim_trips.groupby(["purpose", "tour_category"]).count()["household_id"] +asim_trips["tour_category"] = ( + asim_tours["tour_category"].loc[asim_trips["tour_id"]].tolist() +) +asim_trip_purp = asim_trips.groupby(["purpose", "tour_category"]).count()[ + "household_id" +] asim_trip_purp = asim_trip_purp.reset_index() asim_trip_purp = asim_trip_purp.pivot(index="purpose", columns="tour_category") asim_trip_purp.to_csv("outputs/asim_trip_purp.csv", na_rep=0) @@ -563,23 +769,38 @@ # trip destination if process_tm1: - tm1_trips["distance"] = distmat[tm1_trips["orig_taz"]-1, tm1_trips["dest_taz"]-1] - tm1_jtrips["distance"] = distmat[tm1_jtrips["orig_taz"]-1, tm1_jtrips["dest_taz"]-1] + tm1_trips["distance"] = distmat[ + tm1_trips["orig_taz"] - 1, tm1_trips["dest_taz"] - 1 + ] + tm1_jtrips["distance"] = distmat[ + tm1_jtrips["orig_taz"] - 1, tm1_jtrips["dest_taz"] - 1 + ] tm1_trips["dist_bin"] = pd.cut(tm1_trips["distance"], range(51)) tm1_jtrips["dist_bin"] = pd.cut(tm1_jtrips["distance"], range(51)) - tm1_trips_dist = pd.concat([tm1_trips.groupby(["dist_bin"]).count()["hh_id"] + - tm1_jtrips.groupby(["dist_bin"]).count()["hh_id"]], axis=1) + tm1_trips_dist = pd.concat( + [ + tm1_trips.groupby(["dist_bin"]).count()["hh_id"] + + tm1_jtrips.groupby(["dist_bin"]).count()["hh_id"] + ], + axis=1, + ) tm1_trips_dist.to_csv("outputs/tm1_trips_dist.csv", na_rep=0) -asim_trips["distance"] = distmat[asim_trips["origin"]-1, asim_trips["destination"]-1] +asim_trips["distance"] = distmat[ + asim_trips["origin"] - 1, asim_trips["destination"] - 1 +] asim_trips["dist_bin"] = pd.cut(asim_trips["distance"], range(51)) -asim_trips.groupby(["dist_bin"]).count()["household_id"].to_csv("outputs/asim_trips_dist.csv", na_rep=0) +asim_trips.groupby(["dist_bin"]).count()["household_id"].to_csv( + "outputs/asim_trips_dist.csv", na_rep=0 +) # trip scheduling if process_tm1: - tm1_trips_tdd = tm1_trips.groupby([ - "depart_hour"]).count()["hh_id"] + tm1_jtrips.groupby(["depart_hour"]).count()["hh_id"] + tm1_trips_tdd = ( + tm1_trips.groupby(["depart_hour"]).count()["hh_id"] + + tm1_jtrips.groupby(["depart_hour"]).count()["hh_id"] + ) tm1_trips_tdd.to_csv("outputs/tm1_trips_depart.csv", na_rep=0) asim_trips_tdd = asim_trips.groupby(["depart"]).count()["household_id"] @@ -588,12 +809,22 @@ # trip mode share by tour purpose if process_tm1: - tm1_trips["trip_mode_str"] = pd.Series(mode_labels)[tm1_trips["trip_mode"].tolist()].tolist() - tm1_trips["trip_mode_str"] = pd.Categorical(tm1_trips["trip_mode_str"], categories=mode_labels) - tm1_jtrips["trip_mode_str"] = pd.Series(mode_labels)[tm1_jtrips["trip_mode"].tolist()].tolist() - tm1_jtrips["trip_mode_str"] = pd.Categorical(tm1_jtrips["trip_mode_str"], categories=mode_labels) - - tm1_trip_mode = tm1_trips.groupby(["trip_mode_str", "tour_category"]).count()["hh_id"] + tm1_trips["trip_mode_str"] = pd.Series(mode_labels)[ + tm1_trips["trip_mode"].tolist() + ].tolist() + tm1_trips["trip_mode_str"] = pd.Categorical( + tm1_trips["trip_mode_str"], categories=mode_labels + ) + tm1_jtrips["trip_mode_str"] = pd.Series(mode_labels)[ + tm1_jtrips["trip_mode"].tolist() + ].tolist() + tm1_jtrips["trip_mode_str"] = pd.Categorical( + tm1_jtrips["trip_mode_str"], categories=mode_labels + ) + + tm1_trip_mode = tm1_trips.groupby(["trip_mode_str", "tour_category"]).count()[ + "hh_id" + ] tm1_trip_mode = tm1_trip_mode.reset_index() tm1_trip_mode = tm1_trip_mode.pivot(index="trip_mode_str", columns="tour_category") @@ -603,8 +834,12 @@ tm1_trip_mode = tm1_trip_mode[["atwork", "joint", "mandatory", "non_mandatory"]] tm1_trip_mode.to_csv("outputs/tm1_trip_mode.csv", na_rep=0) -asim_trips["trip_mode"] = pd.Categorical(asim_trips["trip_mode"], categories=mode_labels) -asim_trip_mode = asim_trips.groupby(["trip_mode", "tour_category"]).count()["household_id"] +asim_trips["trip_mode"] = pd.Categorical( + asim_trips["trip_mode"], categories=mode_labels +) +asim_trip_mode = asim_trips.groupby(["trip_mode", "tour_category"]).count()[ + "household_id" +] asim_trip_mode = asim_trip_mode.reset_index() asim_trip_mode = asim_trip_mode.pivot(index="trip_mode", columns="tour_category") asim_trip_mode.to_csv("outputs/asim_trip_mode.csv", na_rep=0) diff --git a/other_resources/verification/simulation.py b/other_resources/verification/simulation.py index 467e60a259..890418332b 100644 --- a/other_resources/verification/simulation.py +++ b/other_resources/verification/simulation.py @@ -1,46 +1,43 @@ # ActivitySim # See full license in LICENSE.txt. -import sys import logging +import sys import pandas as pd -from activitysim.core import mem -from activitysim.core import inject -from activitysim.core import tracing -from activitysim.core import config -from activitysim.core import pipeline -from activitysim.core import mp_tasks -from activitysim.core import chunk +from activitysim.core import chunk, config, inject, mem, mp_tasks, pipeline, tracing # from activitysim import abm -logger = logging.getLogger('activitysim') +logger = logging.getLogger("activitysim") def cleanup_output_files(): - active_log_files = \ - [h.baseFilename for h in logger.root.handlers if isinstance(h, logging.FileHandler)] - tracing.delete_output_files('log', ignore=active_log_files) + active_log_files = [ + h.baseFilename + for h in logger.root.handlers + if isinstance(h, logging.FileHandler) + ] + tracing.delete_output_files("log", ignore=active_log_files) - tracing.delete_output_files('h5') - tracing.delete_output_files('csv') - tracing.delete_output_files('txt') - tracing.delete_output_files('yaml') - tracing.delete_output_files('prof') + tracing.delete_output_files("h5") + tracing.delete_output_files("csv") + tracing.delete_output_files("txt") + tracing.delete_output_files("yaml") + tracing.delete_output_files("prof") def run(run_list, injectables=None): - if run_list['multiprocess']: + if run_list["multiprocess"]: logger.info("run multiprocess simulation") mp_tasks.run_multiprocess(run_list, injectables) else: logger.info("run single process simulation") - pipeline.run(models=run_list['models'], resume_after=run_list['resume_after']) + pipeline.run(models=run_list["models"], resume_after=run_list["resume_after"]) pipeline.close_pipeline() mem.log_global_hwm() @@ -48,13 +45,12 @@ def run(run_list, injectables=None): def log_settings(injectables): settings = [ - 'households_sample_size', - 'chunk_size', - 'multiprocess', - 'num_processes', - 'resume_after' - 'use_shadow_pricing', - 'hh_ids', + "households_sample_size", + "chunk_size", + "multiprocess", + "num_processes", + "resume_after" "use_shadow_pricing", + "hh_ids", ] for k in settings: @@ -64,19 +60,19 @@ def log_settings(injectables): logger.info("injectable %s: %s" % (k, inject.get_injectable(k))) -if __name__ == '__main__': +if __name__ == "__main__": # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.set_option.html # pd.set_option('display.max_columns', 50) data_dir = "E:/projects/clients/ASIM/data/mtc_tm1" - data_dir = '/Users/jeff.doyle/work/activitysim-data/mtc_tm1/data' - data_dir = '../example/data' + data_dir = "/Users/jeff.doyle/work/activitysim-data/mtc_tm1/data" + data_dir = "../example/data" # inject.add_injectable('data_dir', '/Users/jeff.doyle/work/activitysim-data/mtc_tm1/data') - inject.add_injectable('data_dir', ['ancillary_data', data_dir]) + inject.add_injectable("data_dir", ["ancillary_data", data_dir]) # inject.add_injectable('data_dir', ['ancillary_data', '../activitysim/abm/test/data']) - inject.add_injectable('configs_dir', ['configs', '../example/configs']) + inject.add_injectable("configs_dir", ["configs", "../example/configs"]) injectables = config.handle_standard_args() @@ -88,14 +84,16 @@ def log_settings(injectables): t0 = tracing.print_elapsed_time() # cleanup if not resuming - if not config.setting('resume_after', False): + if not config.setting("resume_after", False): cleanup_output_files() run_list = mp_tasks.get_run_list() - if run_list['multiprocess']: + if run_list["multiprocess"]: # do this after config.handle_standard_args, as command line args may override injectables - injectables = list(set(injectables) | set(['data_dir', 'configs_dir', 'output_dir'])) + injectables = list( + set(injectables) | set(["data_dir", "configs_dir", "output_dir"]) + ) injectables = {k: inject.get_injectable(k) for k in injectables} else: injectables = None diff --git a/setup.py b/setup.py index 911e8b70d0..0d097dceb5 100644 --- a/setup.py +++ b/setup.py @@ -1,42 +1,43 @@ from ez_setup import use_setuptools -use_setuptools() # nopep8 -from setuptools import setup, find_packages +use_setuptools() # nopep8 import os import re -with open(os.path.join('activitysim', '__init__.py')) as f: - info = re.search(r'__.*', f.read(), re.S) +from setuptools import find_packages, setup + +with open(os.path.join("activitysim", "__init__.py")) as f: + info = re.search(r"__.*", f.read(), re.S) exec(info[0]) setup( - name='activitysim', + name="activitysim", version=__version__, description=__doc__, - author='contributing authors', - author_email='ben.stabler@rsginc.com', - license='BSD-3', - url='https://github.com/activitysim/activitysim', + author="contributing authors", + author_email="ben.stabler@rsginc.com", + license="BSD-3", + url="https://github.com/activitysim/activitysim", classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Programming Language :: Python :: 3.8', - 'License :: OSI Approved :: BSD License' + "Development Status :: 5 - Production/Stable", + "Programming Language :: Python :: 3.8", + "License :: OSI Approved :: BSD License", ], - packages=find_packages(exclude=['*.tests']), + packages=find_packages(exclude=["*.tests"]), include_package_data=True, - entry_points={'console_scripts': ['activitysim=activitysim.cli.main:main']}, + entry_points={"console_scripts": ["activitysim=activitysim.cli.main:main"]}, install_requires=[ - 'pyarrow >= 2.0', - 'numpy >= 1.16.1', - 'openmatrix >= 0.3.4.1', - 'pandas >= 1.1.0', - 'pyyaml >= 5.1', - 'tables >= 3.5.1', - 'cytoolz >= 0.8.1', - 'psutil >= 4.1', - 'requests >= 2.7', - 'numba >= 0.51.2', - 'orca >= 1.6', - ] + "pyarrow >= 2.0", + "numpy >= 1.16.1", + "openmatrix >= 0.3.4.1", + "pandas >= 1.1.0", + "pyyaml >= 5.1", + "tables >= 3.5.1", + "cytoolz >= 0.8.1", + "psutil >= 4.1", + "requests >= 2.7", + "numba >= 0.51.2", + "orca >= 1.6", + ], ) From 8843b3c56be15f0f856a96f51bc5279d782a65c7 Mon Sep 17 00:00:00 2001 From: Clint Daniels Date: Thu, 23 Sep 2021 08:30:40 -0700 Subject: [PATCH 2/4] travis changes --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a2d316b31d..f56cc57503 100644 --- a/.travis.yml +++ b/.travis.yml @@ -95,9 +95,10 @@ script: - python /home/travis/miniconda/envs/asimtest/lib/python$TRAVIS_PYTHON_VERSION/site-packages/activitysim/examples/example_multiple_zone/scripts/two_zone_example_data.py - python /home/travis/miniconda/envs/asimtest/lib/python$TRAVIS_PYTHON_VERSION/site-packages/activitysim/examples/example_multiple_zone/scripts/three_zone_example_data.py # pycodestyle -- pycodestyle activitysim # run specific TEST_SUITE job on travis to avoid job max time - travis_wait 50 py.test $TEST_SUITE --cov activitysim --cov-report term-missing --durations=0 +- isort --profile black . +- black --check --diff . - coveralls From 78cdae8023845855ca32101dcc22aa9ca5e3e04b Mon Sep 17 00:00:00 2001 From: Clint Daniels Date: Thu, 23 Sep 2021 08:59:43 -0700 Subject: [PATCH 3/4] Black 3.9 --- activitysim/abm/models/free_parking.py | 4 +- .../abm/models/joint_tour_participation.py | 8 ++- activitysim/abm/models/location_choice.py | 8 ++- activitysim/abm/models/tour_mode_choice.py | 5 +- .../abm/models/trip_departure_choice.py | 5 +- activitysim/abm/models/trip_mode_choice.py | 5 +- activitysim/abm/models/trip_purpose.py | 5 +- activitysim/abm/models/util/estimation.py | 33 ++++++++----- activitysim/abm/tables/size_terms.py | 42 ++++++++-------- activitysim/cli/create.py | 3 +- activitysim/cli/run.py | 3 +- activitysim/core/assign.py | 9 ++-- activitysim/core/chunk.py | 6 +-- activitysim/core/interaction_simulate.py | 7 +-- activitysim/core/logit.py | 7 ++- activitysim/core/los.py | 4 +- activitysim/core/mem.py | 5 +- activitysim/core/mp_tasks.py | 8 ++- activitysim/core/simulate.py | 14 +++--- activitysim/core/skim_dict_factory.py | 6 +-- activitysim/core/test/test_random.py | 14 +++++- activitysim/core/test/test_skim.py | 7 ++- .../estimation/larch/auto_ownership.py | 15 ++++-- activitysim/estimation/larch/cdap.py | 4 +- activitysim/estimation/larch/general.py | 19 +++++-- .../estimation/larch/location_choice.py | 49 ++++++++++++++----- activitysim/estimation/larch/mode_choice.py | 26 +++++++--- .../estimation/larch/nonmand_tour_freq.py | 25 +++++++--- activitysim/estimation/larch/scheduling.py | 41 ++++++++++++---- .../estimation/larch/simple_simulate.py | 45 +++++++++++++---- .../estimation/larch/stop_frequency.py | 28 ++++++++--- .../estimation/test/test_larch_estimation.py | 21 ++++++-- .../examples/scan_examples_for_errors.py | 5 +- docs/conf.py | 4 +- ez_setup.py | 4 +- other_resources/scripts/build_omx.py | 10 +++- 36 files changed, 364 insertions(+), 140 deletions(-) diff --git a/activitysim/abm/models/free_parking.py b/activitysim/abm/models/free_parking.py index c9f2f30b69..086422ba27 100644 --- a/activitysim/abm/models/free_parking.py +++ b/activitysim/abm/models/free_parking.py @@ -11,9 +11,7 @@ @inject.step() def free_parking(persons_merged, persons, chunk_size, trace_hh_id): - """ - - """ + """ """ trace_label = "free_parking" model_settings_file_name = "free_parking.yaml" diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index f60e1c2bbe..bce5a91e24 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -174,7 +174,9 @@ def participants_chooser(probs, choosers, spec, trace_label): num_tours_remaining = len(candidates.tour_id.unique()) logger.info( - "%s %s joint tours to satisfy.", trace_label, num_tours_remaining, + "%s %s joint tours to satisfy.", + trace_label, + num_tours_remaining, ) iter = 0 @@ -233,7 +235,9 @@ def participants_chooser(probs, choosers, spec, trace_label): assert rands.index.equals(choosers.index) logger.info( - "%s %s iterations to satisfy all joint tours.", trace_label, iter, + "%s %s iterations to satisfy all joint tours.", + trace_label, + iter, ) return choices, rands diff --git a/activitysim/abm/models/location_choice.py b/activitysim/abm/models/location_choice.py index c433baf4b9..9ac033aa8b 100644 --- a/activitysim/abm/models/location_choice.py +++ b/activitysim/abm/models/location_choice.py @@ -901,7 +901,13 @@ def iterate_location_choice( spc.write_trace_files(iteration) if spc.use_shadow_pricing and spc.check_fit(iteration): - logging.info("%s converged after iteration %s" % (trace_label, iteration,)) + logging.info( + "%s converged after iteration %s" + % ( + trace_label, + iteration, + ) + ) break # - shadow price table diff --git a/activitysim/abm/models/tour_mode_choice.py b/activitysim/abm/models/tour_mode_choice.py index 4d2dbef55e..303e195e45 100644 --- a/activitysim/abm/models/tour_mode_choice.py +++ b/activitysim/abm/models/tour_mode_choice.py @@ -144,7 +144,10 @@ def tour_mode_choice_simulate( logger.info( "tour_mode_choice_simulate tour_type '%s' (%s tours)" - % (tour_purpose, len(tours_segment.index),) + % ( + tour_purpose, + len(tours_segment.index), + ) ) if network_los.zone_system == los.THREE_ZONE: diff --git a/activitysim/abm/models/trip_departure_choice.py b/activitysim/abm/models/trip_departure_choice.py index 640a00f8f0..2dad8d37c0 100644 --- a/activitysim/abm/models/trip_departure_choice.py +++ b/activitysim/abm/models/trip_departure_choice.py @@ -501,7 +501,10 @@ def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_i simulate.set_skim_wrapper_targets(trips_merged_df, skims) locals_d.update( - {"od_skims": od_skim, "do_skims": do_skim,} + { + "od_skims": od_skim, + "do_skims": do_skim, + } ) expressions.assign_columns( diff --git a/activitysim/abm/models/trip_mode_choice.py b/activitysim/abm/models/trip_mode_choice.py index 8f830df613..5ea9806f13 100644 --- a/activitysim/abm/models/trip_mode_choice.py +++ b/activitysim/abm/models/trip_mode_choice.py @@ -128,7 +128,10 @@ def trip_mode_choice(trips, tours_merged, network_los, chunk_size, trace_hh_id): logger.info( "trip_mode_choice tour_type '%s' (%s trips)" - % (primary_purpose, len(trips_segment.index),) + % ( + primary_purpose, + len(trips_segment.index), + ) ) # name index so tracing knows how to slice diff --git a/activitysim/abm/models/trip_purpose.py b/activitysim/abm/models/trip_purpose.py index 938da1eb1f..4035ae61d9 100644 --- a/activitysim/abm/models/trip_purpose.py +++ b/activitysim/abm/models/trip_purpose.py @@ -114,7 +114,10 @@ def choose_intermediate_trip_purpose( ) logger.info( "Writing %s unmatched choosers to %s" - % (len(unmatched_choosers), file_name,) + % ( + len(unmatched_choosers), + file_name, + ) ) tracing.write_csv(unmatched_choosers, file_name=file_name, transpose=False) raise RuntimeError( diff --git a/activitysim/abm/models/util/estimation.py b/activitysim/abm/models/util/estimation.py index 0bf7419f80..6a5dbadf1f 100644 --- a/activitysim/abm/models/util/estimation.py +++ b/activitysim/abm/models/util/estimation.py @@ -351,9 +351,11 @@ def melt_alternatives(self, df): "alt_id not set. Did you forget to call set_alt_id()? (%s)" % self.model_name ) - assert alt_id_name in df, ( - "alt_id_column_name '%s' not in alternatives table (%s)" - % (alt_id_name, self.model_name) + assert ( + alt_id_name in df + ), "alt_id_column_name '%s' not in alternatives table (%s)" % ( + alt_id_name, + self.model_name, ) variable_column = "variable" @@ -480,9 +482,11 @@ def initialize_settings(self): self.survey_tables = settings.get("survey_tables", {}) for table_name, table_info in self.survey_tables.items(): - assert "file_name" in table_info, ( - "No file name specified for survey_table '%s' in %s" - % (table_name, ESTIMATION_SETTINGS_FILE_NAME) + assert ( + "file_name" in table_info + ), "No file name specified for survey_table '%s' in %s" % ( + table_name, + ESTIMATION_SETTINGS_FILE_NAME, ) file_path = config.data_file_path( table_info["file_name"], mandatory=True @@ -536,16 +540,21 @@ def begin_estimation(self, model_name, bundle_name=None): model_name not in self.estimating ), "Cant begin estimating %s - already estimating that model." % (model_name,) - assert bundle_name in self.model_estimation_table_types, ( - "No estimation_table_type for %s in %s." - % (bundle_name, ESTIMATION_SETTINGS_FILE_NAME) + assert ( + bundle_name in self.model_estimation_table_types + ), "No estimation_table_type for %s in %s." % ( + bundle_name, + ESTIMATION_SETTINGS_FILE_NAME, ) model_estimation_table_type = self.model_estimation_table_types[bundle_name] - assert model_estimation_table_type in self.estimation_table_recipes, ( - "model_estimation_table_type '%s' for model %s no in %s." - % (model_estimation_table_type, model_name, ESTIMATION_SETTINGS_FILE_NAME) + assert ( + model_estimation_table_type in self.estimation_table_recipes + ), "model_estimation_table_type '%s' for model %s no in %s." % ( + model_estimation_table_type, + model_name, + ESTIMATION_SETTINGS_FILE_NAME, ) self.estimating[model_name] = Estimator( diff --git a/activitysim/abm/tables/size_terms.py b/activitysim/abm/tables/size_terms.py index 8109b3a3bb..e31b004e5d 100644 --- a/activitysim/abm/tables/size_terms.py +++ b/activitysim/abm/tables/size_terms.py @@ -55,27 +55,27 @@ def size_term(land_use, destination_choice_coefficients): def tour_destination_size_terms(land_use, size_terms, model_selector): """ - Parameters - ---------- - land_use - pipeline table - size_terms - pipeline table - model_selector - str - - Returns - ------- - - :: - - pandas.dataframe - one column per model_selector segment with index of land_use - e.g. for model_selector 'workplace', columns will be work_low, work_med, ... - and for model_selector 'trip', columns will be eatout, escort, othdiscr, ... - - work_low work_med work_high work_veryhigh - zone_id ... - 1 1267.00000 522.000 1108.000 1540.0000 ... - 2 1991.00000 824.500 1759.000 2420.0000 ... - ... + Parameters + ---------- + land_use - pipeline table + size_terms - pipeline table + model_selector - str + + Returns + ------- + + :: + + pandas.dataframe + one column per model_selector segment with index of land_use + e.g. for model_selector 'workplace', columns will be work_low, work_med, ... + and for model_selector 'trip', columns will be eatout, escort, othdiscr, ... + + work_low work_med work_high work_veryhigh + zone_id ... + 1 1267.00000 522.000 1108.000 1540.0000 ... + 2 1991.00000 824.500 1759.000 2420.0000 ... + ... """ land_use = land_use.to_frame() diff --git a/activitysim/cli/create.py b/activitysim/cli/create.py index f83da6eec0..96c8c3aedb 100644 --- a/activitysim/cli/create.py +++ b/activitysim/cli/create.py @@ -31,8 +31,7 @@ def _load_manifest(): def add_create_args(parser): - """Create command args - """ + """Create command args""" create_group = parser.add_mutually_exclusive_group(required=True) create_group.add_argument( "-l", diff --git a/activitysim/cli/run.py b/activitysim/cli/run.py index f667608f38..5507085062 100644 --- a/activitysim/cli/run.py +++ b/activitysim/cli/run.py @@ -17,8 +17,7 @@ def add_run_args(parser, multiprocess=True): - """Run command args - """ + """Run command args""" parser.add_argument( "-w", "--working_dir", diff --git a/activitysim/core/assign.py b/activitysim/core/assign.py index 7b76d326a5..3fd94106f4 100644 --- a/activitysim/core/assign.py +++ b/activitysim/core/assign.py @@ -258,9 +258,12 @@ def to_series(x): for e in zip(assignment_expressions.target, assignment_expressions.expression): target, expression = e - assert isinstance(target, str), ( - "expected target '%s' for expression '%s' to be string not %s" - % (target, expression, type(target)) + assert isinstance( + target, str + ), "expected target '%s' for expression '%s' to be string not %s" % ( + target, + expression, + type(target), ) if target in local_keys: diff --git a/activitysim/core/chunk.py b/activitysim/core/chunk.py index effbbba1e7..c3c5153633 100644 --- a/activitysim/core/chunk.py +++ b/activitysim/core/chunk.py @@ -471,8 +471,7 @@ def write_history(self, history, chunk_tag): class ChunkLedger(object): - """ - """ + """ """ def __init__(self, trace_label, chunk_size, baseline_rss, baseline_uss, headroom): self.trace_label = trace_label @@ -708,8 +707,7 @@ def run(self): class ChunkSizer(object): - """ - """ + """ """ def __init__(self, chunk_tag, trace_label, num_choosers=0, chunk_size=0): diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py index cc6453bb98..ce604d97c7 100644 --- a/activitysim/core/interaction_simulate.py +++ b/activitysim/core/interaction_simulate.py @@ -110,9 +110,10 @@ def to_series(x): # bug - location choice has df index_name zone_id but should be person_id???? if df.index.name is None: chooser_id = estimator.get_chooser_id() - assert chooser_id in df.columns, ( - "Expected to find choose_id column '%s' in interaction dataset" - % (chooser_id,) + assert ( + chooser_id in df.columns + ), "Expected to find choose_id column '%s' in interaction dataset" % ( + chooser_id, ) assert df.index.name is None expression_values_df[chooser_id] = df[chooser_id] diff --git a/activitysim/core/logit.py b/activitysim/core/logit.py index 9d831866a2..652965e395 100644 --- a/activitysim/core/logit.py +++ b/activitysim/core/logit.py @@ -436,8 +436,11 @@ def _each_nest(spec, parent_nest, post_order): if isinstance(spec, dict): name = spec["name"] coefficient = spec["coefficient"] - assert isinstance(coefficient, (int, float)), ( - "Coefficient '%s' (%s) not a number" % (name, coefficient) + assert isinstance( + coefficient, (int, float) + ), "Coefficient '%s' (%s) not a number" % ( + name, + coefficient, ) # forgot to eval coefficient? alternatives = [ a["name"] if isinstance(a, dict) else a for a in spec["alternatives"] diff --git a/activitysim/core/los.py b/activitysim/core/los.py index ba5bf4712b..5031ef4d46 100644 --- a/activitysim/core/los.py +++ b/activitysim/core/los.py @@ -333,7 +333,9 @@ def load_data(self): if self.tap_lines_df is None: # load tap_lines on demand (required if they specify tap_line_distance_col) - tap_lines_file_name = self.setting("tap_lines",) + tap_lines_file_name = self.setting( + "tap_lines", + ) self.tap_lines_df = pd.read_csv( config.data_file_path(tap_lines_file_name, mandatory=True) ) diff --git a/activitysim/core/mem.py b/activitysim/core/mem.py index 4575186fea..3b93402a2f 100644 --- a/activitysim/core/mem.py +++ b/activitysim/core/mem.py @@ -92,7 +92,10 @@ def consolidate_logs(): # consolidate events (duplicate rows should be idle steps (e.g. log_rss) df = ( df.groupby("time") - .agg(rss=("rss", "max"), uss=("uss", "max"),) + .agg( + rss=("rss", "max"), + uss=("uss", "max"), + ) .reset_index(drop=False) ) diff --git a/activitysim/core/mp_tasks.py b/activitysim/core/mp_tasks.py index 742a86a0ca..14937a7737 100644 --- a/activitysim/core/mp_tasks.py +++ b/activitysim/core/mp_tasks.py @@ -1205,7 +1205,13 @@ def check_proc_status(): p = multiprocessing.Process( target=mp_run_simulation, name=process_name, - args=(locutor, q, injectables, step_info, resume_after,), + args=( + locutor, + q, + injectables, + step_info, + resume_after, + ), kwargs=shared_data_buffers, ) diff --git a/activitysim/core/simulate.py b/activitysim/core/simulate.py index 4e138149a5..75f8e3e1f5 100644 --- a/activitysim/core/simulate.py +++ b/activitysim/core/simulate.py @@ -128,9 +128,10 @@ def read_model_coefficients(model_settings=None, file_name=None): assert file_name is not None else: assert file_name is None - assert "COEFFICIENTS" in model_settings, ( - "'COEFFICIENTS' tag not in model_settings in %s" - % model_settings.get("source_file_paths") + assert ( + "COEFFICIENTS" in model_settings + ), "'COEFFICIENTS' tag not in model_settings in %s" % model_settings.get( + "source_file_paths" ) file_name = model_settings["COEFFICIENTS"] logger.debug(f"read_model_coefficients file_name {file_name}") @@ -212,9 +213,10 @@ def read_model_coefficient_template(model_settings): Read the coefficient template specified by COEFFICIENT_TEMPLATE model setting """ - assert "COEFFICIENT_TEMPLATE" in model_settings, ( - "'COEFFICIENT_TEMPLATE' not in model_settings in %s" - % model_settings.get("source_file_paths") + assert ( + "COEFFICIENT_TEMPLATE" in model_settings + ), "'COEFFICIENT_TEMPLATE' not in model_settings in %s" % model_settings.get( + "source_file_paths" ) coefficients_file_name = model_settings["COEFFICIENT_TEMPLATE"] diff --git a/activitysim/core/skim_dict_factory.py b/activitysim/core/skim_dict_factory.py index be3196628e..a2c82591be 100644 --- a/activitysim/core/skim_dict_factory.py +++ b/activitysim/core/skim_dict_factory.py @@ -307,8 +307,8 @@ def _read_skims_from_omx(self, skim_info, skim_data): def _open_existing_readonly_memmap_skim_cache(self, skim_info): """ - read cached memmapped skim data from canonically named cache file(s) in output directory into skim_data - return True if it was there and we read it, return False if not found + read cached memmapped skim data from canonically named cache file(s) in output directory into skim_data + return True if it was there and we read it, return False if not found """ dtype = np.dtype(skim_info.dtype_name) @@ -340,7 +340,7 @@ def _open_existing_readonly_memmap_skim_cache(self, skim_info): def _create_empty_writable_memmap_skim_cache(self, skim_info): """ - write skim data from skim_data to canonically named cache file(s) in output directory + write skim data from skim_data to canonically named cache file(s) in output directory """ dtype = np.dtype(skim_info.dtype_name) diff --git a/activitysim/core/test/test_random.py b/activitysim/core/test/test_random.py index a3aa4bbe13..50b4f0dc5a 100644 --- a/activitysim/core/test/test_random.py +++ b/activitysim/core/test/test_random.py @@ -39,10 +39,20 @@ def test_channel(): } rng = random.Random() - persons = pd.DataFrame({"household_id": [1, 1, 2, 2, 2],}, index=[1, 2, 3, 4, 5]) + persons = pd.DataFrame( + { + "household_id": [1, 1, 2, 2, 2], + }, + index=[1, 2, 3, 4, 5], + ) persons.index.name = "person_id" - households = pd.DataFrame({"data": [1, 1, 2, 2, 2],}, index=[1, 2, 3, 4, 5]) + households = pd.DataFrame( + { + "data": [1, 1, 2, 2, 2], + }, + index=[1, 2, 3, 4, 5], + ) households.index.name = "household_id" rng.begin_step("test_step") diff --git a/activitysim/core/test/test_skim.py b/activitysim/core/test/test_skim.py index a8ff8396c9..a1e47779ae 100644 --- a/activitysim/core/test/test_skim.py +++ b/activitysim/core/test/test_skim.py @@ -39,7 +39,12 @@ def test_skims(data): skim_dict.offset_mapper.set_offset_int(0) # default is -1 skims = skim_dict.wrap("taz_l", "taz_r") - df = pd.DataFrame({"taz_l": [1, 9, 4], "taz_r": [2, 3, 7],}) + df = pd.DataFrame( + { + "taz_l": [1, 9, 4], + "taz_r": [2, 3, 7], + } + ) skims.set_df(df) diff --git a/activitysim/estimation/larch/auto_ownership.py b/activitysim/estimation/larch/auto_ownership.py index 06cd7f34a2..b008683bec 100644 --- a/activitysim/estimation/larch/auto_ownership.py +++ b/activitysim/estimation/larch/auto_ownership.py @@ -21,7 +21,9 @@ def auto_ownership_model( return_data=False, ): data = simple_simulate_data( - name=name, edb_directory=edb_directory, values_index_col="household_id", + name=name, + edb_directory=edb_directory, + values_index_col="household_id", ) coefficients = data.coefficients # coef_template = data.coef_template # not used @@ -45,12 +47,19 @@ def auto_ownership_model( m.initialize_graph(alternative_codes=altcodes, root_id=99) m.utility_co = dict_of_linear_utility_from_spec( - spec, "Label", dict(zip(altnames, altcodes)), + spec, + "Label", + dict(zip(altnames, altcodes)), ) apply_coefficients(coefficients, m) - d = DataFrames(co=chooser_data, av=True, alt_codes=altcodes, alt_names=altnames,) + d = DataFrames( + co=chooser_data, + av=True, + alt_codes=altcodes, + alt_names=altnames, + ) m.dataservice = d m.choice_co_code = "override_choice" diff --git a/activitysim/estimation/larch/cdap.py b/activitysim/estimation/larch/cdap.py index 4c82e87b6e..288ac565c1 100644 --- a/activitysim/estimation/larch/cdap.py +++ b/activitysim/estimation/larch/cdap.py @@ -329,7 +329,9 @@ def read_yaml(filename, **kwargs): person_rank = cdap.assign_cdap_rank(persons, person_type_map) coefficients = read_csv( - coefficients_file, index_col="coefficient_name", comment="#", + coefficients_file, + index_col="coefficient_name", + comment="#", ) interaction_coef = read_csv( diff --git a/activitysim/estimation/larch/general.py b/activitysim/estimation/larch/general.py index 7c0f498624..435d4e3265 100644 --- a/activitysim/estimation/larch/general.py +++ b/activitysim/estimation/larch/general.py @@ -142,9 +142,15 @@ def linear_utility_from_spec(spec, x_col, p_col, ignore_x=(), segment_id=None): raise ValueError("segment_id must be given if p_col is a dict") partial_utility = {} for seg_p_col, segval in p_col.items(): - partial_utility[seg_p_col] = linear_utility_from_spec( - spec, x_col, seg_p_col, ignore_x, - ) * X(f"{segment_id}=={str_repr(segval)}") + partial_utility[seg_p_col] = ( + linear_utility_from_spec( + spec, + x_col, + seg_p_col, + ignore_x, + ) + * X(f"{segment_id}=={str_repr(segval)}") + ) return sum(partial_utility.values()) parts = [] for i in spec.index: @@ -263,7 +269,9 @@ def explicit_value_parameters_from_spec(spec, p_col, model): pass else: model.set_value( - getattr(i, p_col), value=j, holdfast=True, + getattr(i, p_col), + value=j, + holdfast=True, ) @@ -488,6 +496,7 @@ def update_coefficients(model, data, result_dir=Path("."), output_file=None): if output_file is not None: os.makedirs(result_dir, exist_ok=True) coefficients.reset_index().to_csv( - result_dir / output_file, index=False, + result_dir / output_file, + index=False, ) return coefficients diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py index 0a5009b3b0..2f392e8af5 100644 --- a/activitysim/estimation/larch/location_choice.py +++ b/activitysim/estimation/larch/location_choice.py @@ -57,7 +57,10 @@ def _read_csv(filename, **kwargs): filename = filename.format(name=name) return pd.read_csv(os.path.join(edb_directory, filename), **kwargs) - coefficients = _read_csv(coefficients_file, index_col="coefficient_name",) + coefficients = _read_csv( + coefficients_file, + index_col="coefficient_name", + ) spec = _read_csv(spec_file, comment="#") alt_values = _read_csv(alt_values_file) chooser_data = _read_csv(chooser_file) @@ -71,14 +74,20 @@ def _read_csv(filename, **kwargs): settings_file = settings_file.format(name=name) with open(os.path.join(edb_directory, settings_file), "r") as yf: - settings = yaml.load(yf, Loader=yaml.SafeLoader,) + settings = yaml.load( + yf, + Loader=yaml.SafeLoader, + ) include_settings = settings.get("include_settings") if include_settings: include_settings = os.path.join(edb_directory, include_settings) if include_settings and os.path.exists(include_settings): with open(include_settings, "r") as yf: - more_settings = yaml.load(yf, Loader=yaml.SafeLoader,) + more_settings = yaml.load( + yf, + Loader=yaml.SafeLoader, + ) settings.update(more_settings) CHOOSER_SEGMENT_COLUMN_NAME = settings.get("CHOOSER_SEGMENT_COLUMN_NAME") @@ -212,7 +221,10 @@ def _read_csv(filename, **kwargs): spec.columns == ["Label", "Description", "Expression", "coefficient"] ): m.utility_ca = linear_utility_from_spec( - spec, x_col="Label", p_col=spec.columns[-1], ignore_x=("local_dist",), + spec, + x_col="Label", + p_col=spec.columns[-1], + ignore_x=("local_dist",), ) elif ( len(spec.columns) == 4 @@ -221,7 +233,10 @@ def _read_csv(filename, **kwargs): and spec.columns[3] == list(SEGMENT_IDS.values())[0] ): m.utility_ca = linear_utility_from_spec( - spec, x_col="Label", p_col=spec.columns[-1], ignore_x=("local_dist",), + spec, + x_col="Label", + p_col=spec.columns[-1], + ignore_x=("local_dist",), ) else: m.utility_ca = linear_utility_from_spec( @@ -301,23 +316,31 @@ def update_size_spec(model, data, result_dir=Path("."), output_file=None): if output_file is not None: os.makedirs(result_dir, exist_ok=True) master_size_spec.reset_index().to_csv( - result_dir / output_file, index=False, + result_dir / output_file, + index=False, ) return master_size_spec def workplace_location_model(return_data=False): - return location_choice_model(name="workplace_location", return_data=return_data,) + return location_choice_model( + name="workplace_location", + return_data=return_data, + ) def school_location_model(return_data=False): - return location_choice_model(name="school_location", return_data=return_data,) + return location_choice_model( + name="school_location", + return_data=return_data, + ) def atwork_subtour_destination_model(return_data=False): return location_choice_model( - name="atwork_subtour_destination", return_data=return_data, + name="atwork_subtour_destination", + return_data=return_data, ) @@ -333,9 +356,13 @@ def joint_tour_destination_model(return_data=False): def non_mandatory_tour_destination_model(return_data=False): # goes with joint_tour_destination return location_choice_model( - name="non_mandatory_tour_destination", return_data=return_data, + name="non_mandatory_tour_destination", + return_data=return_data, ) def trip_destination_model(return_data=False): - return location_choice_model(name="trip_destination", return_data=return_data,) + return location_choice_model( + name="trip_destination", + return_data=return_data, + ) diff --git a/activitysim/estimation/larch/mode_choice.py b/activitysim/estimation/larch/mode_choice.py index d2b319fc19..a790075e14 100644 --- a/activitysim/estimation/larch/mode_choice.py +++ b/activitysim/estimation/larch/mode_choice.py @@ -29,7 +29,9 @@ def mode_choice_model( override_filenames = {} edb_directory = edb_directory.format(name=name) data = simple_simulate_data( - name=name, edb_directory=edb_directory, **override_filenames, + name=name, + edb_directory=edb_directory, + **override_filenames, ) coefficients = data.coefficients coef_template = data.coef_template @@ -56,7 +58,10 @@ def mode_choice_model( for alt_code, alt_name in tree.elemental_names().items(): # Read in base utility function for this alt_name u = linear_utility_from_spec( - spec, x_col="Label", p_col=alt_name, ignore_x=("#",), + spec, + x_col="Label", + p_col=alt_name, + ignore_x=("#",), ) for purpose in purposes: # Modify utility function based on template for purpose @@ -75,7 +80,10 @@ def mode_choice_model( ) d = DataFrames( - co=chooser_data, av=avail, alt_codes=data.alt_codes, alt_names=data.alt_names, + co=chooser_data, + av=avail, + alt_codes=data.alt_codes, + alt_names=data.alt_names, ) if "atwork" not in name: @@ -114,7 +122,9 @@ def tour_mode_choice_model( return_data=False, ): return mode_choice_model( - name=name, edb_directory=edb_directory, return_data=return_data, + name=name, + edb_directory=edb_directory, + return_data=return_data, ) @@ -124,7 +134,9 @@ def trip_mode_choice_model( return_data=False, ): return mode_choice_model( - name=name, edb_directory=edb_directory, return_data=return_data, + name=name, + edb_directory=edb_directory, + return_data=return_data, ) @@ -137,5 +149,7 @@ def atwork_subtour_mode_choice_model( name=name, edb_directory=edb_directory, return_data=return_data, - override_filenames=dict(coefficients_file="tour_mode_choice_coefficients.csv",), + override_filenames=dict( + coefficients_file="tour_mode_choice_coefficients.csv", + ), ) diff --git a/activitysim/estimation/larch/nonmand_tour_freq.py b/activitysim/estimation/larch/nonmand_tour_freq.py index ecbb1408c6..22f026d05d 100644 --- a/activitysim/estimation/larch/nonmand_tour_freq.py +++ b/activitysim/estimation/larch/nonmand_tour_freq.py @@ -40,7 +40,10 @@ def _read_csv(filename, **kwargs): settings_file = settings_file.format(name=name) with open(os.path.join(edb_directory, settings_file), "r") as yf: - settings = yaml.load(yf, Loader=yaml.SafeLoader,) + settings = yaml.load( + yf, + Loader=yaml.SafeLoader, + ) coefficients = {} chooser_data = {} @@ -60,7 +63,9 @@ def _read_csv(filename, **kwargs): alt_values_files.format(name=name, segment_name=segment_name), ) - spec = _read_csv(spec_file,) + spec = _read_csv( + spec_file, + ) spec = remove_apostrophes(spec, ["Label"]) # alt_names = list(spec.columns[3:]) # alt_codes = np.arange(1, len(alt_names) + 1) @@ -118,10 +123,12 @@ def unavail(model, x_ca): def nonmand_tour_freq_model( - edb_directory="output/estimation_data_bundle/{name}/", return_data=False, + edb_directory="output/estimation_data_bundle/{name}/", + return_data=False, ): data = interaction_simulate_data( - name="non_mandatory_tour_frequency", edb_directory=edb_directory, + name="non_mandatory_tour_frequency", + edb_directory=edb_directory, ) settings = data.settings @@ -145,7 +152,9 @@ def nonmand_tour_freq_model( # Utility specifications segment_model.utility_ca = linear_utility_from_spec( - spec, x_col="Label", p_col=segment_name, + spec, + x_col="Label", + p_col=segment_name, ) apply_coefficients(coefficients[segment_name], segment_model) segment_model.choice_co_code = "override_choice" @@ -157,7 +166,11 @@ def nonmand_tour_freq_model( .rename(columns={"TAZ": "HOMETAZ"}) ) x_ca = cv_to_ca(alt_values[segment_name].set_index(["person_id", "variable"])) - d = DataFrames(co=x_co, ca=x_ca, av=~unavail(segment_model, x_ca),) + d = DataFrames( + co=x_co, + ca=x_ca, + av=~unavail(segment_model, x_ca), + ) m[segment_name].dataservice = d if return_data: diff --git a/activitysim/estimation/larch/scheduling.py b/activitysim/estimation/larch/scheduling.py index d9e595a7e4..ea2aef1b2b 100644 --- a/activitysim/estimation/larch/scheduling.py +++ b/activitysim/estimation/larch/scheduling.py @@ -47,14 +47,23 @@ def _read_csv(filename, optional=False, **kwargs): settings_file = settings_file.format(name=name) with open(os.path.join(edb_directory, settings_file), "r") as yf: - settings = yaml.load(yf, Loader=yaml.SafeLoader,) + settings = yaml.load( + yf, + Loader=yaml.SafeLoader, + ) try: - coefficients = _read_csv(coefficients_file, index_col="coefficient_name",) + coefficients = _read_csv( + coefficients_file, + index_col="coefficient_name", + ) except FileNotFoundError: # possibly mis-named file is shown in settings coefficients_file = settings.get("COEFFICIENTS", coefficients_file) - coefficients = _read_csv(coefficients_file, index_col="coefficient_name",) + coefficients = _read_csv( + coefficients_file, + index_col="coefficient_name", + ) spec = _read_csv(spec_file, comment="#") alt_values = _read_csv(alt_values_file) @@ -67,7 +76,10 @@ def _read_csv(filename, optional=False, **kwargs): include_settings = settings.get("include_settings") if include_settings: with open(os.path.join(edb_directory, include_settings), "r") as yf: - more_settings = yaml.load(yf, Loader=yaml.SafeLoader,) + more_settings = yaml.load( + yf, + Loader=yaml.SafeLoader, + ) settings.update(more_settings) CHOOSER_SEGMENT_COLUMN_NAME = settings.get("CHOOSER_SEGMENT_COLUMN_NAME") @@ -90,7 +102,10 @@ def _read_csv(filename, optional=False, **kwargs): == ["label", "description", "expression", "coefficient"] ): m.utility_ca = linear_utility_from_spec( - spec, x_col="Label", p_col=spec.columns[-1], ignore_x=("local_dist",), + spec, + x_col="Label", + p_col=spec.columns[-1], + ignore_x=("local_dist",), ) elif ( len(spec.columns) == 4 @@ -99,7 +114,10 @@ def _read_csv(filename, optional=False, **kwargs): and spec.columns[3] == list(SEGMENT_IDS.values())[0] ): m.utility_ca = linear_utility_from_spec( - spec, x_col="Label", p_col=spec.columns[-1], ignore_x=("local_dist",), + spec, + x_col="Label", + p_col=spec.columns[-1], + ignore_x=("local_dist",), ) else: m.utility_ca = linear_utility_from_spec( @@ -216,15 +234,20 @@ def mandatory_tour_scheduling_school_model(return_data=False): def non_mandatory_tour_scheduling_model(return_data=False): return schedule_choice_model( - name="non_mandatory_tour_scheduling", return_data=return_data, + name="non_mandatory_tour_scheduling", + return_data=return_data, ) def joint_tour_scheduling_model(return_data=False): - return schedule_choice_model(name="joint_tour_scheduling", return_data=return_data,) + return schedule_choice_model( + name="joint_tour_scheduling", + return_data=return_data, + ) def atwork_subtour_scheduling_model(return_data=False): return schedule_choice_model( - name="atwork_subtour_scheduling", return_data=return_data, + name="atwork_subtour_scheduling", + return_data=return_data, ) diff --git a/activitysim/estimation/larch/simple_simulate.py b/activitysim/estimation/larch/simple_simulate.py index 31f1015fd9..0ee055fe6c 100644 --- a/activitysim/estimation/larch/simple_simulate.py +++ b/activitysim/estimation/larch/simple_simulate.py @@ -67,14 +67,21 @@ def _read_csv(filename, **kwargs): settings_file = settings_file.format(name=name) with open(os.path.join(edb_directory, settings_file), "r") as yf: - settings = yaml.load(yf, Loader=yaml.SafeLoader,) + settings = yaml.load( + yf, + Loader=yaml.SafeLoader, + ) try: - coefficients = _read_csv(coefficients_file, index_col="coefficient_name",) + coefficients = _read_csv( + coefficients_file, + index_col="coefficient_name", + ) try: coef_template = _read_csv( - coefficients_template, index_col="coefficient_name", + coefficients_template, + index_col="coefficient_name", ) except FileNotFoundError: coef_template = None @@ -92,7 +99,10 @@ def _read_csv(filename, **kwargs): alt_names_to_codes = dict(zip(alt_names, alt_codes)) alt_codes_to_names = dict(zip(alt_codes, alt_names)) - chooser_data = _read_csv(chooser_data_file, index_col=values_index_col,) + chooser_data = _read_csv( + chooser_data_file, + index_col=values_index_col, + ) except Exception: # when an error happens in reading anything other than settings, print settings @@ -124,7 +134,9 @@ def simple_simulate_model( values_index_col="household_id", ): data = simple_simulate_data( - name=name, edb_directory=edb_directory, values_index_col=values_index_col, + name=name, + edb_directory=edb_directory, + values_index_col=values_index_col, ) coefficients = data.coefficients # coef_template = data.coef_template # not used @@ -150,7 +162,9 @@ def simple_simulate_model( m = Model(alts=data.alt_codes_to_names) m.utility_co = dict_of_linear_utility_from_spec( - spec, "Label", dict(zip(alt_names, alt_codes)), + spec, + "Label", + dict(zip(alt_names, alt_codes)), ) apply_coefficients(coefficients, m) @@ -160,7 +174,12 @@ def simple_simulate_model( else: avail = True - d = DataFrames(co=chooser_data, av=avail, alt_codes=alt_codes, alt_names=alt_names,) + d = DataFrames( + co=chooser_data, + av=avail, + alt_codes=alt_codes, + alt_names=alt_names, + ) m.dataservice = d m.choice_co_code = "override_choice_code" @@ -218,7 +237,9 @@ def mandatory_tour_frequency_model( return_data=False, ): return simple_simulate_model( - name=name, edb_directory=edb_directory, return_data=return_data, + name=name, + edb_directory=edb_directory, + return_data=return_data, ) @@ -228,7 +249,9 @@ def joint_tour_frequency_model( return_data=False, ): return simple_simulate_model( - name=name, edb_directory=edb_directory, return_data=return_data, + name=name, + edb_directory=edb_directory, + return_data=return_data, ) @@ -251,7 +274,9 @@ def joint_tour_composition_model( return_data=False, ): return simple_simulate_model( - name=name, edb_directory=edb_directory, return_data=return_data, + name=name, + edb_directory=edb_directory, + return_data=return_data, ) diff --git a/activitysim/estimation/larch/stop_frequency.py b/activitysim/estimation/larch/stop_frequency.py index 8fdd02b967..c572af5e87 100644 --- a/activitysim/estimation/larch/stop_frequency.py +++ b/activitysim/estimation/larch/stop_frequency.py @@ -26,7 +26,10 @@ def stop_frequency_data( settings_file = settings_file.format(name=name) with open(os.path.join(edb_directory, settings_file), "r") as yf: - settings = yaml.load(yf, Loader=yaml.SafeLoader,) + settings = yaml.load( + yf, + Loader=yaml.SafeLoader, + ) segments = [i["primary_purpose"] for i in settings["SPEC_SEGMENTS"]] @@ -39,7 +42,8 @@ def stop_frequency_data( seg_purpose = seg_["primary_purpose"] seg_subdir = Path(os.path.join(edb_directory, seg_purpose)) segment_coef[seg_["primary_purpose"]] = pd.read_csv( - seg_subdir / seg_["COEFFICIENTS"], index_col="coefficient_name", + seg_subdir / seg_["COEFFICIENTS"], + index_col="coefficient_name", ) for seg in segments: @@ -129,9 +133,13 @@ def stop_frequency_data( def stop_frequency_model( - edb_directory="output/estimation_data_bundle/{name}/", return_data=False, + edb_directory="output/estimation_data_bundle/{name}/", + return_data=False, ): - data = stop_frequency_data(edb_directory=edb_directory, values_index_col="tour_id",) + data = stop_frequency_data( + edb_directory=edb_directory, + values_index_col="tour_id", + ) models = [] @@ -161,7 +169,9 @@ def stop_frequency_model( m = Model() m.utility_co = dict_of_linear_utility_from_spec( - spec, "Label", dict(zip(alt_names, alt_codes)), + spec, + "Label", + dict(zip(alt_names, alt_codes)), ) apply_coefficients(coefficients, m) @@ -169,7 +179,10 @@ def stop_frequency_model( avail = True d = DataFrames( - co=chooser_data, av=avail, alt_codes=alt_codes, alt_names=alt_names, + co=chooser_data, + av=avail, + alt_codes=alt_codes, + alt_names=alt_names, ) m.dataservice = d @@ -205,5 +218,6 @@ def update_segment_coefficients(model, data, result_dir=Path("."), output_file=N if output_file is not None: os.makedirs(result_dir, exist_ok=True) coefficients.reset_index().to_csv( - result_dir / output_file.format(segment_name=segment_name), index=False, + result_dir / output_file.format(segment_name=segment_name), + index=False, ) diff --git a/activitysim/estimation/test/test_larch_estimation.py b/activitysim/estimation/test/test_larch_estimation.py index bad64770dd..98c922fb70 100644 --- a/activitysim/estimation/test/test_larch_estimation.py +++ b/activitysim/estimation/test/test_larch_estimation.py @@ -115,7 +115,12 @@ def test_location_model(est_data, num_regression, dataframe_regression, name, me basename=f"test_loc_{name}_loglike", ) _regression_check(dataframe_regression, m.pf) - size_spec = update_size_spec(m, data, result_dir=None, output_file=None,) + size_spec = update_size_spec( + m, + data, + result_dir=None, + output_file=None, + ) dataframe_regression.check( size_spec, basename=f"test_loc_{name}_size_spec", @@ -177,7 +182,12 @@ def test_workplace_location(est_data, num_regression, dataframe_regression): basename="test_workplace_location_loglike", ) _regression_check(dataframe_regression, m.pf) - size_spec = update_size_spec(m, data, result_dir=None, output_file=None,) + size_spec = update_size_spec( + m, + data, + result_dir=None, + output_file=None, + ) dataframe_regression.check( size_spec, basename="test_workplace_location_size_spec", @@ -197,7 +207,12 @@ def test_school_location(est_data, num_regression, dataframe_regression): basename="test_school_location_loglike", ) _regression_check(dataframe_regression, m.pf) - size_spec = update_size_spec(m, data, result_dir=None, output_file=None,) + size_spec = update_size_spec( + m, + data, + result_dir=None, + output_file=None, + ) dataframe_regression.check( size_spec, basename="test_school_location_size_spec", diff --git a/activitysim/examples/scan_examples_for_errors.py b/activitysim/examples/scan_examples_for_errors.py index 570ef861aa..75268c8160 100644 --- a/activitysim/examples/scan_examples_for_errors.py +++ b/activitysim/examples/scan_examples_for_errors.py @@ -5,7 +5,10 @@ parser = argparse.ArgumentParser() parser.add_argument( - "working_dir", type=str, metavar="PATH", help="path to examples working directory", + "working_dir", + type=str, + metavar="PATH", + help="path to examples working directory", ) args = parser.parse_args() diff --git a/docs/conf.py b/docs/conf.py index da6570fb92..532b4dc0f8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -283,5 +283,7 @@ html_static_path = ["_static"] html_context = { - "css_files": ["_static/theme_overrides.css",], # override wide tables in RTD theme + "css_files": [ + "_static/theme_overrides.css", + ], # override wide tables in RTD theme } diff --git a/ez_setup.py b/ez_setup.py index 1de2837266..6df0f8fded 100644 --- a/ez_setup.py +++ b/ez_setup.py @@ -350,7 +350,9 @@ def _parse_args(): help="Use internal, non-validating downloader", ) parser.add_option( - "--version", help="Specify which version to download", default=DEFAULT_VERSION, + "--version", + help="Specify which version to download", + default=DEFAULT_VERSION, ) options, args = parser.parse_args() # positional arguments are ignored diff --git a/other_resources/scripts/build_omx.py b/other_resources/scripts/build_omx.py index 3312ad6f5a..7ad91fa4e1 100644 --- a/other_resources/scripts/build_omx.py +++ b/other_resources/scripts/build_omx.py @@ -40,7 +40,10 @@ def omx_getMatrix(omx_file_name, omx_key): print omx_file.list_matrices() raise RuntimeError( "Source matrix with key '%s' not found in file '%s" - % (omx_key, omx_file,) + % ( + omx_key, + omx_file, + ) ) data = omx_file[omx_key] @@ -83,7 +86,10 @@ def omx_getMatrix(omx_file_name, omx_key): print source_omx.list_matrices() raise RuntimeError( "Source matrix with key '%s' not found in file '%s" - % (row.source_key, dest_omx,) + % ( + row.source_key, + dest_omx, + ) ) data = source_omx[row.source_key] From 28140e5d6288df812e143a8ca492d05aeda614b0 Mon Sep 17 00:00:00 2001 From: Clint Daniels Date: Thu, 23 Sep 2021 19:37:24 -0700 Subject: [PATCH 4/4] Update activitysim-test-larch.yml --- conda-environments/activitysim-test-larch.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda-environments/activitysim-test-larch.yml b/conda-environments/activitysim-test-larch.yml index 474736b430..080f9d6812 100644 --- a/conda-environments/activitysim-test-larch.yml +++ b/conda-environments/activitysim-test-larch.yml @@ -21,5 +21,6 @@ dependencies: - pytest - pytest-cov - coveralls -- pycodestyle +- black +- isort - pytest-regressions