Skip to content

Commit

Permalink
compile_watch, standalones
Browse files Browse the repository at this point in the history
  • Loading branch information
jpn-- committed Mar 25, 2022
1 parent 1216033 commit a35bf68
Show file tree
Hide file tree
Showing 12 changed files with 403 additions and 11 deletions.
6 changes: 5 additions & 1 deletion activitysim/core/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .simulate_consts import SPEC_EXPRESSION_NAME, SPEC_LABEL_NAME
from . import inject, config
from .. import __version__
from ..core import tracing

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -242,7 +243,7 @@ def skim_dataset():
if zarr_file:
logger.info(f"did not find zarr skims, loading omx")
d = sh.dataset.from_omx_3d(
[openmatrix.open_file(f) for f in omx_file_paths],
[openmatrix.open_file(f, mode='r') for f in omx_file_paths],
time_periods=time_periods,
max_float_precision=max_float_precision,
)
Expand Down Expand Up @@ -724,6 +725,7 @@ def apply_flow(spec, choosers, locals_d=None, trace_label=None, required=False,
flow_result = flow.dot(
coefficients=spec.values.astype(np.float32),
dtype=np.float32,
compile_watch=True,
)
# TODO: are there remaining internal arrays in dot that need to be
# passed out to be seen by the dynamic chunker before they are freed?
Expand All @@ -739,4 +741,6 @@ def apply_flow(spec, choosers, locals_d=None, trace_label=None, required=False,
# index_keys = self.shared_data.meta_match_names_idx.keys()
# logger.debug(f"Flow._get_indexes: {index_keys}")
raise
if flow.compiled_recently:
tracing.timing_notes.add(f"compiled:{flow.name}")
return flow_result, flow
4 changes: 2 additions & 2 deletions activitysim/core/skim_dict_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def load_skim_info(self, skim_tag):

logger.debug(f"load_skim_info {skim_tag} reading {omx_file_path}")

with omx.open_file(omx_file_path) as omx_file:
with omx.open_file(omx_file_path, mode='r') as omx_file:

# fixme call to omx_file.shape() failing in windows p3.5
if self.omx_shape is None:
Expand Down Expand Up @@ -273,7 +273,7 @@ def _read_skims_from_omx(self, skim_info, skim_data):
logger.info(f"_read_skims_from_omx {omx_file_path}")

# read skims into skim_data
with omx.open_file(omx_file_path) as omx_file:
with omx.open_file(omx_file_path, mode='r') as omx_file:
for skim_key, omx_key in omx_keys.items():

if omx_manifest[omx_key] == omx_file_path:
Expand Down
8 changes: 6 additions & 2 deletions activitysim/core/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@


logger = logging.getLogger(__name__)
timing_notes = set()


class ElapsedTimeFormatter(logging.Formatter):
Expand Down Expand Up @@ -66,6 +67,7 @@ def print_elapsed_time(msg=None, t0=None, debug=False):


def log_runtime(model_name, start_time=None, timing=None):
global timing_notes

assert (start_time or timing) and not (start_time and timing)

Expand All @@ -85,10 +87,12 @@ def log_runtime(model_name, start_time=None, timing=None):
if not inject.get_injectable('locutor', False):
return

header = "process_name,model_name,seconds,minutes"
header = "process_name,model_name,seconds,minutes,notes"
note = " ".join(timing_notes)
with config.open_log_file('timing_log.csv', 'a', header) as log_file:
print(f"{process_name},{model_name},{seconds},{minutes}", file=log_file)
print(f"{process_name},{model_name},{seconds},{minutes},{note}", file=log_file)

timing_notes.clear()

def delete_output_files(file_type, ignore=None, subdir=None):
"""
Expand Down
Empty file.
161 changes: 161 additions & 0 deletions activitysim/standalone/compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import altair as alt
import pandas as pd
import os
from .pipeline import load_checkpointed_tables

def load_pipelines(pipelines, tables=None, checkpoint_name=None):
    """
    Load checkpointed tables from one or more ActivitySim pipeline files.

    Parameters
    ----------
    pipelines : Dict[str, Path-like]
        Mapping run name to path of pipeline file.
    tables : Container[str], optional
        Names of tables to load from each pipeline; passed through to
        `load_checkpointed_tables` (which loads all checkpointed tables
        when None).
    checkpoint_name : str, optional
        Name of checkpoint to load for all pipelines; `load_checkpointed_tables`
        falls back to the final checkpoint when None.

    Returns
    -------
    Dict[str, Tuple]
        Mapping run name to the result of `load_checkpointed_tables`
        for that run's pipeline file.
    """
    return {
        key: load_checkpointed_tables(
            pth,
            tables=tables,
            checkpoint_name=checkpoint_name,
        )
        for key, pth in pipelines.items()
    }


def load_final_tables(output_dirs, tables=None, index_cols=None):
    """
    Load final output CSV tables from one or more model run directories.

    Parameters
    ----------
    output_dirs : Dict[str, Path-like]
        Mapping run name to that run's output directory.
    tables : Dict[str, str]
        Mapping table name to the CSV filename within each output
        directory.  Effectively required; a clear error is raised
        when omitted.
    index_cols : Dict[str, str], optional
        Mapping table name to the column to use as that table's index.
        Tables not listed here are read with a default RangeIndex.

    Returns
    -------
    Dict[str, Dict[str, pd.DataFrame]]
        result[run_name][table_name] -> loaded DataFrame

    Raises
    ------
    ValueError
        If `tables` is not provided.
    """
    if tables is None:
        # fail fast with a meaningful message instead of an
        # AttributeError from None.items() below
        raise ValueError("a tables mapping (table name -> csv filename) is required")
    result = {}
    for key, pth in output_dirs.items():
        result[key] = {}
        for tname, tfile in tables.items():
            tpath = os.path.join(pth, tfile)
            kwargs = {}
            # only set index_col for tables with a declared index column
            if index_cols is not None and tname in index_cols:
                kwargs['index_col'] = index_cols[tname]
            result[key][tname] = pd.read_csv(tpath, **kwargs)
    return result


def compare_trip_mode_choice(tablesets, title="Trip Mode Choice", grouping='primary_purpose'):
    """
    Build a stacked bar chart comparing trip mode shares across runs.

    Parameters
    ----------
    tablesets : Dict[str, Dict[str, pd.DataFrame]]
        Mapping run name to tables; each tableset must include a 'trips'
        table containing a 'trip_mode' column and the `grouping` column.
    title : str, optional
        Figure title; falsy values suppress the title.
    grouping : str
        Trips-table column used to facet the chart rows.

    Returns
    -------
    alt.Chart
    """
    d = {}
    groupings = [grouping, ]

    for key, tableset in tablesets.items():
        # trip counts and within-group mode shares for this run
        df = tableset['trips'].groupby(
            groupings + ['trip_mode']
        ).size().rename('n_trips').reset_index()
        df['share_trips'] = df['n_trips'] / df.groupby(groupings)['n_trips'].transform('sum')
        d[key] = df

    all_d = pd.concat(d, names=['source']).reset_index()

    # clicking a legend entry highlights that mode in the chart
    selection = alt.selection_point(
        fields=['trip_mode'], bind='legend',
    )

    fig = alt.Chart(
        all_d
    ).mark_bar(
    ).encode(
        color='trip_mode',
        y=alt.Y('source', axis=alt.Axis(grid=False, title='')),
        x=alt.X('share_trips', axis=alt.Axis(grid=False, labels=False, title='Mode Share')),
        # facet rows by the requested grouping column; this was hard-coded
        # to 'primary_purpose', silently ignoring the `grouping` argument
        row=grouping,
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        tooltip=['trip_mode', 'source', 'n_trips', alt.Tooltip('share_trips:Q', format='.2%')],
    ).add_selection(
        selection,
    )

    if title:
        fig = fig.properties(
            title=title
        ).configure_title(
            fontSize=20,
            anchor='start',
            color='black',
        )

    return fig


def compare_trip_distance(
        tablesets,
        skims,
        dist_skim_name,
        otaz_col='origin',
        dtaz_col='destination',
        time_col='depart',
        dist_bins=20,
        grouping='primary_purpose',
        title="Trip Length Distribution",
        max_dist=None,
):
    """
    Build a faceted line chart comparing trip length distributions across runs.

    Parameters
    ----------
    tablesets : Dict[str, Dict[str, pd.DataFrame]]
        Mapping run name to tables; each must include a 'trips' table.
    skims : Dataset or Dict[str, Dataset]
        Skim dataset(s) providing the distance variable; a single
        (non-dict) object is shared by every tableset.
    dist_skim_name : str
        Name of the distance variable within the skims.
    otaz_col, dtaz_col, time_col : str
        Trips-table columns holding origin zone, destination zone, and
        departure time.
    dist_bins : int, optional
        Number of bins for pd.cut; None plots raw distances.
    grouping : str
        Trips-table column used to facet the figure.
    title : str, optional
        Figure title; falsy values suppress the title.
    max_dist : numeric, optional
        Drop trips with distance greater than this before binning.
    """
    groupings = [grouping]
    # allow a single shared skims object to be used for all runs
    if not isinstance(skims, dict):
        skims = {i: skims for i in tablesets.keys()}

    distances = {}
    for key, tableset in tablesets.items():
        skim_dist = skims[key][[dist_skim_name]]
        looks = [
            tableset['trips'][otaz_col].rename('otaz'),
            tableset['trips'][dtaz_col].rename('dtaz'),
        ]
        if 'time_period' in skim_dist.dims:
            # map departure value to a skim time-period label
            # (time_period_imap presumably set by the skim loader — TODO confirm)
            looks.append(
                tableset['trips'][time_col].apply(skims[key].attrs['time_period_imap'].get).rename('time_period'),
            )
        look = pd.concat(looks, axis=1)
        # NOTE(review): .iat.df appears to be a sharrow accessor that looks up
        # the distance for each trip's OD (and time period) row — confirm
        distances[key] = skims[key][[dist_skim_name]].iat.df(look)

    if dist_bins is not None:
        # bin all runs' distances together so bin edges are shared
        result = pd.concat(distances, names=['source'])
        if max_dist is not None:
            result = result[result <= max_dist]
        result = pd.cut(result.iloc[:, 0], dist_bins).to_frame()
        # split the binned distances back out per run
        distances = {k: result.loc[k] for k in tablesets.keys()}

    data = {}
    for key, tableset in tablesets.items():
        data[key] = tableset['trips'].assign(**{'distance': distances[key]})

    d = {}
    for key, dat in data.items():
        # count trips per (grouping, distance bin); unstack/fillna/stack
        # materializes empty bins as zero counts
        df = dat.groupby(
            groupings + ['distance']
        ).size().rename('n_trips').unstack('distance').fillna(0).stack().rename('n_trips').reset_index()
        df['share_trips'] = df['n_trips'] / df.groupby(groupings)['n_trips'].transform('sum')
        d[key] = df

    all_d = pd.concat(d, names=['source']).reset_index()
    # plot each interval at its midpoint
    all_d['distance'] = all_d['distance'].apply(lambda x: x.mid)

    fig = alt.Chart(
        all_d
    ).mark_line(
        interpolate='monotone',
    ).encode(
        color='source',
        y=alt.Y('share_trips', axis=alt.Axis(grid=False, title='')),
        x=alt.X('distance', axis=alt.Axis(grid=False, title='Distance')),
        #opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        #tooltip = ['trip_mode', 'source', 'n_trips', alt.Tooltip('share_trips:Q', format='.2%')],
        facet=alt.Facet(grouping, columns=3),
        strokeWidth = 'source',
    ).properties(
        width=200,
        height=120,
    )

    if title:
        fig = fig.properties(
            title=title
        ).configure_title(
            fontSize=20,
            anchor='start',
            color='black',
        )

    return fig

42 changes: 42 additions & 0 deletions activitysim/standalone/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pandas as pd

from ..core import pipeline

def load_checkpointed_tables(
        pipeline_file_path,
        tables=None,
        checkpoint_name=None,
):
    """
    Load tables as of a given checkpoint from an ActivitySim pipeline store.

    Parameters
    ----------
    pipeline_file_path : Path-like
        Path of the pipeline HDF5 file.
    tables : Container[str], optional
        Names of tables to load; all checkpointed tables when None.
    checkpoint_name : str, optional
        Checkpoint to load; defaults to the final checkpoint.

    Returns
    -------
    Tuple[str, Dict[str, pd.DataFrame]]
        The resolved checkpoint name and a mapping of table name to
        the DataFrame loaded for that checkpoint.
    """
    # open read-only and guarantee the store is closed even if a read raises
    # (the previous explicit open/close leaked the handle on error)
    with pd.HDFStore(pipeline_file_path, mode='r') as pipeline_store:

        checkpoints = pipeline_store[pipeline.CHECKPOINT_TABLE_NAME]

        # checkpoint row as series
        if checkpoint_name is None:
            # default to the last (most recent) checkpoint
            checkpoint = checkpoints.iloc[-1]
            checkpoint_name = checkpoint.loc[pipeline.CHECKPOINT_NAME]
        else:
            i = checkpoints.set_index(pipeline.CHECKPOINT_NAME).index.get_loc(checkpoint_name)
            checkpoint = checkpoints.iloc[i]

        # series with table name as index and checkpoint_name as value
        checkpoint_tables = checkpoint[~checkpoint.index.isin(pipeline.NON_TABLE_COLUMNS)]

        # omit dropped tables (flagged by an empty checkpoint name)
        checkpoint_tables = checkpoint_tables[checkpoint_tables != '']

        # hdf5 key is <table_name>/<checkpoint_name>; each table carries its
        # own (possibly earlier) checkpoint name in the checkpoint row, so use
        # that per-table name rather than the requested checkpoint_name
        checkpoint_tables = {
            table_name: pipeline.pipeline_table_key(table_name, table_checkpoint_name)
            for table_name, table_checkpoint_name in checkpoint_tables.items()
        }

        data = {}
        for table_name, table_key in checkpoint_tables.items():
            if tables is None or table_name in tables:
                data[table_name] = pipeline_store[table_key]

    # checkpoint name and mapping of table name to loaded DataFrame
    return checkpoint_name, data
89 changes: 89 additions & 0 deletions activitysim/standalone/render.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import nbformat as nbf
from nbconvert import HTMLExporter
import nbclient

# from jupyter_contrib_nbextensions.nbconvert_support import TocExporter # problematic

def render_notebook(nb_filename, cellcontent):
    """
    Build, execute, and export a Jupyter notebook as standalone HTML.

    Parameters
    ----------
    nb_filename : str
        Path of the HTML file to write.  Despite the name, the output is
        the exported HTML document, not an .ipynb file.
    cellcontent : Iterable[str]
        Source text for each cell, in order.  A cell beginning with the
        literal prefix "[md]" becomes a markdown cell (with the prefix
        removed); anything else becomes a code cell.
    """
    nb = nbf.v4.new_notebook()

    cells = []
    for c in cellcontent:
        if c.startswith("[md]"):
            cells.append(nbf.v4.new_markdown_cell(c[4:]))
        else:
            cells.append(nbf.v4.new_code_cell(c))
    nb['cells'] = cells

    # run the notebook top-to-bottom so cell outputs are embedded in the export
    nb = nbclient.execute(nb)

    html_exporter = HTMLExporter(
        embed_images=True,
        exclude_input_prompt=True,
        exclude_output_prompt=True,
        exclude_input=True,
        # template_name = 'classic'
    )
    (body, resources) = html_exporter.from_notebook_node(nb)

    # write explicitly as utf-8: exported HTML routinely contains non-ascii,
    # which would fail under a non-utf-8 locale default encoding
    with open(nb_filename, 'w', encoding='utf-8') as f:
        f.write(body)


def render_comparison(html_filename, title, dist_skim="DIST"):
    """
    Render an HTML report comparing sharrow and legacy model run outputs.

    Parameters
    ----------
    html_filename : str
        Path of the HTML file to write.
    title : str
        Heading displayed at the top of the report.
    dist_skim : str, optional
        Distance skim name used for the trip-length section; pass a
        falsy value to omit that section entirely.
    """
    # assemble notebook cells incrementally; "[md]" marks markdown cells
    cells = [f"[md]# {title}"]

    cells.append(
        """\
from activitysim.standalone.compare import load_final_tables, compare_trip_mode_choice, compare_trip_distance
from activitysim.standalone.skims import load_skims
"""
    )

    cells.append(
        """\
data = load_final_tables(
    {"sharrow": "output-sharrow", "legacy": "output-legacy"},
    {"trips": "final_trips.csv"},
    {"trips": "trip_id"},
)
"""
    )

    cells.append("[md]## Trip Mode Choice")
    cells.append(
        """\
compare_trip_mode_choice(data)
"""
    )

    if dist_skim:
        # trip-length section only when a distance skim is named
        cells.append("[md]## Trip Distance")
        cells.append(
            f"""\
skims = load_skims("../configs/network_los.yaml", "../data")
compare_trip_distance(
    data,
    skims,
    "{dist_skim}",
    max_dist=10,
    title="Trip Length Distribution <10 miles",
)
"""
        )
        cells.append(
            f"""\
compare_trip_distance(
    data,
    skims,
    "{dist_skim}",
    title="Trip Length Distribution Overall",
)
"""
        )

    render_notebook(html_filename, cells)

0 comments on commit a35bf68

Please sign in to comment.