# Estimating Non-Mandatory Tour Frequency



# Load libraries

In [None]:
import larch  # !conda install larch #for estimation
import larch.util.activitysim
import pandas as pd
import numpy as np
import yaml 
import larch.util.excel
import os

# Required Inputs

In addition to a working ActivitySim model setup, estimation mode requires an ActivitySim format household travel survey.  An ActivitySim format household travel survey is very similar to ActivitySim's simulation model tables:

 - households
 - persons
 - tours
 - joint_tour_participants
 - trips (not yet implemented)

Examples of the ActivitySim format household travel survey are included in the [example_estimation data folders](https://github.com/RSGInc/activitysim/tree/develop/activitysim/examples/example_estimation).  The user is responsible for formatting their household travel survey into the appropriate format.  

After creating an ActivitySim format household travel survey, the `scripts/infer.py` script is run to append additional calculated fields.  An example of an additional calculated field is the `household:joint_tour_frequency`, which is calculated based on the `tours` and `joint_tour_participants` tables.  

The input survey files are below.

### Survey households

In [None]:
pd.read_csv("../data_sf/survey_data/override_households.csv")

### Survey persons

In [None]:
pd.read_csv("../data_sf/survey_data/override_persons.csv")

### Survey tours

In [None]:
pd.read_csv("../data_sf/survey_data/override_tours.csv")

### Survey joint tour participants

In [None]:
pd.read_csv("../data_sf/survey_data/survey_joint_tour_participants.csv")

# Example Setup if Needed

To avoid duplication of inputs, especially model settings and expressions, the `example_estimation` depends on the `example`.  The following commands create an example setup for use.  The location of these example setups (i.e. the folders) is important because the paths are referenced in this notebook.  The commands below download the skims.omx for the SF county example from the [activitysim resources repository](https://github.com/RSGInc/activitysim_resources).

In [None]:
!activitysim create -e example_estimation_sf -d test

# Run the Estimation Example

In [None]:
%cd test
!activitysim run -c configs_estimation/configs -c configs -o output -d data_sf

In [None]:
# os.chdir(os.path.expanduser("~/sandbox/cdap-est/test_example_estimation"))

In [None]:
# Load the settings file from the original config directory, not the EDB

settings_path = os.path.join("configs", "non_mandatory_tour_frequency.yaml")

# Open the file in a context manager so the handle is closed as soon as the
# YAML has been parsed, instead of lingering until garbage collection.
with open(settings_path, "r") as settings_file:
    settings = yaml.load(settings_file, Loader=yaml.SafeLoader)

# Last expression of the cell: display the parsed settings.
settings

# Read EDB

The next step is to read the estimation data bundle (EDB), including the coefficients, model settings, utilities specification, and chooser and alternative data.

In [None]:
# Collect the NAME entry of every segment listed under SPEC_SEGMENTS in the
# settings file; the rest of the notebook loops over these names.
segment_names = [
    segment_settings['NAME']
    for segment_settings in settings['SPEC_SEGMENTS']
]
segment_names

In [None]:
# Folder that holds this model's estimation data bundle, relative to the
# current working directory.
edb_directory = "output/estimation_data_bundle/non_mandatory_tour_frequency"


def read_csv(filename, **kwargs):
    """Read a CSV file that lives inside `edb_directory`.

    Parameters
    ----------
    filename : str
        Path of the file relative to `edb_directory`.
    **kwargs
        Passed through unchanged to `pandas.read_csv`.

    Returns
    -------
    Whatever `pandas.read_csv` returns for the resolved path.
    """
    full_path = os.path.join(edb_directory, filename)
    return pd.read_csv(full_path, **kwargs)

In [None]:
# TEMP FILE RENAMES -- REMOVE THIS CELL WHEN EDB IS FIXED
#
# The input files are expected in one folder per segment, named
# "non_mandatory_tour_frequency_<segment>".  This cell moves them into the
# layout the rest of the notebook reads: per-segment files go to
# "<edb_directory>/<segment>/", while a single SPEC file and a single
# alternatives file go to the top level of <edb_directory>.


def _move_edb_file(source, destination):
    """Move one EDB file, tolerating a re-run of this cell.

    If `source` is gone and `destination` is already present, the file was
    moved by an earlier run of this cell and nothing is done.  In every other
    case `os.rename` is called, so a genuinely missing input still raises.
    """
    if os.path.exists(destination) and not os.path.exists(source):
        return
    os.rename(source, destination)


_bundle_root = os.path.join("output", "estimation_data_bundle")
_model_name = "non_mandatory_tour_frequency"

os.makedirs(edb_directory, exist_ok=True)
for segment_name in segment_names:
    _segment_source = os.path.join(_bundle_root, f"{_model_name}_{segment_name}")
    _segment_target = os.path.join(edb_directory, segment_name)
    os.makedirs(_segment_target, exist_ok=True)

    # (source filename, destination filename).  Only the coefficients file
    # keeps the segment name in its destination filename.
    _per_segment_files = [
        (
            f"{_model_name}_{segment_name}_coefficients.csv",
            f"{_model_name}_{segment_name}_coefficients.csv",
        ),
        (
            f"{_model_name}_{segment_name}_choosers_combined.csv",
            f"{_model_name}_choosers_combined.csv",
        ),
        (
            f"{_model_name}_{segment_name}_interaction_expression_values.csv",
            f"{_model_name}_interaction_expression_values.csv",
        ),
    ]
    for _source_name, _target_name in _per_segment_files:
        _move_edb_file(
            os.path.join(_segment_source, _source_name),
            os.path.join(_segment_target, _target_name),
        )

# The SPEC and alternatives files are taken from ONE segment folder only (the
# last segment, as before) and placed at the top of the EDB directory; the
# copies in the other segment folders are left where they are.
# NOTE(review): this presumes those files are identical across segments --
# confirm before relying on it.
if segment_names:
    _shared_segment = segment_names[-1]
    _shared_source = os.path.join(_bundle_root, f"{_model_name}_{_shared_segment}")
    for _suffix in ("SPEC.csv", "alternatives.csv"):
        _move_edb_file(
            os.path.join(_shared_source, f"{_model_name}_{_shared_segment}_{_suffix}"),
            os.path.join(edb_directory, f"{_model_name}_{_suffix}"),
        )


In [None]:
# Both files are read from the top level of the EDB directory.
spec = read_csv("non_mandatory_tour_frequency_SPEC.csv")
alt_def = read_csv(
    "non_mandatory_tour_frequency_alternatives.csv",
    index_col=0,  # first column becomes the index
)

In [None]:
# Per-segment EDB tables, each dictionary keyed by segment name.
coefficients = {}
chooser_data = {}
alt_values = {}

for segment_name in segment_names:
    # Only the coefficients file carries the segment name in its filename;
    # the other two are told apart by the segment sub-folder alone.
    coefficients_file = (
        f"{segment_name}/non_mandatory_tour_frequency_{segment_name}_coefficients.csv"
    )
    choosers_file = (
        f"{segment_name}/non_mandatory_tour_frequency_choosers_combined.csv"
    )
    alt_values_file = (
        f"{segment_name}/non_mandatory_tour_frequency_interaction_expression_values.csv"
    )

    coefficients[segment_name] = read_csv(
        coefficients_file, index_col='coefficient_name'
    )
    chooser_data[segment_name] = read_csv(choosers_file)
    alt_values[segment_name] = read_csv(alt_values_file)

In [None]:
# Treat every group of coefficients whose current values are exactly equal
# as one single parameter for Larch to estimate.  Comment out this whole
# cell to estimate each named coefficient separately instead -- but keep an
# eye on how many unique parameters that leaves in each model.

relabel_coef = {}
for segment_name in segment_names:
    current_values = coefficients[segment_name]['value']
    # Invert name -> value into value -> name.  When several names share a
    # value, the last one listed wins and becomes the common label.
    coef_backwards_map = {value: name for name, value in current_values.items()}
    # r maps each original coefficient name to its common label.
    r = current_values.map(coef_backwards_map)
    relabel_coef[segment_name] = r
    # Rewrite this segment's column of the spec to use the common labels.
    spec[segment_name] = spec[segment_name].map(r)


### Utility specification

In [None]:
spec

### Alternatives data

In [None]:
alt_values['PTYPE_FULL']

### Chooser data

In [None]:
chooser_data['PTYPE_FULL']

# Data Processing and Estimation Setup

The next step is to transform the EDB for larch for model re-estimation.  

In [None]:
from larch import P, X

### Utility specifications

In [None]:
# One larch model per segment, keyed by segment name.
m = {}
for segment_name in segment_names:
    m[segment_name] = larch.Model()
    segment_model = m[segment_name]

    # Build the utility function from the spec table: the 'Label' column
    # supplies the data terms and this segment's column the parameter names.
    segment_utility = larch.util.activitysim.linear_utility_from_spec(
        spec,
        x_col='Label',
        p_col=segment_name,
    )
    segment_model.utility_ca = segment_utility

    # Load this segment's coefficients table into the model.
    larch.util.activitysim.apply_coefficients(
        coefficients[segment_name],
        segment_model,
    )

    # Column of the chooser data that holds the observed choice.
    segment_model.choice_co_code = 'override_choice'

### Attach Data

In [None]:
for segment_name in segment_names:
    # Chooser table: indexed by person, with TAZ renamed to HOMETAZ.
    choosers = chooser_data[segment_name]
    x_co = choosers.set_index('person_id').rename(columns={'TAZ': 'HOMETAZ'})

    # Alternative values: indexed by (person, variable), then converted by
    # larch's cv_to_ca helper.
    indexed_alt_values = alt_values[segment_name].set_index(
        ['person_id', 'variable']
    )
    x_ca = larch.util.activitysim.cv_to_ca(indexed_alt_values)

    # NOTE(review): av=True presumably marks every alternative as available
    # to every chooser -- confirm against the larch documentation.
    d = larch.DataFrames(co=x_co, ca=x_ca, av=True)
    m[segment_name].dataservice = d

# Estimate

With the model set up for estimation, the next step is to estimate the model coefficients.  Make sure to use a sufficiently large household sample and set of zones to avoid an over-specified model, which does not have a numerically stable likelihood-maximizing solution.  Larch has two built-in estimation methods: BHHH and SLSQP.  BHHH is the default and typically runs faster, but does not follow constraints on parameters.  SLSQP is safer, but slower, and may need additional iterations.  The cell below uses SLSQP.

In [None]:
# Estimate each segment's model in turn with SLSQP, allowing up to 1000
# iterations per model.
for segment_name in segment_names:
    segment_model = m[segment_name]
    segment_model.estimate(
        method='SLSQP',
        options=dict(maxiter=1000),
    )

### Estimated coefficients

In [None]:
# Render one parameter summary per segment, in segment order.
for segment_name in segment_names:
    summary = m[segment_name].parameter_summary()
    display(summary)

# Output Estimation Results

In [None]:
# Copy the estimated parameter values back into each segment's coefficients
# table, so the table can be written out in its original layout.
for segment_name in segment_names:
    estimated = m[segment_name].pf['value']
    try:
        # Only this lookup is guarded: a NameError here means the cell that
        # builds `relabel_coef` was commented out.  Keeping the try body this
        # small stops an unrelated NameError from being misread that way.
        relabel = relabel_coef[segment_name]
    except NameError:
        # No relabelling: update every coefficient the model actually holds.
        est_names = [j for j in coefficients[segment_name].index if j in estimated.index]
        coefficients[segment_name].loc[est_names, 'value'] = estimated.loc[est_names]
    else:
        # Each original name receives the value estimated for its common
        # label.  Labels missing from the model are skipped and keep their
        # current value, matching the filter applied in the branch above.
        for unique_coef, common_coef in relabel.items():
            if common_coef in estimated.index:
                coefficients[segment_name].loc[unique_coef, 'value'] = estimated.loc[common_coef]

In [None]:
# Create the 'estimated' sub-folder of the EDB directory; exist_ok makes
# this a no-op when the folder is already there.
estimated_directory = os.path.join(edb_directory, 'estimated')
os.makedirs(estimated_directory, exist_ok=True)

### Write the re-estimated coefficients file

In [None]:
# Write one coefficients CSV per segment into the 'estimated' folder.
for segment_name in segment_names:
    coefficients_path = os.path.join(
        edb_directory,
        'estimated',
        f"non_mandatory_tour_frequency_{segment_name}_coefficients.csv",
    )
    # reset_index turns the table's index back into an ordinary column, so
    # it is still written even though index=False is passed.
    coefficients_table = coefficients[segment_name].reset_index()
    coefficients_table.to_csv(coefficients_path, index=False)

### Write the model estimation report, including coefficient t-statistic and log likelihood

In [None]:
# Write one Excel estimation report per segment into the 'estimated' folder.
for segment_name in segment_names:
    report_path = os.path.join(
        edb_directory,
        'estimated',
        f"non_mandatory_tour_frequency_{segment_name}_model_estimation.xlsx",
    )
    segment_model = m[segment_name]
    segment_model.to_xlsx(report_path, data_statistics=False)