# Estimating School Location

Integration with [larch](https://larch.newman.me) for model estimation. See [estimation tools review](https://github.com/ActivitySim/activitysim/wiki/Estimation-Tools-Review) for more information about larch.

# Run the Example

Running the example outputs an estimation data bundle (EDB), which contains:
  - model settings - school_location_model_settings.yaml
  - coefficients - school_location_coefficients.csv
  - utilities specification - school_location_SPEC.csv
  - land use data - school_location_landuse.csv
  - size terms - school_location_size_terms.csv
  - alternatives values - school_location_alternatives_combined.csv
  - chooser data - school_location_choosers_combined.csv
  - choices made - school_location_choices.csv

# Read EDB

In [1]:
import larch  # !conda install larch #for estimation
import pandas as pd
import numpy as np
import yaml 
import larch.util.excel
import larch_asim  # utility functions in a local module
import os

In [2]:
from larch import P,X

In [3]:
# Folder holding the estimation data bundle (EDB) files for this model.
edb_directory = "estimation_data_bundle/school_location/"


def read_csv(filename, **kwargs):
    """Read a CSV file from the estimation data bundle directory.

    Parameters
    ----------
    filename : str
        File name, joined onto ``edb_directory`` with ``os.path.join``.
    **kwargs
        Passed through unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = os.path.join(edb_directory, filename)
    return pd.read_csv(csv_path, **kwargs)

## Read Settings

In [4]:
# Prefer the libyaml-backed C loader when PyYAML was built with it; otherwise
# fall back to the pure-Python loader. PyYAML exports the fallback as
# `Loader` (there is no `yaml.yamlLoader`), so alias it to the same name.
try:
    from yaml import CLoader as yamlLoader
except ImportError:
    from yaml import Loader as yamlLoader

# NOTE(review): Loader/CLoader can construct arbitrary Python objects from
# tagged YAML. The settings file here is produced locally as part of the EDB;
# switch to CSafeLoader/SafeLoader if it could ever come from an untrusted source.
with open(os.path.join(edb_directory, "school_location_model_settings.yaml"), 'r') as stream:
    settings = yaml.load(stream, Loader=yamlLoader)


In [5]:
# Pull out the two settings reused below: the chooser column that holds the
# segment id, and the mapping from segment name to segment id.
CHOOSER_SEGMENT_COLUMN_NAME = settings['CHOOSER_SEGMENT_COLUMN_NAME']
SEGMENT_IDS = settings['SEGMENT_IDS']
# Display the full settings dictionary for reference.
settings

{'SAMPLE_SIZE': 30,
 'SIMULATE_CHOOSER_COLUMNS': ['TAZ', 'school_segment', 'household_id'],
 'CHOOSER_ORIG_COL_NAME': 'TAZ',
 'ALT_DEST_COL_NAME': 'alt_dest',
 'IN_PERIOD': 14,
 'OUT_PERIOD': 8,
 'DEST_CHOICE_COLUMN_NAME': 'school_taz',
 'SAMPLE_SPEC': 'school_location_sample.csv',
 'SPEC': 'school_location.csv',
 'COEFFICIENTS': 'school_location_coeffs.csv',
 'LOGSUM_SETTINGS': 'tour_mode_choice.yaml',
 'LOGSUM_PREPROCESSOR': 'nontour_preprocessor',
 'LOGSUM_TOUR_PURPOSE': {'university': 'univ',
  'highschool': 'school',
  'gradeschool': 'school'},
 'annotate_persons': {'SPEC': 'annotate_persons_school', 'DF': 'persons'},
 'CHOOSER_TABLE_NAME': 'persons',
 'MODEL_SELECTOR': 'school',
 'CHOOSER_SEGMENT_COLUMN_NAME': 'school_segment',
 'CHOOSER_FILTER_COLUMN_NAME': 'is_student',
 'SEGMENT_IDS': {'university': 3, 'highschool': 2, 'gradeschool': 1},
 'SHADOW_PRICE_TABLE': 'school_shadow_prices',
 'MODELED_SIZE_TABLE': 'school_modeled_size',
 'SAVED_SHADOW_PRICE_TABLE_NAME': 'school_shadow

## Read EDB

### Spec

In [6]:
# Utilities specification table from the estimation data bundle.
spec = read_csv("school_location_SPEC.csv")

In [7]:
spec

Unnamed: 0,Label,Description,Expression,university,highschool,gradeschool
0,local_dist,,_DIST@skims['DIST'],1,1,1
1,util_dist_0_1,"Distance, piecewise linear from 0 to 1 miles","@_DIST.clip(0,1)",coef_univ_dist_0_1,coef_high_dist_0_1,coef_grade_dist_0_1
2,util_dist_1_2,"Distance, piecewise linear from 1 to 2 miles","@(_DIST-1).clip(0,1)",coef_univ_dist_1_2,coef_high_grade_dist_1_2,coef_high_grade_dist_1_2
3,util_dist_2_5,"Distance, piecewise linear from 2 to 5 miles","@(_DIST-2).clip(0,3)",coef_univ_dist_2_5,coef_high_grade_dist_2_5,coef_high_grade_dist_2_5
4,util_dist_5_15,"Distance, piecewise linear from 5 to 15 miles","@(_DIST-5).clip(0,10)",coef_univ_dist_5_15,coef_high_dist_5_15,coef_grade_dist_5_15
5,util_dist_15_up,"Distance, piecewise linear for 15+ miles",@(_DIST-15.0).clip(0),coef_univ_dist_15_up,coef_high_dist_15_up,coef_grade_dist_15_up
6,util_size_variable,Size variable,@(df['size_term'] * df['shadow_price_size_term...,1,1,1
7,util_utility_adjustment,utility adjustment,@df['shadow_price_utility_adjustment'],1,1,1
8,util_no_attractions,No attractions,@df['size_term']==0,-999,-999,-999
9,util_mode_choice_logsum,Mode choice logsum,mode_choice_logsum,coef_mode_logsum,coef_mode_logsum,coef_mode_logsum


In [8]:
# Remove the shadow-pricing adjustment and the pre-existing size expression
# from the utility spec. 'Label' is moved to the index only so the rows can
# be dropped by name, then restored as a column.
rows_to_remove = ['util_size_variable', 'util_utility_adjustment']
spec = (
    spec
    .set_index('Label')
    .drop(index=rows_to_remove)
    .reset_index()
)

### Size Spec

In [9]:
# Size terms table from the estimation data bundle.
size_spec = read_csv("school_location_size_terms.csv")

In [10]:
# Size terms for the school model only, one row per segment.
school_size_spec = (
    size_spec
    .query("model_selector == 'school'")
    .drop(columns='model_selector')
    .set_index('segment')
)
# Keep only the columns that have a positive size term for at least one segment.
has_positive_term = school_size_spec.max() > 0
school_size_spec = school_size_spec.loc[:, has_positive_term]
school_size_spec

Unnamed: 0_level_0,AGE0519,HSENROLL,COLLFTE,COLLPTE
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
university,0.0,0.0,0.592,0.408
gradeschool,1.0,0.0,0.0,0.0
highschool,0.0,1.0,0.0,0.0


### Size Coefficients

In [11]:
# Build the starting size-coefficient table: one row per (segment, size column)
# pair, indexed by the name "<segment>_<column>".
size_coef = school_size_spec.stack().reset_index()
size_coef.index = size_coef.iloc[:,0] +"_"+ size_coef.iloc[:,1]
# Keep only the pairs whose size term is positive.
size_coef = size_coef.loc[size_coef.iloc[:,2]>0]
size_coef['constrain'] = 'F'
# Flag the first remaining term of each segment as constrained ('T'); every
# other term keeps 'F'.
one_each = size_coef.groupby('segment').first().reset_index()
size_coef.loc[one_each.iloc[:,0] +"_"+ one_each.iloc[:,1], 'constrain'] = 'T'
# Drop the two name columns (already encoded in the index), leaving the size
# term and the constrain flag.
size_coef = size_coef.iloc[:,2:]
size_coef.columns = ['value','constrain']
size_coef.index.name = 'coefficient_name'
# Store values on the log scale; np.exp is applied when the estimated values
# are written back into size_spec at the end of the notebook.
size_coef['value'] = np.log(size_coef['value'])
size_coef

Unnamed: 0_level_0,value,constrain
coefficient_name,Unnamed: 1_level_1,Unnamed: 2_level_1
university_COLLFTE,-0.524249,T
university_COLLPTE,-0.896488,F
gradeschool_AGE0519,0.0,T
highschool_HSENROLL,0.0,T


### Coefficients

In [12]:
# Starting coefficient values from the EDB, indexed by coefficient name (first
# column). Read through the notebook's read_csv helper so the bundle location
# comes from edb_directory instead of a second hard-coded copy of the path.
coefficients = read_csv(
    "school_location_coefficients.csv",
    index_col=0,
)
coefficients

Unnamed: 0_level_0,value,constrain
coefficient_name,Unnamed: 1_level_1,Unnamed: 2_level_1
coef_univ_dist_0_1,-3.2451,F
coef_univ_dist_1_2,-2.7011,F
coef_univ_dist_2_5,-0.5707,F
coef_univ_dist_5_15,-0.5002,F
coef_univ_dist_15_up,-0.073,F
coef_high_dist_0_1,-0.9523,F
coef_high_grade_dist_1_2,-0.57,F
coef_high_grade_dist_2_5,-0.57,F
coef_high_dist_5_15,-0.193,F
coef_high_dist_15_up,-0.1882,F


### Choosers

In [13]:
# Chooser table from the EDB, one row per person, indexed by person_id.
choosers_filename = "school_location_choosers_combined.csv"
x_co = read_csv(choosers_filename, index_col='person_id')
x_co.head()

Unnamed: 0_level_0,TAZ,school_segment,household_id,model_choice,override_choice
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
29448,16,3,29448,101,12
30431,31,3,30431,13,9
31152,46,3,31152,12,9
31532,62,3,31532,69,69
31609,63,3,31609,9,69


In [14]:
x_co.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1041 entries, 29448 to 7540206
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   TAZ              1041 non-null   int64
 1   school_segment   1041 non-null   int64
 2   household_id     1041 non-null   int64
 3   model_choice     1041 non-null   int64
 4   override_choice  1041 non-null   int64
dtypes: int64(5)
memory usage: 48.8 KB


### Alternative Features

In [15]:
# Alternatives table from the EDB; the first two columns (person_id, variable)
# form the row index. Read through the notebook's read_csv helper so the bundle
# location comes from edb_directory instead of a second hard-coded copy of the path.
x_cv = read_csv(
    "school_location_alternatives_combined.csv.gz",
    index_col=(0,1),
)
x_cv.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,1,2,3,4,5,6,7,8,9,10,...,181,182,183,184,185,186,187,188,189,190
person_id,variable,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
29448,TAZ,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,181.0,182.0,183.0,184.0,185.0,186.0,187.0,188.0,189.0,190.0
29448,mode_choice_logsum,0.0934387115039297,0.2563812542387489,0.0761035269175729,0.2045124374570126,0.1233975801122931,-0.1735939200116112,-0.0765343018299863,-0.2956332671484997,1.2466092314635877,-0.6545527082987366,...,-1.0430559511166833,-0.8835779386540672,-0.7767333870209246,-0.6361103614556888,-0.5041983376207029,-0.5790199602383332,-1.0057858828672832,-0.5129070755892168,-0.5869640729281603,-1.076532137563604
29448,pick_count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
29448,prob,0.0,0.0,0.0,0.0,0.0018385813330181,0.0,0.0,0.0,0.0050858245267254,0.0026438879218731,...,0.0,0.0,0.0,0.0,2.250953917987133e-05,0.0,0.0,0.001684289991594,0.0,0.0
29448,shadow_price_size_term_adjustment,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Land Use

In [16]:
# Land use table from the estimation data bundle, indexed by TAZ.
landuse = read_csv("school_location_landuse.csv", index_col='TAZ')
landuse.head()

Unnamed: 0_level_0,DISTRICT,SD,county_id,TOTHH,TOTPOP,TOTACRE,RESACRE,CIACRE,TOTEMP,AGE0519,...,OPRKCST,area_type,HSENROLL,COLLFTE,COLLPTE,TOPOLOGY,TERMINAL,household_density,employment_density,density_index
TAZ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,1,46,82,20.3,1.0,15.0,27318,7,...,932.83514,0,0.0,0.0,0.0,3,5.89564,2.875,1707.375,2.870167
2,1,1,1,134,240,31.1,1.0,24.79297,42078,19,...,885.61682,0,0.0,0.0,0.0,1,5.84871,5.195214,1631.374751,5.178722
3,1,1,1,267,476,14.7,1.0,2.31799,2445,38,...,716.27252,0,0.0,0.0,0.0,1,5.53231,80.470405,736.891913,72.547987
4,1,1,1,151,253,19.3,1.0,18.0,22434,20,...,314.0,0,0.0,0.0,0.0,2,5.6433,7.947368,1180.736842,7.894233
5,1,1,1,611,1069,52.7,1.0,15.0,15662,86,...,314.01431,0,0.0,72.14684,0.0,1,5.52555,38.1875,978.875,36.753679


## Prep Data

In [17]:
# Convert the alternatives table with the local larch_asim helper.
# NOTE(review): presumably reshapes the (person_id, variable) x alternative
# layout of x_cv into one row per chooser/alternative pair — confirm in larch_asim.
x_ca = larch_asim.cv_to_ca(x_cv)

In [18]:
# Attach the land-use attributes matching each row's TAZ; the left join keeps
# every x_ca row.
x_ca_1 = pd.merge(x_ca, landuse, on='TAZ', how='left')
# Reuse x_ca's index on the merged frame.
# NOTE(review): this assumes the merge preserves x_ca's row count and order,
# i.e. TAZ is unique in landuse — confirm.
x_ca_1.index = x_ca.index

In [19]:
# Pass both tables through the larch_asim helper and keep the returned pair.
# NOTE(review): by its name this resolves column names shared by the two
# tables — confirm the exact renaming/dropping rule in larch_asim.
x_ca_1, x_co = larch_asim.prevent_overlapping_column_names(x_ca_1, x_co)

In [20]:
# Bundle the chooser table (co) and the alternatives table (ca) into a single
# larch DataFrames object for the model.
# NOTE(review): av=True presumably marks every alternative as available — confirm
# against the larch DataFrames documentation.
d = larch.DataFrames(
    co=x_co,
    ca=x_ca_1,
    av=True,
)

## Prep Model

In [21]:
# Start from an empty larch model; the utility, quantity, parameters and data
# are attached in the following cells.
m = larch.Model()

In [22]:
# Build the utility function from the spec table: each spec row (named by its
# 'Label') is paired with the coefficient listed in the column for each segment
# in SEGMENT_IDS, and interacted with an indicator on the chooser segment column.
# The 'local_dist' row is skipped via ignore_x.
m.utility_ca = larch_asim.linear_utility_from_spec(
    spec, x_col='Label', 
    p_col=SEGMENT_IDS, 
    ignore_x=('local_dist',), 
    segment_id=CHOOSER_SEGMENT_COLUMN_NAME,
)
# Show the resulting expression for a visual check.
print(m.utility_ca)

  P.coef_univ_dist_0_1 * X('util_dist_0_1*(school_segment==3)')
+ P.coef_univ_dist_1_2 * X('util_dist_1_2*(school_segment==3)')
+ P.coef_univ_dist_2_5 * X('util_dist_2_5*(school_segment==3)')
+ P.coef_univ_dist_5_15 * X('util_dist_5_15*(school_segment==3)')
+ P.coef_univ_dist_15_up * X('util_dist_15_up*(school_segment==3)')
+ P('-999') * X('util_no_attractions*(school_segment==3)')
+ P.coef_mode_logsum * X('util_mode_choice_logsum*(school_segment==3)')
+ P('1') * X('util_sample_of_corrections_factor*(school_segment==3)')
+ P.coef_high_dist_0_1 * X('util_dist_0_1*(school_segment==2)')
+ P.coef_high_grade_dist_1_2 * X('util_dist_1_2*(school_segment==2)')
+ P.coef_high_grade_dist_2_5 * X('util_dist_2_5*(school_segment==2)')
+ P.coef_high_dist_5_15 * X('util_dist_5_15*(school_segment==2)')
+ P.coef_high_dist_15_up * X('util_dist_15_up*(school_segment==2)')
+ P('-999') * X('util_no_attractions*(school_segment==2)')
+ P.coef_mode_logsum * X('util_mode_choice_logsum*(school_segment==2)')
+ P(

In [23]:
# Size (quantity) function: one term per nonzero (segment, size column) entry of
# school_size_spec. Each term multiplies the parameter "<segment>_<column>" by
# the land-use column and by an indicator that the chooser is in that segment.
m.quantity_ca = sum(
    P(f"{segment}_{size_col}")
    * X(size_col)
    * X(f"{CHOOSER_SEGMENT_COLUMN_NAME}=={SEGMENT_IDS[segment]}")
    for segment in school_size_spec.index
    for size_col in school_size_spec.columns
    if school_size_spec.loc[segment, size_col] != 0
)

In [24]:
# Register the spec entries that are literal numbers rather than coefficient names.
# NOTE(review): the parameter frame displayed in the next cell shows '-999' and
# '1' with holdfast=1, presumably set by this helper — confirm in larch_asim.
larch_asim.explicit_value_parameters_from_spec(spec, p_col=SEGMENT_IDS, model=m)

In [25]:
m.pf

Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note
-999,-999.0,0.0,0.0,-inf,inf,1,
1,1.0,0.0,0.0,-inf,inf,1,
coef_grade_dist_0_1,0.0,0.0,0.0,-inf,inf,0,
coef_grade_dist_15_up,0.0,0.0,0.0,-inf,inf,0,
coef_grade_dist_5_15,0.0,0.0,0.0,-inf,inf,0,
coef_high_dist_0_1,0.0,0.0,0.0,-inf,inf,0,
coef_high_dist_15_up,0.0,0.0,0.0,-inf,inf,0,
coef_high_dist_5_15,0.0,0.0,0.0,-inf,inf,0,
coef_high_grade_dist_1_2,0.0,0.0,0.0,-inf,inf,0,
coef_high_grade_dist_2_5,0.0,0.0,0.0,-inf,inf,0,


In [26]:
# Apply the EDB coefficient table to the model, then the size coefficients,
# passing bounds of -6 and 6 for the latter.
# NOTE(review): how the helper uses the 'constrain' column is not visible here — confirm in larch_asim.
larch_asim.apply_coefficients(coefficients, m)
larch_asim.apply_coefficients(size_coef, m, minimum=-6, maximum=6)

In [27]:
m.pf  # Spot check, confirm coefficients set correctly. 

Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note
-999,-999.0,-999.0,-999.0,-999.0,-999.0,1,
1,1.0,1.0,1.0,1.0,1.0,1,
coef_grade_dist_0_1,-1.6419,0.0,0.0,,,0,
coef_grade_dist_15_up,-0.046,0.0,0.0,,,0,
coef_grade_dist_5_15,-0.2031,0.0,0.0,,,0,
coef_high_dist_0_1,-0.9523,0.0,0.0,,,0,
coef_high_dist_15_up,-0.1882,0.0,0.0,,,0,
coef_high_dist_5_15,-0.193,0.0,0.0,,,0,
coef_high_grade_dist_1_2,-0.57,0.0,0.0,,,0,
coef_high_grade_dist_2_5,-0.57,0.0,0.0,,,0,


In [28]:
# Attach the prepared chooser/alternative data to the model.
m.dataservice = d

In [29]:
# Take the observed choice from the 'override_choice' column of the chooser table.
m.choice_co_code = 'override_choice'

## Re-Estimate

In [30]:
# Run the estimation; the values reported below replace the starting values
# applied from the coefficient tables above.
m.estimate()

req_data does not request avail_ca or avail_co but it is set and being provided


Unnamed: 0,value,initvalue,nullvalue,minimum,maximum,holdfast,note,best
-999,-999.0,-999.0,-999.0,-999.0,-999.0,1,,-999.0
1,1.0,1.0,1.0,1.0,1.0,1,,1.0
coef_grade_dist_0_1,-4.312078,0.0,0.0,,,0,,-4.312078
coef_grade_dist_15_up,-0.046,0.0,0.0,,,0,,-0.046
coef_grade_dist_5_15,-0.362956,0.0,0.0,,,0,,-0.362956
coef_high_dist_0_1,-1.661998,0.0,0.0,,,0,,-1.661998
coef_high_dist_15_up,-0.1882,0.0,0.0,,,0,,-0.1882
coef_high_dist_5_15,-0.458801,0.0,0.0,,,0,,-0.458801
coef_high_grade_dist_1_2,-1.220211,0.0,0.0,,,0,,-1.220211
coef_high_grade_dist_2_5,-1.359651,0.0,0.0,,,0,,-1.359651




Unnamed: 0_level_0,0
Unnamed: 0_level_1,0
-999,-999.000000
1,1.000000
coef_grade_dist_0_1,-4.312078
coef_grade_dist_15_up,-0.046000
coef_grade_dist_5_15,-0.362956
coef_high_dist_0_1,-1.661998
coef_high_dist_15_up,-0.188200
coef_high_dist_5_15,-0.458801
coef_high_grade_dist_1_2,-1.220211
coef_high_grade_dist_2_5,-1.359651

Unnamed: 0,0
-999,-999.0
1,1.0
coef_grade_dist_0_1,-4.312078
coef_grade_dist_15_up,-0.046
coef_grade_dist_5_15,-0.362956
coef_high_dist_0_1,-1.661998
coef_high_dist_15_up,-0.1882
coef_high_dist_5_15,-0.458801
coef_high_grade_dist_1_2,-1.220211
coef_high_grade_dist_2_5,-1.359651

Unnamed: 0,0
-999,0.0
1,0.0
coef_grade_dist_0_1,-0.000327
coef_grade_dist_15_up,0.0
coef_grade_dist_5_15,0.000443
coef_high_dist_0_1,0.000332
coef_high_dist_15_up,0.0
coef_high_dist_5_15,-0.001173
coef_high_grade_dist_1_2,0.001978
coef_high_grade_dist_2_5,0.009383


In [31]:
m.possible_overspecification

## Write Out Results

In [32]:
# Copy the re-estimated values into the coefficients table, but only for
# coefficient names that also appear in the model's parameter frame.
param_frame = m.pf
est_names = [name for name in coefficients.index if name in param_frame.index]
coefficients.loc[est_names, 'value'] = param_frame.loc[est_names, 'value']

In [33]:
# The replacement coefficients file and model summaries are written to an
# 'estimated' subfolder of the EDB directory; create it if it is missing.
estimated_dir = os.path.join(edb_directory, 'estimated')
os.makedirs(estimated_dir, exist_ok=True)

In [34]:
# Save the revised coefficients, with the coefficient name written as a regular column.
revised_coefficients_path = os.path.join(
    edb_directory,
    'estimated',
    "school_location_coefficients_revised.csv",
)
coefficients.reset_index().to_csv(revised_coefficients_path, index=False)

In [35]:
# Save the model estimation summary as an Excel workbook next to the revised coefficients.
estimation_report_path = os.path.join(
    edb_directory,
    'estimated',
    "school_location_model_estimation.xlsx",
)
m.to_xlsx(estimation_report_path)

<larch.util.excel.ExcelWriter at 0x243a4d26f08>

In [36]:
# Write size coefficients into size_spec: for every school segment and size
# column, replace the size term with exp(estimated parameter).
is_school_row = size_spec['model_selector'] == 'school'
for i in school_size_spec.index:
    # Rows of size_spec belonging to this school segment.
    j = (size_spec['segment'] == i) & is_school_row
    for c in school_size_spec.columns:
        param_name = f"{i}_{c}"
        try:
            size_spec.loc[j,c] = np.exp(m.get_value(param_name))
        except KeyError:
            # No model parameter exists for this segment/column pair; leave
            # the existing size term untouched.
            pass


In [37]:
# Rescale each row so its size terms total 1. This is not mathematically
# needed, but it keeps the file consistent with the existing ASim convention.
# The first two columns are left out of the rescaling.
size_values = size_spec.iloc[:, 2:]
row_totals = size_values.sum(axis=1)
size_spec.iloc[:, 2:] = size_values.div(row_totals, axis=0)

In [38]:
# Save the updated size terms table into the 'estimated' subfolder.
size_terms_path = os.path.join(edb_directory, 'estimated', "school_location_size_terms.csv")
size_spec.to_csv(size_terms_path, index=False)