-
Notifications
You must be signed in to change notification settings - Fork 96
/
simulate.py
1385 lines (1067 loc) · 51.8 KB
/
simulate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# ActivitySim
# See full license in LICENSE.txt.
from builtins import range
import warnings
import logging
from collections import OrderedDict
import numpy as np
import pandas as pd
from . import logit
from . import tracing
from . import pipeline
from . import config
from . import util
from . import assign
from . import chunk
from . import pathbuilder
logger = logging.getLogger(__name__)
SPEC_DESCRIPTION_NAME = 'Description'
SPEC_EXPRESSION_NAME = 'Expression'
SPEC_LABEL_NAME = 'Label'
ALT_LOSER_UTIL = -900
def random_rows(df, n):
# only sample if df has more than n rows
if len(df.index) > n:
prng = pipeline.get_rn_generator().get_global_rng()
return df.take(prng.choice(len(df), size=n, replace=False))
else:
return df
def uniquify_spec_index(spec):
# uniquify spec index inplace
# ensure uniqueness of spec index by appending comment with dupe count
# this allows us to use pandas dot to compute_utilities
dict = OrderedDict()
for expr in spec.index:
dict[assign.uniquify_key(dict, expr, template="{} # ({})")] = expr
prev_index_name = spec.index.name
spec.index = list(dict.keys())
spec.index.name = prev_index_name
assert spec.index.is_unique
def read_model_alts(file_name, set_index=None):
file_path = config.config_file_path(file_name)
df = pd.read_csv(file_path, comment='#')
if set_index:
df.set_index(set_index, inplace=True)
return df
def read_model_spec(file_name):
"""
Read a CSV model specification into a Pandas DataFrame or Series.
file_path : str absolute or relative path to file
The CSV is expected to have columns for component descriptions
and expressions, plus one or more alternatives.
The CSV is required to have a header with column names. For example:
Description,Expression,alt0,alt1,alt2
Parameters
----------
model_settings : dict
name of spec_file is in model_settings['SPEC'] and file is relative to configs
file_name : str
file_name id spec file in configs folder
description_name : str, optional
Name of the column in `fname` that contains the component description.
expression_name : str, optional
Name of the column in `fname` that contains the component expression.
Returns
-------
spec : pandas.DataFrame
The description column is dropped from the returned data and the
expression values are set as the table index.
"""
assert isinstance(file_name, str)
if not file_name.lower().endswith('.csv'):
file_name = '%s.csv' % (file_name,)
file_path = config.config_file_path(file_name)
try:
spec = pd.read_csv(file_path, comment='#')
except Exception as err:
logger.error(f"read_model_spec error reading {file_path}")
logger.error(f"read_model_spec error {type(err).__name__}: {str(err)}")
raise(err)
spec = spec.dropna(subset=[SPEC_EXPRESSION_NAME])
# don't need description and set the expression to the index
if SPEC_DESCRIPTION_NAME in spec.columns:
spec = spec.drop(SPEC_DESCRIPTION_NAME, axis=1)
spec = spec.set_index(SPEC_EXPRESSION_NAME).fillna(0)
# ensure uniqueness of spec index by appending comment with dupe count
# this allows us to use pandas dot to compute_utilities
uniquify_spec_index(spec)
if SPEC_LABEL_NAME in spec:
spec = spec.set_index(SPEC_LABEL_NAME, append=True)
assert isinstance(spec.index, pd.MultiIndex)
return spec
def read_model_coefficients(model_settings=None, file_name=None):
"""
Read the coefficient file specified by COEFFICIENTS model setting
"""
if model_settings is None:
assert file_name is not None
else:
assert file_name is None
assert 'COEFFICIENTS' in model_settings, \
"'COEFFICIENTS' tag not in model_settings in %s" % model_settings.get('source_file_paths')
file_name = model_settings['COEFFICIENTS']
logger.debug(f"read_model_coefficients file_name {file_name}")
file_path = config.config_file_path(file_name)
try:
coefficients = pd.read_csv(file_path, comment='#', index_col='coefficient_name')
except ValueError:
logger.exception("Coefficient File Invalid: %s" % str(file_path))
raise
if coefficients.index.duplicated().any():
logger.warning(f"duplicate coefficients in {file_path}\n"
f"{coefficients[coefficients.index.duplicated(keep=False)]}")
raise RuntimeError(f"duplicate coefficients in {file_path}")
if coefficients.value.isnull().any():
logger.warning(
f"null coefficients in {file_path}\n{coefficients[coefficients.value.isnull()]}")
raise RuntimeError(f"null coefficients in {file_path}")
return coefficients
def spec_for_segment(model_settings, spec_id, segment_name, estimator):
"""
Select spec for specified segment from omnibus spec containing columns for each segment
Parameters
----------
model_spec : pandas.DataFrame
omnibus spec file with expressions in index and one column per segment
segment_name : str
segment_name that is also column name in model_spec
Returns
-------
pandas.dataframe
canonical spec file with expressions in index and single column with utility coefficients
"""
spec_file_name = model_settings[spec_id]
spec = read_model_spec(file_name=spec_file_name)
if len(spec.columns) > 1:
# if spec is segmented
spec = spec[[segment_name]]
else:
# otherwise we expect a single coefficient column
# doesn't really matter what it is called, but this may catch errors
assert spec.columns[0] in ['coefficient', segment_name]
if 'COEFFICIENTS' not in model_settings:
logger.warning(f"no coefficient file specified in model_settings for {spec_file_name}")
try:
assert (spec.astype(float) == spec).all(axis=None)
except (ValueError, AssertionError):
raise RuntimeError(f"No coefficient file specified for {spec_file_name} "
f"but not all spec column values are numeric")
return spec
coefficients = read_model_coefficients(model_settings)
spec = eval_coefficients(spec, coefficients, estimator)
return spec
def read_model_coefficient_template(model_settings):
"""
Read the coefficient template specified by COEFFICIENT_TEMPLATE model setting
"""
assert 'COEFFICIENT_TEMPLATE' in model_settings, \
"'COEFFICIENT_TEMPLATE' not in model_settings in %s" % model_settings.get('source_file_paths')
coefficients_file_name = model_settings['COEFFICIENT_TEMPLATE']
file_path = config.config_file_path(coefficients_file_name)
try:
template = pd.read_csv(file_path, comment='#', index_col='coefficient_name')
except ValueError:
logger.exception("Coefficient Template File Invalid: %s" % str(file_path))
raise
# by convention, an empty cell in the template indicates that
# the coefficient name should be propogated to across all segments
# this makes for a more legible template than repeating the identical coefficient name in each column
# replace missing cell values with coefficient_name from index
template = template.where(
~template.isnull(),
np.broadcast_to(template.index.values[:, None], template.shape),
)
if template.index.duplicated().any():
dupes = template[template.index.duplicated(keep=False)].sort_index()
logger.warning(f"duplicate coefficient names in {coefficients_file_name}:\n{dupes}")
assert not template.index.duplicated().any()
return template
def dump_mapped_coefficients(model_settings):
"""
dump template_df with coefficient values
"""
coefficients_df = read_model_coefficients(model_settings)
template_df = read_model_coefficient_template(model_settings)
for c in template_df.columns:
template_df[c] = template_df[c].map(coefficients_df.value)
coefficients_template_file_name = model_settings['COEFFICIENT_TEMPLATE']
file_path = config.output_file_path(coefficients_template_file_name)
template_df.to_csv(file_path, index=True)
logger.info(f"wrote mapped coefficient template to {file_path}")
coefficients_file_name = model_settings['COEFFICIENTS']
file_path = config.output_file_path(coefficients_file_name)
coefficients_df.to_csv(file_path, index=True)
logger.info(f"wrote raw coefficients to {file_path}")
def get_segment_coefficients(model_settings, segment_name):
"""
Return a dict mapping generic coefficient names to segment-specific coefficient values
some specs mode_choice logsums have the same espression values with different coefficients for various segments
(e.g. eatout, .. ,atwork) and a template file that maps a flat list of coefficients into segment columns.
This allows us to provide a coefficient file with just the coefficients for a specific segment,
that works with generic coefficient names in the spec. For instance coef_ivt can take on the values
of segment-specific coefficients coef_ivt_school_univ, coef_ivt_work, coef_ivt_atwork,...
::
coefficients_df
value constrain
coefficient_name
coef_ivt_eatout_escort_... -0.0175 F
coef_ivt_school_univ -0.0224 F
coef_ivt_work -0.0134 F
coef_ivt_atwork -0.0188 F
template_df
coefficient_name eatout school school work
coef_ivt coef_ivt_eatout_escort_... coef_ivt_school_univ coef_ivt_school_univ coef_ivt_work
For school segment this will return the generic coefficient name with the segment-specific coefficient value
e.g. {'coef_ivt': -0.0224, ...}
...
"""
if 'COEFFICIENTS' in model_settings and 'COEFFICIENT_TEMPLATE' in model_settings:
legacy = False
elif 'COEFFICIENTS' in model_settings:
legacy = 'COEFFICIENTS'
warnings.warn("Support for COEFFICIENTS without COEFFICIENT_TEMPLATE in model settings file will be removed."
"Use COEFFICIENT and COEFFICIENT_TEMPLATE to support estimation.", FutureWarning)
elif 'LEGACY_COEFFICIENTS' in model_settings:
legacy = 'LEGACY_COEFFICIENTS'
warnings.warn("Support for 'LEGACY_COEFFICIENTS' setting in model settings file will be removed."
"Use COEFFICIENT and COEFFICIENT_TEMPLATE to support estimation.", FutureWarning)
else:
raise RuntimeError(f"No COEFFICIENTS setting in model_settings")
if legacy:
constants = config.get_model_constants(model_settings)
legacy_coeffs_file_path = config.config_file_path(model_settings[legacy])
omnibus_coefficients = pd.read_csv(legacy_coeffs_file_path, comment='#', index_col='coefficient_name')
coefficients_dict = assign.evaluate_constants(omnibus_coefficients[segment_name], constants=constants)
else:
coefficients_df = read_model_coefficients(model_settings)
template_df = read_model_coefficient_template(model_settings)
coefficients_col = template_df[segment_name].map(coefficients_df.value).astype(float)
if coefficients_col.isnull().any():
# show them the offending lines from interaction_coefficients_file
logger.warning(f"bad coefficients in COEFFICIENTS {model_settings['COEFFICIENTS']}\n"
f"{coefficients_col[coefficients_col.isnull()]}")
assert not coefficients_col.isnull().any()
coefficients_dict = coefficients_col.to_dict()
return coefficients_dict
def eval_nest_coefficients(nest_spec, coefficients, trace_label):
def replace_coefficients(nest):
if isinstance(nest, dict):
assert 'coefficient' in nest
coefficient_name = nest['coefficient']
if isinstance(coefficient_name, str):
assert coefficient_name in coefficients, "%s not in nest coefficients" % (coefficient_name, )
nest['coefficient'] = coefficients[coefficient_name]
assert 'alternatives' in nest
for alternative in nest['alternatives']:
if isinstance(alternative, dict):
replace_coefficients(alternative)
if isinstance(coefficients, pd.DataFrame):
assert ('value' in coefficients.columns)
coefficients = coefficients['value'].to_dict()
replace_coefficients(nest_spec)
logit.validate_nest_spec(nest_spec, trace_label)
return nest_spec
def eval_coefficients(spec, coefficients, estimator):
spec = spec.copy() # don't clobber input spec
if isinstance(coefficients, pd.DataFrame):
assert ('value' in coefficients.columns)
coefficients = coefficients['value'].to_dict()
assert isinstance(coefficients, dict), \
"eval_coefficients doesn't grok type of coefficients: %s" % (type(coefficients))
for c in spec.columns:
if c == SPEC_LABEL_NAME:
continue
spec[c] = spec[c].apply(lambda x: eval(str(x), {}, coefficients)).astype(np.float32)
# drop any rows with all zeros since they won't have any effect (0 marginal utility)
# (do not drop rows in estimation mode as it may confuse the estimation package (e.g. larch)
zero_rows = (spec == 0).all(axis=1)
if zero_rows.any():
if estimator:
logger.debug("keeping %s all-zero rows in SPEC" % (zero_rows.sum(),))
else:
logger.debug("dropping %s all-zero rows from SPEC" % (zero_rows.sum(), ))
spec = spec.loc[~zero_rows]
return spec
def eval_utilities(spec, choosers, locals_d=None, trace_label=None,
have_trace_targets=False, trace_all_rows=False,
estimator=None, trace_column_names=None, log_alt_losers=False):
"""
Parameters
----------
spec : pandas.DataFrame
A table of variable specifications and coefficient values.
Variable expressions should be in the table index and the table
should have a column for each alternative.
choosers : pandas.DataFrame
locals_d : Dict or None
This is a dictionary of local variables that will be the environment
for an evaluation of an expression that begins with @
trace_label: str
have_trace_targets: boolean - choosers has targets to trace
trace_all_rows: boolean - trace all chooser rows, bypassing tracing.trace_targets
estimator :
called to report intermediate table results (used for estimation)
trace_column_names: str or list of str
chooser columns to include when tracing expression_values
Returns
-------
"""
# fixme - restore tracing and _check_for_variability
trace_label = tracing.extend_trace_label(trace_label, 'eval_utils')
# avoid altering caller's passed-in locals_d parameter (they may be looping)
locals_dict = assign.local_utilities()
if locals_d is not None:
locals_dict.update(locals_d)
globals_dict = {}
locals_dict['df'] = choosers
# - eval spec expressions
if isinstance(spec.index, pd.MultiIndex):
# spec MultiIndex with expression and label
exprs = spec.index.get_level_values(SPEC_EXPRESSION_NAME)
else:
exprs = spec.index
expression_values = np.empty((spec.shape[0], choosers.shape[0]))
chunk.log_df(trace_label, "expression_values", expression_values)
i = 0
for expr, coefficients in zip(exprs, spec.values):
try:
with warnings.catch_warnings(record=True) as w:
# Cause all warnings to always be triggered.
warnings.simplefilter("always")
if expr.startswith('@'):
expression_value = eval(expr[1:], globals_dict, locals_dict)
else:
expression_value = choosers.eval(expr)
if len(w) > 0:
for wrn in w:
logger.warning(f"{trace_label} - {type(wrn).__name__} ({wrn.message}) evaluating: {str(expr)}")
except Exception as err:
logger.exception(f"{trace_label} - {type(err).__name__} ({str(err)}) evaluating: {str(expr)}")
raise err
if log_alt_losers:
# utils for each alt for this expression
# FIXME if we always did tis, we cold uem these and skip np.dot below
utils = np.outer(expression_value, coefficients)
losers = np.amax(utils, axis=1) < ALT_LOSER_UTIL
if losers.any():
logger.warning(f"{trace_label} - {sum(losers)} choosers of {len(losers)} "
f"with prohibitive utilities for all alternatives for expression: {expr}")
expression_values[i] = expression_value
i += 1
chunk.log_df(trace_label, "expression_values", expression_values)
if estimator:
df = pd.DataFrame(
data=expression_values.transpose(),
index=choosers.index,
columns=spec.index.get_level_values(SPEC_LABEL_NAME))
df.index.name = choosers.index.name
estimator.write_expression_values(df)
# - compute_utilities
utilities = np.dot(expression_values.transpose(), spec.astype(np.float64).values)
utilities = pd.DataFrame(data=utilities, index=choosers.index, columns=spec.columns)
chunk.log_df(trace_label, "utilities", utilities)
# sometimes tvpb will drop rows on the fly and we wind up with an empty
# table of choosers. this will just bypass tracing in that case.
if (trace_all_rows or have_trace_targets) and (len(choosers) > 0):
if trace_all_rows:
trace_targets = pd.Series(True, index=choosers.index)
else:
trace_targets = tracing.trace_targets(choosers)
assert trace_targets.any() # since they claimed to have targets...
# get int offsets of the trace_targets (offsets of bool=True values)
offsets = np.nonzero(list(trace_targets))[0]
# get array of expression_values
# expression_values.shape = (len(spec), len(choosers))
# data.shape = (len(spec), len(offsets))
data = expression_values[:, offsets]
# index is utility expressions (and optional label if MultiIndex)
expression_values_df = pd.DataFrame(data=data, index=spec.index)
if trace_column_names is not None:
if isinstance(trace_column_names, str):
trace_column_names = [trace_column_names]
expression_values_df.columns = pd.MultiIndex.from_frame(choosers.loc[trace_targets, trace_column_names])
tracing.trace_df(expression_values_df, tracing.extend_trace_label(trace_label, 'expression_values'),
slicer=None, transpose=False)
if len(spec.columns) > 1:
for c in spec.columns:
name = f'expression_value_{c}'
tracing.trace_df(expression_values_df.multiply(spec[c].values, axis=0),
tracing.extend_trace_label(trace_label, name),
slicer=None, transpose=False)
del expression_values
chunk.log_df(trace_label, "expression_values", None)
# no longer our problem - but our caller should re-log this...
chunk.log_df(trace_label, "utilities", None)
return utilities
def eval_variables(exprs, df, locals_d=None):
"""
Evaluate a set of variable expressions from a spec in the context
of a given data table.
There are two kinds of supported expressions: "simple" expressions are
evaluated in the context of the DataFrame using DataFrame.eval.
This is the default type of expression.
Python expressions are evaluated in the context of this function using
Python's eval function. Because we use Python's eval this type of
expression supports more complex operations than a simple expression.
Python expressions are denoted by beginning with the @ character.
Users should take care that these expressions must result in
a Pandas Series.
# FIXME - for performance, it is essential that spec and expression_values
# FIXME - not contain booleans when dotted with spec values
# FIXME - or the arrays will be converted to dtype=object within dot()
Parameters
----------
exprs : sequence of str
df : pandas.DataFrame
locals_d : Dict
This is a dictionary of local variables that will be the environment
for an evaluation of an expression that begins with @
Returns
-------
variables : pandas.DataFrame
Will have the index of `df` and columns of eval results of `exprs`.
"""
# avoid altering caller's passed-in locals_d parameter (they may be looping)
locals_dict = assign.local_utilities()
if locals_d is not None:
locals_dict.update(locals_d)
globals_dict = {}
locals_dict['df'] = df
def to_array(x):
if x is None or np.isscalar(x):
a = np.asanyarray([x] * len(df.index))
elif isinstance(x, pd.Series):
# fixme
# assert x.index.equals(df.index)
# save a little RAM
a = x.values
else:
a = x
# FIXME - for performance, it is essential that spec and expression_values
# FIXME - not contain booleans when dotted with spec values
# FIXME - or the arrays will be converted to dtype=object within dot()
if not np.issubdtype(a.dtype, np.number):
a = a.astype(np.int8)
return a
values = OrderedDict()
for expr in exprs:
try:
if expr.startswith('@'):
expr_values = to_array(eval(expr[1:], globals_dict, locals_dict))
else:
expr_values = to_array(df.eval(expr))
# read model spec should ensure uniqueness, otherwise we should uniquify
assert expr not in values
values[expr] = expr_values
except Exception as err:
logger.exception(f"Variable evaluation failed {type(err).__name__} ({str(err)}) evaluating: {str(expr)}")
raise err
values = util.df_from_dict(values, index=df.index)
return values
# no longer used because eval_utilities aggregates expression_values as they are computed to save space
# def compute_utilities(expression_values, spec):
#
# # matrix product of spec expression_values with utility coefficients of alternatives
# # sums the partial utilities (represented by each spec row) of the alternatives
# # resulting in a dataframe with one row per chooser and one column per alternative
# # pandas.dot depends on column names of expression_values matching spec index values
#
# # FIXME - for performance, it is essential that spec and expression_values
# # FIXME - not contain booleans when dotted with spec values
# # FIXME - or the arrays will be converted to dtype=object within dot()
#
# spec = spec.astype(np.float64)
#
# # pandas.dot depends on column names of expression_values matching spec index values
# # expressions should have been uniquified when spec was read
# # we could do it here if need be, and then set spec.index and expression_values.columns equal
# assert spec.index.is_unique
# assert (spec.index.values == expression_values.columns.values).all()
#
# utilities = expression_values.dot(spec)
#
# return utilities
def set_skim_wrapper_targets(df, skims):
"""
Add the dataframe to the SkimWrapper object so that it can be dereferenced
using the parameters of the skims object.
Parameters
----------
df : pandas.DataFrame
Table to which to add skim data as new columns.
`df` is modified in-place.
skims : SkimWrapper or Skim3dWrapper object, or a list or dict of skims
The skims object is used to contain multiple matrices of
origin-destination impedances. Make sure to also add it to the
locals_d below in order to access it in expressions. The *only* job
of this method in regards to skims is to call set_df with the
dataframe that comes back from interacting choosers with
alternatives. See the skims module for more documentation on how
the skims object is intended to be used.
"""
skims = skims if isinstance(skims, list) \
else skims.values() if isinstance(skims, dict) \
else [skims]
# assume any object in skims can be treated as a skim
for skim in skims:
try:
skim.set_df(df)
except AttributeError:
pass
def _check_for_variability(expression_values, trace_label):
"""
This is an internal method which checks for variability in each
expression - under the assumption that you probably wouldn't be using a
variable (in live simulations) if it had no variability. This is a
warning to the user that they might have constructed the variable
incorrectly. It samples 1000 rows in order to not hurt performance -
it's likely that if 1000 rows have no variability, the whole dataframe
will have no variability.
"""
if trace_label is None:
trace_label = '_check_for_variability'
sample = random_rows(expression_values, min(1000, len(expression_values)))
no_variability = has_missing_vals = 0
for i in range(len(sample.columns)):
v = sample.iloc[:, i]
if v.min() == v.max():
col_name = sample.columns[i]
logger.info("%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], col_name))
no_variability += 1
# FIXME - how could this happen? Not sure it is really a problem?
if np.count_nonzero(v.isnull().values) > 0:
col_name = sample.columns[i]
logger.info("%s: missing values in: %s" % (trace_label, col_name))
has_missing_vals += 1
if no_variability > 0:
logger.warning("%s: %s columns have no variability" % (trace_label, no_variability))
if has_missing_vals > 0:
logger.warning("%s: %s columns have missing values" % (trace_label, has_missing_vals))
def compute_nested_exp_utilities(raw_utilities, nest_spec):
"""
compute exponentiated nest utilities based on nesting coefficients
For nest nodes this is the exponentiated logsum of alternatives adjusted by nesting coefficient
leaf <- exp( raw_utility )
nest <- exp( ln(sum of exponentiated raw_utility of leaves) * nest_coefficient)
Parameters
----------
raw_utilities : pandas.DataFrame
dataframe with the raw alternative utilities of all leaves
(what in non-nested logit would be the utilities of all the alternatives)
nest_spec : dict
Nest tree dict from the model spec yaml file
Returns
-------
nested_utilities : pandas.DataFrame
Will have the index of `raw_utilities` and columns for exponentiated leaf and node utilities
"""
nested_utilities = pd.DataFrame(index=raw_utilities.index)
for nest in logit.each_nest(nest_spec, post_order=True):
name = nest.name
if nest.is_leaf:
# leaf_utility = raw_utility / nest.product_of_coefficients
nested_utilities[name] = \
raw_utilities[name].astype(float) / nest.product_of_coefficients
else:
# nest node
# the alternative nested_utilities will already have been computed due to post_order
# this will RuntimeWarning: divide by zero encountered in log
# if all nest alternative utilities are zero
# but the resulting inf will become 0 when exp is applied below
with np.errstate(divide='ignore'):
nested_utilities[name] = \
nest.coefficient * np.log(nested_utilities[nest.alternatives].sum(axis=1))
# exponentiate the utility
nested_utilities[name] = np.exp(nested_utilities[name])
return nested_utilities
def compute_nested_probabilities(nested_exp_utilities, nest_spec, trace_label):
"""
compute nested probabilities for nest leafs and nodes
probability for nest alternatives is simply the alternatives's local (to nest) probability
computed in the same way as the probability of non-nested alternatives in multinomial logit
i.e. the fractional share of the sum of the exponentiated utility of itself and its siblings
except in nested logit, its sib group is restricted to the nest
Parameters
----------
nested_exp_utilities : pandas.DataFrame
dataframe with the exponentiated nested utilities of all leaves and nodes
nest_spec : dict
Nest tree dict from the model spec yaml file
Returns
-------
nested_probabilities : pandas.DataFrame
Will have the index of `nested_exp_utilities` and columns for leaf and node probabilities
"""
nested_probabilities = pd.DataFrame(index=nested_exp_utilities.index)
for nest in logit.each_nest(nest_spec, type='node', post_order=False):
probs = logit.utils_to_probs(nested_exp_utilities[nest.alternatives],
trace_label=trace_label,
exponentiated=True,
allow_zero_probs=True)
nested_probabilities = pd.concat([nested_probabilities, probs], axis=1)
return nested_probabilities
def compute_base_probabilities(nested_probabilities, nests, spec):
"""
compute base probabilities for nest leaves
Base probabilities will be the nest-adjusted probabilities of all leaves
This flattens or normalizes all the nested probabilities so that they have the proper global
relative values (the leaf probabilities sum to 1 for each row.)
Parameters
----------
nested_probabilities : pandas.DataFrame
dataframe with the nested probabilities for nest leafs and nodes
nests : dict
Nest tree dict from the model spec yaml file
spec : pandas.Dataframe
simple simulate spec so we can return columns in appropriate order
Returns
-------
base_probabilities : pandas.DataFrame
Will have the index of `nested_probabilities` and columns for leaf base probabilities
"""
base_probabilities = pd.DataFrame(index=nested_probabilities.index)
for nest in logit.each_nest(nests, type='leaf', post_order=False):
# skip root: it has a prob of 1 but we didn't compute a nested probability column for it
ancestors = nest.ancestors[1:]
base_probabilities[nest.name] = nested_probabilities[ancestors].prod(axis=1)
# reorder alternative columns to match spec
# since these are alternatives chosen by column index, order of columns matters
assert(set(base_probabilities.columns) == set(spec.columns))
base_probabilities = base_probabilities[spec.columns]
return base_probabilities
def eval_mnl(choosers, spec, locals_d, custom_chooser, estimator,
log_alt_losers=False,
want_logsums=False, trace_label=None,
trace_choice_name=None, trace_column_names=None):
"""
Run a simulation for when the model spec does not involve alternative
specific data, e.g. there are no interactions with alternative
properties and no need to sample from alternatives.
Each row in spec computes a partial utility for each alternative,
by providing a spec expression (often a boolean 0-1 trigger)
and a column of utility coefficients for each alternative.
We compute the utility of each alternative by matrix-multiplication of eval results
with the utility coefficients in the spec alternative columns
yielding one row per chooser and one column per alternative
Parameters
----------
choosers : pandas.DataFrame
spec : pandas.DataFrame
A table of variable specifications and coefficient values.
Variable expressions should be in the table index and the table
should have a column for each alternative.
locals_d : Dict or None
This is a dictionary of local variables that will be the environment
for an evaluation of an expression that begins with @
custom_chooser : function(probs, choosers, spec, trace_label) returns choices, rands
custom alternative to logit.make_choices
estimator : Estimator object
called to report intermediate table results (used for estimation)
trace_label: str
This is the label to be used for trace log file entries and dump file names
when household tracing enabled. No tracing occurs if label is empty or None.
trace_choice_name: str
This is the column label to be used in trace file csv dump of choices
trace_column_names: str or list of str
chooser columns to include when tracing expression_values
Returns
-------
choices : pandas.Series
Index will be that of `choosers`, values will match the columns
of `spec`.
"""
# FIXME - not implemented because not currently needed
assert not want_logsums
trace_label = tracing.extend_trace_label(trace_label, 'eval_mnl')
have_trace_targets = tracing.has_trace_targets(choosers)
if have_trace_targets:
tracing.trace_df(choosers, '%s.choosers' % trace_label)
utilities = eval_utilities(spec, choosers, locals_d,
log_alt_losers=log_alt_losers,
trace_label=trace_label, have_trace_targets=have_trace_targets,
estimator=estimator, trace_column_names=trace_column_names)
chunk.log_df(trace_label, "utilities", utilities)
if have_trace_targets:
tracing.trace_df(utilities, '%s.utilities' % trace_label,
column_labels=['alternative', 'utility'])
probs = logit.utils_to_probs(utilities, trace_label=trace_label, trace_choosers=choosers)
chunk.log_df(trace_label, "probs", probs)
del utilities
chunk.log_df(trace_label, 'utilities', None)
if have_trace_targets:
# report these now in case make_choices throws error on bad_choices
tracing.trace_df(probs, '%s.probs' % trace_label,
column_labels=['alternative', 'probability'])
if custom_chooser:
choices, rands = custom_chooser(probs=probs, choosers=choosers, spec=spec,
trace_label=trace_label)
else:
choices, rands = logit.make_choices(probs, trace_label=trace_label)
del probs
chunk.log_df(trace_label, 'probs', None)
if have_trace_targets:
tracing.trace_df(choices, '%s.choices' % trace_label,
columns=[None, trace_choice_name])
tracing.trace_df(rands, '%s.rands' % trace_label,
columns=[None, 'rand'])
return choices
def eval_nl(choosers, spec, nest_spec, locals_d, custom_chooser, estimator,
log_alt_losers=False,
want_logsums=False, trace_label=None,
trace_choice_name=None, trace_column_names=None):
"""
Run a nested-logit simulation for when the model spec does not involve alternative
specific data, e.g. there are no interactions with alternative
properties and no need to sample from alternatives.
Parameters
----------
choosers : pandas.DataFrame
spec : pandas.DataFrame
A table of variable specifications and coefficient values.
Variable expressions should be in the table index and the table
should have a column for each alternative.
nest_spec:
dictionary specifying nesting structure and nesting coefficients
(from the model spec yaml file)
locals_d : Dict or None
This is a dictionary of local variables that will be the environment
for an evaluation of an expression that begins with @
custom_chooser : function(probs, choosers, spec, trace_label) returns choices, rands
custom alternative to logit.make_choices
estimator : Estimator object
called to report intermediate table results (used for estimation)
trace_label: str
This is the label to be used for trace log file entries and dump file names
when household tracing enabled. No tracing occurs if label is empty or None.
trace_choice_name: str
This is the column label to be used in trace file csv dump of choices
trace_column_names: str or list of str
chooser columns to include when tracing expression_values
Returns
-------
choices : pandas.Series
Index will be that of `choosers`, values will match the columns
of `spec`.
"""
trace_label = tracing.extend_trace_label(trace_label, 'eval_nl')
assert trace_label
have_trace_targets = tracing.has_trace_targets(choosers)
logit.validate_nest_spec(nest_spec, trace_label)
if have_trace_targets:
tracing.trace_df(choosers, '%s.choosers' % trace_label)
raw_utilities = eval_utilities(spec, choosers, locals_d,
log_alt_losers=log_alt_losers,
trace_label=trace_label, have_trace_targets=have_trace_targets,
estimator=estimator, trace_column_names=trace_column_names)
chunk.log_df(trace_label, "raw_utilities", raw_utilities)
if have_trace_targets:
tracing.trace_df(raw_utilities, '%s.raw_utilities' % trace_label,
column_labels=['alternative', 'utility'])
# exponentiated utilities of leaves and nests
nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec)
chunk.log_df(trace_label, "nested_exp_utilities", nested_exp_utilities)
del raw_utilities
chunk.log_df(trace_label, 'raw_utilities', None)
if have_trace_targets:
tracing.trace_df(nested_exp_utilities, '%s.nested_exp_utilities' % trace_label,
column_labels=['alternative', 'utility'])
# probabilities of alternatives relative to siblings sharing the same nest
nested_probabilities = \
compute_nested_probabilities(nested_exp_utilities, nest_spec, trace_label=trace_label)