In [1]:
import polars as pl
from IPython.display import display
import os

### Parameters

In [2]:
# Labels written to the "sameness_definition" column, one per matching rule.
# "Basics" is the shared core; every other label names the extra field(s)
# added on top of it.
SAMENESS_BASICS_WORD = "Basics"

# Basics plus one extra field.
SAMENESS_MONTE_CARLO_WORD = "Basics + tour_type_num"
SAMENESS_START_TIME_WORD = "Basics + start_time"
SAMENESS_DESTINATION_WORD = "Basics + destination"

# Basics plus the full schedule and location.
SAMENESS_SCHEDULE_LOCATION_WORD = "Basics + start_time + duration + origin + destination"

In [3]:
# Show the current working directory; the relative input/output paths used in this notebook resolve against it.
os.getcwd()

'/Users/wsp/Documents/GitHub/asim_eet_viz'

### Remote I/O

In [4]:
# Scenario folders and, inside each, the simulation-type folders.
base_dir_name, build_dir_name = "base", "build-transit"
eet_dir_name, mc_same_dir_name = "eet", "mc_same_seed"

# Tour table file name looked up in every scenario/simulation folder.
tour_file_name = "final_tours.csv"

# Root folders, given relative to the working directory.
input_dir = "./input/tour-mc-data/"
output_dir = "./notebooks/"

# Consolidated CSV produced by this notebook.
output_file = os.path.join(output_dir, "tour-mc-investigation-for-tableau.csv")

###  Data Reads

In [5]:
# Despite the name, df_list is a dict: it maps "<scenario>--<simulation type>"
# to the tour table read for that combination.
df_list = {}

# Explicit dtypes handed to pl.read_csv through schema_overrides so that every
# input file is parsed with the same column types.
# NOTE(review): identifier-like columns (person_id, household_id, origin,
# destination, ...) are read as Float64 while tour_id is Int64 -- presumably
# the CSVs store them with a decimal point or contain missing values; confirm.
column_types = {
    "person_id": pl.Float64,
    "tour_type": pl.Utf8,
    "tour_type_count": pl.Float64,
    "tour_type_num": pl.Float64,
    "tour_num": pl.Float64,
    "tour_count": pl.Float64,
    "tour_category": pl.Utf8,
    "number_of_participants": pl.Float64,
    "destination": pl.Float64,
    "origin": pl.Float64,
    "household_id": pl.Float64,
    "start": pl.Float64,
    "end": pl.Float64,
    "duration": pl.Float64,
    "school_esc_outbound": pl.Utf8,
    "school_esc_inbound": pl.Utf8,
    "num_escortees": pl.Float64,
    "tdd": pl.Float64,
    "tour_id_temp": pl.Float64,
    "composition": pl.Utf8,
    "is_external_tour": pl.Boolean,
    "is_internal_tour": pl.Boolean,
    "destination_logsum": pl.Float64,
    "vehicle_occup_1": pl.Utf8,
    "vehicle_occup_2": pl.Utf8,
    "vehicle_occup_3.5": pl.Utf8,
    "tour_mode": pl.Utf8,
    "mode_choice_logsum": pl.Float64,
    "selected_vehicle": pl.Utf8,
    "atwork_subtour_frequency": pl.Utf8,
    "parent_tour_id": pl.Float64,
    "stop_frequency": pl.Utf8,
    "primary_purpose": pl.Utf8,
    # The only integer-typed column; used later as the join key between base and build.
    "tour_id": pl.Int64,
}


# Read one tour table per (scenario, simulation type) pair, tag each row with
# the pair it came from, and store the frame under "<scenario>--<simulation type>".
for scen_name in (base_dir_name, build_dir_name):
    for sim_type_name in (eet_dir_name, mc_same_dir_name):
        in_file = os.path.join(input_dir, scen_name, sim_type_name, tour_file_name)

        df = pl.read_csv(in_file, schema_overrides=column_types).with_columns(
            scenario=pl.lit(scen_name),
            simulation_type=pl.lit(sim_type_name),
        )

        df_list[f"{scen_name}--{sim_type_name}"] = df

### Reductions

#### EET Reductions

In [6]:
def _reduce_eet_tours(tours_df, prefix):
    """Keep the tour columns used for matching and prefix them with a scenario tag.

    Parameters
    ----------
    tours_df : pl.DataFrame
        A tour table as read from ``final_tours.csv``.
    prefix : str
        Scenario tag (``"base"`` or ``"build"``) prepended to every output
        column name.

    Returns
    -------
    pl.DataFrame
        ``<prefix>_unique_id`` followed by the selected columns, each renamed
        to ``<prefix>_<name>``. ``start`` and ``end`` become
        ``<prefix>_start_time_index`` and ``<prefix>_end_time_index``.
    """
    # Source column -> output suffix, in output column order.
    kept_columns = {
        "household_id": "household_id",
        "person_id": "person_id",
        "tour_type": "tour_type",
        "primary_purpose": "primary_purpose",
        "tour_type_num": "tour_type_num",
        "tour_type_count": "tour_type_count",
        "tour_id": "tour_id",
        "tour_mode": "tour_mode",
        "duration": "duration",
        "start": "start_time_index",
        "end": "end_time_index",
        "origin": "origin",
        "destination": "destination",
    }
    unique_id = f"{prefix}_unique_id"
    return (
        tours_df.select(
            [pl.col(source).alias(f"{prefix}_{suffix}") for source, suffix in kept_columns.items()]
        )
        # offset=1 makes the row ids 1-based, which leaves 0 free for
        # process_sameness to use when it fills a missing build_unique_id.
        .with_row_index(name=unique_id, offset=1)
        .sort(unique_id)
    )


eet_base_df = _reduce_eet_tours(df_list[f"{base_dir_name}--{eet_dir_name}"], "base")

eet_build_df = _reduce_eet_tours(df_list[f"{build_dir_name}--{eet_dir_name}"], "build")

#### Free up some memory -- No Monte Carlo for now

In [7]:
# Release the dict of raw tour tables; the cells shown after this point use only eet_base_df and eet_build_df.
del df_list

#### Identify tours with the same error terms

In [8]:
# Pair each base tour with the build tour that has the same tour_id.
# The left join keeps base tours without such a build tour; their
# build_unique_id is null.
same_error_df = eet_base_df.join(
    eet_build_df,
    left_on="base_tour_id",
    right_on="build_tour_id",
    how="left",
).select(
    "base_unique_id",
    "build_unique_id",
    error_term_id=pl.col("base_tour_id"),
)

# For each base tour, flag whether its tour_id occurs anywhere in the build table.
base_error_term_in_build_df = eet_base_df.select(
    "base_unique_id",
    error_term_id=pl.col("base_tour_id"),
).with_columns(
    error_term_present_in_build=pl.col("error_term_id").is_in(
        eet_build_df["build_tour_id"].implode()
    )
)

#### Method to Assess Sameness across Different Definitions

In [9]:
def process_sameness(base_df, build_df, join_cols, sameness_word, same_error_df, base_error_term_in_build_df):
    """Match base tours to build tours under one sameness definition.

    Parameters
    ----------
    base_df, build_df : pl.DataFrame
        Reduced base/build tour tables carrying ``base_unique_id`` and
        ``build_unique_id`` respectively.
    join_cols : list[tuple[str, str]]
        ``(base_column, build_column)`` pairs that must be equal for two
        tours to count as "the same".
    sameness_word : str
        Label stored in the ``sameness_definition`` column.
    same_error_df : pl.DataFrame
        ``base_unique_id`` / ``build_unique_id`` pairs sharing a tour_id,
        with the id in ``error_term_id``.
    base_error_term_in_build_df : pl.DataFrame
        Per ``base_unique_id``, the flag ``error_term_present_in_build``.

    Returns
    -------
    pl.DataFrame
        For each base tour, its preferred candidate row (a candidate with
        the same error term first, then the lowest ``build_unique_id``),
        plus ``iia_violations`` (number of candidate rows minus one) and
        ``unique_error_terms`` (share of base rows with a distinct
        combination of the base join columns).
    """
    base_keys = [pair[0] for pair in join_cols]
    build_keys = [pair[1] for pair in join_cols]
    id_pair = ["base_unique_id", "build_unique_id"]

    # All (base, build) candidate pairs. Base tours without a candidate are
    # kept, flagged, and given build_unique_id 0.
    candidates = base_df.join(
        build_df,
        left_on=base_keys,
        right_on=build_keys,
        how="left",
        suffix="_build_join",
        coalesce=False,
    ).with_columns(
        pl.col("build_unique_id").fill_null(0),
        pl.col("build_unique_id").is_null().alias("nothing_comparable_in_build"),
    )

    # A candidate pair listed in same_error_df shares its error term.
    candidates = candidates.join(
        same_error_df, on=id_pair, how="left"
    ).with_columns(
        pl.col("error_term_id").is_not_null().alias("same_error_term")
    )

    temp_df = candidates.join(
        base_error_term_in_build_df, on="base_unique_id", how="left"
    ).with_columns(
        pl.lit(sameness_word).alias("sameness_definition")
    )

    # Estimate likely IIA violations: count the candidates per base tour and
    # keep the preferred one (same error term first, then lowest build id).
    ranked_df = temp_df.sort(
        [
            pl.col("base_unique_id"),
            pl.col("same_error_term").cast(pl.Int32).neg(),
            pl.col("build_unique_id"),
        ]
    )
    iia_violations_df = ranked_df.group_by("base_unique_id").agg(
        pl.len().alias("iia_violations"),
        pl.col("build_unique_id").first(),
    ).with_columns(
        pl.col("iia_violations").fill_null(0),
        pl.col("build_unique_id").fill_null(0),
    )

    # Count the number of unique error terms this definition is likely to have.
    unique_error_terms_count = base_df.select(base_keys).unique().height
    total_base_rows = base_df.height
    if total_base_rows > 0:
        unique_error_terms_percentage = unique_error_terms_count / total_base_rows
    else:
        unique_error_terms_percentage = 0

    # Keep only the preferred candidate of each base tour.
    result_df = temp_df.join(
        iia_violations_df, on=id_pair, how="left"
    ).filter(
        pl.col("iia_violations").is_not_null()
    ).with_columns(
        (pl.col("iia_violations") - 1).alias("iia_violations"),
        pl.lit(unique_error_terms_percentage).alias("unique_error_terms"),
    )

    return result_df

#### Sameness Definition 1: By household_id, person_id, tour_type, primary_purpose, and tour_type_num

In [10]:
# Definition 1 matches on the basics plus tour_type_num.
join_cols = [
    (f"base_{field}", f"build_{field}")
    for field in (
        "household_id",
        "person_id",
        "tour_type",
        "primary_purpose",
        "tour_type_num",
    )
]
same_01_df = process_sameness(
    base_df=eet_base_df,
    build_df=eet_build_df,
    join_cols=join_cols,
    sameness_word=SAMENESS_MONTE_CARLO_WORD,
    same_error_df=same_error_df,
    base_error_term_in_build_df=base_error_term_in_build_df,
)

#### Sameness Definition 2: By household_id, person_id, tour_type, primary_purpose, and start_time

In [11]:
# Definition 2 matches on the basics plus the start time index.
join_cols = [
    (f"base_{field}", f"build_{field}")
    for field in (
        "household_id",
        "person_id",
        "tour_type",
        "primary_purpose",
        "start_time_index",
    )
]
same_02_df = process_sameness(
    base_df=eet_base_df,
    build_df=eet_build_df,
    join_cols=join_cols,
    sameness_word=SAMENESS_START_TIME_WORD,
    same_error_df=same_error_df,
    base_error_term_in_build_df=base_error_term_in_build_df,
)

#### Sameness Definition 3: By household_id, person_id, tour_type, primary_purpose, start_time, duration, origin, and destination

In [12]:
# Definition 3 matches on the basics plus start time, duration, origin and destination.
join_cols = [
    (f"base_{field}", f"build_{field}")
    for field in (
        "household_id",
        "person_id",
        "tour_type",
        "primary_purpose",
        "start_time_index",
        "duration",
        "origin",
        "destination",
    )
]
same_03_df = process_sameness(
    base_df=eet_base_df,
    build_df=eet_build_df,
    join_cols=join_cols,
    sameness_word=SAMENESS_SCHEDULE_LOCATION_WORD,
    same_error_df=same_error_df,
    base_error_term_in_build_df=base_error_term_in_build_df,
)

#### Sameness Definition 4: By household_id, person_id, tour_type, primary_purpose, and destination

In [13]:
# Definition 4 matches on the basics plus destination.
join_cols = [
    (f"base_{field}", f"build_{field}")
    for field in (
        "household_id",
        "person_id",
        "tour_type",
        "primary_purpose",
        "destination",
    )
]
same_04_df = process_sameness(
    base_df=eet_base_df,
    build_df=eet_build_df,
    join_cols=join_cols,
    sameness_word=SAMENESS_DESTINATION_WORD,
    same_error_df=same_error_df,
    base_error_term_in_build_df=base_error_term_in_build_df,
)

#### Sameness Definition 5: By household_id, person_id, tour_type, primary_purpose

In [14]:
# Definition 5 matches on the basics only.
join_cols = [
    (f"base_{field}", f"build_{field}")
    for field in (
        "household_id",
        "person_id",
        "tour_type",
        "primary_purpose",
    )
]
same_05_df = process_sameness(
    base_df=eet_base_df,
    build_df=eet_build_df,
    join_cols=join_cols,
    sameness_word=SAMENESS_BASICS_WORD,
    same_error_df=same_error_df,
    base_error_term_in_build_df=base_error_term_in_build_df,
)

#### Consolidate Sameness Results

In [15]:
# Stack the five per-definition results, then add base-vs-build comparison
# flags and a label classifying how each match turned out.
output_df = pl.concat(
    [same_01_df, same_02_df, same_03_df, same_04_df, same_05_df]
).with_columns(
    # is_<label>_same: base and build values agree for that field.
    *[
        (pl.col(f"base_{field}") == pl.col(f"build_{field}")).alias(f"is_{label}_same")
        for field, label in [
            ("tour_id", "tour_id"),
            ("duration", "duration"),
            ("tour_type_num", "tour_type_num"),
            ("start_time_index", "start_time"),
            ("end_time_index", "end_time"),
        ]
    ],
    # Branches are tried in order; a row matching none of them is labelled "ERROR".
    pl.when(pl.col("same_error_term"))
    .then(pl.lit("Success: Correct match to build"))
    .when(~pl.col("same_error_term") & pl.col("error_term_present_in_build"))
    .then(pl.lit("Failure: Incorrect match to build"))
    .when(
        ~(pl.col("same_error_term") | pl.col("error_term_present_in_build"))
        & pl.col("nothing_comparable_in_build")
    )
    .then(pl.lit("Success: Nothing comparable in build"))
    .when(
        ~(
            pl.col("same_error_term")
            | pl.col("error_term_present_in_build")
            | pl.col("nothing_comparable_in_build")
        )
    )
    .then(pl.lit("Failure: Error term not generated in build"))
    .otherwise(pl.lit("ERROR"))
    .alias("eet_performance"),
)


#### High-level Outcome

In [16]:
# Widen the polars table display so the summary tables are not truncated.
pl.Config.set_tbl_rows(50)
pl.Config.set_tbl_cols(50)
pl.Config.set_fmt_str_lengths(75)

# Row counts per (sameness definition, EET outcome).
display("Sameness Definition vs EET Performance:")
display(
    output_df
    .group_by("sameness_definition", "eet_performance")
    .len()
    .sort("sameness_definition", "eet_performance")
)

'Sameness Definition vs EET Performance:'

sameness_definition,eet_performance,len
str,str,u32
"""Basics""","""Failure: Error term not generated in build""",6284
"""Basics""","""Failure: Incorrect match to build""",4
"""Basics""","""Success: Correct match to build""",4500845
"""Basics""","""Success: Nothing comparable in build""",3872
"""Basics + destination""","""Failure: Error term not generated in build""",1589
"""Basics + destination""","""Failure: Incorrect match to build""",74225
"""Basics + destination""","""Success: Correct match to build""",4426624
"""Basics + destination""","""Success: Nothing comparable in build""",8567
"""Basics + start_time""","""Failure: Error term not generated in build""",1032
"""Basics + start_time""","""Failure: Incorrect match to build""",38665


In [17]:
# Row counts per (sameness definition, number of IIA violations).
display("Sameness Definition vs IIA Violations:")
display(
    output_df
    .group_by("sameness_definition", "iia_violations")
    .len()
    .sort("sameness_definition", "iia_violations")
)

'Sameness Definition vs IIA Violations:'

sameness_definition,iia_violations,len
str,u32,u32
"""Basics""",0,3218333
"""Basics""",1,843129
"""Basics""",2,191296
"""Basics""",3,159320
"""Basics""",4,61221
"""Basics""",5,29503
"""Basics""",6,5344
"""Basics""",7,2081
"""Basics""",8,580
"""Basics""",9,198


In [18]:
# unique_error_terms is constant within a definition, so the first value of each group is enough.
display("Sameness Definition vs Unique Error Terms:")
display(
    output_df
    .group_by("sameness_definition")
    .agg(pl.col("unique_error_terms").first())
    .sort("sameness_definition")
)


'Sameness Definition vs Unique Error Terms:'

sameness_definition,unique_error_terms
str,f64
"""Basics""",0.833356
"""Basics + destination""",0.929703
"""Basics + start_time""",0.992803
"""Basics + start_time + duration + origin + destination""",0.999894
"""Basics + tour_type_num""",0.999931


#### Write to CSV for Tableau

In [19]:
# Write the consolidated sameness table (all definitions stacked) to output_file as CSV.
output_df.write_csv(output_file)

### Deep Dive/Debug Investigation

#### Question #1: Why does the Monte Carlo method fail? It should be perfect.

In [20]:
# Rows of the Monte Carlo definition whose outcome label contains "Failure".
# Predicates passed separately to filter() are combined with AND.
working_df = output_df.filter(
    pl.col("sameness_definition") == SAMENESS_MONTE_CARLO_WORD,
    pl.col("eet_performance").str.contains("Failure"),
)

display("Primary Purpose distribution for Monte Carlo method failures:")
display(
    working_df
    .group_by("base_primary_purpose")
    .len()
    .sort("base_primary_purpose")
)


'Primary Purpose distribution for Monte Carlo method failures:'

base_primary_purpose,len
str,u32
"""atwork""",1
"""eatout""",1
"""escort""",5292
"""othdiscr""",3
"""othmaint""",2
"""shopping""",2


##### Look into Shopping Failures

In [21]:
# Narrow the Monte Carlo failures down to shopping tours and list which rows/households they are.
df_shopping_failures = working_df.filter(
    pl.col("base_primary_purpose") == "shopping"
)
display(df_shopping_failures.select("base_unique_id", "base_household_id"))


base_unique_id,base_household_id
u32,f64
1189351,302481.0
1189352,302481.0


In [22]:
# Household / person / tour type inspected in this and the next cell.
DEBUG_HH_ID = 302481
DEBUG_PERSON_ID = 858708
DEBUG_PURPOSE = "shopping"

# Base-scenario tours of that person and tour type, trimmed to the id and timing columns.
debug_base_df = eet_base_df.filter(
    pl.col("base_household_id") == DEBUG_HH_ID,
    pl.col("base_person_id") == DEBUG_PERSON_ID,
    pl.col("base_tour_type") == DEBUG_PURPOSE,
).select(
    "base_household_id",
    "base_person_id",
    "base_tour_type",
    "base_primary_purpose",
    "base_tour_id",
    "base_duration",
    "base_start_time_index",
    "base_end_time_index",
)
display("Debug Base DataFrame (Shopping):")
display(debug_base_df)


'Debug Base DataFrame (Shopping):'

base_household_id,base_person_id,base_tour_type,base_primary_purpose,base_tour_id,base_duration,base_start_time_index,base_end_time_index
f64,f64,str,str,i64,f64,f64,f64
302481.0,858708.0,"""shopping""","""shopping""",42935370,1.0,21.0,22.0


In [23]:
# Build-scenario tours for the same household, person and tour type (all columns).
debug_build_df = eet_build_df.filter(
    pl.col("build_household_id") == DEBUG_HH_ID,
    pl.col("build_person_id") == DEBUG_PERSON_ID,
    pl.col("build_tour_type") == DEBUG_PURPOSE,
)
display("Debug Build DataFrame (Shopping):")
display(debug_build_df)

'Debug Build DataFrame (Shopping):'

build_unique_id,build_household_id,build_person_id,build_tour_type,build_primary_purpose,build_tour_type_num,build_tour_type_count,build_tour_id,build_tour_mode,build_duration,build_start_time_index,build_end_time_index,build_origin,build_destination
u32,f64,f64,str,str,f64,f64,i64,str,f64,f64,f64,f64,f64
1189428,302481.0,858708.0,"""shopping""","""shopping""",1.0,1.0,42935369,"""SHARED3""",1.0,28.0,29.0,14843.0,12724.0


The base and build `tour_id`s differ (42935370 vs. 42935369), so the two tours do not share an error term. Perhaps this is a joint tour, which would explain the different `tour_id`s?

#### Question #2: Illustrate bad matches from the base (Monte Carlo) method

This example also illustrates the 'Failure: Error term not generated in build' error. Base tour number 5 matches, in terms of time and location, build tour number 3. But because there are only four tours in the build scenario, the error term for the fifth base tour is not generated in the build scenario.

In [24]:
# Household / person / tour type inspected for Question #2
# (these overwrite the values set for Question #1).
DEBUG_HH_ID = 239
DEBUG_PERSON_ID = 641
DEBUG_PURPOSE = "escort"

# Base-scenario tours of that person and tour type.
debug_base_df_q2 = eet_base_df.filter(
    pl.col("base_household_id") == DEBUG_HH_ID,
    pl.col("base_person_id") == DEBUG_PERSON_ID,
    pl.col("base_tour_type") == DEBUG_PURPOSE,
)
display("Debug Base DataFrame (Escort - Q2):")
display(debug_base_df_q2)

# Build-scenario tours for the same household, person and tour type.
debug_build_df_q2 = eet_build_df.filter(
    pl.col("build_household_id") == DEBUG_HH_ID,
    pl.col("build_person_id") == DEBUG_PERSON_ID,
    pl.col("build_tour_type") == DEBUG_PURPOSE,
)
display("Debug Build DataFrame (Escort - Q2):")
display(debug_build_df_q2)



'Debug Base DataFrame (Escort - Q2):'

base_unique_id,base_household_id,base_person_id,base_tour_type,base_primary_purpose,base_tour_type_num,base_tour_type_count,base_tour_id,base_tour_mode,base_duration,base_start_time_index,base_end_time_index,base_origin,base_destination
u32,f64,f64,str,str,f64,f64,i64,str,f64,f64,f64,f64,f64
836,239.0,641.0,"""escort""","""escort""",1.0,6.0,32085,"""SHARED3""",1.0,8.0,9.0,322.0,23872.0
837,239.0,641.0,"""escort""","""escort""",2.0,6.0,32086,"""SHARED3""",1.0,9.0,10.0,322.0,5702.0
838,239.0,641.0,"""escort""","""escort""",3.0,6.0,32087,"""WALK""",1.0,18.0,19.0,322.0,11186.0
839,239.0,641.0,"""escort""","""escort""",4.0,6.0,32088,"""SHARED3""",4.0,19.0,23.0,322.0,5702.0
840,239.0,641.0,"""escort""","""escort""",5.0,6.0,32089,"""SHARED2""",1.0,23.0,24.0,322.0,23872.0
841,239.0,641.0,"""escort""","""escort""",6.0,6.0,32090,"""WALK""",1.0,25.0,26.0,322.0,11186.0


'Debug Build DataFrame (Escort - Q2):'

build_unique_id,build_household_id,build_person_id,build_tour_type,build_primary_purpose,build_tour_type_num,build_tour_type_count,build_tour_id,build_tour_mode,build_duration,build_start_time_index,build_end_time_index,build_origin,build_destination
u32,f64,f64,str,str,f64,f64,i64,str,f64,f64,f64,f64,f64
836,239.0,641.0,"""escort""","""escort""",1.0,4.0,32085,"""SHARED3""",1.0,8.0,9.0,322.0,23872.0
837,239.0,641.0,"""escort""","""escort""",2.0,4.0,32086,"""WALK""",4.0,18.0,22.0,322.0,1834.0
838,239.0,641.0,"""escort""","""escort""",3.0,4.0,32087,"""WALK""",1.0,23.0,24.0,322.0,23872.0
839,239.0,641.0,"""escort""","""escort""",4.0,4.0,32088,"""WALK""",2.0,24.0,26.0,322.0,1834.0
