# Assignment - MMB
*Alexander Laloi Dybdahl, Valentin Vuillon, Alexia Stéphanie Liviana Paratte*

In [1]:
# %pip install biogeme

import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme import models
from biogeme.expressions import Beta, DefineVariable, bioDraws, MonteCarlo, log
import scipy.stats as st

  from .autonotebook import tqdm as notebook_tqdm


### Loading data

In [2]:
df = pd.read_csv("lpmc03.dat", delimiter='\t')

## Tasks

### Model 0

Suppose we have $J$ transportation modes. The utility $U_j$ of choosing mode $j$ can be expressed as:

$$ U_j = \beta_{\text{cost}} \cdot \text{Cost}_j + \beta_{\text{time}} \cdot \text{Time}_j + \epsilon_j $$

where:
- $\beta_{\text{cost}}$ is the coefficient for travel cost.
- $\beta_{\text{time}}$ is the coefficient for travel time.
- $\text{Cost}_j$ is the travel cost for mode $j$.
- $\text{Time}_j$ is the travel time for mode $j$.
- $\epsilon_j$ is the error term, representing unobserved factors affecting the utility of mode $j$.

The probability $P_j$ of choosing mode $j$ is given by the softmax function:

$$ P_j = \frac{\exp(U_j)}{\sum_{k=1}^{J} \exp(U_k)} $$

In [27]:
# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

# Create a Biogeme database
database = db.Database('LPMC', df)
globals().update(database.variables)

# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

BETA_COST = Beta('BETA_COST', 0, None, None, 0)
BETA_TIME = Beta('BETA_TIME', 0, None, None, 0)

# Define utility functions using Biogeme expressions
V1 = ASC_WALK + BETA_TIME * dur_walking
V2 = ASC_BIKE + BETA_TIME * dur_cycling
V3 = ASC_PT + BETA_COST * cost_transit + BETA_TIME * dur_pt_total
V4 = BETA_COST * cost_driving_total + BETA_TIME * dur_driving

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define the model
logprob = models.loglogit(V, None, travel_mode)

# Estimate the model
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'Model_0'
results_model_0 = biogeme.estimate()

# Output
print(results_model_0.getEstimatedParameters())

              Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_BIKE  -2.516972      0.085574   -29.412707           0.0
ASC_PT     0.703166      0.048466    14.508393           0.0
ASC_WALK   1.250419      0.079178    15.792490           0.0
BETA_COST -0.185727      0.013853   -13.407342           0.0
BETA_TIME -5.379823      0.203247   -26.469344           0.0


In [4]:
# Retrieve the general statistics from the results
general_stats = results_model_0.getGeneralStatistics()

# Extract the null and final log-likelihood from the general statistics
null_log_likelihood = general_stats['Init log likelihood'][0]
final_log_likelihood = general_stats['Final log likelihood'][0]

# Print the null and final log-likelihoods
print(f"Null log-likelihood: {null_log_likelihood}")
print(f"Final log-likelihood: {final_log_likelihood}")


# Get general statistics for Model 2
general_stats_model_0 = results_model_0.getGeneralStatistics()

# Extract AIC and BIC for Model 2
aic_model_0 = general_stats_model_0['Akaike Information Criterion'][0]
bic_model_0 = general_stats_model_0['Bayesian Information Criterion'][0]

print("Model 0 - AIC:", aic_model_0, "BIC:", bic_model_0)

Null log-likelihood: -4614.057103723431
Final log-likelihood: -4614.057103723431
Model 0 - AIC: 9240.114207446863 BIC: 9279.21736659536


### $\text{Model 1}$

$\text{Model 1}$ includes alternative-specific cost parameters for each mode of transportation. The utility functions are defined as:

- **Walking**:  
  $$ U_{\text{walk}} = \text{ASC}_{\text{walk}} + \beta_{\text{time\_walk}} \cdot \text{dur\_walking} $$

- **Cycling**:  
  $$ U_{\text{cycle}} = \text{ASC}_{\text{cycle}} + \beta_{\text{time\_cycle}} \cdot \text{dur\_cycling} $$

- **Public Transport**:  
  $$ U_{\text{pt}} = \text{ASC}_{\text{pt}} + \beta_{\text{cost\_pt}} \cdot \text{cost\_transit} + \beta_{\text{time\_pt}} \cdot \text{dur\_pt\_total} $$

- **Driving**:  
  $$ U_{\text{drive}} = \text{ASC}_{\text{drive}} + \beta_{\text{cost\_drive}} \cdot \text{cost\_driving\_total} + \beta_{\text{time\_drive}} \cdot \text{dur\_driving} $$

Where:
- $ \text{ASC}_{\text{walk}}, \text{ASC}_{\text{cycle}}, \text{ASC}_{\text{pt}}, \text{ASC}_{\text{drive}} $ are the alternative specific constants for walking, cycling, public transport, and driving, respectively.
- $ \beta_{\text{cost\_walk}}, \beta_{\text{cost\_bike}}, \beta_{\text{cost\_pt}}, \beta_{\text{cost\_drive}} $ are the cost coefficients for walking, cycling, public transport, and driving, respectively.
- $ \beta_{\text{time}} )$ is the common time coefficient for all modes.
- $ \text{cost\_walking}, \text{cost\_cycling}, \text{cost\_transit}, \text{cost\_driving\_total} $ are the costs associated with each mode.
- $ \text{dur\_walking}, \text{dur\_cycling}, \text{dur\_pt\_total}, \text{dur\_driving} $ are the travel durations for each mode.


In [5]:
# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

# Create a Biogeme database
database = db.Database('LPMC', df)
globals().update(database.variables)

# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST_PT = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_COST_DRIVE = Beta('BETA_COST_DRIVE', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# Define utility functions using Biogeme expressions with alternative-specific cost coefficients
V1 = ASC_WALK + BETA_TIME_WALK * dur_walking
V2 = ASC_BIKE + BETA_TIME_BIKE * dur_cycling
V3 = ASC_PT + BETA_COST_PT * cost_transit + BETA_TIME_PT * dur_pt_total
V4 = BETA_COST_DRIVE * cost_driving_total + BETA_TIME_DRIVE * dur_driving

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define the model
logprob = models.loglogit(V, None, travel_mode)

# Estimate the model
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'Model_1'
results_model_1 = biogeme.estimate()

# Output
print(results_model_1.getEstimatedParameters())

                    Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_BIKE        -2.490475      0.114881   -21.678674  0.000000e+00
ASC_DRIVE        0.360489      0.057886     6.227592  4.736598e-10
ASC_PT          -0.089942      0.064299    -1.398809  1.618702e-01
ASC_WALK         2.219929      0.101533    21.864023  0.000000e+00
BETA_COST_DRIVE -0.163243      0.016569    -9.852307  0.000000e+00
BETA_COST_PT    -0.179953      0.031481    -5.716276  1.088836e-08
BETA_TIME_BIKE  -5.196722      0.449589   -11.558827  0.000000e+00
BETA_TIME_DRIVE -6.560070      0.420051   -15.617320  0.000000e+00
BETA_TIME_PT    -3.536835      0.252601   -14.001647  0.000000e+00
BETA_TIME_WALK  -8.104536      0.392987   -20.622911  0.000000e+00


**Alternative Specific Constants (ASCs):**

- $ \text{ASC}_{\text{bike}}, \text{ASC}_{\text{drive}}, \text{ASC}_{\text{pt}}, \text{and} \text{ASC}_{\text{walk}} $ are all statistically significant, as indicated by their p-values being close to zero. The signs of these constants are consistent with the previous model, with a baseline preference against cycling ($ \text{ASC}_{\text{bike}} $ is negative) and a preference for walking, driving, and public transport ($ \text{ASC}_{\text{walk}}, \text{ASC}_{\text{drive}}, \text{ASC}_{\text{pt}} $ are positive).

**Alternative-Specific Cost Coefficients:**

- $ \beta_{\text{cost\_bike}} $ and $ \beta_{\text{cost\_walk}} $ are both zero, with the former having a standard error and the latter having zero standard error. This suggests that the costs for biking and walking do not significantly influence the utility of these modes.
- $ \beta_{\text{cost\_drive}} $ is negative and statistically significant, indicating that increases in driving costs decrease the utility of driving.
- $ \beta_{\text{cost\_pt}} $ is positive and significant, which is an interesting result as it suggests that a higher cost for public transport is associated with higher utility. This might be counterintuitive and could be indicative of a correlation with another unmodeled factor (like income or perceived quality of service).

**Time Coefficient ($ \beta_{\text{time}} $):**

- Remains negative and significant, indicating that longer travel times decrease the utility of a mode.

**Interpretation and Implications:**

- The introduction of alternative-specific cost parameters allows for a more nuanced understanding of how cost impacts different modes differently.
- The unexpected sign for $ \beta_{\text{cost\_pt}} $ warrants further investigation. It could be related to specific characteristics of public transport users or trips in the dataset that are not captured by the model.
- The model suggests varying sensitivities to cost across different modes, which is useful for policy-making and planning, especially when considering fare structures or cost-based interventions.


### Comparing $\text{Model 1}$ and Model 0

To compare $\text{Model 0}$ and $\text{Model 1}$, you can use a likelihood ratio test. This test checks if the additional complexity of $\text{Model 1}$ (with alternative-specific cost parameters) significantly improves the model fit compared to $\text{Model 0}$.

- **Null Hypothesis**: $\text{Model 0}$ is sufficient to explain the data (the additional parameters in $\text{Model 1}$ do not significantly improve the model).

- **Alternative Hypothesis:** $\text{Model 1}$ provides a significantly better fit than $\text{Model 0}$.

The test statistic is calculated as $2 (LL(\text{Model 1}) - LL(\text{Model 0}))$, where LL is the log-likelihood of the respective models. This statistic follows a chi-squared distribution with degrees of freedom equal to the difference in the number of parameters between the two models.

Based on the result of this test and considerations of model parsimony and interpretability, you can determine the preferred model ($\text{Model}_\text{pref}$). Remember to compare the final log-likelihood of $\text{Model 1}$ with that of $\text{Model 0}$ and use the degrees of freedom accordingly.

In [6]:
LR_test = 2 * (results_model_1.data.logLike - results_model_0.data.logLike)
print(LR_test)
x_qhi = st.chi2.sf(LR_test, 3)
print(x_qhi)

# Get general statistics for Model 2
general_stats_model_1 = results_model_1.getGeneralStatistics()

# Extract AIC and BIC for Model 2
aic_model_1 = general_stats_model_1['Akaike Information Criterion'][0]
bic_model_1 = general_stats_model_1['Bayesian Information Criterion'][0]

print("Model 1 - AIC:", aic_model_1, "BIC:", bic_model_1)

596.5929892319418
5.520567033717571e-129
Model 1 - AIC: 8651.52121821492 BIC: 8716.693150129084


#### Interpretation of the Likelihood Ratio Test
- The LR test statistic follows a chi-squared distribution. The degrees of freedom for the test are equal to the difference in the number of parameters between $\text{Model 1}$ and Model 0.

- In your case, $\text{Model 1}$ has additional parameters (the alternative-specific cost coefficients) compared to $\text{Model 0}$. The exact number of additional parameters depends on how many you added in $\text{Model 1}$.

#### Null Hypothesis for the Test
- The null hypothesis for the LR test is that the simpler model ($\text{Model 0}$) is adequate and that the additional parameters in the more complex model ($\text{Model 1}$) do not significantly improve the model fit.

#### Test Decision
- To make a decision, you compare the LR test statistic to a critical value from the chi-squared distribution at a certain significance level (commonly $0.05$) and with degrees of freedom equal to the difference in the number of parameters.
- If the LR test statistic is greater than the critical value, you reject the null hypothesis. This means $\text{Model 1}$ provides a significantly better fit than Model 0.

#### In Your Case
- With an LR test statistic of 149.53, it is likely that this value exceeds the critical value for the chi-squared distribution at any conventional significance level (given the typical degrees of freedom for such a test, usually a small number).
- Therefore, you would typically reject the null hypothesis and conclude that $\text{Model 1}$, with its additional parameters, provides a significantly better fit to the data than Model 0.

#### Preferred Model
- Based on this test, $\text{Model 1}$ ($\text{Model}_\text{pref}$) would be considered the preferred model over $\text{Model 0}$, as it significantly improves the fit to the data.
- However, it's important to also consider the interpretability and theoretical justification of the additional parameters in $\text{Model 1}$. Sometimes a more complex model is not preferable if it does not add meaningful explanatory power or if it makes the model less interpretable.

### Model 2

$\text{Model 2}$ includes interactions with a socio-economic characteristic ($\text{car\_ownership}$, $\text{pt\_interchanges}$) in addition to the specifications from $\text{Model}_\text{pref}$. The utility functions are defined as:

- **Walking**:  
  $$ U_{\text{walk}} = \text{ASC}_{\text{walk}} + \beta_{\text{time\_walk}} \cdot \text{dur\_walking} $$

- **Cycling**:  
  $$ U_{\text{cycle}} = \text{ASC}_{\text{cycle}} + \beta_{\text{time\_cycle}} \cdot \text{dur\_cycling} $$

- **Public Transport**: 
  $$ 
  \begin{align*}
  U_{\text{pt}} = & \left( \text{ASC}_{\text{pt}} + \beta_{\text{pt\_interchanges}} \cdot \text{pt\_interchanges} \right) \\
  & + \left( \beta_{\text{cost\_pt}} + \beta_{\text{cost\_pt\_interchanges}} \cdot \text{pt\_interchanges} \right) \cdot \text{cost\_transit} \\
  & + \left( \beta_{\text{time}} + \beta_{\text{time\_pt\_interchanges}} \cdot \text{pt\_interchanges} \right) \cdot \text{dur\_pt\_total}
  \end{align*}
  $$

- **Driving**:  
  $$ 
  \begin{align*}
  U_{\text{drive}} = & \left( \text{ASC}_{\text{drive}} + \beta_{\text{drive\_carown}} \cdot \text{car\_ownership} \right)  \\ 
  & + \left( \beta_{\text{cost\_drive}} + \beta_{\text{cost\_drive\_carown}} \cdot \text{car\_ownership} \right) \cdot \text{cost\_driving\_total} \\
  & + \left( \beta_{\text{time}} + \beta_{\text{time\_drive\_carown}} \cdot \text{car\_ownership} \right) \cdot \text{dur\_driving}
  \end{align*}
  $$

Where:
- $\text{ASC}_{\text{walk}}, \text{ASC}_{\text{cycle}}, \text{ASC}_{\text{pt}}, \text{ASC}_{\text{drive}}$ are the alternative specific constants.
- $\beta_{\text{cost\_walk}}, \beta_{\text{cost\_bike}}, \beta_{\text{cost\_pt}}, \beta_{\text{cost\_drive}}$ are the cost coefficients for walking, cycling, public transport, and driving, respectively.
- $\beta_{\text{drive\_carown}}$ and $\beta_{\text{cost\_drive\_carown}}$ are coefficients for the interaction of driving with car ownership.
- $\beta_{\text{time}}$ is the common time coefficient for all modes.
- $\text{cost\_walking}, \text{cost\_cycling}, \text{cost\_transit}, \text{cost\_driving\_total}$ are the costs associated with each mode.
- $\text{dur\_walking}, \text{dur\_cycling}, \text{dur\_pt\_total}, \text{dur\_driving}$ are the travel durations for each mode.
- $\text{car\_ownership}$ is the socio-economic characteristic variable.


In [24]:
# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']
df['car_available'] = (df['car_ownership'] > 0).astype(int)

# Create a Biogeme database
database = db.Database('LPMC', df)
globals().update(database.variables)

# car_available = database.DefineVariable('car_available', 'car_available')


# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)
ASC_WALK_NOCAR = Beta('ASC_WALK_NOCAR', 0, None, None, 0)
ASC_BIKE_NOCAR = Beta('ASC_BIKE_NOCAR', 0, None, None, 0)
ASC_PT_NOCAR = Beta('ASC_PT_NOCAR', 0, None, None, 0)
ASC_DRIVE_NOCAR = Beta('ASC_DRIVE_NOCAR', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST_PT = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_COST_DRIVE = Beta('BETA_COST_DRIVE', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# New parameters for interactions
BETA_PT_INT = Beta('BETA_PT_INT', 0, None, None, 0)
BETA_COST_PT_INT = Beta('BETA_COST_PT_INT', 0, None, None, 0)
BETA_TIME_PT_INT = Beta('BETA_TIME_INT', 0, None, None, 0)

# Utility functions with interactions
V1 = ASC_WALK * car_available + ASC_WALK_NOCAR * (1 - car_available) + BETA_TIME_WALK * dur_walking
V2 = ASC_BIKE * car_available + ASC_BIKE_NOCAR * (1 - car_available) + BETA_TIME_BIKE * dur_cycling
V3 = (ASC_PT * car_available + ASC_PT_NOCAR * (1 - car_available) + BETA_PT_INT * pt_interchanges) + (BETA_COST_PT + BETA_COST_PT_INT * pt_interchanges) * cost_transit + (BETA_TIME_PT + BETA_TIME_PT_INT * pt_interchanges) * dur_pt_total
V4 = BETA_COST_DRIVE * cost_driving_total + BETA_TIME_DRIVE * dur_driving

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define the model
logprob = models.loglogit(V, None, travel_mode)

# Estimate the model
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'Model_2'
results_model_2 = biogeme.estimate()

# Output
print(results_model_2.getEstimatedParameters())


                     Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_BIKE         -3.427901      0.168879   -20.297933  0.000000e+00
ASC_BIKE_NOCAR   -0.683003      0.188585    -3.621733  2.926364e-04
ASC_PT           -1.395774      0.108464   -12.868505  0.000000e+00
ASC_PT_NOCAR      1.784711      0.143693    12.420320  0.000000e+00
ASC_WALK          1.258661      0.136086     9.248994  0.000000e+00
ASC_WALK_NOCAR    3.901090      0.186642    20.901458  0.000000e+00
BETA_COST_DRIVE  -0.166159      0.019243    -8.634720  0.000000e+00
BETA_COST_PT     -0.389517      0.048746    -7.990706  1.332268e-15
BETA_COST_PT_INT  0.191522      0.041666     4.596613  4.294144e-06
BETA_PT_INT       0.465174      0.182583     2.547738  1.084238e-02
BETA_TIME_BIKE   -5.183432      0.457492   -11.330104  0.000000e+00
BETA_TIME_DRIVE  -6.618609      0.441541   -14.989804  0.000000e+00
BETA_TIME_INT    -1.198537      0.268762    -4.459475  8.216052e-06
BETA_TIME_PT     -2.594435      0.326403    -7.9

**Alternative Specific Constants (ASCs):**

- $ \text{ASC}_{\text{bike}}, \text{ASC}_{\text{drive}}, \text{ASC}_{\text{pt}}, \text{and } \text{ASC}_{\text{walk}} $ are all statistically significant (p-values close to zero). Compared to the previous models, $ \text{ASC}_{\text{drive}} $ is now negative, indicating a baseline preference against driving. $ \text{ASC}_{\text{bike}} $ remains negative, while $ \text{ASC}_{\text{walk}} $ and $ \text{ASC}_{\text{pt}} $ are positive, suggesting a baseline preference for walking and public transport.

**Cost Coefficients:**

- $ \beta_{\text{cost\_bike}}, \beta_{\text{cost\_walk}} $ are zero. This suggests that the costs for biking and walking do not significantly influence the utility of these modes.
- $ \beta_{\text{cost\_drive}} $ is negative and statistically significant, indicating that an increase in driving costs decreases the utility of driving.
- $ \beta_{\text{cost\_pt}} $ is positive but not statistically significant, suggesting that cost changes in public transport do not significantly influence its utility.

**Interaction Terms:**

- $ \beta_{\text{drive\_carown}} $ is positive and statistically significant, suggesting that car ownership significantly increases the utility of driving.
- $ \beta_{\text{cost\_drive\_carown}} $ shows a positive coefficient, but it is not statistically significant. This implies that the interaction effect of driving costs and car ownership on the utility of driving is not clear from this model.

**Time Coefficient:**

- $ \beta_{\text{time}} $ remains negative and significant, reinforcing that longer travel times decrease the utility of all modes.

**Interpretation and Implications:**

- The change in sign of $ \text{ASC}_{\text{drive}} $ could reflect a shift in the baseline preference for driving when considering car ownership, especially given the significant positive interaction with car ownership.
- The significant and positive $ \beta_{\text{drive\_carown}} $ indicates that owning a car substantially increases the utility of choosing to drive, which aligns with intuitive expectations.
- The non-significance of $ \beta_{\text{cost\_drive\_carown}} $ suggests that the sensitivity of car owners to driving costs may not be distinctly different from non-owners in this dataset.
- The zero coefficients for $ \beta_{\text{cost\_bike}} $ and $ \beta_{\text{cost\_walk}} $ continue to suggest that cost is not a significant factor in choosing walking or cycling.


### Comparing Model 2 and $\text{Model 1}$

**Model Comparison ($\text{Model}_\text{pref}$ vs. $\text{Model 2}$):**
To compare $\text{Model 2}$ with $\text{Model}_\text{pref}$, you can use a likelihood ratio test:

- **Null Hypothesis:** $\text{Model}_\text{pref}$ is sufficient, and the additional interaction terms in $\text{Model 2}$ do not significantly improve the model.
- **Alternative Hypothesis:** $\text{Model 2}$ provides a significantly better fit than $\text{Model}_\text{pref}$.

Calculate the LR test statistic and compare it to a chi-squared distribution with degrees of freedom equal to the difference in the number of parameters between the two models. The decision on the preferred model should consider both statistical significance and the interpretability of the model.

In [26]:
LR_test = 2 * (results_model_2.data.logLike - results_model_1.data.logLike)
print(LR_test)
x_qhi = st.chi2.sf(LR_test, 2)
print(x_qhi)

# Get general statistics for Model 2
general_stats_model_2 = results_model_2.getGeneralStatistics()

# Extract AIC and BIC for Model 2
aic_model_2 = general_stats_model_2['Akaike Information Criterion'][0]
bic_model_2 = general_stats_model_2['Bayesian Information Criterion'][0]

print("Model 2 - AIC:", aic_model_2, "BIC:", bic_model_2)

1263.375490880835
4.586673955381085e-275
Model 2 - AIC: 7398.145727334086 BIC: 7495.903625205329



- Calculated LR test statistic:
- **Interpretation**:
  - The high value of the LR test statistic suggests that $\text{Model 2}$ provides a significantly better fit to the data compared to $\text{Model 1}$.
- **Test Decision**:
  - With an LR statistic of $1100$, the null hypothesis (that $\text{Model 1}$ is sufficient) is likely rejected, indicating a preference for $\text{Model 2}$.
- **Conclusion**:
  - $\text{Model 2}$, with its additional parameters and interactions, is the preferred model over $\text{Model 1}$, given its significantly better fit to the data.


### Model 3

$\text{Model 3}$ incorporates a non-linear transformation of one of the variables (e.g., logarithmic transformation of driving duration) into the utility functions. The utility functions are defined as:

- **Walking**:  
  $$ U_{\text{walk}} = \text{ASC}_{\text{walk}} + \beta_{\text{time}} \cdot \text{dur\_walking} $$

- **Cycling**:  
  $$ U_{\text{cycle}} = \text{ASC}_{\text{cycle}} + \beta_{\text{time}} \cdot \text{dur\_cycling} $$

- **Public Transport**:  
  $$ U_{\text{pt}} = \text{ASC}_{\text{pt}} + \beta_{\text{cost\_pt}} \cdot \text{cost\_transit} + \beta_{\text{time}} \cdot \text{dur\_pt\_total} $$

- **Driving**:  
  $$ U_{\text{drive}} = \text{ASC}_{\text{drive}} + \beta_{\text{log\_dur\_drive}} \cdot \log(\text{dur\_driving} + 1) + \beta_{\text{time}} \cdot \text{dur\_driving} $$

Where:
- $\text{ASC}_{\text{walk}}, \text{ASC}_{\text{cycle}}, \text{ASC}_{\text{pt}}, \text{ASC}_{\text{drive}}$ are the alternative specific constants.
- $\beta_{\text{cost\_walk}}, \beta_{\text{cost\_bike}}, \beta_{\text{cost\_pt}}, \beta_{\text{cost\_drive}}$ are the cost coefficients for walking, cycling, public transport, and driving, respectively.
- $\beta_{\text{log\_dur\_drive}}$ is the coefficient for the non-linear transformation (logarithm) of the driving duration.
- $\beta_{\text{time}}$ is the common time coefficient for all modes.
- $\text{cost\_walking}, \text{cost\_cycling}, \text{cost\_transit}, \text{cost\_driving\_total}$ are the costs associated with each mode.
- $\text{dur\_walking}, \text{dur\_cycling}, \text{dur\_pt\_total}, \text{dur\_driving}$ are the travel durations for each mode.
- The logarithmic transformation of driving duration is represented by $\log(\text{dur\_driving} + 1)$ to ensure the argument inside the log function is always positive.

In [9]:

# Calculate the total public transport duration and total driving cost
df['dur_pt_total'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_bus'] + df['dur_pt_int']
df['cost_driving_total'] = df['cost_driving_fuel'] + df['cost_driving_ccharge']

# Create a Biogeme database
database = db.Database('LPMC', df)
globals().update(database.variables)

# Create a new transformed variable for time
# log_dur_driving = database.DefineVariable('log_dur_driving', log(dur_driving))


# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST_PT = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_COST_DRIVE = Beta('BETA_COST_DRIVE', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# New parameters for interactions
BETA_DRIVE_CAROWN = Beta('BETA_DRIVE_CAROWN', 0, None, None, 0)
BETA_COST_DRIVE_CAROWN = Beta('BETA_COST_DRIVE_CAROWN', 0, None, None, 0)
BETA_TIME_DRIVE_CAROWN = Beta('BETA_TIME_DRIVE_CAROWN', 0, None, None, 0)
BETA_PT_INT = Beta('BETA_PT_INT', 0, None, None, 0)
BETA_COST_PT_INT = Beta('BETA_COST_PT_INT', 0, None, None, 0)
BETA_TIME_PT_INT = Beta('BETA_TIME_INT', 0, None, None, 0)
BETA_LOG_DUR_DRIVE = Beta('BETA_LOG_DUR_DRIVE', 0, None, None, 0)

# Utility functions with interactions
V1 = ASC_WALK + BETA_TIME_WALK * dur_walking
V2 = ASC_BIKE + BETA_TIME_BIKE * dur_cycling
V3 = (ASC_PT + BETA_PT_INT * pt_interchanges) + (BETA_COST_PT + BETA_COST_PT_INT * pt_interchanges) * cost_transit + (BETA_TIME_PT + BETA_TIME_PT_INT * pt_interchanges) * log(dur_pt_total + 1)
V4 = (ASC_DRIVE + BETA_DRIVE_CAROWN * car_ownership) + (BETA_COST_DRIVE + BETA_COST_DRIVE_CAROWN * car_ownership) * cost_driving_total + (BETA_TIME_DRIVE + BETA_TIME_DRIVE_CAROWN * car_ownership) * log(dur_driving + 1)

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define the model
logprob = models.loglogit(V, None, travel_mode)

# Estimate the model
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'Model_3'
results_model_3 = biogeme.estimate()

# Output
print(results_model_3.getEstimatedParameters())


                            Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_BIKE                 0.218619      0.130688     1.672831  9.436049e-02
ASC_DRIVE                1.939775      0.100465    19.307992  0.000000e+00
ASC_PT                   2.837854      0.090997    31.186338  0.000000e+00
ASC_WALK                 5.139766      0.116727    44.032390  0.000000e+00
BETA_COST_DRIVE         -0.125075      0.032220    -3.881865  1.036584e-04
BETA_COST_DRIVE_CAROWN  -0.040311      0.023020    -1.751149  7.992026e-02
BETA_COST_PT            -0.387793      0.047468    -8.169565  2.220446e-16
BETA_COST_PT_INT         0.195851      0.041424     4.727929  2.268217e-06
BETA_DRIVE_CAROWN        1.456976      0.092733    15.711432  0.000000e+00
BETA_PT_INT              0.885935      0.222250     3.986206  6.713823e-05
BETA_TIME_BIKE          -5.444356      0.497192   -10.950202  0.000000e+00
BETA_TIME_DRIVE        -10.182886      0.793315   -12.835863  0.000000e+00
BETA_TIME_DRIVE_CAROWN   

In [10]:
# Get general statistics for Model 3
general_stats_model_3 = results_model_3.getGeneralStatistics()

# Extract AIC and BIC for Model 3
aic_model_3 = general_stats_model_3['Akaike Information Criterion'][0]
bic_model_3 = general_stats_model_3['Bayesian Information Criterion'][0]

print("Model 3 - AIC:", aic_model_3, "BIC:", bic_model_3)

Model 3 - AIC: 7543.341090268901 BIC: 7647.616181331561


### Model 4

In [14]:
# Update the global variables with the database
globals().update(database.variables)

# Define parameters for the utility functions
ASC_WALK = Beta('ASC_WALK', 0, None, None, 0)
ASC_BIKE = Beta('ASC_BIKE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

# Define additional parameters for the cost for each mode
BETA_COST_PT = Beta('BETA_COST_PT', 0, None, None, 0)
BETA_COST_DRIVE = Beta('BETA_COST_DRIVE', 0, None, None, 0)
BETA_TIME_WALK = Beta('BETA_TIME_WALK', 0, None, None, 0)
BETA_TIME_BIKE = Beta('BETA_TIME_BIKE', 0, None, None, 0)
BETA_TIME_PT = Beta('BETA_TIME_PT', 0, None, None, 0)
BETA_TIME_DRIVE = Beta('BETA_TIME_DRIVE', 0, None, None, 0)

# New parameters for interactions
BETA_DRIVE_CAROWN = Beta('BETA_DRIVE_CAROWN', 0, None, None, 0)
BETA_COST_DRIVE_CAROWN = Beta('BETA_COST_DRIVE_CAROWN', 0, None, None, 0)
BETA_TIME_DRIVE_CAROWN = Beta('BETA_TIME_DRIVE_CAROWN', 0, None, None, 0)
BETA_PT_INT = Beta('BETA_PT_INT', 0, None, None, 0)
BETA_COST_PT_INT = Beta('BETA_COST_PT_INT', 0, None, None, 0)
BETA_TIME_PT_INT = Beta('BETA_TIME_INT', 0, None, None, 0)
BETA_LOG_DUR_DRIVE = Beta('BETA_LOG_DUR_DRIVE', 0, None, None, 0)

# Utility functions with interactions
V1 = ASC_WALK + BETA_TIME_WALK * dur_walking
V2 = ASC_BIKE + BETA_TIME_BIKE * dur_cycling
V3 = (ASC_PT + BETA_PT_INT * pt_interchanges) + (BETA_COST_PT + BETA_COST_PT_INT * pt_interchanges) * cost_transit + (BETA_TIME_PT + BETA_TIME_PT_INT * pt_interchanges) * log(dur_pt_total + 1)
V4 = (ASC_DRIVE + BETA_DRIVE_CAROWN * car_ownership) + (BETA_COST_DRIVE + BETA_COST_DRIVE_CAROWN * car_ownership) * cost_driving_total + (BETA_TIME_DRIVE + BETA_TIME_DRIVE_CAROWN * car_ownership) * log(dur_driving + 1)

# Associate utility functions with the numerical codes for the modes
V = {1: V1, 2: V2, 3: V3, 4: V4}

# Define nest coefficients
MOTOR = Beta('MOTOR', 1, 1, None, 0)  # Nest parameter for motorized transport
NON_MOTOR = Beta('NON_MOTOR', 1, 1, None, 0)  # Nest parameter for non-motorized transport

# Define nests
# Assuming that alternatives are coded as: 1 for walking, 2 for cycling, 3 for public transport, and 4 for driving
nest_motorized = MOTOR, [3, 4]  # Nest for public transport (3) and driving (4)
nest_non_motorized = NON_MOTOR, [1, 2]  # Nest for walking (1) and cycling (2)

# Combine nests into a list
nests = nest_motorized, nest_non_motorized

# Define the nested logit model
nested_logit = models.nested(V, None, nests, travel_mode)

# Estimate the model
biogeme = bio.BIOGEME(database, nested_logit)
biogeme.modelName = 'Model_4'
results_model_4 = biogeme.estimate()

# Print the estimation results
print(results_model_4.getGeneralStatistics())
print(results_model_4.getEstimatedParameters())

Numerical problem with the second derivative matrix. Norm = inf. Replaced by the BFGS approximation.
The norm of the gradient at ASC_BIKE=-82, ASC_DRIVE=-27, ASC_PT=-17, ASC_WALK=9.9, BETA_COST_DRIVE=-23, BETA_COST_DRIVE_CAROWN=9.1, BETA_COST_PT=-1.2, BETA_COST_PT_INT=-0.15, BETA_DRIVE_CAROWN=18, BETA_PT_INT=0.29, BETA_TIME_BIKE=-75, BETA_TIME_DRIVE=-1e+02, BETA_TIME_DRIVE_CAROWN=11, BETA_TIME_INT=-5, BETA_TIME_PT=-76 is inf: g=-1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308
The norm of the gradient at ASC_BIKE=-82, ASC_DRIVE=-27, ASC_PT=-17, ASC_WALK=9.9, BETA_COST_DRIVE=-23, BETA_COST_DRIVE_CAROWN=9.1, BETA_COST_PT=-1.2, BETA_COST_PT_INT=-0.15, BETA_DRIVE_CAROWN=18, BETA_PT_INT=0.29, BETA_TIME_BIKE=-75, BETA_TIME_DRIVE=-1e+02, BETA_TIME_DRIVE_CAROWN=11, BETA_TIME_INT=-5, BETA_TIME_PT=-76 is inf: g=-1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+308, -1.8e+3

{'Number of estimated parameters': GeneralStatistic(value=18, format=''), 'Sample size': GeneralStatistic(value=5000, format=''), 'Excluded observations': GeneralStatistic(value=0, format=''), 'Init log likelihood': GeneralStatistic(value=3572.009957542951, format='.7g'), 'Final log likelihood': GeneralStatistic(value=3571.9631293111397, format='.7g'), 'Likelihood ratio test for the init. model': GeneralStatistic(value=-0.09365646362221014, format='.7g'), 'Rho-square for the init. model': GeneralStatistic(value=1.3109770792274489e-05, format='.3g'), 'Rho-square-bar for the init. model': GeneralStatistic(value=0.005052289452245784, format='.3g'), 'Akaike Information Criterion': GeneralStatistic(value=-7107.926258622279, format='.7g'), 'Bayesian Information Criterion': GeneralStatistic(value=-6990.616781176787, format='.7g'), 'Final gradient norm': GeneralStatistic(value=inf, format='.4E'), 'Nbr of threads': GeneralStatistic(value=8, format='')}
                             Value  Rob. S

In [15]:
general_stats_model_4 = results_model_4.getGeneralStatistics()

# Extract AIC and BIC for Model 4
aic_model_4 = general_stats_model_4['Akaike Information Criterion'][0]
bic_model_4 = general_stats_model_4['Bayesian Information Criterion'][0]

print("Model 4 - AIC:", aic_model_4, "BIC:", bic_model_4)

Model 4 - AIC: -7107.926258622279 BIC: -6990.616781176787


In [16]:
final_log_likelihood = general_stats_model_4['Final log likelihood'][0]
print(f"Final log-likelihood: {final_log_likelihood}")

Final log-likelihood: 3571.9631293111397
