# Estimation of the IPDL Model

In this notebook, we present and implement two consistent estimation methods for the IPDL Model: the MLE and the FKN-estimator.

In [None]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = "warn"
pd.set_option('display.max_rows', 500)
import os
import sys
from numpy import linalg as la
from scipy import optimize
import scipy.stats as scstat
from matplotlib import pyplot as plt
import itertools as iter
%load_ext line_profiler

# Files
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utilities.Logit_file import estimate_logit, logit_se, logit_t_p, q_logit, logit_score, logit_score_unweighted, logit_ccp, LogitBLP_estimator
from data.Eurocarsdata_file import Eurocars_cleandata

   variable names                                        description
0              cy            cylinder volume or displacement (in cc)
1              hp                                 horsepower (in kW)
2              we                                     weight (in kg)
3              le                                     length (in cm)
4              wi                                      width (in cm)
5              he                                     height (in cm)
6              li          average of li1, li2, li3 (used in papers)
7              sp                            maximum speed (km/hour)
8              ac  time to acceleration (in seconds from 0 to 100...
9              pr   price (in destination currency including V.A.T.)
10          brand                                      name of brand
11           home  domestic car dummy (appropriate interaction of...
12            cla                              class or segment code


x has full rank


In [None]:
# Load the dataset and the variable descriptions.
# os.chdir('../GREENCAR_notebooks/') # Assigns work directory

input_path = os.getcwd() # Assigns input path as current working directory (cwd)

# The Stata file is opened only to read the variable labels. Using the reader as a
# context manager closes the underlying file handle as soon as the labels are read.
with pd.read_stata('../data/eurocars.dta', iterator = True) as stata_reader:
    descr = stata_reader.variable_labels() # Obtain variable descriptions (variable name -> label)
dat_file = pd.read_csv('../data/eurocars.csv') # Reads in the data set as a pandas dataframe.

In [None]:
# Switch for the outside option: with OO == True it is included, otherwise the analysis uses the inside options only.
OO = True

# Covariates used in the analysis, split into continuous and discrete variables.
x_contvars = ['cy', 'hp', 'we', 'le', 'wi', 'he', 'li', 'sp', 'ac', 'pr']
x_discretevars = ['brand', 'home', 'cla']
x_allvars = x_contvars + x_discretevars

# Instruments, split the same way.
z_IV_contvars = ['xexr']
z_IV_discretevars = []
z_allvars = z_IV_contvars + z_IV_discretevars

# We nest over all variables other than price (plus 'in_out' when the outside option is included).
# An alternative list can be specified here if desired.
nest_vars = [var for var in (['in_out', *x_allvars] if OO else x_allvars) if var != 'pr']

# Continuous variables from which nests are created according to the deciles of their distribution.
nest_cont_vars = [var for var in x_contvars if var != 'pr']

G = len(nest_vars)

# Show the chosen variables together with their descriptions.
(
    pd.DataFrame(descr, index=['description'])[x_allvars]
    .T
    .reset_index()
    .rename(columns={'index' : 'variable names'})
)

Unnamed: 0,variable names,description
0,cy,cylinder volume or displacement (in cc)
1,hp,horsepower (in kW)
2,we,weight (in kg)
3,le,length (in cm)
4,wi,width (in cm)
5,he,height (in cm)
6,li,"average of li1, li2, li3 (used in papers)"
7,sp,maximum speed (km/hour)
8,ac,time to acceleration (in seconds from 0 to 100...
9,pr,price (in destination currency including V.A.T.)


In [None]:
# Clean the raw data with the project helper and unpack everything used further down.
# NOTE(review): Eurocars_cleandata is imported from data.Eurocars_cleandata_file and is opaque here; judging from
# how the names are used below, x_vars are the covariate column names, N the number of observations, pop_share the
# per-market weights, T the number of markets, J the number of alternatives per market and K the number of
# covariates — confirm against the helper.
dat, dat_org, x_vars, z_vars, N, pop_share, T, J, K = Eurocars_cleandata(dat_file, x_contvars, x_discretevars, z_IV_contvars, z_IV_discretevars, outside_option=OO)

In [None]:
# Create dictionaries of numpy arrays for each market. This allows the size of the data set to vary over markets.

# Sort by market and 'co' so that each market's rows are in a fixed order before reshaping.
dat = dat.reset_index(drop = True).sort_values(by = ['market', 'co'])

# Split the frame once per market, then build the dictionaries from the per-market rows.
market_ids = np.arange(T)
rows_by_market = {t: dat[dat['market'] == t] for t in market_ids}

x = {t: rows_by_market[t][x_vars].values.reshape((J[t],K)) for t in market_ids} # Dict of explanatory variables, one (J[t],K) array per market
y = {t: rows_by_market[t]['ms'].to_numpy().reshape((J[t])) for t in market_ids} # Dict of market shares, one (J[t],) array per market

## Maximum likelihood estimation of IPDL

The log-likelihood contribution is
$$
\ell_t(\theta)=y_t'\ln p(\mathbf{X}_t,\theta),
$$
and an estimation routine must therefore have a function that - given $\mathbf{X}_t$ and $\theta$ - calculates $u_t=\mathbf{X}_t\beta$ and constructs $\Gamma$, and then calls the fixed point routine described in 'IPDL_ccp.ipynb'. That routine will return $p(\mathbf{X}_t,\theta)$, and we can then evaluate $\ell_t(\theta)$. Using our above defined functions we now construct precisely such an estimation procedure.

For maximizing the likelihood, we want the derivatives at some $\theta=(\beta',\lambda')'$. Let $q_t=p(\mathbf{X}_t,\theta)$, then we have
$$
\nabla_\theta \ln p(\mathbf{X}_t,\theta)=\mathrm{diag}(q_t)^{-1}\left(\nabla_{qq}^2\Omega(q_t|\lambda)^{-1}-q_tq_t' \right)\left[\mathbf{X}_t,-\nabla_{q,\lambda}^2 \Omega(q_t|\lambda)\right]
$$
Note that the first two factors together form the elasticity matrix $\nabla_u \ln P(u|\lambda)$, and the last factor is a block matrix of size $J\times \dim(\theta)$. The cross derivative $\nabla_{q,\lambda}^2 \Omega(q_t|\lambda)$ is a $J\times G$ matrix whose column $g$ is given by $\nabla_{q,\lambda}^2 \Omega(q_t|\lambda)_g = - \ln(q_t) + (\Psi^g)' \ln(\Psi^g q_t)$ for $g=1,\ldots,G$. The derivative of the log-likelihood function can be obtained from this as
$$
\nabla_\theta \ell_t(\theta)=\nabla_\theta \ln p(\mathbf{X}_t,\theta)' y_t \\
$$

In [None]:
def IPDL_loglikelihood(Theta, y, x, sample_share, psi_stack, nest_count):
    '''
    Evaluate the weighted IPDL loglikelihood contribution of every market.

    Args.
        Theta: a numpy array (K+G,) of parameters (beta', lambda')',
        y: a dictionary of T numpy arrays (J[t],) of observed market shares for each market t,
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t,
        sample_share: an array-like of length T with the weight applied to the contribution of market t,
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the psi^g matrices for each market t as outputted by 'Create_nests',
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t

    Output
        ll: a numpy array (T,) with one weighted loglikelihood contribution per market

    Side effects
        Prints the list [sum of the positive lambda parameters, -ll.mean()] on every call.
    '''

    n_markets = len(x.keys())
    n_covariates = x[0].shape[1]

    # Model-implied choice probabilities at Theta, one array per market.
    ccp_hat = IPDL_ccp(Theta, x, psi_stack, nest_count)

    # Sum of the strictly positive nesting parameters; only reported, not enforced.
    sum_lambdaplus = np.array([lam for lam in Theta[n_covariates:] if lam > 0]).sum()

    ll = np.empty(n_markets)
    for t in range(n_markets):
        log_ccp_t = np.log(ccp_hat[t])
        ll[t] = sample_share[t]*(y[t]@log_ccp_t)

    print([sum_lambdaplus, -ll.mean()])

    return ll

In [None]:
def q_IPDL(Theta, y, x, sample_share, psi_stack, nest_count):
    ''' Criterion to minimize: the negative of the (T,) array returned by IPDL_loglikelihood.
    '''
    loglik = IPDL_loglikelihood(Theta, y, x, sample_share, psi_stack, nest_count)

    return -loglik

We also implement the derivative of the log-likelihood with respect to the parameters, $\nabla_\theta \ell_t(\theta)$.

In [None]:
def cross_grad_pertubation(q, psi_stack, nest_count):
    '''
    Cross derivative of the perturbation function Omega, first wrt. the ccp's and then wrt. the lambda parameters.

    Column g of the result for market t equals -ln(q) + (psi^g)' ln(psi^g q).

    Args.
        q: a dictionary of T numpy arrays (J[t],) of choice probabilities for each market t
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the psi^g matrices for each market t as outputted by 'Create_nests'
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t

    Returns
        Z: a dictionary of T numpy arrays (J[t],G) of cross derivatives, one column per category g
    '''

    Z = {}

    for t in np.arange(len(q.keys())):
        q_t = q[t]
        n_alternatives = q_t.shape[0]
        n_categories = len(nest_count[t])

        # Row boundaries of the psi^g blocks inside psi_stack[t]; the first J[t] rows hold the identity.
        block_ends = np.int64(np.cumsum(nest_count[t]))
        row_bounds = n_alternatives + np.concatenate(([0], block_ends))

        neg_log_q = -np.log(q_t)
        Z_t = np.empty((n_alternatives, n_categories))

        for g in range(n_categories):
            Psi_g = psi_stack[t][row_bounds[g]:row_bounds[g + 1], :] # The psi^g matrix of category g
            log_nest_prob = np.log(Psi_g @ q_t) # Log of the nest probabilities psi^g q
            Z_t[:, g] = neg_log_q + Psi_g.T @ log_nest_prob

        Z[t] = Z_t

    return Z

In [None]:
def IPDL_theta_grad_log_ccp(Theta, x, psi_stack, nest_count):
    '''
    Derivative of the IPDL log ccp's wrt. the parameters theta.

    For each market the gradient wrt. utilities is multiplied with the block matrix [x, -Z],
    where Z is the cross derivative returned by cross_grad_pertubation.

    Args.
        Theta: a numpy array (K+G,) of parameters (beta', lambda')',
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t,
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the psi^g matrices for each market t as outputted by 'Create_nests'
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t
    Returns
        Grad: a dictionary of T numpy arrays (J[t],K+G) of derivatives of the IPDL log ccp's wrt. parameters theta for each market t
    '''

    q = IPDL_ccp(Theta, x, psi_stack, nest_count) # Choice probabilities at Theta
    Z = cross_grad_pertubation(q, psi_stack, nest_count) # Cross derivatives of the perturbation function
    u_grad = IPDL_u_grad_Log_ccp(q, x, Theta, psi_stack, nest_count) # Gradient of log ccp's wrt. utilities

    return {
        t: u_grad[t] @ np.concatenate((x[t], -Z[t]), axis=1)
        for t in range(len(x.keys()))
    }

In [None]:
def IPDL_score(Theta, y, x, sample_share, psi_stack, nest_count):
    '''
    Score of the weighted IPDL loglikelihood, one row per market.

    Args.
        Theta: a numpy array (K+G,) of parameters (beta', lambda')',
        y: a dictionary of T numpy arrays (J[t],) of observed market shares for each market t,
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t,
        sample_share: an array-like of length T with the weight applied to the score of market t,
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the psi^g matrices for each market t as outputted by 'Create_nests'
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t

    Returns
        Score: a numpy array (T,K+G) of IPDL scores
    '''
    # Derivatives of the log ccp's wrt. theta, a (J[t],K+G) array per market.
    log_ccp_grad = IPDL_theta_grad_log_ccp(Theta, x, psi_stack, nest_count)

    n_markets = len(x.keys())
    n_params = log_ccp_grad[0].shape[1] # equal to K+G
    Score = np.empty((n_markets, n_params))

    for t in range(n_markets):
        Score[t] = sample_share[t]*(log_ccp_grad[t].T@y[t])

    return Score

In [None]:
def q_IPDL_score(Theta, y, x, sample_share, psi_stack, nest_count):
    ''' Derivative of the criterion q_IPDL: the negative of the (T,K+G) array returned by IPDL_score.
    '''
    score = IPDL_score(Theta, y, x, sample_share, psi_stack, nest_count)

    return -score

In [None]:
def test_analyticgrad(y, x, theta, sample_share, Psi, Nest_count, delta = 1.0e-8):
    '''
    Compare the analytic IPDL score with a forward-difference approximation.

    All dimensions are taken from the arguments, so the check does not depend on
    notebook-level variables.

    Args.
        y: a dictionary of T numpy arrays (J[t],) of observed market shares for each market t,
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t,
        theta: a numpy array (K+G,) of parameters at which the gradients are compared,
        sample_share: an array-like of length T with the weight of each market,
        Psi: the psi_stack dictionary passed on to IPDL_loglikelihood and IPDL_score,
        Nest_count: the nest_count dictionary passed on to IPDL_loglikelihood and IPDL_score,
        delta: step size of the forward difference. Default value is 1.0e-8.

    Returns
        normdiff: the norm (numpy.linalg.norm default) of the difference between the two gradients,
        angrad: a numpy array (T,K+G) with the analytic score,
        numgrad: a numpy array (T,K+G) with the forward-difference gradient
    '''
    theta = np.asarray(theta, dtype = float)
    n_markets = len(x)
    n_params = theta.shape[0]

    # The baseline does not depend on the perturbed coordinate, so it is evaluated once.
    ll_base = IPDL_loglikelihood(theta, y, x, sample_share, Psi, Nest_count)

    numgrad = np.empty((n_markets, n_params))

    for i in range(n_params):
        step = np.zeros((n_params,))
        step[i] = delta
        ll_step = IPDL_loglikelihood(theta + step, y, x, sample_share, Psi, Nest_count)
        numgrad[:,i] = (ll_step - ll_base) / delta

    angrad = IPDL_score(theta, y, x, sample_share, Psi, Nest_count)

    normdiff = la.norm(angrad - numgrad)

    return normdiff, angrad, numgrad

In [None]:
# Check the analytic score against a numerical gradient at an evenly spread parameter vector (every entry 1/(K+G)).
# NOTE(review): Psi and Nest_count are not defined in the cells shown above — presumably they are the outputs of
# 'Create_nests' mentioned in the docstrings; confirm this cell runs on a fresh kernel.
theta0 = np.ones((K+G,))/(K+G)
diff, an, num = test_analyticgrad(y, x, theta0, pop_share, Psi, Nest_count)

[0.2, 0.025328025815875677]
[0.2, 0.025328025752599303]
[0.2, 0.02532802577337812]


[0.2, 0.025328025752599303]
[0.2, 0.02532802577421719]
[0.2, 0.025328025752599303]
[0.2, 0.02532802578551036]
[0.2, 0.025328025752599303]
[0.2, 0.02532802580486519]
[0.2, 0.025328025752599303]
[0.2, 0.025328025808139996]
[0.2, 0.025328025752599303]
[0.2, 0.025328025807728423]
[0.2, 0.025328025752599303]
[0.2, 0.025328025781598762]
[0.2, 0.025328025752599303]
[0.2, 0.025328025793703513]
[0.2, 0.025328025752599303]
[0.2, 0.02532802577227401]
[0.2, 0.025328025752599303]
[0.2, 0.02532802575444403]
[0.2, 0.025328025752599303]
[0.2, 0.02532802575261861]
[0.2, 0.025328025752599303]
[0.2, 0.025328025756880854]
[0.2, 0.025328025752599303]
[0.2, 0.025328025754301448]
[0.2, 0.025328025752599303]
[0.2, 0.025328025755661687]
[0.2, 0.025328025752599303]
[0.2, 0.025328025756371494]
[0.2, 0.025328025752599303]
[0.2, 0.02532802575295324]
[0.2, 0.025328025752599303]
[0.2, 0.02532802575289476]
[0.2, 0.025328025752599303]
[0.2, 0.025328025753035364]
[0.2, 0.025328025752599303]
[0.2, 0.025328025758435027]


## Standard errors in Maximum Likelihood estimation

As usual we may consistently estimate the Covariance Matrix  of the IPDL maximum likelihood estimator for some estimate $\hat \theta = (\hat \beta', \hat \lambda')'\in \mathbb{R}^{K+G}$ as:

$$
\hat \Sigma = \left( \sum_{i=1}^N \nabla_\theta \ell_i (\hat \theta) \nabla_\theta \ell_i (\hat \theta)' \right)^{-1}
$$

Thereby we may find the estimated standard error of parameter $d$ as the square root of the $d$'th diagonal entry of $\hat \Sigma$:

$$
\hat \sigma_d = \sqrt{\hat \Sigma_{dd}}
$$

In [None]:
def IPDL_se(score, N):
    '''
    Asymptotic standard errors of the MLE based on the outer product of the scores.

    Args.
        score: a numpy array (T,K+G) of IPDL scores
        N: an integer giving the number of observations

    Returns
        SE: a numpy array (K+G,) of asymptotic IPDL MLE standard errors
    '''

    # Sum over rows of the outer products score_t score_t', a (K+G,K+G) matrix.
    outer_product_sum = np.einsum('td,tm->dm', score, score)

    # Diagonal of its inverse, scaled by the number of observations.
    variance = np.diag(la.inv(outer_product_sum)) / N

    return np.sqrt(variance)

In [None]:
def IPDL_t_p(SE, Theta, N, Theta_hypothesis = 0):
    '''
    Absolute t statistics and two-sided p values for the characteristic and nest grouping parameters.

    Args.
        SE: a numpy array (K+G,) of asymptotic IPDL MLE standard errors
        Theta: a numpy array (K+G,) of parameters (beta', lambda')',
        N: an integer giving the number of observations; the t distribution uses N-1 degrees of freedom
        Theta_hypothesis: a (K+G,) array or a number with the parameter values under the null. Default value is 0.

    Returns
        T: a (K+G,) array of absolute t statistics
        p: a (K+G,) array of two-sided p values computed from the above t statistics
    '''

    distance_to_null = np.abs(Theta - Theta_hypothesis)
    t_stat = distance_to_null / SE

    # Two-sided p value: twice the upper tail probability of the absolute t statistic.
    p_value = 2*scstat.t.sf(t_stat, df = N-1)

    return t_stat, p_value

### We now estimate the model

In [None]:
def estimate_IPDL(f, Theta0, y, x, sample_share, psi_stack, nest_count, N, Analytic_jac:bool = True, options = {'disp': True}, **kwargs):
    '''
    Minimize the mean of the criterion f over theta and report estimates with standard errors, t statistics and p values.

    Args:
        f: a function to minimize; it is called as f(Theta, y, x, sample_share, psi_stack, nest_count) and its output is averaged,
        Theta0 : a numpy array (K+G,) of initial guess parameters (beta', lambda')',
        y: a dictionary of T numpy arrays (J[t],) of observed market shares for each market t,
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t,
        sample_share: an array-like of length T with the weight of each market,
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the psi^g matrices for each market t as outputted by 'Create_nests',
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t,
        N: an integer giving the number of observations,
        Analytic_jac: a boolean. Default value is 'True'. If 'True' the mean of q_IPDL_score is passed to the optimizer as Jacobian. Else no Jacobian is passed.
        options: dictionary with options for the optimizer (e.g. disp=True which tells it to display information at termination.)
        **kwargs: further keyword arguments passed on to scipy.optimize.minimize.

    Returns:
        res: a dictionary with the keys 'theta', 'se', 't', 'p', 'success', 'nit', 'nfev' and 'fun'.
    '''

    def objective(Theta):
        # minimize() expects a function of theta only: the mean of the criterion over markets.
        return np.mean(f(Theta, y, x, sample_share, psi_stack, nest_count))

    def objective_grad(Theta):
        # Mean over markets (axis 0) of the criterion derivatives.
        return np.mean(q_IPDL_score(Theta, y, x, sample_share, psi_stack, nest_count), axis=0)

    jacobian = objective_grad if Analytic_jac == True else None

    # optimize.minimize is given the initial guess as a list of parameters.
    result = optimize.minimize(objective, Theta0.tolist(), options=options, jac=jacobian, **kwargs)

    theta_hat = result.x
    se = IPDL_se(IPDL_score(theta_hat, y, x, sample_share, psi_stack, nest_count), N)
    t_stat, p_value = IPDL_t_p(se, theta_hat, N)

    # Collect the output in a dict.
    return {
        'theta': theta_hat,
        'se': se,
        't': t_stat,
        'p': p_value,
        'success':  result.success, # bool, whether convergence was successful
        'nit':      result.nit, # no. algorithm iterations
        'nfev':     result.nfev, # no. function evaluations
        'fun':      result.fun # function value at termination
    }

In [None]:
beta_0 = np.zeros((K,)) # Starting values for the logit coefficients: a zero vector of length K

# Estimate the model
# The logit coefficients are used below as the beta part of the IPDL starting values.
Logit_beta = estimate_logit(q_logit, beta_0, y, x, sample_share=pop_share, Analytic_jac=True)['beta']
Logit_SE = logit_se(logit_score(Logit_beta, y, x, pop_share), pop_share, N)
Logit_t, Logit_p = logit_t_p(Logit_beta, logit_score(Logit_beta, y, x, pop_share), pop_share, N)

# Initialize \theta^0
# NOTE(review): lambda0, Psi, Nest_count and IPDL_ccp are not defined in the cells shown above — presumably they
# come from another notebook or module (the text refers to 'IPDL_ccp.ipynb'); confirm this cell runs on a fresh kernel.
theta0 = np.append(Logit_beta,lambda0)
q0 = IPDL_ccp(theta0, x, Psi, Nest_count)

Optimization terminated successfully.
         Current function value: 0.001526
         Iterations: 25
         Function evaluations: 29
         Gradient evaluations: 29


In [None]:
# Maximum likelihood estimation of the IPDL model, started at theta0 and using the analytic score as Jacobian.
# Each criterion evaluation prints [sum of positive lambda's, mean negative loglikelihood] (see IPDL_loglikelihood).
resbla2 = estimate_IPDL(q_IPDL, theta0, y, x, pop_share, Psi, Nest_count, N, Analytic_jac=True)

[0.4642857142857142, 0.0015899709273228062]
[0.46090876162018596, 0.0015889713847807645]
[0.4474009509580727, 0.0015850515424856092]
[0.3933697083096197, 0.0015705950490930536]
[0.23671077826873116, 0.0015395686298264781]
[0.06493464753797744, 0.0015274591822855104]
[0.09885323842764626, 0.001526556776446271]
[0.09350628851001848, 0.0015265286295808205]
[0.09314635294486558, 0.001526527551818973]
[0.09277560631463176, 0.0015265254847205024]
[0.09229333534239086, 0.0015265214861262715]
[0.0903642514534273, 0.0015265090375558383]
[0.08723931532582233, 0.0015264827038595846]
[0.08374217623589737, 0.0015264371722768877]
[0.07958619829060203, 0.0015263546256758743]
[0.07479387512040013, 0.0015262026165299041]
[0.07230591432952654, 0.0015259236511221328]
[0.08108249841372164, 0.0015254217386624547]
[0.111069882894378, 0.001524554949365476]
[0.17700369913547914, 0.0015231642147646566]
[0.3063977259472085, 0.0015212948069764096]
[0.428279371543753, 0.001519808498143949]
[0.5254814593618558, 0.

In [None]:
# Mean negative loglikelihood at the starting values theta0, i.e. the criterion value the optimizer starts from.
-IPDL_loglikelihood(theta0, y, x, pop_share, Psi, Nest_count).mean()

[0.4642857142857142, 0.0015899709273228062]


0.0015899709273228062

In [None]:
def reg_table(theta,se,N,x_vars,nest_vars):
    '''
    Build a table of parameter estimates with significance stars, standard errors, t statistics and p values.

    Args.
        theta: a numpy array (K+G,) of parameter estimates (beta', lambda')',
        se: a numpy array (K+G,) of standard errors,
        N: an integer giving the number of observations (passed on to IPDL_t_p),
        x_vars: a list of the K covariate names,
        nest_vars: a list of the G nesting variable names; each is shown with the prefix 'group_'

    Returns
        table: a pandas dataframe indexed by parameter name with the columns 'theta', 'se', 't (theta == 0)' and 'p'.
            Estimates are rounded to 3 decimals; '***', '**' and '*' mark p values below 0.01, 0.05 and 0.1,
            and estimates without a star are left as numbers.
    '''
    IPDL_t, IPDL_p = IPDL_t_p(se, theta, N)

    # Row labels: the covariates first, then one 'group_' entry per nesting variable.
    regdex = [*x_vars, *['group_' + var for var in nest_vars]]

    def _with_stars(estimate, p_value):
        # Same rounding for every significance level, so the column is formatted consistently.
        rounded = np.round(estimate, decimals = 3)
        if p_value < 0.01:
            return str(rounded) + '***'
        if p_value < 0.05:
            return str(rounded) + '**'
        if p_value < 0.1:
            return str(rounded) + '*'
        return rounded

    table  = pd.DataFrame({'theta': [_with_stars(theta[i], IPDL_p[i]) for i in range(len(theta))],
                'se' : np.round(se, decimals = 5),
                't (theta == 0)': np.round(IPDL_t, decimals = 3),
                'p': np.round(IPDL_p, decimals = 3)}, index = regdex).rename_axis(columns = 'variables')

    return table

In [None]:
# Pull the estimates and standard errors out of the estimation result and show them as a regression table.
IPDL_theta = resbla2['theta']
IPDL_SE = resbla2['se']
IPDL_t, IPDL_p = IPDL_t_p(IPDL_SE, IPDL_theta, N) # reg_table computes these again internally from the same inputs
reg_table(IPDL_theta, IPDL_SE, N, x_vars, nest_vars)

variables,theta,se,t (theta == 0),p
in_out,-2.119,4.4206,0.479,0.632
cy,-0.547,3.08893,0.177,0.86
hp,-0.591,3.02535,0.195,0.845
we,-0.855,2.77277,0.309,0.758
le,-1.503,4.13013,0.364,0.716
wi,-1.686,5.1926,0.325,0.745
he,-1.819,4.05489,0.449,0.654
li,-0.874,1.68868,0.518,0.605
sp,-1.232,3.33815,0.369,0.712
ac,-0.779,1.29178,0.603,0.547


In [None]:
# Sum of the strictly positive entries of the lambda block (entries K onwards) of the ML estimate.
np.array([p for p in IPDL_theta[K:] if p>0]).sum()

0.4780980313341608

Note that some of the estimated nesting parameters $\lambda$ are negative. Fosgerau & Nielsen (2023) show that the model is still a PUM in this case if the condition $\sum_{g: \lambda_g \geq 0} \lambda_g < 1$ is satisfied such that the IPDL Model utility maximization problem still has a unique solution in this case. 

### An alternative approach

The log-likelihood function is not globally concave, and finding the global optimum can be difficult. Using the estimation procedure of Fosgerau, Kristensen & Nielsen (2023 working paper), we can instead fit the parameters using the first-order conditions for optimality. The estimator takes the form

$$
\hat \theta^0=\arg \min_{\theta} \sum_t s_t \hat \varepsilon^0_t(\theta)'\hat W^0_t\hat \varepsilon^0 _t(\theta),
$$
where $\hat W^0_t$ is a positive semidefinite weight matrix, $s_t$ is market $t$'s share of the total population and 
$$
\hat \varepsilon^0_t(\theta)=\hat D^0_t(u(X_t,\beta)- \nabla_q \Omega_t(\hat q_t^0|\lambda)) ,
$$
where 
$$
\hat D^0_t=\textrm{diag}(\hat q^0_t)-\hat q^0_t (\hat q^0_t)'.
$$
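Since $u(X_t,\beta)=X_t\beta$ and $\nabla_q \Omega_t(\hat q^0_t|\lambda)$ is linear in $\lambda$, we can collect the covariates and the cross derivative in the block matrix $\hat G^0_t=\left[X_t,-\nabla^2_{q,\lambda}\Omega(\hat q^0_t|\lambda)\right]$ from the maximum likelihood section above, so that $\hat \varepsilon^0_t$ is a linear function of $\theta$,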
$$
\hat \varepsilon^0_t(\theta)=\hat D^0_t \left(\hat G^0_t\theta- \ln \hat q^0_t\right)\equiv \hat A^0_t\theta-\hat r^0_t.
$$
Using linearity, the weighted least squares criterion has a unique closed form solution,
$$
\hat \theta^0 =\left(\sum_t s_t (\hat A^0_t)'\hat W^0_t \hat A^0_t \right)^{-1}\left(\sum_t s_t (\hat A^0_t)'\hat W^0_t \hat r_t^0 \right)
$$




In [None]:
def G_array(q, x, psi_stack, nest_count):
    '''
    Build the block matrix G = [x, -Z] for every market.

    Args.
        q: a dictionary of T numpy arrays (J[t],) of choice probabilities for each market t
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the psi^g matrices for each market t as outputted by 'Create_nests'
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t

    Returns
        G: a dictionary of T numpy arrays (J[t],K+G): a G matrix for each market t
    '''
    # Cross derivative of the perturbation function Omega wrt. lambda and the ccp's q.
    Z = cross_grad_pertubation(q, psi_stack, nest_count)

    G = {}
    for t in np.arange(len(x)):
        # Join the blocks column-wise so that the last dimension is K+G (same dimension as theta).
        G[t] = np.concatenate((x[t], -Z[t]), axis=1)

    return G

In [None]:
def D_array(q):
    '''
    Compute D = diag(q) - q q' for every market: the logit derivative of the ccp's wrt. utilities.

    Args.
        q: a dictionary of T numpy arrays (J[t],) of choice probabilities for each market t

    Returns
        D: a dictionary of T numpy arrays (J[t],J[t]) of logit derivatives of ccp's wrt. utilities for each market t
    '''
    D = {}

    for t in np.arange(len(q)):
        q_t = q[t]
        D[t] = np.diag(q_t) - np.outer(q_t, q_t)

    return D

In [None]:
def A_array(q, x, psi_stack, nest_count):
    '''
    Compute the matrix A = D G for every market, with D from D_array and G from G_array.

    Args.
        q: a dictionary of T numpy arrays (J[t],) of choice probabilities for each market t
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the psi^g matrices for each market t as outputted by 'Create_nests'
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t

    Returns
        A: a dictionary of T numpy arrays (J[t],K+G): an A matrix for each market t
    '''
    logit_derivative = D_array(q)
    block_matrix = G_array(q, x, psi_stack, nest_count)

    A = {}
    for t in np.arange(len(x)):
        A[t] = np.einsum('jk,kd->jd', logit_derivative[t], block_matrix[t])

    return A

In [None]:
def r_array(q):
    '''
    Compute r = D ln(q) for every market, with D from D_array.

    Entries of q that are not strictly positive get a log value of minus infinity.

    Args.
        q: a dictionary of T numpy arrays (J[t],) of choice probabilities for each market t

    Returns
        r: a dictionary of T numpy arrays (J[t],), the matrix D applied to the log of the ccp's for each market t
    '''
    D = D_array(q)

    r = {}
    for t in np.arange(len(q)):
        q_t = q[t]
        # Log only where q > 0; the remaining entries keep the pre-filled value -inf.
        log_q_t = np.log(q_t, out = -np.inf*np.ones_like(q_t), where = (q_t > 0))
        r[t] = np.einsum('jk,k->j', D[t], log_q_t)

    return r

In [None]:
def WLS_init(q, x, sample_share, psi_stack, nest_count, N):
    '''
    Closed-form weighted least squares estimator used for the initial FKN parameter estimates.

    The weights are diagonal with entries 1/q[t], and every market is scaled by sample_share[t].

    Args.
        q: a dictionary of T numpy arrays (J[t],) of choice probabilities for each market t
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t
        sample_share: A (T,) numpy array of the fraction of observations in each market t
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the psi^g matrices for each market t as outputted by 'Create_nests'
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t
        N: An integer giving the total amount of observations (not used in the computation)

    Returns
        theta_hat: a (K+G,) numpy array of initial FKN parameter estimates
    '''
    A = A_array(q, x, psi_stack, nest_count)
    r = r_array(q)

    n_markets = len(x)
    n_params = A[0].shape[1]

    AWA = np.empty((n_markets, n_params, n_params))
    AWr = np.empty((n_markets, n_params))

    for t in range(n_markets):
        inverse_q = 1/q[t] # Diagonal of the weight matrix, which allows a fast weighted product
        AWA[t] = sample_share[t]*np.einsum('jd,j,jp->dp', A[t], inverse_q, A[t], optimize = True)
        AWr[t] = sample_share[t]*np.einsum('jd,j,j->d', A[t], inverse_q, r[t], optimize = True)

    # Solve the normal equations summed over markets.
    return la.solve(AWA.sum(axis = 0), AWr.sum(axis = 0))
    

Using the observed market shares we may thus find initial parameter estimates $\hat \theta^0$ as described above.

In [None]:
# Initial FKN estimate: the observed market shares y are passed in place of the choice probabilities q.
thetaFKN0 = WLS_init(y, x, pop_share, Psi, Nest_count, N)

In [None]:
# Sum of the strictly positive entries of the lambda block (entries K onwards) of the initial FKN estimate.
np.array([p for p in thetaFKN0[K:] if p>0]).sum()

1.038008441930889

## Regularization for parameter bounds

As we see above, the least squares estimator is not guaranteed to respect the parameter bound $\sum_{g: \hat \lambda_g > 0} \hat \lambda_g<1$. We can use that if we replace $\hat q^0_t$ with the choice probabilities from the maximum likelihood estimator of the logit model, $\hat q^{logit}_t\propto \exp\{X_t\hat \beta^{logit}\}$, and plug these choice probabilities into the WLS estimator described above, it will return $\hat \theta=(\hat \beta^{logit},0,\ldots,0)$ as the parameter estimate. Let $\hat q_t(\alpha)$ denote the weighted average of the logit probabilities and the market shares,
$$
\hat q_t(\alpha) =(1-\alpha) \hat q^{logit}_t+\alpha \hat q^0_t.
$$
 Let $\hat \theta^0(\alpha)$ denote the resulting parameter vector. We perform a line search for values of $\alpha$, $(\frac{1}{2},\frac{1}{4},\frac{1}{8},\ldots)$ until $\hat \theta^0(\alpha)$ yields a feasible parameter vector.


In [None]:
def LogL(Theta, y, x, sample_share, psi_stack, nest_count):
    ''' Mean over markets of the IPDL loglikelihood contributions at the parameter array 'Theta'.
    '''
    contributions = IPDL_loglikelihood(Theta, y, x, sample_share, psi_stack, nest_count)

    return np.mean(contributions)

In [None]:
def LineSearch(Theta0, Logit_Beta, y, x, sample_share, psi_stack, nest_count, N, num_alpha = 5):
    '''
    Shrink the observed market shares towards the logit choice probabilities until the WLS estimate is feasible.

    For alpha = 1/2, 1/4, 1/8, ... the probabilities q(alpha) = (1 - alpha)*q_logit + alpha*y are plugged into
    WLS_init. The search stops at the first alpha for which the positive lambda parameters sum to less than 1.

    Args.
        Theta0: not used; kept so that the signature matches GridSearch,
        Logit_Beta: a numpy array (K,) of logit coefficients used to compute the logit choice probabilities,
        y: a dictionary of T numpy arrays (J[t],) of observed market shares for each market t,
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t,
        sample_share: an array-like of length T with the weight of each market,
        psi_stack: the psi_stack dictionary passed on to WLS_init,
        nest_count: the nest_count dictionary passed on to WLS_init,
        N: An integer giving the total amount of observations (passed on to WLS_init),
        num_alpha: not used; kept so that the signature matches GridSearch

    Returns
        theta_alpha: a (K+G,) numpy array with the WLS estimate at the first feasible alpha. If none of the 99
            values of alpha tried is feasible, the estimate at the last alpha is returned.
    '''
    T = len(x)
    K = x[0].shape[1]

    # Find probabilities
    q_logit = logit_ccp(Logit_Beta, x)
    q_obs = y

    # Search
    alpha0 = 0.5

    for k in range(1,100):

        alpha = alpha0**k

        q_alpha = {t: (1 - alpha)*q_logit[t] + alpha*q_obs[t] for t in np.arange(T)}

        # WLS_init returns the whole (K+G,) parameter vector, so it is used as is.
        theta_alpha = WLS_init(q_alpha, x, sample_share, psi_stack, nest_count, N)

        lambda_alpha = theta_alpha[K:]
        pos_pars = lambda_alpha[lambda_alpha > 0]

        if pos_pars.sum() < 1:
            break

    return theta_alpha

In [None]:
def GridSearch(Theta0, Logit_Beta, y, x, sample_share, psi_stack, nest_count, N, num_alpha = 5):
    '''
    Pick the feasible WLS estimate with the highest mean loglikelihood on a grid of mixing weights alpha.

    For each alpha on an evenly spaced grid over [0, 1], the probabilities q(alpha) = (1 - alpha)*q_logit + alpha*y
    are plugged into WLS_init. Estimates whose positive lambda parameters sum to 1 or more get a loglikelihood of
    minus infinity; the rest are evaluated with LogL.

    Args.
        Theta0: an array-like of length K+G; only its length is used,
        Logit_Beta: a numpy array (K,) of logit coefficients used to compute the logit choice probabilities,
        y: a dictionary of T numpy arrays (J[t],) of observed market shares for each market t,
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t,
        sample_share: an array-like of length T with the weight of each market,
        psi_stack: the psi_stack dictionary passed on to WLS_init and LogL,
        nest_count: the nest_count dictionary passed on to WLS_init and LogL,
        N: An integer giving the total amount of observations (passed on to WLS_init),
        num_alpha: the number of grid points for alpha. Default value is 5.

    Returns
        theta_hat_star: a (K+G,) numpy array with the estimate at the best grid point. If no grid point is
            feasible, np.argmax picks the first one.
    '''
    T = len(x)
    d = len(Theta0)
    K = x[0].shape[1]

    # Find probabilities
    q_logit = logit_ccp(Logit_Beta, x)
    q_obs = y

    # Search
    alpha_line = np.linspace(0, 1, num_alpha)
    LogL_alpha = np.empty((num_alpha,))
    theta_alpha = np.empty((num_alpha, d))

    for k, alpha in enumerate(alpha_line):

        q_alpha = {t: (1 - alpha)*q_logit[t] + alpha*q_obs[t] for t in np.arange(T)}
        theta_alpha[k,:] = WLS_init(q_alpha, x, sample_share, psi_stack, nest_count, N)

        lambda_alpha = theta_alpha[k,K:]
        pos_pars = lambda_alpha[lambda_alpha > 0]

        if pos_pars.sum() >= 1:
            # Infeasible parameters can never be selected. (-np.inf is used because np.NINF no longer exists in NumPy 2.)
            LogL_alpha[k] = -np.inf
        else:
            LogL_alpha[k] = LogL(theta_alpha[k,:], y, x, sample_share, psi_stack, nest_count)

    # Pick the best set of parameters
    alpha_star = np.argmax(LogL_alpha)
    theta_hat_star = theta_alpha[alpha_star,:]

    return theta_hat_star

Implementing the grid search method, we find the corresponding parameters $\hat \theta^*$.

In [None]:
# Regularize the initial FKN estimate: the grid search mixes the observed market shares with the logit choice
# probabilities, so it is given the estimated logit coefficients Logit_beta (not the zero start vector beta_0).
theta_alpha = GridSearch(thetaFKN0, Logit_beta, y, x, pop_share, Psi, Nest_count, N)

[5.351103169530736e-16, 0.02722153305673005]


[0.9048287653890591, 0.009636926685372499]
[0.9331100364359847, 0.005436328885898605]
[0.9510042620683576, 0.002990924562664173]


In [None]:
# Sum of the strictly positive entries of the lambda block (entries K onwards) of the grid-search estimate.
np.array([p for p in theta_alpha[K:] if p>0]).sum()

0.9510042620683576

## Iterated FKN estimator

The iterated estimator is as the initial one, except there is an additional term on $\hat \varepsilon$. First, we update the choice probabilities,
$$
\hat q^k_i=p(\mathbf X_i,\hat \theta^{k-1})\\
$$
Then we assign
$$
\hat D^k_i=\nabla^2_{qq}\Omega(\hat q_i^k|\hat \lambda^{k-1})^{-1}-\hat q^k_i (\hat q^k_i)'
$$
and then construct the residual
$$
\hat \varepsilon^k_i(\theta)=\hat D^k_i\left( u(x_i,\beta)-\nabla_q \Omega(\hat q_i^k|\lambda)\right) -y_i+\hat q_i^k,
$$
Which can once again be simplified as
$$
\hat \varepsilon^k_i(\theta)= \hat A_i^k \theta-\hat r^k_i,
$$
where
$$
\hat A^k_i=\hat D_i^k\hat G^k_i, \qquad \hat r_i^k =\hat D^k_i\ln \hat q_i^k+y_i-\hat q_i^k
$$
and where $\hat G^k_i$ is constructed as in the initial estimator. Using the weighted least squares estimator with weights $\hat W_i^k=\textrm{diag}(\hat q^k_i)^{-1}$, we get the estimator
$$
\hat \theta^k = \arg \min_{\theta}\frac{1}{n}\sum_i \hat \varepsilon^k_i(\theta)'\hat W_i^k \hat \varepsilon^k_i(\theta).
$$
We can once again solve it in closed form as
$$
\hat \theta^k =\left( \frac{1}{n}\sum_i (\hat A^k_i)'\hat W_i^k \hat A^k_i\right)^{-1}\left( \frac{1}{n}\sum_i (\hat A_i^k)'\hat W_i^k \hat r_i^k\right)
$$
Now we implement this procedure and iterate starting from our initial guess $\hat \theta^{*}$


In [None]:
def WLS(Theta, y, x, sample_share, psi_stack, nest_count, N):
    r'''
    This function performs one step of the iterated weighted least squares estimator \hat \theta^k: it evaluates the model at the current parameters Theta and returns the updated parameter estimates together with their estimated standard errors.

    Args.
        Theta: a (K+G,) numpy array of the current parameter estimates at which the choice probabilities are evaluated
        y: a dictionary of T numpy arrays, indexed by market t, that are added to the (J[t],) vector D[t] log(q[t]) when constructing r (FKN_estimator passes its q_obs argument here)
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t
        sample_share: A (T,) numpy array of the fraction of observations in each market t
        psi_stack: a dictionary of T numpy arrays (J[t] + sum(C_g),J[t]) of the J[t] by J[t] identity stacked on top of the \psi^g matrices for each market t as outputted by 'Create_nests'
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t
        N: An integer giving the total amount of observations

    Returns
        theta_hat: a (K+G,) numpy array of updated FKN parameter estimates
        se_hat: a (K+G,) numpy array of standard errors for the updated FKN parameter estimates
    '''
    T = len(x)
    d = Theta.shape[0]

    # Get ccp's at the current parameter vector
    q = IPDL_ccp(Theta, x, psi_stack, nest_count)

    # D is the IPDL derivative of the ccp's wrt. utilities (instead of the Logit derivative);
    # G is constructed from the ccp's by G_array.
    D = ccp_gradient(q, x, Theta, psi_stack, nest_count)
    G = G_array(q, x, psi_stack, nest_count)

    # Market-by-market contributions to the normal equations
    AWA = np.empty((T,d,d))
    AWr = np.empty((T,d))

    for t in np.arange(T):
        # A_t = D_t G_t
        A_t = np.einsum('jk,kd->jd', D[t], G[t])

        # log(q) where q > 0, and -inf where q is not positive
        log_q_t = np.log(q[t], out = -np.inf*np.ones_like(q[t]), where=(q[t] > 0))

        # r_t = D_t log(q_t) + y_t
        r_t = np.einsum('jk,k->j', D[t], log_q_t) + y[t]

        # Weights are the inverse ccp's; each market is scaled by its sample share
        W_t = 1./q[t]
        AWA[t,:,:] = sample_share[t]*np.einsum('jd,j,jp->dp', A_t, W_t, A_t, optimize = True)
        AWr[t,:] = sample_share[t]*np.einsum('jd,j,j->d', A_t, W_t, r_t, optimize = True)

    # Solve the normal equations; the summed A'WA matrix is reused for the standard errors
    AWA_sum = AWA.sum(axis = 0)
    theta_hat = la.solve(AWA_sum, AWr.sum(axis = 0))
    se_hat = np.sqrt(np.diag(la.inv(AWA_sum)) / N)

    return theta_hat,se_hat

In [None]:
def FKN_estimator(logit_beta, q_obs, x, sample_share, psi_stack, nest_count, N, tol = 1.0e-15, max_iters = 1000):
    '''
    This function computes the iterated FKN estimator: starting from the initial estimate given by 'WLS_init', it repeatedly applies the 'WLS' update until the largest absolute change in any parameter falls below tol, or until max_iters updates have been made.

    Args.
        logit_beta: passed on to 'GridSearch' (only used when the grid search is triggered)
        q_obs: observed market shares for each market t, passed on to 'WLS_init', 'GridSearch', 'WLS' and 'LogL'
        x: a dictionary of T numpy arrays (J[t],K) of covariates for each market t
        sample_share: A (T,) numpy array of the fraction of observations in each market t
        psi_stack: a dictionary of T numpy arrays as outputted by 'Create_nests'
        nest_count: a dictionary of T numpy arrays (G,) containing the amount of nests in each category g in each market t
        N: An integer giving the total amount of observations
        tol: convergence tolerance on the maximum absolute parameter change between two iterations
        max_iters: maximum number of WLS updates (must be at least 1)

    Returns
        res: a dictionary with keys
            'theta': the parameter estimates from the last WLS update
            'se': the corresponding standard errors from the last WLS update
            'fun': the negative of 'LogL' evaluated at 'theta'
            'iter': the (zero-based) index of the update at which convergence was detected, or max_iters if the iterations did not converge
            'succes': True if the convergence criterion was met, False otherwise

    Raises
        ValueError: if max_iters is smaller than 1, since no estimate could be returned.
    '''
    if max_iters < 1:
        raise ValueError('max_iters must be at least 1')

    K = x[0].shape[1]

    theta_init = WLS_init(q_obs, x, sample_share, psi_stack, nest_count, N)

    # If the strictly positive parameters after the first K entries sum to 1 or more,
    # start from the grid search output instead of the initial estimate.
    positive_sum = np.array([p for p in theta_init[K:] if p>0]).sum()
    if positive_sum >= 1:
        theta0 = GridSearch(theta_init, logit_beta, q_obs, x, sample_share, psi_stack, nest_count, N)
    else:
        theta0 = theta_init

    # Defaults describe the non-converged outcome; they are overwritten on convergence.
    # (The loop index never reaches max_iters, so this cannot be detected inside the loop.)
    succes = False
    n_iter = max_iters

    for k in np.arange(max_iters):
        theta1, se1 = WLS(theta0, q_obs, x, sample_share, psi_stack, nest_count, N)

        # Check convergence in the sup-norm of the parameter change
        dist = np.max(np.abs(theta1 - theta0))

        if dist < tol:
            succes = True
            n_iter = k
            break

        # Iteration step
        theta0 = theta1

    # The objective is evaluated on the q_obs argument rather than on a notebook-level variable.
    res = {'theta': theta1,
           'se': se1,
           'fun': -LogL(theta1, q_obs, x, sample_share, psi_stack, nest_count),
           'iter': n_iter,
           'succes': succes}

    return res
        

In [None]:
# Run the iterated FKN estimator with a convergence tolerance of 1e-8 on the parameter change
# and at most 1000 WLS updates; `res` is a dictionary with keys 'theta', 'se', 'fun', 'iter' and 'succes'.
res = FKN_estimator(beta_0, y, x, pop_share, Psi, Nest_count, N, tol=1.0e-8, max_iters=1000)

[5.351103169530736e-16, 0.02722153305673005]
[0.9048287653890591, 0.009636926685372499]
[0.9331100364359847, 0.005436328885898605]
[0.9510042620683576, 0.002990924562664173]
[0.9263335667528774, 0.0014941817336802585]


In [None]:
# Collect the FKN parameter estimates and standard errors from the estimation result.
FKN_theta = res['theta']
FKN_SE = res['se']
# NOTE(review): FKN_t and FKN_p are not used in this cell; reg_table below is called with the
# estimates and standard errors directly (presumably it recomputes t and p itself — confirm).
FKN_t, FKN_p = IPDL_t_p(FKN_SE, FKN_theta, N)
# The last expression is the cell output: the regression table for the FKN estimates.
reg_table(FKN_theta, FKN_SE, N, x_vars, nest_vars)

variables,theta,se,t (theta == 0),p
in_out,-10.0484***,0.00308,3266.6,0.0
cy,-0.9218***,0.0015,612.937,0.0
hp,-2.8092***,0.00193,1455.524,0.0
we,-0.0536***,0.00137,39.014,0.0
le,-1.5624***,0.00143,1089.602,0.0
wi,5.5314***,0.00299,1850.329,0.0
he,-0.0604***,0.00179,33.676,0.0
li,-0.8833***,0.00103,859.29,0.0
sp,2.8631***,0.00202,1414.538,0.0
ac,0.3022***,0.00064,475.615,0.0
