In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal, multinomial
from scipy.special import softmax

np.random.seed(42)

# Settings
num_companies = 40
num_categories = 4  # categories: 0 (pivot), 1, 2, 3
num_prev_categories = 4

# 1. Generate random effects for each company (for categories 1-3, pivot is category 0)
mean_effect = np.zeros(num_categories - 1)
# Define covariance matrix with correlations between categories
cov_effect = np.array([
    [1.0, 0.5, 0.3],
    [0.5, 1.0, 0.4],
    [0.3, 0.4, 1.0]
])

company_effects = multivariate_normal.rvs(mean=mean_effect, cov=cov_effect, size=num_companies)
# Add pivot logits = 0
company_effects = np.hstack([np.zeros((num_companies, 1)), company_effects])

# Store company random effects in a DataFrame
company_ids = [f'company_{i}' for i in range(num_companies)]
company_effects_df = pd.DataFrame(company_effects, columns=[f'logit_{i}' for i in range(num_categories)])
company_effects_df['company'] = company_ids

# 2. Generate observations per company
data = []
for idx, company in enumerate(company_ids):
    n_obs = np.random.randint(1, 11)  # Between 1 and 10 observations per company
    
    # 3. Randomly assign prev_type for each observation
    prev_types = np.random.choice(num_prev_categories, size=n_obs)
    
    # Define fixed effects for prev_type (logits for categories 1-3, pivot is 0)
    # Here just an arbitrary fixed pattern; modify as desired
    prev_type_effects = np.array([
        [0.0, 0.0, 0.0],   # prev_type = 0 (reference)
        [0.5, -0.5, 0.2],  # prev_type = 1
        [-0.3, 0.6, -0.4], # prev_type = 2
        [0.2, -0.2, 0.5]   # prev_type = 3
    ])
    prev_type_effects = np.hstack([np.zeros((num_prev_categories, 1)), prev_type_effects])
    
    for prev in prev_types:
        # 4. Compute logits (company + prev_type)
        logits = company_effects[idx] + prev_type_effects[prev]
        
        # Convert logits to probabilities using softmax
        probs = softmax(logits)
        
        # 5. Sample observed type based on probs
        observed_type = np.random.choice(num_categories, p=probs)
        
        # Append observation
        data.append({
            'company': company,
            'prev_type': f'type_{prev}',
            'type': f'type_{observed_type}'
        })

# Convert to DataFrame
simulated_df = pd.DataFrame(data)


print(simulated_df.head(10))


     company prev_type    type
0  company_0    type_2  type_2
1  company_1    type_3  type_3
2  company_1    type_1  type_3
3  company_1    type_1  type_0
4  company_1    type_1  type_1
5  company_1    type_2  type_2
6  company_2    type_3  type_0
7  company_2    type_3  type_2
8  company_2    type_2  type_2
9  company_3    type_3  type_3


In [9]:
simulated_df

Unnamed: 0,company,prev_type,type
0,company_0,type_2,type_2
1,company_1,type_3,type_3
2,company_1,type_1,type_3
3,company_1,type_1,type_0
4,company_1,type_1,type_1
...,...,...,...
206,company_39,type_0,type_1
207,company_39,type_0,type_1
208,company_39,type_0,type_0
209,company_39,type_0,type_3


In [10]:
# Your original dataframe is simulated_df
simulated_df['type_num'] = simulated_df['type'].str.split('_').str[1].astype(int)

# Generate dummy variables for categories 1,2,3 (excluding baseline 0)
for i in range(1, 4):
    simulated_df[f'type_{i}'] = (simulated_df['type_num'] == i).astype(int)


In [11]:
simulated_df

Unnamed: 0,company,prev_type,type,type_num,type_1,type_2,type_3
0,company_0,type_2,type_2,2,0,1,0
1,company_1,type_3,type_3,3,0,0,1
2,company_1,type_1,type_3,3,0,0,1
3,company_1,type_1,type_0,0,0,0,0
4,company_1,type_1,type_1,1,1,0,0
...,...,...,...,...,...,...,...
206,company_39,type_0,type_1,1,1,0,0
207,company_39,type_0,type_1,1,1,0,0
208,company_39,type_0,type_0,0,0,0,0
209,company_39,type_0,type_3,3,0,0,1


Work in progress... still not doing what i think it should be doing.

In [16]:
import bambi as bmb

model = bmb.Model('type ~ 0 + prev_type + (0 + type_1 + type_2 + type_3 | company)', data=simulated_df, family='categorical')
results = model.fit()


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [prev_type, type_1|company_sigma, type_1|company_offset, type_2|company_sigma, type_2|company_offset, type_3|company_sigma, type_3|company_offset]


Output()

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 12 seconds.


In [18]:
import arviz as az
az.summary(results ,var_names = ['~company'])



Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
"prev_type[type_1, type_0]",-4.187,1.839,-7.748,-1.166,0.032,0.033,3911.0,2395.0,1.0
"prev_type[type_1, type_1]",-3.559,1.889,-7.184,-0.524,0.035,0.047,3639.0,2483.0,1.0
"prev_type[type_1, type_2]",-4.263,2.271,-8.547,-0.367,0.039,0.035,3731.0,2896.0,1.0
"prev_type[type_1, type_3]",-3.614,1.652,-6.781,-0.813,0.031,0.036,3584.0,2281.0,1.0
"prev_type[type_2, type_0]",-3.700,1.608,-6.799,-0.979,0.028,0.030,3771.0,2524.0,1.0
...,...,...,...,...,...,...,...,...,...
"type_3|company[type_3, company_5]",18.826,9.546,4.857,36.528,0.179,0.196,3372.0,2587.0,1.0
"type_3|company[type_3, company_6]",17.182,9.662,2.662,34.393,0.178,0.194,3398.0,2747.0,1.0
"type_3|company[type_3, company_7]",17.453,9.656,2.720,35.085,0.168,0.186,3809.0,3165.0,1.0
"type_3|company[type_3, company_8]",0.036,15.252,-27.516,30.465,0.185,0.279,6902.0,2609.0,1.0
