# Validation

In [None]:
import math

import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
import numpy as np
import numpyro
import numpyro.distributions as dist
import pandas as pd
import scipy
from scipy import stats
import statsmodels.api as sm
import tqdm

from frugalCopyla.model import Copula_Model
from frugalCopyla import copula_functions as copula_lpdfs

  from .autonotebook import tqdm as notebook_tqdm


## Example 1

In [None]:
# Example 1 specification handed to `Copula_Model`: one entry per variable
# (Z, X, Y). Each entry names a numpyro distribution class, a formula string
# per distribution parameter, the coefficient list for each formula, and a
# `link` setting. This example has no 'copula' entry.
input_dict = {
    # NOTE(review): the formula left-hand side is 'A' although this variable is
    # keyed 'Z' -- presumably only the right-hand side is used; confirm against
    # Copula_Model.
    'Z': {
        'dist': dist.Normal, 
        'formula': {'loc': 'A ~ 1', 'scale': 'A ~ 1'}, 
        'coeffs': {'loc': [0.], 'scale': [1.]}, 
        'link': None
    }, 
    # X's loc formula includes Z; the regression check on X further down
    # expects const = 0 and Z = 0.5, matching the 'loc' coefficients here.
    'X': {
        'dist': dist.Normal, 
        'formula': {'loc': 'X ~ Z', 'scale': 'X ~ 1'}, 
        'coeffs': {'loc': [0., 0.5], 'scale': [1.]}, 
        'link': None
    },
    # NOTE(review): 'link' is a dict ({'loc': None}) here but plain None for
    # Z and X -- confirm both forms are treated the same way.
    'Y': {
        'dist': dist.Normal, 
        'formula': {'loc': 'Y ~ X', 'scale': 'Y ~ 1'}, 
        'coeffs': {'loc': [0., 0.5], 'scale': [0.5]}, 
        'link': {'loc': None}
    }
}

Prepare the `Copula_Model` and simulate data from it:

In [None]:
%%time
cop_mod = Copula_Model(input_dict)
sim_data = cop_mod.simulate_data(num_warmup=2000, num_samples=1_000_000, joint_status='continuous', seed=1)
sim_data = pd.DataFrame(sim_data)[['Z', 'X', 'Y']]
sim_data.describe()

In [None]:
# Marginal check on Z: a one-sample Kolmogorov-Smirnov test of the simulated
# draws against the standard normal CDF must not reject at the 0.1 level.
assert stats.kstest(
    sim_data['Z'].to_numpy(),
    stats.norm.cdf,
).pvalue > 0.1

In [None]:
# Conditional check on X: after removing the 0.5 * Z term, the remainder is
# tested against the standard normal CDF with a one-sample KS test.
x_minus_half_z = sim_data['X'].to_numpy() - 0.5 * sim_data['Z'].to_numpy()
assert stats.kstest(x_minus_half_z, stats.norm.cdf).pvalue > 0.1

In [None]:
# Regress X on Z (with intercept) and verify that the coefficients used in the
# specification (const = 0, Z = 0.5) lie strictly inside
# estimate +/- 2 standard errors.
lm = sm.OLS(sim_data['X'].to_numpy(), sm.add_constant(sim_data[['Z']]))
lm_results = lm.fit()

summary = (
    pd.DataFrame({'estimate': lm_results.params, '2sd': 2 * lm_results.bse})
    .assign(true_vals=[0, 0.5])
    .assign(
        # True when the target value falls within the two-sided band.
        true_estimate=lambda s: (
            (s['estimate'] - s['2sd'] < s['true_vals'])
            & (s['true_vals'] < s['estimate'] + s['2sd'])
        )
    )
)
display(summary)
assert summary.loc['const', 'true_estimate']
assert summary.loc['Z', 'true_estimate']

In [None]:
# Regress Y on Z and X (with intercept) and verify that every expected
# coefficient lies strictly inside estimate +/- 2 standard errors.
# Expected values: const = 0 and X = 0.5 come from Y's 'loc' coefficients in
# `input_dict`; Z = 0 because Y's formula does not include Z and this example
# has no copula term.
lm = sm.OLS(sim_data[['Y']].values.ravel(), sm.add_constant(sim_data[['Z', 'X']]))
lm_results = lm.fit()

summary = pd.concat([
    pd.DataFrame(lm_results.params).rename(columns={0: 'estimate'}),
    pd.DataFrame(lm_results.bse * 2).rename(columns={0: '2sd'})
], axis=1)
summary['true_vals'] = [0, 0, 0.5]
summary['true_estimate'] = (
    (summary['estimate'] + summary['2sd'] > summary['true_vals'])
    & (summary['estimate'] - summary['2sd'] < summary['true_vals'])
)
display(summary)
assert summary.loc['const', 'true_estimate'], "intercept of Y ~ Z + X is outside the 2-SE band around 0"
assert summary.loc['Z', 'true_estimate'], "Z coefficient of Y ~ Z + X is outside the 2-SE band around 0"
# The X coefficient is the one non-zero parameter of Y in the specification;
# the original cell displayed it but never asserted it.
assert summary.loc['X', 'true_estimate'], "X coefficient of Y ~ Z + X is outside the 2-SE band around 0.5"

## Example 2

In [None]:
# Example 2 specification: the same Z / X / Y layout as Example 1 (with Y's
# X coefficient set to 1), an additional variable U, and a 'copula' entry
# linking Z and Y.
input_dict_2 = {
    # NOTE(review): the formula left-hand side is 'A' although this variable is
    # keyed 'Z' -- presumably only the right-hand side is used; confirm against
    # Copula_Model.
    'Z': {
        'dist': dist.Normal, 
        'formula': {'loc': 'A ~ 1', 'scale': 'A ~ 1'}, 
        'coeffs': {'loc': [0.], 'scale': [1.]}, 
        'link': None
    }, 
    # U appears in no other formula nor in the copula 'vars', and it is not
    # among the columns kept when the simulated data is turned into a
    # DataFrame in the following cell.
    'U': {
        'dist': dist.Normal, 
        'formula': {'loc': 'U ~ 1', 'scale': 'U ~ 1'}, 
        'coeffs': {'loc': [0.], 'scale': [1.]}, 
        'link': None
    },     
    'X': {
        'dist': dist.Normal, 
        'formula': {'loc': 'X ~ Z', 'scale': 'X ~ 1'}, 
        'coeffs': {'loc': [0., 0.5], 'scale': [1.]}, 
        'link': None
    },
    # NOTE(review): 'link' is a dict ({'loc': None}) here but plain None for
    # the other variables -- confirm both forms are treated the same way.
    'Y': {
        'dist': dist.Normal, 
        'formula': {'loc': 'Y ~ X', 'scale': 'Y ~ 1'}, 
        'coeffs': {'loc': [0., 1.], 'scale': [0.5]}, 
        'link': {'loc': None}
    },
    # Copula term: uses the multivariate Gaussian copula log-density from
    # `copula_functions`, maps its arguments 'u' and 'w' to Z and Y, and sets
    # the intercept-only parameter 'rho_zy' to 0.5.
    'copula': {
        'class': copula_lpdfs.multivar_gaussian_copula_lpdf, 
        'vars': {'u': 'Z', 'w': 'Y'}, 
        'formula': {'rho_zy': 'c ~ 1'}, 
        'coeffs': {'rho_zy': [0.5]}, 
        'link': {'rho_zy': None}
    }    
}

In [None]:
%%time
cop_mod_2 = Copula_Model(input_dict_2)
sim_data_2 = cop_mod_2.simulate_data(num_warmup=2000, num_samples=1_000_000, joint_status='continuous', seed=1)
sim_data_2 = pd.DataFrame(sim_data_2)[['Z', 'X', 'Y']]

In [None]:
# Example 2: regress X on Z (with intercept) and verify that const = 0 and
# Z = 0.5 lie strictly inside estimate +/- 2 standard errors.
lm = sm.OLS(sim_data_2['X'].to_numpy(), sm.add_constant(sim_data_2[['Z']]))
lm_results = lm.fit()

summary = (
    pd.DataFrame({'estimate': lm_results.params, '2sd': 2 * lm_results.bse})
    .assign(true_vals=[0, 0.5])
    .assign(
        # True when the target value falls within the two-sided band.
        true_estimate=lambda s: (
            (s['estimate'] - s['2sd'] < s['true_vals'])
            & (s['true_vals'] < s['estimate'] + s['2sd'])
        )
    )
)
display(summary)
assert summary.loc['const', 'true_estimate']
assert summary.loc['Z', 'true_estimate']

In [None]:
# Example 2: regress Y on Z and X (with intercept) and verify that const = 0
# and X = 1 lie strictly inside estimate +/- 2 standard errors.
# NOTE(review): the 'Z' row (target listed as 0.5) is displayed but never
# asserted -- confirm what Z coefficient the Gaussian copula with
# rho_zy = 0.5 is expected to induce before relying on that row.
lm = sm.OLS(sim_data_2['Y'].to_numpy(), sm.add_constant(sim_data_2[['Z', 'X']]))
lm_results = lm.fit()

summary = (
    pd.DataFrame({'estimate': lm_results.params, '2sd': 2 * lm_results.bse})
    .assign(true_vals=[0, 0.5, 1])
    .assign(
        # True when the target value falls within the two-sided band.
        true_estimate=lambda s: (
            (s['estimate'] - s['2sd'] < s['true_vals'])
            & (s['true_vals'] < s['estimate'] + s['2sd'])
        )
    )
)
display(summary)
assert summary.loc['const', 'true_estimate']
assert summary.loc['X', 'true_estimate']

### Check that a binary (Bernoulli) variable works OK

In [None]:
# Specification with a binary Z: Z uses `dist.BernoulliProbs` with an
# intercept-only 'probs' of 0.5, while X and Y stay Normal as in the earlier
# examples. There is no 'copula' entry.
input_dict_3 = {
    # NOTE(review): the formula left-hand side is 'A' although this variable is
    # keyed 'Z' -- presumably only the right-hand side is used; confirm against
    # Copula_Model.
    'Z': {
        'dist': dist.BernoulliProbs, 
        'formula': {'probs': 'A ~ 1'}, 
        'coeffs': {'probs': [0.5]}, 
        'link': None
    }, 
    'X': {
        'dist': dist.Normal, 
        'formula': {'loc': 'X ~ Z', 'scale': 'X ~ 1'}, 
        'coeffs': {'loc': [0., 0.5], 'scale': [1.]}, 
        'link': None
    },
    'Y': {
        'dist': dist.Normal, 
        'formula': {'loc': 'Y ~ X', 'scale': 'Y ~ 1'}, 
        'coeffs': {'loc': [0., 1.], 'scale': [0.5]}, 
        'link': None
    }   
}

In [None]:
%%time
cop_mod_3 = Copula_Model(input_dict_3)
sim_data_3 = cop_mod_3.simulate_data(num_warmup=5000, num_samples=1_000_000, joint_status='mixed', seed=0)
sim_data_3 = pd.DataFrame(sim_data_3)[['Z', 'X', 'Y']]
sim_data_3.describe()

In [None]:
# Binary-Z example: regress X on Z (with intercept) and verify that const = 0
# and Z = 0.5 lie strictly inside estimate +/- 2 standard errors.
lm = sm.OLS(sim_data_3['X'].to_numpy(), sm.add_constant(sim_data_3[['Z']]))
lm_results = lm.fit()

summary = (
    pd.DataFrame({'estimate': lm_results.params, '2sd': 2 * lm_results.bse})
    .assign(true_vals=[0, 0.5])
    .assign(
        # True when the target value falls within the two-sided band.
        true_estimate=lambda s: (
            (s['estimate'] - s['2sd'] < s['true_vals'])
            & (s['true_vals'] < s['estimate'] + s['2sd'])
        )
    )
)
display(summary)
assert summary.loc['const', 'true_estimate']
assert summary.loc['Z', 'true_estimate']

In [None]:
# Binary-Z example: regress Y on Z and X (with intercept) and verify that
# const = 0 and X = 1 lie strictly inside estimate +/- 2 standard errors.
# The 'Z' row (target 0) is displayed but not asserted.
lm = sm.OLS(sim_data_3['Y'].to_numpy(), sm.add_constant(sim_data_3[['Z', 'X']]))
lm_results = lm.fit()

summary = (
    pd.DataFrame({'estimate': lm_results.params, '2sd': 2 * lm_results.bse})
    .assign(true_vals=[0, 0., 1])
    .assign(
        # True when the target value falls within the two-sided band.
        true_estimate=lambda s: (
            (s['estimate'] - s['2sd'] < s['true_vals'])
            & (s['true_vals'] < s['estimate'] + s['2sd'])
        )
    )
)
display(summary)
assert summary.loc['const', 'true_estimate']
assert summary.loc['X', 'true_estimate']

In [None]:
# Goodness-of-fit check for the binary Z, specified with probs = 0.5.
# `stats.chisquare` expects the observed frequency of each category, so the raw
# 0/1 draws are tabulated into [count of 0s, count of 1s] first. Passing the
# draws themselves would treat every draw as its own category and give a
# p-value near 1 regardless of the actual proportion of ones.
# With no `f_exp`, the expected frequencies are equal across the two
# categories, i.e. the 50/50 split implied by probs = 0.5.
z_counts = np.bincount(sim_data_3['Z'].to_numpy().astype(int), minlength=2)
assert stats.chisquare(z_counts).pvalue > 0.1, (
    f"observed Z counts {z_counts.tolist()} are inconsistent with Bernoulli(0.5)"
)