<a href="https://colab.research.google.com/github/Christheoneoneil/stat330_final/blob/model_fitting/model_fitting_regularzied_horseshoe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Relevant Imports


In [1]:
import stan
# !pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import arviz as az
import seaborn as sns
# %load_ext rpy2.ipython

# Preliminary Code Cleaning for Multinomial Logistic Regression

In [2]:
def read_data(file_name: str):
  """
  Load a CSV file into a cleaned pandas data frame.

  param file_name: path of the CSV file; its "Unnamed: 0" column is used
    as the index
  returns: data frame without any row that contains a missing value and
    without repeated column labels (the first occurrence is kept)
  """
  raw = pd.read_csv(file_name, index_col = "Unnamed: 0")
  complete_rows = raw.dropna(axis = 0, how = "any")
  # Boolean mask that is True for the first occurrence of each column label.
  first_occurrence = ~complete_rows.columns.duplicated()
  return complete_rows.loc[:, first_occurrence]

In [3]:
def prep_data(df: pd.DataFrame, key: str, unwanted_cols: list):
  """
  Prepare a raw data frame for the multinomial logit regression.

  Min-max scales the continuous columns, drops the unwanted columns and
  recodes STRAT, SELECTIVITY and DOBYear as integer codes 1..m, numbered in
  order of first appearance. The input frame is not modified.

  param df: data frame that needs to be formatted
  param key: name of the data set; for "imputed" the test score columns are
    scaled as well and the "Unnamed: 0.1" column is dropped
  param unwanted_cols: columns that are ultimately not needed for analysis
  returns: prepared copy of the data frame for logit regression
  """
  from sklearn import preprocessing
  df_copy = df.copy()
  unnormed_cols = ["STUDWGT"]

  if key == "imputed":
    unnormed_cols = ['ACTComposite', 'SATMath',
                     'SATVerbal', 'SATWriting'] + unnormed_cols
    df_copy = df_copy.drop(columns=["Unnamed: 0.1"])

  normalizer = preprocessing.MinMaxScaler()
  df_copy[unnormed_cols] = normalizer.fit_transform(df_copy[unnormed_cols])
  df_copy = df_copy.drop(columns=unwanted_cols)

  recode_vars = ["STRAT", "SELECTIVITY", "DOBYear"]
  for var in recode_vars:
    codes = {value: code
             for code, value in enumerate(df_copy[var].unique(), start=1)}
    # Assign the result back to the frame: an in-place replace on the
    # `df_copy[var]` selection is a chained assignment, which pandas
    # deprecates and which leaves the frame unchanged under Copy-on-Write.
    df_copy[var] = df_copy[var].map(codes)
  return df_copy

In [5]:
# Read each variant of the data set and prepare it for modelling, keyed by
# the variant name that prep_data expects.
unnecessary_cols = ["ACERECODE"]
df_dict = {}
for key, path in [("imputed", "data/final_frame_imputed.csv"),
                  ("non_imputed", "data/final_frame_non_imputed.csv")]:
  df = read_data(path)
  df_dict[key] = prep_data(df, key, unnecessary_cols)

# Setting Up Model

In [6]:

def fit_model(model_code: str, X: np.array, Y: np.array, n: int, k: int,
              flag_val: int, p0: int = 10, random_seed: int = None):
  """
  Build the Stan program with the regularized horseshoe data and sample it.

  param model_code: stan formatted code for model
  param X: nxk array of covariates
  param Y: array of target values
  param n: number of rows
  param k: number of covariates
  param flag_val: value passed to the model as "do_prior_predictive"
  param p0: prior guess of the number of relevant covariates, used in
    scale_global = p0 / (k - p0) / sqrt(n); defaults to 10, the value that
    used to be hard-coded
  param random_seed: optional seed forwarded to stan.build so that a run
    can be repeated; None keeps the previous unseeded behaviour
  returns: the object returned by sampling the built model with its
    default settings
  raises ValueError: if n < 1 or if p0 is not strictly between 0 and k
  """
  # Fail early with a clear message: k == p0 divides by zero and k < p0
  # yields a negative scale, which the model's <lower=0> bound rejects.
  if n < 1:
    raise ValueError(f"n must be at least 1, got {n}")
  if not 0 < p0 < k:
    raise ValueError(
        f"p0 must satisfy 0 < p0 < k to give a positive global scale, "
        f"got p0={p0} and k={k}")
  scale_global = float(p0 / (k - p0) / np.sqrt(n))

  # TODO: the slab and global hyperparameters below were chosen arbitrarily
  # and still need to be justified.
  data = {"X": X, "Y": Y, "n": n, "k": k,
          "do_prior_predictive": flag_val, "scale_icept": 1,
          "scale_global": scale_global, "nu_global": 1,
          "nu_local": 2, "slab_scale": 1, "slab_df": 20}
  mod = stan.build(model_code, data=data, random_seed=random_seed)
  samples = mod.sample()
  return samples


In [None]:
# Stan program: multinomial (softmax) logistic regression with a regularized
# horseshoe prior on the coefficients.
#
# Two fixes compared with the earlier version of this program:
#   * The likelihood was `Y ~ categorical_logit(f)` with `f` a vector[n]. That
#     treats the n per-row predictors as the logits of ONE n-category
#     distribution shared by every observation. Each category now has its own
#     intercept and coefficient column, and each row gets its own logits.
#   * `do_prior_predictive` is passed in the data but was never declared, so
#     it had no effect. It is now declared and, when 1, skips the likelihood
#     so that the draws come from the prior alone.
#
# `beta` is now a k x 4 matrix (one column per category) and the n-vector `f`
# is no longer stored.
multinomial_log_stan_code = """
data {
    int <lower=0> n ;            // number of observations
    int <lower=0> k ;            // number of predictors
    array[n] int <lower=1, upper=4> Y; // outputs
    matrix[n, k] X;              // inputs
    int <lower=0, upper=1> do_prior_predictive; // 1 = leave out the likelihood
    real <lower=0> scale_icept;  // prior std for the intercepts
    real <lower=0> scale_global; // scale for the half - t prior for tau
    real <lower=0> nu_global;    // degrees of freedom for the half - t prior

    // for tau
    real <lower =1> nu_local;    // degrees of freedom for the half - t priors

    // for lambdas
    real <lower =0> slab_scale;  // slab scale for the regularized horseshoe
    real <lower =0> slab_df;     // slab degrees of freedom for the regularized
    // horseshoe
}
transformed data {
    int n_cat = 4;               // number of outcome categories (upper bound of Y)
}
parameters {
    vector[n_cat] beta0 ;        // one intercept per category
    matrix[k, n_cat] z ;
    real <lower=0> aux1_global;
    real <lower=0> aux2_global;
    vector <lower=0>[k] aux1_local;
    vector <lower=0>[k] aux2_local;
    real <lower=0> caux;
}
transformed parameters {
    real <lower=0> tau ;                // global shrinkage parameter
    vector <lower=0>[k] lambda ;        // local shrinkage parameter
    vector <lower=0>[k] lambda_tilde ;  // 'truncated' local shrinkage parameter
    real <lower=0> c ;                  // slab scale
    matrix[k, n_cat] beta;              // regression coefficients, one column per category
    lambda = aux1_local.*sqrt(aux2_local);
    tau = aux1_global*sqrt(aux2_global)*scale_global;
    c = slab_scale * sqrt(caux);
    lambda_tilde = sqrt(c^2*square(lambda)./(c ^2 + tau ^2*square(lambda)));
    // each predictor keeps a single shrinkage scale shared by all categories
    beta = diag_pre_multiply(lambda_tilde*tau, z);
}
model {
    // half - t priors for lambdas and tau , and inverse - gamma for c ^2
    to_vector(z) ~ normal(0 , 1);
    aux1_local ~ normal(0 , 1);
    aux2_local ~ inv_gamma(0.5*nu_local, 0.5*nu_local);
    aux1_global ~ normal(0 , 1);
    aux2_global ~ inv_gamma(0.5*nu_global, 0.5*nu_global);
    caux ~ inv_gamma(0.5*slab_df , 0.5*slab_df);
    beta0 ~ normal(0, scale_icept);
    // The softmax is unchanged when a constant is added to every category;
    // the priors above are what keep beta0 and beta proper.
    if (do_prior_predictive == 0) {
        Y ~ categorical_logit_glm(X, beta0, beta);
    }
}
"""

In [11]:
target_col = "CHOICE"
# Fixed seed so the 10% subsample below is the same on every run.
SUBSAMPLE_SEED = 330
prior_pred_dict = {}
for key, df in df_dict.items():
  df = df.sample(frac=0.10, random_state=SUBSAMPLE_SEED)
  # Compare names exactly: `cov not in target_col` is a substring test on the
  # string "CHOICE" and would also drop a column named e.g. "C" or "ICE".
  covars = [cov for cov in df.columns if cov != target_col]
  X = np.array(df[covars])
  n = len(X)
  # NOTE(review): the Stan program already has its own intercept (beta0), so
  # this column of ones adds a second, shrunken intercept - confirm it is
  # wanted before removing it, since it also changes k.
  X = np.concatenate((np.ones((n, 1)), X), axis=1)
  Y = np.array(df[target_col], dtype="int64")
  k = X.shape[1]
  prior_pred_dict[key] = fit_model(multinomial_log_stan_code, X, Y, n, k, 1)
  # Only the first data set in df_dict ("imputed") is fitted for now.
  break

Building...

In file included from /home/mitch/miniconda3/envs/bayes/lib/python3.9/site-packages/httpstan/include/boost/multi_array/multi_array_ref.hpp:32,
                 from /home/mitch/miniconda3/envs/bayes/lib/python3.9/site-packages/httpstan/include/boost/multi_array.hpp:34,
                 from /home/mitch/miniconda3/envs/bayes/lib/python3.9/site-packages/httpstan/include/boost/numeric/odeint/algebra/multi_array_algebra.hpp:22,
                 from /home/mitch/miniconda3/envs/bayes/lib/python3.9/site-packages/httpstan/include/boost/numeric/odeint.hpp:63,
                 from /home/mitch/miniconda3/envs/bayes/lib/python3.9/site-packages/httpstan/include/stan/math/prim/functor/ode_rk45.hpp:9,
                 from /home/mitch/miniconda3/envs/bayes/lib/python3.9/site-packages/httpstan/include/stan/math/prim/functor/integrate_ode_rk45.hpp:6,
                 from /home/mitch/miniconda3/envs/bayes/lib/python3.9/site-packages/httpstan/include/stan/math/prim/functor.hpp:14,
                 fro





Building: 17.8s, done.Messages from stanc:
    provided, or the prior(s) depend on data variables. In the later case,
    this may be a false positive.
    provided, or the prior(s) depend on data variables. In the later case,
    this may be a false positive.
    is provided, or the prior(s) depend on data variables. In the later case,
    this may be a false positive.
    is provided, or the prior(s) depend on data variables. In the later case,
    this may be a false positive.
Sampling:   0%
Sampling:   0% (1/8000)
Sampling:   0% (2/8000)
Sampling:   0% (3/8000)
Sampling:   0% (4/8000)