<a href="https://colab.research.google.com/github/Christheoneoneil/stat330_final/blob/model_fitting/model_fitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Relevant Imports


In [3]:
import stan
!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import arviz as az
import seaborn as sns

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Setting Up Model

In [72]:
def read_data(file_name: str):
  """Read a CSV file and return only its complete rows.

  The column named "Unnamed: 0" is used as the index of the frame.

  Args:
    file_name: Path of the CSV file to load.

  Returns:
    A DataFrame containing every row that has no missing value in any column.
  """
  frame = pd.read_csv(file_name, index_col="Unnamed: 0")
  # Build a filtered frame instead of mutating the freshly read one in place.
  complete_rows = frame.dropna(axis="index", how="any")
  return complete_rows

def fit_model(model_code: str, X: np.ndarray, Y: np.ndarray, n: int, k: int, flag_val,
              random_seed=None, **sample_kwargs):
  """Compile a Stan program against the given data and draw samples from it.

  Args:
    model_code: Stan program source.
    X: Covariate array, passed to Stan as the data variable "X".
    Y: Outcome array, passed to Stan as the data variable "Y".
    n: Value for the Stan data variable "n".
    k: Value for the Stan data variable "k".
    flag_val: Value for the Stan data variable "do_prior_predictive".
    random_seed: Optional seed forwarded to `stan.build` so that repeated runs
      produce the same draws. `None` (the default) leaves the seed unset,
      which is what this function did before the parameter existed.
    **sample_kwargs: Extra keyword arguments forwarded unchanged to
      `mod.sample()` (for example `num_chains` or `num_samples`).

  Returns:
    Whatever `mod.sample()` returns for the built model.
  """
  data = {"X": X, "Y": Y, "n": n, "k": k,
          "do_prior_predictive": flag_val}
  mod = stan.build(model_code, data=data, random_seed=random_seed)
  return mod.sample(**sample_kwargs)


In [69]:
# Cleaned data frames keyed by dataset variant.
df_dict = dict(
    imputed=read_data("final_frame_imputed.csv"),
    non_imputed=read_data("final_frame_non_imputed.csv"),
)

In [70]:
# Multinomial (softmax) logit model for a 5-category outcome.
#
# Each unit i gets its own vector of C = 5 logits, eta[i] = X[i] * beta, with one
# coefficient column per category. The earlier version declared eta as vector[n]
# and passed the whole vector to categorical_logit, which describes a single
# categorical distribution over n outcomes shared by every unit; its draws from
# categorical_logit_rng could also fall outside the declared 1..5 bounds of Y_tilde.
#
# NOTE(review): beta is now k x 5 and eta is n x 5 in the fit output; any later
# cell that indexes them as vectors needs to be checked.
# NOTE(review): softmax logits are only identified up to an additive constant per
# unit; the normal(0, 3) prior keeps the posterior proper, but pinning one
# category to zero is a common alternative - confirm which is intended.
stan_code = """
data {
  int<lower=0> n;                // number of units
  int<lower=0> k;                // number of covariates
  matrix[n, k] X;            // covariates for each entry, including the intercept covariate
  int<lower=1,upper=5> Y[n];     // categorical

  int do_prior_predictive;
}
transformed data {
  int C = 5;                     // number of outcome categories (matches the bounds on Y)
}
parameters {
  matrix[k, C] beta;             // the coefficients, one column per category
}
transformed parameters {
  matrix[n, C] eta;              // linear predictors: row i holds the C logits of unit i
  eta = X * beta;
}
model {
  to_vector(beta) ~ normal(0, 3);

  if (do_prior_predictive != 1) {
    for (i in 1:n)
      Y[i] ~ categorical_logit(eta[i]');
  }
}
generated quantities {
  int<lower=1,upper=5> Y_tilde[n];
  for (i in 1:n)
    Y_tilde[i] = categorical_logit_rng(eta[i]');
}
"""

In [None]:
# Prior predictive sampling: fit the model to each data set with the
# "do_prior_predictive" flag set to 1, which makes the Stan program skip the
# likelihood statement so draws come from the prior alone.
target_col = "CHOICE"
prior_pred_dict = {}
for key, df in df_dict.items():
  # Compare column labels for equality. `cov not in target_col` would be a
  # substring test against the string "CHOICE", silently dropping any covariate
  # whose name happens to be a substring of it (and raising TypeError for
  # non-string labels).
  covars = [cov for cov in df.columns if cov != target_col]
  X = np.array(df[covars])
  n = len(X)
  # Prepend a column of ones so the first coefficient acts as the intercept.
  X = np.concatenate((np.ones((n, 1)), X), axis=1)
  Y = np.array(df[target_col], dtype="int64")
  k = X.shape[1]  # number of covariates including the intercept column
  prior_pred_dict[key] = fit_model(stan_code, X, Y, n, k, 1)

Building...



Building: found in cache, done.Sampling:   0%
Sampling:   1% (100/8000)
Sampling:   2% (200/8000)
Sampling:   4% (300/8000)
Sampling:   5% (400/8000)
Sampling:   6% (500/8000)
Sampling:   8% (600/8000)
Sampling:   9% (700/8000)
Sampling:  10% (800/8000)
Sampling:  11% (900/8000)
Sampling:  12% (1000/8000)
Sampling:  14% (1100/8000)
Sampling:  15% (1200/8000)
Sampling:  16% (1300/8000)
Sampling:  18% (1400/8000)
Sampling:  19% (1500/8000)
Sampling:  20% (1600/8000)
Sampling:  21% (1700/8000)
Sampling:  22% (1800/8000)
Sampling:  24% (1900/8000)
Sampling:  24% (1901/8000)
Sampling:  25% (2002/8000)