<a href="https://colab.research.google.com/github/Christheoneoneil/stat330_final/blob/model_fitting/model_fitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Relevant Imports


In [1]:
import stan
!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import arviz as az
import seaborn as sns



# Preliminary Data Cleaning for Multinomial Logistic Regression

In [2]:
def read_data(file_name: str):
  """
  Load a CSV file into a cleaned data frame.

  param file_name: path of the CSV file; its "Unnamed: 0" column is used
    as the index
  returns: pandas data frame with every row containing a missing value
    removed and only the first occurrence of each column label kept
  """
  raw = pd.read_csv(file_name, index_col="Unnamed: 0")
  complete_rows = raw.dropna(axis=0, how="any")
  # duplicated() flags the repeats, so the inverted mask keeps first occurrences.
  first_occurrence = ~complete_rows.columns.duplicated()
  return complete_rows.loc[:, first_occurrence]

In [3]:
def prep_data(df: pd.DataFrame, key: str, unwanted_cols: list) -> pd.DataFrame:
  """
  Build the modelling frame: min-max scale the numeric columns, drop the
  unused columns and recode the categorical columns as consecutive integers.

  param df: data frame that needs to be formatted (left unmodified)
  param key: which data set df is; "imputed" additionally scales the test
    score columns and drops the leftover "Unnamed: 0.1" column
  param unwanted_cols: columns that are ultimately not needed for analysis
  returns: prepped copy of df for logit regression
  raises KeyError: if one of the expected columns is missing from df
  """
  df_copy = df.copy()
  unnormed_cols = ["STUDWGT"]

  if key == "imputed":
    unnormed_cols = ['ACTComposite', 'SATMath',
                     'SATVerbal', 'SATWriting'] + unnormed_cols
    df_copy.drop(columns=["Unnamed: 0.1"], inplace=True)

  # Min-max scale each column to [0, 1]: (x - min) / (max - min).
  raw = df_copy[unnormed_cols].astype("float64")
  col_min = raw.min()
  col_range = raw.max() - col_min
  # A constant column has zero range; divide by 1 so it scales to 0, not NaN.
  col_range = col_range.where(col_range != 0, 1.0)
  df_copy[unnormed_cols] = ((raw - col_min) / col_range).to_numpy()
  df_copy.drop(columns=unwanted_cols, inplace=True)

  recode_vars = ["STRAT", "SELECTIVITY", "DOBYear"]
  for var in recode_vars:
    # Number the distinct values 1..K in order of first appearance and assign
    # the result back to the column. A chained `df[col].replace(...,
    # inplace=True)` acts on a temporary under pandas copy-on-write and would
    # leave the frame unencoded.
    codes = {level: code
             for code, level in enumerate(df_copy[var].unique(), start=1)}
    df_copy[var] = df_copy[var].map(codes)
  return df_copy

In [4]:
# Read both CSV exports first, then replace each raw frame with its prepped
# version, keyed by whether the data set was imputed.
unnecessary_cols = ["ACERECODE"]
raw_frames = {
    "imputed": read_data("data/final_frame_imputed.csv"),
    "non_imputed": read_data("data/final_frame_non_imputed.csv"),
}
df_dict = {}
for key, df in raw_frames.items():
  df_dict[key] = prep_data(df, key, unnecessary_cols)

# Setting Up Model

In [5]:
def fit_model(model_code: str, X: np.array, Y: np.array, n: int, k: int, 
              flag_val: int):
  """
  Compile a Stan program against the given data and draw samples from it.

  param model_code: stan formatted code for model
  param X: nxk array of covariates
  param Y: array of target values
  param n: number of rows
  param k: number of covariates
  param flag_val: value passed to the model as do_prior_predictive
  returns: result of sampling the built model with default settings
  """
  # Keys must match the names declared in the Stan data block.
  stan_data = dict(X=X, Y=Y, n=n, k=k, do_prior_predictive=flag_val)
  posterior = stan.build(model_code, data=stan_data)
  return posterior.sample()


In [11]:
# Stan program for a multinomial (categorical-logit) regression with a
# per-covariate shrinkage scale lambda[i] * tau on the coefficients.
# The data names (n, k, X, Y, do_prior_predictive) are the ones fit_model passes.
# X is expected to already contain the intercept column, so the GLM intercept
# vector is fixed at zero. Setting do_prior_predictive to 1 skips the
# likelihood, so the draws (and Y_tilde) come from the prior alone.
multinomial_log_stan_code = """
data {
  int<lower=0> n;                      // number of units
  int<lower=0> k;                      // number of covariates
  matrix[n, k] X;                      // covariates for each entry, including the intercept covariate
  array[n] int<lower=1, upper=5> Y;    // categorical outcome

  int do_prior_predictive;             // 1 = leave the likelihood out
}
transformed data {
  int num_classes = 5;                 // matches the upper bound declared on Y
  vector[num_classes] zero_intercept = rep_vector(0, num_classes);
}
parameters {
  matrix[k, num_classes] beta;         // the coefficients, one column per class
  vector<lower=0>[k] lambda;           // local scale per covariate
  real<lower=0> tau;                   // global scale
}
model {
  lambda ~ cauchy(0, 1);
  tau ~ cauchy(0, 1);
  for (i in 1:k) {
    beta[i] ~ normal(0, lambda[i] * tau);
  }
  if (do_prior_predictive != 1) {
    Y ~ categorical_logit_glm(X, zero_intercept, beta);
  }
}
generated quantities {
  array[n] int<lower=1, upper=num_classes> Y_tilde;
  {
    matrix[n, num_classes] eta = X * beta;   // linear predictors
    for (i in 1:n) {
      Y_tilde[i] = categorical_logit_rng(eta[i]');
    }
  }
}
"""

In [None]:
# Prior-predictive fit for each prepped frame on a random 20% subsample.
# The target column is named "CHOICE", as in the model-building cell further down.
target_col = "CHOICE"
prior_pred_dict = {}
for key, df in df_dict.items():
  df = df.sample(frac=0.20)
  # Compare names for equality: `cov not in target_col` is a substring test
  # against the string and would also drop any column whose name happens to
  # be contained in the target name.
  covars = [cov for cov in df.columns if cov != target_col]
  x = df[covars].to_numpy()
  n = len(x)
  # Prepend a column of ones as the intercept covariate.
  x = np.concatenate((np.ones((n, 1)), x), axis=1)
  y = df[target_col].to_numpy(dtype="int64")
  k = x.shape[1]
  # flag_val=1 asks the Stan program to leave the likelihood out.
  prior_pred_dict[key] = fit_model(multinomial_log_stan_code, x, y, n, k, 1)

In [29]:
# 2022-11-18 LASSO Attempt-maj
# Stan program: categorical-logit GLM of y on x with an intercept vector
# alpha (constrained positive, lognormal prior) and a coefficient matrix beta
# whose i-th row has prior scale lambda[i] * tau (half-Cauchy lambda and tau).
# When do_prior_predictive == 1 the likelihood statement is skipped.
# NOTE(review): despite their names, Y_tilde_1..Y_tilde_4 hold the log
# probability mass of classes 1..4 for each row (categorical_logit_glm_lpmf),
# not simulated outcomes, and the four classes are hard-coded even though
# num_classes is supplied as data.
model_code = """
data {
    int<lower=1> num_classes;
    int<lower=0> n;
    int<lower=0> k;
    matrix[n, k] x;
    array[n] int<lower=1, upper=num_classes> y;

    int do_prior_predictive;
}
parameters {
    vector<lower=0>[num_classes] alpha;
    matrix[k, num_classes] beta;
    vector<lower=0>[k] lambda;
    real<lower=0> tau;
}
model {
    alpha ~ lognormal(0,1);
    lambda ~ cauchy(0,1);
    tau ~ cauchy(0,1);
    for (i in 1:k) {
        beta[i] ~ normal(0, lambda[i] * tau);
    }
    if (do_prior_predictive != 1) {
        y ~ categorical_logit_glm(x, alpha, beta);
    }
}
generated quantities {
    array[n] real Y_tilde_1;
    array[n] real Y_tilde_2;
    array[n] real Y_tilde_3;
    array[n] real Y_tilde_4;
    for (i in 1:n){
        Y_tilde_1[i] = categorical_logit_glm_lpmf(1 | x[i, :], alpha, beta);
        Y_tilde_2[i] = categorical_logit_glm_lpmf(2 | x[i, :], alpha, beta);
        Y_tilde_3[i] = categorical_logit_glm_lpmf(3 | x[i, :], alpha, beta);
        Y_tilde_4[i] = categorical_logit_glm_lpmf(4 | x[i, :], alpha, beta);
    }
}
"""

In [None]:
# Build the shrinkage-prior model on a random 10% subsample of the imputed frame.
df = df_dict['imputed']
target_col = "CHOICE"
df_sub = df.sample(frac=0.10)
x = df_sub.drop(columns=[target_col]).to_numpy()
y = df_sub[target_col].to_numpy(dtype=np.int16)
n, k = x.shape
# NOTE(review): the class count comes from the labels present in this
# subsample; presumably every class appears in it - confirm.
num_classes = len(np.unique(y))

# do_prior_predictive=1 makes the Stan program skip its likelihood statement.
stan_data = {'num_classes': num_classes, 'n': n, 'k': k, 'x': x, 'y': y,
             'do_prior_predictive': 1}
model = stan.build(model_code, stan_data)

In [31]:
# Sample with the sampler's default settings. The model was built with
# do_prior_predictive=1, so its likelihood statement is skipped.
fit = model.sample()

Sampling:   0%
Sampling:   0% (1/8000)
Sampling:   0% (2/8000)
Sampling:   0% (3/8000)
Sampling:   1% (102/8000)
Sampling:   1% (103/8000)
Sampling:   3% (202/8000)
Sampling:   4% (301/8000)
Sampling:   5% (401/8000)
Sampling:   6% (500/8000)
Sampling:   8% (600/8000)
Sampling:   9% (700/8000)
Sampling:  10% (800/8000)
Sampling:  11% (900/8000)
Sampling:  12% (1000/8000)
Sampling:  14% (1100/8000)
Sampling:  15% (1200/8000)
Sampling:  16% (1300/8000)
Sampling:  18% (1400/8000)
Sampling:  19% (1500/8000)
Sampling:  20% (1600/8000)
Sampling:  21% (1700/8000)
Sampling:  22% (1800/8000)
Sampling:  24% (1900/8000)
Sampling:  25% (2000/8000)
Sampling:  26% (2100/8000)
Sampling:  28% (2200/8000)
Sampling:  29% (2300/8000)
Sampling:  30% (2400/8000)
Sampling:  31% (2500/8000)
Sampling:  32% (2600/8000)
Sampling:  34% (2700/8000)
Sampling:  35% (2800/8000)
Sampling:  36% (2900/8000)
Sampling:  38% (3000/8000)
Sampling:  39% (3100/8000)
Sampling:  40% (3200/8000)
Sampling:  41% (3300/8000)
Sampl

In [32]:
# Echo the fit object to list the parameter shapes and the number of draws.
fit

<stan.Fit>
Parameters:
    alpha: (4,)
    beta: (79, 4)
    lambda: (79,)
    tau: ()
    Y_tilde_1: (9858,)
    Y_tilde_2: (9858,)
    Y_tilde_3: (9858,)
    Y_tilde_4: (9858,)
Draws: 4000

In [33]:
# Check divergences: total number of draws flagged divergent in the fit
np.sum(fit['divergent__'])

1191.0

In [91]:
# fit.to_frame().to_csv("data/lasso_fit.csv", index=False)

In [34]:
# Per-parameter summary table (mean, sd, HDI, MCSE, ESS, r_hat) for the fit.
az.summary(fit)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha[0],1.318,1.711,0.040,4.170,0.212,0.150,17.0,602.0,1.17
alpha[1],1.779,1.839,0.058,4.308,0.132,0.093,48.0,689.0,1.11
alpha[2],1.484,2.154,0.054,4.634,0.162,0.115,18.0,307.0,1.16
alpha[3],1.420,1.902,0.028,4.073,0.124,0.088,340.0,645.0,1.04
"beta[0, 0]",0.757,2.532,-1.831,5.462,0.874,0.641,14.0,42.0,1.22
...,...,...,...,...,...,...,...,...,...
Y_tilde_4[9853],-93.933,157.830,-303.084,0.000,19.874,14.118,87.0,40.0,1.29
Y_tilde_4[9854],-109.316,193.856,-481.516,0.000,37.510,26.819,127.0,44.0,1.33
Y_tilde_4[9855],-139.886,276.712,-506.140,0.000,46.625,33.245,47.0,50.0,1.22
Y_tilde_4[9856],-130.014,232.234,-571.116,0.000,54.955,39.512,36.0,22.0,1.31


In [35]:
# Trace plots for the fit, with divergent draws marked along the bottom axis.
az.plot_trace(fit, divergences="bottom")
plt.tight_layout()
plt.show();

In [98]:
# import pickle
# with open("data/lasso_model.pickle", "wb") as f:
#     pickle.dump(fit, f)

In [99]:
# with open("data/lasso_model.pickle", "rb") as f:
#     foo = pickle.load(f)