<a href="https://colab.research.google.com/github/Christheoneoneil/stat330_final/blob/model_fitting/model_fitting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Relevant Imports


In [None]:
import stan
!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import arviz as az
import seaborn as sns

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nest_asyncio
  Downloading nest_asyncio-1.5.6-py3-none-any.whl (5.2 kB)
Installing collected packages: nest-asyncio
Successfully installed nest-asyncio-1.5.6


# Preliminary Code Cleaning for Multinomial Logistic Regression

In [None]:
def read_data(file_name: str):
  """
  Load a CSV file and keep only its complete rows and uniquely labelled columns.

  param file_name: path of the CSV file; its "Unnamed: 0" column is used
                   as the index
  returns: pandas data frame in which every row containing a missing value
           has been removed and, for repeated column labels, only the first
           occurrence is kept
  """
  complete_rows = pd.read_csv(file_name, index_col="Unnamed: 0").dropna(
      axis=0, how="any")
  # duplicated() flags the second and later occurrences of a label.
  first_occurrence = ~complete_rows.columns.duplicated()
  return complete_rows.loc[:, first_occurrence]

In [76]:
def prep_data(df: pd.DataFrame, key: str, unwanted_cols: list):
  """
  Build a modelling-ready copy of a data frame for the logit regression.

  param df: data frame that needs to be formatted; it is not modified
  param key: name of the data set; "imputed" additionally rescales the test
             score columns and drops the "Unnamed: 0.1" column
  param unwanted_cols: columns that are ultimately not needed for analysis
  returns: prepared data frame in which
           - the scaled columns are min-max rescaled to [0, 1] (a column
             with a single distinct value becomes all zeros),
           - unwanted_cols are removed,
           - "STRAT", "SELECTIVITY" and "DOBYear" are recoded to integer
             codes 1..m in order of first appearance (a missing value would
             get code 0, so drop missing rows beforehand)
  """
  df_copy = df.copy()
  unnormed_cols = ["STUDWGT"]

  if key == "imputed":
    unnormed_cols = ['ACTComposite', 'SATMath',
                     'SATVerbal', 'SATWriting'] + unnormed_cols
    df_copy = df_copy.drop(columns=["Unnamed: 0.1"])

  # Min-max scaling: (x - min) / (max - min), column by column.
  raw_vals = df_copy[unnormed_cols].astype("float64")
  col_min = raw_vals.min()
  col_range = raw_vals.max() - col_min
  # Guard against division by zero for constant columns.
  col_range = col_range.where(col_range != 0, 1.0)
  df_copy[unnormed_cols] = (raw_vals - col_min) / col_range

  df_copy = df_copy.drop(columns=unwanted_cols)

  recode_vars = ["STRAT", "SELECTIVITY", "DOBYear"]
  for var in recode_vars:
    # factorize numbers the distinct values 0..m-1 in order of appearance;
    # assigning the result back (instead of an in-place replace on the
    # selected column) guarantees that df_copy itself is updated.
    codes, _ = pd.factorize(df_copy[var])
    df_copy[var] = codes + 1
  return df_copy

In [81]:
# Columns that are removed from every data set before modelling.
unnecessary_cols = ["ACERECODE"]

# Read both versions of the data first, then swap each raw frame for its
# prepared counterpart under the same key.
df_dict = dict(
    imputed=read_data("final_frame_imputed.csv"),
    non_imputed=read_data("final_frame_non_imputed.csv"),
)
for key in list(df_dict):
  df = df_dict[key]
  df_dict[key] = prep_data(df, key, unnecessary_cols)

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29.]
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27.]


# Setting Up Model

In [None]:
def fit_model(model_code: str, X: np.ndarray, Y: np.ndarray, n: int, k: int,
              flag_val: int, random_seed: int = None, **sample_kwargs):
  """
  Build a Stan model from source code and data, then draw samples from it.

  param model_code: stan formatted code for model
  param X: nxk array of covariates
  param Y: array of target values
  param n: number of rows
  param k: number of covariates
  param flag_val: value passed to the model as "do_prior_predictive"
  param random_seed: optional seed forwarded to stan.build so that a run
                     can be reproduced; None leaves the seed unset
  param sample_kwargs: optional keyword arguments forwarded unchanged to
                       the built model's sample() call
  returns: the object returned by the built model's sample() call
  """
  data = {"X": X, "Y": Y, "n": n, "k": k,
          "do_prior_predictive": flag_val}
  mod = stan.build(model_code, data=data, random_seed=random_seed)
  return mod.sample(**sample_kwargs)


In [None]:
# Stan program for a multinomial (multi-logit) regression with 5 outcome
# categories. Every category has its own column of coefficients, so each
# unit i gets a length-5 vector of logits eta[i] which parameterises the
# categorical distribution of Y[i]. The normal(0, 3) prior on all
# coefficients keeps the (otherwise shift-invariant) logits well defined.
# Setting do_prior_predictive to 1 skips the likelihood, so Y_tilde is then
# drawn from the prior predictive distribution.
multinomial_log_stan_code = """
data {
  int<lower=0> n;                // number of units
  int<lower=0> k;                // number of covariates
  matrix[n, k] X;                // covariates for each entry, including the intercept covariate
  int<lower=1,upper=5> Y[n];     // observed category of each unit

  int do_prior_predictive;       // 1 = leave the likelihood out of the model
}
transformed data {
  int C = 5;                     // number of outcome categories; must match the bounds on Y
}
parameters {
  matrix[k, C] beta;             // the coefficients, one column per category
}
transformed parameters {
  matrix[n, C] eta;              // linear predictors: one row per unit, one logit per category
  eta = X * beta;
}
model {
  to_vector(beta) ~ normal(0, 3);

  if (do_prior_predictive != 1) {
    for (i in 1:n)
      Y[i] ~ categorical_logit(eta[i]');
  }
}
generated quantities {
  int<lower=1,upper=5> Y_tilde[n];   // replicated outcome for each unit
  for (i in 1:n)
    Y_tilde[i] = categorical_logit_rng(eta[i]');
}
"""

In [None]:
target_col = "CHOICE"
prior_pred_dict = {}
for key, df in df_dict.items():
  # Every column except the target is a covariate. The names are compared
  # for equality because target_col is a string: an `in` test against it
  # would match substrings of "CHOICE" rather than whole column names.
  covars = [cov for cov in list(df.columns) if cov != target_col]
  X = np.array(df[covars])
  n = len(X)
  # Prepend a column of ones as the intercept covariate.
  X = np.concatenate((np.ones((n, 1)), X), axis=1)
  Y = np.array(df[target_col], dtype="int64")
  k = X.shape[1]
  #prior_pred_dict[key] = fit_model(multinomial_log_stan_code, X, Y, n, k, 1)

Building...



Building: 43.4s, done.Sampling:   0%
Sampling:   2% (200/8000)
Sampling:   4% (300/8000)
Sampling:   5% (400/8000)
Sampling:   6% (500/8000)
Sampling:   8% (600/8000)
Sampling:   9% (700/8000)
Sampling:  10% (800/8000)
Sampling:  11% (900/8000)
Sampling:  12% (1000/8000)
Sampling:  14% (1100/8000)
Sampling:  15% (1200/8000)
Sampling:  16% (1300/8000)
Sampling:  18% (1400/8000)
Sampling:  19% (1500/8000)
Sampling:  20% (1600/8000)
Sampling:  21% (1701/8000)
Sampling:  23% (1801/8000)
Sampling:  24% (1901/8000)
Sampling:  25% (2002/8000)
Sampling:  26% (2101/8000)
Sampling:  28% (2200/8000)
Sampling:  29% (2300/8000)
Sampling:  30% (2400/8000)
Sampling:  31% (2500/8000)
Sampling:  32% (2600/8000)
Sampling:  34% (2700/8000)
Sampling:  35% (2800/8000)
Sampling:  36% (2900/8000)
Sampling:  38% (3000/8000)
Sampling:  39% (3100/8000)
Sampling:  40% (3200/8000)
Sampling:  41% (3300/8000)
Sampling:  42% (3400/8000)
Sampling:  44% (3500/8000)
Sampling:  45% (3600/8000)
Sampling:  46% (3700/8000