## Building a class to encapsulate the `stan_glm` method of rstanarm

In [11]:
import subprocess
import os
from bedrock.analytics.utils import Algorithm
import pandas as pd
import logging
import rpy2
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

def check_valid_formula(formula):
    """Return True when ``formula`` passes a minimal model-formula check.

    The only condition verified is that the text contains a ``~`` separator;
    nothing about the terms on either side is inspected.

    Args:
        formula: Formula text such as ``"y ~ x"``. ``None`` and the empty
            string are treated as invalid.

    Returns:
        bool: ``True`` if a ``~`` is present, ``False`` otherwise (an error is
        logged in the ``False`` case).
    """
    # TODO: Look at `patsy` for helper function to validate more fully
    if not formula or '~' not in formula:
        logging.error("Formula does not have ~")
        return False
    # Report success explicitly rather than falling off the end with None.
    return True

class Stan_GLM(Algorithm):
    """Bedrock analytic that fits a GLM with rstanarm's ``stan_glm`` via rpy2.

    Inputs are a header-less ``matrix.csv`` plus a ``features.txt`` whose first
    column supplies the column names. After ``compute`` the R ``prior_summary``
    and ``summary`` objects of the fit are stored in ``self.results``.
    """

    # Accepted ``family`` values (compared after lower-casing) mapped to the R
    # family expression placed in the stan_glm call. The short names are
    # accepted so that the default declared in parameters_spec validates.
    _SUPPORTED_FAMILIES = {
        'binomial': 'binomial(link = "logit")',
        'binomial(link = "logit")': 'binomial(link = "logit")',
        'gaussian': 'gaussian(link = "identity")',
        'gaussian(link = "identity")': 'gaussian(link = "identity")',
    }

    # Arguments appended to the stan_glm call only when a value was supplied.
    _OPTIONAL_ARGS = ('chains', 'iter', 'prior', 'prior_intercept')

    def __init__(self):
        """Declare the analytic's metadata, input/output files and parameters."""
        super(Stan_GLM, self).__init__()
        # NOTE(review): left empty although parameters_spec declares six
        # attributes -- confirm how the Algorithm base class uses this list.
        self.parameters = []
        self.inputs = ['matrix.csv','features.txt']
        self.outputs = ['prior_summary.txt', 'summary.txt']
        self.name ='Stan_GLM'
        self.type = 'GLM'
        self.description = 'Performs Stan_GLM analysis on the input dataset.'

        self.parameters_spec = [
            { "name" : "Formula", "attrname" : "formula", "value" : "", "type" : "input" },
            { "name" : "GLM family", "attrname" : "family", "value" : "binomial", "type" : "input" },
            { "name" : "chains", "attrname" : "chains" , "value" : "", "type" : "input"},
            { "name" : "iter", "attrname" : "iter" , "value" : "", "type" : "input"},
            { "name" : "prior", "attrname" : "prior" , "value" : "", "type" : "input"},
            { "name" : "prior_intercept", "attrname" : "prior_intercept" , "value" : "", "type" : "input"}
        ]

    def check_parameters(self):
        """Validate ``self.formula`` and ``self.family``.

        On success ``self.family`` is replaced by the full R family expression
        (for example ``binomial`` becomes ``binomial(link = "logit")``).

        Returns:
            bool: ``True`` when both parameters are acceptable, else ``False``.
        """
        # Progress trace only; it is not an error condition.
        logging.debug("Started check parms")
        super(Stan_GLM, self).check_parameters()

        # Compare with False explicitly: the helper is only relied upon to
        # return False on failure, any other result counts as valid.
        if check_valid_formula(self.formula) is False:
            return False

        self.family = self.family.lower()

        family_key = self.family.strip()
        if family_key not in self._SUPPORTED_FAMILIES:
            logging.error("GLM family {} not supported".format(self.family))
            return False

        self.family = self._SUPPORTED_FAMILIES[family_key]
        return True

    def __build_df__(self, filepath):
        """Load the input matrix and label its columns.

        Args:
            filepath: Mapping with ``'matrix.csv'`` and ``'features.txt'``
                entries, each providing a ``'rootdir'`` directory.

        Returns:
            pandas.DataFrame: Contents of ``matrix.csv`` with the column names
            taken from the first column of ``features.txt``.
        """
        featuresPath = os.path.join(filepath['features.txt']['rootdir'], 'features.txt')
        matrixPath = os.path.join(filepath['matrix.csv']['rootdir'], 'matrix.csv')
        # header=None marks the files as header-less; a negative header value
        # is rejected by current pandas releases.
        df = pd.read_csv(matrixPath, header=None)
        featuresList = pd.read_csv(featuresPath, header=None)

        df.columns = featuresList.iloc[:, 0].tolist()

        return df

    def _param(self, name, kwargs):
        """Return the non-empty value of parameter ``name``, or ``None``.

        A value passed through ``kwargs`` wins; otherwise the attribute of the
        same name on the instance is used.
        """
        for value in (kwargs.get(name), getattr(self, name, None)):
            if value is not None and str(value).strip() != "":
                return value
        return None

    def _build_stan_glm_call(self, kwargs):
        """Assemble the R statement that fits the model into ``output``.

        Raises:
            ValueError: If the formula is missing/invalid or the family is not
                one of the supported families.
        """
        formula = self._param('formula', kwargs)
        if formula is None or check_valid_formula(formula) is False:
            raise ValueError("A formula containing ~ is required")

        requested_family = self._param('family', kwargs)
        family = self._SUPPORTED_FAMILIES.get(str(requested_family).strip().lower())
        if family is None:
            raise ValueError("GLM family {} not supported".format(requested_family))

        # NOTE(security): the formula and the optional arguments are pasted
        # into R source and evaluated, so they can execute arbitrary R code.
        # Only pass values from trusted callers.
        call_args = [str(formula), "data = MyData", "family = {}".format(family)]
        for name in self._OPTIONAL_ARGS:
            value = self._param(name, kwargs)
            if value is not None:
                # Empty settings are left out so stan_glm applies its defaults.
                call_args.append("{} = {}".format(name, value))

        return "output <- stan_glm({})".format(", ".join(call_args))

    def compute(self, filepath, **kwargs):
        """Fit the model on the input matrix and store the R summaries.

        Args:
            filepath: Mapping describing the input files (see ``__build_df__``).
            **kwargs: May carry ``formula``, ``family``, ``chains``, ``iter``,
                ``prior`` and ``prior_intercept``; any of them that is absent
                or empty is read from the attribute of the same name.

        Raises:
            ValueError: If the formula or family is missing or unsupported.

        Side effects:
            Defines ``MyData``, ``output``, ``prior_summary`` and ``summary``
            in the R global environment and sets ``self.results``.
        """
        rglmString = self._build_stan_glm_call(kwargs)

        # Called for the effect of loading the packages into the R session.
        importr("rstan")
        importr("rstanarm")

        # Hand the assembled data frame to R under the name used in the call,
        # instead of re-reading a CSV from a fixed location on disk.
        df = self.__build_df__(filepath)
        robjects.globalenv['MyData'] = pandas2ri.py2ri(df)

        robjects.r(rglmString)
        prior_summary = robjects.r('prior_summary<-prior_summary(output)')
        summary = robjects.r('summary<-summary(output)')

        self.results = {'prior_summary.txt': prior_summary, 'summary.txt': summary}


## Development Area

In [None]:
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

In [15]:
# Load the two R packages through rpy2, keeping a handle on each.
rstan, rstanarm = map(importr, ("rstan", "rstanarm"))

In [22]:
# Location of the example dataset. The default is the original development
# path; set the RAND2011_CSV environment variable to use a copy elsewhere.
# (`os` is imported in the first code cell.)
data_csv = os.environ.get(
    "RAND2011_CSV",
    "/home/atam6/git/bedrock-core/examples/RAND2011study/Rand2011PNAS_cooperation_data.csv",
)

# Read the data into R, fit the logistic model, then pull both summaries back.
rpy2.robjects.r('MyData <- read.csv(file="{}", header=TRUE, sep=",")'.format(data_csv))
rpy2.robjects.r('output = stan_glm(decision0d1c~round_num, data=MyData,family = binomial(link = "logit"), chains = 3, iter = 3000)')
prior_summary = rpy2.robjects.r('prior_summary<-prior_summary(output)')
summary = rpy2.robjects.r('summary<-summary(output)')


SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).

Gradient evaluation took 0.000529 seconds
1000 transitions using 10 leapfrog steps per transition would take 5.29 seconds.
Adjust your expectations accordingly!


Iteration:    1 / 3000 [  0%]  (Warmup)
Iteration:  300 / 3000 [ 10%]  (Warmup)
Iteration:  600 / 3000 [ 20%]  (Warmup)
Iteration:  900 / 3000 [ 30%]  (Warmup)
Iteration: 1200 / 3000 [ 40%]  (Warmup)
Iteration: 1500 / 3000 [ 50%]  (Warmup)
Iteration: 1501 / 3000 [ 50%]  (Sampling)
Iteration: 1800 / 3000 [ 60%]  (Sampling)
Iteration: 2100 / 3000 [ 70%]  (Sampling)
Iteration: 2400 / 3000 [ 80%]  (Sampling)
Iteration: 2700 / 3000 [ 90%]  (Sampling)
Iteration: 3000 / 3000 [100%]  (Sampling)

 Elapsed Time: 5.21913 seconds (Warm-up)
               5.31744 seconds (Sampling)
               10.5366 seconds (Total)


SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).

Gradient evaluation took 0.000692 seconds
1000 transitions using 10 leapfrog steps per transition would take 6.92 seconds.
Ad

In [23]:
# Key each R summary object by the output file name it corresponds to.
results = {
    'prior_summary.txt': prior_summary,
    'summary.txt': summary,
}

In [24]:
# Display the collected R objects (rpy2 wrappers, not plain text).
results

{'prior_summary.txt': R object with classes: ('prior_summary.stanreg',) mapped to:
 <ListVector - Python:0x7f51a2db9638 / R:0x9dcdcc0>
 [ListVector, ListVector]
 R object with classes: ('prior_summary.stanreg',) mapped to:
 <ListVector - Python:0x7f51a2db9638 / R:0x9dcdcc0>
 [ListVector, ListVector]
 R object with classes: ('prior_summary.stanreg',) mapped to:
 <ListVector - Python:0x7f51a2db9638 / R:0x9dcdcc0>
 [ListVector, ListVector],
 'summary.txt': R object with classes: ('summary.stanreg',) mapped to:
 <Matrix - Python:0x7f51a2e1c170 / R:0xb68ab50>
 [0.661268, -0.137107, 0.532512, ..., 1.000267, 0.999709, 1.001906]}