In [17]:
# from sfi import Data, Macro, Missing, SFIToolkit, Scalar
import os
import acro
import pandas as pd
import acro_stata_parser
from importlib import reload
import pytest

# Re-execute the already-imported parser module so this kernel uses its current source.
reload(acro_stata_parser)

<module 'acro_stata_parser' from '/Users/j4-smith/GitHub/AI-SDC/ACRO/stata/acro_stata_parser.py'>

In [18]:
def dummy_acrohandler(command, varlist, exclusion, exp, weights, options):
    """Run one Stata-style command against the test dataset.

    Stands in for the handler that Stata would call: it reads
    ``../data/test_data.dta`` afresh on every call and forwards the frame,
    together with the six arguments, to ``acro_stata_parser.parse_and_run``.
    When the notebook-level ``debug`` flag is truthy the arguments are
    echoed on a single line first.

    Returns:
        Whatever ``acro_stata_parser.parse_and_run`` returns.
    """
    if debug:
        print(
            "in python acrohandler function: "
            f"command = {command} "
            f"varlist={varlist} "
            f"if = {exclusion} "
            f"exp = {exp} "
            f"weights={weights} "
            f"options={options} "
        )

    # build the data object handed to the parser
    dataset = pd.read_stata(os.path.join("../data", "test_data.dta"))

    # the acro part is delegated entirely to the parser module
    return acro_stata_parser.parse_and_run(
        dataset, command, varlist, exclusion, exp, weights, options
    )

In [9]:
# handy string to copy/paste
# command='',varlist='',exclusion='',exp='',weights='',options=''

In [19]:
# Notebook-level state read by the helper and test functions.
debug = False  # when truthy, dummy_acrohandler prints the arguments it receives
myacro = ""  # string placeholder; test_stata_acro_init asserts it is a str

In [20]:
def test_stata_acro_init() -> None:
    """Check that the ``init`` command reports a newly created acro session.

    Reads the notebook-level ``myacro`` placeholder (expected to be a string
    at this point) and calls ``dummy_acrohandler`` with an otherwise empty
    argument set.

    Raises:
        AssertionError: if ``myacro`` is not a string or the returned
            message differs from the expected one.
    """
    # ``myacro`` is only read here, so no ``global`` declaration is needed
    assert isinstance(myacro, str)
    ret = dummy_acrohandler(
        command="init", varlist="", exclusion="", exp="", weights="", options=""
    )
    assert (
        ret == "acro analysis session created\n"
    ), f"wrong string for acro init: {ret}\n"
    # assert isinstance(myacro,acro.ACRO),f'wrong type for myacro:{type(myacro)}'

In [21]:
# Execute the init check; an AssertionError here means the session message was wrong.
test_stata_acro_init()

INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}


In [22]:
def test_parse_table_details():
    """Check how ``parse_table_details`` interprets a Stata ``table`` call.

    Uses the column names of ``../data/test_data.dta`` as the known variable
    names and verifies the row/column variables, aggregation functions,
    value variables and the ``totals`` / ``suppress`` flags in the result.

    Raises:
        AssertionError: if any parsed detail differs from the expectation.
    """
    path = os.path.join("../data", "test_data.dta")
    the_data = pd.read_stata(path)

    varlist = ["survivor", "grant_type", "year"]
    varnames = the_data.columns
    # one literal with every option separated by a space; the previous implicit
    # concatenation glued "suppress" directly onto the closing bracket
    options = "by(grant_type) contents(mean sd inc_activity) suppress nototals"
    details = acro_stata_parser.parse_table_details(varlist, varnames, options)

    errstring = f" rows {details['rowvars']} should be ['grant_type','survivor']"
    assert details["rowvars"] == ["grant_type", "survivor"], errstring

    errstring = f" cols {details['colvars']} should be ['year','grant_type']"
    assert details["colvars"] == ["year", "grant_type"], errstring

    errstring = f" aggfunctions {details['aggfuncs']} should be ['mean','sd']"
    assert details["aggfuncs"] == ["mean", "sd"], errstring

    errstring = f" values {details['values']} should be ['inc_activity']"
    assert details["values"] == ["inc_activity"], errstring

    assert not details["totals"], "totals should be False"
    assert details["suppress"], "suppress should be True"

In [23]:
# Execute the option-parsing check; it is silent when every assertion holds.
test_parse_table_details()

In [24]:
def test_simple_table() -> None:
    """Compare the ``table`` command output with a plain pandas crosstab.

    The reference is ``pd.crosstab`` of ``survivor`` by ``grant_type`` on
    ``../data/test_data.dta``.  The handler output is normalised before the
    comparison and both sides are compared token by token, so differences in
    whitespace are ignored.

    Raises:
        AssertionError: if the normalised output does not match the crosstab.
    """
    path = os.path.join("../data", "test_data.dta")
    df = pd.read_stata(path)
    correct = pd.crosstab(index=df["survivor"], columns=df["grant_type"]).to_string()
    ret = dummy_acrohandler(
        "table",
        "survivor grant_type",
        exclusion="",
        exp="",
        weights="",
        options="nototals",
    )
    # the handler output shows NaN where the crosstab shows 0, and shows
    # counts as floats ("354.0"); rewrite both to the integer form
    ret = ret.replace("NaN", "0")
    ret = ret.replace(".0", "")
    assert ret.split() == correct.split(), f"got\n{ret}\n expected\n{correct}"

In [25]:
# Execute the crosstab comparison; acro's log lines for the table are printed as a side effect.
test_simple_table()

INFO:acro:outcome_df:
grant_type      G            N   R          R/G
survivor                                       
Dead in 2015   ok  threshold;   ok  threshold; 
Alive in 2015  ok           ok  ok           ok
INFO:acro:get_summary(): fail; threshold: 2 cells suppressed; 
INFO:acro::records:add(): output_0


In [26]:
def test_stata_linregress():
    """Check two figures in the summary returned by the ``regress`` command.

    Regresses ``inc_activity`` on ``inc_grants``, ``inc_donations`` and
    ``total_costs`` through ``dummy_acrohandler`` and verifies the residual
    degrees of freedom and the R-squared value found in the returned text.

    Raises:
        AssertionError: if either figure differs from the expected value.
        ValueError: if an expected label is missing from the returned text.
    """
    summary = dummy_acrohandler(
        command="regress",
        varlist=" inc_activity inc_grants inc_donations total_costs",
        exclusion="",
        exp="",
        weights="",
        options="",
    )
    # treat line breaks like commas so that each label is followed by its value
    fields = summary.replace("\n", ",").split(",")

    dof = int(fields[fields.index("Df Residuals:    ") + 1])
    assert dof == 807, f"{dof} should be 807"

    rsquared = float(fields[fields.index("  R-squared:         ") + 1])
    # the tolerance is relative
    assert rsquared == pytest.approx(0.894, rel=0.001)

In [27]:
# Execute the regression check; silent apart from acro's log output when it passes.
test_stata_linregress()

INFO:acro:ols() outcome: pass; dof=807.0 >= 10
INFO:acro::records:add(): output_1


In [28]:
def test_stata_probit():
    """Check two figures in the summary returned by the ``probit`` command.

    Models ``survivor`` on ``inc_activity``, ``inc_grants``,
    ``inc_donations`` and ``total_costs`` through ``dummy_acrohandler`` and
    verifies the residual degrees of freedom and the pseudo R-squared value
    found in the returned text.

    Raises:
        AssertionError: if either figure differs from the expected value.
        ValueError: if an expected label is missing from the returned text.
    """
    summary = dummy_acrohandler(
        command="probit",
        varlist=" survivor inc_activity inc_grants inc_donations total_costs",
        exclusion="",
        exp="",
        weights="",
        options="",
    )
    # treat line breaks like commas so that each label is followed by its value
    fields = summary.replace("\n", ",").split(",")

    dof = int(fields[fields.index("  Df Residuals:      ") + 1])
    assert dof == 806, f"{dof} should be 806"

    pseudo_rsquared = float(fields[fields.index("  Pseudo R-squ.:     ") + 1])
    # the tolerance is relative
    assert pseudo_rsquared == pytest.approx(0.208, rel=0.01)

In [29]:
# Execute the probit check; the optimiser's convergence report is printed as a side effect.
test_stata_probit()

INFO:acro:probit() outcome: pass; dof=806.0 >= 10
INFO:acro::records:add(): output_2


Optimization terminated successfully.
         Current function value: 0.497218
         Iterations 10


In [30]:
def test_stata_print_outputs():
    """Check that the ``print_outputs`` command hands back an empty result.

    The stored outputs themselves are written to the cell output by the
    command; only the returned value is verified here.

    Raises:
        AssertionError: if the returned value has a non-zero length.
    """
    request = {
        "command": "print_outputs",
        "varlist": " inc_activity inc_grants inc_donations total_costs",
        "exclusion": "",
        "exp": "",
        "weights": "",
        "options": "",
    }
    returned = dummy_acrohandler(**request)
    assert len(returned) == 0, "return string should  be empty"

In [31]:
# Execute the print_outputs check; the recorded outputs are listed in the cell output below.
test_stata_print_outputs()

output_0:
uid: output_0
status: fail
type: table
properties: {'method': 'crosstab', 'negative': False, 'missing': False, 'threshold': 2, 'p-ratio': 0, 'nk-rule': 0}
command: safe_output = myacro.crosstab(
summary: fail; threshold: 2 cells suppressed; 
outcome: {'G': {'Dead in 2015': 'ok', 'Alive in 2015': 'ok'}, 'N': {'Dead in 2015': 'threshold; ', 'Alive in 2015': 'ok'}, 'R': {'Dead in 2015': 'ok', 'Alive in 2015': 'ok'}, 'R/G': {'Dead in 2015': 'threshold; ', 'Alive in 2015': 'ok'}}
output: [grant_type      G      N    R   R/G
survivor                           
Dead in 2015   18    NaN  282   NaN
Alive in 2015  72  354.0  144  48.0]
timestamp: 2023-06-29T15:33:57.492603
comments: []


output_1:
uid: output_1
status: pass
type: regression
properties: {'method': 'ols', 'dof': 807.0}
command: results = myacro.ols(y, x)
summary: pass; dof=807.0 >= 10
outcome: {}
output: [                       inc_activity           R-squared:      0.894
Dep. Variable:                                   

In [45]:
def test_stata_finalise():
    """Check the confirmation message returned by the ``finalise`` command.

    Raises:
        AssertionError: if the returned message is not the expected
            "outputs and stata_out.json written" line.
    """
    expected = "outputs and stata_out.json written\n"
    outcome = dummy_acrohandler(
        "finalise", "", exclusion="", exp="", weights="", options=""
    )
    assert outcome == expected, f"returned string {outcome} should be {expected}\n"

In [46]:
# Execute the finalise check; the log line below names the file the outputs were written to.
test_stata_finalise()

INFO:acro::records:outputs written to: stata_out.json


In [47]:
def test_find_brace_contents():
    """Check ``find_brace_contents`` on present and absent option keywords.

    For ``by`` and ``contents`` the helper is expected to report success and
    return the text between the brackets; for an unknown keyword it is
    expected to report failure together with a "<word> not found" message.

    Raises:
        AssertionError: if a flag or an extracted substring is unexpected.
    """
    # one literal with every option separated by a space; the previous implicit
    # concatenation glued "suppress" directly onto the closing bracket
    options = "by(grant_type) contents(mean sd inc_activity) suppress nototals"

    res, substr = acro_stata_parser.find_brace_contents("by", options)
    assert res, "'by' should be found"
    assert substr == "grant_type"

    res, substr = acro_stata_parser.find_brace_contents("contents", options)
    assert res, "'contents' should be found"
    assert substr == "mean sd inc_activity"

    res, substr = acro_stata_parser.find_brace_contents("foo", options)
    assert not res, "'foo' should not be found"
    assert substr == "foo not found"

In [48]:
# Execute the bracket-parsing check; it is silent when every assertion holds.
test_find_brace_contents()

In [75]:
def test_unsupported_formatting_options():
    """Check that table formatting options trigger a warning but still yield the table.

    For each unsupported formatting option the handler output is expected to
    start with a one-line warning, followed by the same table that a plain
    pandas crosstab of ``survivor`` by ``grant_type`` produces.

    Raises:
        AssertionError: if the warning line is missing or wrong, or if the
            table after it does not match the crosstab.
    """
    path = os.path.join("../data", "test_data.dta")
    df = pd.read_stata(path)
    format_string = "acro does not currently support table formatting commands."
    correct = pd.crosstab(index=df["survivor"], columns=df["grant_type"]).to_string()
    for bad_option in [
        "cellwidth",
        "csepwidth",
        "stubwidth",
        "scsepwidth",
        "center",
        "left",
    ]:
        ret = dummy_acrohandler(
            "table",
            "survivor grant_type",
            exclusion="",
            exp="",
            weights="",
            options=f"{bad_option} nototals",
        )

        # split off the first line only: warning first, table in the remainder
        rets = ret.split("\n", 1)
        assert len(rets) == 2, "table should have warning prepended"
        errmsg = f"first line {rets[0]} should be {format_string}"
        assert rets[0] == format_string, errmsg
        ret = rets[1]
        # the handler output shows NaN where the crosstab shows 0, and shows
        # counts as floats ("354.0"); rewrite both to the integer form
        ret = ret.replace("NaN", "0")
        ret = ret.replace(".0", "")
        assert ret.split() == correct.split(), f"got\n{ret}\n expected\n{correct}"

In [76]:
# Execute the formatting-option check; one table record is logged per option tried.
test_unsupported_formatting_options()

INFO:acro:outcome_df:
grant_type      G            N   R          R/G
survivor                                       
Dead in 2015   ok  threshold;   ok  threshold; 
Alive in 2015  ok           ok  ok           ok
INFO:acro:get_summary(): fail; threshold: 2 cells suppressed; 
INFO:acro::records:add(): output_21
INFO:acro:outcome_df:
grant_type      G            N   R          R/G
survivor                                       
Dead in 2015   ok  threshold;   ok  threshold; 
Alive in 2015  ok           ok  ok           ok
INFO:acro:get_summary(): fail; threshold: 2 cells suppressed; 
INFO:acro::records:add(): output_22
INFO:acro:outcome_df:
grant_type      G            N   R          R/G
survivor                                       
Dead in 2015   ok  threshold;   ok  threshold; 
Alive in 2015  ok           ok  ok           ok
INFO:acro:get_summary(): fail; threshold: 2 cells suppressed; 
INFO:acro::records:add(): output_23
INFO:acro:outcome_df:
grant_type      G            N   R     

passed first part
passed first part
passed first part
passed first part
passed first part
passed first part
