In [1]:
# from sfi import Data, Macro, Missing, SFIToolkit, Scalar
import os
import sys

sys.path.insert(0, "..")

In [2]:
from importlib import reload

import pandas as pd
import pytest

import acro
from acro import acro_stata_parser, stata_config

reload(acro_stata_parser)

INFO:acro:version: 0.4.3
INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}
INFO:acro:automatic suppression: False


<module 'acro.acro_stata_parser' from '/Users/j4-smith/GitHub/AI-SDC/ACRO/stata/../acro/acro_stata_parser.py'>

In [3]:
def dummy_acrohandler(command, varlist, exclusion, exp, weights, options):
    """
    Stand-in for the Stata-side handler used by the tests in this notebook.

    Reads the test dataset from disk and forwards it, together with the
    Stata-style arguments, to ``acro_stata_parser.parse_and_run``.
    When the module-level ``debug`` flag is truthy the arguments are echoed
    to stdout first. Returns whatever ``parse_and_run`` returns.
    """
    if debug:
        echoed = [
            "in python acrohandler function: ",
            f"command = {command} ",
            f"varlist={varlist} ",
            f"if = {exclusion} ",
            f"exp = {exp} ",
            f"weights={weights} ",
            f"options={options} ",
        ]
        print("".join(echoed))

    # the dataset is re-read from disk on every call
    dataset = pd.read_stata(os.path.join("../data", "test_data.dta"))

    # hand everything over to the acro parser
    return acro_stata_parser.parse_and_run(
        dataset, command, varlist, exclusion, exp, weights, options
    )

In [7]:
# handy string to copy/paste
# command='',varlist='',exclusion='',exp='',weights='',options=''

In [10]:
def test_apply_stata_ifstmt(data):
    """
    Tests that Stata ``if`` conditions select the expected rows.

    ``data`` must have a ``year`` column containing 2013 and 2015; the years
    left after filtering are compared, in order of first appearance, with
    the years of the unfiltered frame minus the excluded ones.
    """
    expected_years = list(data["year"].unique())

    # single condition: drop one year
    without_2013 = acro_stata_parser.apply_stata_ifstmt("year!=2013", data)
    expected_years.remove(2013)
    assert list(without_2013["year"].unique()) == expected_years

    # compound condition: additionally drop 2015 via the upper bound
    expected_years.remove(2015)
    before_2015 = acro_stata_parser.apply_stata_ifstmt(
        "year != 2013 & year <2015", data
    )
    assert list(before_2015["year"].unique()) == expected_years

In [11]:
# Load the Stata test dataset and run the if-statement test against it.
path = os.path.join("../data", "test_data.dta")
the_data = pd.read_stata(path)
test_apply_stata_ifstmt(the_data)

In [5]:
# Module-level switch read by dummy_acrohandler: when True it echoes its arguments.
debug = False

In [6]:
def test_stata_acro_init() -> None:
    """
    Tests that the ``init`` command creates an acro session.

    The handler must report success and afterwards
    ``stata_config.stata_acro`` must hold an ``acro.ACRO`` instance.
    """
    # Deliberately no assertion about the pre-init value of
    # stata_config.stata_acro: it depends on whether an earlier cell already
    # created a session, and the bare name ``ACRO`` is not in scope here.
    ret = dummy_acrohandler(
        command="init", varlist="", exclusion="", exp="", weights="", options=""
    )
    assert (
        ret == "acro analysis session created\n"
    ), f"wrong string for acro init: {ret}\n"
    errmsg = f"wrong type for stata_config.stata_acro:{type(stata_config.stata_acro)}"
    assert isinstance(stata_config.stata_acro, acro.ACRO), errmsg

In [7]:
# Run the session-initialisation test.
test_stata_acro_init()

INFO:acro:version: 0.4.3
INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}
INFO:acro:automatic suppression: False


In [8]:
def test_parse_table_details():
    """
    Tests that a table varlist plus option string is split into row and
    column variables, aggregation functions, value columns and the
    totals/suppress flags.
    """
    frame = pd.read_stata(os.path.join("../data", "test_data.dta"))

    # note: there is no space between "...inc_activity)" and "suppress";
    # this is exactly the string the original adjacent literals produced
    options = "by(grant_type) contents(mean sd inc_activity)suppress nototals"
    details = acro_stata_parser.parse_table_details(
        ["survivor", "grant_type", "year"], frame.columns, options
    )

    rows_msg = f" rows {details['rowvars']} should be ['grant_type','survivor']"
    assert details["rowvars"] == ["grant_type", "survivor"], rows_msg

    cols_msg = f" cols {details['colvars']} should be ['year','grant_type']"
    assert details["colvars"] == ["year", "grant_type"], cols_msg

    agg_msg = f" aggfunctions {details['aggfuncs']} should be ['mean','sd']"
    assert details["aggfuncs"] == ["mean", "sd"], agg_msg

    values_msg = f" values {details['values']} should be ['inc_activity']"
    assert details["values"] == ["inc_activity"], values_msg

    # flags derived from the bare option words
    assert not details["totals"], "totals should be False"
    assert details["suppress"], "supress should be True"

In [9]:
# Run the option-parsing test.
test_parse_table_details()

In [10]:
def test_simple_table() -> None:
    """
    Tests a two-way frequency table (survivor by grant_type) without totals.

    The handler output is compared with the expected table token by token,
    so differences in whitespace padding are ignored.
    """
    # The dataset is loaded inside dummy_acrohandler; no separate read is
    # needed here.
    correct = (
        "------------------------------------|\n"
        "grant_type     |G   |N    |R    |R/G|\n"
        "survivor       |    |     |     |   |\n"
        "------------------------------------|\n"
        "Dead in 2015   |18  |  0  |282  | 0 |\n"
        "Alive in 2015  |72  |354  |144  |48 |\n"
        "------------------------------------|\n"
    )
    ret = dummy_acrohandler(
        "table",
        "survivor grant_type",
        exclusion="",
        exp="",
        weights="",
        options="nototals",
    )
    # normalise: empty cells become 0 and the float suffix is dropped
    ret = ret.replace("NaN", "0")
    ret = ret.replace(".0", "")
    assert ret.split() == correct.split(), f"got\n{ret}\n expected\n{correct}"

In [11]:
# Run the simple frequency-table test.
test_simple_table()

INFO:acro:get_summary(): fail; threshold: 2 cells may need suppressing; 
INFO:acro:outcome_df:
---------------------------------------------------|
grant_type    |G   |N            |R   |R/G         |
survivor      |    |             |    |            |
---------------------------------------------------|
Dead in 2015  | ok | threshold;  | ok | threshold; |
Alive in 2015 | ok |          ok | ok |          ok|
---------------------------------------------------|

INFO:acro:records:add(): output_0


In [12]:
def test_stata_linregress():
    """
    Runs the ``regress`` command through the handler and checks two figures
    from the returned summary text: the integer after ``Residuals:`` and the
    value after ``R-squared:``.
    """
    summary = dummy_acrohandler(
        "regress",
        " inc_activity inc_grants inc_donations total_costs",
        "",
        "",
        "",
        "",
    )
    words = summary.split()

    def following(label):
        """Return the whitespace-delimited token right after ``label``."""
        return words[words.index(label) + 1]

    val = int(following("Residuals:"))
    assert val == 807, f"{val} should be 807"

    val = float(following("R-squared:"))
    assert val == pytest.approx(0.894, 0.001)

In [13]:
# Run the linear-regression test.
test_stata_linregress()

INFO:acro:ols() outcome: pass; dof=807.0 >= 10
INFO:acro:records:add(): output_1


In [58]:
def test_stata_probit():
    """
    Runs the ``probit`` command through the handler and checks the figures
    following ``Residuals:`` and ``R-squ.:`` in the returned summary text.
    """
    summary = dummy_acrohandler(
        "probit",
        " survivor inc_activity inc_grants inc_donations total_costs",
        "",
        "",
        "",
        "",
    )
    words = summary.split()

    def figure_after(label):
        """Token right after ``label``, minus one trailing '|' if present."""
        raw = words[words.index(label) + 1]
        return raw[:-1] if raw.endswith("|") else raw

    val = figure_after("Residuals:")
    assert float(val) == pytest.approx(806.0, 0.01), f"{val} should be 806"

    val = float(figure_after("R-squ.:"))
    assert val == pytest.approx(0.208, 0.01)

In [None]:
# Run the probit-regression test.
test_stata_probit()

In [64]:
def test_stata_logit():
    """
    Runs the ``logit`` command through the handler and checks the figures
    following ``Residuals:`` and ``R-squ.:`` in the returned summary text.
    """
    summary = dummy_acrohandler(
        "logit",
        " survivor inc_activity inc_grants inc_donations total_costs",
        "",
        "",
        "",
        "",
    )
    words = summary.split()

    def figure_after(label):
        """Token right after ``label``, minus one trailing '|' if present."""
        raw = words[words.index(label) + 1]
        return raw[:-1] if raw.endswith("|") else raw

    val = figure_after("Residuals:")
    assert float(val) == pytest.approx(806.0, 0.01), f"{val} should be 806"

    val = float(figure_after("R-squ.:"))
    assert val == pytest.approx(0.214, 0.01)

In [None]:
# Re-import the parser module, then run the logit-regression test.
reload(acro_stata_parser)
test_stata_logit()

In [66]:
def test_stata_print_outputs():
    """
    Checks that the ``print_outputs`` command hands back an empty string to
    the caller.
    """
    ret = dummy_acrohandler(
        "print_outputs",
        " inc_activity inc_grants inc_donations total_costs",
        "",
        "",
        "",
        "",
    )
    assert not len(ret), "return string should  be empty"

In [None]:
# Run the print_outputs test.
test_stata_print_outputs()

In [68]:
def test_stata_finalise():
    """
    Checks the confirmation message returned by the ``finalise`` command.
    """
    expected = "outputs and stata_out.json written\n"
    # every argument other than the command is an empty string
    blank = dict.fromkeys(("varlist", "exclusion", "exp", "weights", "options"), "")
    ret = dummy_acrohandler(command="finalise", **blank)
    assert ret == expected, f"returned string {ret} should be {expected}\n"

In [None]:
# Run the finalise test.
test_stata_finalise()

In [None]:
def test_find_brace_contents():
    """
    Checks extraction of the text inside ``keyword(...)`` from an option
    string, including the not-found case.
    """
    # no space between "...inc_activity)" and "suppress": this is exactly the
    # string the original adjacent literals produced
    options = "by(grant_type) contents(mean sd inc_activity)suppress nototals"

    # (keyword, expected found-flag, expected text)
    cases = [
        ("by", True, "grant_type"),
        ("contents", True, "mean sd inc_activity"),
        ("foo", False, "foo not found"),
    ]
    for keyword, found, contents in cases:
        res, substr = acro_stata_parser.find_brace_contents(keyword, options)
        assert res == found
        assert substr == contents

In [None]:
# Run the brace-contents parsing test.
test_find_brace_contents()

In [None]:
def test_unsupported_formatting_options():
    """
    Checks that Stata table-formatting options are rejected with a warning.

    For each unsupported option the handler output must start with the
    warning line, followed by the ordinary survivor-by-grant_type table.
    The table is compared token by token, so padding differences are ignored.
    """
    format_string = "acro does not currently support table formatting commands."
    # Expected table in the pipe-delimited layout the handler emits; this is
    # the same table test_simple_table expects for the same varlist/options.
    # (A plain pandas crosstab string has no rules or pipes and so can never
    # match token for token.)
    correct = (
        "------------------------------------|\n"
        "grant_type     |G   |N    |R    |R/G|\n"
        "survivor       |    |     |     |   |\n"
        "------------------------------------|\n"
        "Dead in 2015   |18  |  0  |282  | 0 |\n"
        "Alive in 2015  |72  |354  |144  |48 |\n"
        "------------------------------------|\n"
    )
    for bad_option in [
        "cellwidth",
        "csepwidth",
        "stubwidth",
        "scsepwidth",
        "center",
        "left",
    ]:
        ret = dummy_acrohandler(
            "table",
            "survivor grant_type",
            exclusion="",
            exp="",
            weights="",
            options=f"{bad_option} nototals",
        )

        # first line is the warning, the remainder is the table
        rets = ret.split("\n", 1)
        assert len(rets) == 2, "table should have warning preprended"
        errmsg = f"first line {rets[0]} should be {format_string}"
        assert rets[0] == format_string, errmsg
        ret = rets[1]
        # normalise: empty cells become 0 and the float suffix is dropped
        ret = ret.replace("NaN", "0")
        ret = ret.replace(".0", "")
        assert ret.split() == correct.split(), f"got\n{ret}\n expected\n{correct}"

In [None]:
# Run the unsupported-formatting-options test.
test_unsupported_formatting_options()

In [37]:
def test_stata_exclusion_in_context(mydata):
    """Tests that the subsetting code gets called properly from table handler"""

    def tabulate(exclusion, exp):
        """Run the survivor-by-grant_type table and normalise its text."""
        text = acro_stata_parser.parse_and_run(
            mydata, "table", "survivor grant_type", exclusion, exp, "", "nototals"
        )
        # empty cells become 0 and the float suffix is dropped
        return text.replace("NaN", "0").replace(".0", "")

    only_g = 'grant_type == "G"'
    first_500 = "1/500"

    # if condition
    correct1 = (
        "------------------|\n"
        "grant_type     |G |\n"
        "survivor       |  |\n"
        "------------------|\n"
        "Dead in 2015   |18|\n"
        "Alive in 2015  |72|\n"
        "------------------|\n"
    )
    ret = tabulate(only_g, "")
    assert ret.split() == correct1.split(), f"got\n{ret}\n expected\n{correct1}"

    # in expression
    correct2 = (
        "------------------------------------|\n"
        "grant_type     |G   |N    |R    |R/G|\n"
        "survivor       |    |     |     |   |\n"
        "------------------------------------|\n"
        "Dead in 2015   |12  |  0  |158  | 0 |\n"
        "Alive in 2015  |30  |222  | 48  |30 |\n"
        "------------------------------------|\n"
    )
    ret2 = tabulate("", first_500)
    assert ret2.split() == correct2.split(), f"got\n{ret2}\n expected\n{correct2}"

    # both
    correct3 = (
        "------------------|\n"
        "grant_type     |G |\n"
        "survivor       |  |\n"
        "------------------|\n"
        "Dead in 2015   |12|\n"
        "Alive in 2015  |30|\n"
        "------------------|\n"
    )
    ret3 = tabulate(only_g, first_500)
    assert ret3.split() == correct3.split(), f"got\n{ret3}\n expected\n{correct3}"

In [31]:
# Load the Stata test dataset into the_data for the cells that follow.
path = os.path.join("../data", "test_data.dta")
the_data = pd.read_stata(path)

In [None]:
# Re-import the parser module, then run the if/in subsetting test on the loaded data.
reload(acro_stata_parser)
test_stata_exclusion_in_context(the_data)

In [6]:
def test_table_aggcfn(mydata):
    """
    testing behaviour with aggregation function

    Requests the mean of inc_activity by year and survivor, restricted to
    the first 100 rows and without totals, and compares the result token by
    token with the expected table.

    The input/expectation pair is the one recorded as passing by the inline
    check elsewhere in this notebook. A table in native Stata layout
    ('+' rules, values such as 2.86e+07) is not what parse_and_run emits
    and can never match token for token.
    """
    correct = (
        "-----------------------------------|\n"
        "         mean                      |\n"
        "survivor Dead in 2015 Alive in 2015|\n"
        "year                               |\n"
        "-----------------------------------|\n"
        "2010      2056816.000  10050917.00 |\n"
        "2011      1264158.000   3468009.75 |\n"
        "2012      1625441.625   2934010.75 |\n"
        "2013      1868730.500   4579002.00 |\n"
        "2014      2182281.500   3612917.50 |\n"
        "2015      2571766.250   3375528.25 |\n"
        "-----------------------------------|\n"
    )
    command = "table"
    varlist = "year survivor"
    exclusion = ""
    exp = "1/100"  # first 100 rows only
    weights = ""
    options = "contents(mean inc_activity) nototals"

    ret = acro_stata_parser.parse_and_run(
        mydata, command, varlist, exclusion, exp, weights, options
    )
    assert ret.split() == correct.split(), f"got\n{ret}\n expected\n{correct}"

In [25]:
# Inline check of a table with an aggregation function: mean of inc_activity
# by year and survivor, first 100 rows only, no totals.
reload(acro_stata_parser)
# make data object
path = os.path.join("../data", "test_data.dta")
the_data = pd.read_stata(path)
# expected text, compared exactly (including the trailing blank line)
correct = (
    "-----------------------------------|\n"
    "         mean                      |\n"
    "survivor Dead in 2015 Alive in 2015|\n"
    "year                               |\n"
    "-----------------------------------|\n"
    "2010      2056816.000  10050917.00 |\n"
    "2011      1264158.000   3468009.75 |\n"
    "2012      1625441.625   2934010.75 |\n"
    "2013      1868730.500   4579002.00 |\n"
    "2014      2182281.500   3612917.50 |\n"
    "2015      2571766.250   3375528.25 |\n"
    "-----------------------------------|\n\n"
)
# Stata-style arguments; note these rebind module-level names
command = "table"
varlist_string = "year survivor"
exclusion = ""
exp = "1/100"
weights = ""
options = "contents(mean inc_activity) nototals"
ret = acro_stata_parser.parse_and_run(
    the_data, command, varlist_string, exclusion, exp, weights, options
)


# the "xx" markers in the message make leading/trailing whitespace visible
assert ret == correct, f"got:\nxx\n{ret}xx\nexpected:\nxx\n{correct}xx\n"
print(f"test passed\n{ret}")

INFO:acro:get_summary(): review; negative values found
INFO:acro:outcome_df:
-----------------------------------|
         mean                      |
survivor Dead in 2015 Alive in 2015|
year                               |
-----------------------------------|
2010                               |
2011                               |
2012                   negative    |
2013                               |
2014                               |
2015                               |
-----------------------------------|

INFO:acro:records:add(): output_2


test passed
-----------------------------------|
         mean                      |
survivor Dead in 2015 Alive in 2015|
year                               |
-----------------------------------|
2010      2056816.000  10050917.00 |
2011      1264158.000   3468009.75 |
2012      1625441.625   2934010.75 |
2013      1868730.500   4579002.00 |
2014      2182281.500   3612917.50 |
2015      2571766.250   3375528.25 |
-----------------------------------|




In [18]:
# Cross-check with plain pandas: mean inc_activity by year and survivor on
# the full dataset. rows, cols and values are reused by the next cell.
the_data2 = the_data  # alias of the same frame, not a copy
rows = the_data2["year"]
# rows=[the_data2['year'],the_data2['survivor']]
cols = the_data2["survivor"]
values = the_data2["inc_activity"]
print(pd.crosstab(index=rows, columns=cols, values=values, aggfunc=["mean"]))

                  mean              
survivor  Dead in 2015 Alive in 2015
year                                
2010      414823.43750    28598198.0
2011      400372.09375    15565661.0
2012      550443.68750    15989489.0
2013      586786.37500    16663146.0
2014      672290.81250    16654627.0
2015      485240.15625    16599919.0


In [17]:
from acro import ACRO

# Same cross-tabulation as the pandas cell, but through an ACRO session.
# The session gets its own name: binding it to ``acro`` would shadow the
# ``acro`` module imported at the top of the notebook, which
# test_stata_acro_init needs for ``acro.ACRO``.
acro_session = ACRO()
print(acro_session.crosstab(index=rows, columns=cols, values=values, aggfunc=["mean"]))

INFO:acro:version: 0.4.3
INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}
INFO:acro:automatic suppression: False
INFO:acro:get_summary(): review; negative values found
INFO:acro:outcome_df:
-----------------------------------|
         mean                      |
survivor Dead in 2015 Alive in 2015|
year                               |
-----------------------------------|
2010                               |
2011                   negative    |
2012                   negative    |
2013                               |
2014                               |
2015                               |
-----------------------------------|

INFO:acro:records:add(): output_0


                  mean              
survivor  Dead in 2015 Alive in 2015
year                                
2010      414823.43750    28598198.0
2011      400372.09375    15565661.0
2012      550443.68750    15989489.0
2013      586786.37500    16663146.0
2014      672290.81250    16654627.0
2015      485240.15625    16599919.0


In [24]:
def parse_location_token(token: str, last: int) -> int:
    """
    Converts one position token of a Stata ``in`` expression to an index.

    ``f``/``F`` map to 0 and ``l``/``L`` to ``last``. A positive number is
    treated as 1-based and shifted down by one; zero and negative numbers
    are returned unchanged. Anything that is not an integer prints a notice
    and yields 0.
    """
    named = {"f": 0, "F": 0, "l": last, "L": last}
    if token in named:
        return named[token]

    try:
        number = int(token)
    except ValueError:
        print("valuerror")
        return 0

    # 1-based -> 0-based for positive positions only
    return number - 1 if number > 0 else number

In [53]:
def apply_stata_expstmt(raw: str, all_data: pd.DataFrame) -> pd.DataFrame:
    """
    Parses an in exp statement from stata and uses it
    to subset a dataframe by set of row indices.

    Parameters
    ----------
    raw : str
        Either a single position token (e.g. ``"40"``, ``"-40"``, ``"f"``,
        ``"l"``) or a ``start/end`` pair of such tokens. Tokens are
        interpreted by ``parse_location_token``.
    all_data : pd.DataFrame
        Frame to take rows from; it is not modified.

    Returns
    -------
    pd.DataFrame
        The positional slice ``all_data.iloc[start : end + 1]``.
    """
    last = len(all_data) - 1
    if "/" not in raw:
        pos = parse_location_token(raw, last)
        if pos < 0:
            # negative: keep that many rows counted back from the end
            start = max(0, last + pos + 1)
            end = last
        else:
            # otherwise: keep rows from the top through position pos
            start = 0
            end = min(pos, last)

    else:
        token: list = raw.split("/")
        # first index; -1 is the last row. Clamp at 0 so an offset larger
        # than the frame starts at the first row, as in the branch above
        # (an unclamped negative start would be read by iloc as counting
        # from the end).
        start = parse_location_token(token[0], last)
        if start < 0:
            start = max(0, last + 1 + start)
        # last index; a negative value -k maps to last - k, i.e. k rows are
        # dropped from the end (the F/-5 check in this notebook keeps 95 of
        # 100 rows)
        end = parse_location_token(token[1], last)
        if end < 0:
            end = last + end
        # enforce start <=end: an inverted range runs to the end of the frame
        if start > end:
            end = last

    return all_data.iloc[start : end + 1]

In [54]:
# Inline checks for apply_stata_expstmt on a 100-row frame whose single
# float column "vals" holds the row number 0..99 (built with pandas only;
# numpy is not imported in this notebook).
data = pd.DataFrame({"vals": [float(i) for i in range(100)]})
length = 100
# use of f/F and l/L for first and last with specified row range

exp = "f/5"
smaller = apply_stata_expstmt(exp, data)
assert smaller.shape[0] == 5, data

exp = "F/5"
smaller = apply_stata_expstmt(exp, data)
assert smaller.shape[0] == 5, data
assert (smaller.iloc[-1].fillna(0).values == data.iloc[4].fillna(0).values).all()

exp = "F/-5"
smaller = apply_stata_expstmt(exp, data)
assert smaller.shape[0] == length - 5, f"{smaller.shape[0]} != 95\n{data}"
assert (
    smaller.iloc[-1].fillna(0).values == data.iloc[length - 6].fillna(0).values
).all()

exp = "-6/l"
smaller = apply_stata_expstmt(exp, data)
assert smaller.shape[0] == 6, data
assert (
    smaller.iloc[-1].fillna(0).values == data.iloc[length - 1].fillna(0).values
).all()

exp = "-6/L"
smaller = apply_stata_expstmt(exp, data)
assert smaller.shape[0] == 6, data
assert (
    smaller.iloc[-1].fillna(0).values == data.iloc[length - 1].fillna(0).values
).all()

# invalid range should default to end of dataframe
exp = "50/45"
smaller = apply_stata_expstmt(exp, data)
assert smaller.shape[0] == length - 49, f"smaller.shape[0] !=51,{smaller}"


# missing / counts from front/back so same size but different
exp = "40"
smaller = apply_stata_expstmt(exp, data)
assert smaller.shape[0] == 40, data

exp = "-40"
smaller2 = apply_stata_expstmt(exp, data)
assert smaller2.shape[0] == 40
assert not smaller2.equals(smaller), "counting from front/back should be different"

exp = "gg"  # invalid exp falls back to position 0, so only the first row is kept
smaller = apply_stata_expstmt(exp, data)
assert smaller.shape[0] == 1, smaller