# ACRO Tests

In [1]:
import os
import sys
import pandas as pd
import numpy as np

In [2]:
sys.path.insert(0, os.path.abspath(".."))

In [3]:
from acro import ACRO, add_constant, utils

### Instantiate ACRO

In [4]:
acro = ACRO()

INFO:acro:config: {'safe_threshold': 10, 'safe_dof_threshold': 10, 'safe_nk_n': 2, 'safe_nk_k': 0.9, 'safe_pratio_p': 0.1, 'check_missing_values': False}


### Load test data
The dataset used in this notebook is the nursery dataset from OpenML. The dataset can be read directly from OpenML using the code commented in the next cell or it can be read directly from the local machine if it has been downloaded. The code used to read the data from a folder is in the below cell. The path might need to be changed to refer to the folder that holds the data

In [5]:
# from sklearn.datasets import fetch_openml

# data = fetch_openml(data_id=26, as_frame=True)
# df = data.data
# df["recommend"] = data.target

from scipy.io.arff import loadarff

path = os.path.join("../data", "nursery.arff")
data = loadarff(path)
df = pd.DataFrame(data[0])
df = df.select_dtypes([object])
df = df.stack().str.decode("utf-8").unstack()
df.rename(columns={"class": "recommend"}, inplace=True)
df.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,recommend
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


### Convert 'more than 3' children to random between 4 and 10
Change the children column from categorical to numeric in order to be able to test some of the ACRO functions that requires a numeric feature

In [6]:
df["children"].replace(to_replace={"more": "4"}, inplace=True)
df["children"] = pd.to_numeric(df["children"])

df["children"] = df.apply(
    lambda row: row["children"]
    if row["children"] in (1, 2, 3)
    else np.random.randint(4, 10),
    axis=1,
)

### Pandas crosstab
This is an example of crosstab using pandas

In [7]:
table = pd.crosstab(df.recommend, df.parents)
table

parents,great_pret,pretentious,usual
recommend,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
not_recom,1440,1440,1440
priority,858,1484,1924
recommend,0,0,2
spec_prior,2022,1264,758
very_recom,0,132,196


### ACRO crosstab
This is an example of crosstab using ACRO

In [8]:
safe_table = acro.crosstab(df.recommend, df.parents)
safe_table

INFO:acro:outcome_df:
parents      great_pret  pretentious        usual
recommend                                        
not_recom            ok           ok           ok
priority             ok           ok           ok
recommend   threshold;   threshold;   threshold; 
spec_prior           ok           ok           ok
very_recom  threshold;            ok           ok
INFO:acro:get_summary(): fail; threshold: 4 cells suppressed; 
INFO:acro:add_output(): output_0_2023-04-28-16233491


parents,great_pret,pretentious,usual
recommend,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
not_recom,1440.0,1440.0,1440.0
priority,858.0,1484.0,1924.0
recommend,,,
spec_prior,2022.0,1264.0,758.0
very_recom,,132.0,196.0


### ACRO crosstab with aggregation function

In [9]:
safe_table = acro.crosstab(df.recommend, df.parents, values=df.children, aggfunc="mean")
safe_table

INFO:acro:outcome_df:
parents                        great_pret                    pretentious  \
recommend                                                                  
not_recom                              ok                             ok   
priority                               ok                             ok   
recommend   threshold; p-ratio; nk-rule;   threshold; p-ratio; nk-rule;    
spec_prior                             ok                             ok   
very_recom  threshold; p-ratio; nk-rule;                              ok   

parents                             usual  
recommend                                  
not_recom                              ok  
priority                               ok  
recommend   threshold; p-ratio; nk-rule;   
spec_prior                             ok  
very_recom                             ok  
INFO:acro:get_summary(): fail; threshold: 4 cells suppressed; p-ratio: 4 cells suppressed; nk-rule: 4 cells suppressed; 
INFO:acro:add_ou

parents,great_pret,pretentious,usual
recommend,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
not_recom,3.127083,3.099306,3.15
priority,2.583916,3.045148,3.133576
recommend,,,
spec_prior,3.346192,3.302215,3.379947
very_recom,,2.181818,2.244898


### ACRO pivot_table
This is an example of pivot table using ACRO

In [10]:
table = acro.pivot_table(
    df, index=["parents"], values=["children"], aggfunc=["mean", "std"]
)
table

INFO:acro:outcome_df:
                mean      std
            children children
parents                      
great_pret        ok       ok
pretentious       ok       ok
usual             ok       ok
INFO:acro:get_summary(): pass
INFO:acro:add_output(): output_2_2023-04-28-16233529


Unnamed: 0_level_0,mean,std
Unnamed: 0_level_1,children,children
parents,Unnamed: 1_level_2,Unnamed: 2_level_2
great_pret,3.121759,2.234771
pretentious,3.112037,2.223336
usual,3.140972,2.262121


### ACRO OLS 
This is an example of ordinary least square regression using ACRO. The recommend column was converted form categorical to numeric. Then, the linear regression between recommend and children was performed. This is just to show how the regression is done using ACRO. No correlation is expected to be seen by using these variables

In [11]:
df["recommend"].replace(
    to_replace={
        "not_recom": "0",
        "recommend": "1",
        "very_recom": "2",
        "priority": "3",
        "spec_prior": "4",
    },
    inplace=True,
)
df["recommend"] = pd.to_numeric(df["recommend"])

In [12]:
new_df = df[["recommend", "children"]]
new_df = new_df.dropna()

y = new_df["recommend"]
x = new_df["children"]
x = add_constant(x)

results = acro.ols(y, x)
results.summary()

INFO:acro:ols() outcome: pass; dof=12958.0 >= 10
INFO:acro:add_output(): output_3_2023-04-28-16233556


0,1,2,3
Dep. Variable:,recommend,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,7.36
Date:,"Fri, 28 Apr 2023",Prob (F-statistic):,0.00668
Time:,16:23:35,Log-Likelihood:,-25124.0
No. Observations:,12960,AIC:,50250.0
Df Residuals:,12958,BIC:,50270.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.2305,0.025,87.975,0.000,2.181,2.280
children,0.0179,0.007,2.713,0.007,0.005,0.031

0,1,2,3
Omnibus:,76780.984,Durbin-Watson:,2.883
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1742.815
Skew:,-0.485,Prob(JB):,0.0
Kurtosis:,1.488,Cond. No.,6.9


### ACRO OLSR
This is an example of ordinary least squares regression from a formula and dataframe using ACRO 

In [13]:
results = acro.olsr(formula="recommend ~ children", data=new_df)
results.summary()

INFO:acro:olsr() outcome: pass; dof=12958.0 >= 10
INFO:acro:add_output(): output_4_2023-04-28-16233571


0,1,2,3
Dep. Variable:,recommend,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,7.36
Date:,"Fri, 28 Apr 2023",Prob (F-statistic):,0.00668
Time:,16:23:35,Log-Likelihood:,-25124.0
No. Observations:,12960,AIC:,50250.0
Df Residuals:,12958,BIC:,50270.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.2305,0.025,87.975,0.000,2.181,2.280
children,0.0179,0.007,2.713,0.007,0.005,0.031

0,1,2,3
Omnibus:,76780.984,Durbin-Watson:,2.883
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1742.815
Skew:,-0.485,Prob(JB):,0.0
Kurtosis:,1.488,Cond. No.,6.9


### ACRO Probit
This is an example of probit regression using ACRO

In [14]:
new_df = df[["finance", "children"]]
new_df = new_df.dropna()

y = new_df["finance"].astype("category").cat.codes  # numeric
y.name = "finance"
x = new_df["children"]
x = add_constant(x)

results = acro.probit(y, x)
results.summary()

INFO:acro:probit() outcome: pass; dof=12958.0 >= 10
INFO:acro:add_output(): output_5_2023-04-28-16233586


Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 2


0,1,2,3
Dep. Variable:,finance,No. Observations:,12960.0
Model:,Probit,Df Residuals:,12958.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,4.528e-07
Time:,16:23:35,Log-Likelihood:,-8983.2
converged:,True,LL-Null:,-8983.2
Covariance Type:,nonrobust,LLR p-value:,0.9281

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0014,0.019,0.073,0.942,-0.036,0.038
children,-0.0004,0.005,-0.090,0.928,-0.010,0.009


### ACRO Logit
This is an example of logistic regression using ACRO

In [15]:
results = acro.logit(y, x)
results.summary()

INFO:acro:logit() outcome: pass; dof=12958.0 >= 10
INFO:acro:add_output(): output_6_2023-04-28-16233601


Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 2


0,1,2,3
Dep. Variable:,finance,No. Observations:,12960.0
Model:,Logit,Df Residuals:,12958.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 28 Apr 2023",Pseudo R-squ.:,4.528e-07
Time:,16:23:36,Log-Likelihood:,-8983.2
converged:,True,LL-Null:,-8983.2
Covariance Type:,nonrobust,LLR p-value:,0.9281

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0022,0.030,0.073,0.942,-0.057,0.061
children,-0.0007,0.008,-0.090,0.928,-0.016,0.015


### List current ACRO outputs
This is an example of using the print_output function to list all the outputs created so far

In [16]:
acro.print_outputs()

output_0_2023-04-28-16233491:
command: safe_table = acro.crosstab(df.recommend, df.parents)
summary: fail; threshold: 4 cells suppressed; 
outcome: parents      great_pret  pretentious        usual
recommend                                        
not_recom            ok           ok           ok
priority             ok           ok           ok
recommend   threshold;   threshold;   threshold; 
spec_prior           ok           ok           ok
very_recom  threshold;            ok           ok
output: [parents     great_pret  pretentious   usual
recommend                                  
not_recom       1440.0       1440.0  1440.0
priority         858.0       1484.0  1924.0
recommend          NaN          NaN     NaN
spec_prior      2022.0       1264.0   758.0
very_recom         NaN        132.0   196.0]
timestamp: 2023-04-28-16233491
comments: 


output_1_2023-04-28-16233508:
command: safe_table = acro.crosstab(df.recommend, df.parents, values=df.children, aggfunc="mean")
summary: fai

### Remove some ACRO outputs before finalising 
This is an example of deleting some of the ACRO outputs. The name of the output that needs to be removed should be passed to the function remove_output. Currently, all outputs names contain timestamp; that is the time when the output was created. The output name can be taken from the outputs listed by the print_outputs function or by listing the results and choosing the specific output that needs to be removed

In [17]:
output_1 = list(acro.results.keys())[1]
output_4 = list(acro.results.keys())[4]

acro.remove_output(output_1)
acro.remove_output(output_4)

INFO:acro:remove_output(): output_1_2023-04-28-16233508 removed
INFO:acro:remove_output(): output_4_2023-04-28-16233571 removed


### Rename ACRO outputs before finalising
This is an example of renaming the outputs to provide a more descriptive name. The timestamp associated with the output name will not get overwritten

In [18]:
acro.rename_output(list(acro.results.keys())[2], "pivot_table")

INFO:acro:rename_output(): output_3_2023-04-28-16233556 renamed to pivot_table_2023-04-28-16233556


### Add a comment to output
This is an example to add a comment to outputs. It can be used to provide a description or an information to the outputs

In [19]:
acro.add_comments(
    list(acro.results.keys())[0], "This is a cross table between year and grant_type"
)
acro.add_comments(list(acro.results.keys())[0], "6 cells were supressed in this table")

INFO:acro:a comment was added to output_0_2023-04-28-16233491
INFO:acro:a comment was added to output_0_2023-04-28-16233491


### Add an unsupported output to the list of outputs
This is an example to add an unsupported outputs (such as images) to the list of outputs

In [20]:
acro.custom_output(
    "XandY.jfif", "This output is an image showing the relationship between X and Y"
)

INFO:acro:add_output(): output_7_2023-04-28-16233694


### Finalise ACRO
This is an example of the function finalise which takes each output and saves it to a CSV file. It also saves the outputs to a json file or Excel file depending on the extension of the name of the file provided as an input to the function

In [21]:
output = acro.finalise("test_results.json")

INFO:acro:Directory outputs/ already exists
INFO:acro:output written to: test_results.json
