In [1]:
import statsmodels.api as sm
import pandas as pd
from patsy import dmatrices
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison, tukeyhsd
import numpy as np
from statsmodels.stats.libqsturng import psturng
from statsmodels.formula.api import ols

In [3]:
# Experiment-1 results table; the preview below shows the metric columns
# (PPV, NPV, Accuracy, time(s)) together with the `batch` and `ratio` labels.
RESULTS_CSV = 'Exp1(ratio)__Anova_post-hoc.csv'
df = pd.read_csv(RESULTS_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,PPV,NPV,Accuracy,time(s),batch,ratio
0,0,0.25,1.0,0.7,827.625,batch_1,10%
1,1,0.333333,0.95122,0.84,782.875,batch_1,10%
2,2,0.090909,0.892857,0.54,1581.1875,batch_1,10%
3,3,0.08,0.88,0.48,1514.0625,batch_1,10%
4,4,0.1875,0.941176,0.7,734.125,batch_1,10%


In [4]:
# Numeric copy of the ratio label ('10%' -> 10) so it can be used as a regressor.
ratio_labels = df['ratio']
df['ratio_number'] = ratio_labels.str.replace('%', '').astype(int)

In [5]:
# Ordinary least squares of PPV on the numeric ratio.
# NOTE(review): `ratio_number` is the only regressor and sm.OLS does not add a
# constant by itself, so this model has no intercept and the R-squared in the
# summary is the uncentered one. Confirm that a fit through the origin is
# intended; otherwise the regressor should go through sm.add_constant(...).
ppv_fit = sm.OLS(df['PPV'], df['ratio_number']).fit()
ppv_fit.summary()

0,1,2,3
Dep. Variable:,PPV,R-squared:,0.954
Model:,OLS,Adj. R-squared:,0.953
Method:,Least Squares,F-statistic:,2048.0
Date:,"Wed, 20 Dec 2017",Prob (F-statistic):,5.859999999999999e-68
Time:,05:40:30,Log-Likelihood:,40.299
No. Observations:,100,AIC:,-78.6
Df Residuals:,99,BIC:,-75.99
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
ratio_number,0.0119,0.000,45.258,0.000,0.011 0.012

0,1,2,3
Omnibus:,3.642,Durbin-Watson:,0.574
Prob(Omnibus):,0.162,Jarque-Bera (JB):,3.029
Skew:,0.404,Prob(JB):,0.22
Kurtosis:,3.273,Cond. No.,1.0


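A linear regression of each metric on `ratio_number` (the balance ratio in percent; fitted without an intercept, so the reported $R^2$ is the uncentered one) confirms the effect of the balance on the PPV ($b = 0.012$, $t(99) = 45.26$, $p < 10^{-4}$, $R^2 = 0.954$) and on the accuracy ($b = 0.011$, $t(99) = 21.19$, $p < 10^{-4}$, $R^2 = 0.819$).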

In [24]:
# Ordinary least squares of Accuracy on the numeric ratio.
# NOTE(review): `ratio_number` is the only regressor and sm.OLS does not add a
# constant by itself, so this model has no intercept and the R-squared in the
# summary is the uncentered one. Confirm that a fit through the origin is
# intended; otherwise the regressor should go through sm.add_constant(...).
accuracy_fit = sm.OLS(df['Accuracy'], df['ratio_number']).fit()
accuracy_fit.summary()

0,1,2,3
Dep. Variable:,Accuracy,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.818
Method:,Least Squares,F-statistic:,449.0
Date:,"Wed, 20 Dec 2017",Prob (F-statistic):,1.44e-38
Time:,04:23:22,Log-Likelihood:,-29.399
No. Observations:,100,AIC:,60.8
Df Residuals:,99,BIC:,63.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
ratio_number,0.0111,0.001,21.190,0.000,0.010 0.012

0,1,2,3
Omnibus:,11.238,Durbin-Watson:,0.202
Prob(Omnibus):,0.004,Jarque-Bera (JB):,3.762
Skew:,-0.02,Prob(JB):,0.152
Kurtosis:,2.051,Cond. No.,1.0


In [19]:
# NOTE(review): a leftover exploration line that looked up `.get` on `spector_data`
# used to live here. `spector_data` is not defined by any cell of this notebook, so
# the line raised NameError on a fresh kernel (Restart & Run All) and contributed
# nothing to the analysis; it has been dropped.

<function Dataset.get>

In [6]:
# Formula-based OLS with the ratio label as the explanatory factor; the fitted
# result (not a DataFrame, despite the name) feeds the ANOVA table.
ppv_by_ratio_model = ols('PPV ~ ratio', data=df)
df_fit = ppv_by_ratio_model.fit()


In [7]:
# ANOVA table for the fitted PPV ~ ratio model (typ=1 requests the Type I table).
anova_results = sm.stats.anova_lm(df_fit, typ=1)
anova_results

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
ratio,9.0,5.955021,0.661669,79.376274,6.505076000000001e-39
Residual,90.0,0.750227,0.008336,,


In [8]:
# Significance level for the ANOVA p-values. The Residual row carries no p-value,
# so its comparison evaluates to False.
ALPHA = 0.05
anova_results['PR(>F)'] < ALPHA

ratio        True
Residual    False
Name: PR(>F), dtype: bool

In [9]:
def compute_tukey(df, target, classes):
    """Run Tukey's HSD on ``df[target]`` grouped by ``df[classes]``.

    Parameters
    ----------
    df : table holding both columns (indexed by column name).
    target : name of the column with the measured values.
    classes : name of the column with the group labels.

    Returns
    -------
    tuple
        ``(summary, values)`` where ``summary`` is the summary table of the
        ``pairwise_tukeyhsd`` result and ``values`` is what ``psturng`` returns
        for ``|meandiff / std_pairs|`` of every pair, given the number of
        groups and ``len(df) - number_of_groups`` degrees of freedom.

    Side effect: draws the plot produced by ``plot_simultaneous()``.
    """
    group_count = len(df[classes].unique())
    residual_dof = len(df) - group_count

    tukey = pairwise_tukeyhsd(df[target], df[classes])

    # Absolute mean difference scaled by the pairwise standard deviation,
    # which is the statistic handed to psturng.
    scaled_diffs = np.abs(tukey.meandiffs / tukey.std_pairs)
    range_values = psturng(scaled_diffs, group_count, residual_dof)

    tukey.plot_simultaneous()
    return tukey.summary(), range_values

In [10]:
# Tukey HSD on PPV across the ratio groups: `result` is the summary table,
# `confidence` is the psturng output computed inside compute_tukey.
(result,confidence) = compute_tukey(df,'PPV','ratio')
result

group1,group2,meandiff,lower,upper,reject
10%,100%,0.8157,0.6833,0.9482,True
10%,20%,0.2637,0.1312,0.3962,True
10%,30%,0.3706,0.2381,0.5031,True
10%,40%,0.4417,0.3092,0.5742,True
10%,50%,0.531,0.3985,0.6635,True
10%,60%,0.6257,0.4933,0.7582,True
10%,70%,0.6889,0.5564,0.8214,True
10%,80%,0.7183,0.5858,0.8507,True
10%,90%,0.7739,0.6414,0.9063,True
100%,20%,-0.552,-0.6845,-0.4196,True


In [10]:
# Per-comparison values returned alongside the Tukey summary.
# NOTE(review): despite the name "confidence", these come from psturng and are
# presumably p-values of the studentized range -- confirm and consider renaming.
print(confidence)

[ 0.001       0.001       0.001       0.001       0.001       0.001       0.001
  0.001       0.001       0.001       0.001       0.001       0.001       0.001
  0.07263383  0.34702206  0.9         0.22462211  0.00136707  0.001       0.001
  0.001       0.001       0.001       0.74247016  0.00617284  0.001       0.001
  0.001       0.001       0.47485648  0.001       0.001       0.001       0.001
  0.38914091  0.00756734  0.001       0.001       0.85820657  0.42365673
  0.01624153  0.9         0.5391102   0.9       ]


In [17]:
# Re-display of the Tukey summary table obtained from compute_tukey
# (NOTE(review): duplicates the table already shown where `result` was computed).
result

group1,group2,meandiff,lower,upper,reject
10%,100%,0.8157,0.6833,0.9482,True
10%,20%,0.2637,0.1312,0.3962,True
10%,30%,0.3706,0.2381,0.5031,True
10%,40%,0.4417,0.3092,0.5742,True
10%,50%,0.531,0.3985,0.6635,True
10%,60%,0.6257,0.4933,0.7582,True
10%,70%,0.6889,0.5564,0.8214,True
10%,80%,0.7183,0.5858,0.8507,True
10%,90%,0.7739,0.6414,0.9063,True
100%,20%,-0.552,-0.6845,-0.4196,True
