In [95]:
import statsmodels.api as sm
import pandas as pd
from patsy import dmatrices
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison, tukeyhsd
from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.weightstats import ttest_ind
from statsmodels.sandbox.stats.multicomp import multipletests
import numpy as np
from statsmodels.stats.libqsturng import psturng
from statsmodels.formula.api import ols

# All topic test Exp2,Exp4a and Exp4b (PPV)

# Anova again

In [352]:
LIST_OF_TESTS= []

In [353]:
from numpy import std, mean, sqrt

#correct if the population S.D. is expected to be equal for the two groups.
def cohen_d(x,y):
    nx = len(x)
    ny = len(y)
    dof = nx + ny - 2
    d =(mean(x) - mean(y)) / sqrt(((nx-1)*std(x, ddof=1) ** 2 + (ny-1)*std(y, ddof=1) ** 2) / dof)
    if d < 0.20:
        effect = 'very small (Sawilowsky, 2009)'
    elif d < 0.5:
        effect = 'small (Cohen, 1988)'
    elif d < 0.8:
        effect = 'medium (Cohen, 1988)'
    elif d < 1.2:
        effect = 'large (Cohen, 1988)'
    elif d < 2:
        effect = 'very large (Sawilowsky, 2009)'
    else:
        effect = 'huge (Sawilowsky, 2009)'
    return d,effect

In [354]:
def compute_tukey(df,target,classes):
    results_tukey = pairwise_tukeyhsd(df[target], df[classes])
    v = np.abs(results_tukey.meandiffs/ results_tukey.std_pairs)
    confidence = psturng(v, len(results_tukey.groupsunique), results_tukey.df_total)
    return results_tukey.summary(),confidence

In [355]:
df = pd.read_csv('alltopics_forAnova.csv')
df.loc[df.batch=='batch_7',['typex']] = 'special'
df = df[df.typex!='random']

In [356]:
df1= df[df.balance==10]
df2= df[df.balance==50]

In [357]:
df_first_part =df[df.typex!='special']

In [358]:
df_first_part.typex.unique()

array(['first', 'last'], dtype=object)

In [359]:
df_fit = ols('PPV ~ typex*balance ',data=df_first_part).fit()
anova_results = sm.stats.anova_lm(df_fit, typ=2)
anova_results

Unnamed: 0,sum_sq,df,F,PR(>F)
typex,0.297683,1.0,5.65238,0.01780755
balance,26.901288,1.0,510.800279,2.0852299999999998e-78
typex:balance,0.189282,1.0,3.594072,0.05856349
Residual,26.279827,499.0,,


In [360]:
anova_results['PR(>F)']<0.05

typex             True
balance           True
typex:balance    False
Residual         False
Name: PR(>F), dtype: bool

In [361]:
#As a post-hoc for typex we use t-test
x = df_first_part[(df_first_part['typex']=='first')&(df_first_part['balance']==50)].PPV
y = df_first_part[(df_first_part['typex']=='last')&(df_first_part['balance']==50)].PPV
a1 = ttest_ind(x,y,alternative='larger')
print(a1)
print(cohen_d(x,y))
LIST_OF_TESTS= np.append(LIST_OF_TESTS,a1[1])

(3.2226457079770752, 0.00071910110285952676, 251.0)
(0.40524049000741369, 'small (Cohen, 1988)')


In [362]:
x = df_first_part[(df_first_part['typex']=='first')&(df_first_part['balance']==10)].PPV
y = df_first_part[(df_first_part['typex']=='last')&(df_first_part['balance']==10)].PPV
a1 = ttest_ind(x,y,alternative='larger')
print(a1)
print(cohen_d(x,y))
LIST_OF_TESTS= np.append(LIST_OF_TESTS,a1[1])

(0.3127660041044481, 0.37736064549829457, 248.0)
(0.03957351672006644, 'very small (Sawilowsky, 2009)')


In [363]:
(result,confidence) = compute_tukey(df_first_part,'PPV','balance')
print(result)
confidence

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff lower  upper  reject
-------------------------------------------
  10     50    0.4617  0.4212 0.5022  True 
-------------------------------------------


0.001

In [364]:
a1 = ttest_ind(df_first_part['PPV'],df_first_part['balance'])
print(a1)
print(cohen_d(x,y))

(-33.196264535431737, 1.0870906956484232e-163, 1004.0)
(0.03957351672006644, 'very small (Sawilowsky, 2009)')


In [365]:
#LIST_OF_TESTS = np.append(LIST_OF_TESTS,confidence) #not really need to do this post-hoc since is only one var

In [366]:
multipletests(LIST_OF_TESTS,method='holm')

(array([ True, False], dtype=bool),
 array([ 0.0014382 ,  0.37736065]),
 0.025320565519103666,
 0.025)

# let's try to add also special now

In [381]:
df_fit = ols('PPV ~ typex ',data=df2).fit()

In [382]:
anova_results = sm.stats.anova_lm(df_fit, typ=1)
anova_results

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
typex,2.0,0.535338,0.267669,5.447993,0.004647
Residual,381.0,18.719183,0.049132,,


In [369]:
anova_results['PR(>F)']<0.05

typex        True
Residual    False
Name: PR(>F), dtype: bool

In [370]:
(result,confidence) = compute_tukey(df2,'PPV','typex')
print(result)
confidence

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1  group2 meandiff  lower   upper  reject
----------------------------------------------
first    last  -0.0872  -0.1528 -0.0216  True 
first  special -0.0191  -0.0843  0.0461 False 
 last  special  0.0681   0.0033  0.133   True 
----------------------------------------------


array([ 0.00534867,  0.75017688,  0.03670704])

In [371]:
LIST_OF_TESTS = np.append(LIST_OF_TESTS,confidence)

In [380]:
print(LIST_OF_TESTS)
multipletests(LIST_OF_TESTS,method='fdr_tsbky')

[  7.19101103e-04   3.77360645e-01   5.34866952e-03   7.50176881e-01
   3.67070431e-02]


(array([ True, False,  True, False,  True], dtype=bool),
 array([ 0.00226517,  0.29717151,  0.00842415,  0.47261143,  0.0385424 ]),
 0.010206218313011495,
 0.01)

## Summary
We want to test the trend with see in Figure 2: showing relevant results first improves accuracy and PPV. To test this intuition we first perform a two-tailed ANOVA test on the PPV with the two factors being the order ('relevant first', 'relevant last') and the balance ('10\%-90\%' and '50\%-50\%'). The results show that both balance and order affect the PPV (F(1,499)=510.8, $p<10^{-78}$ and F(1,499)=5.65, $p=0.01$). As a post-hoc analysis (with FDR correction for multiple tests) we first investigate whether 'relevant first' does indeed lead to a better PPV. Thus, we perform two one-tailed t-tests, one for each balance ('10\%-90\%' and '50\%-50\%').  We obtain that for the inbalance case the result are not statistically significant (p=0.008), while for the balanced case indeed the accuracy is larger when relevant results are presented first, with an effect size of $d=0.04$ and $p=0.002<0.05$.

Regarding batch 7, we cannot include it the aforementioned two-way ANOVA, because this order technique have been tried only on the balanced result. We thus perform a one way ANOVA on PPV and order ('relevant first', 'relevant last', and 'batch 7'), that confirms that PPV is affected by the order (F(2,381)=5.45, $p=0.004$). We then perform a post-hoc Tukey HSD test (included in the FDR correction as above), that confirms that 'relevant first' and 'batch 7' are significantly different than 'relevant last' ($p=0.03$, $p=0.008$), while 'relevant first' and 'batch 7' were not significantly different, as one could expect since they are partly overlapping.

In [117]:
x = df[(df['typex']=='first')&(df['balance']==50)].PPV
y = df[(df['typex']=='last')&(df['balance']==50)].PPV
a1 = ttest_ind(x,y,alternative='larger')
print(a1)
print(cohen_d(x,y))

(3.2226457079770752, 0.00071910110285952676, 251.0)
(0.40524049000741369, 'small (Cohen, 1988)')


In [118]:
x = df[(df['typex']=='first')&(df['balance']==50)].Accuracy
y = df[(df['typex']=='last')&(df['balance']==50)].Accuracy
a2 = ttest_ind(x,y,alternative='larger')
print(a2)
print(cohen_d(x,y))

(2.6088017050289909, 0.0048155726290837154, 251.0)
(0.32805097956043971, 'small (Cohen, 1988)')


In [None]:
x = df[(df['typex']=='first')&(df['balance']==10)].PPV
y = df[(df['typex']=='last')&(df['balance']==10)].PPV
a1 = ttest_ind(x,y,alternative='larger')
print(a1)
print(cohen_d(x,y))

In [119]:
x = df[(df['typex']=='first')&(df['balance']==10)].Accuracy
y = df[(df['typex']=='last')&(df['balance']==10)].Accuracy
a1 = ttest_ind(x,y,alternative='larger')
print(a1)
print(cohen_d(x,y))

(0.28218404740934044, 0.38901886884753717, 248.0)
(0.035704056616587487, 'very small (Sawilowsky, 2009)')


In [94]:
multipletests(a1[1],a2[1],method='holm')

(array([ True], dtype=bool),
 array([ 0.00481557]),
 0.0048155726290837197,
 0.0048155726290837154)

In [56]:
x = df[(df['typex']=='random')&(df['balance']==50)].Accuracy
y = df[(df['typex']=='last')&(df['balance']==50)].Accuracy
a1 = ttest_ind(x,y)
print(a1)
print(cohen_d(x,y))
a2 = ttest_ind(df[(df['typex']=='first')&(df['balance']==50)].Accuracy,df[(df['typex']=='last')&(df['balance']==50)].Accuracy,alternative='larger')
a2

(2.6088017050289909, 0.0048155726290837154, 251.0)

In [66]:
a3 = ttest_ind(df[(df['typex']=='random')&(df['balance']==50)].Accuracy,df[(df['typex']=='first')&(df['balance']==50)].Accuracy)
a3

(-0.70921263081935859, 0.4794433215235836, 132.0)

In [67]:
a4 = ttest_ind(df[(df['typex']=='random')&(df['balance']==50)].Accuracy,df[(df['typex']=='last')&(df['balance']==50)].Accuracy)
a4

(1.7587494227208926, 0.080836638558093601, 138.0)

In [59]:
multipletests([a1[1],a2[1],a3[1],a4[1]],method='holm')

(array([ True,  True, False, False], dtype=bool),
 array([ 0.0028764 ,  0.01444672,  0.21700087,  0.21700087]),
 0.012741455098566168,
 0.0125)

In [75]:
smp.ttest_power(0.2, nobs=60, alpha=0.1, alternative='two-sided')

0.45558175996348543

In [78]:
smp.ttest_power(, nobs=138, alpha=0.08)

  pow_ = stats.nct._sf(crit_upp, df, d*np.sqrt(nobs))


TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'

considering all topics, first is better than last for 50-50 in both term of accuracy and PPV, while there is not enough evidence 

In [60]:
df = pd.read_csv('Exp2_forAnova.csv')

In [61]:
b1 = ttest_ind(df[(df['typex']=='first')&(df['balance']==50)].PPV,df[(df['typex']=='last')&(df['balance']==50)].PPV,alternative='larger')
b1

(2.3599183954989336, 0.010220051809051594, 90.0)

In [62]:
b2 = ttest_ind(df[(df['typex']=='first')&(df['balance']==50)].Accuracy,df[(df['typex']=='last')&(df['balance']==50)].Accuracy,alternative='larger')
b2

(1.6021061925946132, 0.056318165320166931, 90.0)

In [63]:
b3 = ttest_ind(df[(df['typex']=='random')&(df['balance']==50)].Accuracy,df[(df['typex']=='first')&(df['balance']==50)].Accuracy)
b3

(-0.80834313265678148, 0.42034751998311026, 132.0)

In [64]:
b4 = ttest_ind(df[(df['typex']=='random')&(df['balance']==50)].Accuracy,df[(df['typex']=='last')&(df['balance']==50)].Accuracy)
b4

(0.88286960004334025, 0.3788420865298312, 138.0)

In [65]:
multipletests([a1[1],a2[1],a3[1],a4[1],b1[1],b2[1],b3[1],b4[1]],method='holm')

(array([ True,  True, False, False, False, False, False, False], dtype=bool),
 array([ 0.00575281,  0.03370901,  0.43400175,  0.44571372,  0.06132031,
         0.28159083,  0.75768417,  0.75768417]),
 0.0063911509545450107,
 0.00625)

In [69]:
ttest_ind?