In [None]:
!pip install hypothesize

In [None]:
from hypothesize.utilities import create_example_data

### How to compare two groups

#### Load data from a CSV or create some random data

In [3]:
# Alternative data sources, left disabled. Both need `import pandas as pd`;
# the second also needs `import numpy as np`. Neither is imported before this cell.
#df=pd.read_csv("/home/allan/two_groups_data.csv")
#df=pd.DataFrame(np.random.rand(30, 2), columns=['Group_1', 'Group_2'])
# Default: build example data with the helper imported from hypothesize.utilities
# (the preview below shows the columns Group_1 and Group_2).
df=create_example_data(design_values=2)

# Preview the first rows
df.head()

Unnamed: 0,Group_1,Group_2
0,0.731413,0.359505
1,0.352178,0.198409
2,0.692202,0.227757
3,0.818677,0.543662
4,0.147462,0.615596


#### Import the desired function and pass in the data for each group
- This example uses the bootstrap-t method with 20% trimmed means
- The output is a dictionary containing the results (95% confidence interval, p_value, test statistics, etc...)

In [2]:
from hypothesize.compare_groups_with_single_factor import yuenbt

# Compare the two groups with the bootstrap-t method on 20% trimmed means;
# the returned object is a dictionary of results.
results = yuenbt(df["Group_1"], df["Group_2"])

# Confidence interval stored under the 'ci' key
print(results["ci"])

[-0.32294783884298334, 0.11774331261500753]


---

### How to compare three groups

#### Load data from a CSV or create some random data

In [4]:
import pandas as pd

# Alternative data sources, left disabled. The second one also needs
# `import numpy as np`, which the visible cells of this notebook do not import.
#df=pd.read_csv("/home/allan/one_way_data.csv")
#df=pd.DataFrame(np.random.rand(30, 3), columns=['Group_1', 'Group_2', 'Group_3'])
# Default: build example data with the helper imported from hypothesize.utilities
# (the preview below shows the columns Group_1, Group_2 and Group_3).
df=create_example_data(design_values=3)

# Preview the first rows
df.head()

Unnamed: 0,Group_1,Group_2,Group_3
0,0.41079,0.661795,0.545151
1,0.773183,0.348142,0.847576
2,0.543981,0.857189,0.38295
3,0.995212,0.738859,0.819848
4,0.192913,0.606873,0.418079


#### Import the desired functions and pass in the inputs
- One approach is to use a set of linear contrasts that will test all pairwise comparisons
- Then, the bootstrap-t method and the 20% trimmed mean can be used
- CIs are adjusted to control the familywise error rate (FWE)
- All pairwise contrasts can be created automatically using the `con1way` function
- The results are a dictionary of DataFrames that contain various statistics (p_value, CIs, standard error, test statistics, etc)

In [4]:
from hypothesize.compare_groups_with_single_factor import linconb
from hypothesize.utilities import con1way

# con1way(3) creates all pairwise contrasts for the three groups; linconb tests
# them and returns a dictionary of DataFrames (indexed below by 'test' and 'psihat').
results=linconb(df, con=con1way(3))

In [5]:
# Per-contrast test statistic, standard error and p-value
results['test']

Unnamed: 0,contrast_index,test,se,p_value
0,0.0,-0.999892,0.102613,0.33389
1,1.0,-0.65811,0.09961,0.522538
2,2.0,0.362839,0.102106,0.709516


In [6]:
# Per-contrast estimate (psihat) with lower and upper confidence-interval bounds
results['psihat']

Unnamed: 0,contrast_index,psihat,ci_low,ci_up
0,0.0,-0.102602,-0.36885,0.163646
1,1.0,-0.065554,-0.324009,0.1929
2,2.0,0.037048,-0.227883,0.301979


---

### How to compare groups in a factorial design

#### Load data from a CSV or create some random data

In [5]:
import pandas as pd

# Alternative data sources, left disabled. The second one also needs
# `import numpy as np`, which the visible cells of this notebook do not import.
#df=pd.read_csv("/home/allan/two_way_data.csv")
#df=pd.DataFrame(np.random.rand(30, 6), columns=['Cell_1_1', 'Cell_1_2', 'Cell_1_3', 'Cell_2_1', 'Cell_2_2', 'Cell_2_3'])
# Default: build example data for the 2-by-3 design with the hypothesize helper
# (the preview below shows one column per cell, Cell_1_1 through Cell_2_3).
df=create_example_data(design_values=[2,3])

# Preview the first rows
df.head()

Unnamed: 0,Cell_1_1,Cell_1_2,Cell_1_3,Cell_2_1,Cell_2_2,Cell_2_3
0,0.017184,0.353532,0.703423,0.224846,0.81783,0.469761
1,0.773729,0.260461,0.907378,0.512377,0.320044,0.775811
2,0.592182,0.876341,0.773215,0.869646,0.873871,0.914144
3,0.606674,0.747438,0.432764,0.233671,0.297686,0.327816
4,0.349483,0.856925,0.092334,0.709812,0.746853,0.00701


#### Import the desired function and pass in the data
- This example uses a 2-by-3 design
- One approach is to use a set of linear contrasts that will test all main effects and interactions
- Then, the bootstrap-t method and the 20% trimmed mean can be used
- The results are a dictionary of DataFrames that contain various statistics for each factor and the interactions

In [6]:
from hypothesize.compare_groups_with_two_factors import bwmcp

# J=2 and K=3 correspond to the 2-by-3 design (presumably the number of levels
# of factor A and factor B — confirm against the bwmcp documentation).
# The result is a dictionary of DataFrames: 'factor_A', 'factor_B', 'factor_AB'.
results=bwmcp(J=2, K=3, x=df)

In [7]:
# Contrast results for factor A: estimate, standard error, test statistic,
# critical value and p-value
results['factor_A']

Unnamed: 0,con_num,psihat,se,test,crit_value,p_value
0,0.0,0.056109,0.172951,0.32442,2.106767,0.759599


In [8]:
# Contrast results for factor B: estimate, standard error, test statistic,
# critical value and p-value
results['factor_B']

Unnamed: 0,con_num,psihat,se,test,crit_value,p_value
0,0.0,-0.130234,0.136259,-0.955782,2.405083,0.33389
1,1.0,0.002572,0.149269,0.017233,2.405083,0.984975
2,2.0,0.132807,0.15136,0.877423,2.405083,0.390651


In [9]:
# Contrast results for the A-by-B interaction: estimate, standard error,
# test statistic, critical value and p-value
results['factor_AB']

Unnamed: 0,con_num,psihat,se,test,crit_value,p_value
0,0.0,-0.020531,0.136259,-0.150678,2.402193,0.888147
1,1.0,0.10743,0.149269,0.719705,2.402193,0.48581
2,2.0,0.127961,0.15136,0.845408,2.402193,0.409015


---

### How to compute a robust correlation

#### Load data from a CSV or create some random data

In [10]:
import pandas as pd

# Alternative data sources, left disabled. The second one also needs
# `import numpy as np`, which the visible cells of this notebook do not import.
#df=pd.read_csv("/home/allan/two_groups_data.csv")
#df=pd.DataFrame(np.random.rand(30, 2), columns=['Group_1', 'Group_2'])
# Default: build example data with the helper imported from hypothesize.utilities
# (the preview below shows the columns Group_1 and Group_2).
df=create_example_data(design_values=2)

# Preview the first rows
df.head()

Unnamed: 0,Group_1,Group_2
0,0.301979,0.387177
1,0.160594,0.194831
2,0.411265,0.251657
3,0.50872,0.968604
4,0.407827,0.468954


#### Import the desired function and pass in the data for each group
- One approach is to winsorize the x and y data
- A heteroscedastic method for testing zero correlation is also provided in this package but not shown here
  - Please see the function `corb` which uses the percentile bootstrap to compute a 1-alpha CI and p_value for any correlation
- The output is a dictionary containing various statistics (the winsorized correlation, winsorized covariance, etc...)

In [11]:
from hypothesize.measuring_associations import wincor

# Winsorized correlation between the two columns; the returned object is a
# dictionary of statistics.
results = wincor(df["Group_1"], df["Group_2"])

# Winsorized correlation coefficient
results["wcor"]

-0.163421476595891