Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dabest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@
from ._stats_tools import effsize as effsize
from ._classes import TwoGroupsEffectSize

__version__ = "0.2.4"
__version__ = "0.2.5"
42 changes: 33 additions & 9 deletions dabest/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,24 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
if all([isinstance(i, str) for i in idx]):
# flatten out idx.
all_plot_groups = pd.unique([t for t in idx]).tolist()
if len(idx) > len(all_plot_groups):
err0 = '`idx` contains duplicated groups. Please remove any duplicates and try again.'
raise ValueError(err0)

# We need to re-wrap this idx inside another tuple so as to
# easily loop thru each pairwise group later on.
self.__idx = (idx,)

elif all([isinstance(i, (tuple, list)) for i in idx]):
all_plot_groups = pd.unique([tt for t in idx for tt in t]).tolist()

actual_groups_given = sum([len(i) for i in idx])

if actual_groups_given > len(all_plot_groups):
err0 = 'Groups are repeated across tuples,'
err1 = ' or a tuple has repeated groups in it.'
err2 = ' Please remove any duplicates and try again.'
raise ValueError(err0 + err1 + err2)

else: # mix of string and tuple?
err = 'There seems to be a problem with the idx you'
Expand Down Expand Up @@ -91,9 +103,14 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
# check all the idx can be found in data_in[x]
for g in all_plot_groups:
if g not in data_in[x].unique():
raise IndexError('{0} is not a group in `{1}`.'.format(g, x))
err0 = '"{0}" is not a group in the column `{1}`.'.format(g, x)
err1 = " Please check `idx` and try again."
raise IndexError(err0 + err1)

# Select only rows where the value in the `x` column
# is found in `idx`.
plot_data = data_in[data_in.loc[:, x].isin(all_plot_groups)].copy()

# plot_data.drop("index", inplace=True, axis=1)

# Assign attributes
Expand All @@ -113,8 +130,10 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
# First, check we have all columns in the dataset.
for g in all_plot_groups:
if g not in data_in.columns:
raise IndexError('{0} is not a column in `data`.'.format(g))

err0 = '"{0}" is not a column in `data`.'.format(g)
err1 = " Please check `idx` and try again."
raise IndexError(err0 + err1)

set_all_columns = set(data_in.columns.tolist())
set_all_plot_groups = set(all_plot_groups)
id_vars = set_all_columns.difference(set_all_plot_groups)
Expand All @@ -139,8 +158,8 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
categories=all_plot_groups,
ordered=True)

# Line 143 added in v0.2.4.
plot_data.dropna(inplace=True)
# # The line below was added in v0.2.4, removed in v0.2.5.
# plot_data.dropna(inplace=True)

self.__plot_data = plot_data

Expand Down Expand Up @@ -990,8 +1009,11 @@ def __pre_calc(self):
self.__random_seed)
r_dict = result.to_dict()

r_dict["control"] = cname
r_dict["test"] = tname
r_dict["control"] = cname
r_dict["test"] = tname
r_dict["control_N"] = int(len(control))
r_dict["test_N"] = int(len(test))

out.append(r_dict)

if j == len(idx)-1 and ix == len(current_tuple)-2:
Expand Down Expand Up @@ -1020,7 +1042,8 @@ def __pre_calc(self):

out_ = pd.DataFrame(out)

columns_in_order = ['control', 'test', 'effect_size', 'is_paired',
columns_in_order = ['control', 'test', 'control_N', 'test_N',
'effect_size', 'is_paired',
'difference', 'ci',

'bca_low', 'bca_high', 'bca_interval_idx',
Expand Down Expand Up @@ -1256,7 +1279,8 @@ def statistical_tests(self):
stats_columns = [c for c in results_df.columns
if c.startswith("statistic") or c.startswith("pvalue")]

default_cols = ['control', 'test', 'effect_size', 'is_paired',
default_cols = ['control', 'test', 'control_N', 'test_N',
'effect_size', 'is_paired',
'difference', 'ci', 'bca_low', 'bca_high']

cols_of_interest = default_cols + stats_columns
Expand Down
51 changes: 29 additions & 22 deletions dabest/_stats_tools/confint_1group.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,28 @@
# -*-coding: utf-8 -*-
# Author: Joses Ho
# Email : joseshowh@gmail.com
"""
A range of functions to compute bootstraps for a single sample.
"""

def create_bootstrap_indexes(array, resamples=5000, random_seed=12345):
    """Create bootstrap resampling indexes for *array*.

    Parameters
    ----------
    array : array-like
        The sample to be resampled; only its length is used.
    resamples : int, default 5000
        Number of bootstrap index arrays to produce.
    random_seed : int, default 12345
        Seed for numpy's global RNG, so the resampling is reproducible.

    Returns
    -------
    list of ndarray
        ``resamples`` arrays, each holding ``len(array)`` indexes drawn
        with replacement from ``range(len(array))``.
    """
    import numpy as np

    # Set seed.
    np.random.seed(random_seed)

    indexes = range(0, len(array))

    # BUGFIX: draw the indexes eagerly. The previous lazy generator only
    # sampled when it was consumed -- i.e. *after* the seed reset below --
    # so `random_seed` had no effect and the bootstrap was irreproducible.
    # A list is still iterable, so callers are unaffected.
    out = [np.random.choice(indexes, len(indexes), replace=True)
           for _ in range(resamples)]

    # Reset seed so later, unrelated numpy calls are not deterministic.
    np.random.seed()

    return out

def compute_1group_jackknife(x, func, *args, **kwargs):
"""
Expand All @@ -20,22 +40,6 @@ def compute_1group_jackknife(x, func, *args, **kwargs):
def compute_1group_acceleration(jack_dist):
    """Return the BCa acceleration constant for a jackknife distribution.

    Delegates to the shared helper in ``confint_2group_diff``.
    """
    from . import confint_2group_diff as ci2g

    return ci2g._calc_accel(jack_dist)



def _create_bootstrap_indexes(array, resamples=5000):
"""Given an array-like, returns a generator of bootstrap indexes
to be used for resampling.
"""
import numpy as np

indexes = range(0, len(array))

out = (np.random.choice(indexes, len(indexes), replace=True)
for i in range(0, resamples))

return out




Expand All @@ -49,8 +53,9 @@ def compute_1group_bootstraps(x, func, resamples=5000, random_seed=12345,
np.random.seed(random_seed)

# Create bootstrap indexes.
boot_indexes = _create_bootstrap_indexes(x, resamples)

boot_indexes = create_bootstrap_indexes(x, resamples=resamples,
random_seed=random_seed)

out = [func(x[b], *args, **kwargs) for b in boot_indexes]

del boot_indexes
Expand Down Expand Up @@ -123,11 +128,13 @@ def summary_ci_1group(x, func, resamples=5000, alpha=0.05, random_seed=12345,
from . import confint_2group_diff as ci2g
from numpy import sort as npsort

boots = compute_1group_bootstraps(x, func, resamples, random_seed)
bias = compute_1group_bias_correction(x, boots, func)
boots = compute_1group_bootstraps(x, func, resamples=resamples,
random_seed=random_seed,
*args, **kwargs)
bias = compute_1group_bias_correction(x, boots, func)

jk = compute_1group_jackknife(x, func)
accel = ci2g._calc_accel(jk)
jk = compute_1group_jackknife(x, func, *args, **kwargs)
accel = compute_1group_acceleration(jk)
del jk

ci_idx = ci2g.compute_interval_limits(bias, accel, resamples, alpha)
Expand Down
49 changes: 43 additions & 6 deletions dabest/_stats_tools/confint_2group_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
# -*-coding: utf-8 -*-
# Author: Joses Ho
# Email : joseshowh@gmail.com


"""
A range of functions to compute bootstraps for the mean difference
between two groups.
"""

def create_jackknife_indexes(data):
"""
Expand Down Expand Up @@ -103,9 +105,34 @@ def _calc_accel(jack_dist):



# def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
# resamples=5000, random_seed=12345):
# """Bootstraps the effect_size for 2 groups."""
# from . import effsize as __es
# import numpy as np
#
# np.random.seed(random_seed)
#
# out = np.repeat(np.nan, resamples)
# x0_len = len(x0)
# x1_len = len(x1)
#
# for i in range(int(resamples)):
# x0_boot = np.random.choice(x0, x0_len, replace=True)
# x1_boot = np.random.choice(x1, x1_len, replace=True)
# out[i] = __es.two_group_difference(x0_boot, x1_boot,
# is_paired, effect_size)
#
# # reset seed
# np.random.seed()
#
# return out


def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
resamples=5000, random_seed=12345):
"""Bootstraps the effect_size for 2 groups."""

from . import effsize as __es
import numpy as np

Expand All @@ -114,11 +141,20 @@ def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
out = np.repeat(np.nan, resamples)
x0_len = len(x0)
x1_len = len(x1)

for i in range(int(resamples)):
x0_boot = np.random.choice(x0, x0_len, replace=True)
x1_boot = np.random.choice(x1, x1_len, replace=True)
out[i] = __es.two_group_difference(x0_boot, x1_boot,

if is_paired:
if x0_len != x1_len:
raise ValueError("The two arrays do not have the same length.")
random_idx = np.random.choice(x0_len, x0_len, replace=True)
x0_sample = x0[random_idx]
x1_sample = x1[random_idx]
else:
x0_sample = np.random.choice(x0, x0_len, replace=True)
x1_sample = np.random.choice(x1, x1_len, replace=True)

out[i] = __es.two_group_difference(x0_sample, x1_sample,
is_paired, effect_size)

# reset seed
Expand All @@ -128,6 +164,7 @@ def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,




def compute_meandiff_bias_correction(bootstraps, effsize):
"""
Computes the bias correction required for the BCa method
Expand Down
19 changes: 8 additions & 11 deletions dabest/plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,16 +670,13 @@ def EffectSizeDataFramePlotter(EffectSizeDataFrame, **plot_kwargs):




# Place raw axes y-label.
if plot_kwargs['swarm_label'] is not None:
swarm_label = plot_kwargs['swarm_label']
else:
# Set raw axes y-label.
swarm_label = plot_kwargs['swarm_label']
if swarm_label is None and yvar is None:
swarm_label = "value"
elif swarm_label is None and yvar is not None:
swarm_label = yvar
rawdata_axes.set_ylabel(swarm_label)




# Place contrast axes y-label.
contrast_label_dict = {'mean_diff' : "mean difference",
'median_diff' : "median difference",
Expand All @@ -702,8 +699,8 @@ def EffectSizeDataFramePlotter(EffectSizeDataFrame, **plot_kwargs):
contrast_axes.yaxis.set_label_position("right")


# Set the rawdata axes labels appropriately
rawdata_axes.set_ylabel(plot_kwargs["swarm_label"])
# Set the rawdata axes labels appropriately
rawdata_axes.set_ylabel(swarm_label)
rawdata_axes.set_xlabel("")


Expand Down
43 changes: 43 additions & 0 deletions dabest/tests/test_02_edge_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/python
# -*-coding: utf-8 -*-
# Author: Joses Ho
# Email : joseshowh@gmail.com


import sys
import numpy as np
import scipy as sp
import pytest
import pandas as pd
from .._api import load



def test_unrelated_columns(N=60, random_seed=12345):
    """
    Check that columns unrelated to `x`, `y`, and `idx` -- including an
    all-NaN column -- do not jam up the analysis.
    See https://github.com/ACCLAB/DABEST-python/issues/44.

    Added in v0.2.5.
    """
    np.random.seed(random_seed)

    df = pd.DataFrame(
        {'groups': np.random.choice(['Group 1', 'Group 2', 'Group 3'], size=(N,)),
         'color' : np.random.choice(['green', 'red', 'purple'], size=(N,)),
         'value': np.random.random(size=(N,))})

    np.random.seed()

    # A column that is entirely NaN but irrelevant to the analysis.
    df['unrelated'] = np.nan

    analysis = load(data=df, x='groups', y='value',
                    idx=['Group 1', 'Group 2'])

    results = analysis.mean_diff.results

    assert results.difference[0] == pytest.approx(0.1115, abs=1e-6)
    assert results.bca_low[0] == pytest.approx(-0.042835, abs=1e-6)
    assert results.bca_high[0] == pytest.approx(0.264542, abs=1e-6)
File renamed without changes.
Loading