Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dabest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@
from ._stats_tools import effsize as effsize
from ._classes import TwoGroupsEffectSize

__version__ = "0.2.4"
__version__ = "0.2.5"
42 changes: 33 additions & 9 deletions dabest/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,24 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
if all([isinstance(i, str) for i in idx]):
# flatten out idx.
all_plot_groups = pd.unique([t for t in idx]).tolist()
if len(idx) > len(all_plot_groups):
err0 = '`idx` contains duplicated groups. Please remove any duplicates and try again.'
raise ValueError(err0)

# We need to re-wrap this idx inside another tuple so as to
# easily loop thru each pairwise group later on.
self.__idx = (idx,)

elif all([isinstance(i, (tuple, list)) for i in idx]):
all_plot_groups = pd.unique([tt for t in idx for tt in t]).tolist()

actual_groups_given = sum([len(i) for i in idx])

if actual_groups_given > len(all_plot_groups):
err0 = 'Groups are repeated across tuples,'
err1 = ' or a tuple has repeated groups in it.'
err2 = ' Please remove any duplicates and try again.'
raise ValueError(err0 + err1 + err2)

else: # mix of string and tuple?
err = 'There seems to be a problem with the idx you'
Expand Down Expand Up @@ -91,9 +103,14 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
# check all the idx can be found in data_in[x]
for g in all_plot_groups:
if g not in data_in[x].unique():
raise IndexError('{0} is not a group in `{1}`.'.format(g, x))
err0 = '"{0}" is not a group in the column `{1}`.'.format(g, x)
err1 = " Please check `idx` and try again."
raise IndexError(err0 + err1)

# Select only rows where the value in the `x` column
# is found in `idx`.
plot_data = data_in[data_in.loc[:, x].isin(all_plot_groups)].copy()

# plot_data.drop("index", inplace=True, axis=1)

# Assign attributes
Expand All @@ -113,8 +130,10 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
# First, check we have all columns in the dataset.
for g in all_plot_groups:
if g not in data_in.columns:
raise IndexError('{0} is not a column in `data`.'.format(g))

err0 = '"{0}" is not a column in `data`.'.format(g)
err1 = " Please check `idx` and try again."
raise IndexError(err0 + err1)

set_all_columns = set(data_in.columns.tolist())
set_all_plot_groups = set(all_plot_groups)
id_vars = set_all_columns.difference(set_all_plot_groups)
Expand All @@ -139,8 +158,8 @@ def __init__(self, data, idx, x, y, paired, id_col, ci, resamples,
categories=all_plot_groups,
ordered=True)

# Line 143 added in v0.2.4.
plot_data.dropna(inplace=True)
# # The line below was added in v0.2.4, removed in v0.2.5.
# plot_data.dropna(inplace=True)

self.__plot_data = plot_data

Expand Down Expand Up @@ -990,8 +1009,11 @@ def __pre_calc(self):
self.__random_seed)
r_dict = result.to_dict()

r_dict["control"] = cname
r_dict["test"] = tname
r_dict["control"] = cname
r_dict["test"] = tname
r_dict["control_N"] = int(len(control))
r_dict["test_N"] = int(len(test))

out.append(r_dict)

if j == len(idx)-1 and ix == len(current_tuple)-2:
Expand Down Expand Up @@ -1020,7 +1042,8 @@ def __pre_calc(self):

out_ = pd.DataFrame(out)

columns_in_order = ['control', 'test', 'effect_size', 'is_paired',
columns_in_order = ['control', 'test', 'control_N', 'test_N',
'effect_size', 'is_paired',
'difference', 'ci',

'bca_low', 'bca_high', 'bca_interval_idx',
Expand Down Expand Up @@ -1256,7 +1279,8 @@ def statistical_tests(self):
stats_columns = [c for c in results_df.columns
if c.startswith("statistic") or c.startswith("pvalue")]

default_cols = ['control', 'test', 'effect_size', 'is_paired',
default_cols = ['control', 'test', 'control_N', 'test_N',
'effect_size', 'is_paired',
'difference', 'ci', 'bca_low', 'bca_high']

cols_of_interest = default_cols + stats_columns
Expand Down
51 changes: 29 additions & 22 deletions dabest/_stats_tools/confint_1group.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,28 @@
# -*-coding: utf-8 -*-
# Author: Joses Ho
# Email : joseshowh@gmail.com
"""
A range of functions to compute bootstraps for a single sample.
"""

def create_bootstrap_indexes(array, resamples=5000, random_seed=12345):
    """Create bootstrap resampling indexes for *array*.

    Parameters
    ----------
    array : array-like
        The sample to be resampled; only its length is used.
    resamples : int, default 5000
        Number of bootstrap index arrays to produce.
    random_seed : int, default 12345
        Seed for numpy's global RNG, so the resampling is reproducible.

    Returns
    -------
    list of ndarray
        ``resamples`` arrays, each holding ``len(array)`` indexes drawn
        with replacement from ``range(len(array))``.
    """
    import numpy as np

    # Set seed.
    np.random.seed(random_seed)

    indexes = range(0, len(array))

    # BUGFIX: draw the indexes eagerly. The previous lazy generator only
    # sampled when it was consumed -- i.e. *after* the seed reset below --
    # so `random_seed` had no effect and the bootstrap was irreproducible.
    # A list is still iterable, so callers are unaffected.
    out = [np.random.choice(indexes, len(indexes), replace=True)
           for _ in range(resamples)]

    # Reset seed so later, unrelated numpy calls are not deterministic.
    np.random.seed()

    return out

def compute_1group_jackknife(x, func, *args, **kwargs):
"""
Expand All @@ -20,22 +40,6 @@ def compute_1group_jackknife(x, func, *args, **kwargs):
def compute_1group_acceleration(jack_dist):
    """Return the BCa acceleration constant for a jackknife distribution.

    Delegates to the shared helper in ``confint_2group_diff``.
    """
    from . import confint_2group_diff as ci2g

    return ci2g._calc_accel(jack_dist)



def _create_bootstrap_indexes(array, resamples=5000):
"""Given an array-like, returns a generator of bootstrap indexes
to be used for resampling.
"""
import numpy as np

indexes = range(0, len(array))

out = (np.random.choice(indexes, len(indexes), replace=True)
for i in range(0, resamples))

return out




Expand All @@ -49,8 +53,9 @@ def compute_1group_bootstraps(x, func, resamples=5000, random_seed=12345,
np.random.seed(random_seed)

# Create bootstrap indexes.
boot_indexes = _create_bootstrap_indexes(x, resamples)

boot_indexes = create_bootstrap_indexes(x, resamples=resamples,
random_seed=random_seed)

out = [func(x[b], *args, **kwargs) for b in boot_indexes]

del boot_indexes
Expand Down Expand Up @@ -123,11 +128,13 @@ def summary_ci_1group(x, func, resamples=5000, alpha=0.05, random_seed=12345,
from . import confint_2group_diff as ci2g
from numpy import sort as npsort

boots = compute_1group_bootstraps(x, func, resamples, random_seed)
bias = compute_1group_bias_correction(x, boots, func)
boots = compute_1group_bootstraps(x, func, resamples=resamples,
random_seed=random_seed,
*args, **kwargs)
bias = compute_1group_bias_correction(x, boots, func)

jk = compute_1group_jackknife(x, func)
accel = ci2g._calc_accel(jk)
jk = compute_1group_jackknife(x, func, *args, **kwargs)
accel = compute_1group_acceleration(jk)
del jk

ci_idx = ci2g.compute_interval_limits(bias, accel, resamples, alpha)
Expand Down
49 changes: 43 additions & 6 deletions dabest/_stats_tools/confint_2group_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
# -*-coding: utf-8 -*-
# Author: Joses Ho
# Email : joseshowh@gmail.com


"""
A range of functions to compute bootstraps for the mean difference
between two groups.
"""

def create_jackknife_indexes(data):
"""
Expand Down Expand Up @@ -103,9 +105,34 @@ def _calc_accel(jack_dist):



# def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
# resamples=5000, random_seed=12345):
# """Bootstraps the effect_size for 2 groups."""
# from . import effsize as __es
# import numpy as np
#
# np.random.seed(random_seed)
#
# out = np.repeat(np.nan, resamples)
# x0_len = len(x0)
# x1_len = len(x1)
#
# for i in range(int(resamples)):
# x0_boot = np.random.choice(x0, x0_len, replace=True)
# x1_boot = np.random.choice(x1, x1_len, replace=True)
# out[i] = __es.two_group_difference(x0_boot, x1_boot,
# is_paired, effect_size)
#
# # reset seed
# np.random.seed()
#
# return out


def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
resamples=5000, random_seed=12345):
"""Bootstraps the effect_size for 2 groups."""

from . import effsize as __es
import numpy as np

Expand All @@ -114,11 +141,20 @@ def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
out = np.repeat(np.nan, resamples)
x0_len = len(x0)
x1_len = len(x1)

for i in range(int(resamples)):
x0_boot = np.random.choice(x0, x0_len, replace=True)
x1_boot = np.random.choice(x1, x1_len, replace=True)
out[i] = __es.two_group_difference(x0_boot, x1_boot,

if is_paired:
if x0_len != x1_len:
raise ValueError("The two arrays do not have the same length.")
random_idx = np.random.choice(x0_len, x0_len, replace=True)
x0_sample = x0[random_idx]
x1_sample = x1[random_idx]
else:
x0_sample = np.random.choice(x0, x0_len, replace=True)
x1_sample = np.random.choice(x1, x1_len, replace=True)

out[i] = __es.two_group_difference(x0_sample, x1_sample,
is_paired, effect_size)

# reset seed
Expand All @@ -128,6 +164,7 @@ def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,




def compute_meandiff_bias_correction(bootstraps, effsize):
"""
Computes the bias correction required for the BCa method
Expand Down
19 changes: 8 additions & 11 deletions dabest/plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,16 +670,13 @@ def EffectSizeDataFramePlotter(EffectSizeDataFrame, **plot_kwargs):




# Place raw axes y-label.
if plot_kwargs['swarm_label'] is not None:
swarm_label = plot_kwargs['swarm_label']
else:
# Set raw axes y-label.
swarm_label = plot_kwargs['swarm_label']
if swarm_label is None and yvar is None:
swarm_label = "value"
elif swarm_label is None and yvar is not None:
swarm_label = yvar
rawdata_axes.set_ylabel(swarm_label)




# Place contrast axes y-label.
contrast_label_dict = {'mean_diff' : "mean difference",
'median_diff' : "median difference",
Expand All @@ -702,8 +699,8 @@ def EffectSizeDataFramePlotter(EffectSizeDataFrame, **plot_kwargs):
contrast_axes.yaxis.set_label_position("right")


# Set the rawdata axes labels appropriately
rawdata_axes.set_ylabel(plot_kwargs["swarm_label"])
# Set the rawdata axes labels appropriately
rawdata_axes.set_ylabel(swarm_label)
rawdata_axes.set_xlabel("")


Expand Down
43 changes: 43 additions & 0 deletions dabest/tests/test_02_edge_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/python
# -*-coding: utf-8 -*-
# Author: Joses Ho
# Email : joseshowh@gmail.com


import sys
import numpy as np
import scipy as sp
import pytest
import pandas as pd
from .._api import load



def test_unrelated_columns(N=60, random_seed=12345):
    """
    Check that columns unrelated to `x`, `y`, and `idx` -- including an
    all-NaN column -- do not jam up the analysis.
    See https://github.com/ACCLAB/DABEST-python/issues/44.

    Added in v0.2.5.
    """
    np.random.seed(random_seed)

    df = pd.DataFrame(
        {'groups': np.random.choice(['Group 1', 'Group 2', 'Group 3'], size=(N,)),
         'color' : np.random.choice(['green', 'red', 'purple'], size=(N,)),
         'value': np.random.random(size=(N,))})

    np.random.seed()

    # A column that is entirely NaN but irrelevant to the analysis.
    df['unrelated'] = np.nan

    analysis = load(data=df, x='groups', y='value',
                    idx=['Group 1', 'Group 2'])

    results = analysis.mean_diff.results

    assert results.difference[0] == pytest.approx(0.1115, abs=1e-6)
    assert results.bca_low[0] == pytest.approx(-0.042835, abs=1e-6)
    assert results.bca_high[0] == pytest.approx(0.264542, abs=1e-6)
File renamed without changes.
Loading