In [64]:
import pandas as pd
# stats
from scipy.stats import mannwhitneyu
from statsmodels.sandbox.stats.multicomp import multipletests
from scipy.stats import chisquare
from collections import Counter

In [58]:
def get_data(paths, groupby=None, classes=None, rel_cols=None, sep=","):
    """Load one or more CSV files and return the dataframes to be compared.

    Two modes are supported:

    * ``groupby`` given: ``paths`` must name exactly one file. The file is read
      and split into one dataframe per distinct value of the ``groupby`` column.
    * ``groupby`` not given: every file in ``paths`` is read into its own
      dataframe.

    In both modes the first CSV column becomes the index, ``sep`` is used as the
    field delimiter, and the result is restricted to ``rel_cols`` if supplied.

    :param paths: List of CSV file paths. A single path (string or path-like
                  object) is accepted as well.
    :param groupby: Name of the column which specifies the classes to compare
                    to each other (e.g. sampling site).
    :param classes: Optional names for the returned dataframes. If omitted, the
                    names "df1", "df2", ... are generated.
    :param rel_cols: Columns to keep in every returned dataframe. All columns
                     are kept if omitted.
    :param sep: Field delimiter passed to ``pandas.read_csv``.
    :return: Tuple ``(dfs, df_names)``: the list of dataframes and the list of
             their names.
    :raises ValueError: If ``groupby`` is given but ``paths`` does not contain
                        exactly one file.
    """
    # Allow a bare path instead of a one-element list.
    if isinstance(paths, str) or hasattr(paths, "__fspath__"):
        paths = [paths]
    else:
        paths = list(paths)

    dfs = []

    if groupby:
        if len(paths) != 1:
            raise ValueError(
                "groupby requires exactly one input file, got %d" % len(paths)
            )

        data = pd.read_csv(paths[0], index_col=0, sep=sep)
        # Consider all columns as relevant if no rel_cols are given.
        cols = list(data) if rel_cols is None else rel_cols

        # Split the dataframe into its groups; one dataframe per group.
        for _, grp in data.groupby(groupby):
            dfs.append(grp[cols])
    else:
        for path in paths:
            df = pd.read_csv(path, index_col=0, sep=sep)
            if rel_cols is not None:
                df = df[rel_cols]
            dfs.append(df)

    if classes:
        df_names = classes
    else:
        df_names = ["df" + str(x) for x in range(1, len(dfs) + 1)]

    return dfs, df_names

In [52]:
def create_zipper(dfs, feats=None):
    """Group the non-missing values of each feature across all dataframes.

    The result maps every feature name to a tuple holding one list per
    dataframe, in the order of ``dfs``:
    ``{feat1: (df1_feat1, df2_feat1, ...), feat2: (df1_feat2, df2_feat2, ...)}``.

    :param dfs: Sequence of dataframes that share the features of interest.
    :param feats: Features (column names) to collect. Defaults to all columns
                  of the first dataframe.
    :return: Dict from feature name to a tuple of value lists.
    """
    columns = feats if feats is not None else list(dfs[0])

    # One row per dataframe; each row holds the cleaned values of every column.
    per_frame_values = []
    for frame in dfs:
        cleaned = []
        for column in columns:
            cleaned.append(list(frame[column].dropna()))
        per_frame_values.append(cleaned)

    # Transpose so the values are keyed by column instead of by dataframe.
    zipper = {}
    for column, values in zip(columns, zip(*per_frame_values)):
        zipper[column] = values
    return zipper

In [72]:
def test_num_dist(zipper, feats=None):
    """Perform a hypothesis test to check if the distributions vary signifcantly from each other"""
    p_values = dict()

    if feats is None:
        feats = zipper.keys()

    for feat in feats:  # run through all variables
        # initiate dict in dict for d1 vs d2, d2 vs d3 etc. per feature
        p_values[feat] = dict() 
        
        for i in range(len(zipper[feat]) - 1):  # select dataset1
            for j in range(i + 1, len(zipper[feat])):  # select dataset2
                # calculate u statistic and return p-value
                z = mannwhitneyu(zipper[feat][i], zipper[feat][j], alternative="two-sided")
                p_values[feat][i,j] = z.pvalue

    return p_values

In [59]:
# Load the CSV and split it into one dataframe per distinct value of the
# "DD01" column, keeping only the five IM01_* columns in each group.
# NOTE(review): the path is absolute and machine-specific; the cell only runs
# where this file exists at exactly this location.
dfs, df_names = get_data(["/home/colin/git/DataComp/niklas_test.csv"], groupby="DD01",
                         rel_cols=["IM01_01", "IM01_02", "IM01_03", "IM01_04", "IM01_05"])

In [61]:
# Collect, for every column, the NaN-free values of each group dataframe.
zipper = create_zipper(dfs)

In [73]:
# Pairwise two-sided Mann-Whitney U p-values per column; a key (i, j) refers to
# the positions of the two compared dataframes in `dfs`. The values are raw
# p-values as returned by the test.
test_num_dist(zipper)

{'IM01_01': {(0, 1): 0.93865022890263994,
  (0, 2): 0.33944697489739506,
  (1, 2): 0.41130514261053808},
 'IM01_02': {(0, 1): 0.087461416429921612,
  (0, 2): 0.086044407605177342,
  (1, 2): 0.10778634193843502},
 'IM01_03': {(0, 1): 0.72883092376094039,
  (0, 2): 0.7901927340621856,
  (1, 2): 0.87621447991874535},
 'IM01_04': {(0, 1): 0.026940228791894349,
  (0, 2): 0.55080947093197574,
  (1, 2): 0.42257809472985364},
 'IM01_05': {(0, 1): 0.28203340227206108,
  (0, 2): 0.15772935376007924,
  (1, 2): 0.16420346437619715}}