In [1]:
import random
import numpy as np
import pandas as pd
from itertools import chain, combinations

In [2]:
# initial parameters
# Seed BOTH generators: the values below are drawn from NumPy's global RNG,
# which random.seed() alone does not affect.
random.seed(1234)
np.random.seed(1234)
num_ids = 1000
num_vars = 10

# create repeating IDs: every ID appears num_vars times, in ascending order
ids = sorted(list(range(0, num_ids)) * num_vars)

# create repeating variables: the run 0..num_vars-1 repeated once per ID
variables = list(range(0, num_vars)) * num_ids

# create random integers (np.random.randint's upper bound is exclusive,
# so the values fall in 1..4)
values = np.random.randint(1, 5, size = num_ids * num_vars)

In [3]:
# build the long-format frame (one row per id/variable pair) and
# order it by id, then by variable, in a single chained expression
df = (
    pd.DataFrame({"id": ids, "variable": variables, "values": values})
    .sort_values(['id', 'variable'])
)

In [4]:
def powerset(iterable):
    """
    Return an iterator over every subset of ``iterable`` as a tuple.

    Subsets are produced in order of increasing size, starting with the
    empty tuple and ending with the tuple holding every element.

    Thanks to https://stackoverflow.com/questions/1482308
    """
    # materialise the input once so it can be measured and re-used per size
    pool = tuple(iterable)
    sizes = range(len(pool) + 1)
    return chain.from_iterable(combinations(pool, size) for size in sizes)

In [5]:
# create every subset of variables; sort the unique variables first so the
# order of the subsets (and of the items inside each tuple) is deterministic
# instead of depending on set iteration order
subsets = list(powerset(sorted(set(variables))))

# how many items? (the count includes the empty subset)
len(subsets)

1024

In [6]:
# baseline score: mean of `values` per ID across every variable
actual = df['values'].groupby(df['id']).mean()

In [7]:
def calculate(df, subsets, actual=None):

    """
    Correlate per-ID mean scores from each variable subset with the
    per-ID mean scores from the complete set of variables.

    1. Iterate over each subset of variables
    2. Subset the dataframe to only include those variables
    3. Group by ID and recalculate the mean values per ID
    4. Measure correlation compared to complete set of variables

    Parameters
    ----------
    df : DataFrame with 'id', 'variable' and 'values' columns.
    subsets : iterable of hashable collections (e.g. tuples) of variables;
        empty subsets are skipped.
    actual : optional Series of baseline scores indexed by ID. When omitted
        it is computed from ``df`` as the mean of 'values' per 'id', so the
        function does not depend on a notebook-global variable.

    Returns
    -------
    dict mapping each non-empty subset to
    ``{'num_items': <subset size>, 'correlation': <corr with baseline>}``.
    """

    # derive the baseline from the data that was passed in rather than
    # relying on a global defined in another cell
    if actual is None:
        actual = df.groupby('id')['values'].mean()

    # create a dictionary to hold the results
    results = {}

    # iterate over each subset
    for s in subsets:

        # make sure there is at least 1 variable...
        if len(s) == 0:
            continue

        # filter the dataframe to only the variables in the subset
        sub = df[df['variable'].isin(s)]

        # group by ID and calculate average value
        scores = sub.groupby('id')['values'].mean()

        # calculate correlation with complete set of variables
        # and add results to dictionary
        results[s] = {'num_items': len(s),
                      'correlation': actual.corr(scores)}

    return results

In [8]:
# time how long it takes to run
%timeit results = calculate(df, subsets)

2.26 s ± 7.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
