In [1]:
import random
import numpy as np
import pandas as pd
from itertools import chain, combinations

In [2]:
# initial parameters
seed = 1234
num_ids = 10000
num_vars = 10

# Seed BOTH generators: the values below are drawn with NumPy, whose global
# generator is independent of the standard-library `random` module, so
# seeding only `random` would leave the data different on every run.
random.seed(seed)
np.random.seed(seed)

# create repeating IDs: each ID appears num_vars times in a row
# (0, 0, ..., 1, 1, ...); built directly in order, so no sort is needed
ids = [i for i in range(num_ids) for _ in range(num_vars)]

# create repeating variables: the labels 0..num_vars-1 cycle once per ID
variables = list(range(num_vars)) * num_ids

# create random integers in 1..4 (the upper bound of randint is exclusive)
values = np.random.randint(1, 5, size=num_ids * num_vars)

In [3]:
# assemble the long-format table (columns: id, variable, values) and return
# it ordered by id first, then by variable
df = pd.DataFrame(
    {"id": ids, "variable": variables, "values": values}
).sort_values(['id', 'variable'])

In [4]:
def powerset(iterable):
    """
    Return a lazy iterator over every subset of ``iterable``, as tuples.

    Subsets come out in order of increasing size, starting with the empty
    tuple and ending with the tuple holding every item. The input is copied
    into a list first, so one-shot iterators are accepted.

    Thanks to https://stackoverflow.com/questions/1482308
    """
    items = list(iterable)
    subset_sizes = range(len(items) + 1)
    return chain.from_iterable(
        map(lambda size: combinations(items, size), subset_sizes)
    )

In [5]:
# create every subset of variables
subsets = list(powerset(set(variables)))

# how many items?
len(subsets)

1024

In [9]:
def calculate(df, variables):
    """
    Average the 'values' column per ID, using only the chosen variables.

    Parameters
    ----------
    df : frame with 'id', 'variable' and 'values' columns
        (the code assumes a pandas DataFrame).
    variables : collection of variable labels to keep; rows whose
        'variable' is not in it are ignored.

    Returns
    -------
    The per-'id' mean of 'values' over the retained rows.
    """
    # boolean mask marking the rows that belong to the requested variables
    keep = df['variable'].isin(variables)

    # mean value per ID over the retained rows only
    return df.loc[keep].groupby('id')['values'].mean()

In [10]:
def correlalate(actual, subset):
    """
    Return the correlation of ``actual`` with ``subset``.

    Delegates to ``actual.corr(subset)``; for pandas Series this is the
    Pearson coefficient unless another method is requested.
    """
    coefficient = actual.corr(subset)
    return coefficient

In [12]:
# calculate actual values: per-ID average over every variable
all_variables = sorted(set(variables))
actual = calculate(df, all_variables)

# per-ID average over the five smallest variable labels only; sorting first
# makes the choice explicit instead of depending on set iteration order
subset = calculate(df, all_variables[:5])

# correlation between the all-variable scores and the five-variable scores
cor = correlalate(actual, subset)

# end on the bare expression so the notebook displays the coefficient
cor

0.7091866206463525

In [13]:
%%time

# collect one (subset, correlation) tuple per non-empty subset
results = []

for s in subsets:

    # the empty subset selects no variables, so skip it
    if len(s) == 0:
        continue

    # per-ID averages restricted to the variables in this subset
    sub = calculate(df, s)

    # correlation of those averages with the scores stored in `actual`
    corr = correlalate(actual, sub)

    # store the pair; `results` is a list of tuples
    results.append((s, corr))

CPU times: user 5.36 s, sys: 12 ms, total: 5.38 s
Wall time: 5.38 s
