In [106]:
try:
    import davos
except:
    %pip install davos
    import davos

davos.config.suppress_stdout = False

In [107]:
smuggle pandas as pd # pip: pandas==1.5.1
smuggle numpy as np # pip: numpy==1.22.3
smuggle seaborn as sns # pip: seaborn==0.12.1
smuggle dill as pickle # pip: dill==0.3.6

smuggle h5py # pip: h5py==3.7.0
smuggle pathos # pip: pathos==0.3.0
smuggle quail # pip: quail==0.2.2
smuggle requests # pip: requests==2.28.1
smuggle os
smuggle warnings
smuggle string
from copy smuggle copy

from tqdm smuggle tqdm # pip: tqdm==4.64.1
from matplotlib smuggle pyplot as plt #pip: matplotlib==3.6.2
from matplotlib.ticker smuggle MaxNLocator
from pathos.multiprocessing smuggle ProcessingPool as Pool # pip: pathos==0.3.0
from multiprocessing smuggle cpu_count  # pip: multiprocess==0.70.14
from sklearn.decomposition smuggle IncrementalPCA as PCA # pip: scikit-learn==1.1.3
from scipy smuggle stats # pip: scipy==1.10.0

# local functions
from dataloader import datadir, grouping, feature_groupings, descriptions, sort_by_grouping, fetch_data
from analyze import analyze_data, recover_fingerprint_features, organize_by_listgroup, random, adaptive, non_adaptive_exclude_random, \
                    select_conds, select_lists, filter, get_diffs, stack_diffs, pnr_matrix, accuracy2df, adaptive_listnum2cond, \
                    clustering_matrices, average_by_cond, rename_features, fingerprint2temporal, get_boundaries, \
                    recall_accuracy_near_boundaries, results, results_by_list, analyses, listgroups, orders, ttest, merge_results, \
                    create_clustering_df

# Performance on *feature rich* versus *reduced* lists:
  - accuracy
  - temporal clustering
  - non-visual feature-based clustering (category, size, length, first letter)

In [108]:
print('Accuracy for feature rich vs. reduced (all lists):')
ttest(results['accuracy']['feature rich'], results['accuracy']['reduced'])

Accuracy for feature rich vs. reduced (all lists):
t(126) = -0.290, p = 0.772, d = -0.051, CI = [-2.263, 1.614]


In [109]:
print('Temporal clustering for feature rich vs. reduced (all lists):')
ttest(results['fingerprint']['feature rich'], results['fingerprint']['reduced'], x_col='temporal', y_col='temporal')

Temporal clustering for feature rich vs. reduced (all lists):
t(126) = 10.632, p < 0.001, d = 1.882, CI = [7.890, 14.056]


In [110]:
features = ['category', 'size', 'wordLength', 'firstLetter']
for i, f in enumerate(features):
    print(f'{f} clustering for feature rich vs. reduced (all lists):')
    ttest(results['fingerprint']['feature rich'], results['fingerprint']['reduced'], x_col=f, y_col=f)

    if i < len(features) - 1:
        print('\n')

category clustering for feature rich vs. reduced (all lists):
t(126) = 10.148, p < 0.001, d = 1.796, CI = [7.426, 13.529]


size clustering for feature rich vs. reduced (all lists):
t(126) = 12.033, p < 0.001, d = 2.129, CI = [9.151, 15.557]


wordLength clustering for feature rich vs. reduced (all lists):
t(126) = 10.720, p < 0.001, d = 1.897, CI = [7.091, 15.203]


firstLetter clustering for feature rich vs. reduced (all lists):
t(126) = 6.679, p < 0.001, d = 1.182, CI = [4.327, 9.377]


# Performance on {*feature rich*, *reduced*} lists versus *reduced ({early, late})* lists
We'll compare early and late lists separately (e.g., early to early, late to late, early to late, etc.)

Metrics:
  - accuracy
  - temporal clustering
  - non-visual feature-based clustering (category, size, length, first letter)

## Early vs. late (all conditions and metrics)

### Accuracy

In [111]:
print('Accuracy for feature rich (early lists) vs. feature rich (late lists):')
ttest(results['accuracy']['feature rich'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Late', independent_sample=False)

Accuracy for feature rich (early lists) vs. feature rich (late lists):
t(66) = 4.553, p < 0.001, d = 0.233, CI = [2.547, 7.200]


In [112]:
print('Accuracy for reduced (early lists) vs. reduced (late lists):')
ttest(results['accuracy']['reduced'], results['accuracy']['reduced'], x_lists='Early', y_lists='Late', independent_sample=False)

Accuracy for reduced (early lists) vs. reduced (late lists):
t(60) = 2.434, p = 0.018, d = 0.134, CI = [0.355, 4.929]


In [113]:
print('Accuracy for reduced (early) (early lists) vs. reduced (early) (late lists):')
ttest(results['accuracy']['reduced (early)'], results['accuracy']['reduced (early)'], x_lists='Early', y_lists='Late', independent_sample=False)

Accuracy for reduced (early) (early lists) vs. reduced (early) (late lists):
t(41) = 1.499, p = 0.141, d = 0.098, CI = [-0.352, 3.607]


In [114]:
print('Accuracy for reduced (late) (early lists) vs. reduced (late) (late lists):')
ttest(results['accuracy']['reduced (late)'], results['accuracy']['reduced (late)'], x_lists='Early', y_lists='Late', independent_sample=False)

Accuracy for reduced (late) (early lists) vs. reduced (late) (late lists):
t(40) = 1.462, p = 0.152, d = 0.121, CI = [-0.552, 2.989]


### Temporal clustering

In [115]:
print('Temporal clustering for feature rich (early lists) vs. feature rich (late lists):')
ttest(results['fingerprint']['feature rich'], results['fingerprint']['feature rich'], x_col='temporal', y_col='temporal', x_lists='Early', y_lists='Late', independent_sample=False)

Temporal clustering for feature rich (early lists) vs. feature rich (late lists):
t(66) = 2.268, p = 0.027, d = 0.181, CI = [0.477, 4.297]


In [116]:
print('Temporal clustering for reduced (early lists) vs. reduced (late lists):')
ttest(results['fingerprint']['reduced'], results['fingerprint']['reduced'], x_col='temporal', y_col='temporal', x_lists='Early', y_lists='Late', independent_sample=False)

Temporal clustering for reduced (early lists) vs. reduced (late lists):
t(60) = 0.986, p = 0.328, d = 0.061, CI = [-0.782, 3.436]


In [117]:
print('Temporal clustering for reduced (early) (early lists) vs. reduced (early) (late lists):')
ttest(results['fingerprint']['reduced (early)'], results['fingerprint']['reduced (early)'], x_col='temporal', y_col='temporal', x_lists='Early', y_lists='Late', independent_sample=False)

Temporal clustering for reduced (early) (early lists) vs. reduced (early) (late lists):
t(41) = 0.857, p = 0.396, d = 0.068, CI = [-1.048, 2.877]


In [118]:
print('Temporal clustering for reduced (late) (early lists) vs. reduced (late) (late lists):')
ttest(results['fingerprint']['reduced (late)'], results['fingerprint']['reduced (late)'], x_col='temporal', y_col='temporal', x_lists='Early', y_lists='Late', independent_sample=False)

Temporal clustering for reduced (late) (early lists) vs. reduced (late) (late lists):
t(40) = 1.244, p = 0.221, d = 0.128, CI = [-0.667, 3.174]


### Non-visual feature based clustering

In [119]:
for i, f in enumerate(features):
    print(f'{f} clustering for feature rich (early lists) vs. feature rich (late lists):')
    ttest(results['fingerprint']['feature rich'], results['fingerprint']['feature rich'], x_col=f, y_col=f, x_lists='Early', y_lists='Late', independent_sample=False)

    if i < len(features) - 1:
        print('\n')

category clustering for feature rich (early lists) vs. feature rich (late lists):
t(66) = 3.684, p < 0.001, d = 0.220, CI = [1.782, 5.932]


size clustering for feature rich (early lists) vs. feature rich (late lists):
t(66) = 1.629, p = 0.108, d = 0.100, CI = [-0.361, 3.611]


wordLength clustering for feature rich (early lists) vs. feature rich (late lists):
t(66) = -0.100, p = 0.921, d = -0.010, CI = [-2.363, 1.896]


firstLetter clustering for feature rich (early lists) vs. feature rich (late lists):
t(66) = -0.412, p = 0.681, d = -0.045, CI = [-2.331, 1.515]


In [120]:
for i, f in enumerate(features):
    print(f'{f} clustering for reduced (early lists) vs. reduced (late lists):')
    ttest(results['fingerprint']['reduced'], results['fingerprint']['reduced'], x_col=f, y_col=f, x_lists='Early', y_lists='Late', independent_sample=False)

    if i < len(features) - 1:
        print('\n')

category clustering for reduced (early lists) vs. reduced (late lists):
t(60) = 2.755, p = 0.008, d = 0.177, CI = [0.765, 5.107]


size clustering for reduced (early lists) vs. reduced (late lists):
t(60) = 3.081, p = 0.003, d = 0.201, CI = [1.032, 5.125]


wordLength clustering for reduced (early lists) vs. reduced (late lists):
t(60) = 3.762, p < 0.001, d = 0.261, CI = [1.824, 6.465]


firstLetter clustering for reduced (early lists) vs. reduced (late lists):
t(60) = 1.721, p = 0.090, d = 0.175, CI = [-0.155, 4.125]


In [121]:
for i, f in enumerate(features):
    print(f'{f} clustering for reduced (early) (early lists) vs. reduced (early) (late lists):')
    ttest(results['fingerprint']['reduced (early)'], results['fingerprint']['reduced (early)'], x_col=f, y_col=f, x_lists='Early', y_lists='Late', independent_sample=False)

    if i < len(features) - 1:
        print('\n')

category clustering for reduced (early) (early lists) vs. reduced (early) (late lists):
t(41) = 0.707, p = 0.484, d = 0.068, CI = [-1.432, 2.664]


size clustering for reduced (early) (early lists) vs. reduced (early) (late lists):
t(41) = 0.803, p = 0.427, d = 0.079, CI = [-1.167, 2.897]


wordLength clustering for reduced (early) (early lists) vs. reduced (early) (late lists):
t(41) = 0.461, p = 0.648, d = 0.060, CI = [-1.406, 2.443]


firstLetter clustering for reduced (early) (early lists) vs. reduced (early) (late lists):
t(41) = 0.781, p = 0.439, d = 0.101, CI = [-1.144, 2.957]


In [122]:
for i, f in enumerate(features):
    print(f'{f} clustering for reduced (late) (early lists) vs. reduced (late) (late lists):')
    ttest(results['fingerprint']['reduced (late)'], results['fingerprint']['reduced (late)'], x_col=f, y_col=f, x_lists='Early', y_lists='Late', independent_sample=False)

    if i < len(features) - 1:
        print('\n')

category clustering for reduced (late) (early lists) vs. reduced (late) (late lists):
t(40) = -0.101, p = 0.920, d = -0.009, CI = [-2.306, 1.711]


size clustering for reduced (late) (early lists) vs. reduced (late) (late lists):
t(40) = 0.555, p = 0.582, d = 0.058, CI = [-1.453, 2.318]


wordLength clustering for reduced (late) (early lists) vs. reduced (late) (late lists):
t(40) = 1.482, p = 0.146, d = 0.126, CI = [-0.496, 3.570]


firstLetter clustering for reduced (late) (early lists) vs. reduced (late) (late lists):
t(40) = -0.143, p = 0.887, d = -0.017, CI = [-2.289, 1.945]


## Feature rich vs. reduced ({early, late})

### Accuracy

In [123]:
print('Accuracy for feature rich vs. reduced (early) (all lists):')
ttest(results['accuracy']['feature rich'], results['accuracy']['reduced (early)'])

Accuracy for feature rich vs. reduced (early) (all lists):
t(107) = -2.230, p = 0.028, d = -0.439, CI = [-4.244, -0.264]


In [124]:
print('Accuracy for feature rich vs. reduced (late) (all lists):')
ttest(results['accuracy']['feature rich'], results['accuracy']['reduced (late)'])

Accuracy for feature rich vs. reduced (late) (all lists):
t(106) = -0.638, p = 0.525, d = -0.126, CI = [-2.711, 1.328]


### Temporal clustering

In [125]:
print('Temporal clustering for feature rich vs. reduced (early) (all lists):')
ttest(results['fingerprint']['feature rich'], results['fingerprint']['reduced (early)'], x_col='temporal', y_col='temporal')

Temporal clustering for feature rich vs. reduced (early) (all lists):
t(107) = -1.379, p = 0.171, d = -0.271, CI = [-3.362, 0.658]


In [126]:
print('Temporal clustering for feature rich vs. reduced (late) (all lists):')
ttest(results['fingerprint']['feature rich'], results['fingerprint']['reduced (late)'], x_col='temporal', y_col='temporal')

Temporal clustering for feature rich vs. reduced (late) (all lists):
t(106) = -0.535, p = 0.593, d = -0.106, CI = [-2.646, 1.531]


### Non-visual feature based clustering

In [127]:
features = ['category', 'size', 'wordLength', 'firstLetter']
for i, f in enumerate(features):
    print(f'{f} clustering for feature rich vs. reduced (early) (all lists):')
    ttest(results['fingerprint']['feature rich'], results['fingerprint']['reduced (early)'], x_col=f, y_col=f)

    if i < len(features) - 1:
        print('\n')

category clustering for feature rich vs. reduced (early) (all lists):
t(107) = 0.013, p = 0.989, d = 0.003, CI = [-1.783, 2.120]


size clustering for feature rich vs. reduced (early) (all lists):
t(107) = -0.349, p = 0.728, d = -0.069, CI = [-2.124, 1.494]


wordLength clustering for feature rich vs. reduced (early) (all lists):
t(107) = -0.581, p = 0.563, d = -0.114, CI = [-2.338, 1.329]


firstLetter clustering for feature rich vs. reduced (early) (all lists):
t(107) = 0.636, p = 0.526, d = 0.125, CI = [-1.207, 2.826]


In [128]:
features = ['category', 'size', 'wordLength', 'firstLetter']
for i, f in enumerate(features):
    print(f'{f} clustering for feature rich vs. reduced (late) (all lists):')
    ttest(results['fingerprint']['feature rich'], results['fingerprint']['reduced (late)'], x_col=f, y_col=f)

    if i < len(features) - 1:
        print('\n')

category clustering for feature rich vs. reduced (late) (all lists):
t(106) = -1.345, p = 0.181, d = -0.267, CI = [-3.559, 0.569]


size clustering for feature rich vs. reduced (late) (all lists):
t(106) = -1.441, p = 0.153, d = -0.286, CI = [-3.737, 0.578]


wordLength clustering for feature rich vs. reduced (late) (all lists):
t(106) = -1.261, p = 0.210, d = -0.250, CI = [-3.736, 0.811]


firstLetter clustering for feature rich vs. reduced (late) (all lists):
t(106) = 0.939, p = 0.350, d = 0.186, CI = [-0.932, 2.921]


## Reduced vs. reduced ({early, late})

### Accuracy

In [129]:
print('Accuracy for reduced vs. reduced (early) (all lists):')
ttest(results['accuracy']['reduced'], results['accuracy']['reduced (early)'])

Accuracy for reduced vs. reduced (early) (all lists):
t(101) = -2.045, p = 0.043, d = -0.410, CI = [-3.812, -0.175]


In [130]:
print('Accuracy for reduced vs. reduced (late) (all lists):')
ttest(results['accuracy']['reduced'], results['accuracy']['reduced (late)'])

Accuracy for reduced vs. reduced (late) (all lists):
t(100) = -0.407, p = 0.685, d = -0.082, CI = [-2.520, 1.435]


### Temporal clustering

In [131]:
print('Temporal clustering for reduced vs. reduced (early) (all lists):')
ttest(results['fingerprint']['reduced'], results['fingerprint']['reduced (early)'], x_col='temporal', y_col='temporal')

Temporal clustering for reduced vs. reduced (early) (all lists):
t(101) = -10.689, p < 0.001, d = -2.143, CI = [-13.400, -8.526]


In [132]:
print('Temporal clustering for reduced vs. reduced (late) (all lists):')
ttest(results['fingerprint']['reduced'], results['fingerprint']['reduced (late)'], x_col='temporal', y_col='temporal')

Temporal clustering for reduced vs. reduced (late) (all lists):
t(100) = -9.885, p < 0.001, d = -1.996, CI = [-14.439, -6.547]


### Non-visual feature based clustering

In [133]:
features = ['category', 'size', 'wordLength', 'firstLetter']
for i, f in enumerate(features):
    print(f'{f} clustering for reduced vs. reduced (early) (all lists):')
    ttest(results['fingerprint']['reduced'], results['fingerprint']['reduced (early)'], x_col=f, y_col=f)

    if i < len(features) - 1:
        print('\n')

category clustering for reduced vs. reduced (early) (all lists):
t(101) = -9.538, p < 0.001, d = -1.912, CI = [-12.262, -7.558]


size clustering for reduced vs. reduced (early) (all lists):
t(101) = -12.222, p < 0.001, d = -2.451, CI = [-15.418, -9.763]


wordLength clustering for reduced vs. reduced (early) (all lists):
t(101) = -10.620, p < 0.001, d = -2.129, CI = [-14.149, -8.346]


firstLetter clustering for reduced vs. reduced (early) (all lists):
t(101) = -5.213, p < 0.001, d = -1.045, CI = [-7.341, -3.436]


In [134]:
features = ['category', 'size', 'wordLength', 'firstLetter']
for i, f in enumerate(features):
    print(f'{f} clustering for reduced vs. reduced (late) (all lists):')
    ttest(results['fingerprint']['reduced'], results['fingerprint']['reduced (late)'], x_col=f, y_col=f)

    if i < len(features) - 1:
        print('\n')

category clustering for reduced vs. reduced (late) (all lists):
t(100) = -10.436, p < 0.001, d = -2.107, CI = [-15.533, -7.108]


size clustering for reduced vs. reduced (late) (all lists):
t(100) = -12.413, p < 0.001, d = -2.507, CI = [-18.183, -8.493]


wordLength clustering for reduced vs. reduced (late) (all lists):
t(100) = -9.672, p < 0.001, d = -1.953, CI = [-14.600, -6.274]


firstLetter clustering for reduced vs. reduced (late) (all lists):
t(100) = -4.555, p < 0.001, d = -0.920, CI = [-7.167, -2.387]


# Order manipulation analyses

- When lists are sorted by a given feature, how is memory performance affected (relative to feature rich -- early lists only)?
- Do some order manipulations matter more than others?  E.g. compare semantic vs. lexicographic vs. visual -- early lists only

### Accuracy

In [135]:
print('Accuracy for category vs. feature rich (early lists):')
ttest(results['accuracy']['category'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for size vs. feature rich (early lists):')
ttest(results['accuracy']['size'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for length vs. feature rich (early lists):')
ttest(results['accuracy']['length'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for first letter vs. feature rich (early lists):')
ttest(results['accuracy']['first letter'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for color vs. feature rich (early lists):')
ttest(results['accuracy']['color'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for location vs. feature rich (early lists):')
ttest(results['accuracy']['location'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

Accuracy for category vs. feature rich (early lists):
t(95) = 3.034, p = 0.003, d = 0.667, CI = [1.249, 5.399]

Accuracy for size vs. feature rich (early lists):
t(95) = -1.013, p = 0.314, d = -0.223, CI = [-3.275, 1.063]

Accuracy for length vs. feature rich (early lists):
t(95) = -0.550, p = 0.584, d = -0.121, CI = [-2.505, 1.278]

Accuracy for first letter vs. feature rich (early lists):
t(95) = -0.690, p = 0.492, d = -0.152, CI = [-2.719, 1.076]

Accuracy for color vs. feature rich (early lists):
t(96) = 1.850, p = 0.067, d = 0.402, CI = [0.073, 3.668]

Accuracy for location vs. feature rich (early lists):
t(95) = 0.043, p = 0.966, d = 0.010, CI = [-1.677, 1.540]


### Temporal clustering

In [136]:
print('Temporal clustering for category vs. feature rich (early lists):')
ttest(results['fingerprint']['category'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for size vs. feature rich (early lists):')
ttest(results['fingerprint']['size'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for length vs. feature rich (early lists):')
ttest(results['fingerprint']['length'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for first letter vs. feature rich (early lists):')
ttest(results['fingerprint']['first letter'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for color vs. feature rich (early lists):')
ttest(results['fingerprint']['color'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for location vs. feature rich (early lists):')
ttest(results['fingerprint']['location'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

Temporal clustering for category vs. feature rich (early lists):
t(95) = 8.813, p < 0.001, d = 1.936, CI = [6.847, 12.119]

Temporal clustering for size vs. feature rich (early lists):
t(95) = 2.630, p = 0.010, d = 0.578, CI = [0.686, 4.720]

Temporal clustering for length vs. feature rich (early lists):
t(95) = -1.547, p = 0.125, d = -0.340, CI = [-3.670, 0.167]

Temporal clustering for first letter vs. feature rich (early lists):
t(95) = 2.858, p = 0.005, d = 0.628, CI = [1.003, 4.808]

Temporal clustering for color vs. feature rich (early lists):
t(96) = -1.339, p = 0.184, d = -0.291, CI = [-3.339, 0.323]

Temporal clustering for location vs. feature rich (early lists):
t(95) = 1.705, p = 0.092, d = 0.374, CI = [-0.077, 3.598]


### Feature-based clustering

In [137]:
conds = ['category', 'size', 'length', 'first letter', 'color', 'location']
features = ['category', 'size', 'wordLength', 'firstLetter', 'color', 'location']

for i, c in enumerate(conds):
    for j, f in enumerate(features):
        print(f'{f} clustering for {c} vs. feature rich (early lists):')
        ttest(results['fingerprint'][c], results['fingerprint']['feature rich'], x_col=f, y_col=f, x_lists='Early', y_lists='Early')

        if (j < len(features) - 1) or (i < len(conds) - 1):
            print('\n')
    
    if i < len(conds) - 1:
        print('--- \n')

category clustering for category vs. feature rich (early lists):
t(95) = 4.429, p < 0.001, d = 0.973, CI = [2.936, 6.327]


size clustering for category vs. feature rich (early lists):
t(95) = 3.727, p < 0.001, d = 0.819, CI = [1.927, 5.619]


wordLength clustering for category vs. feature rich (early lists):
t(95) = 0.154, p = 0.878, d = 0.034, CI = [-1.700, 1.862]


firstLetter clustering for category vs. feature rich (early lists):
t(95) = -1.610, p = 0.111, d = -0.354, CI = [-3.772, 0.136]


color clustering for category vs. feature rich (early lists):
t(95) = -0.375, p = 0.709, d = -0.082, CI = [-2.766, 1.512]


location clustering for category vs. feature rich (early lists):
t(95) = -0.347, p = 0.730, d = -0.076, CI = [-2.621, 1.601]


--- 

category clustering for size vs. feature rich (early lists):
t(95) = 0.330, p = 0.742, d = 0.073, CI = [-1.609, 2.373]


size clustering for size vs. feature rich (early lists):
t(95) = 2.421, p = 0.017, d = 0.532, CI = [0.586, 4.431]


wordL

## Combine "categories" of features:
  - semantic = category + size
  - lexicographic = length + first letter
  - visual = color + location

In [138]:
groups = {k: v for k, v in feature_groupings.items() if k in ['semantic', 'lexicographic', 'visual']}

merged_results = {}
for k, v in results.items():
    if k not in ['fingerprint', 'accuracy', 'corrected fingerprint']:
        continue
    merged_results[k] = merge_results(v, groups)

## Compare semantic vs. feature rich

### Accuracy

In [139]:
print('Accuracy for semantic vs. feature rich (early lists):')
ttest(merged_results['accuracy']['semantic'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for lexicographic vs. feature rich (early lists):')
ttest(merged_results['accuracy']['lexicographic'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for visual vs. feature rich (early lists):')
ttest(merged_results['accuracy']['visual'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

Accuracy for semantic vs. feature rich (early lists):
t(125) = 1.197, p = 0.233, d = 0.213, CI = [-0.748, 3.129]

Accuracy for lexicographic vs. feature rich (early lists):
t(125) = -0.776, p = 0.439, d = -0.138, CI = [-2.920, 1.124]

Accuracy for visual vs. feature rich (early lists):
t(126) = 1.256, p = 0.212, d = 0.222, CI = [-0.778, 3.122]


In [140]:
print('Accuracy for semantic vs. feature rich (early lists):')
ttest(merged_results['accuracy']['semantic'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for lexicographic vs. feature rich (early lists):')
ttest(merged_results['accuracy']['lexicographic'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

print('\nAccuracy for visual vs. feature rich (early lists):')
ttest(merged_results['accuracy']['visual'], results['accuracy']['feature rich'], x_lists='Early', y_lists='Early')

Accuracy for semantic vs. feature rich (early lists):
t(125) = 1.197, p = 0.233, d = 0.213, CI = [-0.662, 3.135]

Accuracy for lexicographic vs. feature rich (early lists):
t(125) = -0.776, p = 0.439, d = -0.138, CI = [-2.809, 1.083]

Accuracy for visual vs. feature rich (early lists):
t(126) = 1.256, p = 0.212, d = 0.222, CI = [-0.790, 3.238]


### Temporal clustering

In [141]:
print('Temporal clustering for semantic vs. feature rich (early lists):')
ttest(merged_results['fingerprint']['semantic'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for lexicographic vs. feature rich (early lists):')
ttest(merged_results['fingerprint']['lexicographic'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for visual vs. feature rich (early lists):')
ttest(merged_results['fingerprint']['visual'], results['fingerprint']['feature rich'], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

Temporal clustering for semantic vs. feature rich (early lists):
t(125) = 6.474, p < 0.001, d = 1.151, CI = [4.493, 8.866]

Temporal clustering for lexicographic vs. feature rich (early lists):
t(125) = 0.786, p = 0.433, d = 0.140, CI = [-1.213, 2.682]

Temporal clustering for visual vs. feature rich (early lists):
t(126) = 0.254, p = 0.800, d = 0.045, CI = [-1.757, 2.181]


### Feature based clustering

In [142]:
conds = ['semantic', 'lexicographic', 'visual']
features = ['category', 'size', 'wordLength', 'firstLetter', 'color', 'location']

for i, c in enumerate(conds):
    for j, f in enumerate(features):
        print(f'{f} clustering for {c} vs. feature rich (early lists):')
        ttest(merged_results['fingerprint'][c], results['fingerprint']['feature rich'], x_col=f, y_col=f, x_lists='Early', y_lists='Early')

        if (j < len(features) - 1) or (i < len(conds) - 1):
            print('\n')
    
    if i < len(conds) - 1:
        print('--- \n')

category clustering for semantic vs. feature rich (early lists):
t(125) = 2.722, p = 0.007, d = 0.484, CI = [0.708, 4.832]


size clustering for semantic vs. feature rich (early lists):
t(125) = 3.866, p < 0.001, d = 0.687, CI = [2.037, 5.881]


wordLength clustering for semantic vs. feature rich (early lists):
t(125) = 0.521, p = 0.603, d = 0.093, CI = [-1.567, 2.460]


firstLetter clustering for semantic vs. feature rich (early lists):
t(125) = -0.842, p = 0.401, d = -0.150, CI = [-2.908, 1.088]


color clustering for semantic vs. feature rich (early lists):
t(125) = -0.650, p = 0.517, d = -0.116, CI = [-2.714, 1.191]


location clustering for semantic vs. feature rich (early lists):
t(125) = -0.251, p = 0.802, d = -0.045, CI = [-2.445, 1.526]


--- 

category clustering for lexicographic vs. feature rich (early lists):
t(125) = -1.040, p = 0.301, d = -0.185, CI = [-3.213, 0.825]


size clustering for lexicographic vs. feature rich (early lists):
t(125) = 0.006, p = 0.995, d = 0.001,

## Compare each pair of (category of) order manipulation conditions (semantic, lexicographic, and visual)

### Accuracy

In [143]:
for i, c1 in enumerate(conds):
    for c2 in conds[i+1:]:
        print(f'Accuracy for {c1} vs. {c2} (early lists):')
        ttest(merged_results['accuracy'][c1], merged_results['accuracy'][c2], x_lists='Early', y_lists='Early')

        print(f'\nAccuracy for {c1} vs. {c2} (late lists):')
        ttest(merged_results['accuracy'][c1], merged_results['accuracy'][c2], x_lists='Late', y_lists='Late')

        print('\n\n')

Accuracy for semantic vs. lexicographic (early lists):
t(118) = 1.936, p = 0.055, d = 0.353, CI = [-0.006, 3.984]

Accuracy for semantic vs. lexicographic (late lists):
t(118) = -0.388, p = 0.699, d = -0.071, CI = [-2.519, 1.504]



Accuracy for semantic vs. visual (early lists):
t(119) = 0.113, p = 0.910, d = 0.021, CI = [-1.874, 2.064]

Accuracy for semantic vs. visual (late lists):
t(119) = -0.833, p = 0.407, d = -0.151, CI = [-3.157, 1.020]



Accuracy for lexicographic vs. visual (early lists):
t(119) = -2.145, p = 0.034, d = -0.390, CI = [-4.126, -0.139]

Accuracy for lexicographic vs. visual (late lists):
t(119) = -0.352, p = 0.726, d = -0.064, CI = [-2.319, 1.569]





### Temporal clustering

In [144]:
for i, c1 in enumerate(conds):
    for c2 in conds[i+1:]:
        print(f'Temporal clustering for {c1} vs. {c2} (early lists):')
        ttest(merged_results['fingerprint'][c1], merged_results['fingerprint'][c2], x_lists='Early', y_lists='Early', x_col='temporal', y_col='temporal')

        print(f'\nTemporal clustering for {c1} vs. {c2} (late lists):')
        ttest(merged_results['fingerprint'][c1], merged_results['fingerprint'][c2], x_lists='Late', y_lists='Late', x_col='temporal', y_col='temporal')

        print('\n')

Temporal clustering for semantic vs. lexicographic (early lists):
t(118) = 5.620, p < 0.001, d = 1.026, CI = [3.507, 7.995]

Temporal clustering for semantic vs. lexicographic (late lists):
t(118) = -0.758, p = 0.450, d = -0.138, CI = [-2.743, 1.358]


Temporal clustering for semantic vs. visual (early lists):
t(119) = 6.613, p < 0.001, d = 1.202, CI = [4.469, 9.364]

Temporal clustering for semantic vs. visual (late lists):
t(119) = -0.322, p = 0.748, d = -0.059, CI = [-2.417, 1.550]


Temporal clustering for lexicographic vs. visual (early lists):
t(119) = 0.589, p = 0.557, d = 0.107, CI = [-1.276, 2.591]

Temporal clustering for lexicographic vs. visual (late lists):
t(119) = 0.562, p = 0.575, d = 0.102, CI = [-1.511, 2.473]




### Feature-based clustering

In [145]:
for i, c1 in enumerate(conds):
    for c2 in conds[i+1:]:
        for f in features:
            print(f'{f} clustering for {c1} vs. {c2} (early lists):')
            ttest(merged_results['fingerprint'][c1], merged_results['fingerprint'][c2], x_col=f, y_col=f, x_lists='Early', y_lists='Early')

            print(f'\n{f} clustering for {c1} vs. {c2} (late lists):')
            ttest(merged_results['fingerprint'][c1], merged_results['fingerprint'][c2], x_col=f, y_col=f, x_lists='Late', y_lists='Late')

            print('\n')

category clustering for semantic vs. lexicographic (early lists):
t(118) = 3.667, p < 0.001, d = 0.670, CI = [1.646, 6.074]

category clustering for semantic vs. lexicographic (late lists):
t(118) = -0.720, p = 0.473, d = -0.131, CI = [-2.597, 1.387]


size clustering for semantic vs. lexicographic (early lists):
t(118) = 4.043, p < 0.001, d = 0.738, CI = [2.109, 6.185]

size clustering for semantic vs. lexicographic (late lists):
t(118) = -1.897, p = 0.060, d = -0.346, CI = [-3.922, -0.011]


wordLength clustering for semantic vs. lexicographic (early lists):
t(118) = -3.390, p < 0.001, d = -0.619, CI = [-5.444, -1.559]

wordLength clustering for semantic vs. lexicographic (late lists):
t(118) = 1.153, p = 0.251, d = 0.211, CI = [-0.777, 3.229]


firstLetter clustering for semantic vs. lexicographic (early lists):
t(118) = -5.705, p < 0.001, d = -1.042, CI = [-7.852, -3.806]

firstLetter clustering for semantic vs. lexicographic (late lists):
t(118) = -0.880, p = 0.381, d = -0.161, CI

# When *early* lists are sorted by a given feature, how is memory performance on *late* lists affected (relative to *feature rich* late lists)

### Accuracy

In [146]:
print('Accuracy for semantic vs. feature rich (late lists):')
ttest(merged_results['accuracy']['semantic'], results['accuracy']['feature rich'], x_lists='Late', y_lists='Late')

print('\nAccuracy for lexicographic vs. feature rich (late lists):')
ttest(merged_results['accuracy']['lexicographic'], results['accuracy']['feature rich'], x_lists='Late', y_lists='Late')

print('\nAccuracy for visual vs. feature rich (late lists):')
ttest(merged_results['accuracy']['visual'], results['accuracy']['feature rich'], x_lists='Late', y_lists='Late')

Accuracy for semantic vs. feature rich (late lists):
t(125) = 0.487, p = 0.627, d = 0.087, CI = [-1.577, 2.426]

Accuracy for lexicographic vs. feature rich (late lists):
t(125) = 0.878, p = 0.382, d = 0.156, CI = [-1.013, 2.929]

Accuracy for visual vs. feature rich (late lists):
t(126) = 1.437, p = 0.153, d = 0.254, CI = [-0.384, 3.395]


### Temporal clustering

In [147]:
print('Temporal clustering for semantic vs. feature rich (late lists):')
ttest(merged_results['fingerprint']['semantic'], results['fingerprint']['feature rich'], x_lists='Late', y_lists='Late', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for lexicographic vs. feature rich (late lists):')
ttest(merged_results['fingerprint']['lexicographic'], results['fingerprint']['feature rich'], x_lists='Late', y_lists='Late', x_col='temporal', y_col='temporal')

print('\nTemporal clustering for visual vs. feature rich (late lists):')
ttest(merged_results['fingerprint']['visual'], results['fingerprint']['feature rich'], x_lists='Late', y_lists='Late', x_col='temporal', y_col='temporal')

Temporal clustering for semantic vs. feature rich (late lists):
t(125) = 0.157, p = 0.875, d = 0.028, CI = [-1.785, 2.209]

Temporal clustering for lexicographic vs. feature rich (late lists):
t(125) = 0.998, p = 0.320, d = 0.177, CI = [-1.052, 2.952]

Temporal clustering for visual vs. feature rich (late lists):
t(126) = 0.548, p = 0.585, d = 0.097, CI = [-1.402, 2.321]


## Feature-based clustering

In [148]:
conds = ['semantic', 'lexicographic', 'visual']
features = ['category', 'size', 'wordLength', 'firstLetter', 'color', 'location']

for i, c in enumerate(conds):
    for j, f in enumerate(features):
        print(f'{f} clustering for {c} vs. feature rich (late lists):')
        ttest(merged_results['fingerprint'][c], results['fingerprint']['feature rich'], x_col=f, y_col=f, x_lists='Late', y_lists='Late')

        if (j < len(features) - 1) or (i < len(conds) - 1):
            print('\n')
    
    if i < len(conds) - 1:
        print('--- \n')

category clustering for semantic vs. feature rich (late lists):
t(125) = -0.041, p = 0.967, d = -0.007, CI = [-1.890, 1.949]


size clustering for semantic vs. feature rich (late lists):
t(125) = -0.989, p = 0.324, d = -0.176, CI = [-3.045, 0.963]


wordLength clustering for semantic vs. feature rich (late lists):
t(125) = -0.045, p = 0.964, d = -0.008, CI = [-2.006, 1.874]


firstLetter clustering for semantic vs. feature rich (late lists):
t(125) = -0.369, p = 0.713, d = -0.066, CI = [-2.521, 1.627]


color clustering for semantic vs. feature rich (late lists):
t(125) = -0.602, p = 0.548, d = -0.107, CI = [-2.671, 1.215]


location clustering for semantic vs. feature rich (late lists):
t(125) = -0.521, p = 0.603, d = -0.093, CI = [-2.592, 1.310]


--- 

category clustering for lexicographic vs. feature rich (late lists):
t(125) = 0.678, p = 0.499, d = 0.121, CI = [-1.399, 2.603]


size clustering for lexicographic vs. feature rich (late lists):
t(125) = 0.915, p = 0.362, d = 0.163, C

# Individual difference feature clustering analyses, part 1

Compute the correlations (across participants) between feature clustering, recall probability, and temporal clustering for early and late lists.  As a summary, also compute the correlations (across conditions) between the per-condition averages.  Note: for each feature clustering score, consider only the condition of interest-- e.g., for the category condition consider category clustering, for the length condition consider length clustering, and so on.

Start by creating a dataframe that combines across all of the order manipulation conditions:
  - index: subject, list group --- but rename subjects so they're unique across conditions
  - columns:
    - feature clustering score: pick out the appropriate element of that list/subject's fingerprint, based on the current condition
    - temporal clustering score
    - recall probability
    - condition

In [149]:
clustering_results = create_clustering_df(results)
clustering_results

Unnamed: 0,Subject,List,Condition,Feature clustering score,Corrected feature clustering score,Temporal clustering score,Recall probability
0,0,Early,Feature rich,0.495229,0.485292,0.562750,0.484375
1,1,Early,Feature rich,0.559542,0.582875,0.486750,0.859375
2,2,Early,Feature rich,0.538500,0.532000,0.660500,0.625000
3,3,Early,Feature rich,0.519375,0.474292,0.596500,0.531250
4,4,Early,Feature rich,0.566917,0.565833,0.680000,0.546875
...,...,...,...,...,...,...,...
491,243,Late,Location,0.407500,0.388000,0.698875,0.484375
492,244,Late,Location,0.553750,0.503000,0.603500,0.671875
493,245,Late,Location,0.598250,0.563750,0.768250,0.609375
494,246,Late,Location,0.405000,0.428750,0.683750,0.539062


In [150]:
def corr_helper(clustering_results, x='Feature clustering score', y='Recall probability', xlists=None, ylists=None):
    clustering_results = clustering_results.query('Condition != "Feature rich"')

    def print_corr(a, b, label=None, n_iter=1000):
        corr = stats.pearsonr(a, b)

        # compute bootstrap-estimated 95% confidence interval
        bootstrapped = []        
        max_tries = 10
        tries = 0
        for i in range(n_iter):
            inds = np.random.randint(0, len(a), len(a))
            while len(np.unique(a.iloc[inds])) == 1 and len(np.unique(b.iloc[inds])) == 1 and tries < max_tries:
                inds = np.random.randint(0, len(a), len(a))
                tries += 1
            if tries >= max_tries:
                pass
            tries = 0
            bootstrapped.append(stats.pearsonr(a.iloc[inds], b.iloc[inds])[0])
        low, high = np.percentile(bootstrapped, [2.5, 97.5])

        if label is None:
            prefix = ''
        else:
            prefix = label + ': '

        if corr.pvalue < 0.001:
            p_string = 'p < 0.001'
        else:
            p_string = f'p = {corr.pvalue:.3f}'

        print(f'\t{prefix}: r({len(a) - 2}) = {corr.statistic:.3f}, {p_string}, CI = [{low:.3f}, {high:.3f}]')
        return corr

    if xlists is not None:
        x_results = clustering_results.query('List == @xlists')
    else:
        x_results = clustering_results
    
    if ylists is not None:
        y_results = clustering_results.query('List == @ylists')
    else:
        y_results = clustering_results
    
    print(f'Correlations between {x} and {y} (x lists: {xlists}, y lists: {ylists})')
    # Combine across all conditions, compute correlation across subjects
    print_corr(x_results[x], y_results[y], label='Combined (across subjects)')

    # Per-condition (across subjects)
    print('\n')
    min_r = np.inf
    max_p = -np.inf

    for c in clustering_results['Condition'].unique():
        corr = print_corr(x_results.query('Condition == @c')[x], y_results.query('Condition == @c')[y], label=f'{c} (across subjects)')
        min_r = min(min_r, corr.statistic)
        max_p = max(max_p, corr.pvalue)
    print(f'\tWithin condition, across subjects: all $r$s $\geq {min_r:.3f}$, all $p$s $\leq {max_p:.3f}$')

    # Across-condition correlation
    print('\n')
    x_ave = x_results.groupby('Condition').mean(numeric_only=True)[x]
    y_ave = y_results.groupby('Condition').mean(numeric_only=True)[y]
    print_corr(x_ave, y_ave, label='Across conditions')

## Recall probability vs. feature clustering

### Early vs. early

In [151]:
corr_helper(clustering_results, x='Feature clustering score', y='Recall probability', xlists='Early', ylists='Early')

Correlations between Feature clustering score and Recall probability (x lists: Early, y lists: Early)
	Combined (across subjects): : r(179) = 0.492, p < 0.001, CI = [0.339, 0.608]


	Category (across subjects): : r(28) = 0.862, p < 0.001, CI = [0.787, 0.920]
	Size (across subjects): : r(28) = 0.897, p < 0.001, CI = [0.813, 0.947]
	Length (across subjects): : r(28) = 0.481, p = 0.007, CI = [0.083, 0.725]
	First letter (across subjects): : r(28) = 0.414, p = 0.023, CI = [0.004, 0.728]
	Color (across subjects): : r(29) = 0.331, p = 0.069, CI = [-0.026, 0.674]
	Location (across subjects): : r(28) = 0.360, p = 0.051, CI = [0.049, 0.596]
	Within condition, across subjects: all $r$s $\geq 0.331$, all $p$s $\leq 0.069$


	Across conditions: : r(4) = 0.511, p = 0.300, CI = [-0.999, 0.999]


### Late vs. late

In [152]:
corr_helper(clustering_results, x='Feature clustering score', y='Recall probability', xlists='Late', ylists='Late')

Correlations between Feature clustering score and Recall probability (x lists: Late, y lists: Late)
	Combined (across subjects): : r(179) = 0.403, p < 0.001, CI = [0.265, 0.523]


	Category (across subjects): : r(28) = 0.662, p < 0.001, CI = [0.329, 0.882]
	Size (across subjects): : r(28) = 0.744, p < 0.001, CI = [0.497, 0.905]
	Length (across subjects): : r(28) = 0.520, p = 0.003, CI = [0.247, 0.741]
	First letter (across subjects): : r(28) = 0.404, p = 0.027, CI = [-0.019, 0.731]
	Color (across subjects): : r(29) = 0.532, p = 0.002, CI = [0.281, 0.751]
	Location (across subjects): : r(28) = 0.419, p = 0.021, CI = [0.081, 0.664]
	Within condition, across subjects: all $r$s $\geq 0.404$, all $p$s $\leq 0.027$


	Across conditions: : r(4) = -0.304, p = 0.559, CI = [-0.873, 0.773]


### Late vs. Early

In [153]:
corr_helper(clustering_results, x='Feature clustering score', y='Recall probability', xlists='Early', ylists='Late')

Correlations between Feature clustering score and Recall probability (x lists: Early, y lists: Late)
	Combined (across subjects): : r(179) = 0.230, p = 0.002, CI = [0.074, 0.376]


	Category (across subjects): : r(28) = 0.474, p = 0.008, CI = [0.199, 0.701]
	Size (across subjects): : r(28) = 0.574, p < 0.001, CI = [0.351, 0.746]
	Length (across subjects): : r(28) = 0.405, p = 0.027, CI = [0.088, 0.625]
	First letter (across subjects): : r(28) = 0.385, p = 0.035, CI = [-0.037, 0.736]
	Color (across subjects): : r(29) = 0.212, p = 0.251, CI = [-0.162, 0.531]
	Location (across subjects): : r(28) = 0.320, p = 0.085, CI = [-0.002, 0.590]
	Within condition, across subjects: all $r$s $\geq 0.212$, all $p$s $\leq 0.251$


	Across conditions: : r(4) = -0.338, p = 0.512, CI = [-0.977, 0.643]


### Early vs. Late

In [154]:
corr_helper(clustering_results, x='Feature clustering score', y='Recall probability', xlists='Late', ylists='Early')

Correlations between Feature clustering score and Recall probability (x lists: Late, y lists: Early)
	Combined (across subjects): : r(179) = 0.464, p < 0.001, CI = [0.325, 0.578]


	Category (across subjects): : r(28) = 0.687, p < 0.001, CI = [0.432, 0.818]
	Size (across subjects): : r(28) = 0.561, p = 0.001, CI = [0.283, 0.782]
	Length (across subjects): : r(28) = 0.438, p = 0.015, CI = [0.112, 0.750]
	First letter (across subjects): : r(28) = 0.377, p = 0.040, CI = [0.039, 0.674]
	Color (across subjects): : r(29) = 0.431, p = 0.016, CI = [0.148, 0.666]
	Location (across subjects): : r(28) = 0.395, p = 0.031, CI = [0.108, 0.645]
	Within condition, across subjects: all $r$s $\geq 0.377$, all $p$s $\leq 0.040$


	Across conditions: : r(4) = 0.451, p = 0.369, CI = [-0.780, 0.999]


## Temporal clustering vs. feature clustering

### Early vs. early

In [155]:
corr_helper(clustering_results, x='Feature clustering score', y='Temporal clustering score', xlists='Early', ylists='Early')

Correlations between Feature clustering score and Temporal clustering score (x lists: Early, y lists: Early)
	Combined (across subjects): : r(179) = 0.916, p < 0.001, CI = [0.891, 0.938]


	Category (across subjects): : r(28) = 0.965, p < 0.001, CI = [0.916, 0.992]
	Size (across subjects): : r(28) = 0.926, p < 0.001, CI = [0.865, 0.962]
	Length (across subjects): : r(28) = 0.945, p < 0.001, CI = [0.882, 0.975]
	First letter (across subjects): : r(28) = 0.855, p < 0.001, CI = [0.743, 0.932]
	Color (across subjects): : r(29) = 0.817, p < 0.001, CI = [0.669, 0.903]
	Location (across subjects): : r(28) = 0.883, p < 0.001, CI = [0.772, 0.946]
	Within condition, across subjects: all $r$s $\geq 0.817$, all $p$s $\leq 0.000$


	Across conditions: : r(4) = 0.946, p = 0.004, CI = [0.593, 0.999]


### Late vs. late

In [156]:
corr_helper(clustering_results, x='Feature clustering score', y='Temporal clustering score', xlists='Late', ylists='Late')

Correlations between Feature clustering score and Temporal clustering score (x lists: Late, y lists: Late)
	Combined (across subjects): : r(179) = 0.273, p < 0.001, CI = [0.140, 0.384]


	Category (across subjects): : r(28) = 0.293, p = 0.116, CI = [-0.084, 0.596]
	Size (across subjects): : r(28) = 0.307, p = 0.099, CI = [-0.068, 0.652]
	Length (across subjects): : r(28) = 0.353, p = 0.056, CI = [-0.037, 0.660]
	First letter (across subjects): : r(28) = 0.660, p < 0.001, CI = [0.432, 0.819]
	Color (across subjects): : r(29) = 0.333, p = 0.068, CI = [-0.066, 0.662]
	Location (across subjects): : r(28) = 0.235, p = 0.212, CI = [-0.134, 0.535]
	Within condition, across subjects: all $r$s $\geq 0.235$, all $p$s $\leq 0.212$


	Across conditions: : r(4) = -0.190, p = 0.718, CI = [-0.833, 0.770]


### Late vs. early

In [157]:
corr_helper(clustering_results, x='Feature clustering score', y='Temporal clustering score', xlists='Early', ylists='Late')

Correlations between Feature clustering score and Temporal clustering score (x lists: Early, y lists: Late)
	Combined (across subjects): : r(179) = 0.266, p < 0.001, CI = [0.124, 0.388]


	Category (across subjects): : r(28) = 0.298, p = 0.110, CI = [-0.000, 0.512]
	Size (across subjects): : r(28) = 0.314, p = 0.091, CI = [0.023, 0.578]
	Length (across subjects): : r(28) = 0.535, p = 0.002, CI = [0.256, 0.769]
	First letter (across subjects): : r(28) = 0.443, p = 0.014, CI = [0.106, 0.737]
	Color (across subjects): : r(29) = 0.491, p = 0.005, CI = [0.115, 0.724]
	Location (across subjects): : r(28) = 0.355, p = 0.054, CI = [0.062, 0.613]
	Within condition, across subjects: all $r$s $\geq 0.298$, all $p$s $\leq 0.110$


	Across conditions: : r(4) = 0.064, p = 0.903, CI = [-0.971, 0.972]


### Early vs. late

In [158]:
corr_helper(clustering_results, x='Feature clustering score', y='Temporal clustering score', xlists='Late', ylists='Early')

Correlations between Feature clustering score and Temporal clustering score (x lists: Late, y lists: Early)
	Combined (across subjects): : r(179) = 0.549, p < 0.001, CI = [0.431, 0.645]


	Category (across subjects): : r(28) = 0.613, p < 0.001, CI = [0.413, 0.784]
	Size (across subjects): : r(28) = 0.392, p = 0.032, CI = [0.057, 0.661]
	Length (across subjects): : r(28) = 0.271, p = 0.148, CI = [-0.068, 0.558]
	First letter (across subjects): : r(28) = 0.345, p = 0.062, CI = [-0.048, 0.635]
	Color (across subjects): : r(29) = 0.005, p = 0.980, CI = [-0.419, 0.411]
	Location (across subjects): : r(28) = 0.240, p = 0.201, CI = [-0.067, 0.515]
	Within condition, across subjects: all $r$s $\geq 0.005$, all $p$s $\leq 0.980$


	Across conditions: : r(4) = 0.855, p = 0.030, CI = [0.279, 1.000]


## Feature clustering on early vs. late lists

In [159]:
corr_helper(clustering_results, x='Feature clustering score', y='Feature clustering score', xlists='Late', ylists='Early')

Correlations between Feature clustering score and Feature clustering score (x lists: Late, y lists: Early)
	Combined (across subjects): : r(179) = 0.591, p < 0.001, CI = [0.474, 0.694]


	Category (across subjects): : r(28) = 0.590, p < 0.001, CI = [0.373, 0.752]
	Size (across subjects): : r(28) = 0.488, p = 0.006, CI = [0.126, 0.737]
	Length (across subjects): : r(28) = 0.384, p = 0.036, CI = [0.044, 0.681]
	First letter (across subjects): : r(28) = 0.202, p = 0.284, CI = [-0.209, 0.653]
	Color (across subjects): : r(29) = -0.183, p = 0.325, CI = [-0.548, 0.260]
	Location (across subjects): : r(28) = 0.031, p = 0.870, CI = [-0.238, 0.286]
	Within condition, across subjects: all $r$s $\geq -0.183$, all $p$s $\leq 0.870$


	Across conditions: : r(4) = 0.942, p = 0.005, CI = [0.487, 1.000]


# Difference analyses (early - late lists)

In [160]:
clustering_result_diffs = clustering_results.query('List == "Early"').copy()
clustering_result_diffs['List'] = 'Early - Late'
clustering_result_diffs['Feature clustering score'] = clustering_result_diffs['Feature clustering score'].values - clustering_results.query('List == "Late"')['Feature clustering score'].values
clustering_result_diffs['Temporal clustering score'] = clustering_result_diffs['Temporal clustering score'].values - clustering_results.query('List == "Late"')['Temporal clustering score'].values
clustering_result_diffs['Recall probability'] = clustering_result_diffs['Recall probability'].values - clustering_results.query('List == "Late"')['Recall probability'].values
clustering_result_diffs

Unnamed: 0,Subject,List,Condition,Feature clustering score,Corrected feature clustering score,Temporal clustering score,Recall probability
0,0,Early - Late,Feature rich,-0.050729,0.485292,-0.045000,0.039062
1,1,Early - Late,Feature rich,-0.058125,0.582875,-0.039750,-0.031250
2,2,Early - Late,Feature rich,0.092667,0.532000,0.259625,0.148438
3,3,Early - Late,Feature rich,0.070625,0.474292,-0.019375,0.125000
4,4,Early - Late,Feature rich,0.089312,0.565833,0.194500,0.101562
...,...,...,...,...,...,...,...
461,243,Early - Late,Location,0.272875,0.397500,0.011625,-0.031250
462,244,Early - Late,Location,0.229750,0.527500,0.248500,-0.046875
463,245,Early - Late,Location,-0.001750,0.320750,-0.017250,-0.062500
464,246,Early - Late,Location,0.436625,0.647750,0.162500,0.109375


## Recall probability versus feature clustering (differences)

In [161]:
corr_helper(clustering_result_diffs, x='Feature clustering score', y='Recall probability', xlists='Early - Late', ylists='Early - Late')

Correlations between Feature clustering score and Recall probability (x lists: Early - Late, y lists: Early - Late)
	Combined (across subjects): : r(179) = 0.307, p < 0.001, CI = [0.134, 0.478]


	Category (across subjects): : r(28) = 0.350, p = 0.058, CI = [0.013, 0.625]
	Size (across subjects): : r(28) = 0.708, p < 0.001, CI = [0.470, 0.865]
	Length (across subjects): : r(28) = 0.205, p = 0.276, CI = [-0.128, 0.488]
	First letter (across subjects): : r(28) = 0.081, p = 0.672, CI = [-0.454, 0.604]
	Color (across subjects): : r(29) = 0.155, p = 0.406, CI = [-0.141, 0.536]
	Location (across subjects): : r(28) = 0.052, p = 0.787, CI = [-0.274, 0.377]
	Within condition, across subjects: all $r$s $\geq 0.052$, all $p$s $\leq 0.787$


	Across conditions: : r(4) = 0.635, p = 0.176, CI = [-0.918, 0.983]


### Temporal clustering versus featuer clustering (differences)

In [162]:
corr_helper(clustering_result_diffs, x='Feature clustering score', y='Temporal clustering score', xlists='Early - Late', ylists='Early - Late')

Correlations between Feature clustering score and Temporal clustering score (x lists: Early - Late, y lists: Early - Late)
	Combined (across subjects): : r(179) = 0.426, p < 0.001, CI = [0.297, 0.541]


	Category (across subjects): : r(28) = 0.110, p = 0.564, CI = [-0.288, 0.426]
	Size (across subjects): : r(28) = 0.447, p = 0.013, CI = [0.068, 0.725]
	Length (across subjects): : r(28) = 0.482, p = 0.007, CI = [0.254, 0.720]
	First letter (across subjects): : r(28) = 0.584, p < 0.001, CI = [0.224, 0.767]
	Color (across subjects): : r(29) = 0.406, p = 0.023, CI = [0.054, 0.723]
	Location (across subjects): : r(28) = 0.498, p = 0.005, CI = [0.261, 0.693]
	Within condition, across subjects: all $r$s $\geq 0.110$, all $p$s $\leq 0.564$


	Across conditions: : r(4) = 0.649, p = 0.163, CI = [-0.849, 0.996]


# Adaptive condition

## Accuracy: comparing across each list type

### Stabilize vs. random

In [163]:
ttest(results['accuracy']['adaptive'], results['accuracy']['adaptive'], x_lists='stabilize', y_lists='random', independent_sample=False)

t(59) = 1.740, p = 0.087, d = 0.095, CI = [-0.160, 3.840]


### Destabilize vs. random

In [164]:
ttest(results['accuracy']['adaptive'], results['accuracy']['adaptive'], x_lists='destabilize', y_lists='random', independent_sample=False)

t(59) = -0.249, p = 0.804, d = -0.017, CI = [-2.360, 1.608]


### Stabilize vs. destabilize

In [165]:
ttest(results['accuracy']['adaptive'], results['accuracy']['adaptive'], x_lists='stabilize', y_lists='destabilize', independent_sample=False)

t(59) = 1.714, p = 0.092, d = 0.114, CI = [-0.283, 4.101]


## Temporal clustering: comparing across each list type

### Stabilize vs. random

In [166]:
ttest(results['fingerprint']['adaptive'], results['fingerprint']['adaptive'], x_lists='stabilize', y_lists='random', independent_sample=False, x_col='temporal', y_col='temporal')

t(59) = 3.428, p = 0.001, d = 0.306, CI = [1.465, 5.426]


### Destabilize vs. random

In [167]:
ttest(results['fingerprint']['adaptive'], results['fingerprint']['adaptive'], x_lists='destabilize', y_lists='random', independent_sample=False, x_col='temporal', y_col='temporal')

t(59) = -0.880, p = 0.382, d = -0.081, CI = [-3.054, 0.952]


### Stabilize vs. destabilize

In [168]:
ttest(results['fingerprint']['adaptive'], results['fingerprint']['adaptive'], x_lists='stabilize', y_lists='destabilize', independent_sample=False, x_col='temporal', y_col='temporal')

t(59) = 4.174, p < 0.001, d = 0.374, CI = [1.962, 6.759]


## Correlations between accuracy and temporal clustering (adaptive condition)

In [169]:
df = results['accuracy']['adaptive'].data.reset_index().query('List not in ["init"]').rename({0: 'Recall probability'}, axis=1)
df['Temporal clustering score'] = results['fingerprint']['adaptive'].data.reset_index().query('List not in ["init"]')['temporal']
df.rename({'List': 'Condition'}, axis=1, inplace=True)
df['List'] = 'All'
df

Unnamed: 0,Subject,Condition,Recall probability,Temporal clustering score,List
0,0,destabilize,0.578125,0.47100,All
1,1,destabilize,0.250000,0.34775,All
2,2,destabilize,0.515625,0.51500,All
3,3,destabilize,0.625000,0.86400,All
4,4,destabilize,0.500000,0.46650,All
...,...,...,...,...,...
235,55,stabilize,0.406250,0.58750,All
236,56,stabilize,0.703125,0.81600,All
237,57,stabilize,0.343750,0.41925,All
238,58,stabilize,0.343750,0.56250,All


In [170]:
corr_helper(df, x='Temporal clustering score', y='Recall probability', xlists='All', ylists='All')

Correlations between Temporal clustering score and Recall probability (x lists: All, y lists: All)
	Combined (across subjects): : r(178) = 0.701, p < 0.001, CI = [0.590, 0.790]


	destabilize (across subjects): : r(58) = 0.674, p < 0.001, CI = [0.459, 0.813]
	random (across subjects): : r(58) = 0.651, p < 0.001, CI = [0.413, 0.829]
	stabilize (across subjects): : r(58) = 0.784, p < 0.001, CI = [0.619, 0.880]
	Within condition, across subjects: all $r$s $\geq 0.651$, all $p$s $\leq 0.000$


	Across conditions: : r(1) = 0.998, p = 0.044, CI = [0.998, 1.000]


# Fingerprint stability analysis (feature rich)

Each participant in the feature rich condition studied and recalled a total of 16 lists, yielding 16 sets of “fingerprints” for that participant.  Below we asked: holding out one of these fingerprints at a time, could we “match up” which participant it belonged to?  Specifically, we created two distributions of correlations.  The first distribution comprised “within-participant” correlations between the fingerprint from a held-out list and the average fingerprint from all remaining lists.  Each participant contributes a total of 16 correlations to this distribution.  The second distribution comprise  “across-participant” correlations between one the fingerprint from one held-out list from one participant, and the average fingerprints (across all lists) for each other participant.  We repeat these across-participant comparisons for each pairing of lists (from the “template” participant) and other participants.  Therefore each participant contributes $16 \times (N - 1)$ correlations to this second distribution (one per list, times $N - 1$--- i.e., the number of participants excluding the template participant). 

In [210]:
# load in *per list* fingerprints for the feature rich condition
from analyze import results_file, analyze_data

raw_results, _, _ = analyze_data(savefile=results_file)
fingerprints = raw_results['fingerprint']['feature rich'].data.reset_index()

include = features
include.append('temporal')

within_corrs = []
across_corrs = []

# for each participant, for each list, compute the correlation between the fingerprint for that list and the average fingerprint for the other lists
for x in tqdm(fingerprints['Subject'].unique()):
    for y in fingerprints.query('Subject == @x')['List'].unique():
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            ref = fingerprints.query('Subject == @x and List == @y')[include].mean().values
            within = fingerprints.query('Subject == @x and List != @y')[include].mean().values
            within_corrs.append(stats.pearsonr(ref, within)[0])

            for z in fingerprints['Subject'].unique():
                if z == x:
                    continue
                across = fingerprints.query('Subject == @z')[include].mean().values
                across_corrs.append(stats.pearsonr(ref, across)[0])


100%|██████████| 67/67 [02:03<00:00,  1.85s/it]

Within list correlation: nan +/- nan
Across list correlation: nan +/- nan





In [230]:
def ttest_helper(a, b, independent=True, n_iter=1000, alpha=0.05, prefix=''):
    if independent:
        results = stats.ttest_ind(a, b)
        df = len(a) + len(b) - 2
    else:
        results = stats.ttest_rel(a, b)
        df = len(a) - 1
    t, p = results.statistic, results.pvalue
    
    # compute bootstrap-estimated 95% confidence interval
    bootstrapped = []
    n_iter = 1000
    for i in range(n_iter):
        if independent:
            bootstrapped.append(stats.ttest_ind(np.random.choice(a, len(a), replace=True), np.random.choice(b, len(b), replace=True)).statistic)
        else:
            inds = np.random.randint(0, len(a), len(b))
            bootstrapped.append(stats.ttest_rel(a[inds], b[inds]).statistic)
    
    low, high = np.percentile(bootstrapped, [alpha * 50, 100 - (alpha * 50)])

    if p < 0.001:
        p_string = 'p < 0.001'
    else:
        p_string = f'p = {p:.3f}'

    d = (np.mean(within_corrs) - np.mean(across_corrs)) / np.sqrt((np.std(within_corrs) ** 2 + np.std(across_corrs) ** 2) / 2)

    print(f'{prefix}t({df}) = {t:.3f}, {p_string}, d = {d:.3f}, CI = [{low:.3f}, {high:.3f}]')

In [232]:
within_corrs = np.array(within_corrs)
across_corrs = np.array(across_corrs)

# remove nans
within_corrs = within_corrs[~np.isnan(within_corrs)]
across_corrs = across_corrs[~np.isnan(across_corrs)]

print(f'Within list correlation: {np.mean(within_corrs):.3f} +/- {np.std(within_corrs):.3f}')
print(f'Across list correlation: {np.mean(across_corrs):.3f} +/- {np.std(across_corrs):.3f}')

ttest_helper(within_corrs, across_corrs, independent=True, prefix='Within vs. across list correlation: ')

Within list correlation: 0.481 +/- 0.362
Across list correlation: 0.399 +/- 0.386
Within vs. across list correlation: t(70280) = 6.773, p < 0.001, d = 0.217, CI = [4.847, 8.703]


# Clustering by feature rank

In [244]:
# sort each row in descending order and rename the columns to match the sort ranks

x = results_by_list['fingerprint']['feature rich'].data
x = x.groupby('Subject').mean().apply(lambda x: x.sort_values(ascending=False, ignore_index=True), axis=1)
x.columns = [f'Rank {i}' for i in range(1, len(x.columns) + 1)]
x

Unnamed: 0_level_0,Rank 1,Rank 2,Rank 3,Rank 4,Rank 5,Rank 6,Rank 7
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.759750,0.620000,0.585250,0.497000,0.487000,0.384125,0.375688
1,0.993250,0.906000,0.506625,0.464875,0.418250,0.401500,0.347750
2,0.730625,0.705000,0.530687,0.470625,0.423937,0.339438,0.283375
3,0.606187,0.576000,0.550312,0.546312,0.501312,0.369000,0.361437
4,0.675000,0.587500,0.582750,0.518000,0.469062,0.443000,0.441000
...,...,...,...,...,...,...,...
62,0.623812,0.602125,0.564813,0.464250,0.460813,0.363875,0.335500
63,0.724562,0.577125,0.435937,0.416500,0.399125,0.366250,0.358438
64,0.885250,0.762375,0.569000,0.455875,0.438375,0.408375,0.390875
65,0.723187,0.501437,0.461313,0.449437,0.440250,0.431812,0.409250


In [251]:
for rank in x.columns:
    ttest_helper(x[rank], 0.5 * np.ones_like(x[rank]), independent=False, prefix=f'{rank}: ')

Rank 1: t(66) = 12.751, p < 0.001, d = 0.217, CI = [8.681, 19.423]
Rank 2: t(66) = 8.196, p < 0.001, d = 0.217, CI = [5.078, 13.389]
Rank 3: t(66) = 3.243, p = 0.002, d = 0.217, CI = [0.927, 6.870]
Rank 4: t(66) = -3.112, p = 0.003, d = 0.217, CI = [-4.951, -1.873]
Rank 5: t(66) = -7.154, p < 0.001, d = 0.217, CI = [-12.539, -5.515]
Rank 6: t(66) = -12.608, p < 0.001, d = 0.217, CI = [-21.801, -9.199]
Rank 7: t(66) = -18.397, p < 0.001, d = 0.217, CI = [-27.244, -13.835]
