In [32]:
from sklearn.datasets import make_classification
from scipy.stats import pearsonr
from sklearn import svm
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from tqdm.notebook import tqdm
import pingouin as pg

In [33]:
def generate_data(separation, seed=None):
    """Draw one synthetic two-class dataset with the requested class separation.

    Thin wrapper around ``sklearn.datasets.make_classification`` configured for
    100 samples and 16 features, all of them informative (no redundant
    features), one cluster per class, no label flipping (``flip_y=0.``) and
    ``shuffle=False``.

    Parameters
    ----------
    separation : float
        Forwarded as ``class_sep``.
    seed : int or None, optional
        Forwarded as ``random_state``.

    Returns
    -------
    tuple
        The ``(x, y)`` pair produced by ``make_classification``.
    """
    features, labels = make_classification(
        n_samples=100,
        n_features=16,
        n_informative=16,
        n_redundant=0,
        n_classes=2,
        n_clusters_per_class=1,
        class_sep=separation,
        flip_y=0.,
        shuffle=False,
        random_state=seed,
    )
    return features, labels


def trainandtest_svm(x, y, shuffle=True, seed=42):
    """Fit a linear SVM on an 80/20 split and score the held-out samples.

    The first 80% of the samples (rounded up) are used for training and the
    remainder for testing. The score is the mean, over the held-out samples,
    of their z-scored distance to the decision boundary, with the sign
    flipped for samples whose label is 0.

    Parameters
    ----------
    x : numpy.ndarray
        Sample matrix, indexed by sample along the first axis.
    y : numpy.ndarray
        Labels, one per row of ``x``.
    shuffle : bool, optional
        If True, permute the samples (and labels) before splitting.
    seed : int, optional
        Seed of the generator used for that permutation.

    Returns
    -------
    float
        Mean signed, z-scored distance to the bound on the held-out samples.
    """
    # NOTE(review): this standardises with a single mean/std over the whole
    # array (no axis), and the test set is normalised with its own statistics
    # rather than the training set's -- confirm that this is intended.
    def zscore(values):
        return (values - np.mean(values))/np.std(values)

    n_train = np.ceil(0.8*len(x)).astype(int)

    if shuffle:
        rng = np.random.default_rng(seed=seed)
        shuffled_indices = rng.permutation(len(x))
        x = x[shuffled_indices]
        y = y[shuffled_indices]

    train_x, test_x = x[:n_train], x[n_train:]
    train_y, test_y = y[:n_train], y[n_train:]

    train_z = zscore(train_x)
    test_z = zscore(test_x)

    clf = svm.SVC(kernel='linear')
    clf.fit(train_z, train_y)

    # Distance from the bound: decision value divided by the weight norm.
    decision_values = clf.decision_function(test_z)
    dist = decision_values / np.linalg.norm(clf.coef_)

    # Flip the sign for label-0 samples before averaging.
    zscoredist = zscore(dist)
    zscoredist[test_y == 0] *= -1

    return np.mean(zscoredist)

In [34]:
def gen_activations_and_separations(n, min_act=0., max_act=3.6, slope=0.6, intercept=0., sigma=0.01):
    """Draw ``n`` activations and the noisy, linearly related separations.

    Each activation is drawn with ``random.uniform(min_act, max_act)``; its
    separation is ``slope*activation + intercept`` plus a draw from
    ``np.random.normal(0., sigma)``. Both draws use the module-level global
    random states, so results depend on how those were seeded.

    Returns
    -------
    tuple of (list, list)
        ``(activations, separations)``, each of length ``n``.
    """
    # The uniform and normal draws come from two independent global streams
    # (``random`` and ``np.random``), so drawing all activations first yields
    # the same values as interleaving the two calls.
    activations = [random.uniform(min_act, max_act) for _ in range(n)]
    separations = [slope*act + intercept + np.random.normal(0., sigma)
                   for act in activations]
    return activations, separations

In [35]:
def separation_to_accuracy(separations):
    """Generate one dataset per separation value and score it with the SVM.

    For every entry of ``separations`` a dataset is drawn with
    ``generate_data`` (no seed passed) and handed to ``trainandtest_svm``
    with its default arguments. Despite the function's name, each entry of
    the result is whatever ``trainandtest_svm`` returns.

    Returns
    -------
    list
        One score per separation, in input order.
    """
    return [trainandtest_svm(*generate_data(sep)) for sep in separations]

In [36]:
def random_split(ls, n_partitions, seed=0):
    """Shuffle ``ls`` and cut it into ``n_partitions`` equally sized lists.

    The shuffle order comes from ``np.random.default_rng(seed)``, so the same
    length and seed always give the same assignment of positions to
    partitions.

    Parameters
    ----------
    ls : sequence
        Indexable collection; its length must be divisible by
        ``n_partitions`` (checked with an ``assert``).
    n_partitions : int
        Number of partitions to produce.
    seed : int, optional
        Seed for the permutation.

    Returns
    -------
    list of list
        ``n_partitions`` lists holding consecutive chunks of the shuffled items.
    """
    n_items = len(ls)
    assert n_items % n_partitions == 0
    chunk = n_items // n_partitions

    order = np.random.default_rng(seed=seed).permutation(n_items)
    permuted = [ls[idx] for idx in order]

    return [permuted[k*chunk:(k + 1)*chunk] for k in range(n_partitions)]

In [37]:
def split_datasets(datasets, n_partitions=4, seed=0):
    """Randomly group ``(x, y)`` datasets and merge each group into one dataset.

    The datasets are partitioned with ``random_split``; within each partition
    the ``x`` arrays are stacked with ``np.vstack`` and the ``y`` arrays
    joined with ``np.concatenate``.

    Returns
    -------
    list of tuple
        One merged ``(x, y)`` pair per partition.
    """
    merged = []
    for group in random_split(datasets, n_partitions, seed=seed):
        stacked_x = np.vstack([x for x, _ in group])
        joined_y = np.concatenate([y for _, y in group])
        merged.append((stacked_x, joined_y))
    return merged

In [38]:
activations, separations = gen_activations_and_separations(n=1000, sigma=0.001)

# Fold each flat list of 1000 draws into a 2D array of 100 'sequences' (rows)
# with 10 entries each (columns).
activations_2d, separations_2d = (
    np.array(values).reshape(100, 10) for values in (activations, separations)
)

In [45]:
# Collects, for each timestep, the list of per-partition scores returned by
# trainandtest_svm.
distances_per_timestep = []

for t in tqdm(range(10)):
    # Column t: one class-separation value per sequence (row).
    these_seps = separations_2d[:, t]
    # Every dataset generated at this timestep is drawn with random_state=t.
    this_ds_list = [generate_data(s, seed=t) for s in these_seps]
    # Merge the datasets into 4 partitions; the seed is fixed at 0 for every
    # timestep.
    this_ds_partition = split_datasets(this_ds_list, n_partitions=4,
                                       seed=0)
    dist_per_partition = [trainandtest_svm(x, y) for x, y in this_ds_partition]
    distances_per_timestep.append(dist_per_partition)

# Transpose from (timesteps, partitions) to (partitions, timesteps).
distances_per_timestep = np.array(distances_per_timestep).T

  0%|          | 0/10 [00:00<?, ?it/s]

In [46]:
# Shape check: rows are partitions, columns are timesteps (after the transpose).
distances_per_timestep.shape

(4, 10)

In [66]:
# Same partition count and seed as the split_datasets call in the timestep
# loop, so the row-to-partition assignment is the same permutation.
activ_partitions = random_split(activations_2d, n_partitions=4, seed=0)
# Mean activation per timestep within each partition, one row per partition.
activ_per_timestep = np.vstack(
    [np.vstack(rows).mean(axis=0) for rows in activ_partitions]
)

In [59]:
# Shape check: one row per partition, one column per timestep.
activ_per_timestep.shape

(4, 10)

In [67]:
# Correlate, across timesteps, the partition-averaged distance score with the
# partition-averaged activation.
pearsonr(distances_per_timestep.mean(axis=0), activ_per_timestep.mean(axis=0))

(0.71311620622371, 0.020602239745932525)

In [57]:
# Inspect the per-timestep mean activation of the first partition only.
np.mean(np.vstack(activ_partitions[0]), axis=0)

array([1.7057959 , 1.54542701, 1.68846235, 1.82222203, 1.69999018,
       2.00676122, 1.78212253, 1.44425447, 1.85590893, 1.59306922])

In [23]:
# Scratch inspection: the separations column last assigned inside the timestep loop.
these_seps

array([0.72655843, 0.33891983, 1.27219577, 1.37579568, 0.51060143,
       0.90373207, 2.12097436, 1.71332475, 1.70586994, 1.5907184 ,
       2.15669107, 0.84842538, 1.13246183, 1.30881953, 0.61061269,
       0.07786286, 0.90551103, 1.0272264 , 0.3500482 , 1.54164962,
       1.68103269, 0.94392848, 0.76169013, 0.29383876, 1.78394549,
       0.31215392, 0.05548082, 1.94209729, 1.30577095, 0.80481233,
       1.51716638, 0.3256873 , 1.49638963, 0.45818935, 1.76105187,
       1.55739212, 0.1552157 , 1.01272907, 0.20988017, 1.104654  ,
       0.6198075 , 1.78955237, 0.8416445 , 1.00388408, 1.15072974,
       0.57645349, 0.4376151 , 1.96071177, 1.12225996, 0.13665137,
       2.08516275, 1.08266006, 1.22411242, 1.94726071, 0.8402276 ,
       1.94905163, 1.83772973, 0.67553006, 1.68173806, 1.92405153,
       1.82183464, 0.58505582, 2.03799549, 0.25230365, 2.08259412,
       1.71299973, 1.58228609, 2.12028411, 1.71738626, 1.00466554,
       0.30096072, 1.29661698, 0.68372996, 0.87310342, 1.21808

In [20]:
# Scratch inspection: each merged partition from split_datasets is an (x, y) tuple.
type(this_ds_partition[0])

tuple

In [12]:
# Scratch inspection: one generated dataset per entry of these_seps.
len(this_ds_list)

100

In [9]:
# Scratch inspection of these_seps (repeats the same check made in a cell further up).
these_seps

array([0.72655843, 0.33891983, 1.27219577, 1.37579568, 0.51060143,
       0.90373207, 2.12097436, 1.71332475, 1.70586994, 1.5907184 ,
       2.15669107, 0.84842538, 1.13246183, 1.30881953, 0.61061269,
       0.07786286, 0.90551103, 1.0272264 , 0.3500482 , 1.54164962,
       1.68103269, 0.94392848, 0.76169013, 0.29383876, 1.78394549,
       0.31215392, 0.05548082, 1.94209729, 1.30577095, 0.80481233,
       1.51716638, 0.3256873 , 1.49638963, 0.45818935, 1.76105187,
       1.55739212, 0.1552157 , 1.01272907, 0.20988017, 1.104654  ,
       0.6198075 , 1.78955237, 0.8416445 , 1.00388408, 1.15072974,
       0.57645349, 0.4376151 , 1.96071177, 1.12225996, 0.13665137,
       2.08516275, 1.08266006, 1.22411242, 1.94726071, 0.8402276 ,
       1.94905163, 1.83772973, 0.67553006, 1.68173806, 1.92405153,
       1.82183464, 0.58505582, 2.03799549, 0.25230365, 2.08259412,
       1.71299973, 1.58228609, 2.12028411, 1.71738626, 1.00466554,
       0.30096072, 1.29661698, 0.68372996, 0.87310342, 1.21808

In [7]:
def simulate_splits(sameseed=True):
    """Run one simulation and return the activation/distance correlation.

    Draws 1000 activation/separation pairs, arranges them as 100 sequences of
    10 values, randomly partitions the sequences into 4 groups and averages
    within each group. The averaged separations are converted to distance
    scores with ``separation_to_accuracy``; the function then correlates the
    across-partition mean activation with the across-partition mean score.

    Parameters
    ----------
    sameseed : bool, optional
        If True, separations and activations are partitioned with the same
        seed; otherwise the activations get a second, independently drawn seed.

    Returns
    -------
    float
        First element of the ``pearsonr`` result (the correlation coefficient).
    """
    # Partition seeds come from the global ``random`` state.
    seed1 = random.randint(100, 999)
    seed2 = seed1 if sameseed else random.randint(100, 999)

    activations, separations = gen_activations_and_separations(1000, sigma=0.001)

    # Reshape into 2D arrays with 'sequences' of activations and separations
    act_grid = np.array(activations).reshape(100, 10)
    sep_grid = np.array(separations).reshape(100, 10)

    def partition_means(grid, seed):
        # Mean over the sequences of each partition, one vector per partition.
        return [np.mean(chunk, axis=0) for chunk in random_split(grid, 4, seed=seed)]

    sep_splits = partition_means(sep_grid, seed1)
    act_splits = partition_means(act_grid, seed2)
    dis_splits = [np.array(separation_to_accuracy(sep)) for sep in sep_splits]

    avg_act = np.vstack(act_splits).mean(axis=0)
    avg_dis = np.vstack(dis_splits).mean(axis=0)

    return pearsonr(avg_act, avg_dis)[0]

In [8]:
# Flat variant (no sequence structure): one dataset and one score per draw.
# NOTE: this rebinds the notebook-level names `activations` and `separations`.
activations, separations = gen_activations_and_separations(n=1000, sigma=0.01)
distances = separation_to_accuracy(separations=separations)

In [None]:
# The 1000 activations fold into 100 sequences x 10 values; they cannot fill a
# (1000, 10) array, which would need 10000 values and makes reshape raise
# ValueError. The 4 shuffled partitions are stacked back into one 2D array.
split_activations = np.vstack(
    random_split(np.array(activations).reshape(100, 10), 4)
)

In [None]:
# Shape check of the stacked activation partitions.
split_activations.shape

In [None]:
# One dataset per separation, merged into 4 partitions and scored per partition.
datasets = [generate_data(s) for s in separations]
split_ds = split_datasets(datasets, n_partitions=4)
split_distances = [trainandtest_svm(x, y) for x, y in split_ds]
# Mean activation per partition. random_split's default seed (0) equals
# split_datasets' default, and both inputs have one entry per separation, so
# partition k here holds the activations of the datasets merged into split_ds[k].
split_activations = [np.mean(part) for part in random_split(activations, 4)]

In [None]:
# Inspect the per-dataset scores returned by separation_to_accuracy.
distances

In [None]:
# Inspect the merged partitions (`split_ds`); `split_datasets` is the function
# that builds them and cannot be subscripted.
type(split_ds[0][0])

In [None]:
# Scatter of each drawn activation against the score of its dataset, with
# labelled axes so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(activations, distances)
ax.set(xlabel='activation',
       ylabel='mean signed z-scored distance to bound',
       title='SVM distance to bound vs. activation');

In [None]:
# Single trial run with independently drawn partition seeds.
simulate_splits(sameseed=False)

In [None]:
# Repeat the simulation 1000 times per condition and keep each correlation.
# simulate_splits draws from the global `random` / `np.random` states, which
# are not seeded in this cell, so the collected values vary between runs.
print('Simulating same seed...')
sameseed_corrs = [simulate_splits(sameseed=True) for _ in tqdm(range(1000))]
print('Simulating different seed...')
diffseed_corrs = [simulate_splits(sameseed=False) for _ in tqdm(range(1000))]

In [None]:
# Average correlation over the runs that used the same partition seed.
np.mean(sameseed_corrs)

In [None]:
# Average correlation over the runs that used different partition seeds.
np.mean(diffseed_corrs)