## Download MNIST

In [2]:
import gzip
import json
import os
import pickle
import urllib.request

import numpy

# Load the dataset
# The archive is fetched only when no local copy exists, so re-running the cell
# does not hit the network again. The download goes to a temporary name first
# and is renamed once complete, so an interrupted transfer never leaves a
# truncated `mnist.pkl.gz` behind.
archive_path = "mnist.pkl.gz"
if not os.path.exists(archive_path):
    partial_path = archive_path + ".part"
    urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", partial_path)
    os.replace(partial_path, archive_path)

# SECURITY NOTE: unpickling executes arbitrary code embedded in the file, and
# this archive comes from a plain-HTTP download. Only load a copy you trust.
with gzip.open(archive_path, 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')

In [3]:
import pandas as pd

In [25]:
# Build one frame holding every split: one row per image, with the flattened
# pixel vector, the name of the split it came from, and its digit label.
# Each split is converted once and the pieces are concatenated in a single
# call; growing a frame with `DataFrame.append` inside a loop copies all
# previous rows on every pass and the method no longer exists in pandas >= 2.0.
split_frames = []

for split_name, (features, labels) in zip (
    ('train_set', 'valid_set', 'test_set'),
    (train_set, valid_set, test_set),
):
    split_frames.append (pd.DataFrame ([
        {'features': feature, 'data': split_name, 'labels': a_label}
        for feature, a_label in zip (features, labels)
    ]))

# No `ignore_index`: like the original append chain, each split keeps its own
# 0-based row index.
df = pd.concat (split_frames)

In [30]:
# Write the combined frame to Parquet; `index=False` leaves the row index out of the file.
# NOTE(review): assumes the `data/` directory already exists — confirm.
df.to_parquet ('data/mnist_labeled.parquet', index=False)

In [None]:
# FIXME(review): unfinished cell — the `def` below has neither a name nor a body,
# and `f` is not defined anywhere in the visible notebook, so this cell cannot
# run as written. Complete the function or delete the cell.
def 

df.apply (f, axis=1, raw=True)

In [2]:
# Side length of one image: integer square root of the number of columns
# (pixels per flattened row) in the training feature matrix.
mnist_image_dim = numpy.sqrt (train_set[0].shape[1]).astype (numpy.int64)

In [5]:
# Unpack the training split into its two components.
digits, labels = train_set

## Generate Bernoulli columns

In [3]:
import numpy as np
from scipy.ndimage import convolve, zoom
# Short aliases for the NumPy functions used in the formulas below.
sqrt, log = np.sqrt, np.log

In [4]:
def LIL_estimate (count, delta=1 / 64):
    """Pick the number of halving levels and the matching failure budget.

    Starting from a keep-fraction of 1/2 and the given `delta`, each extra
    level halves the fraction and divides the budget by 3. Levels are added
    while

        count * frac + sqrt (2 * frac * (1 - frac) * count * log (log (count)))

    still exceeds ``-192 * log (delta)`` for the current budget.
    (The name suggests a law-of-the-iterated-logarithm bound — TODO confirm.)

    Parameters
    ----------
    count : number of items being subsampled.
    delta : initial budget, divided by 3 for every level added.

    Returns
    -------
    (levels, delta) : the level count (at least 1) and the budget reached at
        the last level, as a ``np.float64``.
    """
    levels = 1
    level_delta = np.float64 (delta)
    keep_frac = np.float64 (.5)

    # Depends on `count` only, so it is evaluated once up front.
    loglog_count = log (log (count))

    while True:
        kept = count * keep_frac
        spread = sqrt (2 * keep_frac * (1 - keep_frac) * count * loglog_count)
        threshold = - 192 * log (level_delta)

        # Written as `not (a > b)` so a NaN on either side stops the loop,
        # exactly as a plain `while a > b` would.
        if not ((kept + spread) > threshold):
            break

        levels += 1
        level_delta /= 3.
        keep_frac *= .5

    return levels, level_delta

In [7]:
# Top-level budget, and the number of training rows it applies to.
delta = .1
digits_len = len (digits)

# Depth of the halving schedule and the budget left at the deepest level.
N_max, deep_delta = LIL_estimate (count=digits_len, delta=delta)
(N_max, deep_delta)

(6, 0.00041152263374485596)

In [8]:
# Shorthand for NumPy's global uniform [0, 1) sampler.
# NOTE(review): no seed is set in the visible cells, so draws differ between runs.
rand = np.random.rand

In [9]:
# Upscaling factor applied to each image before pooling.
zoom_factor = 3.0

# Edge length of the square pooling window, and the resulting pooled image side.
window_size = 12
new_image_dim = int (mnist_image_dim * zoom_factor) // window_size

def sample_normalize_cols (id_vec):
    """Turn one `(id, flat_image)` pair into `(level_mask, (id, feature_vec))`.

    The flat image is reshaped to `mnist_image_dim` x `mnist_image_dim`,
    upscaled by `zoom_factor`, averaged over non-overlapping
    `window_size` x `window_size` windows, flattened, and scaled to unit
    Euclidean norm.

    `level_mask` has `N_max` boolean entries: entry `k` is true only if the
    first `k + 1` independent fair coin flips all came up true, so every true
    entry is preceded only by true entries.

    Parameters
    ----------
    id_vec : pair `(a_id, vec)`; `vec` must hold `mnist_image_dim ** 2` values.

    Returns
    -------
    (level_mask, (a_id, new_vec))
    """
    a_id, vec = id_vec

    # Uses `zoom_factor` (not a repeated literal) so the zoom always agrees
    # with the `new_image_dim` computed from the same constant.
    new_vec = zoom (vec.reshape (
        mnist_image_dim, mnist_image_dim
    ), zoom_factor).reshape (
        new_image_dim, window_size, new_image_dim, window_size
    ).sum (axis=3).sum (axis=1).reshape (-1)

    # Window sum -> window mean, then unit Euclidean norm.
    new_vec /= window_size * window_size
    new_vec /= np.linalg.norm (new_vec)

    return (
        # Built-in `bool`: the `np.bool` alias was removed in NumPy 1.24.
        (rand (N_max) < .5).cumprod ().astype (bool),
        (a_id, new_vec)
    )

In [12]:
# Run every element through `sample_normalize_cols` and keep the result cached.
# NOTE(review): `rdd_digits` is not defined in the visible cells — presumably an
# RDD of (id, flattened image) pairs; confirm where it is created.
rdd_subsampled = rdd_digits.map (sample_normalize_cols).persist ()

In [13]:
# Sanity check: shape of the feature vector carried by the first element.
rdd_subsampled.take (1)[0][-1][-1].shape

(49,)

# Loop to sample with ridge leverage

In [14]:
from gc import collect

from scipy.linalg import pinvh, eigh, cholesky

import numba
import numba.types as ntypes

## Kernel functions

In [15]:
@numba.jit (
    ntypes.float64 (ntypes.float64[:], ntypes.float64[:]),
    nopython=True, nogil=True
)
def kernel_func (x, y):
    """Linear kernel: return the dot product of the 1-D float64 vectors `x` and `y`."""
    
    return x.dot (y)

@numba.jit (
    ntypes.float64[:, :] (
        ntypes.float64[:, :], ntypes.float64, 
        ntypes.float64[:, :], ntypes.float64[:]
    ),
    looplift=True, nogil=True, nopython=True
)
def fast_kernel_gram (X, a_lamb, out, iknorms_X):
    """Fill `out` with the regularised Gram matrix of the rows of `X`.

    Off-diagonal entries are `kernel_func (X[i], X[j])` divided by the kernel
    norms of both rows; diagonal entries are the raw
    `kernel_func (X[i], X[i]) + a_lamb`.
    NOTE(review): the diagonal is not normalised, so the two conventions only
    agree when every row has unit kernel norm — confirm that is intended.

    `iknorms_X` is scratch space that receives ``1 / sqrt (k (X[i], X[i]))``.
    Returns `out`.
    """
    n_rows = X.shape[0]

    for row in range (n_rows):
        self_k = kernel_func (X[row], X[row])
        iknorms_X[row] = 1 / np.sqrt (self_k)
        out[row, row] = self_k + a_lamb

        # Lower triangle only; each value is mirrored into the upper triangle.
        for col in range (row):
            scaled_k = kernel_func (X[row], X[col]) * iknorms_X[row] * iknorms_X[col]
            out[col, row] = scaled_k
            out[row, col] = scaled_k

    return out

def kernel_gram (X, a_lamb=0.0):
    """Allocate the buffers and return `fast_kernel_gram`'s result for `X`.

    `X` is a 2-D float64 array with one sample per row; `a_lamb` is added to
    the diagonal of the returned square matrix.
    """
    n_rows = X.shape[0]
    gram_buffer = np.empty ((n_rows, n_rows))
    inv_norm_buffer = np.empty (n_rows)

    return fast_kernel_gram (X, a_lamb, gram_buffer, inv_norm_buffer)

@numba.jit (
    ntypes.float64[:, :] (
        ntypes.float64[:, :], ntypes.float64[:, :], 
        ntypes.float64[:, :], ntypes.float64[:]
    ),
    looplift=True, nogil=True, nopython=True
)
def fast_kernel_cross (X, Y, out, iknorms_Y):
    """Fill `out[i, j]` with the norm-scaled kernel between `X[i]` and `Y[j]`.

    Each entry is `kernel_func (X[i], Y[j])` divided by the kernel norms of
    both rows. `iknorms_Y` is scratch space that receives
    ``1 / sqrt (k (Y[j], Y[j]))``. Returns `out`.
    """
    n_cols = Y.shape[0]

    # Inverse norms of the Y rows are shared by every row of X.
    for col in range (n_cols):
        iknorms_Y[col] = 1 / np.sqrt (kernel_func (Y[col], Y[col]))

    for row in range (X.shape[0]):
        x_row = X[row]
        inv_norm_x = 1 / np.sqrt (kernel_func (x_row, x_row))

        for col in range (n_cols):
            out[row, col] = kernel_func (x_row, Y[col]) * inv_norm_x * iknorms_Y[col]

    return out

def kernel_cross (X, Y):
    """Allocate the buffers and return `fast_kernel_cross`'s result.

    The result has one row per row of `X` and one column per row of `Y`.
    """
    n_x, n_y = X.shape[0], Y.shape[0]
    cross_buffer = np.empty ((n_x, n_y))
    inv_norm_buffer = np.empty (n_y)

    return fast_kernel_cross (X, Y, cross_buffer, inv_norm_buffer)

## Leverage Score

In [16]:
@numba.jit (
    ntypes.float64 (ntypes.float64[:], ntypes.float64[:], ntypes.float64[:, :]),
    nopython=True, nogil=True
)
def leverage_scores (x, Kx, isqrt_G):
    """Single-point score: ``kernel_func (x, x) - ||isqrt_G @ Kx||^2``.

    `x` is one sample, `Kx` a 1-D vector and `isqrt_G` a 2-D matrix.
    NOTE(review): presumably `Kx` holds the kernel values between `x` and the
    base sample and `isqrt_G` is a square-root factor of the inverse
    regularised Gram matrix, as in `pair_leverages_sample` — confirm.
    """
    # Project the kernel column through the factor.
    x_Kproj = isqrt_G.dot (Kx)    

    return kernel_func (x, x) - x_Kproj.dot (x_Kproj)

@numba.jit (
    ntypes.float64[:] (
        ntypes.float64[:, :], ntypes.float64[:, :], 
        ntypes.float64[:, :], ntypes.float64[:]
    ),
    looplift=True, nogil=True, nopython=True
)
def fast_mat_leverage_score (X, KX, isqrt_G, out):
    """Batched form of `leverage_scores`: one score per row of `X`.

    Writes ``kernel_func (X[i], X[i]) - ||isqrt_G @ KX[i]||^2`` into `out[i]`
    and returns `out`.
    """
    # Column i of the product is isqrt_G applied to row i of KX.
    X_Kproj_T = isqrt_G.dot (KX.T)
    # Square element-wise in place, then sum each column to get squared norms.
    X_Kproj_T *= X_Kproj_T
    X_Kproj_dot = X_Kproj_T.sum (axis=0)
    
    for i in range (X.shape[0]):
        out[i] = kernel_func (X[i], X[i]) - X_Kproj_dot[i]

    return out

def mat_leverage_score (X, KX, isqrt_G):
    """Allocate the output vector and return `fast_mat_leverage_score`'s result.

    The returned 1-D array has one entry per row of `X`.
    """
    score_buffer = np.empty (X.shape[0])
    return fast_mat_leverage_score (X, KX, isqrt_G, score_buffer)

## Leverage Map

In [17]:
def pair_leverages_sample (rows, base_sample, a_lamb=None):
    """Score one partition of `(id, vector)` rows against `base_sample`.

    Intended as the body of an RDD `mapPartitions` call.

    Parameters
    ----------
    rows : iterable of `(a_id, vec)` pairs (only the first and last item of
        each row are read).
    base_sample : 2-D array with one reference sample per row.
    a_lamb : regularisation added to the Gram diagonal and used to scale the
        scores. When omitted, the notebook-level `lamb` is used, which is the
        original behaviour.

    Returns
    -------
    An iterator of `(leverage, (a_id, vec))` with one item per input row, where
    ``leverage = 1.5 / a_lamb * (k (x, x) - k_x^T inv_gram k_x)`` and
    `inv_gram` is the pseudo-inverse of the regularised Gram matrix.
    An empty partition yields an empty iterator.
    """
    import numpy as np

    rows_ids_values = list (rows)

    # Partitions left empty by an upstream filter have nothing to score, and
    # `np.vstack` raises on an empty sequence, so stop here.
    if not rows_ids_values:
        return iter (())

    if a_lamb is None:
        a_lamb = lamb

    rows_ids = np.fromiter (
        (row[0] for row in rows_ids_values), 
        dtype=np.int64
    )

    # A real list: current NumPy refuses to stack a bare generator.
    rows_mat = np.vstack (
        [row[-1] for row in rows_ids_values]
    ).astype (np.float64)

    del rows_ids_values

    # gram evaluation
    sample_km = kernel_gram (base_sample, a_lamb)
    isample_km = pinvh (sample_km); del sample_km

    # cholesky of inverse gram evaluation: the upper factor U satisfies
    # U.T @ U == inverse gram, so ||U @ k||^2 == k^T inv_gram k
    km_isqrt = cholesky (isample_km, overwrite_a=True, check_finite=False)

    # cross kernel evaluation
    cross_km = kernel_cross (rows_mat, base_sample)

    leverages = mat_leverage_score (rows_mat, cross_km, km_isqrt)
    del cross_km, km_isqrt, isample_km

    leverages *= 1.5 / a_lamb

    # Release the large temporaries before handing rows back.
    collect ()

    return (
        (leverage, (a_id, row)) 
        for a_id, leverage, row 
        in zip (rows_ids, leverages, rows_mat[:])
    )

## First Sample

In [18]:
# Seed sample: the rows whose level mask is true at the last (deepest) level.
base_sample_ids_values = rdd_subsampled.filter (lambda row: row[0][-1]).values ().collect ()

# Stack from a real list: current NumPy refuses to stack a bare generator.
base_sample = np.vstack ([
    value for _, value in base_sample_ids_values
]).astype (np.float64)

base_sample_ids = np.fromiter (
    (a_id for a_id, _ in base_sample_ids_values),
    dtype=np.int64,
)

del base_sample_ids_values

collect ()

# Regularisation strength used by the leverage computation.
lamb = .01
# The loop that follows starts one level above the deepest one, with the
# budget of that level (each level up multiplies the budget by 3).
i = N_max - 2
delta_curr = deep_delta * 3

## Subsampled loop

In [19]:
def _grow_base_sample (rdd_rows, sample, sample_ids, a_delta):
    """Score `rdd_rows` against `sample` and append the rows that get picked.

    Every `(id, vector)` row is scored with `pair_leverages_sample`; a row is
    kept when a uniform draw falls below
    ``leverage * log (sum_of_leverages / a_delta)``. The sum, the mean and the
    log factor are printed. The cached RDD of scores is released even when a
    Spark job fails or is cancelled.

    Returns the enlarged `(sample, sample_ids)`.
    """
    rdd_scored = rdd_rows.mapPartitions (
        lambda rows: pair_leverages_sample (rows, sample),
        preservesPartitioning=True
    ).persist ()

    try:
        leverage_sum = rdd_scored.keys ().sum ()
        leverage_mean = rdd_scored.keys ().mean ()
        log_leverage_sum = log (leverage_sum / a_delta)

        print (leverage_sum, leverage_mean, log_leverage_sum)

        picked_ids_values = rdd_scored.filter (lambda row:
            rand () < row[0] * log_leverage_sum
        ).values ().collect ()
    finally:
        rdd_scored.unpersist ()

    sample_ids = np.hstack ((
        sample_ids,
        *(a_id for a_id, _ in picked_ids_values)
    ))

    sample = np.vstack ((
        sample,
        *(value for _, value in picked_ids_values)
    )).astype (np.float64)

    del picked_ids_values
    collect ()

    return sample, sample_ids


print (base_sample.shape)

# Walk the levels from deep to shallow; level `i` keeps the rows whose mask is
# true at index `i`, and the budget triples with every level climbed.
while i >= 0:
    base_sample, base_sample_ids = _grow_base_sample (
        # `level=i` pins the current level in the predicate itself.
        rdd_subsampled.filter (lambda row, level=i: row[0][level]).values (),
        base_sample, base_sample_ids, delta_curr,
    )

    i -= 1
    delta_curr *= 3

    print (base_sample.shape)

# Final pass over every row, with no level filter.
base_sample, base_sample_ids = _grow_base_sample (
    rdd_subsampled.values (),
    base_sample, base_sample_ids, delta_curr,
)

print (base_sample.shape)

(806, 49)
133.28688257387546 0.08087796272686616 11.58953806471177
(1984, 49)
95.04302748185847 0.03197948434786624 10.15275166892289
(2880, 49)
122.4805965425398 0.02051257687867021 9.30776229219804
(4014, 49)
165.38816222589227 0.013692206492747095 8.509492591109387
(5517, 49)
236.75244828948806 0.009658240455655697 7.76960016625064
(7394, 49)


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job 18 cancelled 
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1651)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1838)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1821)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1810)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:165)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor48.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


## Full loop

In [None]:
# Start from the sample and ids produced by the subsampled loop. These are new
# names for the same arrays; the stacking below builds new arrays each pass.
base_sample_a = base_sample
base_sample_a_ids = base_sample_ids

print (base_sample_a.shape)

# Twenty rounds of: score every row against the current sample, then append
# the rows picked by a uniform draw against their score.
for i in range (20):
    rdd_leverages = rdd_subsampled.values ().mapPartitions (
        lambda rows: pair_leverages_sample (rows, base_sample_a),
        preservesPartitioning=True
    ).persist ()

    leverage_sum = rdd_leverages.keys ().sum ()

    leverage_mean = rdd_leverages.keys ().mean ()

    # NOTE(review): this value is only printed here; the filter below compares
    # against the bare score `row[0]`, unlike the subsampled loop. It also uses
    # `delta`, not `delta_curr`. Confirm both are intended.
    log_leverage_sum = 1 * log (leverage_sum / delta)

    print (leverage_sum, leverage_mean, log_leverage_sum)

    base_sample_a_ids_values = rdd_leverages.filter (lambda row:
        rand () < row[0]
    ).values ().collect ()
    
    base_sample_a = np.vstack ((
        base_sample_a,
        *(value for _, value in base_sample_a_ids_values)
    )).astype (np.float64)

    base_sample_a_ids = np.hstack ((
        base_sample_a_ids,
        *(a_id for a_id, _ in base_sample_a_ids_values)
    ))
    
    del base_sample_a_ids_values
    rdd_leverages.unpersist ();
    
    collect ()

    # NOTE(review): the next three statements look like leftovers from the
    # subsampled loop. The `for` statement rebinds `i` on every pass, so the
    # two assignments to `i` do not change the iteration, and `delta_curr` is
    # not read anywhere in this loop.
    i -= 1
    i = i if i > 0 else 0
    delta_curr *= 3
    
    print (base_sample_a.shape)

# Leverages Stats

In [None]:
from pyspark import Row

In [None]:
# Score every row against the final sample and keep the scored rows cached.
rdd_leverages = (
    rdd_subsampled
    .values ()
    .mapPartitions (
        lambda rows: pair_leverages_sample (rows, base_sample_a),
        preservesPartitioning=True,
    )
    .persist ()
)

In [None]:
def _as_leverage_row (val):
    """Wrap one score in a `Row`, converted to a plain Python float."""
    return Row (leverage_score=float (val))

# Single-column Spark DataFrame holding only the scores.
df_leverages = rdd_leverages.keys ().map (_as_leverage_row).toDF ()

In [None]:
# Print Spark's summary statistics for the score column.
df_leverages.describe ().show ()

In [None]:
# Approximate quantiles of the scores at 0.0, 0.1, ..., 1.0, requested with a
# relative error of 0.01.
df_leverages.approxQuantile (
    'leverage_score',
    np.linspace (0, 1, 11).tolist (),
    .01
)

# Class splitting