In [1]:
import numpy as np
from pyspark import RDD
from collections import namedtuple

class MultilabledPoint:
    """One example: a collection of labels together with its feature vector."""

    # Restrict instances to exactly these two attributes (no per-instance __dict__).
    __slots__ = ['lables', 'features']

    def __init__(self, lables, features):
        """Store ``lables`` and ``features`` on the instance as given."""
        self.lables, self.features = lables, features

In [2]:
from collections import namedtuple
# Immutable record pairing a point's labels with its features. This rebinds the
# name MultilabledPoint, so any earlier definition with the same name is shadowed.
# The `verbose` keyword is intentionally not passed: it defaulted to False and
# was removed from namedtuple() in Python 3.7, where supplying it raises TypeError.
MultilabledPoint = namedtuple('MultilabledPoint', ['lables', 'features'])

In [3]:
# Ten sample points. The label triples cycle through [1,2,3], [2,3,4], [3,4,5],
# [4,5,6]; every point gets its own array holding the same four feature values.
data = [MultilabledPoint([first, first + 1, first + 2], np.array([1, 2, 0, 4]))
        for first in (1 + idx % 4 for idx in range(10))]

In [4]:
# Turn the in-memory sample list into an RDD.
# NOTE(review): `sc` is not defined in any visible cell — presumably the
# SparkContext injected by the PySpark kernel/shell; confirm before a fresh run.
rdd = sc.parallelize(data)

In [5]:
class Model:
    """Placeholder model: its prediction does not depend on the input."""

    def predict(self, x):
        """Return a new list holding the same two hard-coded 2-tuples; ``x`` is ignored."""
        first, second = (1, .09), (2, .091)
        return [first, second]

In [6]:
class ModelBuilder:
    """Factory that produces Model instances."""

    @classmethod
    def train(cls, data: RDD):
        """Return a fresh Model; ``data`` is accepted but not inspected here."""
        trained = Model()
        return trained

In [7]:
def shuffle_and_split(data: RDD, fold_n: int, seed = 0):
    """Randomly partition ``data`` into ``fold_n`` equally weighted folds.

    Args:
        data: RDD to split.
        fold_n: Number of folds; must be at least 1.
        seed: Seed handed to ``randomSplit`` so that repeated calls with the
            same seed request the same split.

    Returns:
        Whatever ``data.randomSplit`` returns for ``fold_n`` equal weights
        (for a PySpark RDD, a list of ``fold_n`` RDDs).

    Raises:
        ValueError: If ``fold_n`` is smaller than 1.
    """
    if fold_n < 1:
        raise ValueError("fold_n must be a positive integer, got %r" % (fold_n,))
    fold_weights = [1 / fold_n] * fold_n
    # Forward the seed; without it the parameter would have no effect on the split.
    return data.randomSplit(fold_weights, seed)

In [12]:
class Metric:
    """Base class for metrics that keep a running list of evaluation scores.

    Args:
        name: Identifier of the metric, exposed through ``name``.
        verbose: Stored as ``_verbose`` for subclasses to consult (for
            example to decide whether to print each score).
    """

    def __init__(self, name: str, verbose=False):
        self._name = name
        self._results = []
        self._verbose = verbose

    @property
    def name(self):
        """The name supplied at construction."""
        return self._name

    @property
    def results(self):
        """The list of recorded scores (the live list, not a copy)."""
        return self._results

    @property
    def avg(self):
        """``np.average`` over the recorded scores."""
        # Must go through ``self``: a bare ``_results`` is not a name in this scope.
        return np.average(self._results)

    def evaluate(self, lables, predictions):
        """Hook for subclasses; the base implementation does nothing and returns None.

        NOTE(review): AccuracyMetric.evaluate and hold_out use a single-argument
        form (one RDD of label/prediction pairs) — confirm which signature is intended.
        """
        pass

In [46]:
class AccuracyMetric(Metric):
    """Percentage of examples whose leading predictions overlap their labels.

    An example is a hit when the number of labels shared between its label
    collection and its first ``pred_n`` predicted labels is strictly greater
    than ``intersect_n``.

    Args:
        pred_n: How many leading predictions of each example are considered.
        intersect_n: Overlap threshold (exclusive).
        *args, **kwargs: Forwarded unchanged to ``Metric.__init__``.

    NOTE(review): because the comparison is strict, ``pred_n=1`` with
    ``intersect_n=1`` can never produce a hit — confirm whether ``>=`` was intended.
    """

    def __init__(self, pred_n: int, intersect_n: int, *args, **kwargs):
        self._pred_n = pred_n
        self._intersect_n = intersect_n
        super().__init__(*args, **kwargs)

    def evaluate(self, lables_and_predictions: RDD):
        """Compute, record and return the accuracy (in percent) for one evaluation set.

        Args:
            lables_and_predictions: RDD of ``(lables, predictions)`` pairs, where
                ``predictions`` is a sequence of 2-tuples whose first item is
                the predicted label.

        Returns:
            The accuracy as a float in percent. For an empty RDD the result is
            ``nan`` and nothing is appended to ``results``, so the average is
            not distorted by a fold that received no rows.
        """
        # Copy the settings into locals so the function shipped to the workers
        # does not capture (and serialize) the whole metric object.
        pred_n = self._pred_n
        intersect_n = self._intersect_n

        total = lables_and_predictions.count()
        if total == 0:
            # A random split of a small dataset can leave a fold empty;
            # dividing by the row count would raise ZeroDivisionError.
            if self._verbose:
                print('accuracy: ', 'n/a (empty evaluation set)')
            return float('nan')

        def is_hit(pair):
            predicted = {p for p, _ in pair[1][:pred_n]}
            return len(set(pair[0]).intersection(predicted)) > intersect_n

        hits = lables_and_predictions.filter(is_hit).count()
        accuracy = 100.0 * hits / total
        if self._verbose:
            print('accuracy: ', accuracy)
        self._results.append(accuracy)
        return accuracy

In [47]:
def hold_out(data: RDD, k: int, metrics: list):
    """Evaluate ModelBuilder on ``k`` train/test rotations of ``data``.

    ``data`` is split into ``k`` folds. Each fold serves once as the test set
    while the union of the remaining folds is used for training; every metric
    in ``metrics`` is then fed the test fold's (lables, predictions) pairs.

    Args:
        data: RDD of points exposing ``lables`` and ``features`` attributes.
        k: Number of folds; at least 2 so a non-empty list of training folds remains.
        metrics: Metric objects whose ``evaluate`` is called once per fold.

    Returns:
        The same ``metrics`` list that was passed in.

    Raises:
        ValueError: If ``k`` is smaller than 2.
    """
    if k < 2:
        # With fewer than two folds there is nothing left to train on.
        raise ValueError("k must be at least 2, got %r" % (k,))
    folds = shuffle_and_split(data, k)
    # Use the context that owns ``data`` instead of relying on a notebook-global ``sc``.
    context = data.context
    for i in range(k):
        test = folds[i]
        training = context.union(folds[:i] + folds[i + 1:])
        model = ModelBuilder.train(training)
        lables_and_predictions = test.map(lambda x: (x.lables, model.predict(x.features)))
        for metric in metrics:
            metric.evaluate(lables_and_predictions)
    return metrics

In [49]:
# Run the evaluation over 5 folds with a single verbose accuracy metric; the
# returned metric list is the cell's displayed value.
hold_out(rdd, 5, [AccuracyMetric(pred_n=1, intersect_n=1, name="Acc", verbose=True)])

accuracy:  0.0


[<__main__.AccuracyMetric at 0x7f905a0c5908>]