In [2]:
from collections import namedtuple, Counter
from pyspark.mllib.linalg import Vectors
import json

In [18]:
from pyspark import RDD

def shuffle_and_split(data: RDD, fold_n: int, seed = 0):
    fold_weights = [1 / fold_n] * fold_n
    return data.randomSplit(fold_weights)

def hold_out(sc, data: RDD, k: int, model_builder, metrics: list):
    folds = shuffle_and_split(data, k)
    for i in range(k):
        test = folds[i]
        training = sc.union(folds[:i] + folds[i + 1:])
        model = model_builder(training)
        lables_and_predictions = test.map(lambda x: (x['lables'], model.predict_all(x['features'])))
        for metric in metrics:
            metric.evaluate(lables_and_predictions)
    return metrics

from pyspark import RDD

class Metric:
    def __init__(self, name: str, verbose=False):
        self._name = name
        self._results = []
        self._verbose = verbose
        
    @property
    def name(self):
        return self._name
    
    @property
    def results(self):
        return self._results
    
    @property
    def avg(self):
        return np.average(_results)
    
    def evaluate(self, lables, predictions):
        pass

class AccuracyMetric(Metric):
    def __init__(self, pred_n: int, intersect_n: int):
        self._pred_n = pred_n
        self._intersect_n = intersect_n
        super(AccuracyMetric, self).__init__(name='Accuracy', verbose=False)
        
    def evaluate(self, lables_and_predictions: RDD):
        TP = lables_and_predictions.map(lambda x:
                                    (set(x[0]), set([p for p,w in x[1][:self._pred_n]]))). \
                                    filter(lambda x:
                                           len(x[0].intersection(x[1])) > self._intersect_n)
        accuracy = 100.0 * TP.count() / lables_and_predictions.count()
        if self._verbose:
            print('accuracy: ', accuracy)
        self._results.append(accuracy)
        return accuracy

from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.linalg import Vectors
from pyspark import RDD
import numpy as np
import math

def scale(x: np.ndarray):
    mean_x = x.mean()
    max_x = x.max()
    min_x = x.min()
    return (x - min_x) / (max_x - min_x)

class MLNaiveBayesModel(NaiveBayesModel):
    def predict_all(self, x):
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict_all(v))
        x = _convert_to_vector(x)
        log_probs = self.pi + x.dot(self.theta.transpose())
        scaled_log_probs = scale(log_probs)
        int_lables = [int(l_i) for l_i in self.labels]
        labels_and_log_probs = zip(int_lables, scaled_log_probs)
        return sorted(labels_and_log_probs, key=lambda x: x[1], reverse=True)
    
# RDD (labels) (features)
def train_model(data, l = 1.0):
    aggreagated = data.flatMap(lambda x: [(l, x['features']) for l in x['lables']]). \
        combineByKey(lambda v: (1, v),
                 lambda c, v: (c[0] + 1, c[1] + v),
                 lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])). \
        sortBy(lambda x: x[0]). \
        collect()
    num_labels = len(aggreagated)
    num_documents = data.count()
    num_features = aggreagated[0][1][1].array.size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels, dtype=int)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggreagated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        theta_log_denom = math.log(sum_term_freq.toArray().sum() + num_features * l)
        for j in range(num_features):
            theta[i,j] = math.log(sum_term_freq[j] + l) - theta_log_denom
        i += 1  
    return MLNaiveBayesModel(labels, pi, theta)

In [4]:
MultilabledPoint = namedtuple('MultilabledPoint',
                              ['lables', 'features'], verbose=False)

In [5]:
class MultilabledPoint(object):
    def __init__(self, lables, features):
        self.lables = lables
        self.features = features

In [6]:
raw_data = sc.textFile('hdfs://master:54310/test3').map(json.loads)

In [7]:
word_idx = raw_data.flatMap(lambda x: set(x['Features'])). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
label_idx = raw_data.flatMap(lambda x: x['Labels']). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()

In [8]:
num_features = len(word_idx)

In [9]:
def vectorize_words(words: list):
    word_counts = Counter([word_idx[w] for w in words])
    return Vectors.sparse(num_features, word_counts.items())

def vectorize_data(x: dict):
    features = vectorize_words(x['Features'])
    lables = [label_idx[l] for l in x['Labels']]
    return {'lables':lables, 'features':Vectors.dense(features.toArray())}

In [10]:
data = raw_data.map(vectorize_data)

In [24]:
result = hold_out(sc, data, 2, train_model, [AccuracyMetric(3, 1)])

[38.60060090645211, 38.65106251924854]

In [13]:
model = train_model(data.zipWithIndex().filter(lambda x: x[1] < 1000). \
                   map(lambda x: x[0]))

In [21]:
test = data.zipWithIndex().filter(lambda x: x[1] < 1). \
    map(lambda x: x[0]). \
    map(lambda x: (x['lables'],
        model.predict_all(x['features'])))

In [23]:
AccuracyMetric(3,1).evaluate(test)

100.0