In [1]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.common import callMLlibFunc, callJavaFunc
from pyspark.mllib.classification import LabeledPoint
import numpy as np
import math

In [2]:
from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import _convert_to_vector
from pyspark import RDD

class MLNaiveBayesModel(NaiveBayesModel):
    """Naive Bayes model that can report a score for every label.

    Adds ``predict_all`` on top of the inherited ``NaiveBayesModel``. The
    method reads the ``labels``, ``pi`` and ``theta`` attributes of the
    instance.
    """

    def predict_all(self, x):
        """Return ``(label, score)`` pairs for every label of the model.

        ``x`` is either a single feature vector or an RDD of them. For an RDD
        the result is an RDD holding one list of pairs per element. The score
        of a label is its entry in ``pi + x . theta^T``.
        """
        if isinstance(x, RDD):
            # Distributed input: score every element on its own.
            return x.map(lambda features: self.predict_all(features))
        vec = _convert_to_vector(x)
        scores = self.pi + vec.dot(self.theta.transpose())
        return list(zip(self.labels, scores))

In [3]:
# Input: an RDD of (labels, features) pairs.
def train_model(data, l = 1.0):
    """Train a multi-label multinomial Naive Bayes model.

    Args:
        data: RDD of ``(labels, features)`` pairs. ``labels`` is an iterable
            of label ids; ``features`` is a vector of term counts that can be
            summed with ``+`` and converted with ``toArray()``.
        l: additive smoothing constant. It should be positive so that every
            logarithm computed below has a positive argument.

    Returns:
        An ``MLNaiveBayesModel`` built from the label ids (sorted ascending),
        the log priors ``pi`` and the log conditional probabilities ``theta``.

    Raises:
        ValueError: if ``data`` yields no (label, features) pair.
    """
    # A document with several labels contributes its vector once per label.
    # Per label this yields (document count, summed term frequencies).
    aggregated = data.flatMap(lambda x: [(label, x[1]) for label in x[0]]). \
        combineByKey(lambda v: (1, v),
                     lambda c, v: (c[0] + 1, c[1] + v),
                     lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])). \
        sortBy(lambda x: x[0]). \
        collect()
    if not aggregated:
        raise ValueError("train_model needs at least one labeled document")

    num_labels = len(aggregated)
    num_documents = data.count()
    num_features = aggregated[0][1][1].toArray().size

    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)

    for i, (label, (n, sum_term_freq)) in enumerate(aggregated):
        term_freq = sum_term_freq.toArray()
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        theta_log_denom = math.log(term_freq.sum() + num_features * l)
        # Fill the whole row at once instead of one Python-level log per feature.
        theta[i] = np.log(term_freq + l) - theta_log_denom
    return MLNaiveBayesModel(labels, pi, theta)

In [4]:
import json
# Parse every line of the input text file as one JSON document.
data = (
    sc.textFile('hdfs://master:54310/clean')
    .map(json.loads)
)

In [6]:
# Map every distinct feature word to an integer index, collected on the driver.
word_idx = (
    data.flatMap(lambda record: set(record['Features']))
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

In [7]:
# Show the keys of the first record to inspect the document schema.
data.take(1)[0].keys()

dict_keys(['Id', 'Features', 'Labels'])

In [8]:
# Map every distinct label to an integer index, collected on the driver.
label_idx = (
    data.flatMap(lambda record: record['Labels'])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

In [9]:
# Vocabulary size; used as the length of every feature vector.
num_features = len(word_idx)

In [10]:
from collections import Counter

In [14]:
def vectorize_words(words: list):
    """Turn a list of words into a sparse term-count vector.

    Every word is looked up in the global ``word_idx`` mapping, the
    occurrences of each index are counted, and the counts become the entries
    of a sparse vector of length ``num_features``.
    """
    index_counts = Counter(word_idx[word] for word in words)
    return Vectors.sparse(num_features, index_counts.items())

def vectorize_data(x: dict):
    """Convert one record into a ``(labels, features)`` pair.

    ``x['Features']`` is vectorized with ``vectorize_words`` and each entry of
    ``x['Labels']`` is replaced by its index from the global ``label_idx``.
    """
    word_vector = vectorize_words(x['Features'])
    label_ids = [label_idx[name] for name in x['Labels']]
    return label_ids, word_vector

In [15]:
# Vectorize every record, then replace the sparse feature vector by a dense one.
clean_data = (
    data.map(vectorize_data)
    .map(lambda pair: (pair[0], Vectors.dense(pair[1].toArray())))
)

In [16]:
# Train on the full vectorized data set with the default smoothing (l = 1.0).
m = train_model(clean_data)

In [13]:
# Documents that carry more than one label, and how many of them there are.
part1 = clean_data.filter(lambda x: len(x[0]) > 1)
part1.count()

22453

In [18]:
# Count documents whose predicted label is among their own labels.
# NOTE: clean_data is the same RDD the model `m` was trained on.
(
    clean_data.repartition(12)
    .map(lambda doc: (doc[0], int(m.predict(doc[1]))))
    .filter(lambda pair: pair[1] in pair[0])
    .count()
)

32874

In [19]:
# Total number of vectorized documents.
clean_data.count()

47575

In [15]:
# Largest document id, with the 'Id' field parsed as an integer.
data.map(lambda x: int(x['Id'])).max()

266865