In [1]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.common import callMLlibFunc, callJavaFunc
from pyspark.mllib.classification import LabeledPoint
import numpy as np
import math

In [2]:
from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import _convert_to_vector
from pyspark import RDD

class MLNaiveBayesModel(NaiveBayesModel):
    def predict_all(self, x):
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict_all(v))
        x = _convert_to_vector(x)
        return list(zip(self.labels,self.pi + x.dot(self.theta.transpose())))
    
    def predict_n(self, x, n):
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict_n(v), n)
        return [int(l) for l,p in sorted(self.predict_all(x), key=lambda x: x[1], reverse=True)[:n]]

In [3]:
# RDD (labels) (features)
def train_model(data, l = 1.0):
    aggreagated = data.flatMap(lambda x: [(l, x[1]) for l in x[0]]). \
        combineByKey(lambda v: (1, v),
                 lambda c, v: (c[0] + 1, c[1] + v),
                 lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])). \
        sortBy(lambda x: x[0]). \
        collect()
    num_labels = len(aggreagated)
    num_documents = data.count()
    num_features = aggreagated[0][1][1].array.size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggreagated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        theta_log_denom = math.log(sum_term_freq.toArray().sum() + num_features * l)
        for j in range(num_features):
            theta[i,j] = math.log(sum_term_freq[j] + l) - theta_log_denom
        i += 1  
    return MLNaiveBayesModel(labels, pi, theta)

In [4]:
import json
data = sc.textFile('hdfs://master:54310/new_lables3'). \
    map(lambda x: json.loads(x))

In [5]:
word_idx = data.flatMap(lambda x: set(x['Features'])). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()

In [6]:
data.take(1)[0].keys()

dict_keys(['Id', 'Features', 'Labels'])

In [7]:
label_idx = data.flatMap(lambda x: x['Labels']). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()

In [8]:
num_features = len(word_idx)

In [9]:
from collections import Counter

In [10]:
def vectorize_words(words: list):
    word_counts = Counter([word_idx[w] for w in words])
    return Vectors.sparse(num_features, word_counts.items())

def vectorize_data(x: dict):
    features = vectorize_words(x['Features'])
    labels = [label_idx[l] for l in x['Labels']]
    return (labels, features)

In [11]:
clean_data = data.map(lambda x: vectorize_data(x)). \
    map(lambda x: (x[0], Vectors.dense(x[1].toArray())))

In [16]:
m = train_model(clean_data)

In [13]:
part1 = clean_data.filter(lambda x: len(x[0]) > 1)
part1.count()

22453

In [17]:
clean_data.repartition(12).map(lambda x: (x[0], int(m.predict(x[1])))). \
    filter(lambda x: x[1] in x[0]). \
    count()

36743

In [12]:
clean_data.count()

39119

In [15]:
data.map(lambda x: int(x['Id'])).max()

266865

### Честный эксперимент

In [13]:
training, test = clean_data.randomSplit([0.7, 0.3], seed=9)

In [13]:
model = train_model(training)

In [14]:
test.repartition(12).map(lambda x: (set(x[0]), set(model.predict_n(x[1], 5)))). \
    filter(lambda x: len(x[1].intersection(x[0])) > 0). \
    count()

11570

In [15]:
test.count()

13151

###Hold Out

In [14]:
import random

def hold_out(data, k):
    data_count = data.count()
    print('data count', data_count)
    partition_num = data.getNumPartitions()
    # shuffle
    idxs = list(range(data_count))
    random.shuffle(idxs)
    idxs_rdd = sc.parallelize(idxs).repartition(partition_num)
    shuffled = idxs_rdd.keyBy(lambda x: x).join(data.zipWithIndex().map(lambda x: (x[1], x[0]))). \
        sortByKey().map(lambda x: x[1][1])
    
    sum_of_pred_accurace = 0
    
    h = data_count // k
    idxs = range(0, data_count, h)
    print(list(idxs))
    indexed_data = shuffled.zipWithIndex()
    indexed_data.cache()
    for i, (l,r) in enumerate(zip(idxs, idxs[1:])):
        print('#' + str(i))
        print('l:',l,'r:',r)
        test = indexed_data.filter(lambda x: l <= x[1] < r).map(lambda x: x[0])
        training = indexed_data.filter(lambda x: x[1] < l or x[1] >= r).map(lambda x: x[0])
        
        model = train_model(training)
        predictionAndLabels = test.map(lambda p: (model.predict_n(p[1], 3), p[0]))
        accurace = 1.0 * predictionAndLabels.filter(lambda x: len(set(x[1]).intersection(set(x[0]))) > 0). \
                                            count() / (r - l) * 100
        print(accurace)
        sum_of_pred_accurace += accurace
    result = sum_of_pred_accurace / k
    return result

In [15]:
hold_out(clean_data, 5)

data count 39119
[0, 7823, 15646, 23469, 31292, 39115]
#0
l: 0 r: 7823
83.35676850313179
#1
l: 7823 r: 15646
83.97034385785504
#2
l: 15646 r: 23469
83.7913843793941
#3
l: 23469 r: 31292
83.45903106225234
#4
l: 31292 r: 39115
83.33120286335165


83.58174613319697