In [1]:
from pyspark.mllib.linalg import Vectors
import numpy as np
import math

In [22]:
from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import _convert_to_vector
from pyspark import RDD

def scale(x: np.ndarray):
    mean_x = x.mean()
    max_x = x.max()
    min_x = x.min()
    return (x - min_x) / (max_x - min_x)

class MLNaiveBayesModel(NaiveBayesModel):
    def predict_all(self, x):
        if isinstance(x, RDD):
            return x.map(lambda v: self.predict_all(v))
        x = _convert_to_vector(x)
        log_probs = self.pi + x.dot(self.theta.transpose())
        scaled_log_probs = scale(log_probs)
        int_lables = [int(l_i) for l_i in self.labels]
        labels_and_log_probs = zip(int_lables, scaled_log_probs)
        return sorted(labels_and_log_probs, key=lambda x: x[1], reverse=True)

In [3]:
from collections import namedtuple
MultilabledPoint = namedtuple('MultilabledPoint', ['lables', 'features'], verbose=False)

In [4]:
class MultilabledPoint:
    def __init__(self, lables, features):
        self.lables = lables
        self.features = features

In [5]:
# RDD (labels) (features)
def train_model(data, l = 1.0):
    aggreagated = data.flatMap(lambda x: [(l, x[1]) for l in x[0]]). \
        combineByKey(lambda v: (1, v),
                 lambda c, v: (c[0] + 1, c[1] + v),
                 lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])). \
        sortBy(lambda x: x[0]). \
        collect()
    num_labels = len(aggreagated)
    num_documents = data.count()
    num_features = aggreagated[0][1][1].array.size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels, dtype=int)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggreagated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        theta_log_denom = math.log(sum_term_freq.toArray().sum() + num_features * l)
        for j in range(num_features):
            theta[i,j] = math.log(sum_term_freq[j] + l) - theta_log_denom
        i += 1  
    return MLNaiveBayesModel(labels, pi, theta)

In [6]:
import json
data = sc.textFile('hdfs://master:54310/new_lables3'). \
    map(lambda x: json.loads(x))

In [7]:
word_idx = data.flatMap(lambda x: set(x['Features'])). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()

In [8]:
data.take(1)[0].keys()

dict_keys(['Id', 'Features', 'Labels'])

In [9]:
label_idx = data.flatMap(lambda x: x['Labels']). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()

In [10]:
num_features = len(word_idx)

In [11]:
from collections import Counter

In [12]:
def vectorize_words(words: list):
    word_counts = Counter([word_idx[w] for w in words])
    return Vectors.sparse(num_features, word_counts.items())

def vectorize_data(x: dict):
    features = vectorize_words(x['Features'])
    lables = [label_idx[l] for l in x['Labels']]
    return (lables, Vectors.dense(features.toArray()))

In [13]:
clean_data = data.map(vectorize_data)

In [17]:
clean_data.repartition(12).map(lambda x: (x[0], int(m.predict(x[1])))). \
    filter(lambda x: x[1] in x[0]). \
    count()

36743

In [12]:
clean_data.count()

39119

In [15]:
data.map(lambda x: int(x['Id'])).max()

266865

### Честный эксперимент

In [14]:
training, test = clean_data.randomSplit([0.7, 0.3], seed=9)

In [23]:
model = train_model(training)

In [105]:
def update_model(model):
    return MLNaiveBayesModel(model.labels, model.pi, model.theta)

new_model = update_model(model)

In [24]:
def predict(p, model):
    lables = p[0]
    features = p[1]
    prediction = model.predict_all(features)[:3]
    pred_lables = [l for l,w in prediction]
    return (set(lables), set(pred_lables))
    
test.sample(True, 0.1, seed=0). \
    map(lambda p: predict(p, model)). \
    filter(lambda x: len(x[1].intersection(x[0])) > 0). \
    count() / test.sample(True, 0.1, seed=0).count() * 100.0

82.6923076923077

In [31]:
sc.parallelize(test.take(1)).map(lambda x: (x[0], model.predict_all2(x[1])[:10])).collect()

[([78, 356, 736],
  [(356, 1.0),
   (598, 0.92654447339747259),
   (581, 0.88540524476127247),
   (0, 0.87720416501301546),
   (456, 0.80356100547688381),
   (419, 0.76109548746825673),
   (14, 0.75507679650700554),
   (495, 0.74542706876345788),
   (521, 0.73420304065459474),
   (385, 0.7289780541300076)])]

In [80]:
pi = model.pi
theta = model.theta
labels = model.labels
def create_model():
    return MLNaiveBayesModel(labels,pi,theta)
tM = create_model()

In [89]:
tM.predict_all(test.take(1)[0][1])[860:]

[(612.0, 0.040052240826358991),
 (554.0, 0.039637278302299933),
 (273.0, 0.038522293503595047),
 (424.0, 0.037150267510892017),
 (370.0, 0.037077360223540773),
 (401.0, 0.036049647411316846),
 (186.0, 0.034987988025660041),
 (47.0, 0.034329014311766902),
 (813.0, 0.034097259062080623),
 (841.0, 0.033965041958208063),
 (771.0, 0.032399057752911589),
 (390.0, 0.031379267624089734),
 (440.0, 0.024585464938292962),
 (103.0, 0.022672843849451986),
 (752.0, 0.014863573462702419),
 (202.0, 0.0099958485328929449),
 (241.0, 0.0093664731812346923),
 (714.0, 0.0)]

In [None]:
test.count()

###Hold Out

In [14]:
import random

def hold_out(data, k):
    data_count = data.count()
    print('data count', data_count)
    partition_num = data.getNumPartitions()
    # shuffle
    idxs = list(range(data_count))
    random.shuffle(idxs)
    idxs_rdd = sc.parallelize(idxs).repartition(partition_num)
    shuffled = idxs_rdd.keyBy(lambda x: x).join(data.zipWithIndex().map(lambda x: (x[1], x[0]))). \
        sortByKey().map(lambda x: x[1][1])
    
    sum_of_pred_accurace = 0
    
    h = data_count // k
    idxs = range(0, data_count, h)
    print(list(idxs))
    indexed_data = shuffled.zipWithIndex()
    indexed_data.cache()
    for i, (l,r) in enumerate(zip(idxs, idxs[1:])):
        print('#' + str(i))
        print('l:',l,'r:',r)
        test = indexed_data.filter(lambda x: l <= x[1] < r).map(lambda x: x[0])
        training = indexed_data.filter(lambda x: x[1] < l or x[1] >= r).map(lambda x: x[0])
        
        model = train_model(training)
        predictionAndLabels = test.map(lambda p: (model.predict_n(p[1], 3), p[0]))
        accurace = 1.0 * predictionAndLabels.filter(lambda x: len(set(x[1]).intersection(set(x[0]))) > 0). \
                                            count() / (r - l) * 100
        print(accurace)
        sum_of_pred_accurace += accurace
    result = sum_of_pred_accurace / k
    return result

In [15]:
hold_out(clean_data, 5)

data count 39119
[0, 7823, 15646, 23469, 31292, 39115]
#0
l: 0 r: 7823
83.35676850313179
#1
l: 7823 r: 15646
83.97034385785504
#2
l: 15646 r: 23469
83.7913843793941
#3
l: 23469 r: 31292
83.45903106225234
#4
l: 31292 r: 39115
83.33120286335165


83.58174613319697