In [1]:
import json
raw_data = sc.textFile('hdfs://master:54310/single-label'). \
    map(lambda line: json.loads(line))

In [2]:
excluding_wrods = raw_data.flatMap(lambda doc: [(f,1) for f in doc['Features']]). \
    reduceByKey(lambda a,b: a+b). \
    filter(lambda wc: wc[1] < 6). \
    collectAsMap()

In [3]:
def exclude_words(obj: dict):
    obj['Features'] = [w for w in obj['Features'] if w not in excluding_wrods]
    return obj

In [4]:
data = raw_data.map(lambda x: exclude_words(x)).filter(lambda x: x['Features'])

In [5]:
word_idx = data.flatMap(lambda doc: doc['Features']). \
        distinct().zipWithIndex().collectAsMap()

In [6]:
features_num = len(word_idx)

In [7]:
label_idx = data.map(lambda doc: doc['Label']). \
        distinct().zipWithIndex().collectAsMap()

In [8]:
from pyspark.mllib.classification import LabeledPoint
from collections import Counter          
from pyspark.mllib.linalg import Vectors

In [9]:
def map_obj_to_lpoint(obj: dict):
    label = label_idx[obj['Label']]
    word_count = Counter(obj['Features'])
    feature_vec = Vectors.sparse(features_num, [(word_idx[wc[0]], wc[1]) 
                                                for wc in dict(word_count).items() if wc[0] in word_idx])
    return LabeledPoint(label, feature_vec)

In [10]:
all_data = data.map(lambda obj: map_obj_to_lpoint(obj))

# Размерность

In [12]:
all_data.take(1)[0].features.size

119377

In [11]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

In [15]:
all_data.cache()

PythonRDD[19] at RDD at PythonRDD.scala:43

In [18]:
import random

def hold_out(data, k):
    data_count = data.count()
    print('data count', data_count)
    partition_num = data.getNumPartitions()
    # shuffle
    idxs = list(range(data_count))
    random.shuffle(idxs)
    idxs_rdd = sc.parallelize(idxs).repartition(partition_num)
    shuffled = idxs_rdd.keyBy(lambda x: x).join(data.zipWithIndex().map(lambda x: (x[1], x[0]))). \
        sortByKey().map(lambda x: x[1][1])
    
    sum_of_pred_accurace = 0
    
    h = data_count // k
    idxs = range(0, data_count, h)
    print(list(idxs))
    indexed_data = shuffled.zipWithIndex()
    for i, (l,r) in enumerate(zip(idxs, idxs[1:])):
        print('#' + str(i))
        print('l:',l,'r:',r)
        test = indexed_data.filter(lambda x: l <= x[1] < r).map(lambda x: x[0])
        training = indexed_data.filter(lambda x: x[1] < l or x[1] >= r).map(lambda x: x[0])
        
        model = NaiveBayes.train(training)
        predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
        accurace = 1.0 * predictionAndLabel.filter(lambda x: x[0] == x[1]).count() / (r - l) * 100
        print(accurace)
        sum_of_pred_accurace += accurace
    result = sum_of_pred_accurace.value / k
    return result

In [None]:
hold_out(all_data, 10)

In [12]:
import random

In [13]:
data_count = all_data.count()
print('data count', data_count)
partition_num = all_data.getNumPartitions()
# shuffle
idxs = list(range(data_count))
random.shuffle(idxs)
idxs_rdd = sc.parallelize(idxs).repartition(partition_num)
shuffled = idxs_rdd.keyBy(lambda x: x).join(all_data.zipWithIndex().map(lambda x: (x[1], x[0]))). \
    sortByKey().map(lambda x: x[1][1])
training, test = shuffled.randomSplit([0.6, 0.4], seed=0)

data count 82106


In [14]:
model = NaiveBayes.train(training)

In [15]:
100.0 * test.map(lambda x: model.predict(x.features) == x.label).count() / test.count()

100.0

In [None]:
model.save(sc, 'hdfs://master:54310/bayes-model')

In [18]:
import pickle

In [19]:
pickle.dump(model, open('/home/hadoop/models/bayes/bayes.p', 'wb+'))