In [1]:
from collections import namedtuple, Counter
from pyspark.mllib.linalg import Vectors
import json
from pyspark import RDD
import numpy as np

In [2]:
sc.addPyFile('/home/hadoop/spark/lib/sparse.py')
sc.addPyFile('/home/hadoop/spark/lib/model.py')

In [3]:
from sparse import sparse_vector
np.ndarray

numpy.ndarray

In [4]:
from model import MLNaiveBayesModel

In [5]:
def shuffle_and_split(data: RDD, fold_n: int, seed = 0):
    fold_weights = [1 / fold_n] * fold_n
    return data.randomSplit(fold_weights)

def hold_out(sc, data: RDD, k: int, model_builder, metrics: list):
    folds = shuffle_and_split(data, k)
    for i in range(k):
        test = folds[i]
        training = sc.union(folds[:i] + folds[i + 1:])
        model = model_builder(training)
        model_broadcast = sc.broadcast(model)
        lables_and_predictions = test.map(lambda x: (x['lables'],
                                      model_broadcast.value.predict_all(x['features'])))
        for metric in metrics:
            metric.evaluate(lables_and_predictions)
    return metrics

class Metric:
    def __init__(self, name: str, verbose=False):
        self._name = name
        self._results = []
        self._verbose = verbose
        
    @property
    def name(self):
        return self._name
    
    @property
    def results(self):
        return self._results
    
    @property
    def avg(self):
        return np.average(self._results)
    
    def evaluate(self, lables, predictions):
        pass

class AccuracyMetric(Metric):
    def __init__(self, pred_n: int, intersect_n: int):
        self._pred_n = pred_n
        self._intersect_n = intersect_n
        super(AccuracyMetric, self).__init__(name='Accuracy', verbose=False)
        
    def evaluate(self, lables_and_predictions: RDD):
        TP = lables_and_predictions.map(lambda x:
                                    (set(x[0]), set([p for p,w in x[1][:self._pred_n]]))). \
                                    filter(lambda x:
                                           len(x[0].intersection(x[1])) >= self._intersect_n)
        accuracy = 100.0 * TP.count() / lables_and_predictions.count()
        if self._verbose:
            print('accuracy: ', accuracy)
        self._results.append(accuracy)
        return accuracy

from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.linalg import Vectors
from pyspark import RDD
import numpy as np
import math


# RDD (labels) (features)
def train_model(data, l = 1.0):
    aggreagated = data.flatMap(lambda x: [(l, x['features']) for l in x['lables']]). \
        combineByKey(lambda v: (1, v),
                 lambda c, v: (c[0] + 1, c[1] + v),
                 lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])). \
        sortBy(lambda x: x[0]). \
        collect()
    num_labels = len(aggreagated)
    num_documents = data.count()
    num_features = aggreagated[0][1][1].size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels, dtype=int)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggreagated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        sum_term_freq_dense = sum_term_freq.toarray()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i,:] = np.log(sum_term_freq_dense + l) - theta_log_denom
        i += 1  
    return MLNaiveBayesModel(labels, pi, theta)

In [6]:
raw_data = sc.textFile('hdfs://master:54310/single-label').map(json.loads)

In [22]:
top_900 = raw_data.flatMap(lambda x: x['Labels']). \
    map(lambda l: (l,1)). \
    reduceByKey(int.__add__). \
    sortBy(lambda lc: lc[1], ascending=False). \
    map(lambda lc: lc[0]). \
    take(900)
#top_1000[:10]

In [23]:
top_900_set = set(top_900)
top_900_set_br = sc.broadcast(top_900_set)

In [24]:
def filter_lables(obj: dict):
    obj['Labels'] = [l for l in obj['Labels'] if l in top_900_set_br.value]
    return obj

In [8]:
data = raw_data
#data.count()

In [9]:
data.take(1)[0].keys()

dict_keys(['Label', 'Id', 'Features'])

In [22]:
excl_words = data.flatMap(lambda x: x['Features']). \
    map(lambda w: (w,1)). \
    reduceByKey(int.__add__). \
    filter(lambda x: x[1] < 6). \
    map(lambda x: x[0]).collect()
excl_words_set = set(excl_words)
excl_words_br = sc.broadcast(excl_words_set)

In [23]:
def filter_words(doc: dict):
    doc['Features'] = [w for w in doc['Features'] if w not in excl_words_br.value]
    return doc

In [24]:
data = data.map(filter_words)

In [25]:
word_idx = data.flatMap(lambda x: set(x['Features'])). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
label_idx = data.map(lambda x: x['Label']). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
num_features = len(word_idx)

In [26]:
word_idx_br = sc.broadcast(word_idx)
label_idx_br = sc.broadcast(label_idx)

In [27]:
def vectorize_words(words: list):
    word_counts = Counter([word_idx_br.value[w] for w in words])
    return sparse_vector(list(word_counts.items()), length=num_features, dtype=np.int32)

def vectorize_data(x: dict):
    features = vectorize_words(x['Features'])
    lables = [label_idx_br.value[x['Label']]]
    return {'lables':lables, 'features':features}

In [13]:
vect_data = data.map(vectorize_data)
#vect_data.cache()
vect_data.count()

74961

In [28]:
vect_data = data.filter(lambda x: len(x['Features']) > 19). \
    map(vectorize_data)
vect_data.cache()

PythonRDD[51] at RDD at PythonRDD.scala:43

In [29]:
train, test = vect_data.randomSplit([0.7, 0.3])

In [30]:
model = train_model(train)

In [32]:
model.theta.shape

(2526, 94063)

In [31]:
model_br = sc.broadcast(model)

MemoryError: 

In [17]:
def predict(p, model):
    lable = p['lables'][0]
    features = p['features']
    prediction = model.predict_all(features)[0][0]
    return lable, prediction
    
test.repartition(12). \
    map(lambda p: predict(p, model_br)). \
    filter(lambda x: x[0] == x[1]). \
    count() / test.count() * 100.0

MemoryError: 

In [30]:
metric = AccuracyMetric(3, 1)

In [31]:
result = hold_out(sc, vect_data, 4, train_model, [metric])

In [32]:
result[0].results

[82.87012575293248, 82.80796126949973, 82.13278368409634, 82.51681075888568]

In [33]:
metric_2 = AccuracyMetric(3, 1)

In [34]:
result_2 = hold_out(sc, vect_data, 3, train_model, [metric_2])

In [35]:
metric_2.results

[82.13314130260842, 81.90346375881975, 81.87474747474748]

##CV мин длины текста

In [None]:
result = []
for i in range(0, 100, 10):
    top_1000 = raw_data.flatMap(lambda x: x['Labels']). \
    map(lambda l: (l,1)). \
    reduceByKey(int.__add__). \
    sortBy(lambda lc: lc[1], ascending=False). \
    map(lambda lc: lc[0]). \
    take(1000)
    
    top_1000_set = set(top_1000)
    top_1000_set_br = sc.broadcast(top_1000_set)

    data = raw_data.map(filter_lables). \
    filter(lambda x: len(x['Labels']) > 2)
    
    word_idx = data.flatMap(lambda x: set(x['Features'])). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
    label_idx = data.flatMap(lambda x: x['Labels']). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
    num_features = len(word_idx)
    
    word_idx_br = sc.broadcast(word_idx)
    label_idx_br = sc.broadcast(label_idx)
    
    vect_data = data.filter(lambda x: len(x['Features']) > i). \
    map(vectorize_data)
    temp = hold_out(sc, vect_data, 4, train_model, [AccuracyMetric(3, 1)])
    print(temp[0].avg)
    result.extend(temp)

# 70/30

In [13]:
train, testing = vect_data.randomSplit([0.7, 0.3], seed=)

In [14]:
model = train_model(train)

In [32]:
model = MLNaiveBayesModel(model.labels, model.pi, model.theta)

In [16]:
def predict(p, model):
    lables = p['lables']
    features = p['features']
    prediction = model.predict_all(features)[:3]
    pred_lables = [l for l,w in prediction]
    return (set(lables), set(pred_lables))
    
testing.repartition(12). \
    map(lambda p: predict(p, model)). \
    filter(lambda x: len(x[1].intersection(x[0])) > 0). \
    count() / testing.count() * 100.0


77.8570557385047