In [1]:
from collections import namedtuple, Counter
from pyspark.mllib.linalg import Vectors
import json
from pyspark import RDD
import numpy as np

In [2]:
sc.addPyFile('/home/hadoop/spark/lib/sparse.py')
sc.addPyFile('/home/hadoop/spark/lib/model.py')

In [3]:
from sparse import sparse_vector
np.ndarray

numpy.ndarray

In [4]:
from model import MLNaiveBayesModel

In [5]:
def shuffle_and_split(data: RDD, fold_n: int, seed = 0):
    fold_weights = [1 / fold_n] * fold_n
    return data.randomSplit(fold_weights)

def hold_out(sc, data: RDD, k: int, model_builder, metrics: list):
    folds = shuffle_and_split(data, k)
    for i in range(k):
        test = folds[i]
        training = sc.union(folds[:i] + folds[i + 1:])
        model = model_builder(training)
        model_broadcast = sc.broadcast(model)
        lables_and_predictions = test.map(lambda x: (x['lables'],
                                      model_broadcast.value.predict_all(x['features'])))
        for metric in metrics:
            metric.evaluate(lables_and_predictions)
    return metrics

class Metric:
    def __init__(self, name: str, verbose=False):
        self._name = name
        self._results = []
        self._verbose = verbose
        
    @property
    def name(self):
        return self._name
    
    @property
    def results(self):
        return self._results
    
    @property
    def avg(self):
        return np.average(self._results)
    
    def evaluate(self, lables, predictions):
        pass

class AccuracyMetric(Metric):
    def __init__(self, pred_n: int, intersect_n: int):
        self._pred_n = pred_n
        self._intersect_n = intersect_n
        super(AccuracyMetric, self).__init__(name='Accuracy', verbose=False)
        
    def evaluate(self, lables_and_predictions: RDD):
        TP = lables_and_predictions.map(lambda x:
                                    (set(x[0]), set([p for p,w in x[1][:self._pred_n]]))). \
                                    filter(lambda x:
                                           len(x[0].intersection(x[1])) >= self._intersect_n)
        accuracy = 100.0 * TP.count() / lables_and_predictions.count()
        if self._verbose:
            print('accuracy: ', accuracy)
        self._results.append(accuracy)
        return accuracy

from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.linalg import Vectors
from pyspark import RDD
import numpy as np
import math


# RDD (labels) (features)
def train_model(data, l = 1.0):
    aggreagated = data.flatMap(lambda x: [(l, x['features']) for l in x['lables']]). \
        combineByKey(lambda v: (1, v),
                 lambda c, v: (c[0] + 1, c[1] + v),
                 lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])). \
        sortBy(lambda x: x[0]). \
        collect()
    num_labels = len(aggreagated)
    num_documents = data.count()
    num_features = aggreagated[0][1][1].size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels, dtype=int)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggreagated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        sum_term_freq_dense = sum_term_freq.toarray()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i,:] = np.log(sum_term_freq_dense + l) - theta_log_denom
        i += 1  
    return MLNaiveBayesModel(labels, pi, theta)

In [6]:
from pyspark.mllib.util import MLUtils

In [7]:
raw_data = MLUtils.loadLibSVMFile(sc, 'hdfs://master:54310/exp_f/l_svm_b_t')

In [8]:
def sparse_vector_to_coo(vect):
    return tuple([(int(i), vect[int(i)]) for i in vect.indices])      

In [9]:
temp_data = raw_data.map(lambda lp: (sparse_vector_to_coo(lp.features), lp.label)).groupByKey() \
    .map(lambda g: {'features': g[0], 'labels': list(g[1])}).filter(lambda x: len(x['features']) > 99)

In [10]:
temp_data.persist()

PythonRDD[8] at RDD at PythonRDD.scala:43

In [11]:
temp_data.count()

85782

In [10]:
import json

In [11]:
with open('/home/hadoop/f_imp.json', 'r') as fp:
    f_rating = json.load(fp)

In [14]:
len(f_rating)

226119

In [None]:
def filter_features(doc, w_l):
    old = doc['features']
    new = [(w_l[i], int(v)) for i,v in old if i in w_l]
    f_num = len(w_l)
    doc['features'] = sparse_vector(new, length=f_num)
    doc['lables'] = doc['labels']
    del doc['labels']
    return doc

In [17]:
f_count = 90000
top_f = dict(zip(f_rating[:f_count], range(f_count)))
top_f_br = sc.broadcast(top_f)
new_data = temp_data.map(lambda x: filter_features(x, top_f_br.value)).filter(lambda x: len(x['features']) > 14)
new_data.cache()
metric = AccuracyMetric(5, 1)
hold_out(sc, new_data, 4, train_model, [metric])
new_data.unpersist()
print('count: ', f_count, 'res: [', metric.results, '] avg: ', metric.avg)

count:  90000 res: [ [79.38970416957838, 77.62520945820145, 77.82626862899856, 78.41175918098995] ] avg:  78.3132353594


In [None]:
f_count = 110000
top_f = dict(zip(f_rating[:f_count], range(f_count)))
top_f_br = sc.broadcast(top_f)
new_data = temp_data.map(lambda x: filter_features(x, top_f_br.value)).filter(lambda x: len(x['features']) > 14)
new_data.cache()
metric = AccuracyMetric(5, 1)
hold_out(sc, new_data, 4, train_model, [metric])
new_data.unpersist()
print('count: ', f_count, 'res: [', metric.results, '] avg: ', metric.avg)

In [None]:
for f_count in range(130000, 210000, 10000):
    top_f = dict(zip(f_rating[:f_count], range(f_count)))
    top_f_br = sc.broadcast(top_f)
    new_data = temp_data.map(lambda x: filter_features(x, top_f_br.value)).filter(lambda x: len(x['features'])   > 14)
    new_data.cache()
    metric = AccuracyMetric(5, 1)
    hold_out(sc, new_data, 4, train_model, [metric])
    new_data.unpersist()
    print('count: ', f_count, 'res: [', metric.results, '] avg: ', metric.avg)

count:  130000 res: [ [77.63885661166628, 77.44606358819077, 76.95402830450467, 77.73025245246176] ] avg:  77.4423002392
count:  140000 res: [ [77.36163900991934, 77.28337236533957, 77.58436118222015, 77.24699387077153] ] avg:  77.3690916071
count: 

In [16]:
metric = AccuracyMetric(3, 1)

In [17]:
result = hold_out(sc, vect_data, 4, train_model, [metric])

In [18]:
result[0].results

[65.01506024096386, 64.63493645651633, 64.69965732715178, 65.3236982775688]

In [33]:
metric_2 = AccuracyMetric(3, 1)

In [34]:
result_2 = hold_out(sc, vect_data, 3, train_model, [metric_2])

In [35]:
metric_2.results

[82.13314130260842, 81.90346375881975, 81.87474747474748]

##CV мин длины текста

In [None]:
result = []
for i in range(0, 100, 10):
    top_1000 = raw_data.flatMap(lambda x: x['Labels']). \
    map(lambda l: (l,1)). \
    reduceByKey(int.__add__). \
    sortBy(lambda lc: lc[1], ascending=False). \
    map(lambda lc: lc[0]). \
    take(1000)
    
    top_1000_set = set(top_1000)
    top_1000_set_br = sc.broadcast(top_1000_set)

    data = raw_data.map(filter_lables). \
    filter(lambda x: len(x['Labels']) > 2)
    
    word_idx = data.flatMap(lambda x: set(x['Features'])). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
    label_idx = data.flatMap(lambda x: x['Labels']). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
    num_features = len(word_idx)
    
    word_idx_br = sc.broadcast(word_idx)
    label_idx_br = sc.broadcast(label_idx)
    
    vect_data = data.filter(lambda x: len(x['Features']) > i). \
    map(vectorize_data)
    temp = hold_out(sc, vect_data, 4, train_model, [AccuracyMetric(3, 1)])
    print(temp[0].avg)
    result.extend(temp)

# 70/30

In [13]:
train, testing = vect_data.randomSplit([0.7, 0.3], seed=)

In [14]:
model = train_model(train)

In [32]:
model = MLNaiveBayesModel(model.labels, model.pi, model.theta)

In [16]:
def predict(p, model):
    lables = p['lables']
    features = p['features']
    prediction = model.predict_all(features)[:3]
    pred_lables = [l for l,w in prediction]
    return (set(lables), set(pred_lables))
    
testing.repartition(12). \
    map(lambda p: predict(p, model)). \
    filter(lambda x: len(x[1].intersection(x[0])) > 0). \
    count() / testing.count() * 100.0


77.8570557385047