In [4]:
from collections import namedtuple, Counter
from pyspark.mllib.linalg import Vectors
import json
from pyspark import RDD
import numpy as np

In [5]:
sc.addPyFile('/home/hadoop/spark/lib/sparse.py')
sc.addPyFile('/home/hadoop/spark/lib/model.py')

In [6]:
from sparse import sparse_vector
np.ndarray

numpy.ndarray

In [7]:
from model import MLNaiveBayesModel

In [8]:
def shuffle_and_split(data: RDD, fold_n: int, seed = 0):
    fold_weights = [1 / fold_n] * fold_n
    return data.randomSplit(fold_weights)

def hold_out(sc, data: RDD, k: int, model_builder, metrics: list):
    folds = shuffle_and_split(data, k)
    for i in range(k):
        test = folds[i]
        training = sc.union(folds[:i] + folds[i + 1:])
        model = model_builder(training)
        model_broadcast = sc.broadcast(model)
        lables_and_predictions = test.map(lambda x: (x['lables'],
                                      model_broadcast.value.predict_all(x['features'])))
        for metric in metrics:
            metric.evaluate(lables_and_predictions)
    return metrics

class Metric:
    def __init__(self, name: str, verbose=False):
        self._name = name
        self._results = []
        self._verbose = verbose
        
    @property
    def name(self):
        return self._name
    
    @property
    def results(self):
        return self._results
    
    @property
    def avg(self):
        return np.average(self._results)
    
    def evaluate(self, lables, predictions):
        pass

class AccuracyMetric(Metric):
    def __init__(self, pred_n: int, intersect_n: int):
        self._pred_n = pred_n
        self._intersect_n = intersect_n
        super(AccuracyMetric, self).__init__(name='Accuracy', verbose=False)
        
    def evaluate(self, lables_and_predictions: RDD):
        TP = lables_and_predictions.map(lambda x:
                                    (set(x[0]), set([p for p,w in x[1][:self._pred_n]]))). \
                                    filter(lambda x:
                                           len(x[0].intersection(x[1])) >= self._intersect_n)
        accuracy = 100.0 * TP.count() / lables_and_predictions.count()
        if self._verbose:
            print('accuracy: ', accuracy)
        self._results.append(accuracy)
        return accuracy

from pyspark.mllib.classification import NaiveBayesModel
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.linalg import Vectors
from pyspark import RDD
import numpy as np
import math


# RDD (labels) (features)
def train_model(data, l = 1.0):
    aggreagated = data.flatMap(lambda x: [(l, x['features']) for l in x['lables']]). \
        combineByKey(lambda v: (1, v),
                 lambda c, v: (c[0] + 1, c[1] + v),
                 lambda c1, c2: (c1[0] + c2[0], c1[1] + c2[1])). \
        sortBy(lambda x: x[0]). \
        collect()
    num_labels = len(aggreagated)
    num_documents = data.count()
    num_features = aggreagated[0][1][1].size
    labels = np.zeros(num_labels)
    pi = np.zeros(num_labels, dtype=int)
    theta = np.zeros((num_labels, num_features))
    pi_log_denom = math.log(num_documents + num_labels * l)
    i = 0
    for (label, (n, sum_term_freq)) in aggreagated:
        labels[i] = label
        pi[i] = math.log(n + l) - pi_log_denom
        sum_term_freq_dense = sum_term_freq.toarray()
        theta_log_denom = math.log(sum_term_freq.sum() + num_features * l)
        theta[i,:] = np.log(sum_term_freq_dense + l) - theta_log_denom
        i += 1  
    return MLNaiveBayesModel(labels, pi, theta)

In [9]:
raw_data = sc.textFile('hdfs://master:54310/excluded_fin').map(json.loads)

In [12]:
import re
top_2000 = raw_data.flatMap(lambda x: x['Labels']). \
    map(lambda l: (l,1)). \
    reduceByKey(int.__add__). \
    sortBy(lambda lc: lc[1], ascending=False). \
    map(lambda lc: lc[0]). \
    take(2000)
eng_from_top_2000 = [l for l in top_2000 if re.match('.*([a-z]).*', l)]
with open('/home/hadoop/data/eng_from_top_2000.txt', 'w+') as fp:
    for l in sorted(eng_from_top_2000):
        fp.write(l + '\n')

In [25]:
with open('/home/hadoop/data/top_2000.txt', 'w+') as fp:
    for l in sorted(top_2000):
        fp.write(l + '\n')

In [13]:
top_1000 = raw_data.flatMap(lambda x: x['Labels']). \
    map(lambda l: (l,1)). \
    reduceByKey(int.__add__). \
    sortBy(lambda lc: lc[1], ascending=False). \
    map(lambda lc: lc[0]). \
    take(1000)
#top_1000[:10]

In [14]:
top_1000_set = set(top_1000)
top_1000_set_br = sc.broadcast(top_1000_set)

In [15]:
def filter_lables(obj: dict):
    obj['Labels'] = [l for l in obj['Labels'] if l in top_1000_set_br.value]
    return obj

In [16]:
data = raw_data.map(filter_lables). \
    filter(lambda x: len(x['Labels']) > 2)
#data.count()

In [17]:
word_idx = data.flatMap(lambda x: set(x['Features'])). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
label_idx = data.flatMap(lambda x: x['Labels']). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
num_features = len(word_idx)

In [18]:
word_idx_br = sc.broadcast(word_idx)
label_idx_br = sc.broadcast(label_idx)

In [19]:
def vectorize_words(words: list):
    word_counts = Counter([word_idx_br.value[w] for w in words])
    return sparse_vector(list(word_counts.items()), length=num_features, dtype=np.int32)

def vectorize_data(x: dict):
    features = vectorize_words(x['Features'])
    lables = [label_idx_br.value[l] for l in x['Labels']]
    return {'lables':lables, 'features':features}

In [20]:
vect_data = data.map(vectorize_data)
#vect_data.cache()
vect_data.count()

58843

In [42]:
unique_labels = sc.textFile('hdfs://master:54310/raw_data'). \
    map(json.loads). \
    flatMap(lambda x: x['Hubs'] + x['Tags']).distinct()
unique_labels.cache()

PythonRDD[123] at RDD at PythonRDD.scala:43

In [47]:
def find(pattern):
    regexp = re.compile('.*('+pattern+').*')
    return unique_labels.filter(lambda l: regexp.match(l))

In [73]:
find('manag').take(20)

['scm-manager',
 'managedblocker',
 'manager',
 'googe tag manager',
 'managed code',
 'mobile device manager',
 'coremanager',
 'system management mode',
 'ifxmanager',
 'process manager',
 'compliance management',
 'window manager',
 'risk managment',
 'package management',
 'clents management',
 'application performance management',
 'billmanager',
 'intel node manager',
 'test management',
 'sales management']

In [21]:
vect_data = data.filter(lambda x: len(x['Features']) > 19). \
    map(vectorize_data)
vect_data.cache()

PythonRDD[48] at RDD at PythonRDD.scala:43

In [77]:
train, test = vect_data.randomSplit([0.7, 0.3])

In [78]:
model = train_model(train)

In [79]:
def predict(p, model):
    lables = p['lables']
    features = p['features']
    prediction = model.predict_all(features)[:3]
    pred_lables = [l for l,w in prediction]
    return (set(lables), set(pred_lables))
    
test.repartition(12). \
    map(lambda p: predict(p, model)). \
    filter(lambda x: len(x[1].intersection(x[0])) > 0). \
    count() / test.count() * 100.0

63.57375167400038

In [24]:
result = hold_out(sc, vect_data, 4, train_model, [AccuracyMetric(3, 1)])

##CV мин длины текста

In [None]:
result = []
for i in range(0, 100, 10):
    top_1000 = raw_data.flatMap(lambda x: x['Labels']). \
    map(lambda l: (l,1)). \
    reduceByKey(int.__add__). \
    sortBy(lambda lc: lc[1], ascending=False). \
    map(lambda lc: lc[0]). \
    take(1000)
    
    top_1000_set = set(top_1000)
    top_1000_set_br = sc.broadcast(top_1000_set)

    data = raw_data.map(filter_lables). \
    filter(lambda x: len(x['Labels']) > 2)
    
    word_idx = data.flatMap(lambda x: set(x['Features'])). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
    label_idx = data.flatMap(lambda x: x['Labels']). \
    distinct(). \
    zipWithIndex(). \
    collectAsMap()
    num_features = len(word_idx)
    
    word_idx_br = sc.broadcast(word_idx)
    label_idx_br = sc.broadcast(label_idx)
    
    vect_data = data.filter(lambda x: len(x['Features']) > i). \
    map(vectorize_data)
    temp = hold_out(sc, vect_data, 4, train_model, [AccuracyMetric(3, 1)])
    print(temp[0].avg)
    result.extend(temp)

# 70/30

In [13]:
train, testing = vect_data.randomSplit([0.7, 0.3], seed=)

In [14]:
model = train_model(train)

In [32]:
model = MLNaiveBayesModel(model.labels, model.pi, model.theta)

In [16]:
def predict(p, model):
    lables = p['lables']
    features = p['features']
    prediction = model.predict_all(features)[:3]
    pred_lables = [l for l,w in prediction]
    return (set(lables), set(pred_lables))
    
testing.repartition(12). \
    map(lambda p: predict(p, model)). \
    filter(lambda x: len(x[1].intersection(x[0])) > 0). \
    count() / testing.count() * 100.0


77.8570557385047