In [1]:
import json
from collections import Counter

In [2]:
# Read the text file from HDFS and parse every line with json.loads.
# NOTE(review): `sc` is not defined in this notebook; presumably it is the
# SparkContext injected by the PySpark kernel.
data = (
    sc.textFile('hdfs://master:54310/clean_ml')
    .map(json.loads)
)

In [3]:
# Words that occur in exactly one document. Each document contributes a word
# at most once (set(...)), so the reduced count is a document frequency.
excluding_words = set(
    data
    .flatMap(lambda doc: set(doc['Features']))
    .map(lambda word: (word, 1))
    .reduceByKey(int.__add__)
    .filter(lambda pair: pair[1] == 1)
    .collectAsMap()  # iterating the resulting dict yields its keys (the words)
)

In [4]:
# Broadcast the set of excluded words so that functions running inside RDD
# transformations can read it through `excluding_words_br.value`.
excluding_words_br = sc.broadcast(excluding_words)

In [5]:
# Number of words selected for exclusion (words seen in exactly one document).
len(excluding_words)

264750

In [6]:
# Total number of distinct words across all documents, for comparison with
# the number of excluded words.
(
    data
    .flatMap(lambda doc: set(doc['Features']))
    .distinct()
    .count()
)

524889

In [6]:
def exclude_words(doc: dict):
    """Strip the excluded words out of ``doc['Features']``.

    The membership test is made against the broadcast set
    ``excluding_words_br``. The document is modified in place and the same
    object is returned; the order of the remaining words is preserved.
    """
    rare_words = excluding_words_br.value
    kept = []
    for word in doc['Features']:
        if word in rare_words:
            continue
        kept.append(word)
    doc['Features'] = kept
    return doc

In [7]:
clean_data = (
    data
    .map(exclude_words)
    # Drop documents whose feature list became empty after the exclusion.
    .filter(lambda doc: doc['Features'])
)

In [8]:
# Give every distinct remaining word a unique integer index and collect the
# word -> index mapping to the driver as a dict.
word_idx = (
    clean_data
    .flatMap(lambda doc: doc['Features'])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)
# Same construction for the labels: label -> index.
label_idx = (
    clean_data
    .flatMap(lambda doc: doc['Labels'])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

In [9]:
# Broadcast both lookup dicts so that vectorize_data can read them through
# `.value` inside RDD transformations.
word_idx_br = sc.broadcast(word_idx)
label_idx_br = sc.broadcast(label_idx)

In [10]:
def vectorize_data(doc: dict):
    """Replace the words and labels of ``doc`` with their integer indexes.

    ``doc['Features']`` is translated through the broadcast ``word_idx_br``
    mapping and ``doc['Labels']`` through ``label_idx_br``. The document is
    modified in place and returned. A word or label that is missing from its
    mapping raises ``KeyError``.
    """
    word_ids = word_idx_br.value
    label_ids = label_idx_br.value
    doc['Features'] = [word_ids[word] for word in doc['Features']]
    doc['Labels'] = [label_ids[label] for label in doc['Labels']]
    return doc

In [11]:
# Documents with words and labels replaced by their integer indexes.
vect_data = clean_data.map(vectorize_data)

In [12]:
def divide_by_total_count(word_count: dict):
    """Normalise the values of ``word_count`` in place by their sum.

    Every value is divided by the sum of all values, so afterwards the values
    are relative frequencies. The very same dict object is returned. An empty
    dict is returned unchanged; a non-empty dict whose values sum to zero
    raises ``ZeroDivisionError``.
    """
    total = sum(word_count.values())
    # The comprehension is fully evaluated before update() touches the dict.
    word_count.update({word: count / total for word, count in word_count.items()})
    return word_count

In [13]:
# Register the local module sparse.py with Spark so that `sparse` can be
# imported below and used inside tasks.
# NOTE(review): hard-coded absolute path on this machine; the notebook will
# not run elsewhere unless the file exists at the same location.
sc.addPyFile('/home/hadoop/spark/lib/sparse.py')

In [14]:
from sparse import sparse_vector
import numpy as np

In [15]:
# Vocabulary size after exclusion; used as the length of every sparse vector.
num_features = len(word_idx)

In [16]:
# Mark vect_data for caching: it is consumed by more than one later job
# (building `probs` and the final export).
vect_data.cache()

PythonRDD[19] at RDD at PythonRDD.scala:43

In [17]:
# For every label: add up the word counts of all documents carrying that
# label, turn the counts into relative frequencies, and pack them into a
# sparse_vector of length num_features. The result is a dict label -> vector.
probs = (
    vect_data
    .map(lambda doc: (doc['Labels'], Counter(doc['Features'])))
    # Emit the document's word counts once per label it carries.
    .flatMap(lambda pair: [(label, pair[1]) for label in pair[0]])
    # Counter + Counter builds a new Counter, so the Counter shared between
    # several labels of one document is never mutated.
    .reduceByKey(lambda left, right: left + right)
    .mapValues(lambda counts: divide_by_total_count(dict(counts)))
    .mapValues(lambda freqs: sparse_vector(list(freqs.items()),
                                           length=num_features,
                                           dtype=float))
    .collectAsMap()
)

In [18]:
# Broadcast the label -> word-frequency-vector dict for use in calc_log_prob.
probs_br = sc.broadcast(probs)

In [19]:
# Reverse lookups (index -> word, index -> label), used to turn the
# vectorized documents back into their original tokens.
inv_word_idx = {index: word for word, index in word_idx.items()}
inv_label_idx = {index: label for label, index in label_idx.items()}
inv_word_idx_br = sc.broadcast(inv_word_idx)
inv_label_idx_br = sc.broadcast(inv_label_idx)

In [27]:
def unvectorize_data(doc: dict):
    """Translate the indexes of ``doc`` back into the original label and words.

    ``doc['Label']`` (a single label index) is mapped through the broadcast
    ``inv_label_idx_br`` and every entry of ``doc['Features']`` through
    ``inv_word_idx_br``. The document is modified in place and returned.
    """
    labels_by_id = inv_label_idx_br.value
    words_by_id = inv_word_idx_br.value
    doc['Label'] = labels_by_id[doc['Label']]
    doc['Features'] = [words_by_id[word_id] for word_id in doc['Features']]
    return doc

In [28]:
from math import log
def calc_log_prob(label: int, words: list):
    """Sum the log-probabilities of a document's distinct words under ``label``.

    Parameters
    ----------
    label : int
        Key into the broadcast ``probs_br`` mapping.
    words : list
        Words of the document (as used to index the per-label vector).
        Duplicates are collapsed with ``set``, so every distinct word
        contributes exactly once.

    Returns
    -------
    The sum of ``log(p)`` over the distinct words, where ``p`` is
    ``probs_br.value[label][word]``. If any word has a probability that is
    zero (or negative) the result is ``-inf`` instead of a
    ``ValueError: math domain error`` from ``math.log``, so a single unseen
    word rules the label out rather than aborting the whole Spark task.

    Raises
    ------
    KeyError
        If ``label`` is not a key of ``probs_br.value``.
    """
    word_probs = probs_br.value[label]
    neg_inf = float('-inf')
    # `p <= 0` (rather than `not p > 0`) keeps NaN flowing into log() exactly
    # as before; only non-positive probabilities are short-circuited.
    return sum(neg_inf if p <= 0 else log(p)
               for p in (word_probs[word] for word in set(words)))

def choose_max_prob_hub(doc: list, hubs: list):
    """Pick the single hub (label) that best explains the document.

    Parameters
    ----------
    doc : list
        The document's words, passed on to ``calc_log_prob``.
    hubs : list
        Candidate hubs of the document.

    Returns
    -------
    The only hub when exactly one is given (no scoring is done); otherwise
    the hub with the largest ``calc_log_prob(hub, doc)``. On ties the hub
    that comes first in ``hubs`` wins. An empty ``hubs`` raises
    ``ValueError`` from ``max``.
    """
    if len(hubs) != 1:
        return max(hubs, key=lambda hub: calc_log_prob(hub, doc))
    return hubs[0]
    
# Reduce every document to a single label (the most probable of its hubs),
# map indexes back to words/labels, serialise to JSON and write the result
# to HDFS in 6 partitions.
(
    vect_data
    .map(lambda doc: {'Id': doc['Id'],
                      'Features': doc['Features'],
                      'Label': choose_max_prob_hub(doc['Features'], doc['Labels'])})
    .map(unvectorize_data)
    .map(json.dumps)
    .repartition(6)
    .saveAsTextFile('hdfs://master:54310/single-label')
)