In [1]:
import json
from pyspark.mllib.util import MLUtils
from collections import Counter
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.classification import LabeledPoint

In [2]:
def add_bi_tri_gramms(doc):
    """Append word bigrams and trigrams to a document's feature list.

    The n-grams are built from consecutive entries of ``doc['Features']`` as
    it is on entry, joined by single spaces. Bigrams are appended first, then
    trigrams; the original entries keep their positions.

    Args:
        doc: Mapping whose ``'Features'`` entry is a list of tokens.

    Returns:
        The same ``doc`` object; its ``'Features'`` list is extended in place.
    """
    tokens = doc['Features']
    # zip() stops at its shortest argument, so pairing the list with shifted
    # copies of itself yields exactly the runs of adjacent tokens.
    bigrams = [
        '{} {}'.format(first, second)
        for first, second in zip(tokens, tokens[1:])
    ]
    trigrams = [
        '{} {} {}'.format(first, second, third)
        for first, second, third in zip(tokens, tokens[1:], tokens[2:])
    ]
    # Both n-gram lists are fully built before the feature list grows.
    tokens += bigrams + trigrams
    return doc

In [3]:
# Read the corpus line by line, parse each line as JSON, then extend every
# document's 'Features' with bigrams and trigrams.
temp_data = (
    sc.textFile('hdfs://master:54310/exp_f/clean')
    .map(json.loads)
    .map(add_bi_tri_gramms)
)

In [5]:
# Feature whitelist: keep a feature only if it occurs in more than 24 documents.
# set() collapses repeats inside one document, so each document contributes at
# most 1 to a feature's count (i.e. the count is a document frequency).
word_WL = (
    temp_data
    .flatMap(lambda doc: ((feature, 1) for feature in set(doc['Features'])))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] > 24)
    .map(lambda pair: pair[0])
    .collect()
)

In [6]:
# Display how many features passed the document-frequency filter.
len(word_WL)

226119

In [7]:
# Broadcast the whitelist as a set so that the `in` checks done by
# filter_words are hash lookups rather than list scans.
word_WL_br = sc.broadcast(set(word_WL))

In [8]:
def filter_words(doc):
    """Drop every feature that is not in the broadcast whitelist.

    Membership is tested against ``word_WL_br.value``. Surviving features keep
    their original order and multiplicity.

    Args:
        doc: Mapping with a ``'Features'`` sequence of tokens.

    Returns:
        The same ``doc`` object, with ``'Features'`` rebound to a new list
        holding only the whitelisted tokens.
    """
    allowed = word_WL_br.value
    kept = []
    for token in doc['Features']:
        if token in allowed:
            kept.append(token)
    doc['Features'] = kept
    return doc

In [9]:
# Keep only whitelisted features in every document.
# clean_rdd is consumed by several separate actions (the word index build, the
# label index build and the final LIBSVM export). Without persistence each of
# those would re-read the input and redo JSON parsing, n-gram generation and
# filtering, so mark the RDD as cached; cache() returns the same RDD.
clean_rdd = temp_data.map(filter_words).cache()

In [10]:
# Map every distinct feature to an integer index (used later as the position
# of the feature inside the sparse vector).
word_idx = (
    clean_rdd
    .flatMap(lambda doc: doc['Features'])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

# Same construction for the labels: distinct label -> integer index.
label_idx = (
    clean_rdd
    .flatMap(lambda doc: doc['Labels'])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

In [11]:
# Persist both lookup tables as JSON on the driver's local disk.
with open('/home/hadoop/data/model/w_idx.json', 'w+') as fp:
    fp.write(json.dumps(word_idx))

with open('/home/hadoop/data/model/l_idx.json', 'w+') as fp:
    fp.write(json.dumps(label_idx))

In [12]:
# Broadcast both lookup tables; to_l_points reads them through `.value`.
word_idx_br, label_idx_br = sc.broadcast(word_idx), sc.broadcast(label_idx)

In [13]:
# Size of the sparse feature vectors built in to_l_points: one slot per
# indexed feature.
feature_num = len(word_idx)

In [14]:
def to_l_points(x):
    """Turn one document into one LabeledPoint per label.

    Each feature's value is the number of times it occurs in
    ``x['Features']``, stored at the feature's index from ``word_idx_br`` in a
    sparse vector of size ``feature_num``. The vector is built once and passed
    to every LabeledPoint created for the document.

    Args:
        x: Mapping with ``'Features'`` (tokens) and ``'Labels'`` entries.

    Returns:
        A list with one LabeledPoint per entry of ``x['Labels']``, in order;
        each point's label is the entry's index from ``label_idx_br``.
    """
    feature_slots = word_idx_br.value
    label_ids = label_idx_br.value

    occurrences = Counter(x['Features'])
    entries = [(feature_slots[token], count) for token, count in occurrences.items()]
    vector = Vectors.sparse(feature_num, entries)

    points = []
    for label in x['Labels']:
        points.append(LabeledPoint(label_ids[label], vector))
    return points

In [15]:
# flatMap (not map): to_l_points returns a list of points per document,
# one for each of the document's labels.
l_points_rdd = clean_rdd.flatMap(to_l_points)

In [16]:
# Export the labelled points in LIBSVM format to HDFS. repartition(1) puts all
# points into a single partition first, presumably so the output is one part file.
# NOTE(review): this also sends the whole write through one task; confirm the
# single-file output is really required.
MLUtils.saveAsLibSVMFile(l_points_rdd.repartition(1), 'hdfs://master:54310/exp_f/l_svm_b_t')