In [1]:
import json
# Read the single-label corpus from HDFS; every text line is decoded as one
# JSON document.
data = sc.textFile('hdfs://master:54310/single-label').map(json.loads)

In [3]:
# Sanity check: how many documents have an empty 'Features' list?
zero_w = (
    data
    .map(lambda doc: len(doc['Features']))
    .filter(lambda n_features: n_features == 0)
)
zero_w.count()

0

In [6]:
# Words that occur in exactly one document. set() makes a word count once per
# document, so the reduced value is a document frequency.
excluding_words = (
    data
    .flatMap(lambda doc: set(doc['Features']))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda word_and_df: word_and_df[1] == 1)
    .keys()
    .collect()
)

In [7]:
# Turn the collected list into a set for O(1) membership tests, then broadcast
# it so tasks can read it through `.value`.
excluding_words_set = {word for word in excluding_words}
excluding_words_set_br = sc.broadcast(excluding_words_set)

In [9]:
def exclude_words(doc):
    """Return a copy of ``doc`` without the broadcast excluded words.

    Args:
        doc: Mapping with at least a ``'Features'`` entry holding an iterable
            of words. Other entries are carried over unchanged.

    Returns:
        A new dict with the same entries as ``doc`` whose ``'Features'`` is a
        list of the original words, in order, minus every word found in
        ``excluding_words_set_br.value``.

    The input record is left untouched: mutating records inside an RDD
    transformation makes the result depend on whether and how the upstream
    data is reused, so a shallow copy is returned instead.
    """
    # Resolve the broadcast value once instead of once per word.
    excluded = excluding_words_set_br.value
    cleaned = dict(doc)
    cleaned['Features'] = [w for w in doc['Features'] if w not in excluded]
    return cleaned

In [10]:
# Remove the excluded words from every document, then keep only documents
# that still have a non-empty 'Features' value.
data_clean_words = (
    data
    .map(exclude_words)
    .filter(lambda cleaned: bool(cleaned['Features']))
)

In [11]:
# Map every distinct remaining word to an integer index.
features_idx = dict(
    data_clean_words
    .flatMap(lambda doc: set(doc['Features']))
    .distinct()
    .zipWithIndex()
    .collect()
)

In [12]:
# Map every distinct label to an integer index.
labels_idx = dict(
    data_clean_words
    .map(lambda doc: doc['Label'])
    .distinct()
    .zipWithIndex()
    .collect()
)

In [13]:
from collections import Counter

In [14]:
# Register the local sparse.py helper with the SparkContext; a later cell
# imports `sparse_vector` from it for use inside RDD transformations.
sc.addPyFile('/home/hadoop/spark/lib/sparse.py')

In [15]:
# Broadcast both lookup tables (word -> index, label -> index), in that order,
# and record the vocabulary size used as the vector length.
feature_idx_br, label_idx_br = [
    sc.broadcast(mapping) for mapping in (features_idx, labels_idx)
]
num_features = len(features_idx)

In [16]:
from sparse import sparse_vector
from pyspark.mllib.linalg import Vectors
import numpy as np

In [17]:
def vectorize_words(words: list):
    """Build a bag-of-words ``sparse_vector`` for one document.

    Args:
        words: Words of the document; each must be a key of
            ``feature_idx_br.value`` (a missing word raises ``KeyError``).

    Returns:
        The result of ``sparse_vector`` called with the list of
        ``(feature_index, occurrence_count)`` pairs, ``length=num_features``
        and ``dtype=np.int32``.
    """
    index_of = feature_idx_br.value
    occurrences = Counter()
    for word in words:
        occurrences[index_of[word]] += 1
    index_count_pairs = list(occurrences.items())
    return sparse_vector(index_count_pairs, length=num_features, dtype=np.int32)

def vectorize_data(x: dict):
    """Convert one cleaned document into its numeric representation.

    Args:
        x: Mapping with a ``'Features'`` entry (passed to ``vectorize_words``)
            and a ``'Label'`` entry (looked up in ``label_idx_br.value``).

    Returns:
        A dict with two entries: ``'lable'`` (the integer label index) and
        ``'features'`` (the vectorized words). The ``'lable'`` spelling is
        what the downstream matrix-filling loop reads, so it is kept as is.
    """
    feature_vector = vectorize_words(x['Features'])
    record = {'lable': label_idx_br.value[x['Label']]}
    record['features'] = feature_vector
    return record

In [18]:
# Turn every cleaned document into a {'lable': ..., 'features': ...} record
# via vectorize_data.
vect_data = data_clean_words.map(vectorize_data)

In [19]:
# Pull all vectorized records back to the driver as a Python list; the
# following cells assemble the matrix locally from it.
vd = vect_data.collect()

In [20]:
# Number of collected documents; used as the row count of X and length of y.
doc_count = len(vd)

In [21]:
from scipy.sparse import dok_matrix

In [22]:
# Allocate the outputs: y holds one int32 label index per document, X is an
# empty (documents x features) int32 sparse matrix in dictionary-of-keys form.
y = np.zeros((doc_count,), dtype=np.int32)
X = dok_matrix((doc_count, num_features), dtype=np.int32)

In [23]:
# Copy each collected record into row `row_num` of X and entry `row_num` of y.
for row_num, doc in enumerate(vd):
    # Progress marker every 2000 documents.
    if not row_num % 2000:
        print(row_num)
    label, features = doc['lable'], doc['features']
    y[row_num] = label
    # presumably nonzero() yields the column indices holding a count —
    # sparse_vector comes from the external sparse.py, confirm there.
    for col_num in features.nonzero():
        X[row_num, col_num] = features[col_num]

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000


In [24]:
import json
# Persist both lookup tables as JSON. ensure_ascii=False writes non-ASCII
# characters verbatim, so the file encoding is pinned to UTF-8 instead of the
# locale default (which can raise UnicodeEncodeError under a C/POSIX locale).
# Plain 'w' is enough: the files are only written, never read back here.
with open('/home/hadoop/exp/label_idx.json', 'w', encoding='utf-8') as fp:
    json.dump(labels_idx, fp, ensure_ascii=False)
with open('/home/hadoop/exp/feature_idx.json', 'w', encoding='utf-8') as fp:
    json.dump(features_idx, fp, ensure_ascii=False)
from scipy import io
# Write the sparse feature matrix in Matrix Market format and the label
# vector as a NumPy .npy file.
io.mmwrite(target='/home/hadoop/exp/X.mtx', a=X)
np.save(file='/home/hadoop/exp/y.npy', arr=y)