In [3]:
import json
# Read the corpus from HDFS as text; each input line is parsed as one JSON value.
data = (
    sc.textFile('hdfs://master:54310/single-label')
    .map(json.loads)
)

In [8]:
# Document frequency per feature: every document contributes each of its
# features at most once (via set), the 1s are summed per feature, and only the
# (feature, count) pairs whose count is exactly 1 are collected to the driver.
excluding = (
    data
    .flatMap(lambda doc: set(doc['Features']))
    .map(lambda feature: (feature, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] == 1)
    .collect()
)

In [9]:
# `excluding` is a list of (word, count) pairs, not bare words. Building the
# set from the pairs themselves would make the string membership test in
# exclude_words never match, so keep only the word of each pair.
excluding_set = {word for word, _count in excluding}

In [12]:
def exclude_words(doc):
    """Return a copy of `doc` whose 'Features' list has the excluded words removed.

    Words found in the module-level `excluding_set` are dropped; the order of
    the remaining words is preserved. The input document is left untouched
    (a shallow copy is returned) so the function is safe to re-apply to the
    same objects.

    Args:
        doc: mapping with at least a 'Features' key holding an iterable of words.

    Returns:
        A new dict with the same keys as `doc` and a filtered 'Features' list.
    """
    kept = [word for word in doc['Features'] if word not in excluding_set]
    cleaned = dict(doc)  # shallow copy: do not mutate the caller's document
    cleaned['Features'] = kept
    return cleaned

In [13]:
# Apply exclude_words to every document, then keep only the documents whose
# 'Features' value is still truthy (i.e. non-empty) afterwards.
data_clean_words = (
    data
    .map(exclude_words)
    .filter(lambda cleaned: bool(cleaned['Features']))
)

In [29]:
# Build the feature -> column index lookup: gather the distinct features of the
# cleaned corpus, number them with zipWithIndex and collect the result as a dict.
features_idx = (
    data_clean_words
    .flatMap(lambda doc: set(doc['Features']))
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

In [30]:
# Build the label -> integer index lookup from the distinct 'Label' values of
# the cleaned corpus, collected to the driver as a dict.
labels_idx = (
    data_clean_words
    .map(lambda doc: doc['Label'])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)

In [28]:
from collections import Counter

In [34]:
# Register the local sparse.py module with the SparkContext so that the later
# `from sparse import sparse_vector` can be resolved by the tasks as well.
sc.addPyFile('/home/hadoop/spark/lib/sparse.py')

In [37]:
# Broadcast both lookup dicts so the vectorizing functions can read them via
# `.value`, and record the vocabulary size (number of entries in features_idx).
label_idx_br = sc.broadcast(labels_idx)
feature_idx_br = sc.broadcast(features_idx)
num_features = len(features_idx)

In [45]:
from sparse import sparse_vector
from pyspark.mllib.linalg import Vectors
import numpy as np

In [46]:
def vectorize_words(words: list):
    """Convert a list of words into a sparse count vector.

    Every word is mapped to its column index through the broadcast feature
    lookup (`feature_idx_br`); the occurrences of each index are counted and
    the resulting (index, count) pairs are passed to `sparse_vector` together
    with `length=num_features` and `dtype=np.int32`.

    Raises:
        KeyError: if a word is missing from the broadcast feature lookup.
    """
    index_of = feature_idx_br.value
    counts = Counter()
    for word in words:
        counts[index_of[word]] += 1
    pairs = list(counts.items())
    return sparse_vector(pairs, length=num_features, dtype=np.int32)

def vectorize_data(x: dict):
    """Vectorize one document.

    The document's 'Features' list is turned into a count vector with
    `vectorize_words`, and its 'Label' is translated to an integer through the
    broadcast label lookup (`label_idx_br`).

    Returns:
        A dict with the keys 'lable' (label index) and 'features' (the vector).
        The 'lable' spelling is what the consumers of this dict read.
    """
    feature_vector = vectorize_words(x['Features'])
    return {'lable': label_idx_br.value[x['Label']], 'features': feature_vector}

In [47]:
# Turn every cleaned document into a {'lable': ..., 'features': ...} record.
vect_data = data_clean_words.map(vectorize_data)

In [55]:
# Pull all vectorized records back to the driver as a Python list.
# NOTE(review): this materializes the whole dataset in driver memory.
vd = vect_data.collect()

In [58]:
# Number of collected documents; used as the row count of X and the length of y.
doc_count = len(vd)

In [59]:
from scipy.sparse import dok_matrix

In [79]:
# Allocate an empty doc_count x num_features sparse matrix (dictionary-of-keys
# format) for the feature counts and a zero-filled int32 vector for the labels.
X = dok_matrix((doc_count, num_features), dtype=np.int32)
y = np.zeros(doc_count, dtype=np.int32)

In [78]:
# Reset X and y to a pristine, empty state before they are filled.
# This cell used to `del X` / `del y`; it was executed *before* the allocation
# cell (see the execution counts), but in file order it sits after it, so a
# top-to-bottom run deleted the arrays right before the fill loop needed them.
# Re-allocating instead keeps the "start from scratch" intent and makes the
# cell idempotent: the previous objects are released when the names are rebound.
X = dok_matrix((doc_count, num_features), dtype=np.int32)
y = np.zeros(doc_count, dtype=np.int32)

In [None]:
# Copy every collected record into row `row_num` of X (feature counts) and
# into position `row_num` of y (label index).
for row_num, doc in enumerate(vd):
    # Progress indicator: print the row number once every 1000 documents.
    if row_num % 1000 == 0:
        print(row_num)
    label = doc['lable']  # key spelling matches the dict built by vectorize_data
    features = doc['features']
    y[row_num] = label
    # Disabled alternative that assigned the whole row at once:
#     X[row_num,:] = features.inner
    # NOTE(review): assumes sparse_vector.nonzero() iterates over the column
    # indices of the stored entries and that features[col_num] returns the
    # count at that index -- confirm against sparse.py.
    for col_num in features.nonzero():
        X[row_num, col_num] = features[col_num]

In [83]:
import json
# Persist both lookup tables as JSON. ensure_ascii=False writes non-ASCII
# characters verbatim, so the files are opened explicitly as UTF-8: relying on
# the locale's default encoding could raise UnicodeEncodeError or produce
# files that are not valid UTF-8. Plain 'w' is enough; nothing is read back.
with open('/home/hadoop/exp/label_idx.json', 'w', encoding='utf-8') as fp:
    json.dump(labels_idx, fp, ensure_ascii=False)
with open('/home/hadoop/exp/feature_idx.json', 'w', encoding='utf-8') as fp:
    json.dump(features_idx, fp, ensure_ascii=False)
from scipy import io
# Persist the assembled training data: X through scipy.io.mmwrite (.mtx file)
# and y through np.save (.npy file).
io.mmwrite('/home/hadoop/exp/X.mtx', X)
np.save('/home/hadoop/exp/y.npy', y)
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extra-trees classifier on the sparse document/feature matrix using
# three parallel jobs. random_state is pinned so that re-running the notebook
# builds the same forest; without it every run yields a different model.
clf = ExtraTreesClassifier(n_jobs=3, random_state=0)
clf.fit(X, y)