In [6]:
import json
data = sc.textFile('hdfs://master:54310/clean'). \
    map(lambda line: json.loads(line)). \
    map(lambda doc: {'Features': doc['Text'], 'Labels': doc['Hubs']})

In [7]:
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

In [8]:
documents = data.map(lambda x: x['Features'])

In [9]:
hashingTF = HashingTF()
tf = hashingTF.transform(documents)

In [10]:
#tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [11]:
import numpy as np
from pyspark.mllib.linalg import Vectors

In [12]:
def map_tfidf_vector(array: np.ndarray, percents = [10, 20, 30, 40, 50]):
    return [filter_values_under_percent(array, p) for p in percents]

def cast_sparse_vector_to_dict_and_scale(v):
    max_v = v.toArray().max()
    return dict([(int(i), v[int(i)] / max_v) for i in v.indices])

import operator
def order_dict_by_values(d: dict, asc = True):
    return sorted(d.items(), key = operator.itemgetter(1), reverse=asc)

In [13]:
def filter_values_under_percent(vec, percent: int):
    tfidfs = cast_sparse_vector_to_dict_and_scale(vec)
    ordered_tfidfs = order_dict_by_values(tfidfs)
    indexes = list(map(lambda k: k, ordered_tfidfs))
    index = int(len(indexes) * percent / 100)
    return indexes[:index]

In [14]:
wc = documents.flatMap(lambda d: set(d)).countByValue()

In [15]:
wc_rdd = sc.parallelize(wc.items()). \
    map(lambda wc: (hashingTF.indexOf(wc[0]), wc[1])). \
    persist()
tfidf = tfidf.persist()

In [None]:
experiments = {}
for p in [10, 20, 30, 40, 50, 60]:
    excluded = tfidf.flatMap(lambda vec: filter_values_under_percent(vec, p)). \
        countByValue()
    excluded_rdd = sc.parallelize(excluded.items()).persist()
    stat = wc_rdd.join(excluded_rdd).map(lambda s: (s[0], s[1][1]/s[1][0])).persist()
    experiment = {}
    for t in [.9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda s: s[1] > t).count()
        experiment[t] = excluded_count
    stat.unpersist()
    experiments[p] = experiment    

In [None]:
excluded = tfidf.flatMap(lambda vec: filter_values_under_percent(vec, 50)). \
        countByValue()
excluded_rdd = sc.parallelize(excluded.items())

In [None]:
stat = wc_rdd.join(excluded_rdd)
stat = stat.map(lambda s: (s[0], s[1][1]/s[1][0]))

In [None]:
stat.take(1)

In [124]:
def exclude_words_bellow_threshold(word_tfidfs: dict, t: float):
    return [i_tfidf[0] for i_tfidf in word_tfidfs.items() if i_tfidf[1] < t]

import operator

def exclude_part_of_words(word_tfidfs: dict, percent: float):
    sorted_by_tf_idf = sorted(word_tfidfs.items(), key=operator.itemgetter(1))
    border = int(percent * len(word_tfidfs))
    return sorted_by_tf_idf[0:border]

In [136]:
exclude_part_of_words({'a':0.002, 'b':0.2, 'c':0.00001},.4)

[('c', 1e-05)]

In [143]:
t = sc.parallelize(documents.take(1)).zip(sc.parallelize(tfidf.take(1))).take(1)[0]

In [148]:
d = sparse_vector_to_dict(t[1])

In [152]:
ex = exclude_part_of_words(d, .3)
restore_words_by_ids(t[0], ex)
t[0]

['добр',
 'ден',
 'хабр',
 'продолжа',
 'изучен',
 'новинок',
 'представлен',
 'компан',
 'samsung',
 'ces',
 'обрат',
 'семейств',
 'ativ',
 'ряд',
 'котор',
 'стал',
 'устройств',
 'гост',
 'выставк',
 'показа',
 'нов',
 'компактн',
 'ноутбук',
 'ativ',
 'book',
 'edit',
 'моноблок',
 'ativ',
 'one',
 'edit',
 'нов',
 'ноутбук',
 'window',
 'сложн',
 'догада',
 'созда',
 'основ',
 'флагманск',
 'модел',
 'прошл',
 'год',
 'ativ',
 'book',
 'тонк',
 'цельнометаллическ',
 'корпус',
 'устройств',
 'порад',
 'приятн',
 'дизайн',
 'изрядн',
 'потолстевш',
 'аккумулятор',
 'час',
 'беспрерывн',
 'работ',
 'дюймов',
 'сенсорн',
 'full',
 'ярк',
 'прот',
 'предшественник',
 'подавн',
 'нов',
 'эргономичн',
 'клавиатур',
 'имеет',
 'сертификат',
 'предустановлен',
 'плеер',
 'splayer',
 'чип',
 'wolfson',
 'dac',
 'обеща',
 'достойн',
 'качеств',
 'звучан',
 'техническ',
 'характеристик',
 'ativ',
 'book',
 'edit',
 'экра',
 'дюймов',
 'full',
 'операцион',
 'систем',
 'window',
 'процессор',

In [9]:
def restore_words_by_ids(doc, ids):
    words = []
    for w in set(doc): #set
        w_idx = hashingTF.indexOf(w)
        if w_idx in ids:
            words.append(w)
    return words

In [38]:
doc = documents.take(1)[0]

In [39]:
doc_tf = hashingTF.transform(doc)
doc_tfidf = idf.transform(doc_tf)

In [53]:
restore_words_by_ids(doc, exclude_words_bellow_threshold(sparse_vector_to_dict(doc_tfidf), .02))

['компан',
 'котор',
 'созда',
 'год',
 'splayer',
 'качеств',
 'врем',
 'втор',
 'всем',
 'такж',
 'одн',
 'сто',
 'пользовател']

In [131]:
def do_magic(doc_tfidf):
    _dict = sparse_vector_to_dict(doc_tfidf[1])
    #excluded_words = exclude_words_bellow_threshold(_dict, .2)
    excluded_words = exclude_part_of_words(_dict, .3)
    return restore_words_by_ids(doc_tfidf[0], excluded_words)

#---------------------------------------------------------------------

In [None]:
documents.map(lambda d: do_magic(d)).take(1)

In [None]:
exclude_count = documents.zip(tfidf).flatMap(lambda d_tfidf: do_magic(d_tfidf)).countByValue()

In [76]:
exclude_count = sc.parallelize(stat.items()).sortBy(lambda w_c: w_c[1], ascending=False)

#----------------------------------------------------------------------

In [128]:
wc = documents.map(lambda d: set(d)).flatMap(lambda d: d).countByValue() # set(d)

In [132]:
exclude_count = documents.zip(tfidf).flatMap(lambda d_tfidf: do_magic(d_tfidf)).countByValue()

In [133]:
wc_rdd = sc.parallelize(wc.items()).filter(lambda wc: wc[1]>1)
ec_rdd = sc.parallelize(exclude_count.items())

In [134]:
wc_rdd.count(), ec_rdd.count()

(211724, 0)

In [99]:
stat = wc_rdd.join(ec_rdd).map(lambda item: (item[0], item[1][1]/item[1][0], (item[1][0],item[1][1])))

In [101]:
stat.sortBy(lambda x: x[1], ascending=False).take(10)

[('борел', 1.0, (2, 2)),
 ('объянен', 1.0, (2, 2)),
 ('eyer', 1.0, (2, 2)),
 ('loadsvgfromstr', 1.0, (2, 2)),
 ('расссмотрен', 1.0, (2, 2)),
 ('stringscount', 1.0, (2, 2)),
 ('archipelago', 1.0, (3, 3)),
 ('cimgdisplay', 1.0, (2, 2)),
 ('zik', 1.0, (2, 2)),
 ('matn', 1.0, (2, 2))]

In [72]:
print(wc_rdd.count(), ec_rdd.count(), stat.count())

211724 29340 26104


In [41]:
ec_rdd.map(lambda ec: len(ec[0])).min()

3

In [73]:
stat.filter(lambda x: x[1]>=.85).sortBy(lambda x: x[1]).take(10)

[('getsublist', 1.0, (2, 2)),
 ('kursi', 1.0, (2, 2)),
 ('unknownscor', 1.0, (2, 2)),
 ('confbridgelock', 1.0, (2, 2)),
 ('getlogicfunc', 1.0, (2, 2)),
 ('dahdidndon', 1.0, (2, 2)),
 ('mnode', 1.0, (2, 2)),
 ('mailboxstatus', 1.0, (2, 2)),
 ('logicstr', 1.0, (2, 2)),
 ('предпредпоследн', 1.0, (2, 2))]

In [74]:
ec_rdd.sortBy(lambda ec: ec[1], ascending=True).take(10)

[('реструктуризац', 1),
 ('bal', 1),
 ('бнз', 1),
 ('songbird', 1),
 ('батон', 1),
 ('gre', 1),
 ('конф', 1),
 ('vashi', 1),
 ('минификатор', 1),
 ('chroot', 1)]

In [93]:
stat.filter(lambda s: s[1] > .05).sortBy(lambda x: x[1], ascending=True).take(10)

[('уровен', 0.05004170141784821, (8393, 420)),
 ('use', 0.050052872752908, (5674, 284)),
 ('верхн', 0.050056242969628795, (5334, 267)),
 ('леж', 0.050060802594244024, (4934, 247)),
 ('поможет', 0.05006765899864682, (6651, 333)),
 ('разниц', 0.05011577902745617, (6046, 303)),
 ('локальн', 0.05011901613329807, (7562, 379)),
 ('конец', 0.050142894258248895, (3849, 193)),
 ('существен', 0.05016402405686167, (7316, 367)),
 ('недавн', 0.05019334293729623, (13189, 662))]

In [98]:
stat.count()

227698

In [101]:
print(wc_rdd.count())
print(ec_rdd.count())

552218
227698


In [112]:
import re

In [113]:
stat.filter(lambda x: x[1]>1 and not re.match('[a-z]',x[0])). \
sortBy(lambda x: x[1], ascending=False). \
map(lambda x: json.dumps(x,ensure_ascii=False)). \
saveAsTextFile('/home/hadoop/words')