In [1]:
import json
data = sc.textFile('hdfs://master:54310/clean'). \
    map(lambda line: json.loads(line))

In [4]:
all_words = data.flatMap(lambda x: set(x['Text']))

In [5]:
word_count = all_words.countByValue()

In [14]:
list(word_count.items())[2]

('clearless', 1)

In [16]:
import re

In [17]:
ordered_wc = sc.parallelize(word_count.items()). \
    distinct(). \
    sortBy(lambda wc: wc[1])

In [18]:
ordered_wc.filter(lambda wc: wc[1] == 1).count()

340494

In [48]:
ordered_wc = ordered_wc.filter(lambda wc: wc[1] > 1)

In [49]:
rus = ordered_wc.filter(lambda wc: not re.match('[a-z]', wc[0])). \
    sortBy(lambda wc: wc[1])

In [50]:
eng = ordered_wc.filter(lambda wc: re.match('[a-z]', wc[0])). \
    sortBy(lambda wc: wc[1], ascending=True)

In [51]:
rus.count()

108988

In [52]:
eng.count()

102736

In [60]:
eng.filter(lambda wc: len(wc[0]) == 5).sortBy(lambda wc: wc[1], ascending=False).take(50)

[('googl', 17665),
 ('linux', 8860),
 ('class', 7937),
 ('sourc', 6727),
 ('updat', 5368),
 ('iphon', 4945),
 ('intel', 4572),
 ('manag', 4345),
 ('start', 4163),
 ('index', 4117),
 ('creat', 3976),
 ('error', 3627),
 ('mobil', 3440),
 ('flash', 3410),
 ('group', 3387),
 ('array', 3133),
 ('store', 3107),
 ('https', 3105),
 ('world', 3084),
 ('mysql', 3071),
 ('local', 2913),
 ('opera', 2778),
 ('email', 2749),
 ('secur', 2702),
 ('event', 2683),
 ('activ', 2653),
 ('phone', 2645),
 ('input', 2629),
 ('build', 2583),
 ('width', 2564),
 ('style', 2510),
 ('engin', 2455),
 ('count', 2403),
 ('apach', 2398),
 ('media', 2393),
 ('gmail', 2377),
 ('write', 2368),
 ('model', 2320),
 ('color', 2306),
 ('share', 2225),
 ('hello', 2173),
 ('print', 2164),
 ('break', 2112),
 ('nokia', 2110),
 ('defin', 2062),
 ('first', 2047),
 ('devic', 2028),
 ('modul', 1996),
 ('delet', 1994),
 ('const', 1965)]

In [43]:
eng.sortBy(lambda wc: wc[1], ascending=False).take(50)[30:]

[('server', 6274),
 ('code', 6250),
 ('public', 6228),
 ('file', 6219),
 ('set', 6161),
 ('var', 6094),
 ('java', 6006),
 ('els', 5924),
 ('fals', 5907),
 ('open', 5811),
 ('string', 5737),
 ('text', 5724),
 ('use', 5674),
 ('valu', 5558),
 ('usb', 5410),
 ('app', 5393),
 ('void', 5388),
 ('updat', 5368),
 ('url', 5297),
 ('null', 5295)]

In [47]:
eng.filter(lambda wc: len(wc[0])==10).take(20)

[('stdelement', 1),
 ('longstatus', 1),
 ('xygxyhanla', 1),
 ('markerfilt', 1),
 ('webometrix', 1),
 ('keyfilenam', 1),
 ('teckcrunch', 1),
 ('reactpivot', 1),
 ('ignoreopen', 1),
 ('dontgotouk', 1),
 ('titlefield', 1),
 ('windowdata', 1),
 ('gimballock', 1),
 ('readsymbol', 1),
 ('gdatabundl', 1),
 ('dlloverrid', 1),
 ('marginstep', 1),
 ('soapservic', 1),
 ('davesexton', 1),
 ('outcontrol', 1)]

In [61]:
texts = data.map(lambda row: row['Text'])

In [62]:
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF

In [63]:
hashingTF = HashingTF()
tf = hashingTF.transform(texts)

In [65]:
from pyspark.mllib.feature import IDF
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [67]:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

In [82]:
hashingTF.indexOf('google')

680122

In [88]:
data.take(1)[0]['Text'][0]

'добр'

In [89]:
hashingTF.indexOf('добр')

956959

In [108]:
temp = tfidf.filter(lambda d: 956959 in d.indices).map(lambda d: d[956959])

In [171]:
hashes = all_words.distinct().map(lambda w: (hashingTF.indexOf(w), w))

In [121]:
def sparse_vector_to_dict(vector):
    return [(int(i), vector[int(i)]) for i in vector.indices]

In [None]:
import numpy as np

def calc_stat(group):
    values = np.array(list(group))    
    min_v = values.min()
    max_v = values.max()
    mean_v = values.mean()
    return (min_v, max_v, mean_v)

In [159]:
stat = tfidf.flatMap(lambda doc: sparse_vector_to_dict(doc)). \
    groupBy(lambda x: x[0]). \
    map(lambda g: (g[0], calc_stat(g[1]))). \
    collect()

In [163]:
sc.parallelize(stat).map(lambda x: x[0]).distinct().count()

428674

In [164]:
hashes.count()

552218

In [172]:
joined = hashes.join(sc.parallelize(stat))

In [173]:
joined.take(1)

[(376832, ('линз', (5.5548525356221568, 376832.0, 188423.30485689137)))]

In [186]:
hist = joined.map(lambda x: x[1][1][1]).histogram(20)

In [187]:
hist

([0.0,
  52428.699999999997,
  104857.39999999999,
  157286.09999999998,
  209714.79999999999,
  262143.5,
  314572.19999999995,
  367000.89999999997,
  419429.59999999998,
  471858.29999999999,
  524287.0,
  576715.69999999995,
  629144.39999999991,
  681573.09999999998,
  734001.79999999993,
  786430.5,
  838859.19999999995,
  891287.89999999991,
  943716.59999999998,
  996145.29999999993,
  1048574.0],
 [27626,
  27639,
  27801,
  27571,
  27403,
  27576,
  27606,
  27652,
  27855,
  27543,
  27646,
  27534,
  27531,
  27717,
  27664,
  27707,
  27360,
  27563,
  27552,
  27672])