In [1]:
from pyspark import RDD

In [2]:
class StoringTF(object):
    """
    Term-frequency vectorizer backed by an explicit vocabulary.

    `fit` builds a term -> index dictionary from an RDD of terms and keeps
    it on the driver; `transform` turns a document (an iterable of terms)
    into a sparse vector of term counts of length `numFeatures`.
    """

    def __init__(self):
        # Defined up front so an unfitted instance has well-defined attributes.
        self.word_idx = {}
        self.numFeatures = 0

    def fit(self, vacabulary):
        """
        Builds the term -> index mapping from an RDD of terms.

        Duplicated terms are collapsed with `distinct()` before indexing,
        and the resulting mapping is collected as a plain dict.

        Returns this instance so that calls can be chained.
        """
        self.word_idx = vacabulary.distinct().zipWithIndex().collectAsMap()
        self.numFeatures = len(self.word_idx)
        return self

    def indexOf(self, term):
        """
        Returns the index of the input term.

        Raises KeyError if the term is not part of the fitted vocabulary.
        """
        return self.word_idx[term]

    def transform(self, document):
        """
        Transforms the input document (list of terms) to a term frequency
        vector, or transforms an RDD of documents to an RDD of term
        frequency vectors.

        Raises KeyError (through `indexOf`) when a document contains a term
        that is not part of the fitted vocabulary.
        """
        if isinstance(document, RDD):
            return document.map(self.transform)

        # Imported locally so the class does not depend on a later notebook
        # cell having already brought `Vectors` into the global namespace.
        from pyspark.mllib.linalg import Vectors

        freq = {}
        for term in document:
            i = self.indexOf(term)
            freq[i] = freq.get(i, 0) + 1.0
        return Vectors.sparse(self.numFeatures, freq.items())

In [5]:
import json
#from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
import numpy as np
from pyspark.mllib.linalg import Vectors

In [6]:
# Every line of the input is parsed as one JSON document.
data = (
    sc.textFile('hdfs://master:54310/clean')
    .map(json.loads)
)

# Only the 'Features' entry of each document is needed for the TF-IDF work;
# the RDD is used by several later cells, so it is marked for caching.
documents = data.map(lambda doc: doc['Features'])
documents.cache()

PythonRDD[5] at RDD at PythonRDD.scala:43

In [7]:
# Flatten every document's feature list and keep each term once.
vocabulary = documents.flatMap(lambda d: d).distinct()

In [8]:
# Build the term -> index mapping from the vocabulary RDD.
storingTF = StoringTF()
storingTF.fit(vocabulary)

In [9]:
# Term-count vectors for every document, then an IDF model fitted on them.
tf = storingTF.transform(documents)
idf = IDF().fit(tf)
# Re-weight the term counts with the fitted IDF model.
tfidf = idf.transform(tf)

In [10]:
# Document frequency per term index: each document contributes a term at most
# once (set), the ones are summed per term, and the term is then replaced by
# its vocabulary index.
wc = (
    documents
    .flatMap(lambda doc: set(doc))
    .map(lambda term: (term, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (storingTF.indexOf(pair[0]), pair[1]))
    .persist()
)

In [11]:
# Number of distinct term indexes that occur in the documents.
documents.flatMap(lambda d: set(d)).map(lambda w: storingTF.indexOf(w)).distinct().count()

98621

In [12]:
def cast_sparse_vector_to_dict_and_scale(v):
    """
    Converts a sparse vector to a dict `{index: value / max_value}`.

    Only the stored entries of the vector (`v.indices` / `v.values`) are
    read, so the vector is never expanded to its full dense length.

    Args:
        v: a sparse vector exposing parallel `indices` and `values` arrays.
           Stored values are assumed to be non-negative (as TF-IDF weights
           are) - TODO confirm if this is reused for other kinds of vectors.

    Returns:
        A dict keyed by plain `int` index. It is empty when the vector has
        no stored entries, and maps every index to 0.0 when the largest
        stored value is zero (instead of dividing zero by zero).
    """
    indices, values = v.indices, v.values
    if len(indices) == 0:
        return {}

    max_v = float(values.max())
    if max_v == 0:
        # All stored weights are zero; scaling would otherwise produce NaN.
        return {int(i): 0.0 for i in indices}

    return {int(i): float(x) / max_v for i, x in zip(indices, values)}

import operator
def order_dict_by_values(d: dict, asc = True):
    """
    Returns the (key, value) pairs of `d` as a list ordered by value.

    The order is ascending by default and descending when `asc` is false.
    """
    descending = not asc
    pairs = list(d.items())
    pairs.sort(key=lambda pair: pair[1], reverse=descending)
    return pairs

def filter_values_under_percent(vec, percent: int):
    """
    Returns the indexes of the lowest-valued `percent` % of the entries
    of `vec`.

    The entries are scaled by the vector maximum, ordered by ascending
    value, and the leading `int(count * percent / 100)` indexes are kept.
    """
    ranked = order_dict_by_values(cast_sparse_vector_to_dict_and_scale(vec))
    cutoff = int(len(ranked) * percent / 100)
    return [idx for idx, _ in ranked][:cutoff]

In [13]:
# For every percentage p: mark, in each document, the p% of its terms with the
# lowest scaled TF-IDF, then count the terms that are marked in more than a
# fraction t of the documents containing them.
experiments = {}
for p in [10, 20, 30, 40, 50, 60]:
    # (term index, number of documents in which the term was marked)
    excluded = tfidf.flatMap(lambda vec: filter_values_under_percent(vec, p)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b).persist()
    # (term index, marked-document count divided by the term's count in `wc`)
    stat = wc.join(excluded).map(lambda s: (s[0], s[1][1]/s[1][0])).persist()
    experiment = {}
    for t in [.9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda s: s[1] > t).count()
        experiment[t] = excluded_count
        print("p: {0}, t: {1}, count: {2}".format(p, t, excluded_count))
    # Release the persisted RDDs before moving on to the next p.
    stat.unpersist()
    excluded.unpersist()
    experiments[p] = experiment    

p: 10, t: 0.9, count: 0
p: 10, t: 0.8, count: 0
p: 10, t: 0.7, count: 1
p: 10, t: 0.6, count: 6
p: 10, t: 0.5, count: 41
p: 10, t: 0.4, count: 89
p: 10, t: 0.3, count: 131
p: 10, t: 0.2, count: 179
p: 10, t: 0.1, count: 255
p: 20, t: 0.9, count: 0
p: 20, t: 0.8, count: 1
p: 20, t: 0.7, count: 15
p: 20, t: 0.6, count: 108
p: 20, t: 0.5, count: 216
p: 20, t: 0.4, count: 300
p: 20, t: 0.3, count: 388
p: 20, t: 0.2, count: 479
p: 20, t: 0.1, count: 629
p: 30, t: 0.9, count: 1
p: 30, t: 0.8, count: 12
p: 30, t: 0.7, count: 123
p: 30, t: 0.6, count: 339
p: 30, t: 0.5, count: 492
p: 30, t: 0.4, count: 613
p: 30, t: 0.3, count: 741
p: 30, t: 0.2, count: 908
p: 30, t: 0.1, count: 1216
p: 40, t: 0.9, count: 3
p: 40, t: 0.8, count: 74
p: 40, t: 0.7, count: 402
p: 40, t: 0.6, count: 659
p: 40, t: 0.5, count: 881
p: 40, t: 0.4, count: 1101
p: 40, t: 0.3, count: 1345
p: 40, t: 0.2, count: 1624
p: 40, t: 0.1, count: 2169
p: 50, t: 0.9, count: 17
p: 50, t: 0.8, count: 313
p: 50, t: 0.7, count: 830
p: 

In [14]:
import pandas as pd

In [15]:
# Columns are the percentages p, rows are the thresholds t.
df = pd.DataFrame(experiments)

In [16]:
# Display the table of counts from the percentage-based experiment.
df

Unnamed: 0,10,20,30,40,50,60
0.1,255,629,1216,2169,3901,7737
0.2,179,479,908,1624,2883,5431
0.3,131,388,741,1345,2319,4292
0.4,89,300,613,1101,1907,3445
0.5,41,216,492,881,1552,2704
0.6,6,108,339,659,1213,2083
0.7,1,15,123,402,830,1494
0.8,0,1,12,74,313,837
0.9,0,0,1,3,17,101


In [18]:
def exclude_words_bellow_threshold(vec, t: float):
    """
    Returns the indexes of the entries of `vec` whose value, scaled by the
    vector maximum, is strictly below `t`.
    """
    scaled = cast_sparse_vector_to_dict_and_scale(vec)
    below = []
    for idx, score in scaled.items():
        if score < t:
            below.append(idx)
    return below

# Same experiment as before, but here p is a cut-off on the scaled TF-IDF value
# itself: in each document the terms scoring below p are marked.
experiments_2 = {}
for p in [.5, .4, .3, .2, .1, .05, .01, .005]:
    # (term index, number of documents in which the term scored below p)
    excluded = tfidf.flatMap(lambda vec: exclude_words_bellow_threshold(vec, p)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b).persist()
    # (term index, marked-document count divided by the term's count in `wc`)
    stat = wc.join(excluded).map(lambda s: (s[0], s[1][1]/s[1][0])).persist()
    experiment = {}
    for t in [.9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda s: s[1] > t).count()
        experiment[t] = excluded_count
        print("p: {0}, t: {1}, count: {2}".format(p, t, excluded_count))
    # Release the persisted RDDs before moving on to the next p.
    stat.unpersist()
    excluded.unpersist()
    experiments_2[p] = experiment    

p: 0.5, t: 0.9, count: 71864
p: 0.5, t: 0.8, count: 82381
p: 0.5, t: 0.7, count: 89513
p: 0.5, t: 0.6, count: 93136
p: 0.5, t: 0.5, count: 94038
p: 0.5, t: 0.4, count: 96707
p: 0.5, t: 0.3, count: 97427
p: 0.5, t: 0.2, count: 97559
p: 0.5, t: 0.1, count: 97623
p: 0.4, t: 0.9, count: 60149
p: 0.4, t: 0.8, count: 74208
p: 0.4, t: 0.7, count: 84474
p: 0.4, t: 0.6, count: 89860
p: 0.4, t: 0.5, count: 91441
p: 0.4, t: 0.4, count: 95392
p: 0.4, t: 0.3, count: 96629
p: 0.4, t: 0.2, count: 96898
p: 0.4, t: 0.1, count: 97030
p: 0.3, t: 0.9, count: 42576
p: 0.3, t: 0.8, count: 59652
p: 0.3, t: 0.7, count: 74157
p: 0.3, t: 0.6, count: 82756
p: 0.3, t: 0.5, count: 85979
p: 0.3, t: 0.4, count: 92390
p: 0.3, t: 0.3, count: 94894
p: 0.3, t: 0.2, count: 95560
p: 0.3, t: 0.1, count: 95868
p: 0.2, t: 0.9, count: 19740
p: 0.2, t: 0.8, count: 33481
p: 0.2, t: 0.7, count: 51274
p: 0.2, t: 0.6, count: 65199
p: 0.2, t: 0.5, count: 72501
p: 0.2, t: 0.4, count: 83742
p: 0.2, t: 0.3, count: 89707
p: 0.2, t: 0.2

In [19]:
# Columns are the TF-IDF cut-offs p, rows are the thresholds t.
pd.DataFrame(experiments_2)

Unnamed: 0,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5
0.1,337,1326,54816,81103,93014,95868,97030,97623
0.2,132,437,34945,74179,91873,95560,96898,97559
0.3,68,229,19923,64431,89707,94894,96629,97427
0.4,30,102,9273,48101,83742,92390,95392,96707
0.5,7,27,3430,29451,72501,85979,91441,94038
0.6,7,27,2206,19556,65199,82756,89860,93136
0.7,7,25,1218,10564,51274,74157,84474,89513
0.8,7,25,892,5467,33481,59652,74208,82381
0.9,7,25,865,4005,19740,42576,60149,71864


In [27]:
# Reverse lookup (index -> term) of the fitted vocabulary mapping.
idx_word = {idx: word for word, idx in storingTF.word_idx.items()}
len(idx_word)

98621

In [28]:
# Chosen setting: a term is marked in a document when its scaled TF-IDF is
# below 0.2 there.
excluded = tfidf.flatMap(lambda vec: exclude_words_bellow_threshold(vec, 0.2)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b).persist()
# (term index, marked-document count divided by the term's count in `wc`)
stat = wc.join(excluded).map(lambda s: (s[0], s[1][1]/s[1][0])).persist()
# Terms marked in more than 80% of their documents, mapped back to the words.
excluded_words = stat.filter(lambda s: s[1] > 0.8).map(lambda x: idx_word[x[0]]).collect()

In [29]:
# Number of words selected for exclusion.
len(excluded_words)

33481

In [30]:
# Peek at the first few words selected for exclusion.
excluded_words[:5]

['итог', 'sps', 'восьмиугольн', 'корн', 'неосвещен']

In [31]:
# Kept as a set because `exclude_words` tests membership once per feature.
set_of_excluded_words = set(excluded_words)

In [32]:
# Size of the set; it equals the list length when the list has no duplicates.
len(set_of_excluded_words)

33481

In [33]:
def exclude_words(doc: dict, excluded=None):
    """
    Returns a copy of `doc` whose 'Features' list no longer contains the
    excluded words.

    The input document is left unmodified; every key other than 'Features'
    is carried over as-is (shallow copy).

    Args:
        doc: a document dict with a 'Features' list of words.
        excluded: collection of words to drop. When omitted, the notebook's
            global `set_of_excluded_words` is used.

    Returns:
        A new dict with the filtered 'Features' list.
    """
    if excluded is None:
        excluded = set_of_excluded_words
    kept = [w for w in doc['Features'] if w not in excluded]
    return dict(doc, Features=kept)

In [37]:
# Mean number of features per document after removing the excluded words.
data.map(exclude_words).map(lambda x: len(x['Features'])).mean()

91.39934839726753

In [38]:
# Mean number of features per document in the unfiltered data, for comparison.
data.map(lambda x: len(x['Features'])).mean()

482.166663163426

In [50]:
# Feature counts of the documents left with fewer than 10 features after
# the excluded words are removed.
doc_lengths = data.map(exclude_words).map(lambda x: len(x['Features'])).filter(lambda x: x<10).collect()

In [51]:
from matplotlib import pyplot as plt

In [56]:
# doc_lengths only holds whole numbers from 0 to 9 (list lengths filtered with
# x < 10), so one bin per integer is used; 40 bins would leave most of them
# empty and make the distribution look gappy.
plt.hist(doc_lengths, bins=range(11))
plt.xlabel('Features left after word exclusion')
plt.ylabel('Number of documents')
plt.title('Documents with fewer than 10 remaining features')
plt.show()

In [53]:
# Number of documents with fewer than 10 remaining features.
len(doc_lengths)

3229

In [59]:
# Remove the excluded words, keep only documents that still have more than 9
# features, serialise each one back to JSON and write the result as text
# after repartitioning into 4 partitions.
data.map(exclude_words). \
    filter(lambda x: len(x['Features']) > 9). \
    map(lambda x: json.dumps(x)). \
    repartition(4). \
    saveAsTextFile('hdfs://master:54310/test')