In [1]:
from pyspark import RDD

In [2]:
class StoringTF(object):
    """
    Term-frequency transformer that keeps an explicit term -> index mapping.

    Call ``fit`` once to build the mapping, then ``transform`` to turn
    documents (iterables of terms) into sparse term-frequency vectors.
    """

    def fit(self, vacabulary):
        """
        Build the term -> index mapping.

        ``vacabulary`` must expose ``distinct().zipWithIndex().collectAsMap()``
        (in this notebook it is an RDD of terms). The resulting mapping is
        stored in ``self.word_idx`` and its size in ``self.numFeatures``.
        """
        indexed_terms = vacabulary.distinct().zipWithIndex()
        self.word_idx = indexed_terms.collectAsMap()
        self.numFeatures = len(self.word_idx)

    def indexOf(self, term):
        """ Look up the index assigned to ``term`` by ``fit``. """
        return self.word_idx[term]

    def transform(self, document):
        """
        Convert a document (an iterable of terms) into a sparse
        term-frequency vector of length ``self.numFeatures``.

        When ``document`` is an RDD, every element is transformed and an
        RDD of term-frequency vectors is returned instead.
        """
        if isinstance(document, RDD):
            return document.map(self.transform)

        counts = {}
        for position in map(self.indexOf, document):
            # Counts are accumulated as floats (the increment is 1.0).
            counts[position] = counts.get(position, 0) + 1.0
        return Vectors.sparse(self.numFeatures, counts.items())

In [3]:
import json
#from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
import numpy as np
from pyspark.mllib.linalg import Vectors

In [4]:
# Every input line is parsed as one JSON record.
data = (
    sc.textFile('hdfs://master:54310/exp_2/ml_data_without_urls')
    .map(json.loads)
)

# Keep only each record's 'Features' entry (presumably its list of terms).
documents = data.map(lambda record: record['Features'])
documents.cache()

PythonRDD[2] at RDD at PythonRDD.scala:43

In [5]:
# Flatten all documents into a single RDD of terms and drop duplicates.
vocabulary = documents.flatMap(lambda terms: terms).distinct()

In [6]:
# Build the term -> index mapping from the vocabulary RDD; fit() collects the
# mapping into storingTF.word_idx.
storingTF = StoringTF()
storingTF.fit(vocabulary)

In [7]:
# Term-frequency vector for every document.
tf = storingTF.transform(documents)
# tf is traversed twice below (once by IDF().fit, once by idf.transform).
# Cache it so the per-document term counting is not recomputed for the
# second pass.
tf.cache()
# Fit the IDF model on the term frequencies, then rescale them to TF-IDF.
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [8]:
# Document frequency keyed by term index: each document contributes every one
# of its distinct terms once (set(doc)), and the ones are summed per term.
wc = (
    documents
    .flatMap(lambda doc: set(doc))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (storingTF.indexOf(pair[0]), pair[1]))
    .persist()
)

In [9]:
# Number of distinct term indices that actually occur in the documents.
(
    documents
    .flatMap(lambda doc: set(doc))
    .map(lambda word: storingTF.indexOf(word))
    .distinct()
    .count()
)

98621

In [9]:
import pandas as pd

In [12]:
def exclude_words_bellow_threshold(vec, t: float):
    """
    Return the indices of ``vec`` whose scaled value is strictly below ``t``.

    The vector is first converted with ``cast_sparse_vector_to_dict_and_scale``
    (each stored value divided by the vector's maximum); the indices whose
    scaled value is ``< t`` are returned as a list.
    """
    scaled = cast_sparse_vector_to_dict_and_scale(vec)
    below = []
    for term_idx, score in scaled.items():
        if score < t:
            below.append(term_idx)
    return below



def cast_sparse_vector_to_dict_and_scale(v):
    """
    Convert a sparse vector into ``{index: value / max_value}``.

    ``v`` must expose ``indices``, ``values`` (parallel arrays of the stored
    entries) and ``size`` (total dimension), as a pyspark ``SparseVector``
    does. The maximum is taken over the whole vector, i.e. implicit zeros
    count as well, so the result matches scaling by ``v.toArray().max()``.

    The dense array is deliberately never materialised: building a
    vocabulary-sized array for every document only to find the maximum, and
    looking each stored index up again through ``v[i]``, is wasted work when
    ``indices``/``values`` already hold everything that is needed.

    Returns an empty dict for a vector without stored entries.
    """
    indices = v.indices
    values = v.values
    if len(indices) == 0:
        return {}

    max_v = values.max()
    if len(indices) < v.size:
        # Some positions are implicit zeros; they take part in the maximum
        # exactly as they would in the dense representation.
        max_v = max(max_v, 0.0)

    return {int(i): value / max_v for i, value in zip(indices, values)}

In [15]:
# Grid over two thresholds:
#   p - scaled tf-idf threshold; a term is "excluded" in a document when its
#       scaled tf-idf there is below p
#   t - minimal fraction of a term's documents in which it was excluded
# experiments_2[p][t] = number of terms whose excluded fraction exceeds t.
experiments_2 = {}
for p in [.6, .5, .4, .3, .2, .1, .05, .01, .005]:
    # Per term index: number of documents in which the term fell below p.
    excluded = (
        tfidf
        .flatMap(lambda vector: exclude_words_bellow_threshold(vector, p))
        .map(lambda term_idx: (term_idx, 1))
        .reduceByKey(lambda left, right: left + right)
        .persist()
    )
    # Per term index: excluded-document count divided by document frequency.
    stat = (
        wc.join(excluded)
        .map(lambda row: (row[0], row[1][1] / row[1][0]))
        .persist()
    )
    experiment = {}
    for t in [.99, .9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda row: row[1] > t).count()
        experiment[t] = excluded_count
        print("p: {0}, t: {1}, count: {2}".format(p, t, excluded_count))
    # Release the cached RDDs before moving on to the next p.
    stat.unpersist()
    excluded.unpersist()
    experiments_2[p] = experiment

p: 0.6, t: 0.99, count: 50161
p: 0.6, t: 0.9, count: 71668
p: 0.6, t: 0.8, count: 82575
p: 0.6, t: 0.7, count: 89464
p: 0.6, t: 0.6, count: 93845
p: 0.6, t: 0.5, count: 94690
p: 0.6, t: 0.4, count: 100599
p: 0.6, t: 0.3, count: 101474
p: 0.6, t: 0.2, count: 101624
p: 0.6, t: 0.1, count: 101678
p: 0.5, t: 0.99, count: 39460
p: 0.5, t: 0.9, count: 59554
p: 0.5, t: 0.8, count: 73845
p: 0.5, t: 0.7, count: 83284
p: 0.5, t: 0.6, count: 89437
p: 0.5, t: 0.5, count: 90936
p: 0.5, t: 0.4, count: 99095
p: 0.5, t: 0.3, count: 100599
p: 0.5, t: 0.2, count: 100899
p: 0.5, t: 0.1, count: 101012
p: 0.4, t: 0.99, count: 29133
p: 0.4, t: 0.9, count: 43330
p: 0.4, t: 0.8, count: 60551
p: 0.4, t: 0.7, count: 73789
p: 0.4, t: 0.6, count: 82553
p: 0.4, t: 0.5, count: 85349
p: 0.4, t: 0.4, count: 96286
p: 0.4, t: 0.3, count: 98936
p: 0.4, t: 0.2, count: 99566
p: 0.4, t: 0.1, count: 99833
p: 0.3, t: 0.99, count: 18858
p: 0.3, t: 0.9, count: 24452
p: 0.3, t: 0.8, count: 39697
p: 0.3, t: 0.7, count: 56683
p: 

In [16]:
# Tabulate the grid: columns are the p thresholds (outer keys of
# experiments_2), rows are the t thresholds (inner keys).
pd.DataFrame(experiments_2)

Unnamed: 0,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6
0.1,106,487,33481,70053,91132,97380,99833,101012,101678
0.2,59,259,13767,58223,88819,96654,99566,100899,101624
0.3,43,190,7401,43377,84693,95214,98936,100599,101474
0.4,24,109,3871,25534,74585,90425,96286,99095,100599
0.5,0,1,660,8875,53002,75165,85349,90936,94690
0.6,0,1,489,5482,42233,69695,82553,89437,93845
0.7,0,0,257,2760,26373,56683,73789,83284,89464
0.8,0,0,233,1798,14369,39697,60551,73845,82575
0.9,0,0,232,1602,9862,24452,43330,59554,71668
0.99,0,0,232,1594,8929,18858,29133,39460,50161


In [17]:
# Reverse lookup (term index -> term) built from the fitted mapping.
idx_word = {idx: w for w, idx in storingTF.word_idx.items()}
len(idx_word)

102514

In [13]:
# Per term index: number of documents in which the term's scaled tf-idf is
# below 0.3.
# NOTE: excluded and stat are each consumed exactly once in this cell, so they
# are intentionally not persisted -- caching them would only occupy executor
# memory that is never released.
excluded = (
    tfidf
    .flatMap(lambda vec: exclude_words_bellow_threshold(vec, 0.3))
    .map(lambda i: (i, 1))
    .reduceByKey(lambda a, b: a + b)
)
# Per term index: excluded-document count divided by document frequency.
stat = wc.join(excluded).map(lambda s: (s[0], s[1][1] / s[1][0]))
# Terms that fell below the threshold in more than 90% of their documents,
# mapped back from index to the term itself.
excluded_words = (
    stat
    .filter(lambda s: s[1] > 0.9)
    .map(lambda x: idx_word[x[0]])
    .collect()
)

In [14]:
# Number of terms selected for exclusion.
len(excluded_words)

42576

In [15]:
# Peek at the first few excluded terms as a sanity check.
excluded_words[:5]

['безделушк', 'тэгирован', 'байтаерт', 'efnet', 'sslversion']

In [16]:
# A set makes the `w not in set_of_excluded_words` test in exclude_words a
# hash lookup instead of a list scan.
set_of_excluded_words = set(excluded_words)

In [17]:
# Compare with len(excluded_words): equal sizes mean the list had no duplicates.
len(set_of_excluded_words)

42576

In [18]:
def exclude_words(doc: dict, excluded=None):
    """
    Return a copy of ``doc`` whose 'Features' list has the excluded words removed.

    The input record is left untouched: a shallow copy is made and only its
    'Features' entry is replaced, so applying this function never alters the
    record that was passed in.

    Parameters:
        doc: record with a 'Features' entry holding an iterable of words.
        excluded: container of words to drop; defaults to the module-level
            ``set_of_excluded_words``.

    Returns:
        A new dict with the same keys as ``doc`` and the filtered 'Features'.

    Raises:
        KeyError: if ``doc`` has no 'Features' entry.
    """
    if excluded is None:
        excluded = set_of_excluded_words
    cleaned = dict(doc)
    cleaned['Features'] = [w for w in doc['Features'] if w not in excluded]
    return cleaned

In [19]:
# Mean number of terms per document after removing the excluded words.
(
    data
    .map(exclude_words)
    .map(lambda doc: len(doc['Features']))
    .mean()
)

97.41282186022069

In [20]:
# Mean number of terms per document in the unfiltered data, for comparison.
(
    data
    .map(lambda doc: len(doc['Features']))
    .mean()
)

482.166663163426

In [21]:
# Lengths of the documents left with fewer than 10 terms after exclusion.
doc_lengths = (
    data
    .map(exclude_words)
    .map(lambda doc: len(doc['Features']))
    .filter(lambda n_terms: n_terms < 10)
    .collect()
)

In [22]:
from matplotlib import pyplot as plt

In [23]:
# doc_lengths only contains integer lengths 0..9 (it was filtered with x < 10),
# so use one bin per integer value; 40 equal-width bins would leave most bins
# empty and make the bars hard to attribute to a length.
fig, ax = plt.subplots()
ax.hist(doc_lengths, bins=range(11))
ax.set(
    title="Documents with fewer than 10 terms after exclusion",
    xlabel="Number of remaining terms",
    ylabel="Number of documents",
)
plt.show()

In [53]:
# Number of documents left with fewer than 10 terms after exclusion.
len(doc_lengths)

3229

In [22]:
# Drop the excluded words, keep only documents that still have more than 20
# terms, and write them back to HDFS as JSON lines in 4 partitions.
(
    data
    .map(exclude_words)
    .filter(lambda doc: len(doc['Features']) > 20)
    .map(json.dumps)
    .repartition(4)
    .saveAsTextFile('hdfs://master:54310/test3')
)