In [16]:
from pyspark import RDD
import json
#from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
import numpy as np
from pyspark.mllib.linalg import Vectors

In [17]:
class StoringTF(object):
    def fit(self, vacabulary):
        self.word_idx = vacabulary.distinct().zipWithIndex().collectAsMap()
        self.numFeatures = len(self.word_idx)
        
    def indexOf(self, term):
        """ Returns the index of the input term. """
        return self.word_idx[term]

    def transform(self, document):
        """
        Transforms the input document (list of terms) to term frequency
        vectors, or transform the RDD of document to RDD of term
        frequency vectors.
        """
        if isinstance(document, RDD):
            return document.map(self.transform)

        freq = {}
        for term in document:
            i = self.indexOf(term)
            freq[i] = freq.get(i, 0) + 1.0
        return Vectors.sparse(self.numFeatures, freq.items())

In [41]:
data = sc.textFile('hdfs://master:54310/exp_2/ml_data_without_urls'). \
    map(lambda line: json.loads(line))
    
documents = data.map(lambda x: x['Features'])
documents.cache()

PythonRDD[145] at RDD at PythonRDD.scala:43

In [42]:
vocabulary = documents.flatMap(lambda d: d).distinct()

storingTF = StoringTF()
storingTF.fit(vocabulary)

tf = storingTF.transform(documents)
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [44]:
tfidf.cache()

MapPartitionsRDD[165] at mapPartitions at PythonMLLibAPI.scala:1480

In [43]:
wc = documents.flatMap(lambda d: set(d)). \
    map(lambda w: (w,1)). \
    reduceByKey(lambda a,b: a+b). \
    map(lambda wc: (storingTF.indexOf(wc[0]), wc[1])). \
    persist()

In [27]:
documents.flatMap(lambda d: set(d)).map(lambda w: storingTF.indexOf(w)).distinct().count()

102514

In [45]:
def cast_sparse_vector_to_dict_and_scale(v):
    max_v = v.values.max()
    return dict([(int(i), v[int(i)] / max_v) for i in v.indices])

import operator
def order_dict_by_values(d: dict, asc = True):
    return sorted(d.items(), key = operator.itemgetter(1), reverse= not asc)

def filter_values_under_percent(vec, percent: int):
    tfidfs = cast_sparse_vector_to_dict_and_scale(vec)
    ordered_tfidfs = order_dict_by_values(tfidfs)
    indexes = list(map(lambda k: k[0], ordered_tfidfs))
    index = int(len(indexes) * percent / 100)
    return indexes[:index]

In [46]:
import pandas as pd

In [47]:
def exclude_words_bellow_threshold(vec, t: float):
    tfidfs = cast_sparse_vector_to_dict_and_scale(vec)
    return [i_tfidf[0] for i_tfidf in tfidfs.items() if i_tfidf[1] < t]

In [48]:
experiments_2 = {}
for p in [.9, .8, .7, .6, .5, .4, .3, .2, .1, .05, .01, .005]:
    excluded = tfidf_1st.flatMap(lambda vec: exclude_words_bellow_threshold(vec, p)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b)
    excluded.cache()
    stat = wc.join(excluded).map(lambda s: (s[0], s[1][1]/s[1][0]))
    stat.cache()
    experiment = {}
    for t in [.99, .95, .9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda s: s[1] > t).count()
        experiment[t] = excluded_count
        print("p: {0}, t: {1}, count: {2}".format(p, t, excluded_count))
    stat.unpersist()
    excluded.unpersist()
    experiments_2[p] = experiment    
pd.DataFrame(experiments_2)

p: 0.9, t: 0.99, count: 54478
p: 0.9, t: 0.95, count: 54769
p: 0.9, t: 0.9, count: 55131
p: 0.9, t: 0.8, count: 56137
p: 0.9, t: 0.7, count: 58351
p: 0.9, t: 0.6, count: 62242
p: 0.9, t: 0.5, count: 63994
p: 0.9, t: 0.4, count: 68881
p: 0.9, t: 0.3, count: 73661
p: 0.9, t: 0.2, count: 78436
p: 0.9, t: 0.1, count: 85499
p: 0.8, t: 0.99, count: 53935
p: 0.8, t: 0.95, count: 54273
p: 0.8, t: 0.9, count: 54652
p: 0.8, t: 0.8, count: 55712
p: 0.8, t: 0.7, count: 57933
p: 0.8, t: 0.6, count: 61793
p: 0.8, t: 0.5, count: 63527
p: 0.8, t: 0.4, count: 68556
p: 0.8, t: 0.3, count: 73398
p: 0.8, t: 0.2, count: 78164
p: 0.8, t: 0.1, count: 85265
p: 0.7, t: 0.99, count: 53206
p: 0.7, t: 0.95, count: 53548
p: 0.7, t: 0.9, count: 53998
p: 0.7, t: 0.8, count: 55077
p: 0.7, t: 0.7, count: 57278
p: 0.7, t: 0.6, count: 61145
p: 0.7, t: 0.5, count: 62887
p: 0.7, t: 0.4, count: 68049
p: 0.7, t: 0.3, count: 72948
p: 0.7, t: 0.2, count: 77744
p: 0.7, t: 0.1, count: 84898
p: 0.6, t: 0.99, count: 52195
p: 0.6,

Unnamed: 0,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0.1,3445,7983,36585,56258,72761,78965,81777,83335,84365,84898,85265,85499
0.2,2872,6812,31489,49362,65021,71241,74223,75947,77072,77744,78164,78436
0.3,2572,6224,28662,45095,60306,66412,69390,71085,72246,72948,73398,73661
0.4,2158,5383,25343,40154,54843,61099,64228,66076,67316,68049,68556,68881
0.5,1522,3997,20322,32824,46992,53983,57887,60260,61914,62887,63527,63994
0.6,1465,3896,19784,31834,45476,52394,56179,58550,60197,61145,61793,62242
0.7,1358,3644,18613,29952,42419,48900,52555,54806,56361,57278,57933,58351
0.8,1310,3518,17941,28872,40673,46772,50397,52584,54137,55077,55712,56137
0.9,1290,3451,17635,28377,39919,45665,49174,51440,53024,53998,54652,55131
0.95,1283,3437,17562,28247,39723,45366,48747,50940,52531,53548,54273,54769


In [50]:
wc.count()

102514

In [51]:
idx_word = dict([(idx,w) for w,idx in storingTF.word_idx.items()])
len(idx_word)

102514

In [61]:
excluded = tfidf.flatMap(lambda vec: exclude_words_bellow_threshold(vec, .3)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b)
stat = wc.join(excluded).map(lambda s: (s[0], s[1][1] == [1][0]))
excluded_words = stat.filter(lambda s: s[1] > 0.6).map(lambda x: idx_word[x[0]]).collect()

In [62]:
len(excluded_words)

14313

In [54]:
set_of_excluded_words = set(excluded_words)

In [55]:
len(set_of_excluded_words)

18915

In [56]:
def exclude_words(doc: dict, excluded):
    doc['Features'] = [w for w in doc['Features'] if w not in excluded]
    return doc

In [57]:
data.map(lambda x: exclude_words(x, set_of_excluded_words)). \
    map(lambda x: len(x['Features'])).mean()

339.97511553812757

In [58]:
data.map(lambda x: len(x['Features'])).mean()

340.6226629787831

In [115]:
stage.map(json.dumps). \
    repartition(6). \
    saveAsTextFile('hdfs://master:54310/excluded_fin')