In [18]:
from pyspark import RDD

In [19]:
class StoringTF(object):
    """
    Term-frequency transformer that keeps an explicit term -> index
    dictionary instead of hashing terms.
    """

    def fit(self, vacabulary):
        """
        Build the term -> index lookup.

        The input is de-duplicated, each term is paired with its
        position via ``zipWithIndex`` and the result is pulled back as a
        plain dict stored on ``self.word_idx``. ``self.numFeatures`` is
        the number of entries in that dict.
        """
        indexed_terms = vacabulary.distinct().zipWithIndex()
        self.word_idx = indexed_terms.collectAsMap()
        self.numFeatures = len(self.word_idx)

    def indexOf(self, term):
        """ Look up the index assigned to ``term`` (KeyError if unknown). """
        return self.word_idx[term]

    def transform(self, document):
        """
        Turn one document (an iterable of terms) into a sparse
        term-frequency vector. When given an RDD of documents, the same
        conversion is mapped over every element and an RDD is returned.
        """
        # RDD input: apply the single-document path to each element.
        if isinstance(document, RDD):
            return document.map(self.transform)

        counts = {}
        for term in document:
            column = self.indexOf(term)
            if column in counts:
                counts[column] += 1.0
            else:
                counts[column] = 1.0
        return Vectors.sparse(self.numFeatures, counts.items())

In [7]:
import json
#from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
import numpy as np
from pyspark.mllib.linalg import Vectors

In [8]:
# Each input line is parsed as JSON; only the 'Text' and 'Hubs' fields are
# kept, renamed to 'Features' and 'Labels'.
data = (
    sc.textFile('hdfs://master:54310/clean')
    .map(json.loads)
    .map(lambda record: {'Features': record['Text'], 'Labels': record['Hubs']})
)

# `documents` is reused by several later jobs, hence the cache.
documents = data.map(lambda row: row['Features'])
documents.cache()

PythonRDD[14] at RDD at PythonRDD.scala:43

In [9]:
# Flatten every document into its elements and keep each one once.
vocabulary = (
    documents
    .flatMap(lambda terms: terms)
    .distinct()
)

In [20]:
# Build the term -> index lookup from the corpus vocabulary; after fit(),
# storingTF.word_idx and storingTF.numFeatures are populated.
storingTF = StoringTF()
storingTF.fit(vocabulary)

In [21]:
# Term-frequency vectors, one per document.
tf = storingTF.transform(documents)
# `tf` is consumed twice below (once by IDF.fit, once by the model's
# transform); cache it so the Python-side TF mapping is not recomputed for
# each pass.
tf.cache()
idf = IDF().fit(tf)
# TF vectors rescaled by the fitted IDF model.
tfidf = idf.transform(tf)

In [45]:
# Document frequency per term index: set(...) makes a term count at most
# once per document, then the per-term ones are summed and the term is
# replaced by its index from storingTF.
wc = (
    documents
    .flatMap(lambda terms: set(terms))
    .map(lambda term: (term, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda term_count: (storingTF.indexOf(term_count[0]), term_count[1]))
    .persist()
)

In [16]:
# Number of distinct term indices that actually occur in the documents.
(
    documents
    .flatMap(lambda terms: set(terms))
    .map(lambda term: storingTF.indexOf(term))
    .distinct()
    .count()
)

552218

In [29]:
def cast_sparse_vector_to_dict_and_scale(v):
    """
    Convert a sparse vector into ``{index: value / max_value}``.

    Only the explicitly stored entries of ``v`` appear in the result.
    ``max_value`` is the maximum over the whole vector, so implicit
    zeros take part in it whenever the vector has fewer stored entries
    than its size.

    The vector is never densified: the maximum is taken over the stored
    values only and corrected for implicit zeros, which avoids
    allocating a vocabulary-sized array per document.

    :param v: object exposing ``size``, ``indices`` and ``values``
              (e.g. a pyspark ``SparseVector``).
    :return: dict mapping ``int`` index to scaled value. An empty dict
             is returned when nothing is stored. When the maximum is
             zero the stored values are returned unscaled instead of
             dividing by zero (which would yield NaN/inf).
    """
    indices, values = v.indices, v.values
    if len(values) == 0:
        return {}

    max_v = values.max()
    # Implicit zeros exist and beat an all-negative set of stored values.
    if len(values) < v.size and max_v < 0:
        max_v = 0.0

    if max_v == 0:
        # Nothing sensible to scale by; keep the raw values.
        return {int(i): float(x) for i, x in zip(indices, values)}

    return {int(i): x / max_v for i, x in zip(indices, values)}

import operator
def order_dict_by_values(d: dict, asc = True):
    """
    Return the ``(key, value)`` pairs of ``d`` as a list ordered by value.

    :param d: dictionary to order.
    :param asc: ascending order when truthy, descending otherwise.
    :return: list of ``(key, value)`` tuples.
    """
    pairs = list(d.items())
    pairs.sort(key=lambda pair: pair[1], reverse=not asc)
    return pairs

def filter_values_under_percent(vec, percent: int):
    """
    Return the indices of the lowest-valued ``percent`` percent of the
    entries of ``vec``.

    The vector is first converted to a scaled ``{index: value}`` dict,
    the entries are ordered by ascending value, and the leading
    ``int(n * percent / 100)`` indices are returned (``n`` being the
    number of entries).
    """
    scaled = cast_sparse_vector_to_dict_and_scale(vec)
    ascending = order_dict_by_values(scaled)
    cutoff = int(len(ascending) * percent / 100)
    return [idx for idx, _ in ascending[:cutoff]]

In [47]:
# Sweep over p (share of each document's lowest-valued entries) and t (ratio
# threshold). experiments[p][t] is the number of term indices whose
# "excluded count / document frequency" ratio exceeds t.
experiments = {}
for p in [10, 20, 30, 40, 50, 60]:
    # Per term index: in how many vectors the term falls in the lowest p%.
    excluded = (
        tfidf
        .flatMap(lambda vector: filter_values_under_percent(vector, p))
        .map(lambda idx: (idx, 1))
        .reduceByKey(lambda left, right: left + right)
        .persist()
    )
    # (term index, excluded count / document frequency from `wc`).
    stat = (
        wc.join(excluded)
        .map(lambda joined: (joined[0], joined[1][1] / joined[1][0]))
        .persist()
    )
    experiment = {}
    for t in [.9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda pair: pair[1] > t).count()
        experiment[t] = excluded_count
        print("p: {0}, t: {1}, count: {2}".format(p, t, excluded_count))
    experiments[p] = experiment
    # Release the cached intermediates before the next value of p.
    stat.unpersist()
    excluded.unpersist()

p: 10, t: 0.9, count: 2
p: 10, t: 0.8, count: 2
p: 10, t: 0.7, count: 2
p: 10, t: 0.6, count: 8
p: 10, t: 0.5, count: 47
p: 10, t: 0.4, count: 94
p: 10, t: 0.3, count: 130
p: 10, t: 0.2, count: 179
p: 10, t: 0.1, count: 256
p: 20, t: 0.9, count: 2
p: 20, t: 0.8, count: 2
p: 20, t: 0.7, count: 17
p: 20, t: 0.6, count: 119
p: 20, t: 0.5, count: 229
p: 20, t: 0.4, count: 321
p: 20, t: 0.3, count: 410
p: 20, t: 0.2, count: 508
p: 20, t: 0.1, count: 681
p: 30, t: 0.9, count: 69
p: 30, t: 0.8, count: 77
p: 30, t: 0.7, count: 213
p: 30, t: 0.6, count: 423
p: 30, t: 0.5, count: 570
p: 30, t: 0.4, count: 792
p: 30, t: 0.3, count: 964
p: 30, t: 0.2, count: 1181
p: 30, t: 0.1, count: 1576
p: 40, t: 0.9, count: 433
p: 40, t: 0.8, count: 510
p: 40, t: 0.7, count: 859
p: 40, t: 0.6, count: 1121
p: 40, t: 0.5, count: 1349
p: 40, t: 0.4, count: 1714
p: 40, t: 0.3, count: 2051
p: 40, t: 0.2, count: 2439
p: 40, t: 0.1, count: 3257
p: 50, t: 0.9, count: 1064
p: 50, t: 0.8, count: 1392
p: 50, t: 0.7, coun

In [48]:
import pandas as pd

In [49]:
# Outer keys of `experiments` (p) become the columns; inner keys (t) become
# the row index.
df = pd.DataFrame(experiments)

In [50]:
# Display the count table (rows: t, columns: p).
df

Unnamed: 0,10,20,30,40,50,60
0.1,256,681,1576,3257,7146,16734
0.2,179,508,1181,2439,5047,11601
0.3,130,410,964,2051,4177,9618
0.4,94,321,792,1714,3458,7769
0.5,47,229,570,1349,2688,5932
0.6,8,119,423,1121,2322,5246
0.7,2,17,213,859,1908,4579
0.8,2,2,77,510,1392,3863
0.9,2,2,69,433,1064,3074


In [72]:
def exclude_words_bellow_threshold(vec, t: float):
    """
    Return the indices of ``vec`` whose scaled value is strictly below ``t``.

    The vector is converted to a scaled ``{index: value}`` dict first;
    indices are returned in that dict's iteration order.
    """
    scaled = cast_sparse_vector_to_dict_and_scale(vec)
    below = []
    for idx, weight in scaled.items():
        if weight < t:
            below.append(idx)
    return below
# Sweep over p (scaled-value cutoff) and t (ratio threshold).
# experiments_2[p][t] is the number of term indices whose
# "excluded count / document frequency" ratio exceeds t.
# No `del experiments_2` here: the name is rebound right below, and deleting
# it would raise NameError on a fresh kernel where it does not exist yet.
experiments_2 = {}
for p in [.5, .4, .3, .2, .1, .05, .01, .005]:
    # Per term index: in how many vectors the term's scaled value is below p.
    excluded = tfidf.flatMap(lambda vec: exclude_words_bellow_threshold(vec, p)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b).persist()
    # (term index, excluded count / document frequency from `wc`).
    stat = wc.join(excluded).map(lambda s: (s[0], s[1][1]/s[1][0])).persist()
    experiment = {}
    for t in [.9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda s: s[1] > t).count()
        experiment[t] = excluded_count
        print("p: {0}, t: {1}, count: {2}".format(p, t, excluded_count))
    # Release the cached intermediates before the next value of p.
    stat.unpersist()
    excluded.unpersist()
    experiments_2[p] = experiment

p: 0.5, t: 0.9, count: 443959
p: 0.5, t: 0.8, count: 463783
p: 0.5, t: 0.7, count: 478124
p: 0.5, t: 0.6, count: 488174
p: 0.5, t: 0.5, count: 490280
p: 0.5, t: 0.4, count: 505355
p: 0.5, t: 0.3, count: 507678
p: 0.5, t: 0.2, count: 508097
p: 0.5, t: 0.1, count: 508240
p: 0.4, t: 0.9, count: 403093
p: 0.4, t: 0.8, count: 428180
p: 0.4, t: 0.7, count: 447929
p: 0.4, t: 0.6, count: 462163
p: 0.4, t: 0.5, count: 465877
p: 0.4, t: 0.4, count: 486383
p: 0.4, t: 0.3, count: 490332
p: 0.4, t: 0.2, count: 491196
p: 0.4, t: 0.1, count: 491547
p: 0.3, t: 0.9, count: 341086
p: 0.3, t: 0.8, count: 368175
p: 0.3, t: 0.7, count: 394978
p: 0.3, t: 0.6, count: 415201
p: 0.3, t: 0.5, count: 421944
p: 0.3, t: 0.4, count: 450680
p: 0.3, t: 0.3, count: 457858
p: 0.3, t: 0.2, count: 459721
p: 0.3, t: 0.1, count: 460593
p: 0.2, t: 0.9, count: 255010
p: 0.2, t: 0.8, count: 271538
p: 0.2, t: 0.7, count: 299229
p: 0.2, t: 0.6, count: 327515
p: 0.2, t: 0.5, count: 341186
p: 0.2, t: 0.4, count: 382389
p: 0.2, t:

In [73]:
# Display the count table for the second sweep (rows: t, columns: p).
pd.DataFrame(experiments_2)

Unnamed: 0,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5
0.1,1723,6301,143659,279280,404315,460593,491547,508240
0.2,1351,4805,110408,266222,401785,459721,491196,508097
0.3,1285,4417,88106,247902,396859,457858,490332,507678
0.4,1176,3840,70950,215868,382389,450680,486383,505355
0.5,960,2950,53104,166827,341186,421944,465877,490280
0.6,960,2949,51725,153253,327515,415201,462163,488174
0.7,959,2942,49803,139341,299229,394978,447929,478124
0.8,959,2942,49419,132941,271538,368175,428180,463783
0.9,959,2942,49391,131474,255010,341086,403093,443959
