In [1]:
from pyspark import RDD
import json
#from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
import numpy as np
from pyspark.mllib.linalg import Vectors

In [2]:
class StoringTF(object):
    def fit(self, vacabulary):
        self.word_idx = vacabulary.distinct().zipWithIndex().collectAsMap()
        self.numFeatures = len(self.word_idx)
        
    def indexOf(self, term):
        """ Returns the index of the input term. """
        return self.word_idx[term]

    def transform(self, document):
        """
        Transforms the input document (list of terms) to term frequency
        vectors, or transform the RDD of document to RDD of term
        frequency vectors.
        """
        if isinstance(document, RDD):
            return document.map(self.transform)

        freq = {}
        for term in document:
            i = self.indexOf(term)
            freq[i] = freq.get(i, 0) + 1.0
        return Vectors.sparse(self.numFeatures, freq.items())

In [None]:
data = sc.textFile('hdfs://master:54310/ml_data_temp'). \
    map(lambda line: json.loads(line))

In [59]:
documents_1st = data.map(lambda x: x['Features'])
documents_1st.cache()
vocabulary_1st = documents_1st.flatMap(lambda d: d).distinct()
storingTF_1st = StoringTF()
storingTF_1st.fit(vocabulary_1st)
tf_1st = storingTF_1st.transform(documents_1st)
idf_1st = IDF().fit(tf_1st)
tfidf_1st = idf_1st.transform(tf_1st)
tf_1st.cache()
tf_1st.count()
tfidf_1st.cache()
tfidf_1st.count()

112807

In [60]:
wc_1st = documents_1st.flatMap(lambda d: set(d)). \
    map(lambda w: (w,1)). \
    reduceByKey(lambda a,b: a+b). \
    map(lambda wc: (storingTF_1st.indexOf(wc[0]), wc[1])). \
    cache()

In [12]:
documents_1st.flatMap(lambda d: set(d)).map(lambda w: storingTF_1st.indexOf(w)).distinct().count()

292094

In [10]:
def cast_sparse_vector_to_dict_and_scale(v):
    max_v = v.values.max()
    return dict([(int(i), v[int(i)] / max_v) for i in v.indices])

import operator
def order_dict_by_values(d: dict, asc = True):
    return sorted(d.items(), key = operator.itemgetter(1), reverse= not asc)

def filter_values_under_percent(vec, percent: int):
    tfidfs = cast_sparse_vector_to_dict_and_scale(vec)
    ordered_tfidfs = order_dict_by_values(tfidfs)
    indexes = list(map(lambda k: k[0], ordered_tfidfs))
    index = int(len(indexes) * percent / 100)
    return indexes[:index]

In [11]:
import pandas as pd

In [13]:
def exclude_words_bellow_threshold(vec, t: float):
    tfidfs = cast_sparse_vector_to_dict_and_scale(vec)
    return [i_tfidf[0] for i_tfidf in tfidfs.items() if i_tfidf[1] < t]

In [None]:
experiments_2 = {}
for p in [.9, .8, .7, .6, .5, .4, .3, .2, .1, .05, .01, .005]:
    excluded = tfidf_1st.flatMap(lambda vec: exclude_words_bellow_threshold(vec, p)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b)
    excluded.cache()
    stat = wc.join(excluded).map(lambda s: (s[0], s[1][1]/s[1][0]))
    stat.cache()
    experiment = {}
    for t in [.99, .95, .9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda s: s[1] > t).count()
        experiment[t] = excluded_count
        print("p: {0}, t: {1}, count: {2}".format(p, t, excluded_count))
    stat.unpersist()
    excluded.unpersist()
    experiments_2[p] = experiment    
pd.DataFrame(experiments_2)

In [27]:
experiments_3 = {}
for p in [.9, .8, .7, .6, .5, .4, .3, .2, .1, .05, .01, .005]:
    excluded = tfidf_1st.flatMap(lambda vec: exclude_words_bellow_threshold(vec, p)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b)
    excluded.cache()
    stat = wc.join(excluded).map(lambda s: (s[0], s[1][1] == [1][0]))
    stat.cache()
    excluded_count = stat.filter(lambda s: s[1]).count()
    experiments_3[p] = excluded_count
    print("p: {0}, count: {1}".format(p,excluded_count))
    stat.unpersist()
    excluded.unpersist()   

p: 0.9, count: 89379


In [64]:
idx_word_1st = dict([(idx,w) for w,idx in storingTF_1st.word_idx.items()])
len(idx_word_1st)

292094

In [65]:
excluded_1st = tfidf_1st.flatMap(lambda vec: exclude_words_bellow_threshold(vec, .9)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b)
stat_1st = wc_1st.join(excluded_1st).map(lambda s: (s[0], s[1][1] == [1][0]))
excluded_words_1st = stat_1st.filter(lambda s: s[1]).map(lambda x: idx_word_1st[x[0]]).collect()

In [66]:
len(excluded_words_1st)

89379

In [67]:
set_of_excluded_words_1st = set(excluded_words_1st)

In [68]:
len(set_of_excluded_words_1st)

89379

In [69]:
def exclude_words(doc: dict, excluded):
    doc['Features'] = [w for w in doc['Features'] if w not in excluded]
    return doc

In [36]:
data.map(lambda x: exclude_words(x, set_of_excluded_words_1st)). \
    map(lambda x: len(x['Features'])).mean()

411.90720434015554

In [29]:
data.map(lambda x: len(x['Features'])).mean()

415.06630794188277

In [70]:
stage_1st = data.map(lambda x: exclude_words(x, set_of_excluded_words_1st)).filter(lambda x: x['Features'])

In [95]:
stage_1st.map(json.dumps). \
    repartition(6). \
    saveAsTextFile('hdfs://master:54310/excluding_temp')

In [71]:
stage_1st.count()

112806

In [72]:
data.count()

112807

In [96]:
stage_1st = sc.textFile('hdfs://master:54310/excluding_temp'). \
    map(json.loads)

In [97]:
documents_2nd = stage_1st.map(lambda x: x['Features'])
documents_2nd.cache()
vocabulary_2nd = documents_2nd.flatMap(lambda d: d).distinct()
storingTF_2nd = StoringTF()
storingTF_2nd.fit(vocabulary_2nd)
tf_2nd = storingTF_2nd.transform(documents_2nd)
idf_2nd = IDF().fit(tf_2nd)
tfidf_2nd = idf_2nd.transform(tf_2nd)
tf_2nd.cache()
tf_2nd.count()
tfidf_2nd.cache()
tfidf_2nd.count()

112806

In [51]:
experiments_4 = {}
for p in [.9, .8, .7, .6, .5, .4, .3, .2, .1, .05, .01, .005]:
    excluded = tfidf_2nd.flatMap(lambda vec: exclude_words_bellow_threshold(vec, p)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b)
    excluded.cache()
    stat = wc.join(excluded).map(lambda s: (s[0], s[1][1]/s[1][0]))
    stat.cache()
    experiment = {}
    for t in [.9999, .99, .95, .9, .8, .7, .6, .5, .4, .3, .2, .1]:
        excluded_count = stat.filter(lambda s: s[1] > t).count()
        experiment[t] = excluded_count
        print("p: {0}, t: {1}, count: {2}".format(p, t, excluded_count))
    stat.unpersist()
    excluded.unpersist()
    experiments_4[p] = experiment    
pd.DataFrame(experiments_4)

p: 0.9, t: 0.9999, count: 136478
p: 0.9, t: 0.99, count: 136479
p: 0.9, t: 0.95, count: 136533
p: 0.9, t: 0.9, count: 136742
p: 0.9, t: 0.8, count: 137759
p: 0.9, t: 0.7, count: 140693
p: 0.9, t: 0.6, count: 147713
p: 0.9, t: 0.5, count: 149952
p: 0.9, t: 0.4, count: 156328
p: 0.9, t: 0.3, count: 162972
p: 0.9, t: 0.2, count: 169179
p: 0.9, t: 0.1, count: 177442
p: 0.8, t: 0.9999, count: 135982
p: 0.8, t: 0.99, count: 135987
p: 0.8, t: 0.95, count: 136049
p: 0.8, t: 0.9, count: 136249
p: 0.8, t: 0.8, count: 137245
p: 0.8, t: 0.7, count: 140167
p: 0.8, t: 0.6, count: 147170
p: 0.8, t: 0.5, count: 149388
p: 0.8, t: 0.4, count: 155947
p: 0.8, t: 0.3, count: 162652
p: 0.8, t: 0.2, count: 168908
p: 0.8, t: 0.1, count: 177283
p: 0.7, t: 0.9999, count: 135249
p: 0.7, t: 0.99, count: 135251
p: 0.7, t: 0.95, count: 135320
p: 0.7, t: 0.9, count: 135510
p: 0.7, t: 0.8, count: 136523
p: 0.7, t: 0.7, count: 139396
p: 0.7, t: 0.6, count: 146317
p: 0.7, t: 0.5, count: 148523
p: 0.7, t: 0.4, count: 15

Unnamed: 0,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0.1,10608,21140,89285,132781,161489,170089,173697,175505,176514,177031,177283,177442
0.2,9672,19330,82205,123069,151479,160507,164599,166691,167863,168501,168908,169179
0.3,9174,18349,78108,117305,144939,154014,158130,160322,161570,162189,162652,162972
0.4,8427,16786,72412,109543,136707,146190,150681,153210,154598,155397,155947,156328
0.5,6976,13955,61961,96185,124799,136032,141809,145231,147259,148523,149388,149952
0.6,6894,13804,61287,95031,123102,134085,139772,143099,145091,146317,147170,147713
0.7,6682,13432,59339,91564,118038,128201,133450,136469,138302,139396,140167,140693
0.8,6582,13245,58400,90027,115804,125689,130749,133685,135471,136523,137245,137759
0.9,6543,13177,58050,89499,115021,124779,129799,132706,134465,135510,136249,136742
0.95,6537,13165,57970,89362,114863,124604,129614,132516,134271,135320,136049,136533


In [98]:
vocabulary_2nd.count()

202715

In [52]:
pd.DataFrame(experiments_4)

Unnamed: 0,0.005,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0.1,10608,21140,89285,132781,161489,170089,173697,175505,176514,177031,177283,177442
0.2,9672,19330,82205,123069,151479,160507,164599,166691,167863,168501,168908,169179
0.3,9174,18349,78108,117305,144939,154014,158130,160322,161570,162189,162652,162972
0.4,8427,16786,72412,109543,136707,146190,150681,153210,154598,155397,155947,156328
0.5,6976,13955,61961,96185,124799,136032,141809,145231,147259,148523,149388,149952
0.6,6894,13804,61287,95031,123102,134085,139772,143099,145091,146317,147170,147713
0.7,6682,13432,59339,91564,118038,128201,133450,136469,138302,139396,140167,140693
0.8,6582,13245,58400,90027,115804,125689,130749,133685,135471,136523,137245,137759
0.9,6543,13177,58050,89499,115021,124779,129799,132706,134465,135510,136249,136742
0.95,6537,13165,57970,89362,114863,124604,129614,132516,134271,135320,136049,136533


In [99]:
idx_word_2nd = dict([(idx,w) for w,idx in storingTF_2nd.word_idx.items()])
len(idx_word_2nd)

202715

In [100]:
wc_2nd = documents_2nd.flatMap(lambda d: set(d)). \
    map(lambda w: (w,1)). \
    reduceByKey(lambda a,b: a+b). \
    map(lambda wc: (storingTF_2nd.indexOf(wc[0]), wc[1])). \
    cache()

In [101]:
excluded_2nd = tfidf_2nd.flatMap(lambda vec: exclude_words_bellow_threshold(vec, .5)). \
        map(lambda i: (i, 1)). \
        reduceByKey(lambda a,b: a+b)
stat_2nd = wc_2nd.join(excluded_2nd).map(lambda s: (s[0], s[1][1] / [1][0]))
excluded_words_2nd = stat_2nd.filter(lambda s: s[1] > 0.9999).map(lambda x: idx_word[x[0]]).collect()

In [104]:
excluded_words_2nd_set = set(excluded_words_2nd)

In [105]:
stage_2nd = stage_1st.map(lambda x: exclude_words(x, excluded_words_2nd_set))
stage_2nd.cache()

PythonRDD[840] at RDD at PythonRDD.scala:43

In [106]:
stage_2nd.map(lambda x: len(x['Features'])).mean()

144.53563640231928

In [112]:
doc_lengths = stage_2nd.map(lambda x: len(x['Features'])).filter(lambda l: l<10).collect()
from matplotlib import pyplot as plt
len(doc_lengths)

5563

In [111]:
plt.hist(doc_lengths, bins=100)
plt.show()

In [116]:
stage_2nd.count()

112806

In [115]:
stage_2nd.map(json.dumps). \
    repartition(6). \
    saveAsTextFile('hdfs://master:54310/excluded_fin')