In [1]:
import string
import json 
import pickle as pkl
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pyspark as ps
from collections import Counter
import numpy as np

In [2]:
# Reuse the active SparkContext when one already exists so this cell can be
# re-run without failing because a context is already running.
sc = ps.SparkContext.getOrCreate()

In [3]:
# Small four-document corpus used as input for the cells that follow.
test_strings = [
    'the quick brown fox jumps over the brown fence.',
    'the boy paints a tall fence brown!',
    'basketball players are tall.',
    'quick basketball players jump high',
]

In [4]:
import nltk, string

def tokenize(text):
    """Split ``text`` into word tokens, dropping stopwords and punctuation.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.word_tokenize`` in their original
        order, minus English stopwords, tokens found in
        ``string.punctuation`` and the two-backtick token.
    """
    # Load the stopword list once per call rather than once per token, and
    # keep it in a set so each membership test is a hash lookup.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    return [
        word
        for word in nltk.word_tokenize(text)
        if word not in stop_words
        # Substring test against the punctuation string: matches any single
        # punctuation character (or a run that appears verbatim in it).
        and word not in string.punctuation
        # Two backticks are not a substring of string.punctuation, so that
        # token needs its own check.
        and word != '``'
    ]

In [5]:
# Tokenize every document on the cluster. The RDD is cached because many
# later cells run actions on it; without caching each action would re-run
# tokenize over the whole corpus.
test_tokens = sc.parallelize(test_strings).map(tokenize).cache()
test_tokens.collect()

[['quick', 'brown', 'fox', 'jumps', 'brown', 'fence'],
 ['boy', 'paints', 'tall', 'fence', 'brown'],
 ['basketball', 'players', 'tall'],
 ['quick', 'basketball', 'players', 'jump', 'high']]

In [6]:
# Vocabulary: flatten the per-document token lists and keep each token once.
vocab = (
    test_tokens
    .flatMap(lambda doc_tokens: doc_tokens)
    .distinct()
)
vocab.collect()

['quick',
 'jump',
 'high',
 'brown',
 'players',
 'tall',
 'paints',
 'boy',
 'jumps',
 'basketball',
 'fox',
 'fence']

In [7]:
# Number of distinct tokens in the corpus.
vocab.count()

12

In [8]:
from collections import Counter
import numpy as np

# Collect the vocabulary to the driver and broadcast the resulting list;
# bow_vectorize reads it to decide the order of the vector positions.
broadcastVocab = sc.broadcast(vocab.collect())

def bow_vectorize(tokens):
    """Return the bag-of-words count vector for one tokenized document.

    Position ``i`` of the result holds how many times the ``i``-th word of
    ``broadcastVocab.value`` occurs in ``tokens`` (0 when it does not occur).
    """
    occurrences = Counter(tokens)
    # Counter yields 0 for keys it has not seen, so absent words need no
    # special handling.
    return np.array([occurrences[term] for term in broadcastVocab.value])

In [9]:
# Bag-of-words count vector for each document.
test_tokens.map(bow_vectorize).collect()

[array([1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 1, 1]),
 array([0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1]),
 array([0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0]),
 array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0])]

In [10]:
# Re-display the tokenized documents.
test_tokens.collect()

[['quick', 'brown', 'fox', 'jumps', 'brown', 'fence'],
 ['boy', 'paints', 'tall', 'fence', 'brown'],
 ['basketball', 'players', 'tall'],
 ['quick', 'basketball', 'players', 'jump', 'high']]

In [11]:
# The broadcast vocabulary list; bow_vectorize emits one count per entry,
# in this order.
broadcastVocab.value

['quick',
 'jump',
 'high',
 'brown',
 'players',
 'tall',
 'paints',
 'boy',
 'jumps',
 'basketball',
 'fox',
 'fence']

## TF-IDF

In [12]:
# Per-document term counts: one Counter per tokenized document.
term_freq = test_tokens.map(Counter)

In [13]:
# Document frequency: emit (term, 1) once for each document that contains the
# term, then sum those ones per term.
doc_freq = (
    term_freq
    .flatMap(lambda doc_counts: doc_counts.keys())
    .map(lambda term: (term, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [14]:
# Number of documents: term_freq holds one Counter per document.
total_docs = term_freq.count()

In [15]:
import math

# Inverse document frequency as a list of (term, weight) pairs, where
# weight = log(N / (1 + df)) with N = total_docs and df = the number of
# documents containing the term.
# Built as a concrete list rather than a bare ``map`` (a one-shot iterator on
# Python 3) so the broadcast value can be iterated once per document.
# NOTE: with this formula a term found in N - 1 documents gets weight 0 and a
# term found in all N documents gets a negative weight.
idf = list(
    (term, math.log(float(total_docs) / (1 + n_docs)))
    for term, n_docs in doc_freq.collect()
)

In [24]:
# Broadcast the (term, idf weight) pairs; tfidf_vectorize reads them through
# broadcast_idf.value.
broadcast_idf = sc.broadcast(idf)

def tfidf_vectorize(tokens):
    """Return the TF-IDF vector for one tokenized document.

    Term frequency is the token's count divided by the number of tokens in
    the document; each entry is that frequency multiplied by the term's
    weight from ``broadcast_idf.value``, in the order of that list.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single document.

    Returns
    -------
    numpy.ndarray
        One float per (term, weight) pair in ``broadcast_idf.value``. All
        zeros when ``tokens`` is empty.
    """
    idf_pairs = broadcast_idf.value
    word_counts = Counter(tokens)
    doc_length = sum(word_counts.values())

    # A document with no tokens would otherwise divide by zero below; it
    # contains no terms, so every weight is zero.
    if doc_length == 0:
        return np.zeros(len(idf_pairs))

    doc_length = float(doc_length)
    return np.array([
        word_counts.get(term, 0) * weight / doc_length
        for term, weight in idf_pairs
    ])

In [25]:
# TF-IDF vector for each document.
test_tokens.map(tfidf_vectorize).collect()

[array([ 0.28768207,  0.        ,  0.        ,  0.57536414,  0.        ,
         0.        ,  0.        ,  0.        ,  0.69314718,  0.        ,
         0.69314718,  0.28768207]),
 array([ 0.        ,  0.        ,  0.        ,  0.28768207,  0.28768207,
         0.69314718,  0.        ,  0.69314718,  0.        ,  0.        ,
         0.        ,  0.28768207]),
 array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.28768207,
         0.        ,  0.28768207,  0.        ,  0.        ,  0.28768207,
         0.        ,  0.        ]),
 array([ 0.28768207,  0.69314718,  0.69314718,  0.        ,  0.        ,
         0.        ,  0.28768207,  0.        ,  0.        ,  0.28768207,
         0.        ,  0.        ])]

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# Reference TF-IDF from scikit-learn, fitted on the raw strings.
# NOTE(review): this runs on untokenized text with sklearn's own settings, so
# the values are not expected to match the hand-rolled vectors one-to-one.
tf = TfidfVectorizer(lowercase=False, smooth_idf=False)
# Call form of print behaves the same on Python 2 and Python 3.
print(tf.fit_transform(test_strings))

  (0, 4)	0.250320232562
  (0, 9)	0.352797303326
  (0, 8)	0.352797303326
  (0, 5)	0.352797303326
  (0, 3)	0.500640465123
  (0, 12)	0.250320232562
  (0, 14)	0.500640465123
  (1, 13)	0.354157614079
  (1, 10)	0.499144036105
  (1, 2)	0.499144036105
  (1, 4)	0.354157614079
  (1, 3)	0.354157614079
  (1, 14)	0.354157614079
  (2, 0)	0.631156936704
  (2, 11)	0.447824713197
  (2, 1)	0.447824713197
  (2, 13)	0.447824713197
  (3, 6)	0.533737855662
  (3, 7)	0.533737855662
  (3, 11)	0.378702963137
  (3, 1)	0.378702963137
  (3, 12)	0.378702963137


In [20]:
# Bag-of-words count vectors again.
test_tokens.map(bow_vectorize).collect()

[array([1, 0, 0, 2, 0, 0, 0, 0, 1, 0, 1, 1]),
 array([0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1]),
 array([0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0]),
 array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0])]