## Modifying the fit and transform functions so that the vocab will contain only 50 terms with top idf scores.

In [1]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np
import pickle


In [2]:
with open('cleaned_strings', 'rb') as f:
    new_corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(new_corpus))

Number of documents in corpus =  746


In [3]:
print(new_corpus[:5]) # preview


['slow moving aimless movie distressed drifting young man', 'not sure lost flat characters audience nearly half walked', 'attempting artiness black white clever camera angles movie disappointed became even ridiculous acting poor plot lines almost non existent', 'little music anything speak', 'best scene movie gerardo trying find song keeps running head']


In [4]:
def fit(new_corpus):
    '''Returns the unique words within a corpus and their index
       keys represent the unique words and values indexes'''
    unique_words = set()
    if isinstance(new_corpus, (list,)):
        for row in new_corpus: # for each review in the dataset
            for word in row.split(" "):
                word = word.lower() 
                if len(word) < 2: # dont consider words less than two
                    continue
                unique_words.add(word) # add unique words only
        unique_words = sorted(list(unique_words))
        vocab = {j:i for i,j in enumerate(unique_words)} # words and their respective index
        return vocab
    
    else:
        print("Expected a list of sentences")

In [5]:
def idf(new_corpus,vocab):
    '''Returns the inverse document frequency of a word and the word itself
       keys represent the word and the values their respective idf values'''
    num_of_docs = len(new_corpus)
    idf_values = {}
   
    for word in vocab:
        num_of_docs_with_term = 0
        for doc in new_corpus:
            if word in doc:
                num_of_docs_with_term += 1
    
        idf = 1 + (math.log((1+num_of_docs)/(1 + num_of_docs_with_term)))
        
        idf_values[word] = idf
   
    # return the first 50 words with top idf
    new_set = {}
    for word_idf in sorted(zip(idf_values.values(),idf_values.keys()))[-50:]:
        new_set[word_idf[1]] = word_idf[0]
    return new_set
    
        
        
        

In [6]:
# first 50 words with the highest idf value
vocab_set = fit(new_corpus)
vocabulary = idf(new_corpus,vocab_set)
vocabulary

{'waster': 6.922918004572872,
 'wasting': 6.922918004572872,
 'wave': 6.922918004572872,
 'waylaid': 6.922918004572872,
 'wayne': 6.922918004572872,
 'weaker': 6.922918004572872,
 'weariness': 6.922918004572872,
 'weaving': 6.922918004572872,
 'website': 6.922918004572872,
 'wedding': 6.922918004572872,
 'weight': 6.922918004572872,
 'welsh': 6.922918004572872,
 'went': 6.922918004572872,
 'whenever': 6.922918004572872,
 'whine': 6.922918004572872,
 'whites': 6.922918004572872,
 'whoever': 6.922918004572872,
 'wide': 6.922918004572872,
 'widmark': 6.922918004572872,
 'wife': 6.922918004572872,
 'wih': 6.922918004572872,
 'wild': 6.922918004572872,
 'william': 6.922918004572872,
 'willie': 6.922918004572872,
 'wily': 6.922918004572872,
 'within': 6.922918004572872,
 'witticisms': 6.922918004572872,
 'woa': 6.922918004572872,
 'wondered': 6.922918004572872,
 'wong': 6.922918004572872,
 'wont': 6.922918004572872,
 'worked': 6.922918004572872,
 'worry': 6.922918004572872,
 'worthless': 6.9

In [7]:
def transform(new_corpus,vocab):
    # initialize a matrix
    sparse_matrix= csr_matrix((len(new_corpus), len(vocab)))
    
    # assign words to their indexes
    vocab_with_index = {}
    for idx,uniq_word in enumerate(vocabulary):
        vocab_with_index[uniq_word] = idx
    
    # loop through the corpus and calculate tfidf
    for index, doc in enumerate(new_corpus):
        
        num_of_words_in_doc = Counter(new_corpus[index].split())
        for word in new_corpus[index].split():
        #  print(new_corpus[index].split())
         
            if word in vocab.keys():

                num_of_terms_in_doc = len(doc.split())

                tf = num_of_words_in_doc[word] / num_of_terms_in_doc

                idf_value = idf(new_corpus,vocab)[word]
                tf_idf_value = tf * idf_value
                
             
                sparse_matrix[index,vocab_with_index[word]] = tf_idf_value
                
    print(normalize(sparse_matrix))

In [8]:
vocabulary

transform(new_corpus,vocabulary)

  (19, 16)	0.5773502691896258
  (19, 32)	0.5773502691896258
  (19, 45)	0.5773502691896258
  (55, 5)	1.0
  (68, 19)	1.0
  (70, 9)	1.0
  (80, 14)	1.0
  (109, 49)	1.0
  (134, 4)	1.0
  (135, 6)	0.408248290463863
  (135, 8)	0.408248290463863
  (135, 20)	0.408248290463863
  (135, 26)	0.408248290463863
  (135, 27)	0.408248290463863
  (135, 38)	0.408248290463863
  (148, 3)	0.5773502691896257
  (148, 17)	0.5773502691896257
  (148, 42)	0.5773502691896257
  (155, 39)	1.0
  (191, 24)	1.0
  (222, 43)	1.0
  (251, 37)	1.0
  (270, 47)	1.0
  (321, 48)	1.0
  (326, 18)	1.0
  (337, 15)	1.0
  (340, 35)	1.0
  (341, 23)	1.0
  (350, 22)	0.7071067811865476
  (350, 29)	0.7071067811865476
  (361, 40)	1.0
  (366, 31)	1.0
  (378, 10)	1.0
  (421, 11)	1.0
  (452, 36)	1.0
  (464, 12)	1.0
  (495, 30)	1.0
  (514, 41)	1.0
  (518, 46)	1.0
  (521, 0)	1.0
  (525, 1)	1.0
  (535, 13)	1.0
  (562, 21)	1.0
  (633, 25)	1.0
  (634, 28)	1.0
  (644, 44)	1.0
  (680, 34)	1.0
  (719, 2)	1.0
  (720, 33)	1.0
  (734, 7)	1.0


  self._set_intXint(row, col, x.flat[0])
