## Implementing TFIDF vectorizer

#### Task-1
* Build a TF-IDF vectorizer and compare its results with sklearn's TF-IDF vectorizer

In [1]:
# first let us create a function that calculates idf values for each word in the corpus
def idf(corpus, unique_words):
    """
    Compute the smoothed inverse document frequency of each word.

    The value for a word ``t`` is ``1 + ln((1 + N) / (1 + df(t)))`` where
    ``N`` is the number of documents in ``corpus`` and ``df(t)`` is the
    number of documents whose whitespace-separated tokens include ``t``.

    :param corpus: List of documents (strings) to compute idf over.
    :param unique_words: Iterable of words to compute idf values for.
    :return: Dict mapping every word in ``unique_words`` to its idf value,
             in the iteration order of ``unique_words``.
    """
    # Local import so this cell does not rely on a later cell importing math.
    import math

    n_documents = len(corpus)

    # Count, for every token, how many documents contain it. Each document is
    # split once and de-duplicated so a repeated token counts a single time.
    document_frequency = {}
    for document in corpus:
        for token in set(document.split()):
            document_frequency[token] = document_frequency.get(token, 0) + 1

    # Words absent from the corpus get df = 0; an empty corpus yields 1.0
    # for every word instead of silently producing an empty dict.
    return {
        word: math.log((1 + n_documents) / (1 + document_frequency.get(word, 0))) + 1
        for word in unique_words
    }

In [2]:
# creating a fit method for the TF-IDF model
import math

def fit(dataset):
    """
    Learn the vocabulary and idf values from a list of documents.

    Words shorter than two characters are ignored. The remaining distinct
    words are sorted and numbered from 0 in that order.

    :param dataset: List of documents (strings).
    :return: Tuple ``(vocab, idf_values)`` where ``vocab`` maps each word to
             its column index and ``idf_values`` is whatever ``idf`` returns
             for the sorted words. If ``dataset`` is not a list, a message is
             printed and ``None`` is returned.
    """
    # Guard clause: anything other than a list is rejected with a message.
    if not isinstance(dataset, list):
        print("you need to pass list of sentance")
        return None

    # Collect distinct tokens of length >= 2 across all documents.
    distinct_tokens = {
        token
        for sentence in dataset
        for token in sentence.split()
        if len(token) >= 2
    }
    ordered_tokens = sorted(distinct_tokens)
    vocab = dict(zip(ordered_tokens, range(len(ordered_tokens))))

    return vocab, idf(dataset, ordered_tokens)


# Toy corpus of four short documents used throughout Task 1.
corpus = [
      'this is the first document',
      'this document is the second document',
      'and this is the third one',
      'is this the first document',
 ] 

# Build the vocabulary (word -> column index) and the per-word idf values,
# then print both dictionaries.
vocabulary, idf_of_vocab = fit(corpus)
print(vocabulary, idf_of_vocab)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8} {'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}


In [3]:
# Show the idf values on their own, in the dictionary's insertion order.
print(list(idf_of_vocab.values()))
# After fitting on the corpus, the vocabulary has 9 words,
# each with its own idf value.

[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [4]:
# let's see if the above result matches with sklearn's
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit sklearn's vectorizer (default settings) on the same toy corpus and
# compute its tf-idf matrix so it can be compared with our implementation.
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [5]:
# Print the idf values sklearn's TfidfVectorizer learned during fit,
# for comparison with the values in idf_of_vocab.
print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [6]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

In [7]:
# now we will be creating a Transform function for the TF-IDF model
def transform(dataset, vocabulary, idf_of_vocab):
    """
    Convert documents into an L2-normalised sparse tf-idf matrix.

    For every document, the term frequency of a word is its count divided by
    the number of whitespace-separated tokens in that document; this is
    multiplied by the word's idf value. Words that are not in ``vocabulary``
    are ignored. Each row is then scaled to unit L2 norm.

    :param dataset: Sequence of documents (strings).
    :param vocabulary: Dict mapping each word to its column index.
    :param idf_of_vocab: Dict mapping each vocabulary word to its idf value.
    :return: Sparse matrix of shape ``(len(dataset), len(vocabulary))`` as
             returned by ``sklearn.preprocessing.normalize``.
    """
    # Collect (row, column, value) triplets and build the matrix in one go.
    # Assigning single elements into a csr_matrix is slow and raises a
    # SparseEfficiencyWarning because the sparsity structure is rebuilt on
    # every assignment.
    row_indices, col_indices, values = [], [], []
    for row, document in enumerate(dataset):
        tokens = document.split()
        n_tokens = len(tokens)
        # Counter yields each distinct word once, so every (row, column)
        # pair is emitted exactly once.
        for word, count in Counter(tokens).items():
            if word not in vocabulary:
                continue
            row_indices.append(row)
            col_indices.append(vocabulary[word])
            values.append((count / n_tokens) * idf_of_vocab[word])

    sparse_matrix = csr_matrix(
        (values, (row_indices, col_indices)),
        shape=(len(dataset), len(vocabulary)),
        dtype=np.float64,
    )
    # referred: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
    output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    return output
# Apply transform to the toy corpus using the vocabulary and idf values
# produced by fit.
final_output=transform(corpus,vocabulary,idf_of_vocab)
# Uncomment to inspect the sparse matrix and its shape:
# print(final_output)
# print(final_output.shape)


  self._set_intXint(row, col, x.flat[0])


In [8]:
# To understand the output better, convert the first row of the sparse
# output matrix to a dense array and print it, so that every vocabulary
# column (including the zeros) is visible.
print(final_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [9]:
# checking with sklearn's vectorizer: the same first row, converted to a
# dense array, for comparison with our implementation's output.
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [10]:
# Conclusion for Task 1; the bare string is displayed as the cell's output.
"""Both the results satisfies with our implementation"""

'Both the results satisfies with our implementation'

In [11]:
################End of task 1############################

#### Task-2
*  Implement max features functionality

In [12]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load 'cleaned_strings' if it comes from a trusted source.
# This rebinds `corpus`, replacing the toy corpus used in Task 1.
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [13]:
def modified_idf(corpus, unique_words):
    """
    Compute the smoothed inverse document frequency of each word.

    The value for a word ``t`` is ``1 + ln((1 + N) / (1 + df(t)))`` where
    ``N`` is the number of documents in ``corpus`` and ``df(t)`` is the
    number of documents whose whitespace-separated tokens include ``t``.

    :param corpus: List of documents (strings) to compute idf over.
    :param unique_words: Iterable of words to compute idf values for.
    :return: Dict mapping every word in ``unique_words`` to its idf value,
             in the iteration order of ``unique_words``.
    """
    total_documents = len(corpus)

    # One pass over the corpus: split each document once and count every
    # distinct token a single time per document.
    docs_containing = {}
    for sentence in corpus:
        for token in set(sentence.split()):
            docs_containing[token] = docs_containing.get(token, 0) + 1

    # Computing the value per word (not inside the document loop) means an
    # empty corpus still yields an entry (1.0) for every requested word.
    idf_dict = {}
    for word in unique_words:
        containing = docs_containing.get(word, 0)
        idf_dict[word] = math.log((1 + total_documents) / (containing + 1)) + 1
    return idf_dict


In [14]:
import math
# fit_modified builds the full vocabulary of the pickled corpus together with the idf value of every word; the 50 highest idf scores are picked out in a later cell.
def fit_modified(dataset, max_features=None):
    """
    Learn the vocabulary and idf values, optionally keeping only the
    ``max_features`` words with the highest idf scores.

    Words shorter than two characters are ignored. With the default
    ``max_features=None`` every remaining distinct word is kept, which is
    the behaviour of the original implementation.

    :param dataset: List of documents (strings).
    :param max_features: Optional positive int. When given, only the
        ``max_features`` words with the largest idf values are kept; ties
        are broken alphabetically.
    :return: Tuple ``(vocab, idf_values, unique_words)``: ``vocab`` maps each
        kept word to its column index, ``idf_values`` maps each kept word to
        its idf value, and ``unique_words`` is the alphabetically sorted list
        of kept words. If ``dataset`` is not a list, a message is printed and
        ``None`` is returned.
    :raises ValueError: If ``max_features`` is given but smaller than 1.
    """
    if not isinstance(dataset, list):
        print("you need to pass list of sentance")
        return None
    if max_features is not None and max_features < 1:
        raise ValueError("max_features must be a positive integer or None")

    unique_words = sorted(
        {word for row in dataset for word in row.split() if len(word) >= 2}
    )
    idf_values = modified_idf(dataset, unique_words)

    if max_features is not None:
        # sorted() is stable, so words with equal idf keep their alphabetical
        # order and the selection is deterministic.
        top_words = sorted(
            unique_words, key=lambda word: idf_values[word], reverse=True
        )[:max_features]
        # Re-sort alphabetically so column indices follow the same
        # convention as the unrestricted vocabulary.
        unique_words = sorted(top_words)
        idf_values = {word: idf_values[word] for word in unique_words}

    vocab = {word: index for index, word in enumerate(unique_words)}
    return vocab, idf_values, unique_words
# NOTE: despite its name, idf_of_vocabulary_top_50 holds the idf value of
# every vocabulary word; the top-50 selection happens in a later cell.
# This rebinds `vocabulary`, replacing the Task 1 vocabulary.
vocabulary, idf_of_vocabulary_top_50,unique_words = fit_modified(corpus)
# print(fit_modified(corpus))

In [15]:
# The 50 largest idf values, highest first (printed in a later cell).
my_lst = sorted(idf_of_vocabulary_top_50.values(), reverse=True)[:50]
# All (word, idf) pairs ordered by descending idf. sortdict keeps every word,
# not only the top 50, so it can be looked up for any vocabulary term.
s_lst = sorted(
    idf_of_vocabulary_top_50.items(),
    key=operator.itemgetter(1),
    reverse=True,
)
sortdict = dict(s_lst)
# print(sortdict)


In [16]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

In [17]:
# cross checking our findings with sklearn's for the pickle file
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fit sklearn's TfidfVectorizer (default settings) on the loaded corpus and
# keep its tf-idf matrix in X. This rebinds `vectorizer` from Task 1.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# print(vectorizer.get_feature_names())
# Print the dimensions of sklearn's tf-idf matrix.
print(X.shape)

(746, 2886)


In [18]:
# my_lst (built in the cell that sorts the idf values) holds the 50 largest
# idf values of the pickled corpus, computed by our from-scratch code.
print(my_lst)

[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]


In [19]:
# Sort a copy of sklearn's idf values in descending order for display.
# The sorted values are deliberately NOT assigned back to vectorizer.idf_:
# idf_ is aligned with the vocabulary's column order, so overwriting it would
# make any later vectorizer.transform() call weight terms incorrectly.
sklearn_idf_sorted = np.sort(vectorizer.idf_)[::-1]
print(sklearn_idf_sorted[:50])
#sklearn's idf output

[6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918 6.922918
 6.922918 6.922918]


In [20]:
# modifying the transform function as well for the pickle file
def modified_transform(dataset, vocabulary, sortdict):
    """
    Convert documents into an L2-normalised sparse tf-idf matrix.

    The term frequency of a word is its count in the document divided by the
    number of whitespace-separated tokens in that document; it is multiplied
    by the word's idf value taken from ``sortdict``. Words that are not in
    ``vocabulary`` are ignored. Each row is then scaled to unit L2 norm.

    :param dataset: Sequence of documents (strings).
    :param vocabulary: Dict mapping each word to its column index.
    :param sortdict: Dict mapping each vocabulary word to its idf value.
    :return: Sparse matrix of shape ``(len(dataset), len(vocabulary))`` as
             returned by ``sklearn.preprocessing.normalize``.
    """
    # Build the matrix from (row, column, value) triplets in a single step.
    # Setting individual elements of a csr_matrix is slow and emits a
    # SparseEfficiencyWarning because the sparsity structure changes on
    # every assignment.
    rows, cols, data = [], [], []
    for row_index, document in enumerate(dataset):
        tokens = document.split()
        document_length = len(tokens)
        # Each distinct word is visited once, so no (row, column) pair is
        # emitted twice.
        for word, occurrences in Counter(tokens).items():
            if word in vocabulary:
                rows.append(row_index)
                cols.append(vocabulary[word])
                data.append((occurrences / document_length) * sortdict[word])

    sparse_matrix = csr_matrix(
        (data, (rows, cols)),
        shape=(len(dataset), len(vocabulary)),
        dtype=np.float64,
    )
    # referred: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
    output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    return output
# Transform the loaded corpus with our implementation and print the stored
# tf-idf entries of the first document. This rebinds `final_output`.
final_output=modified_transform(corpus,vocabulary,sortdict)
print(final_output[0])

  self._set_intXint(row, col, x.flat[0])


  (0, 53)	0.4123943870778812
  (0, 688)	0.4123943870778812
  (0, 720)	0.4123943870778812
  (0, 1545)	0.30566026894803877
  (0, 1651)	0.16192317905848022
  (0, 1653)	0.35781145622317734
  (0, 2287)	0.3377679916467555
  (0, 2878)	0.35781145622317734


In [21]:
# now let's check for the transform and tf-idf values:
# print the first row of sklearn's tf-idf matrix for comparison with the
# first row produced by our implementation.
print(X[0])

  (0, 1545)	0.3056602689480387
  (0, 2878)	0.3578114562231773
  (0, 720)	0.41239438707788106
  (0, 688)	0.41239438707788106
  (0, 1651)	0.1619231790584802
  (0, 53)	0.41239438707788106
  (0, 1653)	0.3578114562231773
  (0, 2287)	0.33776799164675547


In [22]:
# Conclusion for Task 2; the bare string is displayed as the cell's output.
"""Both the results satisfies with our implementation"""

'Both the results satisfies with our implementation'