## The fit() and transform() functions of my custom implementation of a TF-IDF vectorizer.

In [1]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

In [2]:
def fit(corpus):
    """Build the vocabulary of a corpus.

    Each sentence is split on single spaces and every token is lower-cased;
    tokens shorter than two characters are dropped. The surviving distinct
    tokens are sorted and numbered from zero.

    Returns a dict mapping each token to its index. If ``corpus`` is not a
    list, a message is printed and ``None`` is returned.
    """
    if not isinstance(corpus, list):
        print("Expected a list of sentences")
        return None

    # Lower-case first so the length filter sees the normalised token.
    lowered = (
        token.lower()
        for sentence in corpus
        for token in sentence.split(" ")
    )
    distinct = {token for token in lowered if len(token) >= 2}

    return {token: position for position, token in enumerate(sorted(distinct))}

In [3]:
def idf(corpus, vocab):
    """Compute the smoothed inverse document frequency of every vocabulary word.

    For each word ``w`` in ``vocab`` the value is::

        1 + ln((1 + N) / (1 + df(w)))

    where ``N`` is ``len(corpus)`` and ``df(w)`` is the number of documents
    that contain ``w`` as a whole whitespace-separated token.

    Parameters:
        corpus: iterable of document strings (must support ``len``).
        vocab: iterable of words (e.g. the dict returned by ``fit``).

    Returns:
        dict mapping each word of ``vocab`` (in iteration order) to its idf.
    """
    num_of_docs = len(corpus)

    # Tokenise every document exactly once. A set is used so a word counts
    # at most once per document, and so membership is a whole-token match:
    # a plain ``word in doc`` on the raw string would count "is" inside
    # "this". Documents are lower-cased to match the lower-cased vocabulary
    # keys produced by fit().
    docs_with_term = Counter()
    for doc in corpus:
        docs_with_term.update(set(doc.lower().split()))

    # Counter returns 0 for unseen words, which the +1 smoothing handles.
    return {
        word: 1 + math.log((1 + num_of_docs) / (1 + docs_with_term[word]))
        for word in vocab
    }
                       

In [4]:
def transform(corpus, vocab):
    """Compute the row-normalised TF-IDF matrix of ``corpus``.

    Each document is split on whitespace. For every token that is a key of
    ``vocab`` the cell ``[document index, vocab[token]]`` is set to
    ``tf * idf``, where ``tf`` is the token's count divided by the number of
    tokens in the document and ``idf`` comes from ``idf(corpus, vocab)``.
    The matrix is then passed through ``normalize`` with its default
    arguments.

    Parameters:
        corpus: sequence of document strings.
        vocab: dict mapping word -> column index (as returned by ``fit``).

    Returns:
        The normalised sparse matrix of shape ``(len(corpus), len(vocab))``.
        It is also printed, as before, so existing notebook usage still
        shows the result.
    """
    # idf depends only on (corpus, vocab); compute it once instead of once
    # per token of every document.
    idf_values = idf(corpus, vocab)

    # Collect (row, column, value) triplets and build the CSR matrix in one
    # go: assigning individual cells of a csr_matrix is slow and raises
    # SparseEfficiencyWarning.
    rows, cols, values = [], [], []
    for index, doc in enumerate(corpus):
        tokens = doc.split()
        num_of_terms_in_doc = len(tokens)
        # Iterating the Counter visits each distinct word once, so no
        # (row, column) pair is emitted twice (duplicates would be summed).
        for word, count in Counter(tokens).items():
            if word in vocab:
                tf = count / num_of_terms_in_doc
                rows.append(index)
                cols.append(vocab[word])
                values.append(tf * idf_values[word])

    sparse_matrix = csr_matrix(
        (values, (rows, cols)),
        shape=(len(corpus), len(vocab)),
        dtype=float,
    )

    tfidf = normalize(sparse_matrix)
    print(tfidf)
    return tfidf

In [5]:
# Four short sample documents used to exercise fit() and transform().
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [6]:
# Build the word -> index vocabulary from the sample corpus, then run
# transform(), which prints the normalised TF-IDF matrix.
vocab = fit(corpus)
transform(corpus,vocab)


# Final output of my sparse matrix

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


  self._set_intXint(row, col, x.flat[0])
