### Corpus

In [43]:
# Collection of string documents

# Four short documents used to compare sklearn's TF-IDF with the custom one.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

### SkLearn Implementation

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights from the corpus; fit() hands back the
# fitted vectorizer, so it can be bound directly.
vectorizer = TfidfVectorizer().fit(corpus)
# Encode the same documents as TF-IDF rows.
skl_output = vectorizer.transform(corpus)

In [45]:
# Feature names learned by sklearn; as the output below shows, they come back in alphabetical order.

print(vectorizer.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [46]:
# IDF values learned by the sklearn TfidfVectorizer during fit.
# The fitted vocabulary has 9 words, so 9 IDF values are printed, one per word.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [47]:
# Shape of the sklearn TF-IDF output after transform: (number of documents, vocabulary size).

skl_output.shape

(4, 9)

In [48]:
# sklearn TF-IDF values for the first document of the corpus above.
# The output is a sparse matrix, so only the non-zero entries are printed, as "(row, column) value".

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [49]:
# The same first-document row, converted from sparse to dense so that every column
# (including the zeros) is visible.
# Notice that this output is L2-normalized; sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [50]:
# Write your code here.
# Make sure it's well documented and readable, with appropriate comments.
# Compare your results with the above sklearn tfidf vectorizer
# You are not supposed to use any other library apart from the ones given below

from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

def fit(corpus):
  '''
  Build the vocabulary of a corpus.

  Every document is tokenized with str.split() (whitespace), the distinct
  tokens are collected, and they are sorted so that each word's column
  index is its position in the sorted vocabulary.

  corpus  : iterable of str, the documents to learn the vocabulary from

  returns :
  vocabulary_with_dimension = {word : dimension no}
  vocabulory = sorted list of unique words in corpus
  '''
  # A set gives O(1) membership tests; testing membership against a growing
  # list would make vocabulary building quadratic in the vocabulary size.
  unique_words = set()
  for sentence in corpus:
    unique_words.update(sentence.split())

  vocabulory = sorted(unique_words)

  # Column index of a word == its rank in the sorted vocabulary.
  vocabulary_with_dimension = {word: index for index, word in enumerate(vocabulory)}
  return vocabulary_with_dimension, vocabulory

# Learn the vocabulary from the sample corpus and show the word -> column mapping.
vocabulary_with_dimension, vocabulory = fit(corpus)
print("vocabulary_with_dimension= ",vocabulary_with_dimension )


      




vocabulary_with_dimension=  {'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [51]:

def calcualte_idf(vocabulory, documents=None):
  '''
  Compute the smoothed IDF value of every word in the vocabulary:

      idf(word) = 1 + ln((1 + N) / (1 + df(word)))

  where N is the number of documents and df(word) is the number of
  documents that contain the word at least once.

  vocabulory : list of words, as produced by "fit"
  documents  : optional sequence of str to compute document frequencies
               from; when omitted, the notebook-level "corpus" is used
               (this keeps existing calls calcualte_idf(vocabulory) working)

  returns: list of IDF values, in the same order as vocabulory
  '''
  if documents is None:
    documents = corpus
  N = len(documents)

  # Tokenize each document exactly once. Taking the set of its tokens makes
  # every word count at most once per document (document frequency).
  document_frequency = Counter()
  for sentence in documents:
    document_frequency.update(set(sentence.split()))

  # A Counter returns 0 for words that occur in no document.
  return [1 + np.log((1 + N) / (1 + document_frequency[word])) for word in vocabulory]

# IDF value of every vocabulary word, for comparison with sklearn's vectorizer.idf_ printed earlier.
idf_value = calcualte_idf(vocabulory)
print(idf_value)

[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [52]:
def transform(corpus,idf_value,vocabulary=None):
  '''
  Encode every document of the corpus as an L2-normalized TF-IDF row.

  For a word w in a document d:
      tf(w, d)     = (occurrences of w in d) / (number of tokens in d)
      tfidf(w, d)  = tf(w, d) * idf_value[column of w]

  corpus     : sequence of str, the documents to encode
  idf_value  : sequence of IDF values indexed by column number
  vocabulary : optional {word : column no} mapping; when omitted, the
               notebook-level "vocabulary_with_dimension" is used (this
               keeps existing calls transform(corpus, idf_value) working)

  returns: Normalized Sparse matrix : normalized_sparse_matrix
           shape = (number of documents, number of vocabulary words)
  '''
  if vocabulary is None:
    vocabulary = vocabulary_with_dimension

  # Coordinate-format parameters (values, row indices, column indices)
  # for csr_matrix; only non-zero TF-IDF values are stored.
  row = []
  col = []
  val = []

  for row_no, sentence in enumerate(corpus):
    words = sentence.split()
    len_sentence = len(words)
    if len_sentence == 0:
      # An empty document has no terms: keep its row all-zero instead of
      # dividing by zero when computing the term frequency.
      continue

    # Count every token once per document instead of re-splitting and
    # re-counting the sentence for each vocabulary word.
    word_counts = Counter(words)

    for word, col_no in vocabulary.items():
      # Counter yields 0 for vocabulary words absent from this document.
      tf_val = word_counts[word] / len_sentence
      tf_idf_calc = tf_val * idf_value[col_no]
      if tf_idf_calc != 0.0:
        row.append(row_no)
        col.append(col_no)
        val.append(tf_idf_calc)

  # Creation of sparse matrix using csr_matrix
  sparse_matrix = csr_matrix((val,(row,col)),shape=(len(corpus),len(vocabulary)))

  # Normalizing Sparse matrix (normalize defaults to row-wise L2). Dividing
  # each row by its document length above is a per-row constant factor, so
  # it cancels out in this normalization.
  normalized_sparse_matrix = normalize(sparse_matrix)

  return normalized_sparse_matrix


# TF-IDF row of the first document from the custom implementation,
# for comparison with sklearn's skl_output[0] printed earlier.
sparse_matrix = transform(corpus,idf_value)
print(sparse_matrix[0])
print(type(sparse_matrix))


  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
<class 'scipy.sparse.csr.csr_matrix'>


In [53]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type

import pickle

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# This rebinds "corpus", replacing the small sample corpus defined earlier.
with open('cleaned_strings', 'rb') as pickle_file:
    corpus = pickle.load(pickle_file)

# Report how many documents were loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [54]:
def fit(corpus):
  '''
  This function generates a dictionary set/Vocabulary for BOW.

  Documents are tokenized on whitespace with str.split(); the unique tokens
  are sorted, and each word is assigned its position in that sorted order
  as its dimension (column) number.

  corpus  : iterable of str, the documents to learn the vocabulary from

  returns :
  vocabulary_with_dimension = {word : dimension no}
  vocabulory = sorted list of unique words in corpus
  '''
  # Collect unique words in a set: membership checks against a list grow
  # linearly with the vocabulary, which makes the whole pass quadratic on
  # a corpus with thousands of distinct words.
  unique_words = set()
  for sentence in corpus:
    for word in sentence.split():
      unique_words.add(word)

  vocabulory = sorted(unique_words)
  vocabulary_with_dimension = dict(zip(vocabulory, range(len(vocabulory))))
  return vocabulary_with_dimension, vocabulory

# Rebuild the vocabulary and the word -> column mapping from the current "corpus".
vocabulary_with_dimension, vocabulory = fit(corpus)
# print("vocabulary = ",vocabulary_with_dimension )



In [55]:

def calcualte_idf(vocabulory, documents=None, top_n=50):
  '''
  Compute the smoothed IDF value of every vocabulary word and keep the
  words with the highest IDF scores.

      idf(word) = 1 + ln((1 + N) / (1 + df(word)))

  where N is the number of documents and df(word) is the number of
  documents that contain the word at least once.

  vocabulory : list of words, as produced by "fit"
  documents  : optional sequence of str to compute document frequencies
               from; when omitted, the notebook-level "corpus" is used
               (this keeps existing calls calcualte_idf(vocabulory) working)
  top_n      : how many of the highest-IDF words to keep (default 50)

  returns: dictionary {word : IDF score} of the top_n words, ordered by
  descending IDF; words with equal IDF keep their vocabulory order.
  An empty vocabulory yields an empty dictionary.
  '''
  if documents is None:
    documents = corpus
  N = len(documents)

  # Tokenize each document exactly once. Taking the set of its tokens makes
  # every word count at most once per document (document frequency).
  document_frequency = Counter()
  for sentence in documents:
    document_frequency.update(set(sentence.split()))

  # Dictionary of word (as key) with its corresponding IDF score (as value).
  # A Counter returns 0 for words that occur in no document.
  word_with_idf = {
      word: 1 + np.log((1 + N) / (1 + document_frequency[word]))
      for word in vocabulory
  }

  # Rank once, after all scores are known. sorted() is stable, also with
  # reverse=True, so ties stay in vocabulory order.
  ranked = sorted(word_with_idf.items(), key=lambda item: item[1], reverse=True)
  top_50_IDF_vocabulory_dict = dict(ranked[:top_n])

  return top_50_IDF_vocabulory_dict

# Top 50 vocabulary words by IDF score, as a {word: IDF} dictionary
top_50_IDF_vocabulory_dict = calcualte_idf(vocabulory)
print(top_50_IDF_vocabulory_dict)

# Split the dictionary into two parallel lists: the words ...
top_50_IDF_vocabulory = list(top_50_IDF_vocabulory_dict)

# ... and their IDF scores, in the same order
top_50_IDF_values = list(top_50_IDF_vocabulory_dict.values())

# Rebind "vocabulary_with_dimension" so that it maps only the top 50 words
# to column numbers 0..49
dimension = list(range(50))
vocabulary_with_dimension = dict(zip(top_50_IDF_vocabulory, dimension))
# print(vocabulary_with_dimension)



{'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.922918004572872, 'abstruse': 6.922918004572872, 'academy': 6.922918004572872, 'accents': 6.922918004572872, 'accessible': 6.922918004572872, 'acclaimed': 6.922918004572872, 'accolades': 6.922918004572872, 'accurate': 6.922918004572872, 'accurately': 6.922918004572872, 'achille': 6.922918004572872, 'ackerman': 6.922918004572872, 'actions': 6.922918004572872, 'adams': 6.922918004572872, 'add': 6.922918004572872, 'added': 6.922918004572872, 'admins': 6.922918004572872, 'admiration': 6.922918004572872, 'admitted': 6.922918004572872, 'adrift': 6.922918004572872, 'adventure': 6.922918004572872, 'aesthetically': 6.922918004572872, 'affected': 6.922918004572872, 'affleck': 6.922918004572872, 'afternoon': 6.922918004572872, 'aged': 6.922918004572872, 'ages': 6.922918004572872, 'agree': 6.922918004572872, 'agreed': 6.922918004572872, 'aimless': 6.922918004572872, 'aired': 6.922918004572872, 'akasha': 6.922918004572872, '

In [56]:
def transform(corpus,idf_value,vocabulary=None):
  '''
  This method calculates the TF * IDF value of every vocabulary word in
  every document and returns the result as an L2-normalized sparse matrix.

  For a word w in a document d:
      tf(w, d)     = (occurrences of w in d) / (number of tokens in d)
      tfidf(w, d)  = tf(w, d) * idf_value[column of w]

  corpus     : sequence of str, the documents to encode
  idf_value  : sequence of IDF values indexed by column number
  vocabulary : optional {word : column no} mapping; when omitted, the
               notebook-level "vocabulary_with_dimension" is used (this
               keeps existing calls transform(corpus, idf_value) working)

  returns: Normalized Sparse matrix : normalized_sparse_matrix
           shape = (number of documents, number of vocabulary words)
  '''
  if vocabulary is None:
    vocabulary = vocabulary_with_dimension

  # Parameters of csr_matrix: the non-zero values and their (row, col) positions
  row = []
  col = []
  val = []

  #  For each document in corpus
  for row_no, sentence in enumerate(corpus):
    tokens = sentence.split()
    len_sentence = len(tokens)
    if len_sentence == 0:
      # A document without tokens would cause a division by zero in the TF
      # calculation; it contributes no entries, so its row stays all-zero.
      continue

    # Token counts are computed once per document rather than once per
    # (document, vocabulary word) pair.
    token_counts = Counter(tokens)

    # for each unique word in the dictionary set/vocabulary
    for word, col_no in vocabulary.items():
      # Counter yields 0 for vocabulary words absent from this document.
      tf_val = token_counts[word] / len_sentence
      tf_idf_calc = tf_val * idf_value[col_no]

      # Only non-zero values are stored in the sparse matrix
      if tf_idf_calc != 0.0:
        row.append(row_no)
        col.append(col_no)
        val.append(tf_idf_calc)

  # Creation of sparse matrix using csr_matrix
  sparse_matrix = csr_matrix((val,(row,col)),shape=(len(corpus),len(vocabulary)))

  # Normalizing Sparse matrix (normalize defaults to row-wise L2). The
  # per-document length used in the TF step is a constant factor within a
  # row, so it cancels out here.
  normalized_sparse_matrix = normalize(sparse_matrix)
  return normalized_sparse_matrix

# TF-IDF matrix computed with the top-50 IDF values: show the first row
# (sparse, then dense) and the overall shape of the matrix.
sparse_matrix = transform(corpus,top_50_IDF_values)
print(sparse_matrix[0])
print(sparse_matrix[0].toarray())
print(sparse_matrix.shape)


  (0, 30)	1.0
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]
(746, 50)
