### SkLearn Implementation

In [None]:
# Collection of string documents: a four-sentence toy corpus that is fed to
# sklearn's TfidfVectorizer first and to the custom implementation afterwards.

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Reference implementation with default settings.
vectorizer = TfidfVectorizer()
# fit() learns the vocabulary and the per-term IDF values from the corpus.
vectorizer.fit(corpus)
# transform() encodes the same documents as a sparse TF-IDF matrix
# (one row per document, one column per vocabulary term).
skl_output = vectorizer.transform(corpus)

In [None]:
# sklearn feature names; they are sorted in alphabetical order by default.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use `get_feature_names_out()` when it exists and fall back to the old
# method only on older releases. `.tolist()` turns the returned array into a
# plain list of Python strings so the printed output keeps its original form.

if hasattr(vectorizer, 'get_feature_names_out'):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# Here we will print the sklearn tfidf vectorizer idf values after applying the fit method
# After using the fit function on the corpus the vocab has 9 words in it, and each has its idf value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# shape of sklearn tfidf vectorizer output after applying transform method.

skl_output.shape

(4, 9)

In [None]:
# sklearn tfidf values for first line of the above corpus.
# Here the output is a sparse matrix

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [None]:
# sklearn tfidf values for first line of the above corpus.
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Custom implementation

In [None]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
from math import log
import operator
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Importing sklearn for comparison with the custom implementation.
# NOTE(review): this cell repeats the sklearn cell at the top of the notebook and
# uses the `corpus` defined there; the next cell assigns the same four
# sentences to `corpus` again.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [None]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',]

In [None]:
def fit(text):
  """Build the vocabulary of a corpus.

  Every document is split on single spaces; tokens shorter than two
  characters are discarded. The remaining distinct tokens are sorted and
  numbered from 0.

  Parameters:
    text: list of document strings.

  Returns:
    dict mapping each token to its column index, or None (after printing a
    hint) when `text` is not a list.
  """
  if not isinstance(text, list):
    print("Pass the text as a list - fit")
    return None
  tokens = {tok for doc in text for tok in doc.split(" ") if len(tok) >= 2}
  return {tok: position for position, tok in enumerate(sorted(tokens))}

In [None]:
# Compare the custom vocabulary with sklearn's.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
# `.tolist()` keeps the printed value a plain list of Python strings.
if hasattr(vectorizer, 'get_feature_names_out'):
    skl_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    skl_feature_names = vectorizer.get_feature_names()
print('fit function - custom implementation: \n',fit(corpus).keys())
print('fit function - sklearn implementation: \n',skl_feature_names)

fit function - custom implementation: 
 dict_keys(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'])
fit function - sklearn implementation: 
 ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
def get_BoW(text, voc):
  """Build the sparse bag-of-words count matrix of a corpus.

  Parameters:
    text: list of document strings (split on whitespace).
    voc: dict mapping a token to its column index.

  Returns:
    csr_matrix of shape (len(text), len(voc)) holding the number of times each
    vocabulary token occurs in each document, or None (after printing a hint)
    when `text` is not a list.
  """
  if not isinstance(text, list):
    print("Pass the text as a list - transform")
    return None
  doc_ids, term_ids, counts = [], [], []
  for doc_id, doc in enumerate(text):
    for token, count in Counter(doc.split()).items():
      column = voc.get(token, -1)
      # Keep only tokens of two or more characters that are in the vocabulary.
      if len(token) >= 2 and column != -1:
        doc_ids.append(doc_id)
        term_ids.append(column)
        counts.append(count)
  return csr_matrix((counts, (doc_ids, term_ids)), shape=(len(text), len(voc)))

In [None]:
def myTF(corpus):
  """Return the dense term-frequency matrix of `corpus`.

  Rows are documents and columns are vocabulary terms in the order given by
  `fit`. Each entry is the term's count in the document divided by the total
  count of vocabulary terms in that document.

  The vocabulary and the count matrix are built once (the previous version
  rebuilt them for every use) and each row is divided in one vectorised step
  instead of adding a one-element sparse matrix per non-zero cell.

  Parameters:
    corpus: list of document strings.

  Returns:
    numpy array of floats with shape (number of documents, vocabulary size).
  """
  counts = get_BoW(corpus, fit(corpus)).toarray().astype(float)
  totals = counts.sum(axis=1, keepdims=True)
  # A document without any vocabulary term keeps an all-zero row; dividing by 1
  # instead of 0 avoids producing NaN for it.
  totals[totals == 0] = 1.0
  return counts / totals

In [None]:
def myIDF(text):
  """Return the smoothed IDF value of every vocabulary term of `text`.

  Uses the same formula the notebook compares against sklearn:
      idf(t) = 1 + ln((1 + N) / (1 + df(t)))
  where N is the number of documents and df(t) the number of documents that
  contain term t. Values are rounded to 8 decimals.

  Document frequencies are counted in a single pass over the corpus (each
  document contributes at most once per term); the previous version rescanned
  the whole corpus for every token occurrence.

  Parameters:
    text: list of document strings (split on single spaces).

  Returns:
    list of IDF values ordered like the vocabulary returned by `fit`; an empty
    list (after printing a hint) when `text` is not a list.
  """
  if not isinstance(text, (list,)):
    print("Pass the text as a list - tfidf")
    return []
  n_docs = len(text)
  doc_freq = {}
  for row in text:
    # set(): a term repeated inside one document still counts as one document.
    for w in set(row.split(" ")):
      doc_freq[w] = doc_freq.get(w, 0) + 1
  return [round(1 + log((1 + n_docs)/(1 + doc_freq[term])),8) for term in fit(text)]

In [None]:
print('Custom IDF: \n',myIDF(corpus))
print('sklearn IDF: \n',vectorizer.idf_)

Custom IDF: 
 [1.91629073, 1.22314355, 1.51082562, 1.0, 1.91629073, 1.91629073, 1.0, 1.91629073, 1.0]
sklearn IDF: 
 [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
def transform(corpus):
  """Return the L2-normalised TF-IDF matrix of `corpus` as a sparse matrix.

  Steps:
    1. term frequencies from `myTF` (documents x terms, dense);
    2. each column is multiplied by the matching IDF value from `myIDF`;
    3. every row is divided by its Euclidean norm.

  The element-by-element loops, which performed one sparse-matrix addition
  per cell, are replaced by array arithmetic. Rows that are entirely zero are
  left as zeros instead of becoming NaN through a 0/0 division.

  Parameters:
    corpus: list of document strings.

  Returns:
    csr_matrix with shape (number of documents, vocabulary size).
  """
  TF = myTF(corpus)
  IDF = myIDF(corpus)
  # Broadcasting multiplies column k of TF by IDF[k].
  weighted = TF * IDF
  norms = (weighted ** 2).sum(axis=1, keepdims=True) ** 0.5
  # Avoid 0/0 for documents that contain no vocabulary term.
  norms[norms == 0] = 1.0
  return csr_matrix(weighted / norms)

In [None]:
print('Diamension of custom TFIDF matrix: ',transform(corpus).shape)
print('Diamension of sklearn TFIDF matrix: ',skl_output.shape)

Diamension of custom TFIDF matrix:  (4, 9)
Diamension of sklearn TFIDF matrix:  (4, 9)


In [None]:
print('Custom TFIDF: \n',transform(corpus)[0])
print('sklearn TFIDF: \n',skl_output[0])

Custom TFIDF: 
   (0, 1)	0.46979138558088085
  (0, 2)	0.5802858228626505
  (0, 3)	0.3840852413282814
  (0, 6)	0.3840852413282814
  (0, 8)	0.3840852413282814
sklearn TFIDF: 
   (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


### Implement max features functionality

The implementation below restricts the vocabulary to the 50 terms with the highest IDF values.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
# Load the pre-cleaned documents from Google Drive. This rebinds `corpus`,
# replacing the four-sentence toy corpus used in the sections above.
# NOTE(review): the absolute Colab path is hard-coded, and pickle.load can run
# arbitrary code while unpickling -- only open files from a trusted source.
# Presumably the file holds a list of strings (the functions below require a
# list); confirm against the data file.
with open('/content/drive/My Drive/Applied AI Course/Assignments/cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [None]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
from math import log
import operator
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Importing sklearn for comparison: refit the reference vectorizer on the
# corpus loaded from the pickle file, replacing the `vectorizer` and
# `skl_output` that were fitted on the toy corpus.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [None]:
def order_words(text):
  """Return the alphabetically ordered vocabulary of a corpus.

  Documents are split on single spaces and tokens shorter than two characters
  are ignored. The distinct remaining tokens are sorted and numbered from 0.

  Parameters:
    text: list of document strings.

  Returns:
    dict mapping each token to its position in the sorted vocabulary, or None
    (after printing a hint) when `text` is not a list.
  """
  if not isinstance(text, list):
    print("Pass the text as a list - order_words")
    return None
  distinct_terms = set()
  for document in text:
    distinct_terms.update(term for term in document.split(" ") if len(term) >= 2)
  return dict(zip(sorted(distinct_terms), range(len(distinct_terms))))

In [None]:
def get_BoW(text, voc):
  """Count vocabulary terms per document and return the counts as a sparse matrix.

  This definition repeats the `get_BoW` defined earlier in the notebook.

  Parameters:
    text: list of document strings (split on whitespace).
    voc: dict mapping a term to its column index.

  Returns:
    csr_matrix of shape (len(text), len(voc)) with the per-document term
    counts, or None (after printing a hint) when `text` is not a list.
  """
  if isinstance(text, list):
    entries = []  # (row, column, count) triples of the non-zero cells
    for row_no, sentence in enumerate(text):
      for term, n_times in Counter(sentence.split()).items():
        position = voc.get(term, -1)
        # Skip one-character tokens and terms missing from the vocabulary.
        if len(term) < 2 or position == -1:
          continue
        entries.append((row_no, position, n_times))
    row_idx = [entry[0] for entry in entries]
    col_idx = [entry[1] for entry in entries]
    data = [entry[2] for entry in entries]
    return csr_matrix((data, (row_idx, col_idx)), shape=(len(text), len(voc)))
  print("Pass the text as a list - transform")

In [None]:
def myTF(corpus,BoW,doc_count_pub,vocab_len_pub):
  """Return the dense term-frequency matrix for a bag-of-words matrix.

  Each row of `BoW` is divided by its own sum, so an entry is the share of the
  document's vocabulary-term occurrences that belongs to that term. Rows with
  no vocabulary term stay all zeros.

  The previous version added a one-element sparse matrix to the result for
  every non-zero cell; dividing whole rows gives the same values without that
  repeated matrix addition.

  Parameters:
    corpus: unused by the computation; kept so existing callers keep working.
    BoW: sparse count matrix (documents x terms), e.g. from `get_BoW`.
    doc_count_pub: number of rows of the returned matrix.
    vocab_len_pub: number of columns of the returned matrix.

  Returns:
    numpy float array of shape (doc_count_pub, vocab_len_pub).
  """
  TF = np.zeros((doc_count_pub, vocab_len_pub))
  for row_idx, counts in enumerate(BoW.toarray()):
    total = counts.sum()
    # Skip empty rows: there is nothing to divide and 0/0 would give NaN.
    if total:
      TF[row_idx] = counts / total
  return TF

In [None]:
def IDF_vocab(text,I_or_v,max_features=50):
  """Compute smoothed IDF values and keep the `max_features` highest-IDF terms.

  The IDF follows the sklearn-style smoothed formula
      idf(t) = 1 + ln((1 + N) / (1 + df(t)))
  where N is the number of documents in the collection and df(t) is the number
  of documents containing term t. Values are rounded to 8 decimals to match
  the output of sklearn.

  Candidate terms come from `order_words`, i.e. in alphabetical order. They are
  ranked by descending IDF with a stable sort, so terms with equal IDF keep
  their alphabetical order. The selected terms are then sorted alphabetically
  again and numbered from 0.

  Document frequencies are counted in one pass over the collection (a document
  contributes at most once per term). The previous version rescanned every
  document for each token occurrence.

  Parameters:
    text: list of document strings (split on single spaces). If it is not a
      list, a message is printed and the empty result for the requested mode
      is returned.
    I_or_v: selects the return value:
      'Iv' - dict term -> IDF value, ordered alphabetically by term;
      'I'  - list of IDF values in that same order;
      'v'  - dict term -> index, ordered alphabetically by term.
    max_features: number of highest-IDF terms to keep (default 50, the value
      that used to be hard-coded); None keeps every term.

  Returns:
    The structure selected by `I_or_v`, or None for any other selector.
  """
  # Term -> IDF value for every vocabulary term
  idf = {}
  # Check if the collection is a list
  if isinstance(text, (list,)):
    # Total number of documents in the collection
    row_cnt = len(text)
    # Term -> number of documents that contain it
    w_df = {}
    for row in text:
      # set() ensures a term repeated inside one document is counted once
      for w in set(row.split(" ")):
        w_df[w] = w_df.get(w, 0) + 1
    # Create IDF values for all the terms of the vocabulary from "order_words"
    for key in order_words(text):
      idf[key] = round(1 + log((1 + row_cnt)/(1 + w_df[key])),8)
  # else to show a message if the collection is not given as list (list of documents)
  else:
    print("Pass the text as a list - tfidf")
  # Sort by IDF (highest first, stable) and keep the top `max_features` items
  idf = dict(sorted(idf.items(),key=lambda x:x[1],reverse=True)[:max_features])
  # Alphabetically ordered vocabulary with index numbers
  vocab = {term:index for index,term in enumerate(sorted(idf))}
  # Same vocabulary order, paired with the IDF values
  vocab_IDF = {term:idf[term] for term in vocab}
  if I_or_v == 'Iv':
    # Return vocabulary with IDF values in dictionary - sorted by words
    return(vocab_IDF)
  elif I_or_v == 'I':
    # Return only IDF values in list - sorted by words
    return(list(vocab_IDF.values()))
  elif I_or_v == 'v':
    # Return vocabulary with index ID in dictionary - sorted by words
    return(vocab)
  # Unknown selector: nothing is returned, as before
  return None

In [None]:
def fit(corpus):
  """Fit the custom TF-IDF model on `corpus` and return its vocabulary.

  Side effects: stores the fitted state in module-level variables that
  `transform` reads afterwards:
    fit_corpus    - the corpus that was fitted;
    BoW           - sparse count matrix from `get_BoW`;
    doc_count_pub - number of documents (rows of BoW);
    vocab_len_pub - number of vocabulary terms;
    TF_pub        - term-frequency matrix from `myTF`;
    IDF_pub       - list of IDF values in vocabulary order.

  `IDF_vocab` is called once in 'Iv' mode and both the vocabulary and the IDF
  list are derived from that result; the previous version ran the whole IDF
  computation twice. The document count is read from the sparse matrix shape
  rather than from a dense copy.

  Parameters:
    corpus: list of document strings.

  Returns:
    dict mapping each selected term to its column index.
  """
  global IDF_pub
  global TF_pub
  global doc_count_pub
  global vocab_len_pub
  global BoW
  global fit_corpus
  fit_corpus = corpus
  # 'Iv' yields term -> IDF with the terms in alphabetical order, so numbering
  # its keys gives the vocabulary and its values give the IDF list.
  vocab_IDF = IDF_vocab(corpus,'Iv')
  vocab_return = {term:index for index,term in enumerate(vocab_IDF)}
  IDF_pub = list(vocab_IDF.values())
  BoW = get_BoW(corpus,vocab_return)
  doc_count_pub = BoW.shape[0]
  vocab_len_pub = len(vocab_return)
  TF_pub = myTF(corpus,BoW,doc_count_pub,vocab_len_pub)
  return(vocab_return)

In [None]:
def myIDF(corpus):
  """Return the IDF values of the selected terms as a list (IDF_vocab mode 'I').

  Replaces the `myIDF` defined earlier in the notebook.
  """
  return(IDF_vocab(corpus,'I'))
def vocab_with_IDF(corpus):
  """Return a dict mapping each selected term to its IDF value (IDF_vocab mode 'Iv')."""
  return(IDF_vocab(corpus,'Iv'))

print('fit function returning 50 terms with top IDF values: \n',fit(corpus))
print('Top 50 IDF values sorted by ascending order of words: \n',myIDF(corpus))
print('Vocabulary with IDF values for top 50 words: \n',vocab_with_IDF(corpus))

fit function returning 50 terms with top IDF values: 
 {'aailiyah': 0, 'abandoned': 1, 'abroad': 2, 'abstruse': 3, 'academy': 4, 'accents': 5, 'accessible': 6, 'acclaimed': 7, 'accolades': 8, 'accurate': 9, 'accurately': 10, 'achille': 11, 'ackerman': 12, 'actions': 13, 'adams': 14, 'add': 15, 'added': 16, 'admins': 17, 'admiration': 18, 'admitted': 19, 'adrift': 20, 'adventure': 21, 'aesthetically': 22, 'affected': 23, 'affleck': 24, 'afternoon': 25, 'aged': 26, 'ages': 27, 'agree': 28, 'agreed': 29, 'aimless': 30, 'aired': 31, 'akasha': 32, 'akin': 33, 'alert': 34, 'alike': 35, 'allison': 36, 'allow': 37, 'allowing': 38, 'alongside': 39, 'amateurish': 40, 'amaze': 41, 'amazed': 42, 'amazingly': 43, 'amusing': 44, 'amust': 45, 'anatomist': 46, 'angel': 47, 'angela': 48, 'angelina': 49}
Top 50 IDF values sorted by ascending order of words: 
 [6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922918, 6.922

In [None]:
def transform(corpus):
  """Return L2-normalised TF-IDF values computed from the last `fit` call.

  The matrix is always computed for the fitted corpus, from the module-level
  `TF_pub` and `IDF_pub` set by `fit`. `corpus` only selects what is returned:

    * `corpus` equal to the fitted corpus: the whole matrix;
    * a one-element list, when more than one document was fitted: the row of
      the first fitted document equal to that element, or None if there is no
      such document;
    * anything else: a message is printed and None is returned.

  The element-by-element loops, which performed one sparse-matrix addition
  per cell, are replaced by array arithmetic. Rows without any vocabulary
  term stay all zeros (no 0/0 division is performed).

  Parameters:
    corpus: list of document strings.

  Returns:
    csr_matrix (whole matrix or a single row) or None as described above.
  """
  # Broadcasting multiplies column k of the TF matrix by IDF_pub[k].
  weighted = TF_pub * IDF_pub
  norms = (weighted ** 2).sum(axis=1, keepdims=True) ** 0.5
  # Documents with no vocabulary term have norm 0; divide by 1 to keep zeros.
  norms[norms == 0] = 1.0
  TFIDF = csr_matrix(weighted / norms)

  if corpus == fit_corpus:
    return(TFIDF)
  elif len(fit_corpus) > 1 and len(corpus) == 1:
    for row_idx, fitted_doc in enumerate(fit_corpus):
      if fitted_doc == corpus[0]:
        return(TFIDF[row_idx])
    # The document was not part of the fitted corpus.
    return None
  else:
    print('The dimension of fit data and transform data are not matching')

In [None]:
fit(corpus)
print('transform function returning sparse matrix, converting to array: \n',transform(corpus).toarray())

transform function returning sparse matrix, converting to array: 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
fit(corpus)
print('Output of a single document in the collection: \n',transform([corpus[500]]).toarray())

Output of a single document in the collection: 
 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0.]]
