### Corpus

In [1]:
# Toy corpus: four short documents used to compare sklearn with the custom TF-IDF.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

### SkLearn Implementation

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary / IDF weights on the corpus, then vectorise the same corpus.
# `fit` returns the vectorizer itself, so the two steps can be chained.
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit(corpus).transform(corpus)

In [3]:
# sklearn feature names, they are sorted in alphabetic order by default.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it and returns a NumPy array, so it is
# converted to a plain list to keep the printed form the same as before.

print(vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']




In [4]:
# Print the IDF weights learned by the sklearn TfidfVectorizer during `fit`.
# The array holds one IDF value per vocabulary word (9 words for this corpus),
# in the same order as the feature names printed above.

print(vectorizer.idf_)


[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [5]:
# Shape of the sklearn TF-IDF output produced by `transform`:
# one row per document, one column per vocabulary word.

skl_output.shape

(4, 9)

In [6]:
# sklearn TF-IDF values for the first document of the corpus above.
# The output is a sparse matrix, so only the non-zero (row, column) entries are printed.

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [7]:
# sklearn TF-IDF values for every document of the above corpus (one row per document).
# To understand the output better, the sparse output matrix is converted to a dense matrix before printing.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [8]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

In [9]:
# Same four-document toy corpus as above, redefined for the custom implementation.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]
# Number of documents (N in the IDF formula).
print(len(corpus))

4


In [10]:
# Compute the smoothed inverse document frequency (IDF) of the vocabulary words.
def IDF(corpus, unique_words):
    """Return a ``{word: idf}`` dictionary for the words of ``unique_words``.

    The IDF of a word is ``1 + ln((1 + N) / (1 + df))`` where ``N`` is the
    number of documents in ``corpus`` and ``df`` is the number of documents
    that contain the word as a whitespace-separated token.

    Parameters
    ----------
    corpus : list of str
        The documents; each one is tokenised with ``str.split()``.
    unique_words : iterable of str
        The words whose IDF is wanted.

    Returns
    -------
    dict
        Maps each word to its IDF, in the iteration order of ``unique_words``.
        Words that occur in no document are left out of the result.
    """
    N = len(corpus)  # number of documents, used in the IDF formula
    # Tokenise every document once (instead of once per vocabulary word) and
    # keep the tokens in a set so each membership test is O(1).
    document_tokens = [set(review.split()) for review in corpus]

    idf_dict = {}
    for word in unique_words:
        # Document frequency: number of documents containing the word.
        count = sum(1 for tokens in document_tokens if word in tokens)
        if count:
            idf_dict[word] = 1 + math.log((1 + N) / (count + 1))
    return idf_dict

In [11]:
# Build the vocabulary (word -> column index) and the IDF of every vocabulary word.
def fit(corpus):
    """Learn the vocabulary and IDF values from ``corpus``.

    Parameters
    ----------
    corpus : list of str
        The documents to learn from.

    Returns
    -------
    vocab : dict
        Maps each kept word to its column index; words are numbered in
        alphabetical order.
    Idf_values_of_all_unique_words : dict
        The result of ``IDF(corpus, unique_words)`` for the kept words.

    Raises
    ------
    TypeError
        If ``corpus`` is not a list. Previously this case fell through to the
        ``return`` statement and failed with an UnboundLocalError.
    """
    if not isinstance(corpus, list):
        raise TypeError("corpus must be a list of strings, got " + type(corpus).__name__)

    # A set keeps each word once. Words shorter than 2 characters are ignored.
    # `split()` (any whitespace) is used so the tokens match the ones that
    # `IDF` and `transform` produce; `split(" ")` would keep tabs/newlines
    # inside tokens and create vocabulary entries that can never be matched.
    unique_words = sorted(
        {word for review in corpus for word in review.split() if len(word) >= 2}
    )
    # The position in the sorted list is the column index used in the matrix.
    vocab = {word: index for index, word in enumerate(unique_words)}
    Idf_values_of_all_unique_words = IDF(corpus, unique_words)
    return vocab, Idf_values_of_all_unique_words

# Fit on the toy corpus and show the vocabulary followed by the IDF table.
Vocabulary, idf_of_vocabulary = fit(corpus)
# The "\n" item joined with sep="\n" leaves two blank lines between the two dictionaries.
print(Vocabulary, "\n", idf_of_vocabulary, sep="\n")

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


{'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}


In [12]:
# Construct the L2-normalised sparse TF-IDF matrix from the fitted vocabulary and IDF values.

def transform(corpus,vocabulary,idf_of_vocabulary):
    """Vectorise ``corpus`` into a row-normalised sparse TF-IDF matrix.

    Parameters
    ----------
    corpus : list of str
        The documents; each one is tokenised with ``str.split()``.
    vocabulary : dict
        Maps a word to its column index.
    idf_of_vocabulary : dict
        Maps a word to its IDF; must contain every vocabulary word that
        occurs in ``corpus``.

    Returns
    -------
    scipy.sparse matrix of shape ``(len(corpus), len(vocabulary))``
        Entry ``(i, j)`` is ``tf * idf`` of word ``j`` in document ``i``, where
        ``tf`` is the word count divided by the number of tokens in the
        document; every row is then L2-normalised.
    """
    rows, cols, values = [], [], []
    for row, review in enumerate(corpus):
        tokens = review.split()
        # Iterating over the Counter visits each distinct word once, so no
        # (row, column) pair is emitted twice (duplicates would be summed
        # when the matrix is built).
        for word, frequency in Counter(tokens).items():
            if word not in vocabulary:
                continue
            rows.append(row)
            cols.append(vocabulary[word])
            values.append((frequency / len(tokens)) * idf_of_vocabulary[word])

    # Build the matrix in one go from (value, (row, col)) triplets; assigning
    # single elements of a CSR matrix is slow and triggers scipy's
    # SparseEfficiencyWarning.
    sparse_matrix = csr_matrix(
        (values, (rows, cols)),
        shape=(len(corpus), len(vocabulary)),
        dtype=np.float64,
    )
    output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    return output
# Vectorise the toy corpus with the custom implementation and print the
# non-zero entries of the resulting sparse matrix.
final_output = transform(corpus, Vocabulary, idf_of_vocabulary)
print(final_output)

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


  self._set_intXint(row, col, x.flat[0])


In [13]:
# Dense view of the custom TF-IDF matrix (one row per document, one column per vocabulary word).
final_output.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [14]:
# Shape of the custom TF-IDF matrix: (number of documents, vocabulary size).
final_output.toarray().shape

(4, 9)

### In the below code we will limit the maximum number of features

In [38]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type
# `google.colab` is imported here, so this cell is written for a Colab runtime.
from google.colab import files
uploaded = files.upload()

# NOTE(review): unpickling can execute arbitrary code; only load a file from a trusted source.
# NOTE(review): the recorded output shows the upload being saved as "cleaned_strings (1)",
# while the path opened below is 'cleaned_strings' -- confirm the intended file is the one read.
import pickle
with open('cleaned_strings', 'rb') as f:
    Corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(Corpus))

Saving cleaned_strings to cleaned_strings (1)
Number of documents in corpus =  746


In [39]:
# Check the container type of the loaded corpus (the `fit` function below only accepts a list).
type(Corpus)

list

In [40]:
# Preview the first 20 documents of the loaded corpus.
Corpus[0:20]

['slow moving aimless movie distressed drifting young man',
 'not sure lost flat characters audience nearly half walked',
 'attempting artiness black white clever camera angles movie disappointed became even ridiculous acting poor plot lines almost non existent',
 'little music anything speak',
 'best scene movie gerardo trying find song keeps running head',
 'rest movie lacks art charm meaning emptiness works guess empty',
 'wasted two hours',
 'saw movie today thought good effort good messages kids',
 'bit predictable',
 'loved casting jimmy buffet science teacher',
 'baby owls adorable',
 'movie showed lot florida best made look appealing',
 'songs best muppets hilarious',
 'cool',
 'right case movie delivers everything almost right face',
 'average acting main person low budget clearly see',
 'review long overdue since consider tale two sisters single greatest film ever made',
 'put gem movie terms screenplay cinematography acting post production editing directing aspect film makin

In [None]:
# Write your code here.
# Try not to hardcode any values.
# Make sure it's well documented and readable with appropriate comments.

In [56]:
# Compute the smoothed IDF of the vocabulary words and keep only the highest-IDF ones.
def IDF(Corpus, unique_words, max_features=50):
    """Return the ``max_features`` words with the highest IDF as ``{word: idf}``.

    The IDF of a word is ``1 + ln((1 + N) / (1 + df))`` where ``N`` is the
    number of documents in ``Corpus`` and ``df`` is the number of documents
    that contain the word as a whitespace-separated token.

    Parameters
    ----------
    Corpus : list of str
        The documents; each one is tokenised with ``str.split()``.
    unique_words : iterable of str
        The candidate words.
    max_features : int or None, optional
        Number of words to keep (default 50, the value that used to be
        hard-coded). ``None`` keeps every word. Expected to be non-negative.

    Returns
    -------
    dict
        The kept words mapped to their IDF, ordered by decreasing IDF. Words
        with equal IDF keep the relative order they have in ``unique_words``.
        Words that occur in no document are never included.
    """
    N = len(Corpus)  # number of documents, used in the IDF formula
    # Tokenise every document once (instead of once per vocabulary word) and
    # keep the tokens in a set so each membership test is O(1).
    document_tokens = [set(review.split()) for review in Corpus]

    idf_of_vocabulary = {}
    for word in unique_words:
        # Document frequency: number of documents containing the word.
        count = sum(1 for tokens in document_tokens if word in tokens)
        if count:
            idf_of_vocabulary[word] = 1 + math.log((1 + N) / (count + 1))

    # Sort by IDF, highest first; the sort is stable, so ties keep their order.
    ranked = sorted(idf_of_vocabulary.items(), key=lambda item: item[1], reverse=True)
    # Slicing with None keeps everything, which gives "no limit" for free.
    return dict(ranked[:max_features])

In [57]:
# Build the reduced vocabulary (word -> column index) from the words that IDF() keeps.
def fit(Corpus):
    """Learn the reduced vocabulary and its IDF values from ``Corpus``.

    Parameters
    ----------
    Corpus : list of str
        The documents to learn from.

    Returns
    -------
    New_vocab : dict
        Maps each word returned by ``IDF`` to its column index, numbered in
        the order ``IDF`` returns them.
    idf_vocab_sorted_50_feature : dict
        The ``{word: idf}`` dictionary returned by ``IDF(Corpus, unique_words)``.

    Raises
    ------
    TypeError
        If ``Corpus`` is not a list. Previously this case fell through to the
        ``return`` statement and failed with an UnboundLocalError.
    """
    if not isinstance(Corpus, list):
        raise TypeError("Corpus must be a list of strings, got " + type(Corpus).__name__)

    # A set keeps each word once. Words shorter than 2 characters are ignored.
    # `split()` (any whitespace) is used so the tokens match the ones that
    # `IDF` and `transform` produce; `split(" ")` would keep tabs/newlines
    # inside tokens and create vocabulary entries that can never be matched.
    unique_words = sorted(
        {word for review in Corpus for word in review.split() if len(word) >= 2}
    )
    idf_vocab_sorted_50_feature = IDF(Corpus, unique_words)
    # Number the kept words in the order IDF() returned them; the number is
    # the column index of the word in the TF-IDF matrix.
    New_vocab = {word: index for index, word in enumerate(idf_vocab_sorted_50_feature)}
    return New_vocab, idf_vocab_sorted_50_feature

# Fit on the loaded corpus and show the reduced vocabulary and its IDF values.
New_vocab, idf_vocab_sorted_50_feature=fit(Corpus)       #calling the fit function 
# Print the vocabulary that was just fitted: `Vocabulary` is the leftover
# 9-word vocabulary of the toy corpus, not the result of this call.
print(New_vocab) 
print("\n")
print(idf_vocab_sorted_50_feature)      

{'aailiyah': 0, 'abandoned': 1, 'abroad': 2, 'abstruse': 3, 'academy': 4, 'accents': 5, 'accessible': 6, 'acclaimed': 7, 'accolades': 8, 'accurate': 9, 'accurately': 10, 'achille': 11, 'ackerman': 12, 'actions': 13, 'adams': 14, 'add': 15, 'added': 16, 'admins': 17, 'admiration': 18, 'admitted': 19, 'adrift': 20, 'adventure': 21, 'aesthetically': 22, 'affected': 23, 'affleck': 24, 'afternoon': 25, 'aged': 26, 'ages': 27, 'agree': 28, 'agreed': 29, 'aimless': 30, 'aired': 31, 'akasha': 32, 'akin': 33, 'alert': 34, 'alike': 35, 'allison': 36, 'allow': 37, 'allowing': 38, 'alongside': 39, 'amateurish': 40, 'amaze': 41, 'amazed': 42, 'amazingly': 43, 'amusing': 44, 'amust': 45, 'anatomist': 46, 'angel': 47, 'angela': 48, 'angelina': 49}


{'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.922918004572872, 'abstruse': 6.922918004572872, 'academy': 6.922918004572872, 'accents': 6.922918004572872, 'accessible': 6.922918004572872, 'acclaimed': 6.922918004572872, 'accol

In [58]:
# Construct the L2-normalised sparse TF-IDF matrix restricted to the reduced vocabulary.

def transform(Corpus,New_vocab,idf_vocab_sorted_3_feature):
    """Vectorise ``Corpus`` into a row-normalised sparse TF-IDF matrix.

    Parameters
    ----------
    Corpus : list of str
        The documents; each one is tokenised with ``str.split()``.
    New_vocab : dict
        Maps a word to its column index. Words not in this dictionary are
        ignored.
    idf_vocab_sorted_3_feature : dict
        Maps a word to its IDF; must contain every word of ``New_vocab`` that
        occurs in ``Corpus``. (The parameter name is kept for compatibility;
        the function does not depend on how many words the dictionary holds.)

    Returns
    -------
    scipy.sparse matrix of shape ``(len(Corpus), len(New_vocab))``
        Entry ``(i, j)`` is ``tf * idf`` of word ``j`` in document ``i``, where
        ``tf`` is the word count divided by the number of tokens in the
        document; every row is then L2-normalised.
    """
    rows, cols, values = [], [], []
    for row, review in enumerate(Corpus):
        tokens = review.split()
        # Iterating over the Counter visits each distinct word once, so no
        # (row, column) pair is emitted twice (duplicates would be summed
        # when the matrix is built).
        for word, frequency in Counter(tokens).items():
            if word not in New_vocab:
                continue
            rows.append(row)
            cols.append(New_vocab[word])
            values.append((frequency / len(tokens)) * idf_vocab_sorted_3_feature[word])

    # Build the matrix in one go from (value, (row, col)) triplets; assigning
    # single elements of a CSR matrix is slow and triggers scipy's
    # SparseEfficiencyWarning.
    sparse_matrix = csr_matrix(
        (values, (rows, cols)),
        shape=(len(Corpus), len(New_vocab)),
        dtype=np.float64,
    )
    output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    return output
# Pass the IDF dictionary returned by fit(); `idf_vocab_sorted_3_feature` is only
# the name of transform's parameter and is never defined at notebook level, so
# using it here raises NameError on a fresh kernel.
final_output=transform(Corpus,New_vocab,idf_vocab_sorted_50_feature)
print(final_output)

  (0, 30)	1.0
  (68, 24)	1.0
  (72, 29)	1.0
  (74, 31)	1.0
  (119, 33)	1.0
  (135, 3)	0.37796447300922725
  (135, 10)	0.37796447300922725
  (135, 18)	0.37796447300922725
  (135, 20)	0.37796447300922725
  (135, 36)	0.37796447300922725
  (135, 40)	0.37796447300922725
  (135, 41)	0.37796447300922725
  (176, 49)	1.0
  (181, 13)	1.0
  (192, 21)	1.0
  (193, 23)	1.0
  (216, 2)	1.0
  (222, 47)	1.0
  (225, 19)	1.0
  (227, 17)	1.0
  (241, 44)	1.0
  (270, 1)	1.0
  (290, 25)	1.0
  (333, 26)	1.0
  (334, 15)	1.0
  (341, 43)	1.0
  (344, 42)	1.0
  (348, 8)	1.0
  (377, 37)	1.0
  (409, 5)	1.0
  (430, 39)	1.0
  (457, 45)	1.0
  (461, 4)	1.0
  (465, 38)	1.0
  (475, 35)	1.0
  (493, 6)	1.0
  (500, 48)	1.0
  (548, 0)	0.7071067811865475
  (548, 32)	0.7071067811865475
  (608, 14)	1.0
  (612, 11)	1.0
  (620, 46)	1.0
  (632, 7)	1.0
  (644, 12)	0.7071067811865475
  (644, 27)	0.7071067811865475
  (664, 28)	1.0
  (667, 22)	1.0
  (691, 34)	1.0
  (697, 9)	1.0
  (722, 16)	1.0


  self._set_intXint(row, col, x.flat[0])


In [52]:
# Dense view of the TF-IDF matrix built from the loaded corpus.
# NOTE(review): this cell's recorded execution count (52) is lower than the transform
# cell's (58), so the output shown may come from an earlier run -- re-run to confirm.
final_output.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
# Shape of the TF-IDF matrix: (number of documents, number of kept features).
final_output.toarray().shape

(746, 50)