CODE FOR TASK1

In [10]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
def fit(dataset):
    '''
    Makes vocabulary from a corpus
    '''
    unique_words = set()   #For keeping only unique words
    if isinstance(dataset, (list,)):  #Checks if the given corpus is in the format of list or not
        for row in dataset:           #For each document in the dataset
            for word in row.split():  #Creates a list of words in the document
                if len(word) < 2:     
                    continue
                unique_words.add(word)
        vocab = {j : i for i, j in enumerate(sorted(list(unique_words)))} #Creates a dictionary of all unique words
        return vocab
    else:
         print('You need to pass a list of sentences')

In [17]:
corpus = ['this is the first document',
          'this document is the second document',
          'and this is the third one',
          'is this the first document']
vocab = fit(corpus)
print(vocab)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


In [18]:
def DocumentLength(doc):
    '''
    Computes the length of the given document
    '''
    doc_length = 0
    for ele in doc:  
        doc_length += 1
    return doc_length

In [19]:
def WordOccurrence(doc, word):
    '''
    Computes the number of times the given term occurred in the given document
    '''
    word_frequency = 0
    for ele in doc:
        if ele == word:
            word_frequency += 1
    return word_frequency

In [20]:
def TotalDocumentNumber(dataset):
    '''
    Count the number of documents in the given corpus
    '''
    total_document = 0
    for doc in dataset:
        total_document += 1
    return total_document

In [21]:
def DocumentWithTerm(dataset, word):
    '''
    Count the number of documents which has the given term in it
    '''
    docs_with_term = 0
    for doc in dataset:
        for doc_word in doc.split():
            if doc_word == word:
                docs_with_term += 1
                break
    return docs_with_term

In [22]:
def transform(dataset):
    '''
    Transforms the given corpus into TFIDF vectorizer
    '''
    if isinstance(dataset, (list,)):  #checks if the given corpus is in the format of list or not
        rows = []     
        columns = []
        values = []
        IDF = []
        #vocab contains the unique words and the dimension of each word in the format of dictionary where the key
        #represents the unique word and the value of the key is the dimension of that word
        vocab = fit(dataset)   
        #total_documents holds the number of documents in the corpus
        total_documents = TotalDocumentNumber(dataset)
        for word in vocab:   
            IDF.append(1 + np.log((1 + total_documents) / (1 + DocumentWithTerm(dataset, word)))) #Computes the
            #IDF value of the each unique word in the vocabulary 
        for idx, doc in enumerate(tqdm(dataset)):
            for word in list(set(doc.split())): #Splits the document which is in the format of string and makes 
                #a list of unique words in the document
                if len(word) < 2:
                    continue
                for key in vocab: #Iterates through the each key in the vocabulary
                    if key == word: #If the matching key is found then the dimension of the word is stored in the 
                        column_index = vocab[key] #column index
                        break
                rows.append(idx)
                columns.append(column_index)
                #tf holds the tf value of a term in the current document
                tf = WordOccurrence(doc.split(), word) / DocumentLength(doc.split())
                values.append(tf * IDF[column_index])
        return csr_matrix((values, (rows, columns)), shape = (len(dataset), len(vocab)))
    else:
        print('You need to pass a list of sentences')

In [23]:
tfidf = transform(corpus)  #tfidf holds the tfidf values of the documents in the corpus
tfidfl2 = normalize(tfidf) #tfidfl2 holds the L2 normalize values of the documents in the corpus 
print(tfidfl2[0])

100%|██████████| 4/4 [00:00<00:00, 1284.63it/s]

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149





In [24]:
#For comparing result with the sklearn implementation of the TFIDF
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)
print(vectorizer.get_feature_names())
print(vectorizer.idf_)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [25]:
skl_output.shape

(4, 9)

In [26]:
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [27]:
print(skl_output.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [28]:
print(tfidfl2.toarray()) #prints the tfidf values of all the documents in the dense form

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


CODE FOR TASK2

In [1]:
import pickle

In [2]:
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)

In [3]:
def UniqueWords(dataset):
    '''
    Counts unique words in the given corpus
    '''
    unique_words = set()
    for doc in dataset:   #To iterate through each document
        for word in doc.split():  #To iterate through each term in the document
            if len(word) < 2:
                continue
            unique_words.add(word)
    return unique_words

In [4]:
def DocumentLength(doc):
    '''
    Finds the length of the given document
    '''
    doc_length = 0
    for word in doc.split():   #To iterate through each term in the document
        doc_length += 1
    return doc_length

In [5]:
def TotalDocumentNumber(dataset):
    '''
    Counts the total document in the given corpus
    '''
    doc_number = 0
    for doc in dataset:  #To iterate through each document in the corpus
        doc_number += 1
    return doc_number

In [6]:
def WordOccurrence(doc, word):
    '''
    Counts the occurrences of the given term in the given document
    '''
    word_occ = 0
    for ele in doc.split(): #To iterate through each term in the document
        if ele == word:
            word_occ += 1
    return word_occ

In [7]:
def DocumentWithTerm(dataset, word):
    '''
    Counts the documents which have the given term
    '''
    docs_with_term = 0
    for doc in dataset:  #To iterate through each document in the corpus
        for ele in doc.split(): #To iterate through each term in a document
            if ele == word:
                docs_with_term += 1
                break
    return docs_with_term

In [8]:
def fit(dataset):
    '''
    Constructs the vocabulary of the top 50 terms based on their idf values 
    '''
    if isinstance(dataset, (list,)):
        unique_words = UniqueWords(dataset)
        IDF = []
        temp_idf = []
        #At the end of the for loop we get a list named IDF containing idf values of all unique words
        for word in list(unique_words):
            lst = []
            idf = 1 + np.log((1 + TotalDocumentNumber(dataset)) / (1 + DocumentWithTerm(dataset,word)))
            lst.append(word)
            lst.append(idf)
            IDF.append(lst)
        lst = []
        for ele in IDF:
            lst.append(ele[1])  #Taking only idf values in the list lst
        lst = sorted(lst, reverse = True)  #Sorting the list lst in decreasing order of idf values
        for i in range(0, 50):   #To iterate only through top 50 idf values
            for j in range(0, len(IDF)):  #To iterate through the list of idf values 
                if lst[i] == IDF[j][1]:   #To check if the idf value of a term is in the list of top 50 idf values
                    temp_idf.append(IDF[j]) #If it is,then append the value in the list temp_idf
                    IDF.pop(j) #Here we are popping the term whose idf value is in the list of top 50 values, otherwise 
                    break      #we may get the idf value of a term many times in the list temp_idf if some idf values are equal in list IDF
        vocab = {temp_idf[i][0] : i for i in range(0, 50)} #To constructs the vocabulary of the top 50 terms according to the idf values
        return vocab, temp_idf

In [11]:
vocab, IDF = fit(corpus)
print('The vocabulary of the top  50 terms based on their idf values')
print(vocab)

The vocabulary of the top  50 terms based on their idf values
{'views': 0, 'lid': 1, 'lestat': 2, 'represents': 3, 'george': 4, 'rough': 5, 'spacey': 6, 'redeemed': 7, 'scripted': 8, 'mystifying': 9, 'rumbles': 10, 'tender': 11, 'conrad': 12, 'seat': 13, 'thanks': 14, 'affected': 15, 'empowerment': 16, 'iffy': 17, 'guys': 18, 'chimp': 19, 'judging': 20, 'cases': 21, 'cry': 22, 'faultless': 23, 'weaving': 24, 'rpg': 25, 'celebrity': 26, 'traditional': 27, 'bond': 28, 'artistic': 29, 'value': 30, 'juano': 31, 'embassy': 32, 'fifties': 33, 'decisions': 34, 'greatness': 35, 'dysfunction': 36, 'strange': 37, 'messages': 38, 'bakery': 39, 'elderly': 40, 'random': 41, 'stuart': 42, 'boogeyman': 43, 'fare': 44, 'tear': 45, 'ben': 46, 'oriented': 47, 'cuts': 48, 'distressed': 49}


In [12]:
print('The top 50 idf values')
print(IDF)

The top 50 idf values
[['views', 6.922918004572872], ['lid', 6.922918004572872], ['lestat', 6.922918004572872], ['represents', 6.922918004572872], ['george', 6.922918004572872], ['rough', 6.922918004572872], ['spacey', 6.922918004572872], ['redeemed', 6.922918004572872], ['scripted', 6.922918004572872], ['mystifying', 6.922918004572872], ['rumbles', 6.922918004572872], ['tender', 6.922918004572872], ['conrad', 6.922918004572872], ['seat', 6.922918004572872], ['thanks', 6.922918004572872], ['affected', 6.922918004572872], ['empowerment', 6.922918004572872], ['iffy', 6.922918004572872], ['guys', 6.922918004572872], ['chimp', 6.922918004572872], ['judging', 6.922918004572872], ['cases', 6.922918004572872], ['cry', 6.922918004572872], ['faultless', 6.922918004572872], ['weaving', 6.922918004572872], ['rpg', 6.922918004572872], ['celebrity', 6.922918004572872], ['traditional', 6.922918004572872], ['bond', 6.922918004572872], ['artistic', 6.922918004572872], ['value', 6.922918004572872], ['j

In [13]:
def transform(dataset):
    '''
    Transforms the given corpus into TFIDF vectorizer containing only the top 50 terms based on their idf values 
    '''
    if isinstance(dataset, (list,)): #To check whether the given corpus is in the form of list
        rows = []
        columns = []
        values = []
        vocab, IDF = fit(dataset)   #vocab contains the vocabulary of the top 50 terms based on their idf values, and IDF contains the idf values of the corresponding term  in the vocabulary
        totaldocument = TotalDocumentNumber(dataset)
        for idx, doc in enumerate(tqdm(dataset)):
            for term in list(set(doc.split())):  #Splits the document which is in the format of string, and makes a list of all unique words
                if len(term) < 2:
                    continue
                for key in vocab:  #At the end of this for loop we get tfidf values of the top 50 terms based on their idf values
                    #in the list 'values',and the lists 'rows' and 'columns' contain the corresponding row and column values of the term which is in the vocabulary
                    if key == term:
                        column_index = vocab[key]
                        rows.append(idx)
                        columns.append(column_index)
                        tf = WordOccurrence(doc, term) / DocumentLength(doc)
                        values.append(tf * IDF[column_index][1])
                        break
        return csr_matrix((values, (rows, columns)), shape = (len(dataset), len(vocab)))  #transform function returns the tfidf vectorizer in the form of sparse matrix
    else:
        print('You need to pass a list of documents')

In [14]:
tfidf = transform(corpus) #tfidf contains the sparse matrix of the corpus
tfidfl2 = normalize(tfidf) #tfidfl2 contains the l2 normalized sparse matrix of the corpus

100%|██████████| 746/746 [00:00<00:00, 21438.81it/s]


In [15]:
print('The l2 normalized top 50 tfidf values in the form of sparse matrix')
print(tfidfl2)
print('The l2 normalized top 50 tfidf values in the form of dense matrix')
print(tfidfl2.toarray())

The l2 normalized top 50 tfidf values in the form of sparse matrix
  (0, 49)	1.0
  (7, 38)	1.0
  (19, 44)	1.0
  (60, 13)	1.0
  (68, 46)	1.0
  (75, 21)	1.0
  (80, 22)	1.0
  (87, 7)	1.0
  (106, 16)	1.0
  (109, 43)	1.0
  (135, 10)	0.5
  (135, 14)	0.5
  (135, 33)	0.5
  (135, 35)	0.5
  (137, 29)	1.0
  (148, 39)	0.7071067811865476
  (148, 45)	0.7071067811865476
  (161, 28)	1.0
  (190, 23)	1.0
  (193, 15)	1.0
  (205, 3)	1.0
  (222, 1)	1.0
  (238, 19)	1.0
  (254, 30)	1.0
  (277, 40)	1.0
  (305, 0)	1.0
  (310, 5)	1.0
  (350, 12)	1.0
  (396, 11)	0.7071067811865476
  (396, 47)	0.7071067811865476
  (434, 4)	1.0
  (436, 41)	1.0
  (437, 32)	1.0
  (447, 17)	1.0
  (475, 20)	1.0
  (516, 25)	1.0
  (517, 48)	1.0
  (547, 2)	0.7071067811865476
  (547, 42)	0.7071067811865476
  (550, 36)	1.0
  (572, 37)	1.0
  (619, 8)	1.0
  (628, 31)	1.0
  (644, 6)	0.5773502691896257
  (644, 26)	0.5773502691896257
  (644, 34)	0.5773502691896257
  (652, 9)	1.0
  (665, 18)	1.0
  (715, 27)	1.0
  (734, 24)	1.0
The l2 normalized 