In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

# tf-idf (term frequency inverse document frequency)

In [2]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [3]:
def fit(dataset):    
    unique_words = set() # at first we will initialize an empty set
    # check if its list type or not
    if isinstance(dataset, list):
        for row in dataset: # for each review in the dataset
            for word in row.split(" "): # for each word in the review. #split method converts a string into list of words
                if len(word) < 2:
                    continue
                unique_words.add(word)
        unique_words = sorted(list(unique_words))
        vocab = {j:i for i,j in enumerate(unique_words)}
        
        return vocab
    else:
        print("you need to pass list of sentance")


def tf(w,r): #here we are creating function to calculate the term_frequency
    # create new reveiw which length>=2
    d=dict(Counter(r.split(" "))) # creating dictionary, word as key and number of times that word occured in r as value
    n_w=d.get(w) # using get method to get the value of key which corresponds as key
    T=len(r.split(" ")) # getting total number of words in review
    return n_w/T # formula for term frequency

def idf(w,dataset): #here creating idf function to calculate idf value
    N=len(dataset)  # length of all reviews in dataset
    n_i=0           # intialising variable 
    for r in dataset: # iterating through dataset to find (in how many reviews our word exists)
        if w in r:
            n_i=n_i+1
    return 1+(np.log((N+1)/(n_i+1))) # formula of idf which is used by sklearn but actual formual is np.log(N/n_i)
            
    
def transform(dataset,vocab):
    rows=[]
    columns=[]
    values=[]
    if isinstance(dataset,list):
        for idx,r in enumerate(dataset):
            for w in r.split(" "):
                if len(w) < 2:
                     continue
                col_index = vocab.get(w, -1)
                if col_index !=-1:
                    rows.append(idx)
                    columns.append(col_index)
                    values.append(tf(w,r)*idf(w,dataset))
                
        return csr_matrix((values,(rows,columns)),shape=(len(dataset),len(vocab)))

In [4]:
vocab=fit(corpus)
print(vocab)
temp=transform(corpus,vocab)
print("sparse matrix representation is given by\n",temp)
print(5*"***********************")
print("before normalising\n",temp.toarray())
print(5*"***********************")
print("after normalising\n",normalize(temp).toarray()) #we normalising because in sklearn output is normalised

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}
sparse matrix representation is given by
   (0, 1)	0.24462871026284194
  (0, 2)	0.3021651247531982
  (0, 3)	0.2
  (0, 6)	0.2
  (0, 8)	0.2
  (1, 1)	0.8154290342094731
  (1, 3)	0.16666666666666666
  (1, 5)	0.3193817886456925
  (1, 6)	0.16666666666666666
  (1, 8)	0.16666666666666666
  (2, 0)	0.3193817886456925
  (2, 3)	0.16666666666666666
  (2, 4)	0.3193817886456925
  (2, 6)	0.16666666666666666
  (2, 7)	0.3193817886456925
  (2, 8)	0.16666666666666666
  (3, 1)	0.24462871026284194
  (3, 2)	0.3021651247531982
  (3, 3)	0.2
  (3, 6)	0.2
  (3, 8)	0.2
*******************************************************************************************************************
before normalising
 [[0.         0.24462871 0.30216512 0.2        0.         0.
  0.2        0.         0.2       ]
 [0.         0.81542903 0.         0.16666667 0.         0.31938179
  0.16666667 0.         0.16666667]
 [0.31938179 

# cross validation using sklearn library

In [5]:
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [6]:
print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [7]:
print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [8]:
skl_output.shape

(4, 9)

In [9]:
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [10]:
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


# now applying tfidf on file

In [11]:
import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
#print("Number of documents in corpus = ",len(corpus))
vocab=fit(corpus)
temp=transform(corpus,vocab)
print(temp)
print(70*"*")
print("before normalizing\n",temp.toarray())
print(70*"*")
print("after normalizing\n",normalize(temp).toarray())

  (0, 53)	0.865364750571609
  (0, 688)	0.865364750571609
  (0, 720)	0.865364750571609
  (0, 1545)	0.46052994405106146
  (0, 1651)	0.3282040747942601
  (0, 1653)	0.708769379509688
  (0, 2287)	0.6920779554316226
  (0, 2878)	0.7280382144880952
  (1, 149)	0.6471450795449735
  (1, 374)	0.4611476980370101
  (1, 966)	0.724161432940523
  (1, 1132)	0.615180404828109
  (1, 1511)	0.6674030302998574
  (1, 1676)	0.7692131116192079
  (1, 1712)	0.30454087354796283
  (1, 2446)	0.6471450795449735
  (1, 2764)	0.724161432940523
  (2, 20)	0.206693985843099
  (2, 64)	0.2796568469546722
  (2, 89)	0.3430238366560372
  (2, 124)	0.36436410550383536
  (2, 145)	0.36436410550383536
  (2, 201)	0.3430238366560372
  (2, 236)	0.2796568469546722
  (2, 320)	0.270060975544464
  :	:
  (741, 268)	0.8580896103855309
  (741, 429)	0.9889882863675531
  (741, 1096)	0.7011449977186582
  (741, 1354)	0.9889882863675531
  (741, 1422)	0.9310646994949582
  (741, 2471)	0.7454528446192066
  (741, 2785)	0.6673751722809108
  (742, 939)	