In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
from fractions import Fraction
import re
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aneeq\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aneeq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Toy corpus: three short "documents" labelled S1..S3, held in a single
# unnamed column (label 0) that the vectorizer cells read via df[0].
df = pd.DataFrame(
    data=[
        "sunshine state enjoy sunshine",
        "brown fox jump high brown fox run",
        "sunshine state fox run fast",
    ],
    index=["S1", "S2", "S3"],
)


In [6]:
# Show the raw sentences: every row of the single text column (label 0).
df.loc[:, 0]

S1         sunshine state enjoy sunshine
S2    brown fox jump high, brown fox run
S3           sunshine state fox run fast
Name: 0, dtype: object

## Bag of Words

In [8]:
# Learn the vocabulary from the corpus first, then encode each sentence as a
# row of raw term counts (sparse matrix, one column per vocabulary term).
vectorizer = CountVectorizer()
vectorizer.fit(df[0])
bagOfWords = vectorizer.transform(df[0])

In [9]:
# The count matrix is sparse; densify it so the full documents-by-terms
# grid is printed.
dense_counts = bagOfWords.toarray()
print(dense_counts)

[[0 1 0 0 0 0 0 1 2]
 [2 0 0 2 1 1 1 0 0]
 [0 0 1 1 0 0 1 1 1]]


## TF (Term Frequency)

In [24]:
# Term frequency only: IDF re-weighting is switched off.
# norm="l2" is the library default, spelled out here because it means each
# row is scaled to unit Euclidean length -- the values are normalised term
# frequencies, not raw counts.
tfidf_vectorizer = TfidfVectorizer(use_idf=False, norm="l2")
# NOTE: `result` is reassigned by the TF-IDF section further down; the table
# cell that follows must run before that reassignment.
result = tfidf_vectorizer.fit_transform(df[0])

In [14]:
# Labelled view of the term-frequency matrix held in `result`.
# Rows take their labels from the corpus frame itself instead of a second
# hard-coded ["S1", "S2", "S3"] list, so the table cannot drift out of sync
# if sentences are added or renamed. Columns are the fitted vocabulary terms.
# NOTE: despite its name, this frame holds TF values only -- `tfidf_vectorizer`
# was created with use_idf=False.
tfidf_df = pd.DataFrame(
    result.toarray(),
    index=df.index,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
tfidf_df

Unnamed: 0,brown,enjoy,fast,fox,high,jump,run,state,sunshine
s1,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248,0.816497
s2,0.603023,0.0,0.0,0.603023,0.301511,0.301511,0.301511,0.0,0.0
s3,0.0,0.0,0.447214,0.447214,0.0,0.0,0.447214,0.447214,0.447214


## IDF (Inverse Document Frequency) (Using sklearn)

In [28]:
# Full TF-IDF model. smooth_idf=True and norm="l2" are the library defaults,
# written out so the weighting scheme is visible: IDF is smoothed (as if one
# extra document contained every term) and each row is scaled to unit length.
tfidf_vectorizer2 = TfidfVectorizer(use_idf=True, smooth_idf=True, norm="l2")
# NOTE: this overwrites the TF-only `result` from the previous section.
result = tfidf_vectorizer2.fit_transform(df[0])

In [30]:

# List the learned IDF weight of every vocabulary term; the feature names and
# the idf_ array are paired positionally.
vocabulary_terms = tfidf_vectorizer2.get_feature_names_out()
idf_weights = tfidf_vectorizer2.idf_

print('\nidf values:')
for term, weight in zip(vocabulary_terms, idf_weights):
    print(term, ':', weight)


idf values:
brown : 1.6931471805599454
enjoy : 1.6931471805599454
fast : 1.6931471805599454
fox : 1.2876820724517808
high : 1.6931471805599454
jump : 1.6931471805599454
run : 1.2876820724517808
state : 1.2876820724517808
sunshine : 1.2876820724517808


## TF-IDF

In [31]:
# Labelled view of the TF-IDF matrix held in `result`.
# Column labels must come from the vectorizer that produced `result`, i.e.
# tfidf_vectorizer2. The TF-only `tfidf_vectorizer` was used here before; it
# only worked because both models happened to learn the same vocabulary from
# the same corpus.
# Row labels are taken from the corpus frame rather than a hard-coded list.
TFIDF = pd.DataFrame(
    result.toarray(),
    index=df.index,
    columns=tfidf_vectorizer2.get_feature_names_out(),
)
TFIDF

Unnamed: 0,brown,enjoy,fast,fox,high,jump,run,state,sunshine
s1,0.0,0.50689,0.0,0.0,0.0,0.0,0.0,0.385503,0.771006
s2,0.670703,0.0,0.0,0.510087,0.335352,0.335352,0.255044,0.0,0.0
s3,0.0,0.0,0.549351,0.417796,0.0,0.0,0.417796,0.417796,0.417796


## Cosine Similarity between S1 & S3

In [33]:
# Binary (set-based) cosine similarity between sentences S1 and S3.
#
# Each sentence is reduced to the set of its distinct non-stopword tokens, so
# repeated words (e.g. "sunshine" twice in S1) count once. The two sets are
# then encoded as 0/1 indicator vectors over their union and compared.
s1_list = word_tokenize("sunshine state enjoy sunshine")
s3_list = word_tokenize("sunshine state fox run fast")

sw = stopwords.words('english')
# Hash-based lookup: the stopword list has well over a hundred entries, and
# testing membership against the list itself is a linear scan per token.
stop_lookup = frozenset(sw)

s1_set = {w for w in s1_list if w not in stop_lookup}
s3_set = {w for w in s3_list if w not in stop_lookup}

# Indicator vectors over the combined vocabulary. Both comprehensions walk the
# same unmodified set, so position i refers to the same word in l1 and l2.
rvector = s1_set.union(s3_set)
l1 = [1 if w in s1_set else 0 for w in rvector]
l2 = [1 if w in s3_set else 0 for w in rvector]

# Dot product = number of words the two sentences share.
c = sum(a * b for a, b in zip(l1, l2))

# For 0/1 vectors the squared Euclidean norm equals the plain sum, which is
# why sum() stands in for the sum of squares here. This shortcut is NOT valid
# for count- or TF-IDF-weighted vectors.
norm = (sum(l1) * sum(l2)) ** 0.5

# If either sentence contains only stopwords (or is empty) the norm is zero
# and the similarity is undefined; report 0.0 instead of raising
# ZeroDivisionError.
cosine = c / norm if norm else 0.0
print("Cosine similarity: ", cosine)

Cosine similarity:  0.5163977794943222
