In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
text = [
    "John likes soccer. John plays soccer every afternoon after school.",
    "Mary reads books. Mary reads books in the library every evening.",
    "The cat chased the cat around the yard until the cat tired.",
    "He likes football and she likes football more than any other sport."
]
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    return [word for word in words if word.lower() not in stop_words and word not in string.punctuation]

tokenized_doc = [remove_stopwords(t) for t in text]

for doc in tokenized_doc:
    print(doc)

['John', 'likes', 'soccer', 'John', 'plays', 'soccer', 'every', 'afternoon', 'school']
['Mary', 'reads', 'books', 'Mary', 'reads', 'books', 'library', 'every', 'evening']
['cat', 'chased', 'cat', 'around', 'yard', 'cat', 'tired']
['likes', 'football', 'likes', 'football', 'sport']


In [4]:
#calculate TF
from collections import Counter
def calculate_tf(doc):
  tf_count = Counter(doc)
  total_count = len(doc)
  return {word: round(count/total_count, 3) for word, count in tf_count.items()}

tf_doc = [calculate_tf(doc) for doc in tokenized_doc]

for tf in tf_doc:
  print(tf)

{'John': 0.222, 'likes': 0.111, 'soccer': 0.222, 'plays': 0.111, 'every': 0.111, 'afternoon': 0.111, 'school': 0.111}
{'Mary': 0.222, 'reads': 0.222, 'books': 0.222, 'library': 0.111, 'every': 0.111, 'evening': 0.111}
{'cat': 0.429, 'chased': 0.143, 'around': 0.143, 'yard': 0.143, 'tired': 0.143}
{'likes': 0.4, 'football': 0.4, 'sport': 0.2}


In [5]:
import math
#calculate IDF
def calculate_idf(tokenized_doc):
  N = len(tokenized_doc)
  all_terms = set(term for doc in tokenized_doc for term in doc)
  idf = {}
  for term in all_terms:
    # count how many documents contain the word
    df = sum(1 for doc in tokenized_doc if term in doc)
    idf[term] = math.log(N + 1 / (1 + df))
  return idf

idf_doc = calculate_idf(tokenized_doc)
for idf in idf_doc.items():
  print(idf)

('sport', 1.5040773967762742)
('every', 1.466337068793427)
('yard', 1.5040773967762742)
('soccer', 1.5040773967762742)
('evening', 1.5040773967762742)
('afternoon', 1.5040773967762742)
('tired', 1.5040773967762742)
('reads', 1.5040773967762742)
('library', 1.5040773967762742)
('cat', 1.5040773967762742)
('John', 1.5040773967762742)
('books', 1.5040773967762742)
('Mary', 1.5040773967762742)
('likes', 1.466337068793427)
('plays', 1.5040773967762742)
('school', 1.5040773967762742)
('chased', 1.5040773967762742)
('around', 1.5040773967762742)
('football', 1.5040773967762742)


In [6]:
#calculate TF-IDF

def calculate_tf_idf(tf, idf):
  tf_idf = {}
  for word, tf_value in tf.items():
    tf_idf[word] = tf_value * idf[word]
  return tf_idf

tf_idf_doc = [calculate_tf_idf(tf, idf_doc) for tf in tf_doc]

for tf_idf in tf_idf_doc:
  print(tf_idf)

{'John': 0.3339051820843329, 'likes': 0.16276341463607039, 'soccer': 0.3339051820843329, 'plays': 0.16695259104216645, 'every': 0.16276341463607039, 'afternoon': 0.16695259104216645, 'school': 0.16695259104216645}
{'Mary': 0.3339051820843329, 'reads': 0.3339051820843329, 'books': 0.3339051820843329, 'library': 0.16695259104216645, 'every': 0.16276341463607039, 'evening': 0.16695259104216645}
{'cat': 0.6452492032170216, 'chased': 0.21508306773900718, 'around': 0.21508306773900718, 'yard': 0.21508306773900718, 'tired': 0.21508306773900718}
{'likes': 0.5865348275173708, 'football': 0.6016309587105098, 'sport': 0.3008154793552549}


In [7]:
Counter(tokenized_doc[0])

Counter({'John': 2,
         'likes': 1,
         'soccer': 2,
         'plays': 1,
         'every': 1,
         'afternoon': 1,
         'school': 1})

In [8]:
docs = [
    "I love horror movie.",
    "Lights out is a horror movie."
]

tokenized_doc = [remove_stopwords(t) for t in docs]
for doc in tokenized_doc:
    print(doc)

['love', 'horror', 'movie']
['Lights', 'horror', 'movie']


In [9]:
unique_terms = set(term for doc in tokenized_doc for term in doc)
print(unique_terms)

{'horror', 'love', 'movie', 'Lights'}


In [17]:
def create_vector(doc):
       return [1 if term in doc else 0 for term in unique_terms]

vectors = [create_vector(doc) for doc in tokenized_doc]

for vector in vectors:
    print(vector)

[1, 1, 1, 0]
[1, 0, 1, 1]


In [18]:
def cosine_sim(v1, v2):
  dot_product = sum(a * b for a, b in zip(v1, v2))
  mag_v1 = math.sqrt(sum(a**2 for a in v1))
  mag_v2 = math.sqrt(sum(b**2 for b in v2))
  return dot_product / (mag_v1 * mag_v2)

similarity = cosine_sim(vectors[0], vectors[1])
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.6666666666666667
