# TF-IDF Word Embedding #1


### 1. Importing libraries


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np

### 2. Load documents

In [None]:
# Read the corpus: every line of the file is one document. Surrounding
# whitespace (including the trailing newline) is trimmed from each line.
with open('documents.txt', 'r', encoding='utf-8') as f:
    documents = [line.strip() for line in f]

# Drop lines that were blank (empty after trimming).
documents = list(filter(None, documents))


['The sky is blue and beautiful.', 'Love this blue and beautiful sky!', 'The quick brown fox jumps over the lazy dog.', "A king's breakfast has sausages, ham, bacon, eggs, toast and beans.", 'I love green eggs, ham, sausages and bacon!', 'The brown fox is quick and the blue dog is lazy!', 'The sky is very blue and the sky is very beautiful today.']


### 3. Initialize TF-IDF Vectorizer

In [18]:
# Use scikit-learn's built-in English stop-word list so that very common
# function words are excluded from the vocabulary.
vectorizer = TfidfVectorizer(stop_words="english")

### 4. Fit & Transform documents to TF-IDF matrix

In [19]:
# Learn the vocabulary and IDF weights from `documents` and encode the same
# documents in one step. The result is converted to dense form further down.
tfidf_matrix = vectorizer.fit_transform(documents)


### 5. Feature names

In [20]:
# Show the terms the fitted vectorizer kept as features.
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary: ", feature_names)



Vocabulary:  ['bacon' 'beans' 'beautiful' 'blue' 'breakfast' 'brown' 'dog' 'eggs' 'fox'
 'green' 'ham' 'jumps' 'king' 'lazy' 'love' 'quick' 'sausages' 'sky'
 'toast' 'today']


### 6. Converting TF-IDF matrix to dense form

In [9]:
# Convert the sparse TF-IDF matrix to a dense array for display.
# toarray() is used instead of todense(): todense() returns the legacy
# np.matrix type, which NumPy no longer recommends, whereas toarray() yields a
# regular ndarray holding the same values.
dense_matrix = tfidf_matrix.toarray()

print(dense_matrix)

[[0.30073434 0.         0.         0.44920459 0.39000294 0.
  0.         0.         0.         0.         0.         0.
  0.         0.44920459 0.         0.         0.         0.
  0.         0.         0.         0.44920459 0.39000294 0.
  0.         0.         0.        ]
 [0.26144771 0.         0.         0.39052245 0.33905465 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.45687599
  0.         0.         0.         0.39052245 0.         0.55039605
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.31457246 0.31457246 0.         0.31457246 0.         0.
  0.         0.         0.37896375 0.         0.31457246 0.
  0.37896375 0.31457246 0.         0.         0.46689805 0.
  0.         0.         0.        ]
 [0.16813526 0.29381386 0.35395598 0.         0.         0.35395598
  0.         0.         0.29381386 0.         0.         0.29381386
  0.35395598 0.     

### 7. Calculating Cosine Similarity

In [12]:
# Pairwise cosine similarity between the rows of the TF-IDF matrix; passing a
# single argument compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

# Header and matrix on separate lines.
print("Cosine Similarity:", cosine_sim, sep="\n")

Cosine Similarity:
[[1.         0.56170756 0.18209161 0.05056405 0.06609892 0.54513468
  0.71979396]
 [0.56170756 1.         0.         0.04395858 0.23294226 0.12095572
  0.3410637 ]
 [0.18209161 0.         1.         0.         0.         0.68306509
  0.16850397]
 [0.05056405 0.04395858 0.         1.         0.48835015 0.02900527
  0.02339548]
 [0.06609892 0.23294226 0.         0.48835015 1.         0.03791661
  0.03058332]
 [0.54513468 0.12095572 0.68306509 0.02900527 0.03791661 1.
  0.44008221]
 [0.71979396 0.3410637  0.16850397 0.02339548 0.03058332 0.44008221
  1.        ]]


### 8. Finding the most similar document to the first one

In [13]:
# Find the document most similar to the query document (the first one).
#
# The query's own similarity entry is masked out before taking the maximum.
# Picking "second place" from a sorted order is not safe: if another document
# ties with the query (e.g. an exact duplicate with similarity 1.0, or a
# floating-point rounding difference), the second-ranked index can be the
# query itself.
query_idx = 0
if cosine_sim.shape[0] < 2:
    raise ValueError("need at least two documents to find a most-similar one")

# Work on a float copy so the similarity matrix itself is left untouched.
similarities = np.array(cosine_sim[query_idx], dtype=float)
similarities[query_idx] = -np.inf
most_sim = int(np.argmax(similarities))

print(f"Most similar to document {query_idx}: document {most_sim}")
print(f"doc {query_idx}: ", documents[query_idx])
print("doc", most_sim, ":", documents[most_sim])

Most similar to document 0: document 6
doc 0:  The sky is blue and beautiful.
doc 6 : The sky is very blue and the sky is very beautiful today.
