In [1]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Download the NLTK resources this notebook relies on:
#   punkt / punkt_tab - tokenizer models behind word_tokenize. Recent NLTK
#                       releases look up 'punkt_tab' rather than the pickled
#                       'punkt' model, so both are requested to keep the
#                       notebook runnable across NLTK versions.
#   stopwords         - English stop-word list.
#   wordnet           - lexical database used by WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Toy corpus: the three sentences that are compared with each other below.
sentences = [
    "I love reading books",
    "Books are my favorite pastime",
    "Reading is a great way to relax",
]

In [4]:
# Preprocessing resources, created once and shared by every call that
# cleans a sentence.
lemmatizer = WordNetLemmatizer()
# Held as a set because it is only ever used for membership checks.
stop_words = set(
    stopwords.words('english')
)

In [5]:
def preprocess(text):
    """Normalize a sentence into a space-separated string of lemmas.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric (e.g. punctuation) and tokens found in the
    module-level ``stop_words`` collection are discarded; the remaining
    tokens are lemmatized with the module-level ``lemmatizer``.

    Stop words are removed *before* lemmatization. ``lemmatize`` is called
    without a part-of-speech tag, and the default lemmatization can rewrite
    a stop word into a form that is no longer in the stop-word list
    (for example "was" becomes "wa"), which would then slip through a
    filter applied only afterwards.

    Args:
        text: Raw sentence to clean.

    Returns:
        str: The surviving lemmas joined by single spaces; an empty string
        if nothing survives.
    """
    tokens = word_tokenize(text.lower())
    content_tokens = [
        token for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    lemmas = (lemmatizer.lemmatize(token) for token in content_tokens)
    # A lemma can itself be a stop word, so filter once more after
    # lemmatizing; this keeps the result a subset of what filtering only
    # after lemmatization would have produced.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [6]:
# Run every sentence through preprocess(), preserving the original order.
preprocessed_sentences = list(map(preprocess, sentences))

In [7]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned sentences and
# transform them in the same call.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=preprocessed_sentences)

In [8]:
# Pairwise cosine similarity between all TF-IDF vectors. With only one
# argument the matrix is compared against itself, so entry [i][j] is the
# similarity between sentence i and sentence j.
cosine_similarities = cosine_similarity(tfidf_matrix)

In [9]:
# Print a heading followed by the full pairwise similarity matrix.
print("Cosine Similarity Matrix:", cosine_similarities, sep="\n")

Cosine Similarity Matrix:
[[1.         0.24527199 0.208199  ]
 [0.24527199 1.         0.        ]
 [0.208199   0.         1.        ]]


In [10]:
# Ranking based on similarity: for each sentence, the indices of all *other*
# sentences ordered from most to least similar.
#
# The sentence itself is excluded by masking the diagonal instead of
# assuming it always sorts first. That assumption fails when another
# sentence ties with the self-match (duplicate sentences), when a row is all
# zeros (self-similarity is then 0), or when the self-similarity rounds to
# slightly below 1 - in those cases a real neighbour would be dropped and
# the sentence would be ranked against itself.
similarities_without_self = cosine_similarities.copy()
np.fill_diagonal(similarities_without_self, -np.inf)
# After negation the masked diagonal becomes +inf and sorts last, so dropping
# the final column removes exactly the self-match. The stable sort keeps
# tied sentences in ascending index order.
sorted_indices = np.argsort(
    -similarities_without_self, axis=1, kind="stable"
)[:, :-1]

In [11]:
# Report, for each sentence, the other sentences in ranked order together
# with their similarity scores (sentence numbers are 1-based in the output).
print("\nRanking based on Cosine Similarity:")
for sentence_number, (similarity_row, ranked_neighbours) in enumerate(
    zip(cosine_similarities, sorted_indices), start=1
):
    print(f"Sentence {sentence_number}:")
    for position, neighbour in enumerate(ranked_neighbours, start=1):
        score = similarity_row[neighbour]
        print(f"Rank {position}: Sentence {neighbour + 1} - Similarity: {score}")


Ranking based on Cosine Similarity:
Sentence 1:
Rank 1: Sentence 2 - Similarity: 0.24527198569314448
Rank 2: Sentence 3 - Similarity: 0.20819899938446804
Sentence 2:
Rank 1: Sentence 1 - Similarity: 0.24527198569314448
Rank 2: Sentence 3 - Similarity: 0.0
Sentence 3:
Rank 1: Sentence 1 - Similarity: 0.20819899938446804
Rank 2: Sentence 2 - Similarity: 0.0
