In [1]:
# Experiment Title: Implement text document similarity recognizer for the given text
# Experiment No: 03
# Author: Adrija Dastidar
# PRN: 1032221315

In [2]:
# Import Libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Sample Documents
documents = [
    "This is a sample document for testing the text similarity",
    "We will use NLTK for computing similarity of text",
    "NLTK is a powerful library for natural language processing"
]

In [5]:
# Define stop words and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [6]:
def preprocess_text(text):
    """Tokenizes, removes stopwords, and applies stemming."""
    words = word_tokenize(text.lower())
    filtered_words = []

    for word in words:
        if word.isalnum() and word not in stop_words:
            filtered_words.append(stemmer.stem(word))

    return ' '.join(filtered_words)

In [7]:
# Preprocess documents
preprocessed_docs = []

for doc in documents:
    preprocessed_docs.append(preprocess_text(doc))

In [8]:
# Display preprocessed documents
print("Preprocessed Documents:")
for i in range(len(documents)):
    print(f"Document {i+1}: {preprocess_text(documents[i])}")

Preprocessed Documents:
Document 1: sampl document test text similar
Document 2: use nltk comput similar text
Document 3: nltk power librari natur languag process


In [9]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_docs)
feature_names = tfidf_vectorizer.get_feature_names_out()

In [10]:
# Display TF-IDF matrix
print("TF-IDF Matrix:")
for i in range(len(documents)):
    print(f"Document {i+1}:")
    for j in range(len(feature_names)):
        print(f"\t {feature_names[j]}: {tfidf_matrix.toarray()[i, j]:.4f}")

TF-IDF Matrix:
Document 1:
	 comput: 0.0000
	 document: 0.4905
	 languag: 0.0000
	 librari: 0.0000
	 natur: 0.0000
	 nltk: 0.0000
	 power: 0.0000
	 process: 0.0000
	 sampl: 0.4905
	 similar: 0.3730
	 test: 0.4905
	 text: 0.3730
	 use: 0.0000
Document 2:
	 comput: 0.5174
	 document: 0.0000
	 languag: 0.0000
	 librari: 0.0000
	 natur: 0.0000
	 nltk: 0.3935
	 power: 0.0000
	 process: 0.0000
	 sampl: 0.0000
	 similar: 0.3935
	 test: 0.0000
	 text: 0.3935
	 use: 0.5174
Document 3:
	 comput: 0.0000
	 document: 0.0000
	 languag: 0.4234
	 librari: 0.4234
	 natur: 0.4234
	 nltk: 0.3220
	 power: 0.4234
	 process: 0.4234
	 sampl: 0.0000
	 similar: 0.0000
	 test: 0.0000
	 text: 0.0000
	 use: 0.0000


In [11]:
import math
from collections import Counter

# Get vocabulary from preprocessed documents
vocab = set()
for doc in preprocessed_docs:
    vocab.update(doc.split())

# Compute Term Frequency (TF)
def compute_tf(doc):
    words = doc.split()
    word_counts = Counter(words)
    total_words = len(words)
    return {word: word_counts[word] / total_words for word in vocab}

tf_scores = [compute_tf(doc) for doc in preprocessed_docs]

In [12]:
# Compute Inverse Document Frequency (IDF)
def compute_idf(docs):
    num_docs = len(docs)
    idf_scores = {}

    for word in vocab:
        containing_docs = sum(1 for doc in docs if word in set(doc.split()))
        idf_scores[word] = math.log(num_docs / (1 + containing_docs))

    return idf_scores

In [13]:
# Compute TF-IDF
idf_scores = compute_idf(preprocessed_docs)
tfidf_scores = [{word: tf * idf_scores[word] for word, tf in tf.items()} for tf in tf_scores]

print("\nTF-IDF Scores:")
for i, doc_tfidf in enumerate(tfidf_scores):
    print(f"\nDocument {i+1}:")
    for word, score in doc_tfidf.items():
        print(f"\t{word}: {score:.4f}")


TF-IDF Scores:

Document 1:
	natur: 0.0000
	text: 0.0000
	similar: 0.0000
	comput: 0.0000
	sampl: 0.0811
	nltk: 0.0000
	use: 0.0000
	document: 0.0811
	languag: 0.0000
	test: 0.0811
	process: 0.0000
	librari: 0.0000
	power: 0.0000

Document 2:
	natur: 0.0000
	text: 0.0000
	similar: 0.0000
	comput: 0.0811
	sampl: 0.0000
	nltk: 0.0000
	use: 0.0811
	document: 0.0000
	languag: 0.0000
	test: 0.0000
	process: 0.0000
	librari: 0.0000
	power: 0.0000

Document 3:
	natur: 0.0676
	text: 0.0000
	similar: 0.0000
	comput: 0.0000
	sampl: 0.0000
	nltk: 0.0000
	use: 0.0000
	document: 0.0000
	languag: 0.0676
	test: 0.0000
	process: 0.0676
	librari: 0.0676
	power: 0.0676


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
def calculate_similarity(text1, text2):
    preprocessed_texts = [preprocess_text(text1), preprocess_text(text2)]

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_texts)

    similarity_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    return similarity_score

In [15]:
def find_similar_words(text1, text2):
    words1 = set(preprocess_text(text1).split())
    words2 = set(preprocess_text(text2).split())

    common_words = words1.intersection(words2)
    return common_words

In [16]:
if __name__ == "__main__":
    text1 = "I enjoy eating apples"
    text2 = "Apples are sweet"

In [17]:
similarity_score = calculate_similarity(text1, text2)
print(f"Similarity Score: {similarity_score:.4f}")

Similarity Score: 0.2606


In [18]:
similar_words = find_similar_words(text1, text2)
print(f"Common Words: {similar_words}")

Common Words: {'appl'}
