<a href="https://colab.research.google.com/github/Ashishsingh-del/NLP-Tasks/blob/main/LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from numpy.linalg import svd

# ==============================
# Step 1: Get Documents from User
# ==============================
# Ask how many documents to read, then prompt for each one in order.
n = int(input("Enter number of documents: "))

docs = [input(f"Enter text for Document {idx+1}: ") for idx in range(n)]

# ==============================
# Step 2: Build Term-Document Matrix
# ==============================
# Binary bag-of-words with English stop words removed: an entry is 1 when the
# term occurs in the document, regardless of how often.
vectorizer = CountVectorizer(stop_words='english', binary=True)

# fit_transform() yields a (documents x terms) matrix. LSA as laid out in the
# following steps expects (terms x documents): U must describe words and V^T
# must describe documents. Without the transpose the "document" similarities
# computed later are actually similarities between vocabulary terms.
A = vectorizer.fit_transform(docs).toarray().T

print("\n Term-Document Matrix (A):\n", A)
print("Terms:", vectorizer.get_feature_names_out())

# ==============================
# Step 3: Perform SVD (LSA)
# ==============================
# Reduced (economy-size) decomposition A = U * diag(S) * V^T.
U, S, Vt = np.linalg.svd(A, full_matrices=False)

# Report each factor rounded to three decimals for readability.
print("\nSingular Values (S):", S.round(3))
print("\nU (Word-Concept Matrix):\n", U.round(3))
print("\nV^T (Document-Concept Matrix):\n", Vt.round(3))

# ==============================
# Step 4: Construct Concept Spaces
# ==============================
# Scale the left singular vectors by the singular values.
word_concept = np.matmul(U, np.diag(S))
# Scale the right singular vectors the same way, then transpose so that each
# row corresponds to one column of V^T.
document_concept = np.transpose(np.matmul(np.diag(S), Vt))

print("\nWord-Concept Space (U * S):\n", word_concept.round(3))
print("\nDocument-Concept Space ((S * V^T)^T):\n", document_concept.round(3))

# ==============================
# Step 5: Cosine Similarity Function
# ==============================
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (any 1-D array-like accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        When either vector has zero norm the cosine is undefined; 0.0 is
        returned in that case instead of dividing by zero and yielding NaN.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero vector (e.g. a document with no retained terms) has no direction,
    # so treat it as dissimilar to everything rather than emitting NaN.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# ==============================
# Step 6: Compute Similarity Matrix
# ==============================
# One row of document_concept per compared item; build the full pairwise
# cosine-similarity table (including the diagonal and both triangles).
n_docs = len(document_concept)
similarity_matrix = np.zeros((n_docs, n_docs))
for row, left in enumerate(document_concept):
    for col, right in enumerate(document_concept):
        similarity_matrix[row, col] = cosine_similarity(left, right)

print("\nCosine Similarity Matrix (Documents):\n", similarity_matrix.round(3))

# ==============================
# Step 7: Display Results
# ==============================
# Pairs scoring at or above this value are reported as similar.
similarity_threshold = 0.7
print("\n Document Similarity Check (Threshold =", similarity_threshold, "):")

# Walk the strict upper triangle so each unordered pair is reported once,
# in row-major order.
for i, j in zip(*np.triu_indices(n_docs, k=1)):
    sim_score = similarity_matrix[i, j]
    is_similar = sim_score >= similarity_threshold
    if not is_similar:
        print(f" Doc{i+1} and Doc{j+1} are NOT similar (Similarity = {sim_score:.3f})")
        continue
    print(f" Doc{i+1} and Doc{j+1} are similar (Similarity = {sim_score:.3f})")



Enter number of documents: 2
Enter text for Document 1: he is a boy
Enter text for Document 2: he loves playing football

 Term-Document Matrix (A):
 [[1 0 0 0]
 [0 1 1 1]]
Terms: ['boy' 'football' 'loves' 'playing']

Singular Values (S): [1.732 1.   ]

U (Word-Concept Matrix):
 [[0. 1.]
 [1. 0.]]

V^T (Document-Concept Matrix):
 [[0.    0.577 0.577 0.577]
 [1.    0.    0.    0.   ]]

Word-Concept Space (U * S):
 [[0.    1.   ]
 [1.732 0.   ]]

Document-Concept Space ((S * V^T)^T):
 [[0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]

Cosine Similarity Matrix (Documents):
 [[1. 0. 0. 0.]
 [0. 1. 1. 1.]
 [0. 1. 1. 1.]
 [0. 1. 1. 1.]]

 Document Similarity Check (Threshold = 0.7 ):
 Doc1 and Doc2 are NOT similar (Similarity = 0.000)
 Doc1 and Doc3 are NOT similar (Similarity = 0.000)
 Doc1 and Doc4 are NOT similar (Similarity = 0.000)
 Doc2 and Doc3 are similar (Similarity = 1.000)
 Doc2 and Doc4 are similar (Similarity = 1.000)
 Doc3 and Doc4 are similar (Similarity = 1.000)
