<a href="https://colab.research.google.com/github/2022civarshara-dot/ML_Lab/blob/main/Session_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# K-means clustering of text documents (TF-IDF features), with a cosine-similarity query and a PCA plot
!pip install scikit-learn matplotlib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# --------------------------------------------------------------
# 1. Sample text documents
# --------------------------------------------------------------
# The toy corpus covers two themes; they are written out separately
# here and concatenated so the final order is unchanged.
_ai_documents = [
    "Artificial intelligence is transforming the world.",
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks.",
    "Neural networks train on large datasets.",
]
_pet_documents = [
    "The cat sits on the mat.",
    "Cats and dogs are common household pets.",
    "Dogs enjoy playing fetch with their owners.",
    "Pets like cats and dogs are loved by families.",
]
documents = [*_ai_documents, *_pet_documents]

# --------------------------------------------------------------
# 2. Convert documents to TF-IDF vectors
# --------------------------------------------------------------
# Learn the vocabulary and IDF weights from the corpus (English stop
# words removed), then encode every document as one row of X.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(documents)
X = vectorizer.transform(documents)

# --------------------------------------------------------------
# 3. Perform K-means clustering
# --------------------------------------------------------------
k = 2  # number of clusters

# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2-1.3), so leaving it unset
# makes the clustering depend on the installed library version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)

# labels[i] is the cluster index assigned to documents[i].
labels = kmeans.fit_predict(X)

print("\n===== DOCUMENT CLUSTERS =====")
for label, document in zip(labels, documents):
    print(f"Cluster {label}: {document}")

# --------------------------------------------------------------
# 4. Similarity Recognition Example
# --------------------------------------------------------------
query = ["AI and machine learning are closely related."]

# Encode the query with the vectorizer fitted on the corpus so it lives
# in the same feature space as X, then score it against every document.
query_vec = vectorizer.transform(query)
similarities = cosine_similarity(query_vec, X).flatten()

print("\n===== SIMILARITY SCORES WITH QUERY =====")
for score, document in zip(similarities, documents):
    print(f"{score:.4f}  -->  {document}")

best_match = np.argmax(similarities)

print("\nMost similar document:")
if similarities[best_match] > 0:
    print(documents[best_match])
else:
    # When every score is zero, argmax returns index 0, which would
    # wrongly present the first document as the best match.
    print("No document shares any vocabulary with the query.")

# --------------------------------------------------------------
# 5. PCA for Visualization (2D graph)
# --------------------------------------------------------------
# Project the densified TF-IDF matrix onto its first two principal
# components so each document becomes one 2D point.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X.toarray())

plt.figure(figsize=(10, 6))

# Clusters 0 and 1 keep their original red/blue colours; the extra
# entries keep clusters distinguishable if k is raised above 2.
palette = ['red', 'blue', 'green', 'orange', 'purple', 'brown']

# One scatter call per cluster, so each cluster gets its own colour and
# legend entry.
for cluster_id in sorted(set(labels)):
    members = labels == cluster_id
    plt.scatter(X_2d[members, 0], X_2d[members, 1],
                color=palette[cluster_id % len(palette)], s=70,
                label=f"Cluster {cluster_id}")

# Annotate every point with its index into `documents`.
for i, (x, y) in enumerate(X_2d):
    plt.text(x + 0.01, y + 0.01, str(i), fontsize=9)

plt.title("K-Means Clustering Visualized with PCA")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.grid(True)
plt.show()