In [36]:
import numpy as np

In [39]:
# Toy corpus for the clustering cells: six documents of three
# whitespace-separated words each.
docs = [
    "john plays football",
    "library books read",
    "mary likes tennis",
    "books novels library",
    "john likes football",
    "mary reads novels"
]

**K-means**

In [40]:
# Vocabulary and vectorization
# Sort the unique words so that the column order of the feature vectors is
# identical on every kernel restart; iterating a bare set of strings follows
# hash order, which Python randomises per process.
vocab = sorted({word for doc in docs for word in doc.split()})
def vectorize(doc, vocabulary=None):
    """Encode a document as a binary bag-of-words vector.

    Parameters
    ----------
    doc : str
        Text to encode; it is tokenised with ``str.split()`` (whitespace).
    vocabulary : sequence of str, optional
        Words defining the vector's columns, in order. When omitted, the
        notebook-level ``vocab`` list is used.

    Returns
    -------
    numpy.ndarray
        One entry per vocabulary word: 1 if the word occurs in ``doc``, else 0.
    """
    if vocabulary is None:
        vocabulary = vocab
    # Tokenise once; a set makes each membership test O(1) instead of
    # re-splitting and scanning the document for every vocabulary word.
    tokens = set(doc.split())
    return np.array([1 if word in tokens else 0 for word in vocabulary])

# Document-term matrix: one row per document, produced by vectorize().
X = np.array(list(map(vectorize, docs)))

In [43]:
# K-Means manually
def kmeans(X, k=2, max_iters=100, seed=None):
    """Cluster the rows of ``X`` with Lloyd's k-means algorithm.

    Points are assigned to the centroid with the smallest squared Euclidean
    distance, which is the distance the mean-based centroid update minimises.

    Parameters
    ----------
    X : array-like
        One sample per row. A 1-D input is treated as single-feature samples.
    k : int
        Number of clusters; must satisfy ``1 <= k <= len(X)``.
    max_iters : int
        Maximum number of centroid updates.
    seed : int, optional
        Seed for a dedicated random generator used to pick the initial
        centroids. When omitted, NumPy's global random state is used.

    Returns
    -------
    list of int
        Cluster index (``0 .. k-1``) for every row of ``X``.

    Raises
    ------
    ValueError
        If ``k`` is not between 1 and the number of samples.
    """
    X = np.asarray(X, dtype=float)
    n_samples = len(X)
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )
    X = X.reshape(n_samples, -1)

    rng = np.random if seed is None else np.random.default_rng(seed)
    # Initial centroids are k distinct samples.
    centroids = X[rng.choice(n_samples, k, replace=False)]

    def assign(centers):
        # (n_samples, k) matrix of squared Euclidean distances.
        sq_dist = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
        return sq_dist.argmin(axis=1)

    # Assign before the loop so a result exists even when max_iters is 0.
    labels = assign(centroids)

    for _ in range(max_iters):
        new_centroids = centroids.copy()
        for i in range(k):
            members = labels == i
            # An empty cluster keeps its previous centroid.
            if members.any():
                new_centroids[i] = X[members].mean(axis=0)

        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
        labels = assign(centroids)

    return labels.tolist()

# Seed NumPy's global RNG so the random centroid initialisation, and with it
# the printed assignment, is the same on every Restart & Run All.
np.random.seed(42)
clusters = kmeans(X, k=2)

for doc_number, (doc, label) in enumerate(zip(docs, clusters), start=1):
    print(f"Document {doc_number}: '{doc}' -> Cluster {label}")



Document 1: 'john plays football' -> Cluster 1
Document 2: 'library books read' -> Cluster 0
Document 3: 'mary likes tennis' -> Cluster 0
Document 4: 'books novels library' -> Cluster 0
Document 5: 'john likes football' -> Cluster 1
Document 6: 'mary reads novels' -> Cluster 0


**K-medoids**

In [67]:
def kmedoids(X, k=2, max_iters=100, seed=None):
    """Cluster the rows of ``X`` with an alternating k-medoids scheme.

    The same Manhattan (L1) distance is used both to assign points to their
    nearest medoid and to pick each cluster's new medoid (the member with the
    smallest total distance to the other members), so both steps work on the
    same objective.

    Parameters
    ----------
    X : array-like
        One sample per row. A 1-D input is treated as single-feature samples.
    k : int
        Number of clusters; must satisfy ``1 <= k <= len(X)``.
    max_iters : int
        Maximum number of medoid updates.
    seed : int, optional
        Seed for a dedicated random generator used to pick the initial
        medoids. When omitted, NumPy's global random state is used.

    Returns
    -------
    list of int
        Cluster index (``0 .. k-1``) for every row of ``X``.

    Raises
    ------
    ValueError
        If ``k`` is not between 1 and the number of samples.
    """
    X = np.asarray(X, dtype=float)
    n_samples = len(X)
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )
    X = X.reshape(n_samples, -1)

    # Pairwise L1 distances, computed once and reused by every iteration.
    pairwise = np.array([np.abs(X - row).sum(axis=1) for row in X])

    rng = np.random if seed is None else np.random.default_rng(seed)
    # Medoids are tracked as row indices into X.
    medoid_idx = np.asarray(rng.choice(n_samples, k, replace=False))

    # Assign before the loop so a result exists even when max_iters is 0.
    labels = pairwise[:, medoid_idx].argmin(axis=1)

    for _ in range(max_iters):
        new_idx = medoid_idx.copy()
        for i in range(k):
            members = np.flatnonzero(labels == i)
            # An empty cluster keeps its previous medoid.
            if members.size == 0:
                continue
            within_cost = pairwise[np.ix_(members, members)].sum(axis=1)
            new_idx[i] = members[within_cost.argmin()]

        if np.array_equal(new_idx, medoid_idx):
            break
        medoid_idx = new_idx
        labels = pairwise[:, medoid_idx].argmin(axis=1)

    return labels.tolist()

# Seed NumPy's global RNG so the random medoid initialisation, and with it
# the printed assignment, is the same on every Restart & Run All.
np.random.seed(42)
kmedoid_clusters = kmedoids(X, k=2)

for doc_number, (doc, label) in enumerate(zip(docs, kmedoid_clusters), start=1):
    print(f"Document {doc_number}: '{doc}' -> Cluster {label}")


Document 1: 'john plays football' -> Cluster 0
Document 2: 'library books read' -> Cluster 1
Document 3: 'mary likes tennis' -> Cluster 0
Document 4: 'books novels library' -> Cluster 1
Document 5: 'john likes football' -> Cluster 0
Document 6: 'mary reads novels' -> Cluster 1


**Text shingling**

In [87]:
# Corpus for the shingling / Jaccard cells: four one-sentence documents.
# NOTE: this rebinds `docs`; `vocab` and `X` built earlier in the notebook
# still describe the previous corpus and are not recomputed here.
docs = [
    "machine learning is a field of artificial intelligence that uses statistical techniques to give computer systems the ability to learn from data",
    "deep learning is a subset of machine learning that uses neural networks with representation learning to model complex patterns in data",
    "natural language processing enables computers to understand human language and involves text analysis, tokenization, and semantic modeling",
    "computer vision is a field that enables machines to interpret and process visual information from the world, including images and videos"
]


In [93]:
def shingles(doc, k=2):
    """Return the set of word-level k-shingles of a document.

    The text is tokenised with ``str.split()``, so punctuation stays attached
    to its word and case is preserved. Each shingle is ``k`` consecutive
    tokens joined by a single space.

    Parameters
    ----------
    doc : str
        Text to shingle.
    k : int
        Number of consecutive words per shingle; must be at least 1.

    Returns
    -------
    set of str
        The distinct shingles; empty when ``doc`` has fewer than ``k`` words.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # k == 0 would otherwise yield the meaningless set {""}.
        raise ValueError(f"k must be a positive integer, got {k}")
    words = doc.split()
    return {" ".join(words[i:i + k]) for i in range(len(words) - k + 1)}

# Word-bigram shingle set for every document in the corpus.
doc_shingles = [shingles(text, k=2) for text in docs]

# Report each document's shingle count with a short alphabetical preview.
for position, shingle_set in enumerate(doc_shingles, start=1):
    preview = sorted(shingle_set)[:10]  # only the first 10 shingles, for brevity
    print(f"Document {position}: Shingles (count={len(shingle_set)})")
    print(preview, "...")



Document 1: Shingles (count=21)
['a field', 'ability to', 'artificial intelligence', 'computer systems', 'field of', 'from data', 'give computer', 'intelligence that', 'is a', 'learn from'] ...
Document 2: Shingles (count=20)
['a subset', 'complex patterns', 'deep learning', 'in data', 'is a', 'learning is', 'learning that', 'learning to', 'machine learning', 'model complex'] ...
Document 3: Shingles (count=16)
['analysis, tokenization,', 'and involves', 'and semantic', 'computers to', 'enables computers', 'human language', 'involves text', 'language and', 'language processing', 'natural language'] ...
Document 4: Shingles (count=20)
['a field', 'and process', 'and videos', 'computer vision', 'enables machines', 'field that', 'from the', 'images and', 'including images', 'information from'] ...


In [94]:
def jaccard(s1, s2):
    """Return the Jaccard similarity ``|s1 & s2| / |s1 | s2|`` of two sets.

    Parameters
    ----------
    s1, s2 : set
        Sets to compare.

    Returns
    -------
    float
        A value in ``[0, 1]``. Two empty sets are treated as identical and
        yield ``1.0`` instead of dividing by zero.
    """
    union_size = len(s1 | s2)
    if union_size == 0:
        # Both sets are empty: define the similarity as 1.0.
        return 1.0
    return len(s1 & s2) / union_size

n = len(docs)
jac_matrix = np.zeros((n, n))
# Fill every (row, col) cell with the similarity of the corresponding pair of
# shingle sets.
for row, col in np.ndindex(n, n):
    jac_matrix[row, col] = jaccard(doc_shingles[row], doc_shingles[col])

# Print numeric matrix, one row per line, rounded to two decimals
print("Jaccard Similarity Matrix:")
for similarities in jac_matrix:
    print([round(float(value), 2) for value in similarities])



Jaccard Similarity Matrix:
[1.0, 0.11, 0.0, 0.05]
[0.11, 1.0, 0.0, 0.03]
[0.0, 0.0, 1.0, 0.0]
[0.05, 0.03, 0.0, 1.0]
