In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [6]:
# Name of the pretrained checkpoint handed to SentenceTransformer.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [7]:
import requests

CORPUS_URL = 'https://raw.githubusercontent.com/laxmimerit/machine-learning-dataset/refs/heads/master/text-dataset-for-machine-learning/sbert-corpus.txt'
QUERIES_URL = 'https://raw.githubusercontent.com/laxmimerit/machine-learning-dataset/refs/heads/master/text-dataset-for-machine-learning/sbert-queries.txt'


def _fetch_lines(url, timeout=30):
    """Download a text file and return its non-blank lines.

    Args:
        url: Address of the plain-text file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one string per non-blank line of the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never mistaken for data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # splitlines() copes with both '\r\n' and '\n' endings; blank lines
    # (e.g. from a trailing newline) are dropped so they are not treated
    # as sentences.
    return [line for line in response.text.splitlines() if line.strip()]


corpus = _fetch_lines(CORPUS_URL)
queries = _fetch_lines(QUERIES_URL)

In [9]:
# Encode every entry of `corpus` with the loaded model; the result is what the
# clustering cell below is fitted on.
corpus_embeddings = model.encode(corpus)

In [12]:
# Partition the corpus embeddings into a fixed number of groups with k-means.
num_clusters = 5
# random_state pins the centroid initialisation so the labels are the same on
# every run; n_init is set explicitly so the number of restarts does not depend
# on the installed scikit-learn version.
clustering_model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
clustering_model.fit(corpus_embeddings)
# One integer label per embedding, in the same order as corpus_embeddings.
cluster_assignment = clustering_model.labels_

In [13]:
# Show the label assigned to each corpus sentence (one label per sentence).
cluster_assignment

array([0, 0, 0, 1, 1, 3, 3, 4, 4, 2, 2])

In [15]:
# One empty bucket per cluster; the buckets are filled in the next cell.
clustering_sentences = [list() for _ in range(num_clusters)]
clustering_sentences

[[], [], [], [], []]

In [17]:
# Group the corpus sentences by their cluster label.
# The buckets are rebuilt from scratch here so that re-running this cell does
# not append every sentence a second time.
clustering_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustering_sentences[cluster_id].append(corpus[sentence_id])

In [18]:
# Print each bucket under a 1-based cluster heading.
for number, members in enumerate(clustering_sentences, start=1):
    print('cluster ', number)
    print(members)

cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']
cluster  2
['The girl is carrying a baby.', 'The baby is carried by the woman']
cluster  3
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']
cluster  4
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']
cluster  5
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']


The code below is for the case where the number of clusters is not known in advance.

In [21]:
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import numpy as np

In [23]:
# Instantiate the sentence encoder again, using the same checkpoint name as the
# earlier section of this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [24]:
import requests

# Download the corpus and turn it into a list of sentences.
# The timeout stops the cell from hanging on an unresponsive server, and
# raise_for_status() prevents an HTTP error page from being parsed as data.
response = requests.get('https://raw.githubusercontent.com/laxmimerit/machine-learning-dataset/refs/heads/master/text-dataset-for-machine-learning/sbert-corpus.txt', timeout=30)
response.raise_for_status()
# splitlines() copes with both '\r\n' and '\n' endings; blank lines (e.g. from
# a trailing newline) are dropped so they are not embedded as sentences.
corpus = [line for line in response.text.splitlines() if line.strip()]

In [25]:
# Encode the downloaded corpus; these embeddings are normalised and then
# clustered in the following cells.
corpus_embeddings = model.encode(corpus)

In [26]:
# Divide each embedding row by its own L2 norm (axis=1, kept as a column so it
# broadcasts across the row).
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings = corpus_embeddings / row_norms

In [51]:
# No cluster count is given (n_clusters=None); the cut is controlled by
# distance_threshold instead.
clustering_model = AgglomerativeClustering(
    distance_threshold=1.5,
    n_clusters=None,
).fit(corpus_embeddings)
# Label of each embedding, in input order.
cluster_assignment = clustering_model.labels_

In [52]:
# Show the label assigned to each corpus sentence (one label per sentence).
cluster_assignment

array([0, 0, 0, 4, 4, 1, 1, 2, 2, 3, 3], dtype=int64)

In [53]:
# List the distinct labels that occur in the assignment.
np.unique(cluster_assignment)

array([0, 1, 2, 3, 4], dtype=int64)

In [54]:
# The number of buckets equals the number of distinct labels in the assignment.
distinct_labels = np.unique(cluster_assignment)
num_clusters = distinct_labels.size
clustering_sentences = [list() for _ in range(num_clusters)]
clustering_sentences

[[], [], [], [], []]

In [55]:
# Group the corpus sentences by their cluster label.
# The buckets are rebuilt from scratch here so that re-running this cell does
# not append every sentence a second time.
clustering_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustering_sentences[cluster_id].append(corpus[sentence_id])

In [56]:
# Print each bucket under a 1-based cluster heading.
for number, members in enumerate(clustering_sentences, start=1):
    print('cluster ', number)
    print(members)

cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']
cluster  2
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']
cluster  3
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']
cluster  4
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']
cluster  5
['The girl is carrying a baby.', 'The baby is carried by the woman']


Fast Clustering on Quora Questions Set

In [57]:
from sentence_transformers import SentenceTransformer,util
import pandas as pd
import time

In [59]:
# Instantiate the sentence encoder again, using the same checkpoint name as the
# earlier sections of this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [64]:
# Tab-separated Quora question file.
# NOTE(review): the path is relative to the kernel's working directory and
# points inside `.venv` — confirm this is where the dataset should live.
quora_tsv_path = '.venv/NPL Projects/K-Means Clustering on text/quora_duplicate_questions.tsv'
df = pd.read_csv(quora_tsv_path, sep='\t')

In [65]:
# (rows, columns) of the loaded table.
df.shape

(404290, 6)

In [66]:
# Preview the first few rows of the loaded table.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0


In [67]:
# Keep the first 1000 entries of the `question1` column.
# The column is sliced before it is converted, so only the rows that are kept
# are turned into Python objects (not the whole table).
sentences = df['question1'].iloc[:1000].tolist()
len(sentences)

1000

In [70]:
# Encode the selected questions in batches of 64, showing a progress bar.
corpus_embeddings = model.encode(
    sentences,
    show_progress_bar=True,
    batch_size=64,
)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [71]:
# Group the embeddings into communities; communities with fewer than 5 members
# are not requested (min_community_size=5).
# NOTE(review): threshold=0.5 is presumably a similarity cut-off — confirm
# against the library documentation.
clusters = util.community_detection(
    corpus_embeddings,
    threshold=0.5,
    min_community_size=5,
)

In [72]:
# Inspect the raw result: a list of lists of integer indices, which the next
# cell uses to index into `sentences`.
clusters

[[304, 777, 870, 723, 978, 688, 92, 919, 607, 103],
 [439, 675, 321, 295, 689, 199, 877, 907, 105],
 [28, 78, 945, 284, 647, 564, 784, 273],
 [725, 733, 549, 726, 79, 590, 299],
 [140, 618, 287, 669, 598, 100],
 [263, 544, 93, 930, 401, 957],
 [644, 72, 969, 686, 364, 198],
 [734, 973, 752, 722, 895, 384],
 [49, 566, 967, 591, 302],
 [63, 3, 115, 218, 910],
 [233, 333, 425, 422, 419],
 [502, 532, 608, 852, 317],
 [540, 219, 703, 742, 858],
 [796, 996, 926, 612, 175]]

In [73]:
# Print a short preview of every community: its 1-based number, its size, and
# its first three member questions.
for i, cluster in enumerate(clusters):
    print("\ncluster {}, #{} Questions".format(i+1, len(cluster)))
    # Named `sentence_idx` rather than `id`: a loop variable at notebook scope
    # stays bound after the loop and would otherwise replace the builtin id()
    # for the rest of the kernel session.
    for sentence_idx in cluster[0:3]:
        print("\t", sentences[sentence_idx])
    print("\t", "...")
        


cluster 1, #10 Questions
	 Which are the best Hollywood thriller movies?
	 What are the most underrated and overrated movies you've seen?
	 What are the best films that take place in one room?
	 ...

cluster 2, #9 Questions
	 What are your views on Modi governments decision to demonetize 500 and 1000 rupee notes? How will this affect economy?
	 What's your opinion about the decision on removal of 500 and 1000 rupees currency notes?
	 How will Indian GDP be affected from banning 500 and 1000 rupees notes?
	 ...

cluster 3, #8 Questions
	 What is best way to make money online?
	 How can I make money through the Internet?
	 What are the easy ways to earn money online?
	 ...

cluster 4, #7 Questions
	 What are the most important things for living a good life?
	 What is most important in life - money or values?
	 What is the best lesson in life?
	 ...

cluster 5, #6 Questions
	 What is our stance against Pakistan?
	 What is the reason Pakistan supports terrorism?
	 If there will be a war b