# K-Means Clustering on Text Data

In [None]:
from sklearn.cluster import KMeans

In [None]:
# SentenceTransformer is imported only further down in the file; importing it
# here lets this cell run in top-to-bottom order without a NameError.
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' model that is used to embed the corpus sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import requests

# Download the example corpus: a plain-text file with one sentence per line.
response = requests.get(
    'https://raw.githubusercontent.com/laxmimerit/machine-learning-dataset/master/text-dataset-for-machine-learning/sbert-corpus.txt',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Fail loudly on an HTTP error instead of clustering the text of an error page.
response.raise_for_status()
# splitlines() copes with '\r\n', '\n' and '\r' line endings alike; blank lines
# are skipped so a trailing newline cannot add an empty "sentence".
corpus = [line for line in response.text.splitlines() if line.strip()]

In [None]:
# Display the number of sentences and print the corpus itself. print() returns
# None, which is why the displayed tuple has None as its second element.
len(corpus), print(corpus)

['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.', 'The girl is carrying a baby.', 'The baby is carried by the woman', 'A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.', 'A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']


(11, None)

In [None]:
# Encode every sentence of the corpus with the model; the resulting embeddings
# are what the K-Means model is fitted on.
corpus_embeddings = model.encode(corpus)

In [None]:
# Partition the sentence embeddings into a fixed number of K-Means clusters.
num_clusters = 5
clustering_model = KMeans(
    n_clusters=num_clusters,
    n_init=10,        # set explicitly: the library default differs between scikit-learn releases
    random_state=42,  # fixed seed so the cluster ids are the same on every run
)
clustering_model.fit(corpus_embeddings)
# labels_ holds one cluster index per fitted embedding, in input order.
cluster_assignment = clustering_model.labels_

In [None]:
# Show the cluster index assigned to each sentence, in corpus order.
cluster_assignment

array([1, 1, 1, 0, 0, 3, 3, 4, 4, 2, 2], dtype=int32)

In [None]:
# Start with one empty bucket per cluster; the buckets are filled with the
# sentences of each cluster afterwards.
clustered_sentences = [list() for _ in range(num_clusters)]
clustered_sentences

[[], [], [], [], []]

In [None]:
# Walk the assignments in corpus order and drop each sentence into the bucket
# of the cluster it was assigned to.
for position, label in enumerate(cluster_assignment):
    bucket = clustered_sentences[label]
    bucket.append(corpus[position])

In [None]:
# Print every cluster under a 1-based heading, followed by a blank line.
for cluster_number, members in enumerate(clustered_sentences, start=1):
    print("cluster", cluster_number)
    print(members)
    print()

cluster 1
['The girl is carrying a baby.', 'The baby is carried by the woman']

cluster 2
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

cluster 3
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

cluster 4
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

cluster 5
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']



# Fast Clustering
* Agglomerative clustering is quite slow on larger datasets, so it is only practical for up to a few thousand sentences.

* In fast_clustering.py we present a clustering algorithm that is tuned for large datasets (50k sentences in less than 5 seconds). In a large list of sentences it searches for local communities: A local community is a set of highly similar sentences.

* You can configure the threshold of cosine-similarity for which we consider two sentences as similar. Also, you can specify the minimal size for a local community. This allows you to get either large coarse-grained clusters or small fine-grained clusters.

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
import pandas as pd 
import time

In [None]:
# Load the 'all-MiniLM-L6-v2' model that is used to embed the questions.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Read the Quora duplicate-questions file (tab-separated) into a DataFrame.
# NOTE(review): the file is fetched over plain HTTP — confirm whether an HTTPS
# endpoint is available before relying on this in anything but a demo.
df = pd.read_csv('http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv', sep='\t')

In [None]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

(404290, 6)

In [None]:
# Work on the first 1000 questions only. Missing values are dropped first so
# that no NaN entry is handed to the encoder, and the slice is taken before
# converting to a list so the whole column is not materialised just to be cut.
sentences = df['question1'].dropna().head(1000).tolist()
len(sentences)

1000

In [None]:
# Embed all sentences in batches of 64 while showing a progress bar.
# convert_to_tensor=True asks for a tensor result, which is what gets passed
# to util.community_detection afterwards.
corpus_embeddings = model.encode(sentences, batch_size = 64, show_progress_bar=True,convert_to_tensor=True)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# Detect communities of similar sentences: 0.5 is the similarity threshold and
# communities with fewer than 5 members are not wanted.
clusters = util.community_detection(corpus_embeddings, min_community_size=5,threshold=0.5)

In [None]:
# Print a short preview of every detected cluster: a 1-based heading with the
# cluster size, then its first three questions and an ellipsis line.
for cluster_number, cluster in enumerate(clusters, start=1):
    print(f"\ncluster {cluster_number}, #{len(cluster)} Questions")
    # Named `sentence_id` rather than `id`: at notebook top level a loop
    # variable called `id` would replace the builtin id() for all later cells.
    for sentence_id in cluster[0:3]:
        print('\t', sentences[sentence_id])
    print("\t", "...")


cluster 1, #10 Questions
	 What are some of the best romantic movies in English?
	 Which is the best fiction novel of 2016?
	 Which are the best Hollywood thriller movies?
	 ...

cluster 2, #9 Questions
	 Will the recent demonetisation results in higher GDP? If so how much?
	 What are the effects of demonitization of 500 and 1000 rupees notes on real estate sector?
	 What will be the effect of banning 500 and 1000 notes on stock markets in India?
	 ...

cluster 3, #8 Questions
	 What is best way to make money online?
	 How can I make money through the Internet?
	 What is the best way to get traffic on your website?
	 ...

cluster 4, #7 Questions
	 What is purpose of life?
	 What the meaning of this all life?
	 What is the best lesson in life?
	 ...

cluster 5, #6 Questions
	 Will there really be any war between India and Pakistan over the Uri attack? What will be its effects?
	 What is our stance against Pakistan?
	 If there will be a war between India and Pakistan who will win?
	 ...