# 2.1 SBERT Tutorial

In [1]:
import time

In [2]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Computing Sentence Embeddings

In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

(3, 384)
Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173468e-02 -4.28515449e-02 -1.56285837e-02  1.40537750e-02
  3.95538211e-02  1.21796295e-01  2.94333491e-02 -3.17524038e-02
  3.54959629e-02 -7.93140009e-02  1.75878983e-02 -4.04369719e-02
  4.97259349e-02  2.54912004e-02 -7.18701035e-02  8.14968795e-02
  1.47068314e-03  4.79626879e-02 -4.50336263e-02 -9.92174745e-02
 -2.81769708e-02  6.45046160e-02  4.44670804e-02 -4.76217009e-02
 -3.52952406e-02  4.38671410e-02 -5.28566092e-02  4.33077395e-04
  1.01921506e-01  1.64072327e-02  3.26996520e-02 -3.45986784e-02
  1.21339010e-02  7.94870853e-02  4.58346074e-03  1.57778412e-02
 -9.68204252e-03  2.87625883e-02 -5.05805947e-02 -1.55793512e-02
 -2.87906155e-02 -9.62282624e-03  3.15556899e-02  2.27348879e-02
  8.71449113e-02 -3.85027081e-02 -8.84718448e-02 -8.75496585e-03
 -2.12343428e-02  2.08923165e-02 -9.02077779e-02 -5.25732227e-02
 -1.05638811e-02  2.88310610e-02 -1.61455069e-02  6.17837021e-03


## Input Seq Len

In [4]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Max Sequence Length:", model.max_seq_length)

#Change the length to 200
model.max_seq_length = 200

print("Max Sequence Length:", model.max_seq_length)

Max Sequence Length: 256
Max Sequence Length: 200


## Store + Load Embeddings

In [5]:
from sentence_transformers import SentenceTransformer
import pickle

model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.']


embeddings = model.encode(sentences)
print(embeddings.shape)

#Store sentences & embeddings on disc
with open('embeddings.pkl', "wb") as fOut:
    pickle.dump({'sentences': sentences, 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

#Load sentences & embeddings from disc
with open('embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data['sentences']
    stored_embeddings = stored_data['embeddings']

(3, 384)


## Sentence Embedding w/ Transformers

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask



#Sentences we want sentence embeddings for
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']

#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [7]:
print(sentence_embeddings.shape)

torch.Size([3, 384])


# 2.2 SBERT Clustering Tutorials

In [8]:
from sentence_transformers import SentenceTransformer, util
import os
import csv
import time

## This step we are gonna prep the data

In [9]:
# We donwload the Quora Duplicate Questions Dataset (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)
# and find similar question in it
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 50000  # We limit our corpus to only the first 50k questions


# Check if the dataset exists. If not, download and extract
# Download dataset if needed
if not os.path.exists(dataset_path):
    print("Download dataset")
    util.http_get(url, dataset_path)

# Get all unique sentences from the file
corpus_sentences = set()
with open(dataset_path, encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        corpus_sentences.add(row['question1'])
        corpus_sentences.add(row['question2'])
        if len(corpus_sentences) >= max_corpus_size:
            break

corpus_sentences = list(corpus_sentences)

In [10]:
print(corpus_sentences[:5])

['What if I am boring?', 'What is the average/median CPM, CPC rate for K12 students?', 'How do animals living in soil get affected by heavy rain?', 'What are some truths about life?', 'If I smoked 2 weeks ago, how do I pass a drug test?']


In [11]:
print(len(corpus_sentences))

50001


In [12]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [13]:
corpus_embeddings = embedder.encode(corpus_sentences)

In [14]:
print(corpus_embeddings[:5])

[[ 0.05372459 -0.06891531  0.01492134 ... -0.02325333 -0.12786233
  -0.0756389 ]
 [ 0.08406261 -0.01325071 -0.0612961  ... -0.04815011 -0.03249244
   0.05737272]
 [ 0.00912334 -0.02651135  0.10546865 ...  0.00532189  0.07793394
  -0.01113939]
 [-0.07203567  0.04977514 -0.00698067 ...  0.04800481  0.01955936
  -0.00784752]
 [ 0.06261101  0.01837095 -0.006355   ... -0.01061986  0.01893065
  -0.03496853]]


## K Means

In [15]:
from sklearn.cluster import KMeans

In [16]:
start_time = time.time()
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
print(time.time() - start_time)
clustered_sentences = [[] for i in range(num_clusters)]

24.201020002365112


In [17]:
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus_sentences[sentence_id])

In [18]:
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")
    if i == 1:
      break

Cluster  1
['How do I invest ₹10,000?', 'How is discontinuing 500 and 1000 rupee note going to put a hold on black money in India?', 'Is Indra the most coward god in hindu mythology?', 'Why is Cricket not popular in US even though it was a British colony?', 'How does one locate their towed away scooter in an Indian police station, really quickly?', 'Daniel Ek: When will Spotify start its application in India?', 'What are some unknown facts of Jayalalitha(Amma)?', "Why didn't Tamil get world recognition as one of the oldest surviving languages?", 'How does banning 500 and 1000 rupee notes help to control black money?', 'What is a brief summary of "La rama seca"?', 'Can any body give meaning to the Sanskrit sloka known to be written by Kalidas " Shadja Madja Kharadja veedja vasuda …"?', 'How much salary does a software engineer gets per month in India?', 'Most of Indonesia is Muslim, so why is Bali Hindu?', 'What are some of the legal but unethical tax saving techniques in India?', 'Will

## Agglomerative Clustering
---
- NOTE: It was having an OOM issue for the 50k entries. I tried with 
different distance thresholds (1.5, 2, 3), and different embedding dimensions (in part 2.3, all the way from 120 -> 10). 
- I attempted a subset of the corpus_embeddings dataset just for the purpose of testing the algorithm (10k entries)

In [19]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering

In [20]:
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

In [21]:
corpus_embeddings = corpus_embeddings[:10000]

In [22]:
start_time = time.time()
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5) #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
print(time.time() - start_time)

31.65395450592041


In [23]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus_sentences[sentence_id])

In [24]:
print(clustered_sentences.items())



In [25]:
print(len(clustered_sentences.items()))

1709


In [26]:
for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['Which grand theft auto is your favourite?', "Which is the worst movie you've ever watched?", 'Which is the best Hollywood suspense movie that keeps you thrilled for almost a day?', 'What is the best trucking movie?', "Which movies are the best examples of the Hero's Journey?", "What's the best suspense movie in 2015?", 'What are some English movies that are about business and economics (like The Social Network, Steve Jobs, The Big Short etc.)?', 'What are some movies like "A Serbian Film"?', 'What movies have a scene where a prisoner, locked in his cell, is burned alive?', "What movie have you watched that made you think it's life changing?", 'What are the best and iconic movie posters ?', 'What are the top best psychological movies?', 'What are the best romcom movies?', 'What are the best James Bond movies?', 'Which major movies have the biggest plot holes?']

Cluster  967
['How is discontinuing 500 and 1000 rupee note

# 2.3 Embedding Dim Reduction

## PCA Reduction Part

In [27]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [28]:
new_dimension = 120

In [29]:
print(len(corpus_sentences))

50001


In [30]:
pca_train_sentences = corpus_sentences
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

In [31]:
print(train_embeddings.shape)

(50001, 384)


In [32]:
from sklearn.decomposition import PCA

In [33]:
pca = PCA(n_components=new_dimension)
principal_components = pca.fit_transform(train_embeddings)
# pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

In [34]:
print(principal_components.shape)

(50001, 120)


## PCA reduced dataset with kmeans

In [35]:
from sklearn.cluster import KMeans

In [36]:
start_time = time.time()
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(principal_components)
cluster_assignment = clustering_model.labels_
print(time.time() - start_time)
clustered_sentences = [[] for i in range(num_clusters)]

10.465742349624634


In [37]:
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus_sentences[sentence_id])

In [38]:
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")
    if i == 1:
      break

Cluster  1

Cluster  2
['What if I am boring?', 'If I smoked 2 weeks ago, how do I pass a drug test?', 'How can one earn money online without investment?', 'How do I get to speak fluently English?', 'What is the main cause of overheating of mobile phones and how to get rid of it?', 'The Pirate Bay: How can one lower their risk of being caught while continuing to torrent?', 'What can I do to help the situation in Aleppo?', 'How can I use Gmail / Google Account via TOR?', 'How do I Speak English fluently and build vocabulary?', 'How do you convert mL to grams?', 'How could the U.S. take over the world?', 'What should I do to be proactive at my new job?', 'How do I keep videos on iCloud after deleting them from my phone?', 'How can I make someone feel important?', 'What should I do to get 100% in CAT?', 'How do I build an online directory?', 'How do I get over the fact that the girl I loved so much cheated on me ?', 'How can someone intentionally kill themselves but make it look like an a

In [39]:
print(len(clustered_sentences))

5


## PCA reduced dataset with agglomerative clustering

In [40]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering

In [41]:
principal_components = principal_components /  np.linalg.norm(principal_components, axis=1, keepdims=True)
principal_components = principal_components[:10000]

In [42]:
print(principal_components.shape)

(10000, 120)


In [43]:
print(principal_components[1])

[ 3.71201128e-01 -2.25059524e-01 -1.43409818e-01 -9.19786543e-02
 -1.30088583e-01 -4.39970531e-02 -9.66496542e-02  4.55029644e-02
  4.40922342e-02 -9.99051183e-02 -1.57712936e-01  5.17651811e-02
 -4.04358767e-02 -1.49755940e-01 -7.39531964e-02  6.34270683e-02
 -1.11817993e-01  2.72723168e-01  5.70450574e-02 -2.42017675e-02
 -9.20803344e-04 -3.39384347e-01  1.35625929e-01  2.20730416e-02
  4.89134453e-02 -1.02752544e-01  5.54076508e-02 -1.66923106e-01
 -1.38763404e-02  3.99772301e-02 -2.40410026e-02  2.67651137e-02
  5.27425334e-02  1.53490469e-01  2.00362131e-02  7.66689330e-02
 -1.85042098e-01 -8.79468769e-02  1.00706354e-01  1.32612186e-03
 -8.44633579e-03  1.26606926e-01  1.52281940e-03  4.26556394e-02
 -4.85953838e-02  1.15382329e-01 -7.15628117e-02 -1.12744518e-01
  7.89257661e-02 -8.27217326e-02 -3.31235677e-02 -1.11598931e-01
 -4.55947295e-02 -3.31099592e-02  2.52584480e-02 -1.81820840e-02
  1.08766228e-01  5.60209341e-03 -4.75950763e-02 -5.48212305e-02
  5.96231855e-02 -1.03023

In [44]:
start_time = time.time()
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5) #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(principal_components)
cluster_assignment = clustering_model.labels_
print(time.time() - start_time)

11.293874740600586


In [45]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus_sentences[sentence_id])

In [46]:
print(clustered_sentences.items())



In [47]:
print(len(clustered_sentences.items()))

1614


In [48]:
for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  822
['What if I am boring?', 'What are some fun things to do when bored?', 'What are cool things to do when bored?', "What should I do when I'm bored?", 'What should I do to not be bored with my job?']

Cluster  1603
['What is the average/median CPM, CPC rate for K12 students?', 'What is the salary of an IAF officer after the 7th pay commission?', "How much does an Ola Mini cab earn on average per month from an investor's point of view in Mumbai?", 'How much do CPAs cost per hour on average?', 'What is the salary of LIC AAO?']

Cluster  1393
['How do animals living in soil get affected by heavy rain?', 'How do trees get water up to their leaves?', 'What are marshmallow trees?', 'What are examples of producers in a deciduous forest?']

Cluster  1331
['What are some truths about life?', 'What is the harsh truth of life that nobody can digest?', 'What makes life difficult?', 'What are some bitter truths about life in India?', "What are life's simple pleasures?"]

Cluster  81
['If

# 2.4 Real world applications

## Data proc.
First, there's an issue with the actual sentences themselves, so gotta resolve that. Should split into sentence and label based on the last comma

In [49]:
dataset_path = "./test.csv"

In [50]:
test_sentences = list()
test_labels = dict()
with open(dataset_path, encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        split_row = row['text,category'].rsplit(',', 1)
        test_sentences.append(split_row[0])
        test_labels[split_row[0]] = split_row[1]

In [51]:
len(test_sentences)

3080

In [52]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [53]:
test_embeddings = model.encode(test_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)

Batches:   0%|          | 0/49 [00:00<?, ?it/s]

In [54]:
test_embeddings.shape

torch.Size([3080, 384])

## This is the fast clustering algo section 

In [55]:
from sentence_transformers import SentenceTransformer, util
import time

In [56]:
start_time = time.time()
clusters = util.community_detection(test_embeddings, min_community_size=25, threshold=0.75)
print(time.time() - start_time)

0.2782862186431885


In [57]:
pred_labels = list()
actual_labels = list()
for i, cluster in enumerate(clusters):
    # print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
    for sentence_id in cluster:
      pred_labels.append(i+1) # Cluster label
      actual_labels.append(test_labels[test_sentences[sentence_id]])

In [58]:
from sklearn.metrics.cluster import normalized_mutual_info_score
print(normalized_mutual_info_score(actual_labels, pred_labels))

0.8701767165285461


In [59]:
from sklearn.metrics.cluster import adjusted_rand_score
print(adjusted_rand_score(actual_labels, pred_labels))

0.6595287776974524
