In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans


In [2]:
def readDocuments(doc_file):
    docs = []
    labels = []
    with open(doc_file,encoding='utf-8') as f:
        for line in f:
            label,_,_,doc = line.strip().split(maxsplit=3)
            docs.append(doc)
            labels.append(label)
            
    return docs,labels

In [3]:
from collections import Counter

def purity(labels, clustered):
    
    #* find the set of cluster ids
    cluster_ids = set(clustered)
    
    N = len(clustered)
    majority_sum = 0
    
    for cl in cluster_ids:
        
        #* for this cluster, we compute the frequencies of the different human labels we encounter 
        #* the result will be something like {'camera':5, 'books':1,'software':3} etc 
        labels_cl = Counter(l for l,c in zip(labels,clustered) if c==cl)
        
        #* We select the highest score and add it to the total sum 
        majority_sum += max(labels_cl.values())
        
        
    #* The purity score is the sum of majority counts divided by the total number of items 
    return majority_sum/N

In [7]:
docs, labels = readDocuments('text_to_cluster.txt')
labels[1],docs[1]

('music',
 'i was misled and thought i was buying the entire cd and it contains one song')

In [8]:
vectorizer = TfidfVectorizer(stop_words='english')
doc_matrix = vectorizer.fit_transform(docs)

In [18]:
doc_matrix[3].shape

(1, 46619)

In [9]:
clusterer = KMeans(n_clusters=7,verbose=True)

In [19]:
clustered_docs = clusterer.fit(doc_matrix)

Initialization complete
Iteration 0, inertia 22649.405654497605.
Iteration 1, inertia 11645.64819270793.
Iteration 2, inertia 11583.280634563964.
Iteration 3, inertia 11567.178491705683.
Iteration 4, inertia 11559.135645371907.
Iteration 5, inertia 11556.143446157923.
Iteration 6, inertia 11553.856896596888.
Iteration 7, inertia 11552.342369276237.
Iteration 8, inertia 11551.205964062436.
Iteration 9, inertia 11550.3279566428.
Iteration 10, inertia 11549.466110412322.
Iteration 11, inertia 11548.574437187888.
Iteration 12, inertia 11547.566463701593.
Iteration 13, inertia 11546.659999086829.
Iteration 14, inertia 11545.978772510034.
Iteration 15, inertia 11545.596583057883.
Iteration 16, inertia 11545.346095437622.
Iteration 17, inertia 11545.177157191605.
Iteration 18, inertia 11545.05968233712.
Iteration 19, inertia 11544.9489254939.
Iteration 20, inertia 11544.851695378607.
Iteration 21, inertia 11544.773638596103.
Iteration 22, inertia 11544.696777730489.
Iteration 23, inertia 1154

In [21]:
clustered_docs = clusterer.fit_predict(doc_matrix)

Initialization complete
Iteration 0, inertia 22843.45488304372.
Iteration 1, inertia 11649.19908114195.
Iteration 2, inertia 11573.103364292014.
Iteration 3, inertia 11545.293387540625.
Iteration 4, inertia 11539.139768945199.
Iteration 5, inertia 11537.004431983336.
Iteration 6, inertia 11535.901061448298.
Iteration 7, inertia 11535.325428447106.
Iteration 8, inertia 11535.00433776055.
Iteration 9, inertia 11534.819136798666.
Iteration 10, inertia 11534.672136687424.
Iteration 11, inertia 11534.525797389731.
Iteration 12, inertia 11534.437648002187.
Iteration 13, inertia 11534.351915640193.
Iteration 14, inertia 11534.2726518539.
Iteration 15, inertia 11534.173487349346.
Iteration 16, inertia 11534.04106837898.
Iteration 17, inertia 11533.898468648014.
Iteration 18, inertia 11533.784015867637.
Iteration 19, inertia 11533.711565940388.
Iteration 20, inertia 11533.68273125722.
Iteration 21, inertia 11533.662652059285.
Iteration 22, inertia 11533.651782916606.
Iteration 23, inertia 11533

In [22]:
purity(labels,clustered_docs)

0.6653516870908175

In [23]:
from sklearn.metrics.cluster import adjusted_rand_score

adjusted_rand_score(labels,clustered_docs)

0.3073981690353386

## Lets first define a distilBERT Transformer using Huggingface sytle code

In [24]:
!pip install transformers --quiet

In [25]:
from transformers import AutoModel,AutoTokenizer
import torch.nn as nn 
import torch

In [26]:
class Model(nn.Module):
    def __init__(self, checkpoint, freeze=False,device='cuda'):
        super().__init__()
        
        self.model = AutoModel.from_pretrained(checkpoint) #* path to the desired model 
        hidden_sz = self.model.config.hidden_size
        
        # set device cuda or cpu 
        self.device = device 
        
        #* freeze model 
        if freeze:
            for layer in self.model.parameters():
                layer.requires_grad = False
                
    def forward(self,x,attention_mask=None):
        
        x = x.to(self.device)
        #* pooler_output(seq,dim)
        with torch.no_grad():
            model_out = self.model(x['input_ids'], x['attention_mask'],return_dict = True)
            
        embds = model_out.last_hidden_state # model_out[0][:,0]
        mean_pool = embds.sum(axis=1)/x['attention_mask'].sum(axis=1).unsqueeze(axis=1)
        return mean_pool

In [27]:
checkpoint = 'distilbert-base-uncased'
distilbert = Model(checkpoint=checkpoint,freeze=True)
distilbert.to('cuda')

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json: 100%|██████████| 483/483 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
model.safetensors: 100%|██████████| 268M/268M [00:33<00:00, 7.98MB/s] 
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 27.3kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 795kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 3.50MB/s]


In [28]:
final_embeddings = list()
all_embeddings = []

final_sentences = docs

batch_size = 200
for idx in range(0,len(final_sentences),batch_size):
    batch_sentences = final_sentences[idx:idx+batch_size]
    for sent in batch_sentences:
        tokens = tokenizer(sent,truncation='longest_first',return_tensors='pt',return_attention_mask=True,padding=True)
        embedding = distilbert(tokens)
        final_embeddings.extend(embedding)
        all_embeddings = torch.stack(final_embeddings)
        

In [29]:
clustered_docs = clusterer.fit_predict(all_embeddings.cpu())


Initialization complete
Iteration 0, inertia 153052.2008887094.
Iteration 1, inertia 100596.89611935118.
Iteration 2, inertia 98634.30119654113.
Iteration 3, inertia 97333.19172882223.
Iteration 4, inertia 96584.58026631159.
Iteration 5, inertia 96316.3166051021.
Iteration 6, inertia 96209.70803980563.
Iteration 7, inertia 96148.79743364117.
Iteration 8, inertia 96108.32295662361.
Iteration 9, inertia 96077.6633120169.
Iteration 10, inertia 96048.50767093543.
Iteration 11, inertia 96020.20751152062.
Iteration 12, inertia 95998.29192750846.
Iteration 13, inertia 95981.08595031193.
Iteration 14, inertia 95967.57356690906.
Iteration 15, inertia 95957.60629427517.
Iteration 16, inertia 95949.69181953414.
Iteration 17, inertia 95944.19594439716.
Iteration 18, inertia 95941.29084667818.
Iteration 19, inertia 95939.92474811812.
Iteration 20, inertia 95939.40020773059.
Iteration 21, inertia 95938.76313925209.
Iteration 22, inertia 95938.21318149686.
Iteration 23, inertia 95937.89436141809.
Ite

In [30]:
purity(labels,clustered_docs)

0.7824408259190868

In [31]:
adjusted_rand_score(labels,clustered_docs)

0.5507564962031405