In [29]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans as sk_kmeans
from nltk.cluster.kmeans import KMeansClusterer as nl_kmeans
from sklearn.cluster import AgglomerativeClustering
from nltk.cluster.util import cosine_distance
import numpy as np
import pandas as pd
from sklearn import metrics

import torch
from transformers import DistilBertTokenizer, DistilBertModel

In [30]:
file = open('20docs3UnDif.txt', 'r')
corpus = [line.strip() for line in file]
file.close()

In [31]:
#'comp.graphics', 'rec.autos', 'sci.med',  'talk.politics.mideast'
categories = ['talk.politics.guns', 'talk.politics.mideast',  'talk.politics.misc']
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'), categories=categories)
Y = newsgroups.target

Distil Bert

In [32]:
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
tokens = [tokenizer(i, return_tensors='pt', padding=True, truncation=True) for i in corpus]

In [33]:
model = DistilBertModel.from_pretrained(model_name)
X = np.array([[]])
for i in tokens:
    with torch.no_grad():
        outputs = model(**i)
        embedding = outputs.last_hidden_state[:, 0, :]
    X = np.append(X,embedding)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
X = np.reshape(X, (2625,768))
iter = 30
a_rand = np.zeros((2,iter))
v_measure = np.zeros((2,iter))
mutual = np.zeros((2,iter))
fowlkes = np.zeros((2,iter))

In [35]:
for i in range(iter):
    eukl_pred = sk_kmeans(n_clusters=3, init='k-means++', n_init='auto').fit(X)
    eucl_pred = eukl_pred.labels_
    a_rand[0][i] = metrics.rand_score(Y, eucl_pred)
    v_measure[0][i] = metrics.v_measure_score(Y,eucl_pred)
    mutual[0][i] = metrics.adjusted_mutual_info_score(Y,eucl_pred)
    fowlkes[0][i] = metrics.fowlkes_mallows_score(Y, eucl_pred)

    nl_clusterer = nl_kmeans(3, distance=cosine_distance, avoid_empty_clusters=True)
    cos_pred = nl_clusterer.cluster(X, assign_clusters=True)
    cos_pred = np.array(cos_pred)
    a_rand[1][i] = metrics.rand_score(Y, cos_pred)
    v_measure[1][i] = metrics.v_measure_score(Y, cos_pred)
    mutual[1][i] = metrics.adjusted_mutual_info_score(Y, cos_pred)
    fowlkes[1][i] = metrics.fowlkes_mallows_score(Y, cos_pred)

In [36]:
evcl = pd.DataFrame({'a_rand': [np.min(a_rand[0]), np.mean(a_rand[0]), np.max(a_rand[0])],
        'v_measure': [np.min(v_measure[0]), np.mean(v_measure[0]), np.max(v_measure[0])],
        'mutual': [np.min(mutual[0]), np.mean(mutual[0]), np.max(mutual[0])],
        'fowlkes': [np.min(fowlkes[0]), np.mean(fowlkes[0]), np.max(fowlkes[0])]},
        index=['min', 'avrg', 'max'])
evcl

Unnamed: 0,a_rand,v_measure,mutual,fowlkes
min,0.559089,0.012802,0.011842,0.330569
avrg,0.562149,0.013613,0.012647,0.335809
max,0.56524,0.014786,0.013811,0.342324


In [37]:
cos = pd.DataFrame({'a_rand': [np.min(a_rand[1]), np.mean(a_rand[1]), np.max(a_rand[1])],
        'v_measure': [np.min(v_measure[1]), np.mean(v_measure[1]), np.max(v_measure[1])],
        'mutual': [np.min(mutual[1]), np.mean(mutual[1]), np.max(mutual[1])],
        'fowlkes': [np.min(fowlkes[1]), np.mean(fowlkes[1]), np.max(fowlkes[1])]},
        index=['min', 'avrg', 'max'])
cos

Unnamed: 0,a_rand,v_measure,mutual,fowlkes
min,0.567879,0.0141,0.013146,0.326117
avrg,0.569074,0.014188,0.013238,0.326707
max,0.569713,0.014226,0.013277,0.32931


In [38]:
a_rand_h = np.zeros((2,4))
v_measure_h = np.zeros((2,4))
mutual_h = np.zeros((2,4))
fowlkes_h = np.zeros((2,4))

In [39]:
linkage = ['ward', 'complete', 'average', 'single']
for i,link in enumerate(linkage):
    hierachical = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X)
    eucl_pred = hierachical.labels_
    a_rand_h[0][i] = metrics.rand_score(Y, eucl_pred)
    v_measure_h[0][i] = metrics.adjusted_rand_score(Y, eucl_pred)
    mutual_h[0][i] = metrics.homogeneity_score(Y, eucl_pred)
    fowlkes_h[0][i] = metrics.completeness_score(Y, eucl_pred)

In [40]:
linkage = ['complete', 'average', 'single']
for i,link in enumerate(linkage):
    hierachical = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X)
    eucl_pred = hierachical.labels_
    a_rand_h[1][i] = metrics.rand_score(Y, eucl_pred)
    v_measure_h[1][i] = metrics.adjusted_rand_score(Y, eucl_pred)
    mutual_h[1][i] = metrics.homogeneity_score(Y, eucl_pred)
    fowlkes_h[1][i] = metrics.completeness_score(Y, eucl_pred)

In [41]:
eucl_h = pd.DataFrame({'a_rand': a_rand_h[0],
        'v_measure': v_measure_h[0],
        'mutual': mutual_h[0],
        'fowlkes': fowlkes_h[0]},
        index=['ward', 'complete', 'average', 'single'])
eucl_h

Unnamed: 0,a_rand,v_measure,mutual,fowlkes
ward,0.531149,0.003994,0.011266,0.011168
complete,0.47491,0.004026,0.003986,0.005737
average,0.470537,0.005092,0.003657,0.006511
single,0.336181,0.00016,0.001218,0.131524


In [42]:
cos_h = pd.DataFrame({'a_rand': a_rand_h[1],
        'v_measure': v_measure_h[1],
        'mutual': mutual_h[1],
        'fowlkes': fowlkes_h[1]},
        index=['complete', 'average', 'single', 'ward'])
cos_h

Unnamed: 0,a_rand,v_measure,mutual,fowlkes
complete,0.499025,0.008469,0.007098,0.009142
average,0.440731,0.002833,0.002213,0.004665
single,0.336163,0.000134,0.001206,0.130305
ward,0.0,0.0,0.0,0.0
