In [31]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans as sk_kmeans
from nltk.cluster.kmeans import KMeansClusterer as nl_kmeans
from sklearn.cluster import AgglomerativeClustering
from nltk.cluster.util import cosine_distance
import numpy as np
import pandas as pd
from sklearn import metrics

import torch
from transformers import BertTokenizer, BertModel

In [32]:
file = open('20docs3UnDif.txt', 'r')
corpus = [line.strip() for line in file]
file.close()

In [33]:
#'comp.graphics', 'rec.autos', 'sci.med',  'talk.politics.mideast'
categories = ['talk.politics.guns', 'talk.politics.mideast',  'talk.politics.misc']
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'), categories=categories)
Y = newsgroups.target

Base Uncased

In [34]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
tokens = [tokenizer(i, return_tensors='pt', padding=True, truncation=True) for i in corpus]

In [35]:
model = BertModel.from_pretrained(model_name)
X = np.array([[]])
for i in tokens:
    with torch.no_grad():
        outputs = model(**i)
        embedding = outputs.last_hidden_state[:, 0, :]
    X = np.append(X,embedding)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
X = np.reshape(X, (2625,768))
iter = 30
a_rand = np.zeros((2,iter))
v_measure = np.zeros((2,iter))
mutual = np.zeros((2,iter))
fowlkes = np.zeros((2,iter))

In [37]:
for i in range(iter):
    eukl_pred = sk_kmeans(n_clusters=3, init='k-means++', n_init='auto').fit(X)
    eucl_pred = eukl_pred.labels_
    a_rand[0][i] = metrics.rand_score(Y, eucl_pred)
    v_measure[0][i] = metrics.v_measure_score(Y,eucl_pred)
    mutual[0][i] = metrics.adjusted_mutual_info_score(Y,eucl_pred)
    fowlkes[0][i] = metrics.fowlkes_mallows_score(Y, eucl_pred)

    nl_clusterer = nl_kmeans(3, distance=cosine_distance, avoid_empty_clusters=True)
    cos_pred = nl_clusterer.cluster(X, assign_clusters=True)
    cos_pred = np.array(cos_pred)
    a_rand[1][i] = metrics.rand_score(Y, cos_pred)
    v_measure[1][i] = metrics.v_measure_score(Y, cos_pred)
    mutual[1][i] = metrics.adjusted_mutual_info_score(Y, cos_pred)
    fowlkes[1][i] = metrics.fowlkes_mallows_score(Y, cos_pred)

In [38]:
evcl = pd.DataFrame({'a_rand': [np.min(a_rand[0]), np.mean(a_rand[0]), np.max(a_rand[0])],
        'v_measure': [np.min(v_measure[0]), np.mean(v_measure[0]), np.max(v_measure[0])],
        'mutual': [np.min(mutual[0]), np.mean(mutual[0]), np.max(mutual[0])],
        'fowlkes': [np.min(fowlkes[0]), np.mean(fowlkes[0]), np.max(fowlkes[0])]},
        index=['min', 'avrg', 'max'])
evcl

Unnamed: 0,a_rand,v_measure,mutual,fowlkes
min,0.55388,0.023583,0.022573,0.356038
avrg,0.55648,0.025223,0.02422,0.358542
max,0.558865,0.026517,0.025518,0.362367


In [39]:
cos = pd.DataFrame({'a_rand': [np.min(a_rand[1]), np.mean(a_rand[1]), np.max(a_rand[1])],
        'v_measure': [np.min(v_measure[1]), np.mean(v_measure[1]), np.max(v_measure[1])],
        'mutual': [np.min(mutual[1]), np.mean(mutual[1]), np.max(mutual[1])],
        'fowlkes': [np.min(fowlkes[1]), np.mean(fowlkes[1]), np.max(fowlkes[1])]},
        index=['min', 'avrg', 'max'])
cos

Unnamed: 0,a_rand,v_measure,mutual,fowlkes
min,0.55378,0.02277,0.021764,0.356187
avrg,0.554969,0.023509,0.022504,0.357394
max,0.555907,0.024146,0.02314,0.358678


In [40]:
a_rand_h = np.zeros((2,4))
v_measure_h = np.zeros((2,4))
mutual_h = np.zeros((2,4))
fowlkes_h = np.zeros((2,4))

In [41]:
linkage = ['ward', 'complete', 'average', 'single']
for i,link in enumerate(linkage):
    hierachical = AgglomerativeClustering(n_clusters=3, linkage=link).fit(X)
    eucl_pred = hierachical.labels_
    a_rand_h[0][i] = metrics.rand_score(Y, eucl_pred)
    v_measure_h[0][i] = metrics.adjusted_rand_score(Y, eucl_pred)
    mutual_h[0][i] = metrics.homogeneity_score(Y, eucl_pred)
    fowlkes_h[0][i] = metrics.completeness_score(Y, eucl_pred)

In [42]:
linkage = ['complete', 'average', 'single']
for i,link in enumerate(linkage):
    hierachical = AgglomerativeClustering(n_clusters=3, linkage=link, metric='cosine').fit(X)
    eucl_pred = hierachical.labels_
    a_rand_h[1][i] = metrics.rand_score(Y, eucl_pred)
    v_measure_h[1][i] = metrics.adjusted_rand_score(Y, eucl_pred)
    mutual_h[1][i] = metrics.homogeneity_score(Y, eucl_pred)
    fowlkes_h[1][i] = metrics.completeness_score(Y, eucl_pred)

In [43]:
eucl_h = pd.DataFrame({'a_rand': a_rand_h[0],
        'v_measure': v_measure_h[0],
        'mutual': mutual_h[0],
        'fowlkes': fowlkes_h[0]},
        index=['ward', 'complete', 'average', 'single'])
eucl_h

Unnamed: 0,a_rand,v_measure,mutual,fowlkes
ward,0.525782,0.012045,0.01756,0.018944
complete,0.519214,0.027639,0.04387,0.050244
average,0.354026,-0.000114,0.000738,0.005921
single,0.354008,-0.000141,0.000726,0.005827


In [44]:
cos_h = pd.DataFrame({'a_rand': a_rand_h[1],
        'v_measure': v_measure_h[1],
        'mutual': mutual_h[1],
        'fowlkes': fowlkes_h[1]},
        index=['complete', 'average', 'single', 'ward'])
cos_h

Unnamed: 0,a_rand,v_measure,mutual,fowlkes
complete,0.517154,0.028376,0.022631,0.030674
average,0.354008,-0.000141,0.000726,0.005827
single,0.354026,-0.000114,0.000738,0.005921
ward,0.0,0.0,0.0,0.0
