In [1]:
import os

from sklearn.datasets import fetch_20newsgroups

import torch
from transformers import AutoTokenizer, AutoModel

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [3]:
import numpy as np
import pandas as pd

In [12]:
# Pretrained checkpoints to evaluate; each name is passed to `main` in turn.
models = [
    # "bert-base-uncased",  # currently excluded from the sweep
    "roberta-base",
    "distilbert-base-uncased",
]

In [5]:
# Output locations: results are written under `folder`, in one sub-folder per
# model, with one spreadsheet per clustering family.
folder = "./20news_bert/"
kmeans_res_path = "kmeans_res_20News_hard_bert.xlsx"
hierarchy_res_path = "hierarchy_res_20News_hard_bert.xlsx"

# Two candidate category subsets; only `categories_hard` is loaded below.
categories_light = [
    "comp.graphics",
    "rec.autos",
    "sci.med",
    "talk.politics.mideast",
]
categories_hard = [
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
]

# Training split with headers, footers and quoted text removed.
newsgroups = fetch_20newsgroups(
    subset="train",
    categories=categories_hard,
    remove=("headers", "footers", "quotes"),
)

In [6]:
def preprocess_text(text):
    """Normalise raw text into a space-separated string of content words.

    The text is split with ``word_tokenize``; tokens that are not purely
    alphanumeric are discarded, the remaining ones are lower-cased, and
    English stop words are removed.

    Args:
        text: Raw document string.

    Returns:
        The surviving tokens joined by single spaces (``""`` when none survive).
    """
    raw_tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    kept = []
    for raw_token in raw_tokens:
        # The alphanumeric test is applied to the token as written, before lower-casing.
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        if lowered not in english_stop_words:
            kept.append(lowered)

    return ' '.join(kept)

In [7]:
def filter_text_by_pos(text, pos_to_keep):
    """Keep only the tokens of ``text`` whose POS tag is in ``pos_to_keep``.

    Args:
        text: Input string; tokenized with ``word_tokenize`` and tagged with
            ``pos_tag``.
        pos_to_keep: Container of tag strings to retain (callers in this
            notebook pass lists such as ``['NN', 'NNS']``).

    Returns:
        The retained tokens, in their original order, joined by single spaces.
    """
    tagged = pos_tag(word_tokenize(text))
    return ' '.join(word for word, tag in tagged if tag in pos_to_keep)

In [8]:
def cluster_kmeans(matrix, metrics, true_labels, num_clusters=2, num_iterations=50):
    """Run K-means repeatedly and summarise each metric over the runs.

    K-means is fitted ``num_iterations`` times on ``matrix``, using the run
    index as ``random_state`` so every run starts from a different but
    reproducible initialisation. Each metric is evaluated on every run and the
    max / min / mean over the runs is reported.

    Args:
        matrix: Data handed to ``KMeans.fit_predict``. Callers in this notebook
            pass either the embedding matrix or a pairwise distance matrix.
        metrics: Callables ``metric(true_labels, predicted_labels)``; their
            ``__name__`` is used as the heading in the report.
        true_labels: Ground-truth labels, one per row of ``matrix``.
        num_clusters: Number of clusters to fit.
        num_iterations: Number of independently seeded K-means runs.

    Returns:
        A text report with one ``name / Max / Min / AVG`` section per metric.
        The same report is printed.
    """
    scores = {metric.__name__: [] for metric in metrics}

    for seed in range(num_iterations):
        predicted = KMeans(n_clusters=num_clusters, random_state=seed, n_init=10).fit_predict(matrix)

        for metric in metrics:
            scores[metric.__name__].append(metric(true_labels, predicted))

    kmeans_res = ""
    for name, values in scores.items():
        # Adjacent f-strings rather than backslash continuations: a continuation
        # inside a string literal pulls the next line's indentation into the
        # text, which padded every Max/Min line with a long run of spaces.
        kmeans_res += (
            f"\n{name} "
            f"\nMax: {np.max(values)} "
            f"\nMin: {np.min(values)} "
            f"\nAVG: {np.mean(values)} \n"
        )

    print(kmeans_res)
    return kmeans_res

In [9]:
def cluster_hierarchy(matrix, metrics, true_labels, num_clusters=2, precomputed=None):
    """Run agglomerative clustering with several linkages and report metrics.

    Args:
        matrix: Either a feature matrix (rows are samples) or a square
            pairwise distance matrix. Must expose ``.shape``.
        metrics: Callables ``metric(true_labels, predicted_labels)``; their
            ``__name__`` labels the score in the report.
        true_labels: Ground-truth labels, one per row of ``matrix``.
        num_clusters: Number of clusters to cut the hierarchy into.
        precomputed: ``True`` to treat ``matrix`` as pairwise distances,
            ``False`` to treat it as features. ``None`` (default) keeps the
            original heuristic: a square matrix is assumed to be a distance
            matrix. Pass an explicit value when a feature matrix may happen
            to be square (number of samples equal to number of features).

    Returns:
        A text report with one section per linkage listing every metric.
        The same report is printed.
    """
    linkages = ["complete", "average", "single"]

    if precomputed is None:
        # Shape-based guess, kept as the default for backward compatibility.
        precomputed = matrix.shape[0] == matrix.shape[1]

    if precomputed:
        affinity = "precomputed"
    else:
        affinity = "euclidean"
        # matrix = matrix.toarray()
        # Ward linkage is only tried on raw features with the Euclidean metric.
        linkages.append("ward")

    hierarchy_res = ""
    for linkage in linkages:
        hierarchy_res += f"\n{linkage}"

        agg_clustering = AgglomerativeClustering(n_clusters=num_clusters, metric=affinity, linkage=linkage)
        predicted = agg_clustering.fit_predict(matrix)

        for metric in metrics:
            hierarchy_res += f"\n{metric.__name__}: {metric(true_labels, predicted)}"

        hierarchy_res += "\n"

    print(hierarchy_res)
    return hierarchy_res

In [10]:
def _embed_texts(texts, tokenizer, model):
    """Embed each text as the mean of the model's last hidden state.

    Texts are encoded one at a time and progress is printed as a percentage
    on a single, carriage-return-refreshed line.

    Returns:
        A NumPy array with one row per text.
    """
    total = len(texts)
    pooled = []
    for done, sentence in enumerate(texts, start=1):
        tokens = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

        # Inference only: no gradients are needed for the pooled embedding.
        with torch.no_grad():
            sentence_vector = model(**tokens).last_hidden_state.mean(dim=1)

        print(str(round(done/total*100, 2)) + "%", end='\r', flush=True)
        pooled.append(sentence_vector.flatten())

    # Convert once, so downstream NumPy / scikit-learn calls do not each have
    # to re-convert a Python list of tensors.
    return torch.stack(pooled).cpu().numpy()


def main(dataset, model_name):
    """Embed a labelled text dataset with one model and cluster the embeddings.

    The documents are preprocessed, then three part-of-speech-filtered
    variants are derived (nouns, adjectives, nouns and adjectives). Each
    variant is embedded and clustered with K-means and agglomerative
    clustering on three inputs: the raw embeddings ("none"), their Euclidean
    distance matrix and their cosine distance matrix.

    Args:
        dataset: Object exposing ``data`` (texts), ``target`` (labels) and
            ``target_names`` (whose length sets the number of clusters).
        model_name: Checkpoint name passed to ``AutoTokenizer`` and
            ``AutoModel``.

    Returns:
        ``(kmeans_data, hierarchy_data)``: two DataFrames indexed by text
        variant, with one column per input representation, whose cells hold
        the text reports returned by the clustering helpers.
    """
    true_labels = dataset.target
    num_clusters = len(dataset.target_names)
    metrics = [normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score]

    # Maps each column name to the transform applied to the embedding matrix.
    # NOTE(review): cluster_hierarchy treats any square input as precomputed
    # distances, so the "none" case relies on n_samples != embedding size.
    distance_builders = {
        "none": lambda vectors: vectors,
        "euclidean": euclidean_distances,
        "cosine": cosine_distances,
    }
    distances = list(distance_builders)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    kmeans_data = pd.DataFrame(columns=distances)
    hierarchy_data = pd.DataFrame(columns=distances)

    preprocessed_data = [preprocess_text(text) for text in dataset.data]
    n = len(preprocessed_data)
    print(n)

    pos_subsets = {
        "NOUNS": ['NN', 'NNS'],
        "ADJ": ['JJ', 'JJR', 'JJS'],
        "NOUNS and ADJ": ['NN', 'NNS', 'JJ', 'JJR', 'JJS'],
    }
    list_of_data = {"ALL": preprocessed_data}
    for subset_name, tags in pos_subsets.items():
        list_of_data[subset_name] = [filter_text_by_pos(text, pos_to_keep=tags) for text in preprocessed_data]

    for name, data in list_of_data.items():
        print("start calculate")
        embeddings = _embed_texts(data, tokenizer, model)

        for distance, build in distance_builders.items():
            distance_matrix = build(embeddings)

            kmeans_data.loc[name, distance] = cluster_kmeans(distance_matrix, metrics, true_labels, num_clusters=num_clusters)

            hierarchy_data.loc[name, distance] = cluster_hierarchy(distance_matrix, metrics, true_labels, num_clusters=num_clusters)

    return kmeans_data, hierarchy_data

In [13]:
for model_name in models:
    print(model_name)

    # Each model writes into its own sub-folder. Create it before the slow
    # clustering run so a missing directory cannot cause the results to be
    # lost at save time.
    output_dir = os.path.join(folder, model_name)
    os.makedirs(output_dir, exist_ok=True)

    kmeans_data, hierarchy_data = main(newsgroups, model_name)

    kmeans_data.to_excel(os.path.join(output_dir, kmeans_res_path))
    hierarchy_data.to_excel(os.path.join(output_dir, hierarchy_res_path))

roberta-base


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1575
start calculate
100.0%
normalized_mutual_info_score 
Max: 0.016379303586388297                                     
Min: 0.016126858677298572                                     
AVG: 0.016283374520934202 

adjusted_rand_score 
Max: 0.00833174031811477                                     
Min: 0.008205135746090613                                     
AVG: 0.008283630580745589 

v_measure_score 
Max: 0.016379303586388297                                     
Min: 0.016126858677298572                                     
AVG: 0.016283374520934202 

homogeneity_score 
Max: 0.013504783850322138                                     
Min: 0.01328397079676317                                     
AVG: 0.01342087488996973 


complete
normalized_mutual_info_score: 0.010210903130514558
adjusted_rand_score: 0.00360761619803794
v_measure_score: 0.010210903130514558
homogeneity_score: 0.00832018095299523

average
normalized_mutual_info_score: 0.0181382474583357
adjusted_rand_score: 0.004850876822

In [None]:
# Quick end-to-end check of the pipeline on the first five documents only.
preprocessed_data = [preprocess_text(text) for text in newsgroups.data[:5]]
n = len(preprocessed_data)
print(n)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

text_vectors = []
for i, sentence in enumerate(preprocessed_data, start=1):
    tokens = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

    # Inference only: mean-pool the last hidden state without tracking gradients.
    with torch.no_grad():
        sentence_vector = model(**tokens).last_hidden_state.mean(dim=1)

    print(str(round(i/n*100, 2)) + "%", end='\r', flush=True)
    text_vectors.append(sentence_vector.flatten().tolist())

# From here on `text_vectors` holds the pairwise cosine-distance matrix,
# not the embeddings themselves.
text_vectors = cosine_distances(text_vectors)

true_labels = newsgroups.target[:5]
metrics = [normalized_mutual_info_score]

cluster_kmeans(
    text_vectors,
    metrics,
    true_labels,
    num_clusters=len(newsgroups.target_names),
)
cluster_hierarchy(
    text_vectors,
    metrics,
    true_labels,
    num_clusters=len(newsgroups.target_names),
)

5
100.0%
normalized_mutual_info_score 
Max: 0.8326760405064335                                     
Min: 0.8326760405064335                                     
AVG: 0.8326760405064334 


complete
normalized_mutual_info_score: 0.8326760405064335

average
normalized_mutual_info_score: 0.8326760405064335

single
normalized_mutual_info_score: 0.8326760405064335



'\ncomplete\nnormalized_mutual_info_score: 0.8326760405064335\n\naverage\nnormalized_mutual_info_score: 0.8326760405064335\n\nsingle\nnormalized_mutual_info_score: 0.8326760405064335\n'