In [1]:
import os

from sklearn.datasets import fetch_20newsgroups

from gensim import corpora, models

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, v_measure_score, homogeneity_score

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [3]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [4]:
# Output location: result workbooks are written below `folder`, and the two
# *_res_path values are file-name prefixes for the K-Means and hierarchical
# result files respectively.
folder = "./20news_lda_lsi/"
kmeans_res_path = "kmeans_res_20News_"
hierarchy_res_path = "hierarchy_res_20News_"

# Topic-model constructors keyed by a short name.
tokenize_models = {"lda": models.LdaModel}

# Two subsets of 20 Newsgroups categories, labelled "light" and "hard".
categories_light = [
    "comp.graphics",
    "rec.autos",
    "sci.med",
    "talk.politics.mideast",
]
categories_hard = [
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
]
categories = dict(light=categories_light, hard=categories_hard)

In [5]:
def preprocess_text(text):
    """Normalise a raw document into a space-separated string of tokens.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphanumeric are discarded, the rest are lower-cased, and English stop
    words are removed.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    raw_tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))

    kept = []
    for raw_token in raw_tokens:
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        # Stop-word membership is checked on the lower-cased form.
        if lowered not in english_stops:
            kept.append(lowered)

    return ' '.join(kept)

In [6]:
def filter_text_by_pos(text, pos_to_keep):
    """Return the tokens of ``text`` whose part-of-speech tag is wanted.

    Args:
        text: The document as a string; it is tokenized with ``word_tokenize``.
        pos_to_keep: Container of tag strings (as produced by ``pos_tag``)
            that should be retained.

    Returns:
        A list of tokens, in their original order, whose tag is in
        ``pos_to_keep``.
    """
    selected = []
    for word, tag in pos_tag(word_tokenize(text)):
        if tag in pos_to_keep:
            selected.append(word)
    return selected

In [7]:
def cluster_kmeans(matrix, metrics, true_labels, num_clusters=2, num_iterations=50):
    """Run K-Means several times and summarise how well it matches the labels.

    K-Means is fitted ``num_iterations`` times on ``matrix``, using the
    iteration index as ``random_state`` so each run is seeded differently but
    reproducibly. Every metric is evaluated on every run and the max / min /
    mean of each metric is reported.

    Args:
        matrix: Data passed straight to ``KMeans.fit_predict``.
        metrics: Iterable of callables ``metric(true_labels, predicted)``;
            their ``__name__`` is used as the heading in the report.
        true_labels: Ground-truth labels handed to every metric.
        num_clusters: Number of clusters requested from K-Means.
        num_iterations: Number of differently seeded K-Means runs (>= 1).

    Returns:
        A multi-line text report; it is also printed.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1, because no
            statistics can be computed without at least one run.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    scores = {metric.__name__: [] for metric in metrics}

    for seed in range(num_iterations):
        predicted = KMeans(n_clusters=num_clusters, random_state=seed, n_init=5).fit_predict(matrix)

        for metric in metrics:
            scores[metric.__name__].append(metric(true_labels, predicted))

    kmeans_res = ""
    for name, values in scores.items():
        # Adjacent f-strings instead of backslash continuations: a continued
        # string literal swallows the next line's indentation, which padded
        # every reported value with a long run of spaces.
        kmeans_res += (
            f"\n{name} "
            f"\nMax: {np.max(values)} "
            f"\nMin: {np.min(values)} "
            f"\nAVG: {np.mean(values)} \n"
        )

    print(kmeans_res)
    return kmeans_res

In [8]:
def cluster_hierarchy(matrix, metrics, true_labels, num_clusters=2, precomputed=None):
    """Run agglomerative clustering with several linkages and score each one.

    Args:
        matrix: Either a feature matrix or a square pairwise distance matrix;
            must expose ``.shape``.
        metrics: Iterable of callables ``metric(true_labels, predicted)``;
            their ``__name__`` is used as the label in the report.
        true_labels: Ground-truth labels handed to every metric.
        num_clusters: Number of clusters to cut the hierarchy into.
        precomputed: ``True`` if ``matrix`` holds pairwise distances, ``False``
            if it holds feature vectors. ``None`` (default) keeps the previous
            heuristic of treating any square matrix as distances; pass an
            explicit value when a feature matrix may happen to be square.

    Returns:
        A multi-line text report with one section per linkage; it is also
        printed.
    """
    if precomputed is None:
        # Shape-based guess: square input is assumed to be a distance matrix.
        precomputed = matrix.shape[0] == matrix.shape[1]

    linkages = ["complete", "average", "single"]
    if precomputed:
        affinity = "precomputed"
    else:
        affinity = "euclidean"
        # Ward linkage is only tried on raw feature vectors with the
        # euclidean metric, never on a precomputed distance matrix.
        linkages.append("ward")

    hierarchy_res = ""
    for linkage in linkages:
        hierarchy_res += f"\n{linkage}"

        predicted = AgglomerativeClustering(
            n_clusters=num_clusters, metric=affinity, linkage=linkage
        ).fit_predict(matrix)

        for metric in metrics:
            hierarchy_res += f"\n{metric.__name__}: {metric(true_labels, predicted)}"

        hierarchy_res += "\n"

    print(hierarchy_res)
    return hierarchy_res

In [14]:
def main(dataset, num_topics=10, passes=15, random_state=42):
    """Cluster one dataset on LDA topic vectors and collect the score reports.

    For four token views of the corpus (all tokens, nouns, adjectives, nouns
    and adjectives) an LDA model is trained, every document is turned into a
    dense topic-probability vector, and K-Means plus agglomerative clustering
    are run on three representations of those vectors: the raw vectors
    ("none") and their pairwise euclidean / cosine distance matrices.

    Args:
        dataset: Object exposing ``data`` (raw texts), ``target`` (true
            labels) and ``target_names`` (its length is the cluster count).
        num_topics: Number of LDA topics, i.e. the length of each vector.
        passes: Number of LDA training passes over the corpus.
        random_state: Seed handed to the LDA model so repeated runs produce
            the same topic vectors.

    Returns:
        ``(kmeans_data, hierarchy_data)``: two DataFrames indexed by token
        view with one column per representation; each cell holds the text
        report returned by ``cluster_kmeans`` / ``cluster_hierarchy``.
    """
    true_labels = dataset.target
    num_clusters = len(dataset.target_names)
    distances = ["none",
                 "euclidean",
                 "cosine"
                 ]
    metrics = [normalized_mutual_info_score,
               adjusted_rand_score,
               v_measure_score,
               homogeneity_score
               ]

    kmeans_data = pd.DataFrame(columns=distances)
    hierarchy_data = pd.DataFrame(columns=distances)

    preprocessed_data = [preprocess_text(text) for text in dataset.data]

    n = len(preprocessed_data)
    print(n)

    noun_data = [filter_text_by_pos(text, pos_to_keep=['NN', 'NNS']) for text in preprocessed_data]
    adj_data = [filter_text_by_pos(text, pos_to_keep=['JJ', 'JJR', 'JJS']) for text in preprocessed_data]
    noun_adj_data = [filter_text_by_pos(text, pos_to_keep=['NN', 'NNS', 'JJ', 'JJR', 'JJS']) for text in preprocessed_data]

    # split() without an argument maps an empty document to [], whereas
    # split(' ') maps it to [''] and would add an empty-string token to the
    # vocabulary.
    all_token_data = [text.split() for text in preprocessed_data]
    list_of_data = {"ALL": all_token_data, "NOUNS": noun_data, "ADJ": adj_data, "NOUNS and ADJ": noun_adj_data}

    for name, data in list_of_data.items():
        print("start calculate " + name)

        dictionary = corpora.Dictionary(data)
        bow_corpus = [dictionary.doc2bow(doc) for doc in data]

        model = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics,
                                passes=passes, random_state=random_state)

        # Fill a dense (documents x topics) array by topic id. The sparse
        # (topic_id, probability) output may omit topics with a vanishing
        # probability even when minimum_probability=0.0, so collecting only
        # the returned probabilities could yield vectors of unequal length.
        text_vectors = np.zeros((len(bow_corpus), num_topics))
        for row, doc_bow in enumerate(bow_corpus):
            for topic_id, topic_prob in model.get_document_topics(doc_bow, minimum_probability=0.0):
                text_vectors[row, topic_id] = topic_prob

        for distance in distances:
            print(distance)
            if distance == "euclidean":
                distance_matrix = euclidean_distances(text_vectors)
            elif distance == "cosine":
                distance_matrix = cosine_distances(text_vectors)
            else:
                distance_matrix = text_vectors

            # NOTE(review): for "euclidean"/"cosine" K-Means receives the rows
            # of the pairwise distance matrix as feature vectors; confirm this
            # is the intended experiment.
            kmeans_data.loc[name, distance] = cluster_kmeans(distance_matrix, metrics, true_labels, num_clusters=num_clusters)

            hierarchy_data.loc[name, distance] = cluster_hierarchy(distance_matrix, metrics, true_labels, num_clusters=num_clusters)

    return kmeans_data, hierarchy_data

In [10]:
# Fetch the training split for every category subset, with headers, footers
# and quoted replies stripped, and keep each result under its subset name.
news_data = {}
for category in categories:
    newsgroups = fetch_20newsgroups(
        categories=categories[category],
        remove=("headers", "footers", "quotes"),
        subset="train",
    )
    news_data[category] = newsgroups

In [15]:
# Create the output directory up front: to_excel does not create missing
# directories, and failing only after the long clustering run would discard
# its results.
output_dir = os.path.join(folder, "lda")
os.makedirs(output_dir, exist_ok=True)

# Run the full pipeline for every category subset and save both result tables.
for data in news_data:
    print(data)
    kmeans_data, hierarchy_data = main(news_data[data])

    kmeans_data.to_excel(os.path.join(output_dir, kmeans_res_path + data + ".xlsx"))
    hierarchy_data.to_excel(os.path.join(output_dir, hierarchy_res_path + data + ".xlsx"))
    print("Saved")

light
2336
start calculate ALL
none

normalized_mutual_info_score 
Max: 0.16941470233894182                                     
Min: 0.12023325022628964                                     
AVG: 0.12296426142297864 

adjusted_rand_score 
Max: 0.13304557802323502                                     
Min: 0.1084912828570714                                     
AVG: 0.11031717932158246 

v_measure_score 
Max: 0.16941470233894182                                     
Min: 0.12023325022628964                                     
AVG: 0.12296426142297863 

homogeneity_score 
Max: 0.1598689441186702                                     
Min: 0.11806403583538826                                     
AVG: 0.12049876473146957 


complete
normalized_mutual_info_score: 0.0810242522274157
adjusted_rand_score: 0.06397428423937795
v_measure_score: 0.0810242522274157
homogeneity_score: 0.07357216014904544

average
normalized_mutual_info_score: 0.1462309859630083
adjusted_rand_score: 0.13019063365272163
